diff options
Diffstat (limited to 'db2/btree')
-rw-r--r-- | db2/btree/bt_close.c | 177 | ||||
-rw-r--r-- | db2/btree/bt_compare.c | 107 | ||||
-rw-r--r-- | db2/btree/bt_conv.c | 15 | ||||
-rw-r--r-- | db2/btree/bt_curadj.c | 272 | ||||
-rw-r--r-- | db2/btree/bt_cursor.c | 1738 | ||||
-rw-r--r-- | db2/btree/bt_delete.c | 512 | ||||
-rw-r--r-- | db2/btree/bt_open.c | 240 | ||||
-rw-r--r-- | db2/btree/bt_page.c | 141 | ||||
-rw-r--r-- | db2/btree/bt_put.c | 571 | ||||
-rw-r--r-- | db2/btree/bt_rec.c | 115 | ||||
-rw-r--r-- | db2/btree/bt_recno.c | 975 | ||||
-rw-r--r-- | db2/btree/bt_rsearch.c | 164 | ||||
-rw-r--r-- | db2/btree/bt_search.c | 132 | ||||
-rw-r--r-- | db2/btree/bt_split.c | 212 | ||||
-rw-r--r-- | db2/btree/bt_stat.c | 122 | ||||
-rw-r--r-- | db2/btree/btree_auto.c | 161 |
16 files changed, 2762 insertions, 2892 deletions
diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c deleted file mode 100644 index 9df5c717e6..0000000000 --- a/db2/btree/bt_close.c +++ /dev/null @@ -1,177 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - */ -/* - * Copyright (c) 1990, 1993, 1994, 1995, 1996 - * Keith Bostic. All rights reserved. - */ -/* - * Copyright (c) 1990, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "config.h" - -#ifndef lint -static const char sccsid[] = "@(#)bt_close.c 10.32 (Sleepycat) 5/6/98"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "db_page.h" -#include "btree.h" - -static void __bam_upstat __P((DB *dbp)); - -/* - * __bam_close -- - * Close a btree. - * - * PUBLIC: int __bam_close __P((DB *)); - */ -int -__bam_close(dbp) - DB *dbp; -{ - BTREE *t; - - DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0); - - t = dbp->internal; - - /* Update tree statistics. */ - __bam_upstat(dbp); - - /* Free any allocated memory. */ - if (t->bt_rkey.data) - FREE(t->bt_rkey.data, t->bt_rkey.size); - if (t->bt_rdata.data) - FREE(t->bt_rdata.data, t->bt_rdata.ulen); - if (t->bt_sp != t->bt_stack) - FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG)); - - FREE(t, sizeof(BTREE)); - dbp->internal = NULL; - - return (0); -} - -/* - * __bam_sync -- - * Sync the btree to disk. - * - * PUBLIC: int __bam_sync __P((DB *, u_int32_t)); - */ -int -__bam_sync(argdbp, flags) - DB *argdbp; - u_int32_t flags; -{ - DB *dbp; - int ret; - - DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags); - - /* Check for invalid flags. */ - if ((ret = __db_syncchk(argdbp, flags)) != 0) - return (ret); - - /* If it wasn't possible to modify the file, we're done. 
*/ - if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY)) - return (0); - - GETHANDLE(argdbp, NULL, &dbp, ret); - - /* Flush any dirty pages from the cache to the backing file. */ - if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) - ret = 0; - - PUTHANDLE(dbp); - return (ret); -} - -/* - * __bam_upstat -- - * Update tree statistics. - */ -static void -__bam_upstat(dbp) - DB *dbp; -{ - BTREE *t; - BTMETA *meta; - DB_LOCK metalock; - db_pgno_t pgno; - u_int32_t flags; - - /* - * We use a no-op log call to log the update of the statistics onto the - * metadata page. The Db->close call isn't transaction protected to - * start with, and I'm not sure what undoing a statistics update means, - * anyway. - */ - if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY)) - return; - - flags = 0; - pgno = PGNO_METADATA; - - /* Lock and retrieve the page. */ - if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock) != 0) - return; - if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) == 0) { - /* Log the change. */ - if (DB_LOGGING(dbp) && - __db_noop_log(dbp->dbenv->lg_info, dbp->txn, &LSN(meta), 0, - dbp->log_fileid, PGNO_METADATA, &LSN(meta)) != 0) - goto err; - - /* Update the statistics. */ - t = dbp->internal; - __bam_add_mstat(&t->lstat, &meta->stat); - - flags = DB_MPOOL_DIRTY; - } - -err: (void)memp_fput(dbp->mpf, (PAGE *)meta, flags); - (void)__BT_LPUT(dbp, metalock); -} diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c index 5c6d1e38ca..c60f920612 100644 --- a/db2/btree/bt_compare.c +++ b/db2/btree/bt_compare.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_compare.c 10.9 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_compare.c 10.14 (Sleepycat) 10/9/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -64,93 +64,76 @@ static const char sccsid[] = "@(#)bt_compare.c 10.9 (Sleepycat) 5/6/98"; * __bam_cmp -- * Compare a key to a given record. 
* - * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *)); + * PUBLIC: int __bam_cmp __P((DB *, const DBT *, + * PUBLIC: PAGE *, u_int32_t, int (*)(const DBT *, const DBT *))); */ int -__bam_cmp(dbp, k1, e) +__bam_cmp(dbp, dbt, h, indx, func) DB *dbp; - const DBT *k1; - EPG *e; + const DBT *dbt; + PAGE *h; + u_int32_t indx; + int (*func)__P((const DBT *, const DBT *)); { BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; - BTREE *t; - DBT k2; - PAGE *h; - - t = dbp->internal; + DBT pg_dbt; + int ret; /* * Returns: - * < 0 if k1 is < record - * = 0 if k1 is = record - * > 0 if k1 is > record + * < 0 if dbt is < page record + * = 0 if dbt is = page record + * > 0 if dbt is > page record * - * The left-most key on internal pages, at any level of the tree, is - * guaranteed, by the following code, to be less than any user key. - * This saves us from having to update the leftmost key on an internal - * page when the user inserts a new key in the tree smaller than - * anything we've yet seen. + * !!! + * We do not clear the pg_dbt DBT even though it's likely to contain + * random bits. That should be okay, because the app's comparison + * routine had better not be looking at fields other than data/size. + * We don't clear it because we go through this path a lot and it's + * expensive. 
*/ - h = e->page; - if (e->indx == 0 && - h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE) - return (1); - - bo = NULL; - if (TYPE(h) == P_LBTREE) { - bk = GET_BKEYDATA(h, e->indx); + if (TYPE(h) == P_LBTREE || TYPE(h) == P_DUPLICATE) { + bk = GET_BKEYDATA(h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) bo = (BOVERFLOW *)bk; else { - k2.data = bk->data; - k2.size = bk->len; + pg_dbt.data = bk->data; + pg_dbt.size = bk->len; + return (func(dbt, &pg_dbt)); } } else { - bi = GET_BINTERNAL(h, e->indx); - if (B_TYPE(bi->type) == B_OVERFLOW) - bo = (BOVERFLOW *)(bi->data); - else { - k2.data = bi->data; - k2.size = bi->len; - } - } - - /* - * XXX - * We ignore system errors; the only recoverable one is ENOMEM, and we - * don't want to require that comparison routines handle random errors. - * We don't want to return a valid comparison, either, so we stop. - */ - if (bo != NULL) { /* - * If using the default comparison routine, use __db_moff(), - * which compares the overflow key a page at a time. + * The following code guarantees that the left-most key on an + * internal page at any level of the btree is less than any + * user specified key. This saves us from having to update the + * leftmost key on an internal page when the user inserts a new + * key in the tree smaller than anything we've seen before. */ - if (t->bt_compare == __bam_defcmp) - return (__db_moff(dbp, k1, bo->pgno)); + if (indx == 0 && h->prev_pgno == PGNO_INVALID) + return (1); - /* - * Otherwise, we need a contiguous record so we can hand it - * to the user's routine. - */ - memset(&k2, 0, sizeof(k2)); - if (__db_goff(dbp, &k2, bo->tlen, - bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0) { - (void)__db_panic(dbp); - return (0); + bi = GET_BINTERNAL(h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + pg_dbt.data = bi->data; + pg_dbt.size = bi->len; + return (func(dbt, &pg_dbt)); } } /* + * Overflow. 
+ * * XXX - * Note, we have not cleared the k2 DBT in this path. This should - * be okay, because the user's comparison routine had better not be - * looking at any fields other than the data/size. We don't clear - * it because we go through this path a lot and it's expensive. + * We ignore __db_moff() errors, because we have no way of returning + * them. */ - return ((*t->bt_compare)(k1, &k2)); + (void) __db_moff(dbp, + dbt, bo->pgno, bo->tlen, func == __bam_defcmp ? NULL : func, &ret); + return (ret); } /* diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c index 3da4507723..a3069082ae 100644 --- a/db2/btree/bt_conv.c +++ b/db2/btree/bt_conv.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_conv.c 10.6 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)bt_conv.c 10.7 (Sleepycat) 9/20/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -90,18 +90,5 @@ __bam_mswap(pg) SWAP32(p); /* free */ SWAP32(p); /* flags */ - /* Swap the statistics. */ - p = (u_int8_t *)&((BTMETA *)pg)->stat; - SWAP32(p); /* bt_freed */ - SWAP32(p); /* bt_pfxsaved */ - SWAP32(p); /* bt_split */ - SWAP32(p); /* bt_rootsplit */ - SWAP32(p); /* bt_fastsplit */ - SWAP32(p); /* bt_added */ - SWAP32(p); /* bt_deleted */ - SWAP32(p); /* bt_get */ - SWAP32(p); /* bt_cache_hit */ - SWAP32(p); /* bt_cache_miss */ - return (0); } diff --git a/db2/btree/bt_curadj.c b/db2/btree/bt_curadj.c new file mode 100644 index 0000000000..9b86fbb6d7 --- /dev/null +++ b/db2/btree/bt_curadj.c @@ -0,0 +1,272 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_curadj.c 10.69 (Sleepycat) 12/2/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +#ifdef DEBUG +/* + * __bam_cprint -- + * Display the current cursor list. + * + * PUBLIC: int __bam_cprint __P((DB *)); + */ +int +__bam_cprint(dbp) + DB *dbp; +{ + CURSOR *cp; + DBC *dbc; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + fprintf(stderr, + "%#0x->%#0x: page: %lu index: %lu dpage %lu dindex: %lu recno: %lu", + (u_int)dbc, (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, + (u_long)cp->dpgno, (u_long)cp->dindx, (u_long)cp->recno); + if (F_ISSET(cp, C_DELETED)) + fprintf(stderr, " (deleted)"); + fprintf(stderr, "\n"); + } + DB_THREAD_UNLOCK(dbp); + + return (0); +} +#endif /* DEBUG */ + +/* + * __bam_ca_delete -- + * Update the cursors when items are deleted and when already deleted + * items are overwritten. Return the number of relevant cursors found. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_delete(dbp, pgno, indx, delete) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int delete; +{ + DBC *dbc; + CURSOR *cp; + int count; /* !!!: Has to contain max number of cursors. */ + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return (0); + + /* + * Adjust the cursors. We don't have to review the cursors for any + * thread of control other than the current one, because we have the + * page write locked at this point, and any other thread of control + * had better be using a different locker ID, meaning only cursors in + * our thread of control can be on the page. 
+ * + * It's possible for multiple cursors within the thread to have write + * locks on the same page, but, cursors within a thread must be single + * threaded, so all we're locking here is the cursor linked list. + */ + DB_THREAD_LOCK(dbp); + for (count = 0, dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + + if ((cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) { + if (delete) + F_SET(cp, C_DELETED); + else + F_CLR(cp, C_DELETED); + ++count; + } + } + DB_THREAD_UNLOCK(dbp); + + return (count); +} + +/* + * __bam_ca_di -- + * Adjust the cursors during a delete or insert. + * + * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_di(dbp, pgno, indx, adjust) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int adjust; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx >= indx) + cp->indx += adjust; + if (cp->dpgno == pgno && cp->dindx >= indx) + cp->dindx += adjust; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving items from a leaf page to a duplicates + * page. + * + * PUBLIC: void __bam_ca_dup __P((DB *, + * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +void +__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) + DB *dbp; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). 
+ */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + /* + * Ignore matching entries that have already been moved, + * we move from the same location on the leaf page more + * than once. + */ + if (cp->dpgno == PGNO_INVALID && + cp->pgno == fpgno && cp->indx == fi) { + cp->indx = first; + cp->dpgno = tpgno; + cp->dindx = ti; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_rsplit -- + * Adjust the cursors when doing reverse splits. + * + * PUBLIC: void __bam_ca_rsplit __P((DB *, db_pgno_t, db_pgno_t)); + */ +void +__bam_ca_rsplit(dbp, fpgno, tpgno) + DB *dbp; + db_pgno_t fpgno, tpgno; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == fpgno) + cp->pgno = tpgno; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. + * + * PUBLIC: void __bam_ca_split __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) + DB *dbp; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DBC *dbc; + CURSOR *cp; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. 
If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == ppgno) { + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + } + if (cp->dpgno == ppgno) { + if (cp->dindx < split_indx) { + if (cleft) + cp->dpgno = lpgno; + } else { + cp->dpgno = rpgno; + cp->dindx -= split_indx; + } + } + } + DB_THREAD_UNLOCK(dbp); +} diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c index 5d3366a3a1..10bc095c9d 100644 --- a/db2/btree/bt_cursor.c +++ b/db2/btree/bt_cursor.c @@ -8,148 +8,219 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_cursor.c 10.53 (Sleepycat) 5/25/98"; +static const char sccsid[] = "@(#)bt_cursor.c 10.81 (Sleepycat) 12/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <stdlib.h> #include <string.h> #endif #include "db_int.h" #include "db_page.h" #include "btree.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" static int __bam_c_close __P((DBC *)); static int __bam_c_del __P((DBC *, u_int32_t)); -static int __bam_c_first __P((DB *, CURSOR *)); +static int __bam_c_destroy __P((DBC *)); +static int __bam_c_first __P((DBC *, CURSOR *)); static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __bam_c_getstack __P((DB *, CURSOR *)); -static int __bam_c_last __P((DB *, CURSOR *)); -static int __bam_c_next __P((DB *, CURSOR *, int)); -static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *)); -static int __bam_c_prev __P((DB *, CURSOR *)); +static int __bam_c_getstack __P((DBC *, CURSOR *)); +static int __bam_c_last __P((DBC *, CURSOR *)); +static int __bam_c_next __P((DBC *, CURSOR *, int)); +static int __bam_c_physdel __P((DBC *, 
CURSOR *, PAGE *)); +static int __bam_c_prev __P((DBC *, CURSOR *)); static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __bam_c_rget __P((DB *, CURSOR *, DBT *, u_int32_t)); -static int __bam_c_search - __P((DB *, CURSOR *, const DBT *, u_int32_t, int, int *)); +static void __bam_c_reset __P((CURSOR *)); +static int __bam_c_rget __P((DBC *, DBT *, u_int32_t)); +static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *)); +static int __bam_dsearch __P((DBC *, CURSOR *, DBT *, u_int32_t *)); /* Discard the current page/lock held by a cursor. */ #undef DISCARD -#define DISCARD(dbp, cp) { \ +#define DISCARD(dbc, cp) { \ if ((cp)->page != NULL) { \ - (void)memp_fput(dbp->mpf, (cp)->page, 0); \ + (void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0); \ (cp)->page = NULL; \ } \ if ((cp)->lock != LOCK_INVALID) { \ - (void)__BT_TLPUT((dbp), (cp)->lock); \ + (void)__BT_TLPUT((dbc), (cp)->lock); \ (cp)->lock = LOCK_INVALID; \ } \ } +/* If the cursor references a deleted record. */ +#undef IS_CUR_DELETED +#define IS_CUR_DELETED(cp) \ + (((cp)->dpgno == PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, \ + (cp)->indx + O_INDX)->type)) || \ + ((cp)->dpgno != PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type))) + +/* If the cursor and index combination references a deleted record. */ +#undef IS_DELETED +#define IS_DELETED(cp, indx) \ + (((cp)->dpgno == PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) || \ + ((cp)->dpgno != PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type))) + /* - * __bam_cursor -- - * Interface to the cursor functions. + * Test to see if two cursors could point to duplicates of the same key, + * whether on-page or off-page. The leaf page numbers must be the same + * in both cases. In the case of off-page duplicates, the key indices + * on the leaf page will be the same. 
In the case of on-page duplicates, + * the duplicate page number must not be set, and the key index offsets + * must be the same. For the last test, as the saved copy of the cursor + * will not have a valid page pointer, we use the cursor's. + */ +#undef POSSIBLE_DUPLICATE +#define POSSIBLE_DUPLICATE(cursor, saved_copy) \ + ((cursor)->pgno == (saved_copy).pgno && \ + ((cursor)->indx == (saved_copy).indx || \ + ((cursor)->dpgno == PGNO_INVALID && \ + (saved_copy).dpgno == PGNO_INVALID && \ + (cursor)->page->inp[(cursor)->indx] == \ + (cursor)->page->inp[(saved_copy).indx]))) + +/* + * __bam_c_reset -- + * Initialize internal cursor structure. + */ +static void +__bam_c_reset(cp) + CURSOR *cp; +{ + cp->sp = cp->csp = cp->stack; + cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); + cp->page = NULL; + cp->pgno = PGNO_INVALID; + cp->indx = 0; + cp->dpgno = PGNO_INVALID; + cp->dindx = 0; + cp->lock = LOCK_INVALID; + cp->mode = DB_LOCK_NG; + cp->recno = RECNO_OOB; + cp->flags = 0; +} + +/* + * __bam_c_init -- + * Initialize the access private portion of a cursor * - * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **)); + * PUBLIC: int __bam_c_init __P((DBC *)); */ int -__bam_cursor(dbp, txn, dbcp) - DB *dbp; - DB_TXN *txn; - DBC **dbcp; +__bam_c_init(dbc) + DBC *dbc; { + DB *dbp; CURSOR *cp; - DBC *dbc; - - DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0); + int ret; - if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL) - return (ENOMEM); - if ((cp = (CURSOR *)__db_calloc(1, sizeof(CURSOR))) == NULL) { - __db_free(dbc); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0) + return (ret); + dbp = dbc->dbp; cp->dbc = dbc; - cp->pgno = cp->dpgno = PGNO_INVALID; - cp->lock = LOCK_INVALID; - - dbc->dbp = dbp; - dbc->txn = txn; - dbc->internal = cp; - dbc->c_close = __bam_c_close; - dbc->c_del = __bam_c_del; - dbc->c_get = __bam_c_get; - dbc->c_put = __bam_c_put; /* - * All cursors are queued from the master DB structure. 
Add the - * cursor to that queue. + * Logical record numbers are always the same size, and we don't want + * to have to check for space every time we return one. Allocate it + * in advance. */ - CURSOR_SETUP(dbp); - TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); + if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + if ((ret = __os_malloc(sizeof(db_recno_t), + NULL, &dbc->rkey.data)) != 0) { + __os_free(cp, sizeof(CURSOR)); + return (ret); + } + dbc->rkey.ulen = sizeof(db_recno_t); + } + + /* Initialize methods. */ + dbc->internal = cp; + if (dbp->type == DB_BTREE) { + dbc->c_am_close = __bam_c_close; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_del = __bam_c_del; + dbc->c_get = __bam_c_get; + dbc->c_put = __bam_c_put; + } else { + dbc->c_am_close = __bam_c_close; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_del = __ram_c_del; + dbc->c_get = __ram_c_get; + dbc->c_put = __ram_c_put; + } + + /* Initialize dynamic information. */ + __bam_c_reset(cp); - *dbcp = dbc; return (0); } /* * __bam_c_close -- - * Close a single cursor. + * Close down the cursor from a single use. */ static int __bam_c_close(dbc) DBC *dbc; { + CURSOR *cp; DB *dbp; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0); + dbp = dbc->dbp; + cp = dbc->internal; + ret = 0; - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + /* + * If a cursor deleted a btree key, perform the actual deletion. + * (Recno keys are either deleted immediately or never deleted.) + */ + if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED)) + ret = __bam_c_physdel(dbc, cp, NULL); - ret = __bam_c_iclose(dbp, dbc); + /* Discard any locks not acquired inside of a transaction. */ + if (cp->lock != LOCK_INVALID) { + (void)__BT_TLPUT(dbc, cp->lock); + cp->lock = LOCK_INVALID; + } + + /* Sanity checks. */ +#ifdef DIAGNOSTIC + if (cp->csp != cp->stack) + __db_err(dbp->dbenv, "btree cursor close: stack not empty"); +#endif + + /* Initialize dynamic information. 
*/ + __bam_c_reset(cp); - PUTHANDLE(dbp); return (ret); } /* - * __bam_c_iclose -- + * __bam_c_destroy -- * Close a single cursor -- internal version. - * - * PUBLIC: int __bam_c_iclose __P((DB *, DBC *)); */ -int -__bam_c_iclose(dbp, dbc) - DB *dbp; +static int +__bam_c_destroy(dbc) DBC *dbc; { - CURSOR *cp; - int ret; - - /* If a cursor key was deleted, perform the actual deletion. */ - cp = dbc->internal; - ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0; - - /* Discard any lock if we're not inside a transaction. */ - if (cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); - - /* Remove the cursor from the queue. */ - CURSOR_SETUP(dbp); - TAILQ_REMOVE(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); - /* Discard the structures. */ - FREE(dbc->internal, sizeof(CURSOR)); - FREE(dbc, sizeof(DBC)); + __os_free(dbc->internal, sizeof(CURSOR)); - return (ret); + return (0); } /* @@ -161,7 +232,6 @@ __bam_c_del(dbc, flags) DBC *dbc; u_int32_t flags; { - BTREE *t; CURSOR *cp; DB *dbp; DB_LOCK lock; @@ -170,23 +240,31 @@ __bam_c_del(dbc, flags) db_indx_t indx; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags); - + dbp = dbc->dbp; cp = dbc->internal; h = NULL; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. */ - if ((ret = __db_cdelchk(dbc->dbp, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + if ((ret = __db_cdelchk(dbp, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) return (ret); + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. + */ + if (F_ISSET(dbp, DB_AM_CDB)) + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags); + /* If already deleted, return failure. 
*/ - if (F_ISSET(cp, C_DELETED | C_REPLACE)) + if (F_ISSET(cp, C_DELETED)) return (DB_KEYEMPTY); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; - /* * We don't physically delete the record until the cursor moves, * so we have to have a long-lived write lock on the page instead @@ -194,10 +272,10 @@ __bam_c_del(dbc, flags) * to even get here, so we simply discard it. */ if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) { - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) goto err; - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); cp->lock = lock; cp->mode = DB_LOCK_WRITE; } @@ -216,85 +294,50 @@ __bam_c_del(dbc, flags) indx = cp->dindx; } - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp) && - (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + if (DB_LOGGING(dbc) && + (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) { (void)memp_fput(dbp->mpf, h, 0); goto err; } - /* Set the intent-to-delete flag on the page and in all cursors. */ + /* + * Set the intent-to-delete flag on the page and update all cursors. */ if (cp->dpgno == PGNO_INVALID) B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); else B_DSET(GET_BKEYDATA(h, indx)->type); - (void)__bam_ca_delete(dbp, pgno, indx, NULL, 0); + (void)__bam_ca_delete(dbp, pgno, indx, 1); ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); h = NULL; /* - * If it's a btree with record numbers, we have to adjust the - * counts. + * If the tree has record numbers, we have to adjust the counts. + * + * !!! + * This test is right -- we don't yet support duplicates and record + * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM + * set. 
*/ - if (F_ISSET(dbp, DB_BT_RECNUM) && - (ret = __bam_c_getstack(dbp, cp)) == 0) { - ret = __bam_adjust(dbp, t, -1); - (void)__bam_stkrel(dbp); + if (F_ISSET(dbp, DB_BT_RECNUM)) { + if ((ret = __bam_c_getstack(dbc, cp)) != 0) + goto err; + if ((ret = __bam_adjust(dbc, -1)) != 0) + goto err; + (void)__bam_stkrel(dbc, 0); } err: if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); - PUTHANDLE(dbp); return (ret); } /* - * __bam_get -- - * Retrieve a key/data pair from the tree. - * - * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - */ -int -__bam_get(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - DBC dbc; - CURSOR cp; - int ret; - - DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags); - - /* Check for invalid flags. */ - if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) - return (ret); - - /* Build an internal cursor. */ - memset(&cp, 0, sizeof(cp)); - cp.dbc = &dbc; - cp.pgno = cp.dpgno = PGNO_INVALID; - cp.lock = LOCK_INVALID; - cp.flags = C_INTERNAL; - - /* Build an external cursor. */ - memset(&dbc, 0, sizeof(dbc)); - dbc.dbp = argdbp; - dbc.txn = txn; - dbc.internal = &cp; - - /* Get the key. */ - return(__bam_c_get(&dbc, - key, data, LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET)); -} - -/* * __bam_c_get -- * Get using a cursor (btree). */ @@ -304,91 +347,197 @@ __bam_c_get(dbc, key, data, flags) DBT *key, *data; u_int32_t flags; { - BTREE *t; - CURSOR *cp, copy; + CURSOR *cp, copy, start; DB *dbp; PAGE *h; - int exact, ret; - - DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get", - flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); + int exact, ret, tmp_rmw; + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. 
*/ - if ((ret = __db_cgetchk(dbc->dbp, + if ((ret = __db_cgetchk(dbp, key, data, flags, cp->pgno != PGNO_INVALID)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + /* Clear OR'd in additional bits so we can check for flag equality. */ + tmp_rmw = 0; + if (LF_ISSET(DB_RMW)) { + if (!F_ISSET(dbp, DB_AM_CDB)) { + tmp_rmw = 1; + F_SET(dbc, DBC_RMW); + } + LF_CLR(DB_RMW); + } + + DEBUG_LREAD(dbc, dbc->txn, "bam_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); /* - * Break out the code to return a cursor's record number. It - * has nothing to do with the cursor get code except that it's - * been rammed into the interface. + * Return a cursor's record number. It has nothing to do with the + * cursor get code except that it's been rammed into the interface. */ - if (LF_ISSET(DB_GET_RECNO)) { - ret = __bam_c_rget(dbp, cp, data, flags); - PUTHANDLE(dbp); + if (flags == DB_GET_RECNO) { + ret = __bam_c_rget(dbc, data, flags); + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); return (ret); } - /* Initialize the cursor for a new retrieval. */ - copy = *cp; + /* + * Initialize the cursor for a new retrieval. Clear the cursor's + * page pointer, it was set before this operation, and no longer + * has any meaning. + */ cp->page = NULL; + copy = *cp; cp->lock = LOCK_INVALID; switch (flags) { case DB_CURRENT: /* It's not possible to return a deleted record. */ - if (F_ISSET(cp, C_DELETED | C_REPLACE)) { - PUTHANDLE(dbp); - return (DB_KEYEMPTY); + if (F_ISSET(cp, C_DELETED)) { + ret = DB_KEYEMPTY; + goto err; } - /* Get the page with the current item on it. */ - if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + /* Acquire the current page. */ + if ((ret = __bam_lget(dbc, + 0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0) + ret = memp_fget(dbp->mpf, + cp->dpgno == PGNO_INVALID ? 
&cp->pgno : &cp->dpgno, + 0, &cp->page); + if (ret != 0) goto err; break; + case DB_NEXT_DUP: + if (cp->pgno == PGNO_INVALID) { + ret = EINVAL; + goto err; + } + if ((ret = __bam_c_next(dbc, cp, 1)) != 0) + goto err; + + /* Make sure we didn't go past the end of the duplicates. */ + if (!POSSIBLE_DUPLICATE(cp, copy)) { + ret = DB_NOTFOUND; + goto err; + } + break; case DB_NEXT: if (cp->pgno != PGNO_INVALID) { - if ((ret = __bam_c_next(dbp, cp, 1)) != 0) + if ((ret = __bam_c_next(dbc, cp, 1)) != 0) goto err; break; } /* FALLTHROUGH */ case DB_FIRST: - if ((ret = __bam_c_first(dbp, cp)) != 0) + if ((ret = __bam_c_first(dbc, cp)) != 0) goto err; break; case DB_PREV: if (cp->pgno != PGNO_INVALID) { - if ((ret = __bam_c_prev(dbp, cp)) != 0) + if ((ret = __bam_c_prev(dbc, cp)) != 0) goto err; break; } /* FALLTHROUGH */ case DB_LAST: - if ((ret = __bam_c_last(dbp, cp)) != 0) + if ((ret = __bam_c_last(dbc, cp)) != 0) goto err; break; + case DB_SET: + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * We cannot currently be referencing a deleted record, but we + * may be referencing off-page duplicates. + * + * If we're referencing off-page duplicates, move off-page. + * If we moved off-page, move to the next non-deleted record. + * If we moved to the next non-deleted record, check to make + * sure we didn't switch records because our current record + * had no non-deleted data items. 
+ */ + start = *cp; + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) { + if ((ret = __bam_c_next(dbc, cp, 0)) != 0) + goto err; + if (!POSSIBLE_DUPLICATE(cp, start)) { + ret = DB_NOTFOUND; + goto err; + } + } + break; case DB_SET_RECNO: - exact = 1; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0) + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) goto err; break; - case DB_SET: - exact = 1; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + case DB_GET_BOTH: + if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) { + /* Acquire the current page. */ + if ((ret = memp_fget(dbp->mpf, + cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno, + 0, &cp->page)) != 0) + goto err; + + /* If DBC_CONTINUE, move to the next item. */ + if (F_ISSET(dbc, DBC_CONTINUE) && + (ret = __bam_c_next(dbc, cp, 1)) != 0) + goto err; + } else { + if ((ret = + __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * We may be referencing a duplicates page. Move to + * the first duplicate. + */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + } + + /* Search for a matching entry. */ + if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0) goto err; + + /* Ignore deleted entries. */ + if (IS_CUR_DELETED(cp)) { + ret = DB_NOTFOUND; + goto err; + } break; case DB_SET_RANGE: - exact = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * As we didn't require an exact match, the search function + * may have returned an entry past the end of the page. If + * so, move to the next entry. + */ + if (cp->indx == NUM_ENT(cp->page) && + (ret = __bam_c_next(dbc, cp, 0)) != 0) + goto err; + + /* + * We may be referencing off-page duplicates, if so, move + * off-page. 
+ */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + + /* + * We may be referencing a deleted record, if so, move to + * the next non-deleted record. + */ + if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0) goto err; break; } @@ -401,12 +550,12 @@ __bam_c_get(dbc, key, data, flags) */ if (flags != DB_SET) { if (cp->dpgno != PGNO_INVALID) { - if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) goto err; } else h = cp->page; ret = __db_ret(dbp, - h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen); + h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen); if (cp->dpgno != PGNO_INVALID) (void)memp_fput(dbp->mpf, h, 0); if (ret) @@ -416,62 +565,163 @@ __bam_c_get(dbc, key, data, flags) /* Return the data. */ if ((ret = __db_ret(dbp, cp->page, cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx, - data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) goto err; /* - * If the previous cursor record has been deleted, delete it. The - * returned key isn't a deleted key, so clear the flag. + * If the previous cursor record has been deleted, physically delete + * the entry from the page. We clear the deleted flag before we call + * the underlying delete routine so that, if an error occurs, and we + * restore the cursor, the deleted flag is cleared. This is because, + * if we manage to physically modify the page, and then restore the + * cursor, we might try to repeat the page modification when closing + * the cursor. */ - if (F_ISSET(&copy, C_DELETED) && __bam_c_physdel(dbp, &copy, cp->page)) - goto err; - F_CLR(cp, C_DELETED | C_REPLACE); + if (F_ISSET(&copy, C_DELETED)) { + F_CLR(&copy, C_DELETED); + if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0) + goto err; + } + F_CLR(cp, C_DELETED); - /* Release the previous lock, if any. */ + /* Release the previous lock, if any; the current lock is retained. 
*/ if (copy.lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, copy.lock); - - /* Release the pinned page. */ - ret = memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbc, copy.lock); - /* Internal cursors don't hold locks. */ - if (F_ISSET(cp, C_INTERNAL) && cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); - - ++t->lstat.bt_get; + /* Release the current page. */ + if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + goto err; if (0) { err: if (cp->page != NULL) (void)memp_fput(dbp->mpf, cp->page, 0); if (cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); *cp = copy; } - PUTHANDLE(dbp); + /* Release temporary lock upgrade. */ + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); + return (ret); } /* + * __bam_dsearch -- + * Search for a matching data item (or the first data item that's + * equal to or greater than the one we're searching for). + */ +static int +__bam_dsearch(dbc, cp, data, iflagp) + DBC *dbc; + CURSOR *cp; + DBT *data; + u_int32_t *iflagp; +{ + DB *dbp; + CURSOR copy, last; + int cmp, ret; + + dbp = dbc->dbp; + + /* + * If iflagp is non-NULL, we're doing an insert. + * + * If the duplicates are off-page, use the duplicate search routine. + */ + if (cp->dpgno != PGNO_INVALID) { + if ((ret = __db_dsearch(dbc, iflagp != NULL, + data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0) + return (ret); + cp->dpgno = cp->page->pgno; + + if (iflagp == NULL) { + if (cmp != 0) + return (DB_NOTFOUND); + return (0); + } + *iflagp = DB_BEFORE; + return (0); + } + + /* Otherwise, do the search ourselves. */ + copy = *cp; + for (;;) { + /* Save the last interesting cursor position. */ + last = *cp; + + /* See if the data item matches the one we're looking for. */ + if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX, + dbp->dup_compare == NULL ? 
+ __bam_defcmp : dbp->dup_compare)) == 0) { + if (iflagp != NULL) + *iflagp = DB_AFTER; + return (0); + } + + /* + * If duplicate entries are sorted, we're done if we find a + * page entry that sorts greater than the application item. + * If doing an insert, return success, otherwise DB_NOTFOUND. + */ + if (dbp->dup_compare != NULL && cmp < 0) { + if (iflagp == NULL) + return (DB_NOTFOUND); + *iflagp = DB_BEFORE; + return (0); + } + + /* + * Move to the next item. If we reach the end of the page and + * we're doing an insert, set the cursor to the last item and + * set the referenced memory location so callers know to insert + * after the item, instead of before it. If not inserting, we + * return DB_NOTFOUND. + */ + if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) { + if (iflagp == NULL) + return (DB_NOTFOUND); + goto use_last; + } + + /* + * Make sure we didn't go past the end of the duplicates. The + * error conditions are the same as above. + */ + if (!POSSIBLE_DUPLICATE(cp, copy)) { + if (iflagp == NULL) + return (DB_NOTFOUND); +use_last: *cp = last; + *iflagp = DB_AFTER; + return (0); + } + } + /* NOTREACHED */ +} + +/* * __bam_c_rget -- * Return the record number for a cursor. */ static int -__bam_c_rget(dbp, cp, data, flags) - DB *dbp; - CURSOR *cp; +__bam_c_rget(dbc, data, flags) + DBC *dbc; DBT *data; u_int32_t flags; { - BTREE *t; + CURSOR *cp; + DB *dbp; DBT dbt; db_recno_t recno; int exact, ret; COMPQUIET(flags, 0); + dbp = dbc->dbp; + cp = dbc->internal; /* Get the page with the current item on it. */ - if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) return (ret); /* Get a copy of the key. */ @@ -481,18 +731,19 @@ __bam_c_rget(dbp, cp, data, flags) goto err; exact = 1; - if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0) + if ((ret = __bam_search(dbc, &dbt, + F_ISSET(dbc, DBC_RMW) ? 
S_FIND_WR : S_FIND, + 1, &recno, &exact)) != 0) goto err; - t = dbp->internal; ret = __db_retcopy(data, &recno, sizeof(recno), - &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc); + &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc); /* Release the stack. */ - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); err: (void)memp_fput(dbp->mpf, cp->page, 0); - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); return (ret); } @@ -506,62 +757,97 @@ __bam_c_put(dbc, key, data, flags) DBT *key, *data; u_int32_t flags; { - BTREE *t; CURSOR *cp, copy; DB *dbp; DBT dbt; db_indx_t indx; db_pgno_t pgno; - u_int32_t iiflags; + u_int32_t iiflags, iiop; int exact, needkey, ret, stack; void *arg; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put", - flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, - data, flags); - + dbp = dbc->dbp; cp = dbc->internal; - if ((ret = __db_cputchk(dbc->dbp, key, data, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) - return (ret); + DB_PANIC_CHECK(dbp); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, + data, flags); - /* Initialize the cursor for a new retrieval. */ - copy = *cp; - cp->page = NULL; - cp->lock = LOCK_INVALID; + if ((ret = __db_cputchk(dbp, key, data, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); /* - * To split, we need a valid key for the page. Since it's a cursor, - * we have to build one. + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. If it's a regular writer, + * that means we have an IWRITE lock and we need to upgrade + * it to a write lock. 
*/ - stack = 0; + if (F_ISSET(dbp, DB_AM_CDB)) { + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } + if (0) { -split: /* Acquire a copy of a key from the page. */ +split: /* + * To split, we need a valid key for the page. Since it's a + * cursor, we have to build one. + * + * Acquire a copy of a key from the page. + */ if (needkey) { memset(&dbt, 0, sizeof(DBT)); if ((ret = __db_ret(dbp, cp->page, indx, - &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen)) != 0) + &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) goto err; arg = &dbt; } else arg = key; - /* Discard any pinned pages. */ + /* + * Discard any locks and pinned pages (the locks are discarded + * even if we're running with transactions, as they lock pages + * that we're sorry we ever acquired). If stack is set and the + * cursor entries are valid, they point to the same entries as + * the stack, don't free them twice. + */ if (stack) { - (void)__bam_stkrel(dbp); + (void)__bam_stkrel(dbc, 1); stack = 0; } else - DISCARD(dbp, cp); + DISCARD(dbc, cp); - if ((ret = __bam_split(dbp, arg)) != 0) + /* + * Restore the cursor to its original value. This is necessary + * for two reasons. First, we are about to copy it in case of + * error, again. Second, we adjust cursors during the split, + * and we have to ensure this cursor is adjusted appropriately, + * along with all the other cursors. + */ + *cp = copy; + + if ((ret = __bam_split(dbc, arg)) != 0) goto err; } - ret = 0; + /* + * Initialize the cursor for a new retrieval. Clear the cursor's + * page pointer, it was set before this operation, and no longer + * has any meaning. 
+ */ + cp->page = NULL; + copy = *cp; + cp->lock = LOCK_INVALID; + + iiflags = needkey = ret = stack = 0; switch (flags) { case DB_AFTER: case DB_BEFORE: @@ -574,64 +860,148 @@ split: /* Acquire a copy of a key from the page. */ pgno = cp->dpgno; indx = cp->dindx; } + /* - * XXX - * This test is right -- we don't currently support duplicates - * in the presence of record numbers, so we don't worry about - * them if DB_BT_RECNUM is set. + * !!! + * This test is right -- we don't yet support duplicates and + * record numbers in the same tree, so ignore duplicates if + * DB_BT_RECNUM set. */ if (F_ISSET(dbp, DB_BT_RECNUM) && (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) { /* Acquire a complete stack. */ - if ((ret = __bam_c_getstack(dbp, cp)) != 0) + if ((ret = __bam_c_getstack(dbc, cp)) != 0) goto err; - cp->page = t->bt_csp->page; + cp->page = cp->csp->page; stack = 1; iiflags = BI_DOINCR; } else { /* Acquire the current page. */ - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0) - ret = __bam_pget(dbp, &cp->page, &pgno, 0); + ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page); if (ret != 0) goto err; iiflags = 0; } - if ((ret = __bam_iitem(dbp, &cp->page, - &indx, key, data, flags, iiflags)) == DB_NEEDSPLIT) - goto split; - break; - case DB_KEYFIRST: - exact = needkey = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0) - goto err; - stack = 1; - indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; - if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, - data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) - goto split; + /* + * If the user has specified a duplicate comparison function, + * we return an error if DB_CURRENT was specified and the + * replacement data doesn't compare equal to the current data. + * This stops apps from screwing up the duplicate sort order. 
+ */ + if (flags == DB_CURRENT && dbp->dup_compare != NULL) + if (__bam_cmp(dbp, data, + cp->page, indx, dbp->dup_compare) != 0) { + ret = EINVAL; + goto err; + } + + iiop = flags; break; + case DB_KEYFIRST: case DB_KEYLAST: - exact = needkey = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0) + /* + * If we have a duplicate comparison function, we position to + * the first of any on-page duplicates, and use __bam_dsearch + * to search for the right slot. Otherwise, we position to + * the first/last of any on-page duplicates based on the flag + * value. + */ + if ((ret = __bam_c_search(dbc, cp, key, + flags == DB_KEYFIRST || dbp->dup_compare != NULL ? + DB_KEYFIRST : DB_KEYLAST, &exact)) != 0) goto err; stack = 1; - indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; - if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, - data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) - goto split; + /* + * If an exact match: + * If duplicates aren't supported, replace the current + * item. (When implementing the DB->put function, our + * caller has already checked the DB_NOOVERWRITE flag.) + * + * If there's a duplicate comparison function, find the + * correct slot for this duplicate item. + * + * If there's no duplicate comparison function, set the + * insert flag based on the argument flags. + * + * If there's no match, the search function returned the + * smallest slot greater than the key, use it. + */ + if (exact) { + if (F_ISSET(dbp, DB_AM_DUP)) { + /* + * If at off-page duplicate page, move to the + * first or last entry -- if a comparison + * function was specified, start searching at + * the first entry. Otherwise, move based on + * the DB_KEYFIRST/DB_KEYLAST flags. + */ + if ((ret = __bam_dup(dbc, cp, cp->indx, + dbp->dup_compare == NULL && + flags != DB_KEYFIRST)) != 0) + goto err; + + /* + * If there's a comparison function, search for + * the correct slot. Otherwise, set the insert + * flag based on the argment flag. 
+ */ + if (dbp->dup_compare == NULL) + iiop = flags == DB_KEYFIRST ? + DB_BEFORE : DB_AFTER; + else + if ((ret = __bam_dsearch(dbc, + cp, data, &iiop)) != 0) + goto err; + } else + iiop = DB_CURRENT; + iiflags = 0; + } else { + iiop = DB_BEFORE; + iiflags = BI_NEWKEY; + } + + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } break; } - if (ret) + + ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags); + + if (ret == DB_NEEDSPLIT) + goto split; + if (ret != 0) goto err; /* + * Reset any cursors referencing this item that might have the item + * marked for deletion. + */ + if (iiop == DB_CURRENT) { + (void)__bam_ca_delete(dbp, pgno, indx, 0); + + /* + * It's also possible that we are the cursor that had the + * item marked for deletion, in which case we want to make + * sure that we don't delete it because we had the delete + * flag set already. + */ + if (cp->pgno == copy.pgno && cp->indx == copy.indx && + cp->dpgno == copy.dpgno && cp->dindx == copy.dindx) + F_CLR(&copy, C_DELETED); + } + + /* * Update the cursor to point to the new entry. The new entry was * stored on the current page, because we split pages until it was * possible. @@ -642,17 +1012,24 @@ split: /* Acquire a copy of a key from the page. */ cp->dindx = indx; /* - * If the previous cursor record has been deleted, delete it. The - * returned key isn't a deleted key, so clear the flag. + * If the previous cursor record has been deleted, physically delete + * the entry from the page. We clear the deleted flag before we call + * the underlying delete routine so that, if an error occurs, and we + * restore the cursor, the deleted flag is cleared. This is because, + * if we manage to physically modify the page, and then restore the + * cursor, we might try to repeat the page modification when closing + * the cursor. 
*/ - if (F_ISSET(&copy, C_DELETED) && - (ret = __bam_c_physdel(dbp, &copy, cp->page)) != 0) - goto err; - F_CLR(cp, C_DELETED | C_REPLACE); + if (F_ISSET(&copy, C_DELETED)) { + F_CLR(&copy, C_DELETED); + if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0) + goto err; + } + F_CLR(cp, C_DELETED); - /* Release the previous lock, if any. */ + /* Release the previous lock, if any; the current lock is retained. */ if (copy.lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, copy.lock); + (void)__BT_TLPUT(dbc, copy.lock); /* * Discard any pages pinned in the tree and their locks, except for @@ -662,23 +1039,26 @@ split: /* Acquire a copy of a key from the page. */ * we have to adjust the stack as necessary. If there was only a * single page on the stack, we don't have to free further stack pages. */ + if (stack && BT_STK_POP(cp) != NULL) + (void)__bam_stkrel(dbc, 0); - if (stack && BT_STK_POP(t) != NULL) - (void)__bam_stkrel(dbp); - + /* Release the current page. */ if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) goto err; if (0) { err: /* Discard any pinned pages. */ if (stack) - (void)__bam_stkrel(dbp); + (void)__bam_stkrel(dbc, 0); else - DISCARD(dbp, cp); + DISCARD(dbc, cp); *cp = copy; } - PUTHANDLE(dbp); + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + return (ret); } @@ -687,19 +1067,22 @@ err: /* Discard any pinned pages. */ * Return the first record. */ static int -__bam_c_first(dbp, cp) - DB *dbp; +__bam_c_first(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* Walk down the left-hand side of the tree. */ for (pgno = PGNO_ROOT;;) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); /* If we find a leaf page, we're done. 
*/ @@ -707,28 +1090,22 @@ __bam_c_first(dbp, cp) break; pgno = GET_BINTERNAL(cp->page, 0)->pgno; - DISCARD(dbp, cp); + DISCARD(dbc, cp); } cp->pgno = cp->page->pgno; cp->indx = 0; cp->dpgno = PGNO_INVALID; - /* If it's an empty page or a deleted record, go to the next one. */ - if (NUM_ENT(cp->page) == 0 || - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); - - /* If it's a duplicate reference, go to the first entry. */ - if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0) + /* Check for duplicates. */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) return (ret); - /* If it's a deleted record, go to the next one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + /* If on an empty page or a deleted record, move to the next one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) + if ((ret = __bam_c_next(dbc, cp, 0)) != 0) return (ret); + return (0); } @@ -737,19 +1114,22 @@ __bam_c_first(dbp, cp) * Return the last record. */ static int -__bam_c_last(dbp, cp) - DB *dbp; +__bam_c_last(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* Walk down the right-hand side of the tree. */ for (pgno = PGNO_ROOT;;) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); /* If we find a leaf page, we're done. */ @@ -758,28 +1138,22 @@ __bam_c_last(dbp, cp) pgno = GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; - DISCARD(dbp, cp); + DISCARD(dbc, cp); } cp->pgno = cp->page->pgno; cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; cp->dpgno = PGNO_INVALID; - /* If it's an empty page or a deleted record, go to the previous one. 
*/ - if (NUM_ENT(cp->page) == 0 || - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_prev(dbp, cp)) != 0) - return (ret); - - /* If it's a duplicate reference, go to the last entry. */ - if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0) + /* Check for duplicates. */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0) return (ret); - /* If it's a deleted record, go to the previous one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) - if ((ret = __bam_c_prev(dbp, cp)) != 0) + /* If on an empty page or a deleted record, move to the next one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) + if ((ret = __bam_c_prev(dbc, cp)) != 0) return (ret); + return (0); } @@ -788,15 +1162,18 @@ __bam_c_last(dbp, cp) * Move to the next record. */ static int -__bam_c_next(dbp, cp, initial_move) - DB *dbp; +__bam_c_next(dbc, cp, initial_move) + DBC *dbc; CURSOR *cp; int initial_move; { + DB *dbp; db_indx_t adjust, indx; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* * We're either moving through a page of duplicates or a btree leaf * page. @@ -812,9 +1189,9 @@ __bam_c_next(dbp, cp, initial_move) } if (cp->page == NULL) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); } @@ -832,15 +1209,13 @@ __bam_c_next(dbp, cp, initial_move) indx += adjust; for (;;) { if (indx >= NUM_ENT(cp->page)) { - pgno = cp->page->next_pgno; - DISCARD(dbp, cp); - /* * If we're in a btree leaf page, we've reached the end * of the tree. If we've reached the end of a page of * duplicates, continue from the btree leaf page where * we found this page of duplicates. */ + pgno = cp->page->next_pgno; if (pgno == PGNO_INVALID) { /* If in a btree leaf page, it's EOF. 
*/ if (cp->dpgno == PGNO_INVALID) @@ -855,20 +1230,18 @@ __bam_c_next(dbp, cp, initial_move) } else indx = 0; - if ((ret = __bam_lget(dbp, + DISCARD(dbc, cp); + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = + memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); continue; } /* Ignore deleted records. */ - if (dbp->type == DB_BTREE && - ((cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) || - (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx)->type)))) { + if (IS_DELETED(cp, indx)) { indx += adjust; continue; } @@ -882,8 +1255,7 @@ __bam_c_next(dbp, cp, initial_move) cp->pgno = cp->page->pgno; cp->indx = indx; - if ((ret = - __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0) + if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0) return (ret); if (cp->dpgno != PGNO_INVALID) { indx = cp->dindx; @@ -904,14 +1276,17 @@ __bam_c_next(dbp, cp, initial_move) * Move to the previous record. */ static int -__bam_c_prev(dbp, cp) - DB *dbp; +__bam_c_prev(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_indx_t indx, adjust; db_pgno_t pgno; int ret, set_indx; + dbp = dbc->dbp; + /* * We're either moving through a page of duplicates or a btree leaf * page. @@ -927,9 +1302,9 @@ __bam_c_prev(dbp, cp) } if (cp->page == NULL) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); } @@ -941,15 +1316,13 @@ __bam_c_prev(dbp, cp) */ for (;;) { if (indx == 0) { - pgno = cp->page->prev_pgno; - DISCARD(dbp, cp); - /* * If we're in a btree leaf page, we've reached the * beginning of the tree. 
If we've reached the first * of a page of duplicates, continue from the btree * leaf page where we found this page of duplicates. */ + pgno = cp->page->prev_pgno; if (pgno == PGNO_INVALID) { /* If in a btree leaf page, it's SOF. */ if (cp->dpgno == PGNO_INVALID) @@ -965,10 +1338,12 @@ __bam_c_prev(dbp, cp) } else set_indx = 1; - if ((ret = __bam_lget(dbp, + DISCARD(dbc, cp); + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = + memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); if (set_indx) @@ -979,11 +1354,7 @@ __bam_c_prev(dbp, cp) /* Ignore deleted records. */ indx -= adjust; - if (dbp->type == DB_BTREE && - ((cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) || - (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx)->type)))) + if (IS_DELETED(cp, indx)) continue; /* @@ -995,8 +1366,7 @@ __bam_c_prev(dbp, cp) cp->pgno = cp->page->pgno; cp->indx = indx; - if ((ret = - __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0) + if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0) return (ret); if (cp->dpgno != PGNO_INVALID) { indx = cp->dindx + O_INDX; @@ -1017,499 +1387,261 @@ __bam_c_prev(dbp, cp) * Move to a specified record. */ static int -__bam_c_search(dbp, cp, key, flags, isrecno, exactp) - DB *dbp; +__bam_c_search(dbc, cp, key, flags, exactp) + DBC *dbc; CURSOR *cp; const DBT *key; u_int32_t flags; - int isrecno, *exactp; + int *exactp; { BTREE *t; + DB *dbp; + DB_LOCK lock; + PAGE *h; db_recno_t recno; - int needexact, ret; + db_indx_t indx; + u_int32_t sflags; + int cmp, needexact, ret; + dbp = dbc->dbp; t = dbp->internal; - needexact = *exactp; - /* - * Find any matching record; the search function pins the page. Make - * sure it's a valid key (__bam_search may return an index just past - * the end of a page) and return it. 
- */ - if (isrecno) { - if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + /* Find an entry in the database. */ + switch (flags) { + case DB_SET_RECNO: + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) return (ret); - ret = __bam_rsearch(dbp, &recno, flags, 1, exactp); - } else - ret = __bam_search(dbp, key, flags, 1, NULL, exactp); + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 1; + ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp); + break; + case DB_SET: + case DB_GET_BOTH: + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 1; + goto search; + case DB_SET_RANGE: + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 0; + goto search; + case DB_KEYFIRST: + sflags = S_KEYFIRST; + goto fast_search; + case DB_KEYLAST: + sflags = S_KEYLAST; +fast_search: needexact = *exactp = 0; + /* + * If the application has a history of inserting into the first + * or last pages of the database, we check those pages first to + * avoid doing a full search. + * + * Record numbers can't be fast-tracked, the entire tree has to + * be locked. + */ + h = NULL; + lock = LOCK_INVALID; + if (F_ISSET(dbp, DB_BT_RECNUM)) + goto search; + + /* Check if the application has a history of sorted input. */ + if (t->bt_lpgno == PGNO_INVALID) + goto search; + + /* + * Lock and retrieve the page on which we did the last insert. + * It's okay if it doesn't exist, or if it's not the page type + * we expected, it just means that the world changed. + */ + if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) + goto fast_miss; + if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h)) + goto fast_miss; + if (TYPE(h) != P_LBTREE) + goto fast_miss; + if (NUM_ENT(h) == 0) + goto fast_miss; + + /* + * What we do here is test to see if we're at the beginning or + * end of the tree and if the new item sorts before/after the + * first/last page entry. 
We don't try and catch inserts into + * the middle of the tree (although we could, as long as there + * were two keys on the page and we saved both the index and + * the page number of the last insert). + */ + if (h->next_pgno == PGNO_INVALID) { + indx = NUM_ENT(h) - P_INDX; + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0) + goto try_begin; + if (cmp > 0) { + indx += P_INDX; + goto fast_hit; + } + + /* + * Found a duplicate. If doing DB_KEYLAST, we're at + * the correct position, otherwise, move to the first + * of the duplicates. + */ + if (flags == DB_KEYLAST) + goto fast_hit; + for (; + indx > 0 && h->inp[indx - P_INDX] == h->inp[indx]; + indx -= P_INDX) + ; + goto fast_hit; + } +try_begin: if (h->prev_pgno == PGNO_INVALID) { + indx = 0; + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0) + goto fast_miss; + if (cmp < 0) + goto fast_hit; + /* + * Found a duplicate. If doing DB_KEYFIRST, we're at + * the correct position, otherwise, move to the last + * of the duplicates. + */ + if (flags == DB_KEYFIRST) + goto fast_hit; + for (; + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]; + indx += P_INDX) + ; + goto fast_hit; + } + goto fast_miss; + +fast_hit: /* Set the exact match flag, we may have found a duplicate. */ + *exactp = cmp == 0; + + /* Enter the entry in the stack. */ + BT_STK_CLR(cp); + BT_STK_ENTER(cp, h, indx, lock, ret); + break; + +fast_miss: if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + if (lock != LOCK_INVALID) + (void)__BT_LPUT(dbc, lock); + +search: ret = __bam_search(dbc, key, sflags, 1, NULL, exactp); + break; + default: /* XXX: Impossible. 
*/ + abort(); + /* NOTREACHED */ + } if (ret != 0) return (ret); - cp->page = t->bt_csp->page; - cp->pgno = cp->page->pgno; - cp->indx = t->bt_csp->indx; - cp->lock = t->bt_csp->lock; - cp->dpgno = PGNO_INVALID; - /* - * If we have an exact match, make sure that we're not looking at a - * chain of duplicates -- if so, move to an entry in that chain. + * Initialize the cursor to reference it. This has to be done + * before we return (even with DB_NOTFOUND) because we have to + * free the page(s) we locked in __bam_search. */ - if (*exactp) { - if ((ret = __bam_ovfl_chk(dbp, - cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST))) != 0) - return (ret); - } else - if (needexact) - return (DB_NOTFOUND); - - /* If past the end of a page, find the next entry. */ - if (cp->indx == NUM_ENT(cp->page) && - (ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + cp->lock = cp->csp->lock; + cp->dpgno = PGNO_INVALID; - /* If it's a deleted record, go to the next or previous one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) { - if (flags == S_KEYLAST) { - if ((ret = __bam_c_prev(dbp, cp)) != 0) - return (ret); - } else - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); - } /* - * If we don't specify an exact match (the DB_KEYFIRST/DB_KEYLAST or - * DB_SET_RANGE flags were set) __bam_search() may return a deleted - * item. For DB_KEYFIRST/DB_KEYLAST, we don't care since we're only - * using it for a tree position. For DB_SET_RANGE, we're returning - * the key, so we have to adjust it. + * If we inserted a key into the first or last slot of the tree, + * remember where it was so we can do it more quickly next time. 
*/ - if (LF_ISSET(S_DELNO) && cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); + if (flags == DB_KEYFIRST || flags == DB_KEYLAST) + t->bt_lpgno = + ((cp->page->next_pgno == PGNO_INVALID && + cp->indx >= NUM_ENT(cp->page)) || + (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ? + cp->pgno : PGNO_INVALID; + + /* If we need an exact match and didn't find one, we're done. */ + if (needexact && *exactp == 0) + return (DB_NOTFOUND); return (0); } /* - * __bam_ovfl_chk -- - * Check for an overflow record, and if found, move to the correct - * record. + * __bam_dup -- + * Check for an off-page duplicates entry, and if found, move to the + * first or last entry. * - * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); + * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int)); */ int -__bam_ovfl_chk(dbp, cp, indx, to_end) - DB *dbp; +__bam_dup(dbc, cp, indx, last_dup) + DBC *dbc; CURSOR *cp; u_int32_t indx; - int to_end; + int last_dup; { BOVERFLOW *bo; + DB *dbp; db_pgno_t pgno; int ret; - /* Check for an overflow entry. */ - bo = GET_BOVERFLOW(cp->page, indx); - if (B_TYPE(bo->type) != B_DUPLICATE) - return (0); + dbp = dbc->dbp; /* - * If we find one, go to the duplicates page, and optionally move - * to the last record on that page. + * Check for an overflow entry. If we find one, move to the + * duplicates page, and optionally move to the last record on + * that page. * - * XXX + * !!! * We don't lock duplicates pages, we've already got the correct * lock on the main page. 
*/ + bo = GET_BOVERFLOW(cp->page, indx + O_INDX); + if (B_TYPE(bo->type) != B_DUPLICATE) + return (0); + pgno = bo->pgno; if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) return (ret); cp->page = NULL; - if (to_end) { - if ((ret = __db_dend(dbp, pgno, &cp->page)) != 0) + if (last_dup) { + if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0) return (ret); indx = NUM_ENT(cp->page) - O_INDX; } else { - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); indx = 0; } - /* Update the duplicate entry in the cursor. */ + /* Update the cursor's duplicate information. */ cp->dpgno = cp->page->pgno; cp->dindx = indx; return (0); } -#ifdef DEBUG -/* - * __bam_cprint -- - * Display the current btree cursor list. - * - * PUBLIC: int __bam_cprint __P((DB *)); - */ -int -__bam_cprint(dbp) - DB *dbp; -{ - CURSOR *cp; - DBC *dbc; - - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - fprintf(stderr, - "%#0x: page: %lu index: %lu dpage %lu dindex: %lu", - (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, - (u_long)cp->dpgno, (u_long)cp->dindx); - if (F_ISSET(cp, C_DELETED)) - fprintf(stderr, "(deleted)"); - fprintf(stderr, "\n"); - } - CURSOR_TEARDOWN(dbp); - - return (0); -} -#endif /* DEBUG */ - -/* - * __bam_ca_delete -- - * Check if any of the cursors refer to the item we are about to delete, - * returning the number of cursors that refer to the item in question. - * - * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int)); - */ -int -__bam_ca_delete(dbp, pgno, indx, curs, key_delete) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - CURSOR *curs; - int key_delete; -{ - DBC *dbc; - CURSOR *cp; - int count; /* !!!: Has to contain max number of cursors. */ - - /* - * Adjust the cursors. 
We don't have to review the cursors for any - * process other than the current one, because we have the page write - * locked at this point, and any other process had better be using a - * different locker ID, meaning that only cursors in our process can - * be on the page. - * - * It's possible for multiple cursors within the thread to have write - * locks on the same page, but, cursors within a thread must be single - * threaded, so all we're locking here is the cursor linked list. - */ - CURSOR_SETUP(dbp); - for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - - /* - * Optionally, a cursor passed in is the one initiating the - * delete, so we don't want to count it or set its deleted - * flag. Otherwise, if a cursor refers to the item, then we - * set its deleted flag. - */ - if (curs == cp) - continue; - - /* - * If we're deleting the key itself and not just one of its - * duplicates, repoint the cursor to the main-page key/data - * pair, everything else is about to be discarded. - */ - if (key_delete || cp->dpgno == PGNO_INVALID) { - if (cp->pgno == pgno && cp->indx == indx) { - cp->dpgno = PGNO_INVALID; - ++count; - F_SET(cp, C_DELETED); - } - } else - if (cp->dpgno == pgno && cp->dindx == indx) { - ++count; - F_SET(cp, C_DELETED); - } - } - CURSOR_TEARDOWN(dbp); - - return (count); -} - -/* - * __bam_ca_di -- - * Adjust the cursors during a delete or insert. - * - * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); - */ -void -__bam_ca_di(dbp, pgno, indx, adjust) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - int adjust; -{ - CURSOR *cp; - DBC *dbc; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == pgno && cp->indx >= indx) - cp->indx += adjust; - if (cp->dpgno == pgno && cp->dindx >= indx) - cp->dindx += adjust; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_dup -- - * Adjust the cursors when moving data items to a duplicates page. - * - * PUBLIC: void __bam_ca_dup __P((DB *, - * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); - */ -void -__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) - DB *dbp; - db_pgno_t fpgno, tpgno; - u_int32_t first, fi, ti; -{ - CURSOR *cp; - DBC *dbc; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * No need to test duplicates, this only gets called when moving - * leaf page data items onto a duplicates page. - */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - /* - * Ignore matching entries that have already been moved, - * we move from the same location on the leaf page more - * than once. - */ - if (cp->dpgno == PGNO_INVALID && - cp->pgno == fpgno && cp->indx == fi) { - cp->indx = first; - cp->dpgno = tpgno; - cp->dindx = ti; - } - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_move -- - * Adjust the cursors when moving data items to another page. - * - * PUBLIC: void __bam_ca_move __P((DB *, db_pgno_t, db_pgno_t)); - */ -void -__bam_ca_move(dbp, fpgno, tpgno) - DB *dbp; - db_pgno_t fpgno, tpgno; -{ - CURSOR *cp; - DBC *dbc; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * No need to test duplicates, this only gets called when copying - * over the root page with a leaf or internal page. 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == fpgno) - cp->pgno = tpgno; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_replace -- - * Check if any of the cursors refer to the item we are about to replace. - * If so, their flags should be changed from deleted to replaced. - * - * PUBLIC: void __bam_ca_replace - * PUBLIC: __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); - */ -void -__bam_ca_replace(dbp, pgno, indx, pass) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - ca_replace_arg pass; -{ - CURSOR *cp; - DBC *dbc; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * Find any cursors that have logically deleted a record we're about - * to overwrite. - * - * Pass == REPLACE_SETUP: - * Set C_REPLACE_SETUP so we can find the cursors again. - * - * Pass == REPLACE_SUCCESS: - * Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the - * overwrite was successful. - * - * Pass == REPLACE_FAILED: - * Clear C_REPLACE_SETUP, the overwrite failed. - * - * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value - * for the cursor as it may have been changed by other cursor update - * routines as the item was deleted/inserted. - */ - CURSOR_SETUP(dbp); - switch (pass) { - case REPLACE_SETUP: /* Setup. */ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if ((cp->pgno == pgno && cp->indx == indx) || - (cp->dpgno == pgno && cp->dindx == indx)) - F_SET(cp, C_REPLACE_SETUP); - } - break; - case REPLACE_SUCCESS: /* Overwrite succeeded. 
*/ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (F_ISSET(cp, C_REPLACE_SETUP)) { - if (cp->dpgno == pgno) - cp->dindx = indx; - if (cp->pgno == pgno) - cp->indx = indx; - F_SET(cp, C_REPLACE); - F_CLR(cp, C_DELETED | C_REPLACE_SETUP); - } - } - break; - case REPLACE_FAILED: /* Overwrite failed. */ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (F_ISSET(cp, C_REPLACE_SETUP)) { - if (cp->dpgno == pgno) - cp->dindx = indx; - if (cp->pgno == pgno) - cp->indx = indx; - F_CLR(cp, C_REPLACE_SETUP); - } - } - break; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_split -- - * Adjust the cursors when splitting a page. - * - * PUBLIC: void __bam_ca_split __P((DB *, - * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); - */ -void -__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) - DB *dbp; - db_pgno_t ppgno, lpgno, rpgno; - u_int32_t split_indx; - int cleft; -{ - DBC *dbc; - CURSOR *cp; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * If splitting the page that a cursor was on, the cursor has to be - * adjusted to point to the same record as before the split. Most - * of the time we don't adjust pointers to the left page, because - * we're going to copy its contents back over the original page. If - * the cursor is on the right page, it is decremented by the number of - * records split to the left page. 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == ppgno) { - if (cp->indx < split_indx) { - if (cleft) - cp->pgno = lpgno; - } else { - cp->pgno = rpgno; - cp->indx -= split_indx; - } - } - if (cp->dpgno == ppgno) { - if (cp->dindx < split_indx) { - if (cleft) - cp->dpgno = lpgno; - } else { - cp->dpgno = rpgno; - cp->dindx -= split_indx; - } - } - } - CURSOR_TEARDOWN(dbp); -} - /* * __bam_c_physdel -- * Actually do the cursor deletion. */ static int -__bam_c_physdel(dbp, cp, h) - DB *dbp; +__bam_c_physdel(dbc, cp, h) + DBC *dbc; CURSOR *cp; PAGE *h; { enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd; BOVERFLOW bo; - BTREE *t; + DB *dbp; DBT dbt; DB_LOCK lock; db_indx_t indx; db_pgno_t pgno, next_pgno, prev_pgno; int delete_page, local_page, ret; - t = dbp->internal; + dbp = dbc->dbp; + delete_page = ret = 0; /* Figure out what we're deleting. */ @@ -1522,20 +1654,37 @@ __bam_c_physdel(dbp, cp, h) } /* - * If the item is referenced by another cursor, leave it up to that - * cursor to do the delete. + * If the item is referenced by another cursor, set that cursor's + * delete flag and leave it up to it to do the delete. + * + * !!! + * This test for > 0 is a tricky. There are two ways that we can + * be called here. Either we are closing the cursor or we've moved + * off the page with the deleted entry. In the first case, we've + * already removed the cursor from the active queue, so we won't see + * it in __bam_ca_delete. In the second case, it will be on a different + * item, so we won't bother with it in __bam_ca_delete. */ - if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0) + if (__bam_ca_delete(dbp, pgno, indx, 1) > 0) return (0); /* + * If this is concurrent DB, upgrade the lock if necessary. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, + dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + + /* * If we don't already have the page locked, get it and delete the * items. */ if ((h == NULL || h->pgno != pgno)) { - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); local_page = 1; } else @@ -1581,7 +1730,7 @@ __bam_c_physdel(dbp, cp, h) cmd = DELETE_ITEM; /* Delete the duplicate. */ - if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0) + if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0) goto err; /* @@ -1610,7 +1759,7 @@ __bam_c_physdel(dbp, cp, h) if (local_page) { if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); local_page = 0; } @@ -1619,10 +1768,10 @@ __bam_c_physdel(dbp, cp, h) /* Acquire the parent page and switch the index to its entry. 
*/ if ((ret = - __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) { - (void)__BT_TLPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) { + (void)__BT_TLPUT(dbc, lock); goto err; } local_page = 1; @@ -1641,12 +1790,12 @@ __bam_c_physdel(dbp, cp, h) */ indx += O_INDX; bo = *GET_BOVERFLOW(h, indx); - (void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE); + (void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE); bo.pgno = next_pgno; memset(&dbt, 0, sizeof(dbt)); dbt.data = &bo; dbt.size = BOVERFLOW_SIZE; - (void)__db_pitem(dbp, h, indx, BOVERFLOW_SIZE, &dbt, NULL); + (void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL); (void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); goto done; } @@ -1661,7 +1810,7 @@ btd: /* * set them is because we're (potentially) about to do a reverse split, * which would make our saved page information useless. * - * XXX + * !!! * The following operations to delete a page might deadlock. I think * that's OK. The problem is if we're deleting an item because we're * closing cursors because we've already deadlocked and want to call @@ -1680,37 +1829,44 @@ btd: /* /* * Do a normal btree delete. * - * XXX + * !!! * Delete the key item first, otherwise the duplicate checks in * __bam_ditem() won't work! */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) goto err; - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) goto err; /* Discard any remaining locks/pages. */ if (local_page) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); local_page = 0; } /* Delete the page if it was emptied. 
*/ if (delete_page) - ret = __bam_dpage(dbp, &dbt); + ret = __bam_dpage(dbc, &dbt); err: done: if (delete_page) - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); if (local_page) { - (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + /* + * It's possible for h to be NULL, as __db_drem may have + * been relinking pages by the time that it deadlocked. + */ + if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbc, lock); } - if (ret == 0) - ++t->lstat.bt_deleted; + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + return (ret); } @@ -1719,22 +1875,24 @@ done: if (delete_page) * Acquire a full stack for a cursor. */ static int -__bam_c_getstack(dbp, cp) - DB *dbp; +__bam_c_getstack(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; DBT dbt; PAGE *h; db_pgno_t pgno; int exact, ret; - ret = 0; + dbp = dbc->dbp; h = NULL; memset(&dbt, 0, sizeof(DBT)); + ret = 0; /* Get the page with the current item on it. */ pgno = cp->pgno; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); /* Get a copy of a key from the page. */ @@ -1744,12 +1902,12 @@ __bam_c_getstack(dbp, cp) /* Get a write-locked stack for that page. */ exact = 0; - ret = __bam_search(dbp, &dbt, S_KEYFIRST, 1, NULL, &exact); + ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact); /* We no longer need the key or the page. 
*/ err: if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); if (dbt.data != NULL) - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); return (ret); } diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c index 7e71037e46..d623bd8a6f 100644 --- a/db2/btree/bt_delete.c +++ b/db2/btree/bt_delete.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_delete.c 10.31 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_delete.c 10.43 (Sleepycat) 12/7/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -60,8 +60,6 @@ static const char sccsid[] = "@(#)bt_delete.c 10.31 (Sleepycat) 5/6/98"; #include "db_page.h" #include "btree.h" -static int __bam_dpages __P((DB *, BTREE *)); - /* * __bam_delete -- * Delete the items referenced by a key. @@ -69,182 +67,67 @@ static int __bam_dpages __P((DB *, BTREE *)); * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); */ int -__bam_delete(argdbp, txn, key, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key; - u_int32_t flags; -{ - BTREE *t; +__bam_delete(dbp, txn, key, flags) DB *dbp; - PAGE *h; - db_indx_t cnt, i, indx; - int dpage, exact, ret, stack; - - DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags); - - stack = 0; - - /* Check for invalid flags. */ - if ((ret = __db_delchk(argdbp, - key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) - return (ret); - - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - - /* Search the tree for the key; delete only deletes exact matches. */ - if ((ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact)) != 0) - goto err; - stack = 1; - h = t->bt_csp->page; - indx = t->bt_csp->indx; - - /* Delete the key/data pair, including any on-or-off page duplicates. 
*/ - for (cnt = 1, i = indx;; ++cnt) - if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx]) - break; - for (; cnt > 0; --cnt, ++t->lstat.bt_deleted) - if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) { - /* - * XXX - * Delete the key item first, otherwise the duplicate - * checks in __bam_ditem() won't work! - */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - } else { - B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); - indx += P_INDX; - } - - /* If we're using record numbers, update internal page record counts. */ - if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0) - goto err; - - /* If the page is now empty, delete it. */ - dpage = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT; - - __bam_stkrel(dbp); - stack = 0; - - ret = dpage ? __bam_dpage(dbp, key) : 0; - -err: if (stack) - __bam_stkrel(dbp); - PUTHANDLE(dbp); - return (ret); -} - -/* - * __ram_delete -- - * Delete the items referenced by a key. - * - * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); - */ -int -__ram_delete(argdbp, txn, key, flags) - DB *argdbp; DB_TXN *txn; DBT *key; u_int32_t flags; { - BKEYDATA bk; - BTREE *t; - DB *dbp; - DBT hdr, data; - PAGE *h; - db_indx_t indx; - db_recno_t recno; - int exact, ret, stack; + DBC *dbc; + DBT data; + u_int32_t f_init, f_next; + int ret, t_ret; - stack = 0; + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_delchk(argdbp, - key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) + if ((ret = + __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) return (ret); - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - - /* Check the user's record number and fill in as necessary. */ - if ((ret = __ram_getno(argdbp, key, &recno, 0)) != 0) - goto err; - - /* Search the tree for the key; delete only deletes exact matches. 
*/ - if ((ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact)) != 0) - goto err; - if (!exact) { - ret = DB_NOTFOUND; - goto err; - } - - h = t->bt_csp->page; - indx = t->bt_csp->indx; - stack = 1; + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); - /* If the record has already been deleted, we couldn't have found it. */ - if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { - ret = DB_KEYEMPTY; - goto done; - } + DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags); /* - * If we're not renumbering records, replace the record with a marker - * and return. + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, so request a partial of length 0. */ - if (!F_ISSET(dbp, DB_RE_RENUMBER)) { - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - - B_TSET(bk.type, B_KEYDATA, 1); - bk.len = 0; - memset(&hdr, 0, sizeof(hdr)); - hdr.data = &bk; - hdr.size = SSZA(BKEYDATA, data); - memset(&data, 0, sizeof(data)); - data.data = (char *)""; - data.size = 0; - if ((ret = __db_pitem(dbp, - h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) - goto err; - - ++t->lstat.bt_deleted; - goto done; + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* If locking, set read-modify-write flag. */ + f_init = DB_SET; + f_next = DB_NEXT_DUP; + if (dbp->dbenv != NULL && dbp->dbenv->lk_info != NULL) { + f_init |= DB_RMW; + f_next |= DB_RMW; } - /* Delete the item. */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + /* Walk through the set of key/data pairs, deleting as we go. */ + if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0) goto err; - - ++t->lstat.bt_deleted; - if (t->bt_recno != NULL) - F_SET(t->bt_recno, RECNO_MODIFIED); - - /* Adjust the counts. */ - __bam_adjust(dbp, t, -1); - - /* Adjust the cursors. 
*/ - __ram_ca(dbp, recno, CA_DELETE); - - /* - * If the page is now empty, delete it -- we have the whole tree - * locked, so there are no preparations to make. Else, release - * the pages. - */ - if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { - stack = 0; - ret = __bam_dpages(dbp, t); + for (;;) { + if ((ret = dbc->c_del(dbc, 0)) != 0) + goto err; + if ((ret = dbc->c_get(dbc, key, &data, f_next)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } } -done: -err: if (stack) - __bam_stkrel(dbp); +err: /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && + (ret == 0 || ret == DB_NOTFOUND)) + ret = t_ret; - PUTHANDLE(dbp); return (ret); } @@ -252,20 +135,23 @@ err: if (stack) * __bam_ditem -- * Delete one or more entries from a page. * - * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t)); + * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); */ int -__bam_ditem(dbp, h, indx) - DB *dbp; +__bam_ditem(dbc, h, indx) + DBC *dbc; PAGE *h; u_int32_t indx; { BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; + DB *dbp; u_int32_t nbytes; int ret; + dbp = dbc->dbp; + switch (TYPE(h)) { case P_IBTREE: bi = GET_BINTERNAL(h, indx); @@ -304,7 +190,7 @@ __bam_ditem(dbp, h, indx) */ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && h->inp[indx] == h->inp[indx + P_INDX]) - return (__bam_adjindx(dbp, + return (__bam_adjindx(dbc, h, indx, indx + O_INDX, 0)); /* * Check for a duplicate before us on the page. It @@ -312,7 +198,7 @@ __bam_ditem(dbp, h, indx) * after the data item for the purposes of this one. */ if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) - return (__bam_adjindx(dbp, + return (__bam_adjindx(dbc, h, indx, indx - P_INDX, 0)); } /* FALLTHROUGH */ @@ -327,11 +213,11 @@ __bam_ditem(dbp, h, indx) offpage: /* Delete duplicate/offpage chains. 
*/ if (B_TYPE(bo->type) == B_DUPLICATE) { if ((ret = - __db_ddup(dbp, bo->pgno, __bam_free)) != 0) + __db_ddup(dbc, bo->pgno, __bam_free)) != 0) return (ret); } else if ((ret = - __db_doff(dbp, bo->pgno, __bam_free)) != 0) + __db_doff(dbc, bo->pgno, __bam_free)) != 0) return (ret); break; case B_KEYDATA: @@ -346,7 +232,7 @@ offpage: /* Delete duplicate/offpage chains. */ } /* Delete the item. */ - if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0) + if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) return (ret); /* Mark the page dirty. */ @@ -357,21 +243,24 @@ offpage: /* Delete duplicate/offpage chains. */ * __bam_adjindx -- * Adjust an index on the page. * - * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int)); + * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); */ int -__bam_adjindx(dbp, h, indx, indx_copy, is_insert) - DB *dbp; +__bam_adjindx(dbc, h, indx, indx_copy, is_insert) + DBC *dbc; PAGE *h; u_int32_t indx, indx_copy; int is_insert; { + DB *dbp; db_indx_t copy; int ret; + dbp = dbc->dbp; + /* Log the change. */ - if (DB_LOGGING(dbp) && - (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + if (DB_LOGGING(dbc) && + (ret = __bam_adj_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0) return (ret); @@ -402,22 +291,24 @@ __bam_adjindx(dbp, h, indx, indx_copy, is_insert) * __bam_dpage -- * Delete a page from the tree. * - * PUBLIC: int __bam_dpage __P((DB *, const DBT *)); + * PUBLIC: int __bam_dpage __P((DBC *, const DBT *)); */ int -__bam_dpage(dbp, key) - DB *dbp; +__bam_dpage(dbc, key) + DBC *dbc; const DBT *key; { - BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; PAGE *h; db_pgno_t pgno; int level; /* !!!: has to hold number of tree levels. 
*/ int exact, ret; + dbp = dbc->dbp; + cp = dbc->internal; ret = 0; - t = dbp->internal; /* * The locking protocol is that we acquire locks by walking down the @@ -433,40 +324,40 @@ __bam_dpage(dbp, key) for (level = LEAFLEVEL;; ++level) { /* Acquire a page and its parent, locked. */ if ((ret = - __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0) + __bam_search(dbc, key, S_WRPAIR, level, NULL, &exact)) != 0) return (ret); /* * If we reach the root or the page isn't going to be empty * when we delete one record, quit. */ - h = t->bt_csp[-1].page; + h = cp->csp[-1].page; if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1) break; /* Release the two locked pages. */ - (void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0); - (void)__BT_TLPUT(dbp, t->bt_csp[-1].lock); - (void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0); - (void)__BT_TLPUT(dbp, t->bt_csp[0].lock); + (void)memp_fput(dbp->mpf, cp->csp[-1].page, 0); + (void)__BT_TLPUT(dbc, cp->csp[-1].lock); + (void)memp_fput(dbp->mpf, cp->csp[0].page, 0); + (void)__BT_TLPUT(dbc, cp->csp[0].lock); } /* * Leave the stack pointer one after the last entry, we may be about * to push more items on the stack. */ - ++t->bt_csp; + ++cp->csp; /* - * t->bt_csp[-2].page is the top page, which we're not going to delete, - * and t->bt_csp[-1].page is the first page we are going to delete. + * cp->csp[-2].page is the top page, which we're not going to delete, + * and cp->csp[-1].page is the first page we are going to delete. * * Walk down the chain, acquiring the rest of the pages until we've * retrieved the leaf page. If we find any pages that aren't going * to be emptied by the delete, someone else added something while we * were walking the tree, and we discontinue the delete. */ - for (h = t->bt_csp[-1].page;;) { + for (h = cp->csp[-1].page;;) { if (ISLEAF(h)) { if (NUM_ENT(h) != 0) goto release; @@ -482,45 +373,53 @@ __bam_dpage(dbp, key) pgno = TYPE(h) == P_IBTREE ? 
GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) - goto release; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) goto release; - BT_STK_PUSH(t, h, 0, lock, ret); - if (ret != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto release; + BT_STK_PUSH(cp, h, 0, lock, ret); } - BT_STK_POP(t); - return (__bam_dpages(dbp, t)); + /* Adjust back to reference the last page on the stack. */ + BT_STK_POP(cp); + + /* Delete the pages. */ + return (__bam_dpages(dbc)); release: + /* Adjust back to reference the last page on the stack. */ + BT_STK_POP(cp); + /* Discard any locked pages and return. */ - BT_STK_POP(t); - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); + return (ret); } /* * __bam_dpages -- * Delete a set of locked pages. + * + * PUBLIC: int __bam_dpages __P((DBC *)); */ -static int -__bam_dpages(dbp, t) - DB *dbp; - BTREE *t; +int +__bam_dpages(dbc) + DBC *dbc; { + CURSOR *cp; + DB *dbp; DBT a, b; - DB_LOCK lock; + DB_LOCK c_lock, p_lock; EPG *epg; - PAGE *h; + PAGE *child, *parent; + db_indx_t nitems; db_pgno_t pgno; db_recno_t rcnt; - int ret; - - COMPQUIET(rcnt, 0); + int done, ret; - epg = t->bt_sp; + dbp = dbc->dbp; + cp = dbc->internal; + epg = cp->sp; /* * !!! @@ -533,45 +432,107 @@ __bam_dpages(dbp, t) * that we can never again access by walking down the tree. So, before * we unlink the subtree, we relink the leaf page chain. */ - if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0) + if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0) goto release; /* - * We have the entire stack of deletable pages locked. Start from the - * top of the tree and move to the bottom, as it's better to release - * the inner pages as soon as possible. + * We have the entire stack of deletable pages locked. + * + * Delete the highest page in the tree's reference to the underlying + * stack of pages. 
Then, release that page, letting the rest of the + * tree get back to business. */ - if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0) - goto release; + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) { +release: (void)__bam_stkrel(dbc, 0); + return (ret); + } + + pgno = epg->page->pgno; + nitems = NUM_ENT(epg->page); + + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbc, epg->lock); + + /* + * Free the rest of the stack of pages. + * + * !!! + * Don't bother checking for errors. We've unlinked the subtree from + * the tree, and there's no possibility of recovery outside of doing + * TXN rollback. + */ + while (++epg <= cp->csp) { + /* + * Delete page entries so they will be restored as part of + * recovery. + */ + if (NUM_ENT(epg->page) != 0) + (void)__bam_ditem(dbc, epg->page, epg->indx); + + (void)__bam_free(dbc, epg->page); + (void)__BT_TLPUT(dbc, epg->lock); + } + BT_STK_CLR(cp); + + /* + * Try and collapse the tree a level -- this is only applicable + * if we've deleted the next-to-last element from the root page. + * + * There are two cases when collapsing a tree. + * + * If we've just deleted the last item from the root page, there is no + * further work to be done. The code above has emptied the root page + * and freed all pages below it. + */ + if (pgno != PGNO_ROOT || nitems != 1) + return (0); /* - * If we just deleted the last or next-to-last item from the root page, - * the tree can collapse a level. Write lock the last page referenced + * If we just deleted the next-to-last item from the root page, the + * tree can collapse one or more levels. While there remains only a + * single item on the root page, write lock the last page referenced * by the root page and copy it over the root page. If we can't get a - * write lock, that's okay, the tree just remains a level deeper than - * we'd like. + * write lock, that's okay, the tree just stays deeper than we'd like. 
*/ - h = epg->page; - if (h->pgno == PGNO_ROOT && NUM_ENT(h) <= 1) { - pgno = TYPE(epg->page) == P_IBTREE ? - GET_BINTERNAL(epg->page, 0)->pgno : - GET_RINTERNAL(epg->page, 0)->pgno; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) - goto release; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) - goto release; + for (done = 0; !done;) { + /* Initialize. */ + parent = child = NULL; + p_lock = c_lock = LOCK_INVALID; + + /* Lock the root. */ + pgno = PGNO_ROOT; + if ((ret = + __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &p_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0) + goto stop; + + if (NUM_ENT(parent) != 1 || + (TYPE(parent) != P_IBTREE && TYPE(parent) != P_IRECNO)) + goto stop; + + pgno = TYPE(parent) == P_IBTREE ? + GET_BINTERNAL(parent, 0)->pgno : + GET_RINTERNAL(parent, 0)->pgno; + + /* Lock the child page. */ + if ((ret = + __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &c_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0) + goto stop; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { memset(&a, 0, sizeof(a)); - a.data = h; + a.data = child; a.size = dbp->pgsize; memset(&b, 0, sizeof(b)); - b.data = P_ENTRY(epg->page, 0); + b.data = P_ENTRY(parent, 0); b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); - __bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn, - &h->lsn, 0, dbp->log_fileid, h->pgno, &a, - RE_NREC(epg->page), &b, &epg->page->lsn); + __bam_rsplit_log(dbp->dbenv->lg_info, dbc->txn, + &child->lsn, 0, dbp->log_fileid, child->pgno, &a, + RE_NREC(parent), &b, &parent->lsn); } /* @@ -579,69 +540,50 @@ __bam_dpages(dbp, t) * * One fixup -- if the tree has record numbers and we're not * converting to a leaf page, we have to preserve the total - * record count. + * record count. Note that we are about to overwrite everything + * on the parent, including its LSN. 
This is actually OK, + * because the above log message, which describes this update, + * stores its LSN on the child page. When the child is copied + * to the parent, the correct LSN is going to copied into + * place in the parent. */ - if (TYPE(h) == P_IRECNO || - (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) - rcnt = RE_NREC(epg->page); - memcpy(epg->page, h, dbp->pgsize); - epg->page->pgno = PGNO_ROOT; - if (TYPE(h) == P_IRECNO || - (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) - RE_NREC_SET(epg->page, rcnt); - (void)memp_fset(dbp->mpf, epg->page, DB_MPOOL_DIRTY); + COMPQUIET(rcnt, 0); + if (TYPE(child) == P_IRECNO || + (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + rcnt = RE_NREC(parent); + memcpy(parent, child, dbp->pgsize); + parent->pgno = PGNO_ROOT; + if (TYPE(child) == P_IRECNO || + (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + RE_NREC_SET(parent, rcnt); + + /* Mark the pages dirty. */ + memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY); + memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY); + + /* Adjust the cursors. */ + __bam_ca_rsplit(dbp, child->pgno, PGNO_ROOT); /* * Free the page copied onto the root page and discard its * lock. (The call to __bam_free() discards our reference * to the page.) - * - * It's possible that the reverse split we're doing involves - * pages from the stack of pages we're deleting. Don't free - * the page twice. */ - if (h->pgno == (epg + 1)->page->pgno) - (void)memp_fput(dbp->mpf, h, 0); - else { - (void)__bam_free(dbp, h); - ++t->lstat.bt_freed; - } - (void)__BT_TLPUT(dbp, lock); + (void)__bam_free(dbc, child); + child = NULL; - /* Adjust the cursors. 
*/ - __bam_ca_move(dbp, h->pgno, PGNO_ROOT); + if (0) { +stop: done = 1; + } + if (p_lock != LOCK_INVALID) + (void)__BT_TLPUT(dbc, p_lock); + if (parent != NULL) + memp_fput(dbp->mpf, parent, 0); + if (c_lock != LOCK_INVALID) + (void)__BT_TLPUT(dbc, c_lock); + if (child != NULL) + memp_fput(dbp->mpf, child, 0); } - /* Release the top page in the subtree. */ - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); - - /* - * Free the rest of the pages. - * - * XXX - * Don't bother checking for errors. We've unlinked the subtree from - * the tree, and there's no possibility of recovery. - */ - while (++epg <= t->bt_csp) { - /* - * XXX - * Why do we need to do this? Isn't the page already empty? - */ - if (NUM_ENT(epg->page) != 0) - (void)__bam_ditem(dbp, epg->page, epg->indx); - - (void)__bam_free(dbp, epg->page); - (void)__BT_TLPUT(dbp, epg->lock); - ++t->lstat.bt_freed; - } return (0); - -release: - /* Discard any remaining pages and return. */ - for (; epg <= t->bt_csp; ++epg) { - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); - } - return (ret); } diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c index f5974ec61e..a89cfccb97 100644 --- a/db2/btree/bt_open.c +++ b/db2/btree/bt_open.c @@ -47,17 +47,9 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_open.c 10.27 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_open.c 10.39 (Sleepycat) 11/21/98"; #endif /* not lint */ -/* - * Implementation of btree access method for 4.4BSD. - * - * The design here was originally based on that of the btree access method - * used in the Postgres database system at UC Berkeley. This implementation - * is wholly independent of the Postgres code. 
- */ - #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> @@ -70,40 +62,34 @@ static const char sccsid[] = "@(#)bt_open.c 10.27 (Sleepycat) 5/6/98"; #include "db_page.h" #include "btree.h" -static int __bam_keyalloc __P((BTREE *)); -static int __bam_setmeta __P((DB *, BTREE *)); - /* * __bam_open -- * Open a btree. * - * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *)); + * PUBLIC: int __bam_open __P((DB *, DB_INFO *)); */ int -__bam_open(dbp, type, dbinfo) +__bam_open(dbp, dbinfo) DB *dbp; - DBTYPE type; DB_INFO *dbinfo; { BTREE *t; int ret; - /* Allocate the btree internal structure. */ - if ((t = (BTREE *)__db_calloc(1, sizeof(BTREE))) == NULL) - return (ENOMEM); - - t->bt_sp = t->bt_csp = t->bt_stack; - t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); - - if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) && - (ret = __bam_keyalloc(t)) != 0) - goto err; + /* Allocate and initialize the private btree structure. */ + if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->internal = t; /* * Intention is to make sure all of the user's selections are okay * here and then use them without checking. */ - if (dbinfo != NULL) { + if (dbinfo == NULL) { + t->bt_minkey = DEFMINKEYPAGE; + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + } else { /* Minimum number of keys per page. */ if (dbinfo->bt_minkey == 0) t->bt_minkey = DEFMINKEYPAGE; @@ -126,152 +112,125 @@ __bam_open(dbp, type, dbinfo) * If no comparison, use default comparison. If no comparison * and no prefix, use default prefix. (We can't default the * prefix if the user supplies a comparison routine; shortening - * the keys may break their comparison algorithm.) + * the keys may break their comparison algorithm. We don't + * permit the user to specify a prefix routine if they didn't + * also specify a comparison routine, they can't know enough + * about our comparison routine to get it right.) */ - t->bt_compare = dbinfo->bt_compare == NULL ? 
- __bam_defcmp : dbinfo->bt_compare; - t->bt_prefix = dbinfo->bt_prefix == NULL ? - (dbinfo->bt_compare == NULL ? - __bam_defpfx : NULL) : dbinfo->bt_prefix; - } else { - t->bt_minkey = DEFMINKEYPAGE; - t->bt_compare = __bam_defcmp; - t->bt_prefix = __bam_defpfx; + if ((t->bt_compare = dbinfo->bt_compare) == NULL) { + if (dbinfo->bt_prefix != NULL) + goto einval; + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + } else + t->bt_prefix = dbinfo->bt_prefix; } - /* Initialize the remaining fields of the DB. */ - dbp->type = type; - dbp->internal = t; - dbp->cursor = __bam_cursor; + /* Initialize the remaining fields/methods of the DB. */ + dbp->am_close = __bam_close; dbp->del = __bam_delete; - dbp->get = __bam_get; - dbp->put = __bam_put; dbp->stat = __bam_stat; - dbp->sync = __bam_sync; - - /* - * The btree data structure requires that at least two key/data pairs - * can fit on a page, but other than that there's no fixed requirement. - * Translate the minimum number of items into the bytes a key/data pair - * can use before being placed on an overflow page. We calculate for - * the worst possible alignment by assuming every item requires the - * maximum alignment for padding. - * - * Recno uses the btree bt_ovflsize value -- it's close enough. - */ - t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX) - - (BKEYDATA_PSIZE(0) + ALIGN(1, 4)); - /* Create a root page if new tree. */ - if ((ret = __bam_setmeta(dbp, t)) != 0) + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp)) != 0) goto err; + /* Set the overflow page size. */ + __bam_setovflsize(dbp); + return (0); einval: ret = EINVAL; -err: if (t != NULL) { - /* If we allocated room for key/data return, discard it. */ - if (t->bt_rkey.data != NULL) - __db_free(t->bt_rkey.data); - - FREE(t, sizeof(BTREE)); - } +err: __os_free(t, sizeof(BTREE)); return (ret); } /* - * __bam_bdup -- - * Create a BTREE handle for a threaded DB handle. + * __bam_close -- + * Close a btree. 
* - * PUBLIC: int __bam_bdup __P((DB *, DB *)); + * PUBLIC: int __bam_close __P((DB *)); */ int -__bam_bdup(orig, new) - DB *orig, *new; +__bam_close(dbp) + DB *dbp; { - BTREE *t, *ot; - int ret; - - ot = orig->internal; - - if ((t = (BTREE *)__db_calloc(1, sizeof(*t))) == NULL) - return (ENOMEM); - - /* - * !!! - * Ignore the cursor queue, only the first DB has attached cursors. - */ + __os_free(dbp->internal, sizeof(BTREE)); + dbp->internal = NULL; - t->bt_sp = t->bt_csp = t->bt_stack; - t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); + return (0); +} - if ((orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) && - (ret = __bam_keyalloc(t)) != 0) { - FREE(t, sizeof(*t)); - return (ret); - } +/* + * __bam_setovflsize -- + * + * PUBLIC: void __bam_setovflsize __P((DB *)); + */ +void +__bam_setovflsize(dbp) + DB *dbp; +{ + BTREE *t; - t->bt_maxkey = ot->bt_maxkey; - t->bt_minkey = ot->bt_minkey; - t->bt_compare = ot->bt_compare; - t->bt_prefix = ot->bt_prefix; - t->bt_ovflsize = ot->bt_ovflsize; + t = dbp->internal; /* * !!! - * The entire RECNO structure is shared. If it breaks, the application - * was misusing it to start with. + * Correction for recno, which doesn't know anything about minimum + * keys per page. */ - t->bt_recno = ot->bt_recno; - - new->internal = t; - - return (0); -} + if (t->bt_minkey == 0) + t->bt_minkey = DEFMINKEYPAGE; -/* - * __bam_keyalloc -- - * Allocate return memory for recno keys. - */ -static int -__bam_keyalloc(t) - BTREE *t; -{ /* - * Recno keys are always the same size, and we don't want to have - * to check for space on each return. Allocate it now. + * The btree data structure requires that at least two key/data pairs + * can fit on a page, but other than that there's no fixed requirement. + * Translate the minimum number of items into the bytes a key/data pair + * can use before being placed on an overflow page. 
We calculate for + * the worst possible alignment by assuming every item requires the + * maximum alignment for padding. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. */ - if ((t->bt_rkey.data = (void *)__db_malloc(sizeof(db_recno_t))) == NULL) - return (ENOMEM); - t->bt_rkey.ulen = sizeof(db_recno_t); - return (0); + t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX) + - (BKEYDATA_PSIZE(0) + ALIGN(1, 4)); } /* - * __bam_setmeta -- + * __bam_read_root -- * Check (and optionally create) a tree. + * + * PUBLIC: int __bam_read_root __P((DB *)); */ -static int -__bam_setmeta(dbp, t) +int +__bam_read_root(dbp) DB *dbp; - BTREE *t; { BTMETA *meta; - PAGE *root; + BTREE *t; + DBC *dbc; DB_LOCK metalock, rootlock; + PAGE *root; db_pgno_t pgno; - int ret; + int ret, t_ret; + + ret = 0; + t = dbp->internal; + + /* Get a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); /* Get, and optionally create the metadata page. */ pgno = PGNO_METADATA; if ((ret = - __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0) - return (ret); + __bam_lget(dbc, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0) + goto err; if ((ret = - __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) { - (void)__BT_LPUT(dbp, metalock); - return (ret); + memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0) { + (void)__BT_LPUT(dbc, metalock); + goto err; } /* @@ -284,8 +243,8 @@ __bam_setmeta(dbp, t) t->bt_minkey = meta->minkey; (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); - (void)__BT_LPUT(dbp, metalock); - return (0); + (void)__BT_LPUT(dbc, metalock); + goto done; } /* Initialize the tree structure metadata information. */ @@ -308,16 +267,16 @@ __bam_setmeta(dbp, t) F_SET(meta, BTM_RECNUM); if (F_ISSET(dbp, DB_RE_RENUMBER)) F_SET(meta, BTM_RENUMBER); - memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN); + memcpy(meta->uid, dbp->fileid, DB_FILE_ID_LEN); /* Create and initialize a root page. 
*/ pgno = PGNO_ROOT; if ((ret = - __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0) - return (ret); - if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) { - (void)__BT_LPUT(dbp, rootlock); - return (ret); + __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &root)) != 0) { + (void)__BT_LPUT(dbc, rootlock); + goto err; } P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE); @@ -325,9 +284,9 @@ __bam_setmeta(dbp, t) /* Release the metadata and root pages. */ if ((ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0) - return (ret); + goto err; if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0) - return (ret); + goto err; /* * Flush the metadata and root pages to disk -- since the user can't @@ -341,8 +300,11 @@ __bam_setmeta(dbp, t) ret = EINVAL; /* Release the locks. */ - (void)__BT_LPUT(dbp, metalock); - (void)__BT_LPUT(dbp, rootlock); + (void)__BT_LPUT(dbc, metalock); + (void)__BT_LPUT(dbc, rootlock); +err: +done: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c index 87f2811398..6ccd68a5ab 100644 --- a/db2/btree/bt_page.c +++ b/db2/btree/bt_page.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_page.c 10.12 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_page.c 10.17 (Sleepycat) 1/3/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -65,45 +65,47 @@ static const char sccsid[] = "@(#)bt_page.c 10.12 (Sleepycat) 5/6/98"; * __bam_new -- * Get a new page, preferably from the freelist. 
* - * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **)); + * PUBLIC: int __bam_new __P((DBC *, u_int32_t, PAGE **)); */ int -__bam_new(dbp, type, pagepp) - DB *dbp; +__bam_new(dbc, type, pagepp) + DBC *dbc; u_int32_t type; PAGE **pagepp; { BTMETA *meta; + DB *dbp; DB_LOCK metalock; PAGE *h; db_pgno_t pgno; int ret; + dbp = dbc->dbp; meta = NULL; h = NULL; metalock = LOCK_INVALID; pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) goto err; if (meta->free == PGNO_INVALID) { - if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_NEW, &h)) != 0) goto err; ZERO_LSN(h->lsn); h->pgno = pgno; } else { pgno = meta->free; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto err; meta->free = h->next_pgno; } /* Log the change. */ - if (DB_LOGGING(dbp)) { - if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn, + if (DB_LOGGING(dbc)) { + if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbc->txn, &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn, h->pgno, (u_int32_t)type, meta->free)) != 0) goto err; @@ -111,7 +113,7 @@ __bam_new(dbp, type, pagepp) } (void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); *pagepp = h; @@ -122,28 +124,45 @@ err: if (h != NULL) if (meta != NULL) (void)memp_fput(dbp->mpf, meta, 0); if (metalock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); return (ret); } /* + * __bam_lput -- + * The standard lock put call. 
+ * + * PUBLIC: int __bam_lput __P((DBC *, DB_LOCK)); + */ +int +__bam_lput(dbc, lock) + DBC *dbc; + DB_LOCK lock; +{ + return (__BT_LPUT(dbc, lock)); +} + +/* * __bam_free -- * Add a page to the head of the freelist. * - * PUBLIC: int __bam_free __P((DB *, PAGE *)); + * PUBLIC: int __bam_free __P((DBC *, PAGE *)); */ int -__bam_free(dbp, h) - DB *dbp; +__bam_free(dbc, h) + DBC *dbc; PAGE *h; { BTMETA *meta; + DB *dbp; DBT ldbt; DB_LOCK metalock; db_pgno_t pgno; u_int32_t dirty_flag; int ret, t_ret; + dbp = dbc->dbp; + /* * Retrieve the metadata page and insert the page at the head of * the free list. If either the lock get or page get routines @@ -152,23 +171,23 @@ __bam_free(dbp, h) */ dirty_flag = 0; pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) { - (void)__BT_TLPUT(dbp, metalock); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) { + (void)__BT_TLPUT(dbc, metalock); goto err; } /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { memset(&ldbt, 0, sizeof(ldbt)); ldbt.data = h; ldbt.size = P_OVERHEAD; if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info, - dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno, + dbc->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno, &meta->lsn, &ldbt, meta->free)) != 0) { (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); return (ret); } LSN(h) = LSN(meta); @@ -182,7 +201,7 @@ __bam_free(dbp, h) { db_pgno_t __pgno; DB_LSN __lsn; __pgno = h->pgno; __lsn = h->lsn; - memset(h, 0xff, dbp->pgsize); + memset(h, 0xdb, dbp->pgsize); h->pgno = __pgno; h->lsn = __lsn; } @@ -194,7 +213,7 @@ __bam_free(dbp, h) /* Discard the metadata page. 
*/ ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); - if ((t_ret = __BT_TLPUT(dbp, metalock)) != 0) + if ((t_ret = __BT_TLPUT(dbc, metalock)) != 0) ret = t_ret; /* Discard the caller's page reference. */ @@ -212,19 +231,21 @@ err: if ((t_ret = memp_fput(dbp->mpf, h, dirty_flag)) != 0 && ret == 0) #ifdef DEBUG /* * __bam_lt -- - * Print out the list of currently held locks. + * Print out the list of locks currently held by a cursor. * - * PUBLIC: int __bam_lt __P((DB *)); + * PUBLIC: int __bam_lt __P((DBC *)); */ int -__bam_lt(dbp) - DB *dbp; +__bam_lt(dbc) + DBC *dbc; { + DB *dbp; DB_LOCKREQ req; + dbp = dbc->dbp; if (F_ISSET(dbp, DB_AM_LOCKING)) { req.op = DB_LOCK_DUMP; - lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &req, 1, NULL); + lock_vec(dbp->dbenv->lk_info, dbc->locker, 0, &req, 1, NULL); } return (0); } @@ -234,27 +255,29 @@ __bam_lt(dbp) * __bam_lget -- * The standard lock get call. * - * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); + * PUBLIC: int __bam_lget + * PUBLIC: __P((DBC *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); */ int -__bam_lget(dbp, do_couple, pgno, mode, lockp) - DB *dbp; +__bam_lget(dbc, do_couple, pgno, mode, lockp) + DBC *dbc; int do_couple; db_pgno_t pgno; db_lockmode_t mode; DB_LOCK *lockp; { + DB *dbp; DB_LOCKREQ couple[2]; - u_int32_t locker; int ret; + dbp = dbc->dbp; + if (!F_ISSET(dbp, DB_AM_LOCKING)) { *lockp = LOCK_INVALID; return (0); } - locker = dbp->txn == NULL ? 
dbp->locker : dbp->txn->txnid; - dbp->lock.pgno = pgno; + dbc->lock.pgno = pgno; /* * If the object not currently locked, acquire the lock and return, @@ -263,54 +286,32 @@ __bam_lget(dbp, do_couple, pgno, mode, lockp) */ if (do_couple) { couple[0].op = DB_LOCK_GET; - couple[0].obj = &dbp->lock_dbt; + couple[0].obj = &dbc->lock_dbt; couple[0].mode = mode; couple[1].op = DB_LOCK_PUT; couple[1].lock = *lockp; - ret = lock_vec(dbp->dbenv->lk_info, locker, 0, couple, 2, NULL); + if (dbc->txn == NULL) + ret = lock_vec(dbp->dbenv->lk_info, + dbc->locker, 0, couple, 2, NULL); + else + ret = lock_tvec(dbp->dbenv->lk_info, + dbc->txn, 0, couple, 2, NULL); if (ret != 0) { /* If we fail, discard the lock we held. */ - __bam_lput(dbp, *lockp); + __BT_LPUT(dbc, *lockp); return (ret < 0 ? EAGAIN : ret); } *lockp = couple[0].lock; } else { - ret = lock_get(dbp->dbenv->lk_info, - locker, 0, &dbp->lock_dbt, mode, lockp); + if (dbc->txn == NULL) + ret = lock_get(dbp->dbenv->lk_info, + dbc->locker, 0, &dbc->lock_dbt, mode, lockp); + else + ret = lock_tget(dbp->dbenv->lk_info, + dbc->txn, 0, &dbc->lock_dbt, mode, lockp); return (ret < 0 ? EAGAIN : ret); } return (0); } - -/* - * __bam_lput -- - * The standard lock put call. - * - * PUBLIC: int __bam_lput __P((DB *, DB_LOCK)); - */ -int -__bam_lput(dbp, lock) - DB *dbp; - DB_LOCK lock; -{ - return (__BT_LPUT(dbp, lock)); -} - -/* - * __bam_pget -- - * The standard page get call. - * - * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, u_int32_t)); - */ -int -__bam_pget(dbp, hp, pgnop, mpool_flags) - DB *dbp; - PAGE **hp; - db_pgno_t *pgnop; - u_int32_t mpool_flags; -{ - return (memp_fget((dbp)->mpf, - pgnop, mpool_flags, hp) == 0 ? 
0 : __db_pgerr(dbp, *pgnop)); -} diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c index a93faac98c..0d7a69889a 100644 --- a/db2/btree/bt_put.c +++ b/db2/btree/bt_put.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98"; +static const char sccsid[] = "@(#)bt_put.c 10.54 (Sleepycat) 12/6/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -61,372 +61,23 @@ static const char sccsid[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98"; #include "db_page.h" #include "btree.h" -static int __bam_fixed __P((BTREE *, DBT *)); -static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *)); -static int __bam_lookup __P((DB *, DBT *, int *)); -static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); -static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *)); -static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t)); +static int __bam_fixed __P((DBC *, DBT *)); +static int __bam_ndup __P((DBC *, PAGE *, u_int32_t)); +static int __bam_ovput __P((DBC *, PAGE *, u_int32_t, DBT *)); +static int __bam_partial __P((DBC *, + DBT *, PAGE *, u_int32_t, u_int32_t, u_int32_t)); static u_int32_t __bam_partsize __P((DBT *, PAGE *, u_int32_t)); /* - * __bam_put -- - * Add a new key/data pair or replace an existing pair (btree). - * - * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - */ -int -__bam_put(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - BTREE *t; - CURSOR c; - DB *dbp; - PAGE *h; - db_indx_t indx; - u_int32_t iitem_flags, insert_flags; - int exact, isdeleted, newkey, ret, stack; - - DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags); - - /* Check flags. */ - if ((ret = __db_putchk(argdbp, key, data, flags, - F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0) - return (ret); - - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - -retry: /* - * Find the location at which to insert. 
The call to __bam_lookup - * leaves the returned page pinned. - */ - if ((ret = __bam_lookup(dbp, key, &exact)) != 0) { - PUTHANDLE(dbp); - return (ret); - } - h = t->bt_csp->page; - indx = t->bt_csp->indx; - stack = 1; - - /* - * If DB_NOOVERWRITE is set and there's an identical key in the tree, - * return an error unless the data item has already been marked for - * deletion, or, all the remaining data items have already been marked - * for deletion in the case of duplicates. If all the data items have - * been marked for deletion, we do a replace, otherwise, it has to be - * a set of duplicates, and we simply append a new one to the set. - */ - isdeleted = 0; - if (exact) { - if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0) - goto err; - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); - else - if (flags == DB_NOOVERWRITE) { - ret = DB_KEYEXIST; - goto err; - } - } - - /* - * If we're inserting into the first or last page of the tree, - * remember where we did it so we can do fast lookup next time. - * - * XXX - * Does reverse order still work (did it ever!?!?) - */ - t->bt_lpgno = - h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ? - h->pgno : PGNO_INVALID; - - /* - * Select the arguments for __bam_iitem() and do the insert. If the - * key is an exact match, we're either adding a new duplicate at the - * end of the duplicate set, or we're replacing the data item with a - * new data item. If the key isn't an exact match, we're inserting - * a new key/data pair, before the search location. - */ - newkey = dbp->type == DB_BTREE && !exact; - if (exact) { - if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) { - /* - * Make sure that we're not looking at a page of - * duplicates -- if so, move to the last entry on - * that page. 
- */ - c.page = h; - c.pgno = h->pgno; - c.indx = indx; - c.dpgno = PGNO_INVALID; - c.dindx = 0; - if ((ret = - __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0) - goto err; - if (c.dpgno != PGNO_INVALID) { - /* - * XXX - * The __bam_ovfl_chk() routine memp_fput() the - * current page and acquired a new one, but did - * not do anything about the lock we're holding. - */ - t->bt_csp->page = h = c.page; - indx = c.dindx; - } - insert_flags = DB_AFTER; - } else - insert_flags = DB_CURRENT; - } else - insert_flags = DB_BEFORE; - - /* - * The pages we're using may be modified by __bam_iitem(), so make - * sure we reset the stack. - */ - iitem_flags = 0; - if (newkey) - iitem_flags |= BI_NEWKEY; - if (isdeleted) - iitem_flags |= BI_DOINCR; - ret = __bam_iitem(dbp, &h, &indx, key, data, insert_flags, iitem_flags); - t->bt_csp->page = h; - t->bt_csp->indx = indx; - - switch (ret) { - case 0: - /* Done. Clean up the cursor. */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); - break; - case DB_NEEDSPLIT: - /* - * We have to split the page. Back out the cursor setup, - * discard the stack of pages, and do the split. - */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - - (void)__bam_stkrel(dbp); - stack = 0; - - if ((ret = __bam_split(dbp, key)) != 0) - break; - - goto retry; - /* NOTREACHED */ - default: - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - break; - } - -err: if (stack) - (void)__bam_stkrel(dbp); - - PUTHANDLE(dbp); - return (ret); -} - -/* - * __bam_isdeleted -- - * Return if the only remaining data item for the element has been - * deleted. 
- */ -static int -__bam_isdeleted(dbp, h, indx, isdeletedp) - DB *dbp; - PAGE *h; - u_int32_t indx; - int *isdeletedp; -{ - BKEYDATA *bk; - db_pgno_t pgno; - int ret; - - *isdeletedp = 1; - for (;;) { - bk = GET_BKEYDATA(h, indx + O_INDX); - switch (B_TYPE(bk->type)) { - case B_KEYDATA: - case B_OVERFLOW: - if (!B_DISSET(bk->type)) { - *isdeletedp = 0; - return (0); - } - break; - case B_DUPLICATE: - /* - * If the data item referencing the off-page duplicates - * is flagged as deleted, we're done. Else, we have to - * walk the chain of duplicate pages. - */ - if (B_DISSET(bk->type)) - return (0); - goto dupchk; - default: - return (__db_pgfmt(dbp, h->pgno)); - } - - /* - * If there are no more on-page duplicate items, then every - * data item for this key must have been deleted. - */ - if (indx + P_INDX >= (u_int32_t)NUM_ENT(h)) - return (0); - if (h->inp[indx] != h->inp[indx + P_INDX]) - return (0); - - /* Check the next item. */ - indx += P_INDX; - } - /* NOTREACHED */ - -dupchk: /* Check a chain of duplicate pages. */ - pgno = ((BOVERFLOW *)bk)->pgno; - for (;;) { - /* Acquire the next page in the duplicate chain. */ - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) - return (ret); - - /* Check each item for a delete flag. */ - for (indx = 0; indx < NUM_ENT(h); ++indx) - if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) { - *isdeletedp = 0; - goto done; - } - /* - * If we reach the end of the duplicate pages, then every - * item we reviewed must have been deleted. - */ - if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID) - goto done; - - (void)memp_fput(dbp->mpf, h, 0); - } - /* NOTREACHED */ - -done: (void)memp_fput(dbp->mpf, h, 0); - return (0); -} - -/* - * __bam_lookup -- - * Find the right location in the tree for the key. 
- */ -static int -__bam_lookup(dbp, key, exactp) - DB *dbp; - DBT *key; - int *exactp; -{ - BTREE *t; - DB_LOCK lock; - EPG e; - PAGE *h; - db_indx_t indx; - int cmp, ret; - - t = dbp->internal; - h = NULL; - - /* - * Record numbers can't be fast-tracked, we have to lock the entire - * tree. - */ - if (F_ISSET(dbp, DB_BT_RECNUM)) - goto slow; - - /* Check to see if we've been seeing sorted input. */ - if (t->bt_lpgno == PGNO_INVALID) - goto slow; - - /* - * Retrieve the page on which we did the last insert. It's okay if - * it doesn't exist, or if it's not the page type we expect, it just - * means that the world changed. - */ - if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) - goto miss; - if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) { - (void)__BT_LPUT(dbp, lock); - goto miss; - } - if (TYPE(h) != P_LBTREE) - goto miss; - if (NUM_ENT(h) == 0) - goto miss; - - /* - * We have to be at the end or beginning of the tree to know that - * we're inserting in a sort order. If that's the case and we're - * in the right order in comparison to the first/last key/data pair, - * we have the right position. - */ - if (h->next_pgno == PGNO_INVALID) { - e.page = h; - e.indx = NUM_ENT(h) - P_INDX; - if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) { - if (cmp > 0) - e.indx += P_INDX; - goto fast; - } - } - if (h->prev_pgno == PGNO_INVALID) { - e.page = h; - e.indx = 0; - if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) { - /* - * We're doing a put, so we want to insert as the last - * of any set of duplicates. - */ - if (cmp == 0) { - for (indx = 0; - indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && - h->inp[indx] == h->inp[indx + P_INDX]; - indx += P_INDX) - ; - e.indx = indx; - } - goto fast; - } - } - goto miss; - - /* Set the exact match flag in case we've already inserted this key. */ -fast: *exactp = cmp == 0; - - /* Enter the entry in the stack. 
*/ - BT_STK_CLR(t); - BT_STK_ENTER(t, e.page, e.indx, lock, ret); - if (ret != 0) - return (ret); - - ++t->lstat.bt_cache_hit; - return (0); - -miss: ++t->lstat.bt_cache_miss; - if (h != NULL) { - (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); - } - -slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp)); -} - -/* * __bam_iitem -- * Insert an item into the tree. * - * PUBLIC: int __bam_iitem __P((DB *, + * PUBLIC: int __bam_iitem __P((DBC *, * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t)); */ int -__bam_iitem(dbp, hp, indxp, key, data, op, flags) - DB *dbp; +__bam_iitem(dbc, hp, indxp, key, data, op, flags) + DBC *dbc; PAGE **hp; db_indx_t *indxp; DBT *key, *data; @@ -434,6 +85,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) { BTREE *t; BKEYDATA *bk; + DB *dbp; DBT tdbt; PAGE *h; db_indx_t indx, nbytes; @@ -442,6 +94,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) COMPQUIET(bk, NULL); + dbp = dbc->dbp; t = dbp->internal; h = *hp; indx = *indxp; @@ -473,21 +126,21 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) default: return (__db_pgfmt(dbp, h->pgno)); } - if ((ret = __db_ditem(dbp, *hp, *indxp, nbytes)) != 0) + if ((ret = __db_ditem(dbc, *hp, *indxp, nbytes)) != 0) return (ret); } /* Put the new/replacement item onto the page. */ - if ((ret = __db_dput(dbp, data, hp, indxp, __bam_new)) != 0) + if ((ret = __db_dput(dbc, data, hp, indxp, __bam_new)) != 0) return (ret); goto done; } /* Handle fixed-length records: build the real record. */ - if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) { + if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->recno->re_len) { tdbt = *data; - if ((ret = __bam_fixed(t, &tdbt)) != 0) + if ((ret = __bam_fixed(dbc, &tdbt)) != 0) return (ret); data = &tdbt; } @@ -554,7 +207,8 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Handle partial puts: build the real record. 
*/ if (F_ISSET(data, DB_DBT_PARTIAL)) { tdbt = *data; - if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0) + if ((ret = __bam_partial(dbc, + &tdbt, h, indx, data_size, flags)) != 0) return (ret); data = &tdbt; } @@ -583,10 +237,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Add the key. */ if (bigkey) { - if ((ret = __bam_ovput(dbp, h, indx, key)) != 0) + if ((ret = __bam_ovput(dbc, h, indx, key)) != 0) return (ret); } else - if ((ret = __db_pitem(dbp, h, indx, + if ((ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(key->size), NULL, key)) != 0) return (ret); ++indx; @@ -598,7 +252,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * Adjust the cursor and copy in the key for * the duplicate. */ - if ((ret = __bam_adjindx(dbp, + if ((ret = __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0) return (ret); @@ -620,7 +274,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * the duplicate. */ if ((ret = - __bam_adjindx(dbp, h, indx, indx, 1)) != 0) + __bam_adjindx(dbc, h, indx, indx, 1)) != 0) return (ret); ++indx; @@ -639,7 +293,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * delete and then re-add the item. */ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) return (ret); break; } @@ -654,7 +308,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Add the data. 
*/ if (bigdata) { - if ((ret = __bam_ovput(dbp, h, indx, data)) != 0) + if ((ret = __bam_ovput(dbc, h, indx, data)) != 0) return (ret); } else { BKEYDATA __bk; @@ -665,12 +319,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) __bk.len = data->size; __hdr.data = &__bk; __hdr.size = SSZA(BKEYDATA, data); - ret = __db_pitem(dbp, h, indx, + ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), &__hdr, data); } else if (replace) - ret = __bam_ritem(dbp, h, indx, data); + ret = __bam_ritem(dbc, h, indx, data); else - ret = __db_pitem(dbp, h, indx, + ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), NULL, data); if (ret != 0) return (ret); @@ -686,7 +340,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) */ if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { --indx; - if ((ret = __bam_ndup(dbp, h, indx)) != 0) + if ((ret = __bam_ndup(dbc, h, indx)) != 0) return (ret); } @@ -700,14 +354,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) done: if (LF_ISSET(BI_DOINCR) || (op != DB_CURRENT && (F_ISSET(dbp, DB_BT_RECNUM) || dbp->type == DB_RECNO))) - if ((ret = __bam_adjust(dbp, t, 1)) != 0) + if ((ret = __bam_adjust(dbc, 1)) != 0) return (ret); /* If we've modified a recno file, set the flag */ - if (t->bt_recno != NULL) - F_SET(t->bt_recno, RECNO_MODIFIED); - - ++t->lstat.bt_added; + if (t->recno != NULL) + F_SET(t->recno, RECNO_MODIFIED); return (ret); } @@ -770,7 +422,7 @@ __bam_partsize(data, h, indx) memset(&__hdr, 0, sizeof(__hdr)); \ __hdr.data = &bo; \ __hdr.size = BOVERFLOW_SIZE; \ - if ((ret = __db_pitem(dbp, \ + if ((ret = __db_pitem(dbc, \ h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ return (ret); \ } while (0) @@ -780,8 +432,8 @@ __bam_partsize(data, h, indx) * Build an overflow item and put it on the page. 
*/ static int -__bam_ovput(dbp, h, indx, item) - DB *dbp; +__bam_ovput(dbc, h, indx, item) + DBC *dbc; PAGE *h; u_int32_t indx; DBT *item; @@ -789,10 +441,12 @@ __bam_ovput(dbp, h, indx, item) BOVERFLOW bo; int ret; + UMRW(bo.unused1); B_TSET(bo.type, B_OVERFLOW, 0); - bo.tlen = item->size; - if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0) + UMRW(bo.unused2); + if ((ret = __db_poff(dbc, item, &bo.pgno, __bam_new)) != 0) return (ret); + bo.tlen = item->size; OVPUT(h, indx, bo); @@ -803,22 +457,25 @@ __bam_ovput(dbp, h, indx, item) * __bam_ritem -- * Replace an item on a page. * - * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *)); + * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *)); */ int -__bam_ritem(dbp, h, indx, data) - DB *dbp; +__bam_ritem(dbc, h, indx, data) + DBC *dbc; PAGE *h; u_int32_t indx; DBT *data; { BKEYDATA *bk; + DB *dbp; DBT orig, repl; db_indx_t cnt, lo, ln, min, off, prefix, suffix; int32_t nbytes; int ret; u_int8_t *p, *t; + dbp = dbc->dbp; + /* * Replace a single item onto a page. The logic figuring out where * to insert and whether it fits is handled in the caller. All we do @@ -827,7 +484,7 @@ __bam_ritem(dbp, h, indx, data) bk = GET_BKEYDATA(h, indx); /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { /* * We might as well check to see if the two data items share * a common prefix and suffix -- it can save us a lot of log @@ -851,7 +508,7 @@ __bam_ritem(dbp, h, indx, data) orig.size = bk->len - (prefix + suffix); repl.data = (u_int8_t *)data->data + prefix; repl.size = data->size - (prefix + suffix); - if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type), &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0) @@ -907,18 +564,21 @@ __bam_ritem(dbp, h, indx, data) * If it should, create it. 
*/ static int -__bam_ndup(dbp, h, indx) - DB *dbp; +__bam_ndup(dbc, h, indx) + DBC *dbc; PAGE *h; u_int32_t indx; { BKEYDATA *bk; BOVERFLOW bo; + DB *dbp; DBT hdr; PAGE *cp; db_indx_t cnt, cpindx, first, sz; int ret; + dbp = dbc->dbp; + while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) indx -= P_INDX; for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) { @@ -941,7 +601,7 @@ __bam_ndup(dbp, h, indx) return (0); /* Get a new page. */ - if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0) + if ((ret = __bam_new(dbc, P_DUPLICATE, &cp)) != 0) return (ret); /* @@ -957,7 +617,7 @@ __bam_ndup(dbp, h, indx) hdr.size = B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; if ((ret = - __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0) + __db_pitem(dbc, cp, cpindx, hdr.size, &hdr, NULL)) != 0) goto err; /* @@ -970,18 +630,20 @@ __bam_ndup(dbp, h, indx) PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx); /* Delete the data item. */ - if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0) + if ((ret = __db_ditem(dbc, h, indx, hdr.size)) != 0) goto err; /* Delete all but the first reference to the key. */ if (--cnt == 0) break; - if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0) + if ((ret = __bam_adjindx(dbc, h, indx, first, 0)) != 0) goto err; } /* Put in a new data item that points to the duplicates page. */ + UMRW(bo.unused1); B_TSET(bo.type, B_DUPLICATE, 0); + UMRW(bo.unused2); bo.pgno = cp->pgno; bo.tlen = 0; @@ -989,7 +651,7 @@ __bam_ndup(dbp, h, indx) return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY)); -err: (void)__bam_free(dbp, cp); +err: (void)__bam_free(dbc, cp); return (ret); } @@ -998,13 +660,16 @@ err: (void)__bam_free(dbp, cp); * Build the real record for a fixed length put. 
*/ static int -__bam_fixed(t, dbt) - BTREE *t; +__bam_fixed(dbc, dbt) + DBC *dbc; DBT *dbt; { + DB *dbp; RECNO *rp; + int ret; - rp = t->bt_recno; + dbp = dbc->dbp; + rp = ((BTREE *)dbp->internal)->recno; /* * If database contains fixed-length records, and the record is long, @@ -1018,29 +683,27 @@ __bam_fixed(t, dbt) * short. Pad it out. We use the record data return memory, it's * only a short-term use. */ - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } - memcpy(t->bt_rdata.data, dbt->data, dbt->size); - memset((u_int8_t *)t->bt_rdata.data + dbt->size, + memcpy(dbc->rdata.data, dbt->data, dbt->size); + memset((u_int8_t *)dbc->rdata.data + dbt->size, rp->re_pad, rp->re_len - dbt->size); /* * Clean up our flags and other information just in case, and * change the caller's DBT to reference our created record. */ - t->bt_rdata.size = rp->re_len; - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; - *dbt = t->bt_rdata; + dbc->rdata.size = rp->re_len; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; + *dbt = dbc->rdata; return (0); } @@ -1050,15 +713,15 @@ __bam_fixed(t, dbt) * Build the real record for a partial put. 
*/ static int -__bam_partial(dbp, dbt, h, indx, nbytes) - DB *dbp; +__bam_partial(dbc, dbt, h, indx, nbytes, flags) + DBC *dbc; DBT *dbt; PAGE *h; - u_int32_t indx, nbytes; + u_int32_t indx, nbytes, flags; { - BTREE *t; BKEYDATA *bk, tbk; BOVERFLOW *bo; + DB *dbp; DBT copy; u_int32_t len, tlen; u_int8_t *p; @@ -1066,18 +729,34 @@ __bam_partial(dbp, dbt, h, indx, nbytes) COMPQUIET(bo, NULL); - t = dbp->internal; + dbp = dbc->dbp; /* We use the record data return memory, it's only a short-term use. */ - if (t->bt_rdata.ulen < nbytes) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)__db_malloc(nbytes) : - (void *)__db_realloc(t->bt_rdata.data, nbytes); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < nbytes) { + if ((ret = __os_realloc(&dbc->rdata.data, nbytes)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = nbytes; + dbc->rdata.ulen = nbytes; + } + + /* + * We use nul bytes for any part of the record that isn't specified; + * get it over with. + */ + memset(dbc->rdata.data, 0, nbytes); + + /* + * In the next clauses, we need to do three things: a) set p to point + * to the place at which to copy the user's data, b) set tlen to the + * total length of the record, not including the bytes contributed by + * the user, and c) copy any valid data from an existing record. + */ + if (LF_ISSET(BI_NEWKEY)) { + tlen = dbt->doff; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; + goto ucopy; } /* Find the current record. */ @@ -1089,13 +768,6 @@ __bam_partial(dbp, dbt, h, indx, nbytes) B_TSET(bk->type, B_KEYDATA, 0); bk->len = 0; } - - /* - * We use nul bytes for any part of the record that isn't specified, - * get it over with. 
- */ - memset(t->bt_rdata.data, 0, nbytes); - if (B_TYPE(bk->type) == B_OVERFLOW) { /* * In the case of an overflow record, we shift things around @@ -1103,12 +775,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes) */ memset(&copy, 0, sizeof(copy)); if ((ret = __db_goff(dbp, &copy, bo->tlen, - bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) return (ret); /* Skip any leading data from the original record. */ tlen = dbt->doff; - p = (u_int8_t *)t->bt_rdata.data + dbt->doff; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; /* * Copy in any trailing data from the original record. @@ -1127,20 +799,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes) memmove(p + dbt->size, p + dbt->dlen, len); tlen += len; } - - /* Copy in the application provided data. */ - memcpy(p, dbt->data, dbt->size); - tlen += dbt->size; } else { /* Copy in any leading data from the original record. */ - memcpy(t->bt_rdata.data, + memcpy(dbc->rdata.data, bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); tlen = dbt->doff; - p = (u_int8_t *)t->bt_rdata.data + dbt->doff; - - /* Copy in the application provided data. */ - memcpy(p, dbt->data, dbt->size); - tlen += dbt->size; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; /* Copy in any trailing data from the original record. */ len = dbt->doff + dbt->dlen; @@ -1150,11 +814,18 @@ __bam_partial(dbp, dbt, h, indx, nbytes) } } +ucopy: /* + * Copy in the application provided data -- p and tlen must have been + * initialized above. + */ + memcpy(p, dbt->data, dbt->size); + tlen += dbt->size; + /* Set the DBT to reference our new record. 
*/ - t->bt_rdata.size = tlen; - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; - *dbt = t->bt_rdata; + dbc->rdata.size = tlen; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; + *dbt = dbc->rdata; return (0); } diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c index fe33825ec4..de6b3b7d0e 100644 --- a/db2/btree/bt_rec.c +++ b/db2/btree/bt_rec.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_rec.c 10.21 (Sleepycat) 4/28/98"; +static const char sccsid[] = "@(#)bt_rec.c 10.28 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -45,7 +45,8 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) BTMETA *meta; DB_MPOOLFILE *mpf; PAGE *pagep; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; db_pgno_t pgno; int cmp_n, cmp_p, modified, ret; @@ -101,7 +102,6 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) modified = 1; } if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); (void)memp_fput(mpf, meta, 0); goto out; } @@ -121,12 +121,10 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) meta->free = argp->pgno; modified = 1; } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; ret = 0; out: REC_CLOSE; @@ -149,7 +147,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) { __bam_pg_free_args *argp; BTMETA *meta; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; db_pgno_t pgno; @@ -192,10 +191,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; - } /* * Fix up the metadata page. If we're redoing or undoing the operation @@ -224,10 +221,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) meta->lsn = argp->meta_lsn; modified = 1; } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } done: *lsnp = argp->prev_lsn; ret = 0; @@ -251,7 +246,8 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_split_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; db_pgno_t pgno; @@ -310,12 +306,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) goto done; /* Allocate and initialize new left/right child pages. */ - if ((_lp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL || - (_rp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL) { - ret = ENOMEM; - __db_err(file_dbp->dbenv, "%s", strerror(ret)); + if ((ret = __os_malloc(file_dbp->pgsize, NULL, &_lp)) != 0 || + (ret = __os_malloc(file_dbp->pgsize, NULL, &_rp)) != 0) goto out; - } if (rootsplit) { P_INIT(_lp, file_dbp->pgsize, argp->left, PGNO_INVALID, @@ -352,7 +345,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) memcpy(lp, _lp, file_dbp->pgsize); lp->lsn = *lsnp; if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; lp = NULL; } @@ -367,7 +360,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) memcpy(rp, _rp, file_dbp->pgsize); rp->lsn = *lsnp; if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; rp = NULL; } @@ -392,7 +385,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) __bam_total(_lp) + __bam_total(_rp) : 0); pp->lsn = *lsnp; if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; pp = NULL; } @@ -412,9 +405,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) if (log_compare(&LSN(np), &argp->nlsn) == 0) { 
PREV_PGNO(np) = argp->right; np->lsn = *lsnp; - if ((ret = memp_fput(mpf, - np, DB_MPOOL_DIRTY)) != 0) - goto fatal; + if ((ret = + memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0) + goto out; np = NULL; } } @@ -433,7 +426,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) if (log_compare(lsnp, &LSN(pp)) == 0) { memcpy(pp, argp->pg.data, argp->pg.size); if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; pp = NULL; } @@ -451,7 +444,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { lp->lsn = argp->llsn; if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; lp = NULL; } if (rp != NULL && @@ -459,7 +452,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { rp->lsn = argp->rlsn; if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; rp = NULL; } } @@ -481,7 +474,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { PREV_PGNO(np) = argp->left; np->lsn = argp->nlsn; if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) - goto fatal; + goto out; np = NULL; } } @@ -490,9 +483,6 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { done: *lsnp = argp->prev_lsn; ret = 0; - if (0) { -fatal: (void)__db_panic(file_dbp); - } out: /* Free any pages that weren't dirtied. */ if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) ret = t_ret; @@ -505,9 +495,9 @@ out: /* Free any pages that weren't dirtied. */ /* Free any allocated space. */ if (_lp != NULL) - __db_free(_lp); + __os_free(_lp, file_dbp->pgsize); if (_rp != NULL) - __db_free(_rp); + __os_free(_rp, file_dbp->pgsize); REC_CLOSE; } @@ -528,7 +518,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_rsplit_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; db_pgno_t pgno; @@ -558,16 +549,14 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT, argp->nrec, PGNO_INVALID, pagep->level + 1, file_dbp->type == DB_BTREE ? 
P_IBTREE : P_IRECNO); - if ((ret = __db_pitem(file_dbp, pagep, 0, + if ((ret = __db_pitem(dbc, pagep, 0, argp->rootent.size, &argp->rootent, NULL)) != 0) goto out; pagep->lsn = argp->rootlsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } /* * Fix the page copied over the root page. It's possible that the @@ -592,10 +581,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } done: *lsnp = argp->prev_lsn; ret = 0; @@ -619,7 +606,8 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_adj_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -640,7 +628,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) cmp_p = log_compare(&LSN(pagep), &argp->lsn); if (cmp_p == 0 && redo) { /* Need to redo update described. */ - if ((ret = __bam_adjindx(file_dbp, + if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) goto err; @@ -648,7 +636,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) modified = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo update described. 
*/ - if ((ret = __bam_adjindx(file_dbp, + if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) goto err; @@ -684,7 +672,8 @@ __bam_cadjust_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_cadjust_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -760,7 +749,8 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_cdel_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -781,13 +771,19 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info) cmp_p = log_compare(&LSN(pagep), &argp->lsn); if (cmp_p == 0 && redo) { /* Need to redo update described. */ - B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); + if (pagep->type == P_DUPLICATE) + B_DSET(GET_BKEYDATA(pagep, argp->indx)->type); + else + B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); LSN(pagep) = *lsnp; modified = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo update described. 
*/ - B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); + if (pagep->type == P_DUPLICATE) + B_DCLR(GET_BKEYDATA(pagep, argp->indx)->type); + else + B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); LSN(pagep) = argp->lsn; modified = 1; @@ -818,7 +814,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) { __bam_repl_args *argp; BKEYDATA *bk; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DBT dbt; DB_MPOOLFILE *mpf; PAGE *pagep; @@ -848,10 +845,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->repl.size; - if ((dbt.data = __db_malloc(dbt.size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0) goto err; - } p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -859,8 +854,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) p += argp->repl.size; memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); - ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt); - __db_free(dbt.data); + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); if (ret != 0) goto err; @@ -874,10 +869,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->orig.size; - if ((dbt.data = __db_malloc(dbt.size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0) goto err; - } p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -885,8 +878,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) p += argp->orig.size; memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); - ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt); - __db_free(dbt.data); + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); if (ret != 0) goto err; diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c index 38dbbd1c55..c69877ff7f 100644 --- a/db2/btree/bt_recno.c +++ 
b/db2/btree/bt_recno.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_recno.c 10.37 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)bt_recno.c 10.53 (Sleepycat) 12/11/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -22,64 +22,89 @@ static const char sccsid[] = "@(#)bt_recno.c 10.37 (Sleepycat) 5/23/98"; #include "db_int.h" #include "db_page.h" #include "btree.h" - -static int __ram_add __P((DB *, db_recno_t *, DBT *, u_int32_t, u_int32_t)); -static int __ram_c_close __P((DBC *)); -static int __ram_c_del __P((DBC *, u_int32_t)); -static int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __ram_fmap __P((DB *, db_recno_t)); -static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -static int __ram_iget __P((DB *, DBT *, DBT *)); +#include "db_ext.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" + +static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t)); +static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); +static int __ram_fmap __P((DBC *, db_recno_t)); +static int __ram_i_delete __P((DBC *)); static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); static int __ram_source __P((DB *, RECNO *, const char *)); static int __ram_sync __P((DB *, u_int32_t)); -static int __ram_update __P((DB *, db_recno_t, int)); -static int __ram_vmap __P((DB *, db_recno_t)); -static int __ram_writeback __P((DB *)); +static int __ram_update __P((DBC *, db_recno_t, int)); +static int __ram_vmap __P((DBC *, db_recno_t)); +static int __ram_writeback __P((DBC *)); /* - * If we're renumbering records, then we have to detect in the cursor that a - * record was deleted, and adjust the cursor as necessary. If not renumbering - * records, then we can detect this by looking at the actual record, so we - * ignore the cursor delete flag. 
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're + * re-numbering records, it means the record was implicitly created. We skip + * over implicitly created records if doing a cursor "next" or "prev", and + * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering + * records, it means that the record was implicitly created, or was deleted. + * We skip over implicitly created or deleted records if doing a cursor "next" + * or "prev", and return DB_KEYEMPTY if they're explicitly requested. + * + * If we're re-numbering records, then we have to detect in the cursor that + * a record was deleted, and adjust the cursor as necessary on the next get. + * If we're not re-numbering records, then we can detect that a record has + * been deleted by looking at the actual on-page record, so we completely + * ignore the cursor's delete flag. This is different from the B+tree code. + * It also maintains whether the cursor references a deleted record in the + * cursor, and it doesn't always check the on-page value. */ #define CD_SET(dbp, cp) { \ if (F_ISSET(dbp, DB_RE_RENUMBER)) \ - F_SET(cp, CR_DELETED); \ + F_SET(cp, C_DELETED); \ } #define CD_CLR(dbp, cp) { \ if (F_ISSET(dbp, DB_RE_RENUMBER)) \ - F_CLR(cp, CR_DELETED); \ + F_CLR(cp, C_DELETED); \ } #define CD_ISSET(dbp, cp) \ - (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED)) + (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED)) /* * __ram_open -- * Recno open function. * - * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *)); + * PUBLIC: int __ram_open __P((DB *, DB_INFO *)); */ int -__ram_open(dbp, type, dbinfo) +__ram_open(dbp, dbinfo) DB *dbp; - DBTYPE type; DB_INFO *dbinfo; { BTREE *t; + DBC *dbc; RECNO *rp; - int ret; - - COMPQUIET(type, DB_RECNO); + int ret, t_ret; - ret = 0; + /* Allocate and initialize the private btree structure. 
*/ + if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->internal = t; + __bam_setovflsize(dbp); - /* Allocate and initialize the private RECNO structure. */ - if ((rp = (RECNO *)__db_calloc(1, sizeof(*rp))) == NULL) - return (ENOMEM); + /* Allocate and initialize the private recno structure. */ + if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0) + return (ret); + /* Link in the private recno structure. */ + t->recno = rp; - if (dbinfo != NULL) { + /* + * Intention is to make sure all of the user's selections are okay + * here and then use them without checking. + */ + if (dbinfo == NULL) { + rp->re_delim = '\n'; + rp->re_pad = ' '; + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } else { /* * If the user specified a source tree, open it and map it in. * @@ -111,31 +136,40 @@ __ram_open(dbp, type, dbinfo) } } else rp->re_len = 0; - } else { - rp->re_delim = '\n'; - rp->re_pad = ' '; - rp->re_fd = -1; - F_SET(rp, RECNO_EOF); } - /* Open the underlying btree. */ - if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0) - goto err; - - /* Set the routines necessary to make it look like a recno tree. */ - dbp->cursor = __ram_cursor; + /* Initialize the remaining fields/methods of the DB. */ + dbp->am_close = __ram_close; dbp->del = __ram_delete; - dbp->get = __ram_get; dbp->put = __ram_put; + dbp->stat = __bam_stat; dbp->sync = __ram_sync; - /* Link in the private recno structure. */ - ((BTREE *)dbp->internal)->bt_recno = rp; + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp)) != 0) + goto err; + + /* Set the overflow page size. */ + __bam_setovflsize(dbp); /* If we're snapshotting an underlying source file, do it now. */ - if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) - if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) { + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + goto err; + + /* Do the snapshot. 
*/ + if ((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND) + ret = 0; + + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0) goto err; + } return (0); @@ -145,143 +179,169 @@ err: /* If we mmap'd a source file, discard it. */ /* If we opened a source file, discard it. */ if (rp->re_fd != -1) - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); if (rp->re_source != NULL) - FREES(rp->re_source); - - /* If we allocated room for key/data return, discard it. */ - t = dbp->internal; - if (t != NULL && t->bt_rkey.data != NULL) - __db_free(t->bt_rkey.data); + __os_freestr(rp->re_source); - FREE(rp, sizeof(*rp)); + __os_free(rp, sizeof(*rp)); return (ret); } /* - * __ram_cursor -- - * Recno db->cursor function. - * - * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **)); + * __ram_delete -- + * Recno db->del function. */ -int -__ram_cursor(dbp, txn, dbcp) +static int +__ram_delete(dbp, txn, key, flags) DB *dbp; DB_TXN *txn; - DBC **dbcp; + DBT *key; + u_int32_t flags; { - RCURSOR *cp; + CURSOR *cp; DBC *dbc; + db_recno_t recno; + int ret, t_ret; - DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0); - - if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL) - return (ENOMEM); - if ((cp = (RCURSOR *)__db_calloc(1, sizeof(RCURSOR))) == NULL) { - __db_free(dbc); - return (ENOMEM); - } - - cp->dbc = dbc; - cp->recno = RECNO_OOB; - - dbc->dbp = dbp; - dbc->txn = txn; - dbc->internal = cp; - dbc->c_close = __ram_c_close; - dbc->c_del = __ram_c_del; - dbc->c_get = __ram_c_get; - dbc->c_put = __ram_c_put; - - /* - * All cursors are queued from the master DB structure. Add the - * cursor to that queue. - */ - CURSOR_SETUP(dbp); - TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); + DB_PANIC_CHECK(dbp); - *dbcp = dbc; - return (0); -} + /* Check for invalid flags. 
*/ + if ((ret = __db_delchk(dbp, + key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); -/* - * __ram_get -- - * Recno db->get function. - */ -static int -__ram_get(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - DB *dbp; - int ret; + /* Acquire a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); - DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags); + DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags); - /* Check for invalid flags. */ - if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) - return (ret); + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) + goto err; - GETHANDLE(argdbp, txn, &dbp, ret); + /* Do the delete. */ + cp = dbc->internal; + cp->recno = recno; + ret = __ram_i_delete(dbc); - ret = __ram_iget(dbp, key, data); + /* Release the cursor. */ +err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; - PUTHANDLE(dbp); return (ret); } /* - * __ram_iget -- - * Internal ram get function, called for both standard and cursor - * get after the flags have been checked. + * __ram_i_delete -- + * Internal version of recno delete, called by __ram_delete and + * __ram_c_del. */ static int -__ram_iget(dbp, key, data) - DB *dbp; - DBT *key, *data; +__ram_i_delete(dbc) + DBC *dbc; { + BKEYDATA bk; BTREE *t; + CURSOR *cp; + DB *dbp; + DBT hdr, data; PAGE *h; db_indx_t indx; - db_recno_t recno; int exact, ret, stack; - stack = 0; + dbp = dbc->dbp; + cp = dbc->internal; t = dbp->internal; + stack = 0; - /* Check the user's record number and fill in as necessary. */ - if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) - goto done; + /* + * If this is CDB and this isn't a write cursor, then it's an error. + * If it is a write cursor, but we don't yet hold the write lock, then + * we need to upgrade to the write lock. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB)) { + /* Make sure it's a valid update cursor. */ + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } - /* Search the tree for the record. */ - if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0) - goto done; - if (!exact) - return (DB_NOTFOUND); + /* Search the tree for the key; delete only deletes exact matches. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } stack = 1; - h = t->bt_csp->page; - indx = t->bt_csp->indx; + h = cp->csp->page; + indx = cp->csp->indx; - /* If the record has already been deleted, we couldn't have found it. */ + /* + * If re-numbering records, the on-page deleted flag can only mean + * that this record was implicitly created. Applications aren't + * permitted to delete records they never created, return an error. + * + * If not re-numbering records, the on-page deleted flag means that + * this record was implicitly created, or, was deleted at some time. + * The former is an error because applications aren't permitted to + * delete records they never created, the latter is an error because + * if the record was "deleted", we could never have found it. + */ if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { ret = DB_KEYEMPTY; - goto done; + goto err; } - /* Return the data item. */ - ret = __db_ret(dbp, - h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen); - ++t->lstat.bt_get; + if (F_ISSET(dbp, DB_RE_RENUMBER)) { + /* Delete the item, adjust the counts, adjust the cursors. */ + if ((ret = __bam_ditem(dbc, h, indx)) != 0) + goto err; + __bam_adjust(dbc, -1); + __ram_ca(dbp, cp->recno, CA_DELETE); + + /* + * If the page is empty, delete it. The whole tree is locked + * so there are no preparations to make. 
+ */ + if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { + stack = 0; + ret = __bam_dpages(dbc); + } + } else { + /* Use a delete/put pair to replace the record with a marker. */ + if ((ret = __bam_ditem(dbc, h, indx)) != 0) + goto err; + + B_TSET(bk.type, B_KEYDATA, 1); + bk.len = 0; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bk; + hdr.size = SSZA(BKEYDATA, data); + memset(&data, 0, sizeof(data)); + data.data = (char *)""; + data.size = 0; + if ((ret = __db_pitem(dbc, + h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) + goto err; + } + F_SET(t->recno, RECNO_MODIFIED); -done: /* Discard the stack. */ - if (stack) - __bam_stkrel(dbp); +err: if (stack) + __bam_stkrel(dbc, 0); + /* If we upgraded the CDB lock upon entry; downgrade it now. */ + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); return (ret); } @@ -290,46 +350,50 @@ done: /* Discard the stack. */ * Recno db->put function. */ static int -__ram_put(argdbp, txn, key, data, flags) - DB *argdbp; +__ram_put(dbp, txn, key, data, flags) + DB *dbp; DB_TXN *txn; DBT *key, *data; u_int32_t flags; { - BTREE *t; - DB *dbp; + DBC *dbc; db_recno_t recno; - int ret; + int ret, t_ret; - DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags); + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_putchk(argdbp, - key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0)) != 0) + if ((ret = __db_putchk(dbp, + key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0) + return (ret); + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) return (ret); - GETHANDLE(argdbp, txn, &dbp, ret); + DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags); /* * If we're appending to the tree, make sure we've read in all of * the backing source file. Otherwise, check the user's record * number and fill in as necessary. */ - ret = LF_ISSET(DB_APPEND) ? 
- __ram_snapshot(dbp) : __ram_getno(dbp, key, &recno, 1); + ret = flags == DB_APPEND ? + __ram_update(dbc, DB_MAX_RECORDS, 0) : + __ram_getno(dbc, key, &recno, 1); /* Add the record. */ if (ret == 0) - ret = __ram_add(dbp, &recno, data, flags, 0); + ret = __ram_add(dbc, &recno, data, flags, 0); - /* If we're appending to the tree, we have to return the record. */ - if (ret == 0 && LF_ISSET(DB_APPEND)) { - t = dbp->internal; - ret = __db_retcopy(key, &recno, sizeof(recno), - &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc); - } + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Return the record number if we're appending to the tree. */ + if (ret == 0 && flags == DB_APPEND) + *(db_recno_t *)key->data = recno; - PUTHANDLE(dbp); return (ret); } @@ -338,23 +402,35 @@ __ram_put(argdbp, txn, key, data, flags) * Recno db->sync function. */ static int -__ram_sync(argdbp, flags) - DB *argdbp; +__ram_sync(dbp, flags) + DB *dbp; u_int32_t flags; { - DB *dbp; - int ret; + DBC *dbc; + int ret, t_ret; - DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags); + /* + * Sync the underlying btree. + * + * !!! + * We don't need to do a panic check or flags check, the "real" + * sync function does all that for us. + */ + if ((ret = __db_sync(dbp, flags)) != 0) + return (ret); - /* Sync the underlying btree. */ - if ((ret = __bam_sync(argdbp, flags)) != 0) + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) return (ret); + DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags); + /* Copy back the backing source file. */ - GETHANDLE(argdbp, NULL, &dbp, ret); - ret = __ram_writeback(dbp); - PUTHANDLE(dbp); + ret = __ram_writeback(dbc); + + /* Discard the cursor. 
*/ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -366,14 +442,12 @@ __ram_sync(argdbp, flags) * PUBLIC: int __ram_close __P((DB *)); */ int -__ram_close(argdbp) - DB *argdbp; +__ram_close(dbp) + DB *dbp; { RECNO *rp; - DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0); - - rp = ((BTREE *)argdbp->internal)->bt_recno; + rp = ((BTREE *)dbp->internal)->recno; /* Close any underlying mmap region. */ if (rp->re_smap != NULL) @@ -381,136 +455,133 @@ __ram_close(argdbp) /* Close any backing source file descriptor. */ if (rp->re_fd != -1) - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); /* Free any backing source file name. */ if (rp->re_source != NULL) - FREES(rp->re_source); + __os_freestr(rp->re_source); /* Free allocated memory. */ - FREE(rp, sizeof(RECNO)); - ((BTREE *)argdbp->internal)->bt_recno = NULL; + __os_free(rp, sizeof(RECNO)); + ((BTREE *)dbp->internal)->recno = NULL; /* Close the underlying btree. */ - return (__bam_close(argdbp)); -} - -/* - * __ram_c_close -- - * Recno cursor->close function. - */ -static int -__ram_c_close(dbc) - DBC *dbc; -{ - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_close", NULL, NULL, 0); - - return (__ram_c_iclose(dbc->dbp, dbc)); -} - -/* - * __ram_c_iclose -- - * Close a single cursor -- internal version. - * - * PUBLIC: int __ram_c_iclose __P((DB *, DBC *)); - */ -int -__ram_c_iclose(dbp, dbc) - DB *dbp; - DBC *dbc; -{ - /* Remove the cursor from the queue. */ - CURSOR_SETUP(dbp); - TAILQ_REMOVE(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); - - /* Discard the structures. */ - FREE(dbc->internal, sizeof(RCURSOR)); - FREE(dbc, sizeof(DBC)); - - return (0); + return (__bam_close(dbp)); } /* * __ram_c_del -- * Recno cursor->c_del function. 
+ * + * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t)); */ -static int +int __ram_c_del(dbc, flags) DBC *dbc; u_int32_t flags; { - DBT key; - RCURSOR *cp; + CURSOR *cp; + DB *dbp; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_del", NULL, NULL, flags); - + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. */ - if ((ret = __db_cdelchk(dbc->dbp, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + if ((ret = __db_cdelchk(dbp, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) return (ret); - /* If already deleted, return failure. */ - if (CD_ISSET(dbc->dbp, cp)) - return (DB_KEYEMPTY); + DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags); - /* Build a normal delete request. */ - memset(&key, 0, sizeof(key)); - key.data = &cp->recno; - key.size = sizeof(db_recno_t); - if ((ret = __ram_delete(dbc->dbp, dbc->txn, &key, 0)) == 0) - CD_SET(dbc->dbp, cp); + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. + */ + if (F_ISSET(dbp, DB_AM_CDB)) + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); - return (ret); + /* + * The semantics of cursors during delete are as follows: if record + * numbers are mutable (DB_RE_RENUMBER is set), deleting a record + * causes the cursor to automatically point to the record immediately + * following. In this case it is possible to use a single cursor for + * repeated delete operations, without intervening operations. + * + * If record numbers are not mutable, then records are replaced with + * a marker containing a delete flag. If the record referenced by + * this cursor has already been deleted, we will detect that as part + * of the delete operation, and fail. + */ + return (__ram_i_delete(dbc)); } /* * __ram_c_get -- * Recno cursor->c_get function. 
+ * + * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); */ -static int +int __ram_c_get(dbc, key, data, flags) DBC *dbc; DBT *key, *data; u_int32_t flags; { - BTREE *t; + CURSOR *cp, copy; DB *dbp; - RCURSOR *cp, copy; - int ret; - - DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get", - flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, - NULL, flags); + PAGE *h; + db_indx_t indx; + int exact, ret, stack, tmp_rmw; - cp = dbc->internal; dbp = dbc->dbp; + cp = dbc->internal; + + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ if ((ret = __db_cgetchk(dbc->dbp, key, data, flags, cp->recno != RECNO_OOB)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + /* Clear OR'd in additional bits so we can check for flag equality. */ + tmp_rmw = 0; + if (LF_ISSET(DB_RMW)) { + if (!F_ISSET(dbp, DB_AM_CDB)) { + tmp_rmw = 1; + F_SET(dbc, DBC_RMW); + } + LF_CLR(DB_RMW); + } + + DEBUG_LREAD(dbc, dbc->txn, "ram_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); /* Initialize the cursor for a new retrieval. */ copy = *cp; retry: /* Update the record number. */ + stack = 0; switch (flags) { case DB_CURRENT: - if (CD_ISSET(dbp, cp)) { - PUTHANDLE(dbp); - return (DB_KEYEMPTY); - } + /* + * If record numbers are mutable: if we just deleted a record, + * there is no action necessary, we return the record following + * the deleted item by virtue of renumbering the tree. + */ break; case DB_NEXT: + /* + * If record numbers are mutable: if we just deleted a record, + * we have to avoid incrementing the record number so that we + * return the right record by virtue of renumbering the tree. + */ if (CD_ISSET(dbp, cp)) break; + if (cp->recno != RECNO_OOB) { ++cp->recno; break; @@ -522,86 +593,133 @@ retry: /* Update the record number. 
*/ break; case DB_PREV: if (cp->recno != RECNO_OOB) { - if (cp->recno == 1) - return (DB_NOTFOUND); + if (cp->recno == 1) { + ret = DB_NOTFOUND; + goto err; + } --cp->recno; break; } /* FALLTHROUGH */ case DB_LAST: flags = DB_PREV; - if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND) + if (((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND) goto err; - if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0) + if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) goto err; - if (cp->recno == 0) - return (DB_NOTFOUND); + if (cp->recno == 0) { + ret = DB_NOTFOUND; + goto err; + } break; case DB_SET: case DB_SET_RANGE: - if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0) + if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0) goto err; break; } - /* - * Return the key if the user didn't give us one, and then pass it - * into __ram_iget(). - */ + /* Return the key if the user didn't give us one. */ if (flags != DB_SET && flags != DB_SET_RANGE && (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno), - &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0) - return (ret); + &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0) + goto err; - /* - * The cursor was reset, so the delete adjustment is no - * longer necessary. - */ - CD_CLR(dbp, cp); + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, + F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0) + goto err; + stack = 1; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + h = cp->csp->page; + indx = cp->csp->indx; /* - * Retrieve the record. - * - * Skip any keys that don't really exist. + * If re-numbering records, the on-page deleted flag means this record + * was implicitly created. If not re-numbering records, the on-page + * deleted flag means this record was implicitly created, or, it was + * deleted at some time. 
Regardless, we skip such records if doing + * cursor next/prev operations, and fail if the application requested + * them explicitly. */ - if ((ret = __ram_iget(dbp, key, data)) != 0) - if (ret == DB_KEYEMPTY && - (flags == DB_NEXT || flags == DB_PREV)) + if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { + if (flags == DB_NEXT || flags == DB_PREV) { + (void)__bam_stkrel(dbc, 0); goto retry; + } + ret = DB_KEYEMPTY; + goto err; + } + + /* Return the data item. */ + if ((ret = __db_ret(dbp, + h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) + goto err; + + /* The cursor was reset, no further delete adjustment is necessary. */ + CD_CLR(dbp, cp); + +err: if (stack) + (void)__bam_stkrel(dbc, 0); + + /* Release temporary lock upgrade. */ + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); -err: if (ret != 0) + if (ret != 0) *cp = copy; - PUTHANDLE(dbp); return (ret); } /* * __ram_c_put -- * Recno cursor->c_put function. + * + * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); */ -static int +int __ram_c_put(dbc, key, data, flags) DBC *dbc; DBT *key, *data; u_int32_t flags; { - BTREE *t; - RCURSOR *cp, copy; + CURSOR *cp, copy; DB *dbp; int exact, ret; void *arg; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags); - + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags); + + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. If it's a regular writer, + * that means we have an IWRITE lock and we need to upgrade + * it to a write lock. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB)) { + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } /* Initialize the cursor for a new retrieval. */ copy = *cp; @@ -614,23 +732,23 @@ __ram_c_put(dbc, key, data, flags) */ if (0) { split: arg = &cp->recno; - if ((ret = __bam_split(dbp, arg)) != 0) + if ((ret = __bam_split(dbc, arg)) != 0) goto err; } - if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0) + if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) goto err; if (!exact) { ret = DB_NOTFOUND; goto err; } - if ((ret = __bam_iitem(dbp, &t->bt_csp->page, - &t->bt_csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { - if ((ret = __bam_stkrel(dbp)) != 0) + if ((ret = __bam_iitem(dbc, &cp->csp->page, + &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { + if ((ret = __bam_stkrel(dbc, 0)) != 0) goto err; goto split; } - if ((ret = __bam_stkrel(dbp)) != 0) + if ((ret = __bam_stkrel(dbc, 0)) != 0) goto err; switch (flags) { @@ -650,16 +768,16 @@ split: arg = &cp->recno; break; } - /* - * The cursor was reset, so the delete adjustment is no - * longer necessary. - */ + /* The cursor was reset, no further delete adjustment is necessary. */ CD_CLR(dbp, cp); -err: if (ret != 0) +err: if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + + if (ret != 0) *cp = copy; - PUTHANDLE(dbp); return (ret); } @@ -675,20 +793,22 @@ __ram_ca(dbp, recno, op) db_recno_t recno; ca_recno_arg op; { + CURSOR *cp; DBC *dbc; - RCURSOR *cp; /* * Adjust the cursors. See the comment in __bam_ca_delete(). 
*/ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (RCURSOR *)dbc->internal; + cp = dbc->internal; switch (op) { case CA_DELETE: if (recno > cp->recno) --cp->recno; + if (recno == cp->recno) + CD_SET(dbp, cp); break; case CA_IAFTER: if (recno > cp->recno) @@ -700,51 +820,27 @@ __ram_ca(dbp, recno, op) break; } } - CURSOR_TEARDOWN(dbp); + DB_THREAD_UNLOCK(dbp); } -#ifdef DEBUG -/* - * __ram_cprint -- - * Display the current recno cursor list. - * - * PUBLIC: int __ram_cprint __P((DB *)); - */ -int -__ram_cprint(dbp) - DB *dbp; -{ - DBC *dbc; - RCURSOR *cp; - - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (RCURSOR *)dbc->internal; - fprintf(stderr, - "%#0x: recno: %lu\n", (u_int)cp, (u_long)cp->recno); - } - CURSOR_TEARDOWN(dbp); - - return (0); -} -#endif /* DEBUG */ - /* * __ram_getno -- * Check the user's record number, and make sure we've seen it. * - * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); + * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int)); */ int -__ram_getno(dbp, key, rep, can_create) - DB *dbp; +__ram_getno(dbc, key, rep, can_create) + DBC *dbc; const DBT *key; db_recno_t *rep; int can_create; { + DB *dbp; db_recno_t recno; + dbp = dbc->dbp; + /* Check the user's record number. */ if ((recno = *(db_recno_t *)key->data) == 0) { __db_err(dbp->dbenv, "illegal record number of 0"); @@ -754,24 +850,11 @@ __ram_getno(dbp, key, rep, can_create) *rep = recno; /* - * Btree can neither create records or read them in. Recno can + * Btree can neither create records nor read them in. Recno can * do both, see if we can find the record. */ return (dbp->type == DB_RECNO ? - __ram_update(dbp, recno, can_create) : 0); -} - -/* - * __ram_snapshot -- - * Read in any remaining records from the backing input file. 
- * - * PUBLIC: int __ram_snapshot __P((DB *)); - */ -int -__ram_snapshot(dbp) - DB *dbp; -{ - return (__ram_update(dbp, DB_MAX_RECORDS, 0)); + __ram_update(dbc, recno, can_create) : 0); } /* @@ -779,18 +862,20 @@ __ram_snapshot(dbp) * Ensure the tree has records up to and including the specified one. */ static int -__ram_update(dbp, recno, can_create) - DB *dbp; +__ram_update(dbc, recno, can_create) + DBC *dbc; db_recno_t recno; int can_create; { BTREE *t; + DB *dbp; RECNO *rp; db_recno_t nrecs; int ret; + dbp = dbc->dbp; t = dbp->internal; - rp = t->bt_recno; + rp = t->recno; /* * If we can't create records and we've read the entire backing input @@ -803,12 +888,12 @@ __ram_update(dbp, recno, can_create) * If we haven't seen this record yet, try to get it from the original * file. */ - if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) return (ret); if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) { - if ((ret = rp->re_irec(dbp, recno)) != 0) + if ((ret = rp->re_irec(dbc, recno)) != 0) return (ret); - if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) return (ret); } @@ -819,28 +904,27 @@ __ram_update(dbp, recno, can_create) if (!can_create || recno <= nrecs + 1) return (0); - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? 
- (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = + __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } - t->bt_rdata.size = rp->re_len; - memset(t->bt_rdata.data, rp->re_pad, rp->re_len); + dbc->rdata.size = rp->re_len; + memset(dbc->rdata.data, rp->re_pad, rp->re_len); } else - t->bt_rdata.size = 0; + dbc->rdata.size = 0; while (recno > ++nrecs) - if ((ret = __ram_add(dbp, - &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0) + if ((ret = __ram_add(dbc, + &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0) return (ret); return (0); } @@ -859,6 +943,11 @@ __ram_source(dbp, rp, fname) u_int32_t bytes, mbytes, oflags; int ret; + /* + * !!! + * The caller has full responsibility for cleaning up on error -- + * (it has to anyway, in case it fails after this routine succeeds). + */ if ((ret = __db_appname(dbp->dbenv, DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0) return (ret); @@ -867,7 +956,7 @@ __ram_source(dbp, rp, fname) if ((ret = __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) { __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); - goto err; + return (ret); } /* @@ -878,10 +967,10 @@ __ram_source(dbp, rp, fname) * compiler will perpetrate, doing the comparison in a portable way is * flatly impossible. Hope that mmap fails if the file is too large. 
*/ - if ((ret = __db_ioinfo(rp->re_source, + if ((ret = __os_ioinfo(rp->re_source, rp->re_fd, &mbytes, &bytes, NULL)) != 0) { __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); - goto err; + return (ret); } if (mbytes == 0 && bytes == 0) { F_SET(rp, RECNO_EOF); @@ -891,14 +980,11 @@ __ram_source(dbp, rp, fname) size = mbytes * MEGABYTE + bytes; if ((ret = __db_mapfile(rp->re_source, rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0) - goto err; + return (ret); rp->re_cmap = rp->re_smap; rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size); rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ? __ram_fmap : __ram_vmap; return (0); - -err: FREES(rp->re_source) - return (ret); } /* @@ -906,17 +992,19 @@ err: FREES(rp->re_source) * Rewrite the backing file. */ static int -__ram_writeback(dbp) - DB *dbp; +__ram_writeback(dbc) + DBC *dbc; { - RECNO *rp; + DB *dbp; DBT key, data; + RECNO *rp; db_recno_t keyno; ssize_t nw; int fd, ret, t_ret; u_int8_t delim, *pad; - rp = ((BTREE *)dbp->internal)->bt_recno; + dbp = dbc->dbp; + rp = ((BTREE *)dbp->internal)->recno; /* If the file wasn't modified, we're done. */ if (!F_ISSET(rp, RECNO_MODIFIED)) @@ -931,7 +1019,7 @@ __ram_writeback(dbp) /* * Read any remaining records into the tree. * - * XXX + * !!! * This is why we can't support transactions when applications specify * backing (re_source) files. At this point we have to read in the * rest of the records from the file so that we can write all of the @@ -946,7 +1034,8 @@ __ram_writeback(dbp) * protecting the backing source file, i.e. mpool would have to know * about it, and we don't want to go there. */ - if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + if ((ret = + __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND) return (ret); /* @@ -962,7 +1051,7 @@ __ram_writeback(dbp) /* Get rid of any backing file descriptor, just on GP's. 
*/ if (rp->re_fd != -1) { - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); rp->re_fd = -1; } @@ -990,10 +1079,8 @@ __ram_writeback(dbp) */ delim = rp->re_delim; if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if ((pad = (u_int8_t *)__db_malloc(rp->re_len)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0) goto err; - } memset(pad, rp->re_pad, rp->re_len); } else COMPQUIET(pad, NULL); @@ -1001,7 +1088,7 @@ __ram_writeback(dbp) switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { case 0: if ((ret = - __db_write(fd, data.data, data.size, &nw)) != 0) + __os_write(fd, data.data, data.size, &nw)) != 0) goto err; if (nw != (ssize_t)data.size) { ret = EIO; @@ -1011,7 +1098,7 @@ __ram_writeback(dbp) case DB_KEYEMPTY: if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { if ((ret = - __db_write(fd, pad, rp->re_len, &nw)) != 0) + __os_write(fd, pad, rp->re_len, &nw)) != 0) goto err; if (nw != (ssize_t)rp->re_len) { ret = EIO; @@ -1024,7 +1111,7 @@ __ram_writeback(dbp) goto done; } if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if ((ret = __db_write(fd, &delim, 1, &nw)) != 0) + if ((ret = __os_write(fd, &delim, 1, &nw)) != 0) goto err; if (nw != 1) { ret = EIO; @@ -1035,7 +1122,7 @@ __ram_writeback(dbp) err: done: /* Close the file descriptor. */ - if ((t_ret = __db_close(fd)) != 0 || ret == 0) + if ((t_ret = __os_close(fd)) != 0 || ret == 0) ret = t_ret; if (ret == 0) @@ -1048,11 +1135,11 @@ done: /* Close the file descriptor. */ * Get fixed length records from a file. */ static int -__ram_fmap(dbp, top) - DB *dbp; +__ram_fmap(dbc, top) + DBC *dbc; db_recno_t top; { - BTREE *t; + DB *dbp; DBT data; RECNO *rp; db_recno_t recno; @@ -1060,24 +1147,23 @@ __ram_fmap(dbp, top) u_int8_t *sp, *ep, *p; int ret; - if ((ret = __bam_nrecs(dbp, &recno)) != 0) + if ((ret = __bam_nrecs(dbc, &recno)) != 0) return (ret); - t = dbp->internal; - rp = t->bt_recno; - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? 
- (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + dbp = dbc->dbp; + rp = ((BTREE *)(dbp->internal))->recno; + + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } memset(&data, 0, sizeof(data)); - data.data = t->bt_rdata.data; + data.data = dbc->rdata.data; data.size = rp->re_len; sp = (u_int8_t *)rp->re_cmap; @@ -1088,7 +1174,7 @@ __ram_fmap(dbp, top) return (DB_NOTFOUND); } len = rp->re_len; - for (p = t->bt_rdata.data; + for (p = dbc->rdata.data; sp < ep && len > 0; *p++ = *sp++, --len) ; @@ -1108,7 +1194,7 @@ __ram_fmap(dbp, top) memset(p, rp->re_pad, len); ++recno; - if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) return (ret); } ++rp->re_last; @@ -1122,21 +1208,19 @@ __ram_fmap(dbp, top) * Get variable length records from a file. */ static int -__ram_vmap(dbp, top) - DB *dbp; +__ram_vmap(dbc, top) + DBC *dbc; db_recno_t top; { - BTREE *t; DBT data; RECNO *rp; db_recno_t recno; u_int8_t *sp, *ep; int delim, ret; - t = dbp->internal; - rp = t->bt_recno; + rp = ((BTREE *)(dbc->dbp->internal))->recno; - if ((ret = __bam_nrecs(dbp, &recno)) != 0) + if ((ret = __bam_nrecs(dbc, &recno)) != 0) return (ret); memset(&data, 0, sizeof(data)); @@ -1163,7 +1247,7 @@ __ram_vmap(dbp, top) if (rp->re_last >= recno) { data.size = sp - (u_int8_t *)data.data; ++recno; - if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) return (ret); } ++rp->re_last; @@ -1178,40 +1262,47 @@ __ram_vmap(dbp, top) * Add records into the tree. 
*/ static int -__ram_add(dbp, recnop, data, flags, bi_flags) - DB *dbp; +__ram_add(dbc, recnop, data, flags, bi_flags) + DBC *dbc; db_recno_t *recnop; DBT *data; u_int32_t flags, bi_flags; { BKEYDATA *bk; - BTREE *t; + CURSOR *cp; + DB *dbp; PAGE *h; db_indx_t indx; int exact, isdeleted, ret, stack; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; retry: /* Find the slot for insertion. */ - if ((ret = __bam_rsearch(dbp, recnop, - S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0) + if ((ret = __bam_rsearch(dbc, recnop, + S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0) return (ret); - h = t->bt_csp->page; - indx = t->bt_csp->indx; + h = cp->csp->page; + indx = cp->csp->indx; stack = 1; /* + * If re-numbering records, the on-page deleted flag means this record + * was implicitly created. If not re-numbering records, the on-page + * deleted flag means this record was implicitly created, or, it was + * deleted at some time. + * * If DB_NOOVERWRITE is set and the item already exists in the tree, - * return an error unless the item has been marked for deletion. + * return an error unless the item was either marked for deletion or + * only implicitly created. */ isdeleted = 0; if (exact) { bk = GET_BKEYDATA(h, indx); - if (B_DISSET(bk->type)) { + if (B_DISSET(bk->type)) isdeleted = 1; - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); - } else - if (LF_ISSET(DB_NOOVERWRITE)) { + else + if (flags == DB_NOOVERWRITE) { ret = DB_KEYEXIST; goto err; } @@ -1224,40 +1315,42 @@ retry: /* Find the slot for insertion. */ * match, we're inserting a new key/data pair, before the search * location. */ - switch (ret = __bam_iitem(dbp, + switch (ret = __bam_iitem(dbc, &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) { case 0: /* - * Done. Clean up the cursor and adjust the internal page - * counts. + * Don't adjust anything. 
+ * + * If we inserted a record, no cursors need adjusting because + * the only new record it's possible to insert is at the very + * end of the tree. The necessary adjustments to the internal + * page counts were made by __bam_iitem(). + * + * If we overwrote a record, no cursors need adjusting because + * future DBcursor->get calls will simply return the underlying + * record (there's no adjustment made for the DB_CURRENT flag + * when a cursor get operation immediately follows a cursor + * delete operation, and the normal adjustment for the DB_NEXT + * flag is still correct). */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); break; case DB_NEEDSPLIT: - /* - * We have to split the page. Back out the cursor setup, - * discard the stack of pages, and do the split. - */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - - (void)__bam_stkrel(dbp); + /* Discard the stack of pages and split the page. */ + (void)__bam_stkrel(dbc, 0); stack = 0; - if ((ret = __bam_split(dbp, recnop)) != 0) - break; + if ((ret = __bam_split(dbc, recnop)) != 0) + goto err; goto retry; /* NOTREACHED */ default: - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - break; + goto err; } + err: if (stack) - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); return (ret); } diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c index caa6b3515e..8efe4059a8 100644 --- a/db2/btree/bt_rsearch.c +++ b/db2/btree/bt_rsearch.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_rsearch.c 10.15 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_rsearch.c 10.21 (Sleepycat) 12/2/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -59,39 +59,37 @@ static const char sccsid[] = "@(#)bt_rsearch.c 10.15 (Sleepycat) 5/6/98"; * __bam_rsearch -- * Search a btree for a record number. 
* - * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *)); + * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); */ int -__bam_rsearch(dbp, recnop, flags, stop, exactp) - DB *dbp; +__bam_rsearch(dbc, recnop, flags, stop, exactp) + DBC *dbc; db_recno_t *recnop; u_int32_t flags; int stop, *exactp; { BINTERNAL *bi; - BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; PAGE *h; RINTERNAL *ri; db_indx_t indx, top; db_pgno_t pg; db_recno_t i, recno, total; - int isappend, ret, stack; + int ret, stack; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; - /* - * We test for groups of flags, S_APPEND is the only one that can be - * OR'd into the set. Clear it now so that the tests for equality - * will work. - */ - if ((isappend = LF_ISSET(S_APPEND)) != 0) - LF_CLR(S_APPEND); + BT_STK_CLR(cp); /* * There are several ways we search a btree tree. The flags argument * specifies if we're acquiring read or write locks and if we are - * locking pairs of pages. See btree.h for more details. + * locking pairs of pages. In addition, if we're adding or deleting + * an item, we have to lock the entire tree, regardless. See btree.h + * for more details. * * If write-locking pages, we need to know whether or not to acquire a * write lock on a page before getting it. This depends on how deep it @@ -102,15 +100,36 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * Retrieve the root page. */ pg = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, - flags == S_INSERT || flags == S_DELETE ? - DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + stack = LF_ISSET(S_STACK); + if ((ret = __bam_lget(dbc, + 0, pg, stack ? 
DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } - total = RE_NREC(h); + + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock. + */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbc, lock); + if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); + return (ret); + } + stack = 1; + } /* * If appending to the tree, set the record number now -- we have the @@ -124,7 +143,8 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * for the record immediately after the last record in the tree, so do * a fast check now. */ - if (isappend) { + total = RE_NREC(h); + if (LF_ISSET(S_APPEND)) { *exactp = 0; *recnop = recno = total + 1; } else { @@ -133,33 +153,14 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) *exactp = 1; else { *exactp = 0; - if (!PAST_END_OK(flags) || recno > total + 1) { + if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); return (DB_NOTFOUND); } } } - /* Decide if we're building a stack based on the operation. */ - BT_STK_CLR(t); - stack = flags == S_DELETE || flags == S_INSERT; - - /* - * Decide if we need to save this page; if we do, write lock it, and - * start to build a stack. 
- */ - if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) { - (void)memp_fput(dbp->mpf, h, 0); - if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) - return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); - return (ret); - } - stack = 1; - } - /* * !!! * Record numbers in the tree are 0-based, but the recno is @@ -177,7 +178,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * not exist if there are enough deleted records in the * page. */ - if (recno <= NUM_ENT(h)) + if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX) for (i = recno - 1;; --i) { if (B_DISSET(GET_BKEYDATA(h, i * P_INDX + O_INDX)->type)) @@ -185,10 +186,10 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) if (i == 0) break; } - if (recno > NUM_ENT(h)) { + if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) { *exactp = 0; - if (!PAST_END_OK(flags) || - recno > (db_recno_t)(NUM_ENT(h) + 1)) { + if (!LF_ISSET(S_PAST_EOF) || recno > + (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) { ret = DB_NOTFOUND; goto err; } @@ -197,7 +198,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) /* Correct from 1-based to 0-based for a page offset. */ --recno; - BT_STK_ENTER(t, h, recno * P_INDX, lock, ret); + BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret); return (ret); case P_IBTREE: for (indx = 0, top = NUM_ENT(h);;) { @@ -213,7 +214,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) /* Correct from 1-based to 0-based for a page offset. */ --recno; - BT_STK_ENTER(t, h, recno, lock, ret); + BT_STK_ENTER(cp, h, recno, lock, ret); return (ret); case P_IRECNO: for (indx = 0, top = NUM_ENT(h);;) { @@ -232,42 +233,42 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) if (stack) { /* Return if this is the lowest page wanted. 
*/ if (LF_ISSET(S_PARENT) && stop == h->level) { - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); } - BT_STK_PUSH(t, h, indx, lock, ret); - if (ret) + BT_STK_PUSH(cp, h, indx, lock, ret); + if (ret != 0) goto err; - if ((ret = __bam_lget(dbp, 0, pg, - LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, - &lock)) != 0) + if ((ret = + __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) goto err; } else { - (void)memp_fput(dbp->mpf, h, 0); - /* * Decide if we want to return a pointer to the next * page in the stack. If we do, write lock it and * never unlock it. */ - if (LF_ISSET(S_PARENT) && - (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) stack = 1; - if ((ret = __bam_lget(dbp, 1, pg, - LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, - &lock)) != 0) + (void)memp_fput(dbp->mpf, h, 0); + + if ((ret = + __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? + DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) goto err; } - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) goto err; } /* NOTREACHED */ -err: BT_STK_POP(t); - __bam_stkrel(dbp); +err: BT_STK_POP(cp); + __bam_stkrel(dbc, 0); return (ret); } @@ -275,25 +276,29 @@ err: BT_STK_POP(t); * __bam_adjust -- * Adjust the tree after adding or deleting a record. * - * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int32_t)); + * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); */ int -__bam_adjust(dbp, t, adjust) - DB *dbp; - BTREE *t; +__bam_adjust(dbc, adjust) + DBC *dbc; int32_t adjust; { + CURSOR *cp; + DB *dbp; EPG *epg; PAGE *h; int ret; + dbp = dbc->dbp; + cp = dbc->internal; + /* Update the record counts for the tree. 
*/ - for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + for (epg = cp->sp; epg <= cp->csp; ++epg) { h = epg->page; if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { - if (DB_LOGGING(dbp) && + if (DB_LOGGING(dbc) && (ret = __bam_cadjust_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(h), 0, dbp->log_fileid, + dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust, 1)) != 0) return (ret); @@ -317,28 +322,31 @@ __bam_adjust(dbp, t, adjust) * __bam_nrecs -- * Return the number of records in the tree. * - * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *)); + * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); */ int -__bam_nrecs(dbp, rep) - DB *dbp; +__bam_nrecs(dbc, rep) + DBC *dbc; db_recno_t *rep; { + DB *dbp; DB_LOCK lock; PAGE *h; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); *rep = RE_NREC(h); (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); return (0); } diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c index 09ce46d90a..1f439a4261 100644 --- a/db2/btree/bt_search.c +++ b/db2/btree/bt_search.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_search.c 10.15 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_search.c 10.25 (Sleepycat) 12/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -65,38 +65,41 @@ static const char sccsid[] = "@(#)bt_search.c 10.15 (Sleepycat) 5/6/98"; * __bam_search -- * Search a btree for a key. 
* - * PUBLIC: int __bam_search __P((DB *, + * PUBLIC: int __bam_search __P((DBC *, * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); */ int -__bam_search(dbp, key, flags, stop, recnop, exactp) - DB *dbp; +__bam_search(dbc, key, flags, stop, recnop, exactp) + DBC *dbc; const DBT *key; u_int32_t flags; int stop, *exactp; db_recno_t *recnop; { BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; - EPG cur; PAGE *h; db_indx_t base, i, indx, lim; db_pgno_t pg; db_recno_t recno; int cmp, jump, ret, stack; + dbp = dbc->dbp; + cp = dbc->internal; t = dbp->internal; recno = 0; - BT_STK_CLR(t); + BT_STK_CLR(cp); /* * There are several ways we search a btree tree. The flags argument * specifies if we're acquiring read or write locks, if we position * to the first or last item in a set of duplicates, if we return - * deleted items, and if we are locking pairs of pages. See btree.h - * for more details. In addition, if we're doing record numbers, we - * have to lock the entire tree regardless. + * deleted items, and if we are locking pairs of pages. In addition, + * if we're modifying record numbers, we have to lock the entire tree + * regardless. See btree.h for more details. * * If write-locking pages, we need to know whether or not to acquire a * write lock on a page before getting it. This depends on how deep it @@ -108,11 +111,11 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) */ pg = PGNO_ROOT; stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK); - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, pg, stack ? 
DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } @@ -128,14 +131,13 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); - if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + (void)__BT_LPUT(dbc, lock); + if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } - stack = 1; } @@ -147,12 +149,12 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) * per page item. If we find an exact match on a leaf page, * we're done. */ - cur.page = h; jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; for (base = 0, lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) { - cur.indx = indx = base + ((lim >> 1) * jump); - if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) { + indx = base + ((lim >> 1) * jump); + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) { if (TYPE(h) == P_LBTREE) goto match; goto next; @@ -184,7 +186,7 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) * to find an undeleted record. This is handled in the * __bam_c_search() routine. */ - BT_STK_ENTER(t, h, base, lock, ret); + BT_STK_ENTER(cp, h, base, lock, ret); return (ret); } @@ -208,39 +210,39 @@ next: pg = GET_BINTERNAL(h, indx)->pgno; if (stack) { /* Return if this is the lowest page wanted. 
*/ if (LF_ISSET(S_PARENT) && stop == h->level) { - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); } - BT_STK_PUSH(t, h, indx, lock, ret); + BT_STK_PUSH(cp, h, indx, lock, ret); if (ret != 0) goto err; if ((ret = - __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) goto err; } else { - (void)memp_fput(dbp->mpf, h, 0); - /* - * Decide if we want to return a pointer to the next - * page in the stack. If we do, write lock it and - * never unlock it. + * Decide if we want to return a reference to the next + * page in the return stack. If so, lock it and never + * unlock it. */ if ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || (h->level - 1) == LEAFLEVEL) stack = 1; + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = - __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ? + __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) goto err; } - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) goto err; } - /* NOTREACHED */ + match: *exactp = 1; /* @@ -288,17 +290,17 @@ match: *exactp = 1; goto notfound; } - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); notfound: (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); ret = DB_NOTFOUND; -err: if (t->bt_csp > t->bt_sp) { - BT_STK_POP(t); - __bam_stkrel(dbp); +err: if (cp->csp > cp->sp) { + BT_STK_POP(cp); + __bam_stkrel(dbc, 0); } return (ret); } @@ -307,20 +309,35 @@ err: if (t->bt_csp > t->bt_sp) { * __bam_stkrel -- * Release all pages currently held in the stack. 
* - * PUBLIC: int __bam_stkrel __P((DB *)); + * PUBLIC: int __bam_stkrel __P((DBC *, int)); */ int -__bam_stkrel(dbp) - DB *dbp; +__bam_stkrel(dbc, nolocks) + DBC *dbc; + int nolocks; { - BTREE *t; + CURSOR *cp; + DB *dbp; EPG *epg; - t = dbp->internal; - for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); + dbp = dbc->dbp; + cp = dbc->internal; + + /* Release inner pages first. */ + for (epg = cp->sp; epg <= cp->csp; ++epg) { + if (epg->page != NULL) + (void)memp_fput(dbp->mpf, epg->page, 0); + if (epg->lock != LOCK_INVALID) { + if (nolocks) + (void)__BT_LPUT(dbc, epg->lock); + else + (void)__BT_TLPUT(dbc, epg->lock); + } } + + /* Clear the stack, all pages have been released. */ + BT_STK_CLR(cp); + return (0); } @@ -328,24 +345,25 @@ __bam_stkrel(dbp) * __bam_stkgrow -- * Grow the stack. * - * PUBLIC: int __bam_stkgrow __P((BTREE *)); + * PUBLIC: int __bam_stkgrow __P((CURSOR *)); */ int -__bam_stkgrow(t) - BTREE *t; +__bam_stkgrow(cp) + CURSOR *cp; { EPG *p; size_t entries; + int ret; - entries = t->bt_esp - t->bt_sp; + entries = cp->esp - cp->sp; - if ((p = (EPG *)__db_calloc(entries * 2, sizeof(EPG))) == NULL) - return (ENOMEM); - memcpy(p, t->bt_sp, entries * sizeof(EPG)); - if (t->bt_sp != t->bt_stack) - FREE(t->bt_sp, entries * sizeof(EPG)); - t->bt_sp = p; - t->bt_csp = p + entries; - t->bt_esp = p + entries * 2; + if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0) + return (ret); + memcpy(p, cp->sp, entries * sizeof(EPG)); + if (cp->sp != cp->stack) + __os_free(cp->sp, entries * sizeof(EPG)); + cp->sp = p; + cp->csp = p + entries; + cp->esp = p + entries * 2; return (0); } diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c index da9417c781..1d8e926d85 100644 --- a/db2/btree/bt_split.c +++ b/db2/btree/bt_split.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_split.c 10.23 (Sleepycat) 5/23/98"; +static const char sccsid[] = 
"@(#)bt_split.c 10.33 (Sleepycat) 10/13/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -59,27 +59,31 @@ static const char sccsid[] = "@(#)bt_split.c 10.23 (Sleepycat) 5/23/98"; #include "db_page.h" #include "btree.h" -static int __bam_page __P((DB *, EPG *, EPG *)); -static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *)); -static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int)); -static int __bam_root __P((DB *, EPG *)); +static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *)); +static int __bam_page __P((DBC *, EPG *, EPG *)); +static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *)); +static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *)); +static int __bam_root __P((DBC *, EPG *)); +static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); /* * __bam_split -- * Split a page. * - * PUBLIC: int __bam_split __P((DB *, void *)); + * PUBLIC: int __bam_split __P((DBC *, void *)); */ int -__bam_split(dbp, arg) - DB *dbp; +__bam_split(dbc, arg) + DBC *dbc; void *arg; { - BTREE *t; + CURSOR *cp; + DB *dbp; enum { UP, DOWN } dir; int exact, level, ret; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; /* * The locking protocol we use to avoid deadlock to acquire locks by @@ -113,15 +117,16 @@ __bam_split(dbp, arg) * Acquire a page and its parent, locked. */ if ((ret = (dbp->type == DB_BTREE ? - __bam_search(dbp, arg, S_WRPAIR, level, NULL, &exact) : - __bam_rsearch(dbp, + __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbc, (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) return (ret); /* Split the page. */ - ret = t->bt_csp[0].page->pgno == PGNO_ROOT ? - __bam_root(dbp, &t->bt_csp[0]) : - __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]); + ret = cp->csp[0].page->pgno == PGNO_ROOT ? + __bam_root(dbc, &cp->csp[0]) : + __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); + BT_STK_CLR(cp); switch (ret) { case 0: @@ -155,15 +160,16 @@ __bam_split(dbp, arg) * Split the root page of a btree. 
*/ static int -__bam_root(dbp, cp) - DB *dbp; +__bam_root(dbc, cp) + DBC *dbc; EPG *cp; { - BTREE *t; + DB *dbp; PAGE *lp, *rp; + db_indx_t split; int ret; - t = dbp->internal; + dbp = dbc->dbp; /* Yeah, right. */ if (cp->page->level >= MAXBTREELEVEL) { @@ -173,8 +179,8 @@ __bam_root(dbp, cp) /* Create new left and right pages for the split. */ lp = rp = NULL; - if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 || - (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 || + (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) goto err; P_INIT(lp, dbp->pgsize, lp->pgno, PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, @@ -184,18 +190,18 @@ __bam_root(dbp, cp) cp->page->level, TYPE(cp->page)); /* Split the page. */ - if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0) + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { DBT __a; DB_LSN __lsn; memset(&__a, 0, sizeof(__a)); __a.data = cp->page; __a.size = dbp->pgsize; ZERO_LSN(__lsn); - if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn, &__a)) != 0) @@ -205,26 +211,27 @@ __bam_root(dbp, cp) /* Clean up the new root page. */ if ((ret = (dbp->type == DB_RECNO ? - __ram_root(dbp, cp->page, lp, rp) : - __bam_broot(dbp, cp->page, lp, rp))) != 0) + __ram_root(dbc, cp->page, lp, rp) : + __bam_broot(dbc, cp->page, lp, rp))) != 0) goto err; + /* Adjust any cursors. Do it last so we don't have to undo it. */ + __bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1); + /* Success -- write the real pages back to the store. 
*/ (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); - ++t->lstat.bt_split; - ++t->lstat.bt_rootsplit; return (0); err: if (lp != NULL) - (void)__bam_free(dbp, lp); + (void)__bam_free(dbc, lp); if (rp != NULL) - (void)__bam_free(dbp, rp); + (void)__bam_free(dbc, rp); (void)memp_fput(dbp->mpf, cp->page, 0); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); return (ret); } @@ -233,19 +240,22 @@ err: if (lp != NULL) * Split the non-root page of a btree. */ static int -__bam_page(dbp, pp, cp) - DB *dbp; +__bam_page(dbc, pp, cp) + DBC *dbc; EPG *pp, *cp; { + DB *dbp; DB_LOCK tplock; PAGE *lp, *rp, *tp; + db_indx_t split; int ret; + dbp = dbc->dbp; lp = rp = tp = NULL; ret = -1; /* Create new right page for the split. */ - if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) goto err; P_INIT(rp, dbp->pgsize, rp->pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno, @@ -253,13 +263,8 @@ __bam_page(dbp, pp, cp) cp->page->level, TYPE(cp->page)); /* Create new left page for the split. */ - if ((lp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0) goto err; - } -#ifdef DIAGNOSTIC - memset(lp, 0xff, dbp->pgsize); -#endif P_INIT(lp, dbp->pgsize, cp->page->pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, @@ -276,7 +281,7 @@ __bam_page(dbp, pp, cp) * change, we swap the original and the allocated left page after the * split. */ - if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0) + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* @@ -293,19 +298,19 @@ __bam_page(dbp, pp, cp) * the page we're splitting. 
*/ if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) { - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0) goto err; } /* Insert the new pages into the parent page. */ - if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0) + if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { DBT __a; DB_LSN __lsn; memset(&__a, 0, sizeof(__a)); @@ -313,7 +318,7 @@ __bam_page(dbp, pp, cp) __a.size = dbp->pgsize; if (tp == NULL) ZERO_LSN(__lsn); - if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page), &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), tp == NULL ? 0 : PGNO(tp), @@ -329,56 +334,69 @@ __bam_page(dbp, pp, cp) memcpy(cp->page, lp, LOFFSET(lp)); memcpy((u_int8_t *)cp->page + HOFFSET(lp), (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); - FREE(lp, dbp->pgsize); + __os_free(lp, dbp->pgsize); lp = NULL; /* Finish the next-page link. */ if (tp != NULL) tp->prev_pgno = rp->pgno; + /* Adjust any cursors. Do so last so we don't have to undo it. */ + __bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0); + /* Success -- write the real pages back to the store. 
*/ (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, pp->lock); + (void)__BT_TLPUT(dbc, pp->lock); (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); if (tp != NULL) { (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, tplock); + (void)__BT_TLPUT(dbc, tplock); } return (0); err: if (lp != NULL) - FREE(lp, dbp->pgsize); + __os_free(lp, dbp->pgsize); if (rp != NULL) - (void)__bam_free(dbp, rp); + (void)__bam_free(dbc, rp); if (tp != NULL) { (void)memp_fput(dbp->mpf, tp, 0); - (void)__BT_TLPUT(dbp, tplock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, tplock); + else + (void)__BT_TLPUT(dbc, tplock); } (void)memp_fput(dbp->mpf, pp->page, 0); - (void)__BT_TLPUT(dbp, pp->lock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, pp->lock); + else + (void)__BT_TLPUT(dbc, pp->lock); (void)memp_fput(dbp->mpf, cp->page, 0); - (void)__BT_TLPUT(dbp, cp->lock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, cp->lock); + else + (void)__BT_TLPUT(dbc, cp->lock); return (ret); } /* * __bam_broot -- * Fix up the btree root page after it has been split. - * - * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); */ -int -__bam_broot(dbp, rootp, lp, rp) - DB *dbp; +static int +__bam_broot(dbc, rootp, lp, rp) + DBC *dbc; PAGE *rootp, *lp, *rp; { BINTERNAL bi, *child_bi; BKEYDATA *child_bk; + DB *dbp; DBT hdr, data; int ret; + dbp = dbc->dbp; + /* * If the root page was a leaf page, change it into an internal page. 
* We copy the key we split on (but not the key's data, in the case of @@ -405,7 +423,7 @@ __bam_broot(dbp, rootp, lp, rp) hdr.data = &bi; hdr.size = SSZA(BINTERNAL, data); if ((ret = - __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) + __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) return (ret); switch (TYPE(rp)) { @@ -424,13 +442,13 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bi->data; data.size = child_bi->len; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bi->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) return (ret); break; @@ -450,7 +468,7 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bk->data; data.size = child_bk->len; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) return (ret); break; @@ -467,13 +485,13 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bk; data.size = BOVERFLOW_SIZE; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bk->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) return (ret); break; @@ -490,18 +508,19 @@ __bam_broot(dbp, rootp, lp, rp) /* * __ram_root -- * Fix up the recno root page after it has been split. 
- * - * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); */ -int -__ram_root(dbp, rootp, lp, rp) - DB *dbp; +static int +__ram_root(dbc, rootp, lp, rp) + DBC *dbc; PAGE *rootp, *lp, *rp; { + DB *dbp; DBT hdr; RINTERNAL ri; int ret; + dbp = dbc->dbp; + /* Initialize the page. */ P_INIT(rootp, dbp->pgsize, PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); @@ -514,12 +533,12 @@ __ram_root(dbp, rootp, lp, rp) /* Insert the left and right keys, set the header information. */ ri.pgno = lp->pgno; ri.nrecs = __bam_total(lp); - if ((ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_SET(rootp, ri.nrecs); ri.pgno = rp->pgno; ri.nrecs = __bam_total(rp); - if ((ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_ADJ(rootp, ri.nrecs); return (0); @@ -530,14 +549,15 @@ __ram_root(dbp, rootp, lp, rp) * Insert a new key into a parent page, completing the split. */ static int -__bam_pinsert(dbp, parent, lchild, rchild) - DB *dbp; +__bam_pinsert(dbc, parent, lchild, rchild) + DBC *dbc; EPG *parent; PAGE *lchild, *rchild; { BINTERNAL bi, *child_bi; BKEYDATA *child_bk, *tmp_bk; BTREE *t; + DB *dbp; DBT a, b, hdr, data; PAGE *ppage; RINTERNAL ri; @@ -546,6 +566,7 @@ __bam_pinsert(dbp, parent, lchild, rchild) u_int32_t n, nbytes, nksize; int ret; + dbp = dbc->dbp; t = dbp->internal; ppage = parent->page; @@ -600,13 +621,13 @@ __bam_pinsert(dbp, parent, lchild, rchild) memset(&data, 0, sizeof(data)); data.data = child_bi->data; data.size = child_bi->len; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. 
*/ if (B_TYPE(child_bi->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) return (ret); break; @@ -630,10 +651,9 @@ __bam_pinsert(dbp, parent, lchild, rchild) b.size = child_bk->len; b.data = child_bk->data; nksize = t->bt_prefix(&a, &b); - if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) { - t->lstat.bt_pfxsaved += nbytes - n; + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) nbytes = n; - } else + else noprefix: nksize = child_bk->len; if (P_FREESPACE(ppage) < nbytes) @@ -650,7 +670,7 @@ noprefix: nksize = child_bk->len; memset(&data, 0, sizeof(data)); data.data = child_bk->data; data.size = nksize; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) return (ret); break; @@ -672,13 +692,13 @@ noprefix: nksize = child_bk->len; memset(&data, 0, sizeof(data)); data.data = child_bk; data.size = BOVERFLOW_SIZE; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bk->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) return (ret); break; @@ -699,7 +719,7 @@ noprefix: nksize = child_bk->len; hdr.size = RINTERNAL_SIZE; ri.pgno = rchild->pgno; ri.nrecs = nrecs; - if ((ret = __db_pitem(dbp, + if ((ret = __db_pitem(dbc, ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); break; @@ -710,9 +730,9 @@ noprefix: nksize = child_bk->len; /* Adjust the parent page's left page record count. */ if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { /* Log the change. 
*/ - if (DB_LOGGING(dbp) && + if (DB_LOGGING(dbc) && (ret = __bam_cadjust_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(ppage), 0, dbp->log_fileid, + dbc->txn, &LSN(ppage), 0, dbp->log_fileid, PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx, -(int32_t)nrecs, (int32_t)0)) != 0) return (ret); @@ -732,18 +752,18 @@ noprefix: nksize = child_bk->len; * Do the real work of splitting the page. */ static int -__bam_psplit(dbp, cp, lp, rp, cleft) - DB *dbp; +__bam_psplit(dbc, cp, lp, rp, splitret) + DBC *dbc; EPG *cp; PAGE *lp, *rp; - int cleft; + db_indx_t *splitret; { - BTREE *t; + DB *dbp; PAGE *pp; db_indx_t half, nbytes, off, splitp, top; int adjust, cnt, isbigkey, ret; - t = dbp->internal; + dbp = dbc->dbp; pp = cp->page; adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; @@ -762,11 +782,8 @@ __bam_psplit(dbp, cp, lp, rp, cleft) else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) off = adjust; - ++t->lstat.bt_split; - if (off != 0) { - ++t->lstat.bt_fastsplit; + if (off != 0) goto sort; - } /* * Split the data to the left and right pages. Try not to split on @@ -887,8 +904,7 @@ sort: splitp = off; if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) return (ret); - /* Adjust the cursors. 
*/ - __bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft); + *splitret = splitp; return (0); } diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c index 2236434b38..855ef40bbd 100644 --- a/db2/btree/bt_stat.c +++ b/db2/btree/bt_stat.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_stat.c 10.17 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)bt_stat.c 10.27 (Sleepycat) 11/25/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -22,8 +22,6 @@ static const char sccsid[] = "@(#)bt_stat.c 10.17 (Sleepycat) 4/26/98"; #include "db_page.h" #include "btree.h" -static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); - /* * __bam_stat -- * Gather/print the btree statistics @@ -31,62 +29,62 @@ static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); */ int -__bam_stat(argdbp, spp, db_malloc, flags) - DB *argdbp; +__bam_stat(dbp, spp, db_malloc, flags) + DB *dbp; void *spp; void *(*db_malloc) __P((size_t)); u_int32_t flags; { BTMETA *meta; BTREE *t; - DB *dbp; + DBC *dbc; DB_BTREE_STAT *sp; DB_LOCK lock; PAGE *h; db_pgno_t lastpgno, pgno; - int ret; + int ret, t_ret; - DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags); + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_statchk(argdbp, flags)) != 0) + if ((ret = __db_statchk(dbp, flags)) != 0) return (ret); - if (spp == NULL) - return (0); + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags); - GETHANDLE(argdbp, NULL, &dbp, ret); t = dbp->internal; + if (spp == NULL) + return (0); + /* Allocate and clear the structure. */ - if ((sp = db_malloc == NULL ? 
- (DB_BTREE_STAT *)__db_malloc(sizeof(*sp)) : - (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(sizeof(*sp), db_malloc, &sp)) != 0) goto err; - } memset(sp, 0, sizeof(*sp)); /* If the app just wants the record count, make it fast. */ - if (LF_ISSET(DB_RECORDCOUNT)) { + if (flags == DB_RECORDCOUNT) { pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&h)) != 0) goto err; sp->bt_nrecs = RE_NREC(h); (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); goto done; } /* Get the meta-data page. */ pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) goto err; /* Translate the metadata flags. */ @@ -110,24 +108,13 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Get the page size from the DB. */ sp->bt_pagesize = dbp->pgsize; - /* Initialize counters with the meta-data page information. */ - __bam_add_rstat(&meta->stat, sp); - - /* - * Add in the local information from this handle. - * - * !!! - * This is a bit odd, but it gets us closer to the truth. - */ - __bam_add_rstat(&t->lstat, sp); - /* Walk the free list, counting pages. */ for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) { ++sp->bt_free; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { (void)memp_fput(dbp->mpf, meta, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); goto err; } pgno = h->next_pgno; @@ -136,7 +123,7 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Discard the meta-data page. 
*/ (void)memp_fput(dbp->mpf, meta, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); /* Determine the last page of the database. */ if ((ret = memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &h)) != 0) @@ -145,10 +132,10 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Get the root page. */ pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); goto err; } @@ -185,19 +172,19 @@ __bam_stat(argdbp, spp, db_malloc, flags) break; default: (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); return (__db_pgfmt(dbp, pgno)); } (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); if (++pgno > lastpgno) break; - if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) + if (__bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) break; if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) { - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); break; } } @@ -205,50 +192,7 @@ __bam_stat(argdbp, spp, db_malloc, flags) done: *(DB_BTREE_STAT **)spp = sp; ret = 0; -err: PUTHANDLE(dbp); +err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } - -/* - * __bam_add_mstat -- - * Add the local statistics to the meta-data page statistics. 
- * - * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); - */ -void -__bam_add_mstat(from, to) - DB_BTREE_LSTAT *from; - DB_BTREE_LSTAT *to; -{ - to->bt_freed += from->bt_freed; - to->bt_pfxsaved += from->bt_pfxsaved; - to->bt_split += from->bt_split; - to->bt_rootsplit += from->bt_rootsplit; - to->bt_fastsplit += from->bt_fastsplit; - to->bt_added += from->bt_added; - to->bt_deleted += from->bt_deleted; - to->bt_get += from->bt_get; - to->bt_cache_hit += from->bt_cache_hit; - to->bt_cache_miss += from->bt_cache_miss; -} - -/* - * __bam_add_rstat -- - * Add the local statistics to the returned statistics. - */ -static void -__bam_add_rstat(from, to) - DB_BTREE_LSTAT *from; - DB_BTREE_STAT *to; -{ - to->bt_freed += from->bt_freed; - to->bt_pfxsaved += from->bt_pfxsaved; - to->bt_split += from->bt_split; - to->bt_rootsplit += from->bt_rootsplit; - to->bt_fastsplit += from->bt_fastsplit; - to->bt_added += from->bt_added; - to->bt_deleted += from->bt_deleted; - to->bt_get += from->bt_get; - to->bt_cache_hit += from->bt_cache_hit; - to->bt_cache_miss += from->bt_cache_miss; -} diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c index 75eadb1d62..95ea76e2cd 100644 --- a/db2/btree/btree_auto.c +++ b/db2/btree/btree_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "btree.h" @@ -43,8 +42,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_pg_alloc; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -55,8 +53,8 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(ptype) + sizeof(next); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -90,7 +88,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -135,7 +133,7 @@ __bam_pg_alloc_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tptype: %lu\n", (u_long)argp->ptype); printf("\tnext: %lu\n", (u_long)argp->next); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -149,11 +147,12 @@ __bam_pg_alloc_read(recbuf, argpp) { __bam_pg_alloc_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_pg_alloc_args *)__db_malloc(sizeof(__bam_pg_alloc_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_pg_alloc_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -206,8 +205,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_pg_free; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -217,8 +215,8 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, + sizeof(*meta_lsn) + sizeof(u_int32_t) + (header == NULL ? 
0 : header->size) + sizeof(next); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -255,7 +253,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -306,7 +304,7 @@ __bam_pg_free_print(notused1, dbtp, lsnp, notused2, notused3) printf("\n"); printf("\tnext: %lu\n", (u_long)argp->next); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -320,11 +318,12 @@ __bam_pg_free_read(recbuf, argpp) { __bam_pg_free_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_pg_free_args *)__db_malloc(sizeof(__bam_pg_free_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_pg_free_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -383,8 +382,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_split; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -398,8 +396,8 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, + sizeof(npgno) + sizeof(*nlsn) + sizeof(u_int32_t) + (pg == NULL ? 
0 : pg->size); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -450,7 +448,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -507,7 +505,7 @@ __bam_split_print(notused1, dbtp, lsnp, notused2, notused3) } printf("\n"); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -521,11 +519,12 @@ __bam_split_read(recbuf, argpp) { __bam_split_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_split_args *)__db_malloc(sizeof(__bam_split_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_split_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -587,8 +586,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_rsplit; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -599,8 +597,8 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, + sizeof(nrec) + sizeof(u_int32_t) + (rootent == NULL ? 
0 : rootent->size) + sizeof(*rootlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -647,7 +645,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -707,7 +705,7 @@ __bam_rsplit_print(notused1, dbtp, lsnp, notused2, notused3) printf("\trootlsn: [%lu][%lu]\n", (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -721,11 +719,12 @@ __bam_rsplit_read(recbuf, argpp) { __bam_rsplit_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_rsplit_args *)__db_malloc(sizeof(__bam_rsplit_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_rsplit_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -782,8 +781,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_adj; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -794,8 +792,8 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, + sizeof(indx) + sizeof(indx_copy) + sizeof(is_insert); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -826,7 +824,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -870,7 +868,7 @@ __bam_adj_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); printf("\tis_insert: %lu\n", (u_long)argp->is_insert); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -884,11 +882,12 @@ __bam_adj_read(recbuf, argpp) { __bam_adj_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_adj_args *)__db_malloc(sizeof(__bam_adj_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_adj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -941,8 +940,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_cadjust; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -953,8 +951,8 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, + sizeof(indx) + sizeof(adjust) + sizeof(total); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -985,7 +983,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1029,7 +1027,7 @@ __bam_cadjust_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tadjust: %ld\n", (long)argp->adjust); printf("\ttotal: %ld\n", (long)argp->total); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1043,11 +1041,12 @@ __bam_cadjust_read(recbuf, argpp) { __bam_cadjust_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_cadjust_args *)__db_malloc(sizeof(__bam_cadjust_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_cadjust_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1097,8 +1096,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_cdel; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1107,8 +1105,8 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(*lsn) + sizeof(indx); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1135,7 +1133,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1177,7 +1175,7 @@ __bam_cdel_print(notused1, dbtp, lsnp, notused2, notused3) (u_long)argp->lsn.file, (u_long)argp->lsn.offset); printf("\tindx: %lu\n", (u_long)argp->indx); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1191,11 +1189,12 @@ __bam_cdel_read(recbuf, argpp) { __bam_cdel_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_cdel_args *)__db_malloc(sizeof(__bam_cdel_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_cdel_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1250,8 +1249,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_repl; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1265,8 +1263,8 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (repl == NULL ? 
0 : repl->size) + sizeof(prefix) + sizeof(suffix); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1319,7 +1317,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1382,7 +1380,7 @@ __bam_repl_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tprefix: %lu\n", (u_long)argp->prefix); printf("\tsuffix: %lu\n", (u_long)argp->suffix); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1396,11 +1394,12 @@ __bam_repl_read(recbuf, argpp) { __bam_repl_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_repl_args *)__db_malloc(sizeof(__bam_repl_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_repl_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); |