summary refs log tree commit diff
path: root/db2
diff options
context:
space:
mode:
Diffstat (limited to 'db2')
-rw-r--r--db2/Makefile5
-rw-r--r--db2/btree/bt_close.c177
-rw-r--r--db2/btree/bt_compare.c107
-rw-r--r--db2/btree/bt_conv.c15
-rw-r--r--db2/btree/bt_curadj.c272
-rw-r--r--db2/btree/bt_cursor.c1738
-rw-r--r--db2/btree/bt_delete.c512
-rw-r--r--db2/btree/bt_open.c240
-rw-r--r--db2/btree/bt_page.c141
-rw-r--r--db2/btree/bt_put.c571
-rw-r--r--db2/btree/bt_rec.c115
-rw-r--r--db2/btree/bt_recno.c975
-rw-r--r--db2/btree/bt_rsearch.c164
-rw-r--r--db2/btree/bt_search.c132
-rw-r--r--db2/btree/bt_split.c212
-rw-r--r--db2/btree/bt_stat.c122
-rw-r--r--db2/btree/btree_auto.c161
-rw-r--r--db2/common/db_appinit.c306
-rw-r--r--db2/common/db_apprec.c67
-rw-r--r--db2/common/db_err.c624
-rw-r--r--db2/common/db_region.c129
-rw-r--r--db2/common/db_salloc.c4
-rw-r--r--db2/db.h389
-rw-r--r--db2/db/db.c313
-rw-r--r--db2/db/db.src13
-rw-r--r--db2/db/db_am.c430
-rw-r--r--db2/db/db_auto.c299
-rw-r--r--db2/db/db_dispatch.c41
-rw-r--r--db2/db/db_dup.c511
-rw-r--r--db2/db/db_iface.c488
-rw-r--r--db2/db/db_join.c271
-rw-r--r--db2/db/db_overflow.c129
-rw-r--r--db2/db/db_pr.c110
-rw-r--r--db2/db/db_rec.c155
-rw-r--r--db2/db/db_ret.c21
-rw-r--r--db2/db/db_thread.c121
-rw-r--r--db2/db185/db185.c97
-rw-r--r--db2/db_185.h10
-rw-r--r--db2/db_int.h141
-rw-r--r--db2/dbm/dbm.c199
-rw-r--r--db2/hash/hash.c1151
-rw-r--r--db2/hash/hash_auto.c161
-rw-r--r--db2/hash/hash_debug.c92
-rw-r--r--db2/hash/hash_dup.c295
-rw-r--r--db2/hash/hash_page.c1088
-rw-r--r--db2/hash/hash_rec.c281
-rw-r--r--db2/hash/hash_stat.c37
-rw-r--r--db2/include/btree.h233
-rw-r--r--db2/include/btree_ext.h76
-rw-r--r--db2/include/clib_ext.h6
-rw-r--r--db2/include/common_ext.h22
-rw-r--r--db2/include/db.h.src994
-rw-r--r--db2/include/db_am.h39
-rw-r--r--db2/include/db_auto.h13
-rw-r--r--db2/include/db_cxx.h158
-rw-r--r--db2/include/db_ext.h65
-rw-r--r--db2/include/db_int.h.src402
-rw-r--r--db2/include/db_join.h23
-rw-r--r--db2/include/db_page.h56
-rw-r--r--db2/include/hash.h156
-rw-r--r--db2/include/hash_ext.h61
-rw-r--r--db2/include/lock.h22
-rw-r--r--db2/include/lock_ext.h3
-rw-r--r--db2/include/log.h37
-rw-r--r--db2/include/log_ext.h5
-rw-r--r--db2/include/mp.h24
-rw-r--r--db2/include/mp_ext.h4
-rw-r--r--db2/include/os.h24
-rw-r--r--db2/include/os_ext.h28
-rw-r--r--db2/include/os_jump.h (renamed from db2/include/os_func.h)31
-rw-r--r--db2/include/txn.h33
-rw-r--r--db2/include/txn_auto.h26
-rw-r--r--db2/include/txn_ext.h22
-rw-r--r--db2/include/xa.h179
-rw-r--r--db2/include/xa_ext.h13
-rw-r--r--db2/lock/lock.c386
-rw-r--r--db2/lock/lock_conflict.c8
-rw-r--r--db2/lock/lock_deadlock.c76
-rw-r--r--db2/lock/lock_region.c59
-rw-r--r--db2/lock/lock_util.c8
-rw-r--r--db2/log/log.c194
-rw-r--r--db2/log/log_archive.c123
-rw-r--r--db2/log/log_auto.c21
-rw-r--r--db2/log/log_findckp.c32
-rw-r--r--db2/log/log_get.c54
-rw-r--r--db2/log/log_put.c162
-rw-r--r--db2/log/log_rec.c198
-rw-r--r--db2/log/log_register.c76
-rw-r--r--db2/mp/mp_bh.c176
-rw-r--r--db2/mp/mp_fget.c27
-rw-r--r--db2/mp/mp_fopen.c123
-rw-r--r--db2/mp/mp_fput.c14
-rw-r--r--db2/mp/mp_fset.c4
-rw-r--r--db2/mp/mp_open.c45
-rw-r--r--db2/mp/mp_pr.c26
-rw-r--r--db2/mp/mp_region.c42
-rw-r--r--db2/mp/mp_sync.c166
-rw-r--r--db2/mutex/alpha.dec25
-rw-r--r--db2/mutex/alpha.gcc52
-rw-r--r--db2/mutex/mutex.c49
-rw-r--r--db2/mutex/parisc.hp29
-rw-r--r--db2/mutex/uts4_cc.s (renamed from db2/mutex/uts4.cc.s)0
-rw-r--r--db2/os/os_abs.c8
-rw-r--r--db2/os/os_alloc.c202
-rw-r--r--db2/os/os_config.c66
-rw-r--r--db2/os/os_dir.c25
-rw-r--r--db2/os/os_fid.c8
-rw-r--r--db2/os/os_fsync.c42
-rw-r--r--db2/os/os_map.c72
-rw-r--r--db2/os/os_oflags.c2
-rw-r--r--db2/os/os_open.c75
-rw-r--r--db2/os/os_rw.c82
-rw-r--r--db2/os/os_seek.c22
-rw-r--r--db2/os/os_sleep.c6
-rw-r--r--db2/os/os_spin.c92
-rw-r--r--db2/os/os_stat.c11
-rw-r--r--db2/os/os_tmpdir.c113
-rw-r--r--db2/os/os_unlink.c15
-rw-r--r--db2/progs/db_archive/db_archive.c53
-rw-r--r--db2/progs/db_checkpoint/db_checkpoint.c39
-rw-r--r--db2/progs/db_deadlock/db_deadlock.c34
-rw-r--r--db2/progs/db_dump/db_dump.c82
-rw-r--r--db2/progs/db_load/db_load.c102
-rw-r--r--db2/progs/db_printlog/README22
-rw-r--r--db2/progs/db_printlog/commit.awk7
-rw-r--r--db2/progs/db_printlog/count.awk9
-rw-r--r--db2/progs/db_printlog/db_printlog.c83
-rw-r--r--db2/progs/db_printlog/pgno.awk43
-rw-r--r--db2/progs/db_printlog/range.awk27
-rw-r--r--db2/progs/db_printlog/status.awk26
-rw-r--r--db2/progs/db_printlog/txn.awk30
-rw-r--r--db2/progs/db_recover/db_recover.c25
-rw-r--r--db2/progs/db_stat/db_stat.c56
-rw-r--r--db2/txn/txn.c445
-rw-r--r--db2/txn/txn.src36
-rw-r--r--db2/txn/txn_auto.c357
-rw-r--r--db2/txn/txn_rec.c196
-rw-r--r--db2/xa/xa.c682
-rw-r--r--db2/xa/xa_db.c308
-rw-r--r--db2/xa/xa_map.c305
140 files changed, 12435 insertions, 9935 deletions
diff --git a/db2/Makefile b/db2/Makefile
index da1c622642..9020ce5f6a 100644
--- a/db2/Makefile
+++ b/db2/Makefile
@@ -45,9 +45,8 @@ distribute = db_int.h config.h compat.h clib/getlong.c btree/btree.src \
 				  mp.h mp_ext.h mutex_ext.h os_ext.h queue.h \
 				  shqueue.h txn.h txn_auto.h txn_ext.h \
 				  os.h os_jump.h xa.h xa_ext.h) \
-	     $(addprefix mutex/,x86.gcc uts4_cc.s sparc.gcc parisc.hp \
-				parisc.gcc alpha.gcc alpha.dec README \
-				68020.gcc tsl_parisc.s sco.cc)
+	     $(addprefix mutex/,x86.gcc uts4_cc.s sparc.gcc parisc.gcc \
+				README 68020.gcc tsl_parisc.s sco.cc)
 
 vpath %.c $(subdir-dirs)
 
diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c
deleted file mode 100644
index 9df5c717e6..0000000000
--- a/db2/btree/bt_close.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- */
-/*
- * Copyright (c) 1990, 1993, 1994, 1995, 1996
- *	Keith Bostic.  All rights reserved.
- */
-/*
- * Copyright (c) 1990, 1993, 1994, 1995
- *	The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#ifndef lint
-static const char sccsid[] = "@(#)bt_close.c	10.32 (Sleepycat) 5/6/98";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "btree.h"
-
-static void __bam_upstat __P((DB *dbp));
-
-/*
- * __bam_close --
- *	Close a btree.
- *
- * PUBLIC: int __bam_close __P((DB *));
- */
-int
-__bam_close(dbp)
-	DB *dbp;
-{
-	BTREE *t;
-
-	DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0);
-
-	t = dbp->internal;
-
-	/* Update tree statistics. */
-	__bam_upstat(dbp);
-
-	/* Free any allocated memory. */
-	if (t->bt_rkey.data)
-		FREE(t->bt_rkey.data, t->bt_rkey.size);
-	if (t->bt_rdata.data)
-		FREE(t->bt_rdata.data, t->bt_rdata.ulen);
-	if (t->bt_sp != t->bt_stack)
-		FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG));
-
-	FREE(t, sizeof(BTREE));
-	dbp->internal = NULL;
-
-	return (0);
-}
-
-/*
- * __bam_sync --
- *	Sync the btree to disk.
- *
- * PUBLIC: int __bam_sync __P((DB *, u_int32_t));
- */
-int
-__bam_sync(argdbp, flags)
-	DB *argdbp;
-	u_int32_t flags;
-{
-	DB *dbp;
-	int ret;
-
-	DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags);
-
-	/* Check for invalid flags. */
-	if ((ret = __db_syncchk(argdbp, flags)) != 0)
-		return (ret);
-
-	/* If it wasn't possible to modify the file, we're done. */
-	if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY))
-		return (0);
-
-	GETHANDLE(argdbp, NULL, &dbp, ret);
-
-	/* Flush any dirty pages from the cache to the backing file. */
-	if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE)
-		ret = 0;
-
-	PUTHANDLE(dbp);
-	return (ret);
-}
-
-/*
- * __bam_upstat --
- *	Update tree statistics.
- */
-static void
-__bam_upstat(dbp)
-	DB *dbp;
-{
-	BTREE *t;
-	BTMETA *meta;
-	DB_LOCK metalock;
-	db_pgno_t pgno;
-	u_int32_t flags;
-
-	/*
-	 * We use a no-op log call to log the update of the statistics onto the
-	 * metadata page.  The Db->close call isn't transaction protected to
-	 * start with, and I'm not sure what undoing a statistics update means,
-	 * anyway.
-	 */
-	if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY))
-		return;
-
-	flags = 0;
-	pgno = PGNO_METADATA;
-
-	/* Lock and retrieve the page. */
-	if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock) != 0)
-		return;
-	if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) == 0) {
-		/* Log the change. */
-		if (DB_LOGGING(dbp) &&
-		    __db_noop_log(dbp->dbenv->lg_info, dbp->txn, &LSN(meta), 0,
-		    dbp->log_fileid, PGNO_METADATA, &LSN(meta)) != 0)
-			goto err;
-
-		/* Update the statistics. */
-		t = dbp->internal;
-		__bam_add_mstat(&t->lstat, &meta->stat);
-
-		flags = DB_MPOOL_DIRTY;
-	}
-
-err:	(void)memp_fput(dbp->mpf, (PAGE *)meta, flags);
-	(void)__BT_LPUT(dbp, metalock);
-}
diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c
index 5c6d1e38ca..c60f920612 100644
--- a/db2/btree/bt_compare.c
+++ b/db2/btree/bt_compare.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_compare.c	10.9 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_compare.c	10.14 (Sleepycat) 10/9/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -64,93 +64,76 @@ static const char sccsid[] = "@(#)bt_compare.c	10.9 (Sleepycat) 5/6/98";
  * __bam_cmp --
  *	Compare a key to a given record.
  *
- * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *));
+ * PUBLIC: int __bam_cmp __P((DB *, const DBT *,
+ * PUBLIC:    PAGE *, u_int32_t, int (*)(const DBT *, const DBT *)));
  */
 int
-__bam_cmp(dbp, k1, e)
+__bam_cmp(dbp, dbt, h, indx, func)
 	DB *dbp;
-	const DBT *k1;
-	EPG *e;
+	const DBT *dbt;
+	PAGE *h;
+	u_int32_t indx;
+	int (*func)__P((const DBT *, const DBT *));
 {
 	BINTERNAL *bi;
 	BKEYDATA *bk;
 	BOVERFLOW *bo;
-	BTREE *t;
-	DBT k2;
-	PAGE *h;
-
-	t = dbp->internal;
+	DBT pg_dbt;
+	int ret;
 
 	/*
 	 * Returns:
-	 *	< 0 if k1 is < record
-	 *	= 0 if k1 is = record
-	 *	> 0 if k1 is > record
+	 *	< 0 if dbt is < page record
+	 *	= 0 if dbt is = page record
+	 *	> 0 if dbt is > page record
 	 *
-	 * The left-most key on internal pages, at any level of the tree, is
-	 * guaranteed, by the following code, to be less than any user key.
-	 * This saves us from having to update the leftmost key on an internal
-	 * page when the user inserts a new key in the tree smaller than
-	 * anything we've yet seen.
+	 * !!!
+	 * We do not clear the pg_dbt DBT even though it's likely to contain
+	 * random bits.  That should be okay, because the app's comparison
+	 * routine had better not be looking at fields other than data/size.
+	 * We don't clear it because we go through this path a lot and it's
+	 * expensive.
 	 */
-	h = e->page;
-	if (e->indx == 0 &&
-	    h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE)
-		return (1);
-
-	bo = NULL;
-	if (TYPE(h) == P_LBTREE) {
-		bk = GET_BKEYDATA(h, e->indx);
+	if (TYPE(h) == P_LBTREE || TYPE(h) == P_DUPLICATE) {
+		bk = GET_BKEYDATA(h, indx);
 		if (B_TYPE(bk->type) == B_OVERFLOW)
 			bo = (BOVERFLOW *)bk;
 		else {
-			k2.data = bk->data;
-			k2.size = bk->len;
+			pg_dbt.data = bk->data;
+			pg_dbt.size = bk->len;
+			return (func(dbt, &pg_dbt));
 		}
 	} else {
-		bi = GET_BINTERNAL(h, e->indx);
-		if (B_TYPE(bi->type) == B_OVERFLOW)
-			bo = (BOVERFLOW *)(bi->data);
-		else {
-			k2.data = bi->data;
-			k2.size = bi->len;
-		}
-	}
-
-	/*
-	 * XXX
-	 * We ignore system errors; the only recoverable one is ENOMEM, and we
-	 * don't want to require that comparison routines handle random errors.
-	 * We don't want to return a valid comparison, either, so we stop.
-	 */
-	if (bo != NULL) {
 		/*
-		 * If using the default comparison routine, use __db_moff(),
-		 * which compares the overflow key a page at a time.
+		 * The following code guarantees that the left-most key on an
+		 * internal page at any level of the btree is less than any
+		 * user specified key.  This saves us from having to update the
+		 * leftmost key on an internal page when the user inserts a new
+		 * key in the tree smaller than anything we've seen before.
 		 */
-		if (t->bt_compare == __bam_defcmp)
-			return (__db_moff(dbp, k1, bo->pgno));
+		if (indx == 0 && h->prev_pgno == PGNO_INVALID)
+			return (1);
 
-		/*
-		 * Otherwise, we need a contiguous record so we can hand it
-		 * to the user's routine.
-		 */
-		memset(&k2, 0, sizeof(k2));
-		if (__db_goff(dbp, &k2, bo->tlen,
-		    bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0) {
-			(void)__db_panic(dbp);
-			return (0);
+		bi = GET_BINTERNAL(h, indx);
+		if (B_TYPE(bi->type) == B_OVERFLOW)
+			bo = (BOVERFLOW *)(bi->data);
+		else {
+			pg_dbt.data = bi->data;
+			pg_dbt.size = bi->len;
+			return (func(dbt, &pg_dbt));
 		}
 	}
 
 	/*
+	 * Overflow.
+	 *
 	 * XXX
-	 * Note, we have not cleared the k2 DBT in this path.  This should
-	 * be okay, because the user's comparison routine had better not be
-	 * looking at any fields other than the data/size.  We don't clear
-	 * it because we go through this path a lot and it's expensive.
+	 * We ignore __db_moff() errors, because we have no way of returning
+	 * them.
 	 */
-	return ((*t->bt_compare)(k1, &k2));
+	(void) __db_moff(dbp,
+	    dbt, bo->pgno, bo->tlen, func == __bam_defcmp ? NULL : func, &ret);
+	return (ret);
 }
 
 /*
diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c
index 3da4507723..a3069082ae 100644
--- a/db2/btree/bt_conv.c
+++ b/db2/btree/bt_conv.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_conv.c	10.6 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)bt_conv.c	10.7 (Sleepycat) 9/20/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -90,18 +90,5 @@ __bam_mswap(pg)
 	SWAP32(p);		/* free */
 	SWAP32(p);		/* flags */
 
-	/* Swap the statistics. */
-	p = (u_int8_t *)&((BTMETA *)pg)->stat;
-	SWAP32(p);		/* bt_freed */
-	SWAP32(p);		/* bt_pfxsaved */
-	SWAP32(p);		/* bt_split */
-	SWAP32(p);		/* bt_rootsplit */
-	SWAP32(p);		/* bt_fastsplit */
-	SWAP32(p);		/* bt_added */
-	SWAP32(p);		/* bt_deleted */
-	SWAP32(p);		/* bt_get */
-	SWAP32(p);		/* bt_cache_hit */
-	SWAP32(p);		/* bt_cache_miss */
-
 	return (0);
 }
diff --git a/db2/btree/bt_curadj.c b/db2/btree/bt_curadj.c
new file mode 100644
index 0000000000..9b86fbb6d7
--- /dev/null
+++ b/db2/btree/bt_curadj.c
@@ -0,0 +1,272 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_curadj.c	10.69 (Sleepycat) 12/2/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+#ifdef DEBUG
+/*
+ * __bam_cprint --
+ *	Display the current cursor list.
+ *
+ * PUBLIC: int __bam_cprint __P((DB *));
+ */
+int
+__bam_cprint(dbp)
+	DB *dbp;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		fprintf(stderr,
+	    "%#0x->%#0x: page: %lu index: %lu dpage %lu dindex: %lu recno: %lu",
+		    (u_int)dbc, (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx,
+		    (u_long)cp->dpgno, (u_long)cp->dindx, (u_long)cp->recno);
+		if (F_ISSET(cp, C_DELETED))
+			fprintf(stderr, " (deleted)");
+		fprintf(stderr, "\n");
+	}
+	DB_THREAD_UNLOCK(dbp);
+
+	return (0);
+}
+#endif /* DEBUG */
+
+/*
+ * __bam_ca_delete --
+ *	Update the cursors when items are deleted and when already deleted
+ *	items are overwritten.  Return the number of relevant cursors found.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, delete)
+	DB *dbp;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	int delete;
+{
+	DBC *dbc;
+	CURSOR *cp;
+	int count;		/* !!!: Has to contain max number of cursors. */
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return (0);
+
+	/*
+	 * Adjust the cursors.  We don't have to review the cursors for any
+	 * thread of control other than the current one, because we have the
+	 * page write locked at this point, and any other thread of control
+	 * had better be using a different locker ID, meaning only cursors in
+	 * our thread of control can be on the page.
+	 *
+	 * It's possible for multiple cursors within the thread to have write
+	 * locks on the same page, but, cursors within a thread must be single
+	 * threaded, so all we're locking here is the cursor linked list.
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (count = 0, dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+
+		if ((cp->pgno == pgno && cp->indx == indx) ||
+		    (cp->dpgno == pgno && cp->dindx == indx)) {
+			if (delete)
+				F_SET(cp, C_DELETED);
+			else
+				F_CLR(cp, C_DELETED);
+			++count;
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+
+	return (count);
+}
+
+/*
+ * __bam_ca_di --
+ *	Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int));
+ */
+void
+__bam_ca_di(dbp, pgno, indx, adjust)
+	DB *dbp;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	int adjust;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		if (cp->pgno == pgno && cp->indx >= indx)
+			cp->indx += adjust;
+		if (cp->dpgno == pgno && cp->dindx >= indx)
+			cp->dindx += adjust;
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_ca_dup --
+ *	Adjust the cursors when moving items from a leaf page to a duplicates
+ *	page.
+ *
+ * PUBLIC: void __bam_ca_dup __P((DB *,
+ * PUBLIC:    db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+void
+__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti)
+	DB *dbp;
+	db_pgno_t fpgno, tpgno;
+	u_int32_t first, fi, ti;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		/*
+		 * Ignore matching entries that have already been moved,
+		 * we move from the same location on the leaf page more
+		 * than once.
+		 */
+		if (cp->dpgno == PGNO_INVALID &&
+		    cp->pgno == fpgno && cp->indx == fi) {
+			cp->indx = first;
+			cp->dpgno = tpgno;
+			cp->dindx = ti;
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_ca_rsplit --
+ *	Adjust the cursors when doing reverse splits.
+ *
+ * PUBLIC: void __bam_ca_rsplit __P((DB *, db_pgno_t, db_pgno_t));
+ */
+void
+__bam_ca_rsplit(dbp, fpgno, tpgno)
+	DB *dbp;
+	db_pgno_t fpgno, tpgno;
+{
+	CURSOR *cp;
+	DBC *dbc;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		if (cp->pgno == fpgno)
+			cp->pgno = tpgno;
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
+
+/*
+ * __bam_ca_split --
+ *	Adjust the cursors when splitting a page.
+ *
+ * PUBLIC: void __bam_ca_split __P((DB *,
+ * PUBLIC:    db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+void
+__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft)
+	DB *dbp;
+	db_pgno_t ppgno, lpgno, rpgno;
+	u_int32_t split_indx;
+	int cleft;
+{
+	DBC *dbc;
+	CURSOR *cp;
+
+	/* Recno is responsible for its own adjustments. */
+	if (dbp->type == DB_RECNO)
+		return;
+
+	/*
+	 * Adjust the cursors.  See the comment in __bam_ca_delete().
+	 *
+	 * If splitting the page that a cursor was on, the cursor has to be
+	 * adjusted to point to the same record as before the split.  Most
+	 * of the time we don't adjust pointers to the left page, because
+	 * we're going to copy its contents back over the original page.  If
+	 * the cursor is on the right page, it is decremented by the number of
+	 * records split to the left page.
+	 */
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		cp = (CURSOR *)dbc->internal;
+		if (cp->pgno == ppgno) {
+			if (cp->indx < split_indx) {
+				if (cleft)
+					cp->pgno = lpgno;
+			} else {
+				cp->pgno = rpgno;
+				cp->indx -= split_indx;
+			}
+		}
+		if (cp->dpgno == ppgno) {
+			if (cp->dindx < split_indx) {
+				if (cleft)
+					cp->dpgno = lpgno;
+			} else {
+				cp->dpgno = rpgno;
+				cp->dindx -= split_indx;
+			}
+		}
+	}
+	DB_THREAD_UNLOCK(dbp);
+}
diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c
index 5d3366a3a1..10bc095c9d 100644
--- a/db2/btree/bt_cursor.c
+++ b/db2/btree/bt_cursor.c
@@ -8,148 +8,219 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_cursor.c	10.53 (Sleepycat) 5/25/98";
+static const char sccsid[] = "@(#)bt_cursor.c	10.81 (Sleepycat) 12/16/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <stdlib.h>
 #include <string.h>
 #endif
 
 #include "db_int.h"
 #include "db_page.h"
 #include "btree.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
 
 static int __bam_c_close __P((DBC *));
 static int __bam_c_del __P((DBC *, u_int32_t));
-static int __bam_c_first __P((DB *, CURSOR *));
+static int __bam_c_destroy __P((DBC *));
+static int __bam_c_first __P((DBC *, CURSOR *));
 static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
-static int __bam_c_getstack __P((DB *, CURSOR *));
-static int __bam_c_last __P((DB *, CURSOR *));
-static int __bam_c_next __P((DB *, CURSOR *, int));
-static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *));
-static int __bam_c_prev __P((DB *, CURSOR *));
+static int __bam_c_getstack __P((DBC *, CURSOR *));
+static int __bam_c_last __P((DBC *, CURSOR *));
+static int __bam_c_next __P((DBC *, CURSOR *, int));
+static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
+static int __bam_c_prev __P((DBC *, CURSOR *));
 static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
-static int __bam_c_rget __P((DB *, CURSOR *, DBT *, u_int32_t));
-static int __bam_c_search
-	       __P((DB *, CURSOR *, const DBT *, u_int32_t, int, int *));
+static void __bam_c_reset __P((CURSOR *));
+static int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
+static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
+static int __bam_dsearch __P((DBC *, CURSOR *,  DBT *, u_int32_t *));
 
 /* Discard the current page/lock held by a cursor. */
 #undef	DISCARD
-#define	DISCARD(dbp, cp) {						\
+#define	DISCARD(dbc, cp) {						\
 	if ((cp)->page != NULL) {					\
-		(void)memp_fput(dbp->mpf, (cp)->page, 0);		\
+		(void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0);	\
 		(cp)->page = NULL;					\
 	}								\
 	if ((cp)->lock != LOCK_INVALID) {				\
-		(void)__BT_TLPUT((dbp), (cp)->lock);			\
+		(void)__BT_TLPUT((dbc), (cp)->lock);			\
 		(cp)->lock = LOCK_INVALID;				\
 	}								\
 }
 
+/* If the cursor references a deleted record. */
+#undef	IS_CUR_DELETED
+#define	IS_CUR_DELETED(cp)						\
+	(((cp)->dpgno == PGNO_INVALID &&				\
+	B_DISSET(GET_BKEYDATA((cp)->page,				\
+	(cp)->indx + O_INDX)->type)) ||					\
+	((cp)->dpgno != PGNO_INVALID &&					\
+	B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type)))
+
+/* If the cursor and index combination references a deleted record. */
+#undef	IS_DELETED
+#define	IS_DELETED(cp, indx)						\
+	(((cp)->dpgno == PGNO_INVALID &&				\
+	B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) ||	\
+	((cp)->dpgno != PGNO_INVALID &&					\
+	B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type)))
+
 /*
- * __bam_cursor --
- *	Interface to the cursor functions.
+ * Test to see if two cursors could point to duplicates of the same key,
+ * whether on-page or off-page.  The leaf page numbers must be the same
+ * in both cases.  In the case of off-page duplicates, the key indices
+ * on the leaf page will be the same.  In the case of on-page duplicates,
+ * the duplicate page number must not be set, and the key index offsets
+ * must be the same.  For the last test, as the saved copy of the cursor
+ * will not have a valid page pointer, we use the cursor's.
+ */
+#undef	POSSIBLE_DUPLICATE
+#define	POSSIBLE_DUPLICATE(cursor, saved_copy)				\
+	((cursor)->pgno == (saved_copy).pgno &&				\
+	((cursor)->indx == (saved_copy).indx ||				\
+	((cursor)->dpgno == PGNO_INVALID &&				\
+	    (saved_copy).dpgno == PGNO_INVALID &&			\
+	    (cursor)->page->inp[(cursor)->indx] ==			\
+	    (cursor)->page->inp[(saved_copy).indx])))
+
+/*
+ * __bam_c_reset --
+ *	Initialize internal cursor structure.
+ */
+static void
+__bam_c_reset(cp)
+	CURSOR *cp;
+{
+	cp->sp = cp->csp = cp->stack;
+	cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
+	cp->page = NULL;
+	cp->pgno = PGNO_INVALID;
+	cp->indx = 0;
+	cp->dpgno = PGNO_INVALID;
+	cp->dindx = 0;
+	cp->lock = LOCK_INVALID;
+	cp->mode = DB_LOCK_NG;
+	cp->recno = RECNO_OOB;
+	cp->flags = 0;
+}
+
+/*
+ * __bam_c_init --
+ *	Initialize the access private portion of a cursor
  *
- * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **));
+ * PUBLIC: int __bam_c_init __P((DBC *));
  */
 int
-__bam_cursor(dbp, txn, dbcp)
-	DB *dbp;
-	DB_TXN *txn;
-	DBC **dbcp;
+__bam_c_init(dbc)
+	DBC *dbc;
 {
+	DB *dbp;
 	CURSOR *cp;
-	DBC *dbc;
-
-	DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0);
+	int ret;
 
-	if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL)
-		return (ENOMEM);
-	if ((cp = (CURSOR *)__db_calloc(1, sizeof(CURSOR))) == NULL) {
-		__db_free(dbc);
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
+		return (ret);
 
+	dbp = dbc->dbp;
 	cp->dbc = dbc;
-	cp->pgno = cp->dpgno = PGNO_INVALID;
-	cp->lock = LOCK_INVALID;
-
-	dbc->dbp = dbp;
-	dbc->txn = txn;
-	dbc->internal = cp;
-	dbc->c_close = __bam_c_close;
-	dbc->c_del = __bam_c_del;
-	dbc->c_get = __bam_c_get;
-	dbc->c_put = __bam_c_put;
 
 	/*
-	 * All cursors are queued from the master DB structure.  Add the
-	 * cursor to that queue.
+	 * Logical record numbers are always the same size, and we don't want
+	 * to have to check for space every time we return one.  Allocate it
+	 * in advance.
 	 */
-	CURSOR_SETUP(dbp);
-	TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links);
-	CURSOR_TEARDOWN(dbp);
+	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
+		if ((ret = __os_malloc(sizeof(db_recno_t),
+		    NULL, &dbc->rkey.data)) != 0) {
+			__os_free(cp, sizeof(CURSOR));
+			return (ret);
+		}
+		dbc->rkey.ulen = sizeof(db_recno_t);
+	}
+
+	/* Initialize methods. */
+	dbc->internal = cp;
+	if (dbp->type == DB_BTREE) {
+		dbc->c_am_close = __bam_c_close;
+		dbc->c_am_destroy = __bam_c_destroy;
+		dbc->c_del = __bam_c_del;
+		dbc->c_get = __bam_c_get;
+		dbc->c_put = __bam_c_put;
+	} else {
+		dbc->c_am_close = __bam_c_close;
+		dbc->c_am_destroy = __bam_c_destroy;
+		dbc->c_del = __ram_c_del;
+		dbc->c_get = __ram_c_get;
+		dbc->c_put = __ram_c_put;
+	}
+
+	/* Initialize dynamic information. */
+	__bam_c_reset(cp);
 
-	*dbcp = dbc;
 	return (0);
 }
 
 /*
  * __bam_c_close --
- *	Close a single cursor.
+ *	Close down the cursor from a single use.
  */
 static int
 __bam_c_close(dbc)
 	DBC *dbc;
 {
+	CURSOR *cp;
 	DB *dbp;
 	int ret;
 
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0);
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+	ret = 0;
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
+	/*
+	 * If a cursor deleted a btree key, perform the actual deletion.
+	 * (Recno keys are either deleted immediately or never deleted.)
+	 */
+	if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
+		ret = __bam_c_physdel(dbc, cp, NULL);
 
-	ret = __bam_c_iclose(dbp, dbc);
+	/* Discard any locks not acquired inside of a transaction. */
+	if (cp->lock != LOCK_INVALID) {
+		(void)__BT_TLPUT(dbc, cp->lock);
+		cp->lock = LOCK_INVALID;
+	}
+
+	/* Sanity checks. */
+#ifdef DIAGNOSTIC
+	if (cp->csp != cp->stack)
+		__db_err(dbp->dbenv, "btree cursor close: stack not empty");
+#endif
+
+	/* Initialize dynamic information. */
+	__bam_c_reset(cp);
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
 /*
- * __bam_c_iclose --
+ * __bam_c_destroy --
  *	Close a single cursor -- internal version.
- *
- * PUBLIC: int __bam_c_iclose __P((DB *, DBC *));
  */
-int
-__bam_c_iclose(dbp, dbc)
-	DB *dbp;
+static int
+__bam_c_destroy(dbc)
 	DBC *dbc;
 {
-	CURSOR *cp;
-	int ret;
-
-	/* If a cursor key was deleted, perform the actual deletion.  */
-	cp = dbc->internal;
-	ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0;
-
-	/* Discard any lock if we're not inside a transaction. */
-	if (cp->lock != LOCK_INVALID)
-		(void)__BT_TLPUT(dbp, cp->lock);
-
-	/* Remove the cursor from the queue. */
-	CURSOR_SETUP(dbp);
-	TAILQ_REMOVE(&dbp->curs_queue, dbc, links);
-	CURSOR_TEARDOWN(dbp);
-
 	/* Discard the structures. */
-	FREE(dbc->internal, sizeof(CURSOR));
-	FREE(dbc, sizeof(DBC));
+	__os_free(dbc->internal, sizeof(CURSOR));
 
-	return (ret);
+	return (0);
 }
 
 /*
@@ -161,7 +232,6 @@ __bam_c_del(dbc, flags)
 	DBC *dbc;
 	u_int32_t flags;
 {
-	BTREE *t;
 	CURSOR *cp;
 	DB *dbp;
 	DB_LOCK lock;
@@ -170,23 +240,31 @@ __bam_c_del(dbc, flags)
 	db_indx_t indx;
 	int ret;
 
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags);
-
+	dbp = dbc->dbp;
 	cp = dbc->internal;
 	h = NULL;
 
+	DB_PANIC_CHECK(dbp);
+
 	/* Check for invalid flags. */
-	if ((ret = __db_cdelchk(dbc->dbp, flags,
-	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
+	if ((ret = __db_cdelchk(dbp, flags,
+	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
 		return (ret);
 
+	/*
+	 * If we are running CDB, this had better be either a write
+	 * cursor or an immediate writer.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB))
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
+
 	/* If already deleted, return failure. */
-	if (F_ISSET(cp, C_DELETED | C_REPLACE))
+	if (F_ISSET(cp, C_DELETED))
 		return (DB_KEYEMPTY);
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
-	t = dbp->internal;
-
 	/*
 	 * We don't physically delete the record until the cursor moves,
 	 * so we have to have a long-lived write lock on the page instead
@@ -194,10 +272,10 @@ __bam_c_del(dbc, flags)
 	 * to even get here, so we simply discard it.
 	 */
 	if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
-		if ((ret = __bam_lget(dbp,
+		if ((ret = __bam_lget(dbc,
 		    0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
 			goto err;
-		(void)__BT_TLPUT(dbp, cp->lock);
+		(void)__BT_TLPUT(dbc, cp->lock);
 		cp->lock = lock;
 		cp->mode = DB_LOCK_WRITE;
 	}
@@ -216,85 +294,50 @@ __bam_c_del(dbc, flags)
 		indx = cp->dindx;
 	}
 
-	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 		goto err;
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp) &&
-	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h),
+	if (DB_LOGGING(dbc) &&
+	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
 	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
 		(void)memp_fput(dbp->mpf, h, 0);
 		goto err;
 	}
 
-	/* Set the intent-to-delete flag on the page and in all cursors. */
+	/*
+	 * Set the intent-to-delete flag on the page and update all cursors. */
 	if (cp->dpgno == PGNO_INVALID)
 		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
 	else
 		B_DSET(GET_BKEYDATA(h, indx)->type);
-	(void)__bam_ca_delete(dbp, pgno, indx, NULL, 0);
+	(void)__bam_ca_delete(dbp, pgno, indx, 1);
 
 	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
 	h = NULL;
 
 	/*
-	 * If it's a btree with record numbers, we have to adjust the
-	 * counts.
+	 * If the tree has record numbers, we have to adjust the counts.
+	 *
+	 * !!!
+	 * This test is right -- we don't yet support duplicates and record
+	 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
+	 * set.
 	 */
-	if (F_ISSET(dbp, DB_BT_RECNUM) &&
-	    (ret = __bam_c_getstack(dbp, cp)) == 0) {
-		ret = __bam_adjust(dbp, t, -1);
-		(void)__bam_stkrel(dbp);
+	if (F_ISSET(dbp, DB_BT_RECNUM)) {
+		if ((ret = __bam_c_getstack(dbc, cp)) != 0)
+			goto err;
+		if ((ret = __bam_adjust(dbc, -1)) != 0)
+			goto err;
+		(void)__bam_stkrel(dbc, 0);
 	}
 
 err:	if (h != NULL)
 		(void)memp_fput(dbp->mpf, h, 0);
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
 /*
- * __bam_get --
- *	Retrieve a key/data pair from the tree.
- *
- * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
- */
-int
-__bam_get(argdbp, txn, key, data, flags)
-	DB *argdbp;
-	DB_TXN *txn;
-	DBT *key, *data;
-	u_int32_t flags;
-{
-	DBC dbc;
-	CURSOR cp;
-	int ret;
-
-	DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags);
-
-	/* Check for invalid flags. */
-	if ((ret = __db_getchk(argdbp, key, data, flags)) != 0)
-		return (ret);
-
-	/* Build an internal cursor. */
-	memset(&cp, 0, sizeof(cp));
-	cp.dbc = &dbc;
-	cp.pgno = cp.dpgno = PGNO_INVALID;
-	cp.lock = LOCK_INVALID;
-	cp.flags = C_INTERNAL;
-
-	/* Build an external cursor. */
-	memset(&dbc, 0, sizeof(dbc));
-	dbc.dbp = argdbp;
-	dbc.txn = txn;
-	dbc.internal = &cp;
-
-	/* Get the key. */
-	return(__bam_c_get(&dbc,
-	    key, data, LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET));
-}
-
-/*
  * __bam_c_get --
  *	Get using a cursor (btree).
  */
@@ -304,91 +347,197 @@ __bam_c_get(dbc, key, data, flags)
 	DBT *key, *data;
 	u_int32_t flags;
 {
-	BTREE *t;
-	CURSOR *cp, copy;
+	CURSOR *cp, copy, start;
 	DB *dbp;
 	PAGE *h;
-	int exact, ret;
-
-	DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get",
-	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+	int exact, ret, tmp_rmw;
 
+	dbp = dbc->dbp;
 	cp = dbc->internal;
 
+	DB_PANIC_CHECK(dbp);
+
 	/* Check for invalid flags. */
-	if ((ret = __db_cgetchk(dbc->dbp,
+	if ((ret = __db_cgetchk(dbp,
 	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
 		return (ret);
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
-	t = dbp->internal;
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	tmp_rmw = 0;
+	if (LF_ISSET(DB_RMW)) {
+		if (!F_ISSET(dbp, DB_AM_CDB)) {
+			tmp_rmw = 1;
+			F_SET(dbc, DBC_RMW);
+		}
+		LF_CLR(DB_RMW);
+	}
+
+	DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
 
 	/*
-	 * Break out the code to return a cursor's record number.  It
-	 * has nothing to do with the cursor get code except that it's
-	 * been rammed into the interface.
+	 * Return a cursor's record number.  It has nothing to do with the
+	 * cursor get code except that it's been rammed into the interface.
 	 */
-	if (LF_ISSET(DB_GET_RECNO)) {
-		ret = __bam_c_rget(dbp, cp, data, flags);
-		PUTHANDLE(dbp);
+	if (flags == DB_GET_RECNO) {
+		ret = __bam_c_rget(dbc, data, flags);
+		if (tmp_rmw)
+			F_CLR(dbc, DBC_RMW);
 		return (ret);
 	}
 
-	/* Initialize the cursor for a new retrieval. */
-	copy = *cp;
+	/*
+	 * Initialize the cursor for a new retrieval.  Clear the cursor's
+	 * page pointer, it was set before this operation, and no longer
+	 * has any meaning.
+	 */
 	cp->page = NULL;
+	copy = *cp;
 	cp->lock = LOCK_INVALID;
 
 	switch (flags) {
 	case DB_CURRENT:
 		/* It's not possible to return a deleted record. */
-		if (F_ISSET(cp, C_DELETED | C_REPLACE)) {
-			PUTHANDLE(dbp);
-			return (DB_KEYEMPTY);
+		if (F_ISSET(cp, C_DELETED)) {
+			ret = DB_KEYEMPTY;
+			goto err;
 		}
 
-		/* Get the page with the current item on it. */
-		if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0)
+		/* Acquire the current page. */
+		if ((ret = __bam_lget(dbc,
+		    0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
+			ret = memp_fget(dbp->mpf,
+			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
+			    0, &cp->page);
+		if (ret != 0)
 			goto err;
 		break;
+	case DB_NEXT_DUP:
+		if (cp->pgno == PGNO_INVALID) {
+			ret = EINVAL;
+			goto err;
+		}
+		if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
+			goto err;
+
+		/* Make sure we didn't go past the end of the duplicates. */
+		if (!POSSIBLE_DUPLICATE(cp, copy)) {
+			ret = DB_NOTFOUND;
+			goto err;
+		}
+		break;
 	case DB_NEXT:
 		if (cp->pgno != PGNO_INVALID) {
-			if ((ret = __bam_c_next(dbp, cp, 1)) != 0)
+			if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
 				goto err;
 			break;
 		}
 		/* FALLTHROUGH */
 	case DB_FIRST:
-		if ((ret = __bam_c_first(dbp, cp)) != 0)
+		if ((ret = __bam_c_first(dbc, cp)) != 0)
 			goto err;
 		break;
 	case DB_PREV:
 		if (cp->pgno != PGNO_INVALID) {
-			if ((ret = __bam_c_prev(dbp, cp)) != 0)
+			if ((ret = __bam_c_prev(dbc, cp)) != 0)
 				goto err;
 			break;
 		}
 		/* FALLTHROUGH */
 	case DB_LAST:
-		if ((ret = __bam_c_last(dbp, cp)) != 0)
+		if ((ret = __bam_c_last(dbc, cp)) != 0)
 			goto err;
 		break;
+	case DB_SET:
+		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
+			goto err;
+
+		/*
+		 * We cannot currently be referencing a deleted record, but we
+		 * may be referencing off-page duplicates.
+		 *
+		 * If we're referencing off-page duplicates, move off-page.
+		 * If we moved off-page, move to the next non-deleted record.  
+		 * If we moved to the next non-deleted record, check to make
+		 * sure we didn't switch records because our current record
+		 * had no non-deleted data items.
+		 */
+		start = *cp;
+		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
+			goto err;
+		if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
+			if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
+				goto err;
+			if (!POSSIBLE_DUPLICATE(cp, start)) {
+				ret = DB_NOTFOUND;
+				goto err;
+			}
+		}
+		break;
 	case DB_SET_RECNO:
-		exact = 1;
-		if ((ret =
-		    __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0)
+		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
 			goto err;
 		break;
-	case DB_SET:
-		exact = 1;
-		if ((ret =
-		    __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0)
+	case DB_GET_BOTH:
+		if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
+			/* Acquire the current page. */
+			if ((ret = memp_fget(dbp->mpf,
+			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
+			    0, &cp->page)) != 0)
+				goto err;
+
+			/* If DBC_CONTINUE, move to the next item. */
+			if (F_ISSET(dbc, DBC_CONTINUE) &&
+			    (ret = __bam_c_next(dbc, cp, 1)) != 0)
+				goto err;
+		} else {
+			if ((ret =
+			    __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
+				goto err;
+
+			/*
+			 * We may be referencing a duplicates page.  Move to
+			 * the first duplicate.
+			 */
+			if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
+				goto err;
+		}
+
+		/* Search for a matching entry. */
+		if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
 			goto err;
+
+		/* Ignore deleted entries. */
+		if (IS_CUR_DELETED(cp)) {
+			ret = DB_NOTFOUND;
+			goto err;
+		}
 		break;
 	case DB_SET_RANGE:
-		exact = 0;
-		if ((ret =
-		    __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0)
+		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
+			goto err;
+
+		/*
+		 * As we didn't require an exact match, the search function
+		 * may have returned an entry past the end of the page.  If
+		 * so, move to the next entry.
+		 */
+		if (cp->indx == NUM_ENT(cp->page) &&
+		    (ret = __bam_c_next(dbc, cp, 0)) != 0)
+			goto err;
+
+		/*
+		 * We may be referencing off-page duplicates, if so, move
+		 * off-page.
+		 */
+		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
+			goto err;
+
+		/*
+		 * We may be referencing a deleted record, if so, move to
+		 * the next non-deleted record.
+		 */
+		if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
 			goto err;
 		break;
 	}
@@ -401,12 +550,12 @@ __bam_c_get(dbc, key, data, flags)
 	 */
 	if (flags != DB_SET) {
 		if (cp->dpgno != PGNO_INVALID) {
-			if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0)
+			if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
 				goto err;
 		} else
 			h = cp->page;
 		ret = __db_ret(dbp,
-		    h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen);
+		    h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
 		if (cp->dpgno != PGNO_INVALID)
 			(void)memp_fput(dbp->mpf, h, 0);
 		if (ret)
@@ -416,62 +565,163 @@ __bam_c_get(dbc, key, data, flags)
 	/* Return the data. */
 	if ((ret = __db_ret(dbp, cp->page,
 	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
-	    data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0)
+	    data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
 		goto err;
 
 	/*
-	 * If the previous cursor record has been deleted, delete it.  The
-	 * returned key isn't a deleted key, so clear the flag.
+	 * If the previous cursor record has been deleted, physically delete
+	 * the entry from the page.  We clear the deleted flag before we call
+	 * the underlying delete routine so that, if an error occurs, and we
+	 * restore the cursor, the deleted flag is cleared.  This is because,
+	 * if we manage to physically modify the page, and then restore the
+	 * cursor, we might try to repeat the page modification when closing
+	 * the cursor.
 	 */
-	if (F_ISSET(&copy, C_DELETED) && __bam_c_physdel(dbp, &copy, cp->page))
-		goto err;
-	F_CLR(cp, C_DELETED | C_REPLACE);
+	if (F_ISSET(&copy, C_DELETED)) {
+		F_CLR(&copy, C_DELETED);
+		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
+			goto err;
+	}
+	F_CLR(cp, C_DELETED);
 
-	/* Release the previous lock, if any. */
+	/* Release the previous lock, if any; the current lock is retained. */
 	if (copy.lock != LOCK_INVALID)
-		(void)__BT_TLPUT(dbp, copy.lock);
-
-	/* Release the pinned page. */
-	ret = memp_fput(dbp->mpf, cp->page, 0);
+		(void)__BT_TLPUT(dbc, copy.lock);
 
-	/* Internal cursors don't hold locks. */
-	if (F_ISSET(cp, C_INTERNAL) && cp->lock != LOCK_INVALID)
-		(void)__BT_TLPUT(dbp, cp->lock);
-
-	++t->lstat.bt_get;
+	/* Release the current page. */
+	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
+		goto err;
 
 	if (0) {
 err:		if (cp->page != NULL)
 			(void)memp_fput(dbp->mpf, cp->page, 0);
 		if (cp->lock != LOCK_INVALID)
-			(void)__BT_TLPUT(dbp, cp->lock);
+			(void)__BT_TLPUT(dbc, cp->lock);
 		*cp = copy;
 	}
 
-	PUTHANDLE(dbp);
+	/* Release temporary lock upgrade. */
+	if (tmp_rmw)
+		F_CLR(dbc, DBC_RMW);
+
 	return (ret);
 }
 
 /*
+ * __bam_dsearch --
+ *	Search for a matching data item (or the first data item that's
+ *	equal to or greater than the one we're searching for).
+ */
+static int
+__bam_dsearch(dbc, cp, data, iflagp)
+	DBC *dbc;
+	CURSOR *cp;
+	DBT *data;
+	u_int32_t *iflagp;
+{
+	DB *dbp;
+	CURSOR copy, last;
+	int cmp, ret;
+
+	dbp = dbc->dbp;
+
+	/*
+	 * If iflagp is non-NULL, we're doing an insert.
+	 *
+	 * If the duplicates are off-page, use the duplicate search routine.
+	 */
+	if (cp->dpgno != PGNO_INVALID) {
+		if ((ret = __db_dsearch(dbc, iflagp != NULL,
+		    data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
+			return (ret);
+		cp->dpgno = cp->page->pgno;
+
+		if (iflagp == NULL) {
+			if (cmp != 0)
+				return (DB_NOTFOUND);
+			return (0);
+		}
+		*iflagp = DB_BEFORE;
+		return (0);
+	}
+
+	/* Otherwise, do the search ourselves. */
+	copy = *cp;
+	for (;;) {
+		/* Save the last interesting cursor position. */
+		last = *cp;
+
+		/* See if the data item matches the one we're looking for. */
+		if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
+		    dbp->dup_compare == NULL ?
+		    __bam_defcmp : dbp->dup_compare)) == 0) {
+			if (iflagp != NULL)
+				*iflagp = DB_AFTER;
+			return (0);
+		}
+
+		/*
+		 * If duplicate entries are sorted, we're done if we find a
+		 * page entry that sorts greater than the application item.
+		 * If doing an insert, return success, otherwise DB_NOTFOUND.
+		 */
+		if (dbp->dup_compare != NULL && cmp < 0) {
+			if (iflagp == NULL)
+				return (DB_NOTFOUND);
+			*iflagp = DB_BEFORE;
+			return (0);
+		}
+
+		/*
+		 * Move to the next item.  If we reach the end of the page and
+		 * we're doing an insert, set the cursor to the last item and
+		 * set the referenced memory location so callers know to insert
+		 * after the item, instead of before it.  If not inserting, we
+		 * return DB_NOTFOUND.
+		 */
+		if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
+			if (iflagp == NULL)
+				return (DB_NOTFOUND);
+			goto use_last;
+		}
+
+		/*
+		 * Make sure we didn't go past the end of the duplicates.  The
+		 * error conditions are the same as above.
+		 */
+		if (!POSSIBLE_DUPLICATE(cp, copy)) {
+			if (iflagp == NULL)
+				 return (DB_NOTFOUND);
+use_last:		*cp = last;
+			*iflagp = DB_AFTER;
+			return (0);
+		}
+	}
+	/* NOTREACHED */
+}
+
+/*
  * __bam_c_rget --
  *	Return the record number for a cursor.
  */
 static int
-__bam_c_rget(dbp, cp, data, flags)
-	DB *dbp;
-	CURSOR *cp;
+__bam_c_rget(dbc, data, flags)
+	DBC *dbc;
 	DBT *data;
 	u_int32_t flags;
 {
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	DBT dbt;
 	db_recno_t recno;
 	int exact, ret;
 
 	COMPQUIET(flags, 0);
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 
 	/* Get the page with the current item on it. */
-	if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
 		return (ret);
 
 	/* Get a copy of the key. */
@@ -481,18 +731,19 @@ __bam_c_rget(dbp, cp, data, flags)
 		goto err;
 
 	exact = 1;
-	if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0)
+	if ((ret = __bam_search(dbc, &dbt,
+	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
+	    1, &recno, &exact)) != 0)
 		goto err;
 
-	t = dbp->internal;
 	ret = __db_retcopy(data, &recno, sizeof(recno),
-	    &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc);
+	    &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
 
 	/* Release the stack. */
-	__bam_stkrel(dbp);
+	__bam_stkrel(dbc, 0);
 
 err:	(void)memp_fput(dbp->mpf, cp->page, 0);
-	__db_free(dbt.data);
+	__os_free(dbt.data, dbt.size);
 	return (ret);
 }
 
@@ -506,62 +757,97 @@ __bam_c_put(dbc, key, data, flags)
 	DBT *key, *data;
 	u_int32_t flags;
 {
-	BTREE *t;
 	CURSOR *cp, copy;
 	DB *dbp;
 	DBT dbt;
 	db_indx_t indx;
 	db_pgno_t pgno;
-	u_int32_t iiflags;
+	u_int32_t iiflags, iiop;
 	int exact, needkey, ret, stack;
 	void *arg;
 
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put",
-	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
-	    data, flags);
-
+	dbp = dbc->dbp;
 	cp = dbc->internal;
 
-	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
-	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
-		return (ret);
+	DB_PANIC_CHECK(dbp);
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
-	t = dbp->internal;
+	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
+	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
+	    data, flags);
 
-	/* Initialize the cursor for a new retrieval. */
-	copy = *cp;
-	cp->page = NULL;
-	cp->lock = LOCK_INVALID;
+	if ((ret = __db_cputchk(dbp, key, data, flags,
+	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
+		return (ret);
 
 	/*
-	 * To split, we need a valid key for the page.  Since it's a cursor,
-	 * we have to build one.
+	 * If we are running CDB, this had better be either a write
+	 * cursor or an immediate writer.  If it's a regular writer,
+	 * that means we have an IWRITE lock and we need to upgrade
+	 * it to a write lock.
 	 */
-	stack = 0;
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+		if (F_ISSET(dbc, DBC_RMW) &&
+		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
+		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+		    &dbc->mylock)) != 0)
+			return (EAGAIN);
+	}
+
 	if (0) {
-split:		/* Acquire a copy of a key from the page. */
+split:		/*
+		 * To split, we need a valid key for the page.  Since it's a
+		 * cursor, we have to build one.
+		 *
+		 * Acquire a copy of a key from the page.
+		 */
 		if (needkey) {
 			memset(&dbt, 0, sizeof(DBT));
 			if ((ret = __db_ret(dbp, cp->page, indx,
-			    &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen)) != 0)
+			    &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
 				goto err;
 			arg = &dbt;
 		} else
 			arg = key;
 
-		/* Discard any pinned pages. */
+		/*
+		 * Discard any locks and pinned pages (the locks are discarded
+		 * even if we're running with transactions, as they lock pages
+		 * that we're sorry we ever acquired).  If stack is set and the
+		 * cursor entries are valid, they point to the same entries as
+		 * the stack, don't free them twice.
+		 */
 		if (stack) {
-			(void)__bam_stkrel(dbp);
+			(void)__bam_stkrel(dbc, 1);
 			stack = 0;
 		} else
-			DISCARD(dbp, cp);
+			DISCARD(dbc, cp);
 
-		if ((ret = __bam_split(dbp, arg)) != 0)
+		/*
+		 * Restore the cursor to its original value.  This is necessary
+		 * for two reasons.  First, we are about to copy it in case of
+		 * error, again.  Second, we adjust cursors during the split,
+		 * and we have to ensure this cursor is adjusted appropriately,
+		 * along with all the other cursors.
+		 */
+		*cp = copy;
+
+		if ((ret = __bam_split(dbc, arg)) != 0)
 			goto err;
 	}
 
-	ret = 0;
+	/*
+	 * Initialize the cursor for a new retrieval.  Clear the cursor's
+	 * page pointer, it was set before this operation, and no longer
+	 * has any meaning.
+	 */
+	cp->page = NULL;
+	copy = *cp;
+	cp->lock = LOCK_INVALID;
+
+	iiflags = needkey = ret = stack = 0;
 	switch (flags) {
 	case DB_AFTER:
 	case DB_BEFORE:
@@ -574,64 +860,148 @@ split:		/* Acquire a copy of a key from the page. */
 			pgno = cp->dpgno;
 			indx = cp->dindx;
 		}
+
 		/*
-		 * XXX
-		 * This test is right -- we don't currently support duplicates
-		 * in the presence of record numbers, so we don't worry about
-		 * them if DB_BT_RECNUM is set.
+		 * !!!
+		 * This test is right -- we don't yet support duplicates and
+		 * record numbers in the same tree, so ignore duplicates if
+		 * DB_BT_RECNUM set.
 		 */
 		if (F_ISSET(dbp, DB_BT_RECNUM) &&
 		    (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
 			/* Acquire a complete stack. */
-			if ((ret = __bam_c_getstack(dbp, cp)) != 0)
+			if ((ret = __bam_c_getstack(dbc, cp)) != 0)
 				goto err;
-			cp->page = t->bt_csp->page;
+			cp->page = cp->csp->page;
 
 			stack = 1;
 			iiflags = BI_DOINCR;
 		} else {
 			/* Acquire the current page. */
-			if ((ret = __bam_lget(dbp,
+			if ((ret = __bam_lget(dbc,
 			    0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
-				ret = __bam_pget(dbp, &cp->page, &pgno, 0);
+				ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
 			if (ret != 0)
 				goto err;
 
 			iiflags = 0;
 		}
-		if ((ret = __bam_iitem(dbp, &cp->page,
-		    &indx, key, data, flags, iiflags)) == DB_NEEDSPLIT)
-			goto split;
-		break;
-	case DB_KEYFIRST:
-		exact = needkey = 0;
-		if ((ret =
-		    __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0)
-			goto err;
-		stack = 1;
 
-		indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx;
-		if ((ret = __bam_iitem(dbp, &cp->page, &indx, key,
-		    data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT)
-			goto split;
+		/*
+		 * If the user has specified a duplicate comparison function,
+		 * we return an error if DB_CURRENT was specified and the
+		 * replacement data doesn't compare equal to the current data.
+		 * This stops apps from screwing up the duplicate sort order.
+		 */
+		if (flags == DB_CURRENT && dbp->dup_compare != NULL)
+			if (__bam_cmp(dbp, data,
+			    cp->page, indx, dbp->dup_compare) != 0) {
+				ret = EINVAL;
+				goto err;
+			}
+
+		iiop = flags;
 		break;
+	case DB_KEYFIRST:
 	case DB_KEYLAST:
-		exact = needkey = 0;
-		if ((ret =
-		    __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0)
+		/*
+		 * If we have a duplicate comparison function, we position to
+		 * the first of any on-page duplicates, and use __bam_dsearch
+		 * to search for the right slot.  Otherwise, we position to
+		 * the first/last of any on-page duplicates based on the flag
+		 * value.
+		 */
+		if ((ret = __bam_c_search(dbc, cp, key,
+		    flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+		    DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
 			goto err;
 		stack = 1;
 
-		indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx;
-		if ((ret = __bam_iitem(dbp, &cp->page, &indx, key,
-		    data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT)
-			goto split;
+		/*
+		 * If an exact match:
+		 *	If duplicates aren't supported, replace the current
+		 *	item.  (When implementing the DB->put function, our
+		 *	caller has already checked the DB_NOOVERWRITE flag.)
+		 *
+		 *	If there's a duplicate comparison function, find the
+		 *	correct slot for this duplicate item.
+		 *
+		 *	If there's no duplicate comparison function, set the
+		 *	insert flag based on the argument flags.
+		 *
+		 * If there's no match, the search function returned the
+		 * smallest slot greater than the key, use it.
+		 */
+		if (exact) {
+			if (F_ISSET(dbp, DB_AM_DUP)) {
+				/*
+				 * If at off-page duplicate page, move to the
+				 * first or last entry -- if a comparison
+				 * function was specified, start searching at
+				 * the first entry.  Otherwise, move based on
+				 * the DB_KEYFIRST/DB_KEYLAST flags.
+				 */
+				if ((ret = __bam_dup(dbc, cp, cp->indx,
+				    dbp->dup_compare == NULL &&
+				    flags != DB_KEYFIRST)) != 0)
+					goto err;
+
+				/*
+				 * If there's a comparison function, search for
+				 * the correct slot.  Otherwise, set the insert
+				 * flag based on the argument flag.
+				 */
+				if (dbp->dup_compare == NULL)
+					iiop = flags == DB_KEYFIRST ?
+					    DB_BEFORE : DB_AFTER;
+				else
+					if ((ret = __bam_dsearch(dbc,
+					    cp, data, &iiop)) != 0)
+						goto err;
+			} else
+				iiop = DB_CURRENT;
+			iiflags = 0;
+		} else {
+			iiop = DB_BEFORE;
+			iiflags = BI_NEWKEY;
+		}
+
+		if (cp->dpgno == PGNO_INVALID) {
+			pgno = cp->pgno;
+			indx = cp->indx;
+		} else {
+			pgno = cp->dpgno;
+			indx = cp->dindx;
+		}
 		break;
 	}
-	if (ret)
+
+	ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
+
+	if (ret == DB_NEEDSPLIT)
+		goto split;
+	if (ret != 0)
 		goto err;
 
 	/*
+	 * Reset any cursors referencing this item that might have the item
+	 * marked for deletion.
+	 */
+	if (iiop == DB_CURRENT) {
+		(void)__bam_ca_delete(dbp, pgno, indx, 0);
+
+		/*
+		 * It's also possible that we are the cursor that had the
+		 * item marked for deletion, in which case we want to make
+		 * sure that we don't delete it because we had the delete
+		 * flag set already.
+		 */
+		if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
+		    cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
+			F_CLR(&copy, C_DELETED);
+	}
+
+	/*
 	 * Update the cursor to point to the new entry.  The new entry was
 	 * stored on the current page, because we split pages until it was
 	 * possible.
@@ -642,17 +1012,24 @@ split:		/* Acquire a copy of a key from the page. */
 		cp->dindx = indx;
 
 	/*
-	 * If the previous cursor record has been deleted, delete it.  The
-	 * returned key isn't a deleted key, so clear the flag.
+	 * If the previous cursor record has been deleted, physically delete
+	 * the entry from the page.  We clear the deleted flag before we call
+	 * the underlying delete routine so that, if an error occurs, and we
+	 * restore the cursor, the deleted flag is cleared.  This is because,
+	 * if we manage to physically modify the page, and then restore the
+	 * cursor, we might try to repeat the page modification when closing
+	 * the cursor.
 	 */
-	if (F_ISSET(&copy, C_DELETED) &&
-	    (ret = __bam_c_physdel(dbp, &copy, cp->page)) != 0)
-		goto err;
-	F_CLR(cp, C_DELETED | C_REPLACE);
+	if (F_ISSET(&copy, C_DELETED)) {
+		F_CLR(&copy, C_DELETED);
+		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
+			goto err;
+	}
+	F_CLR(cp, C_DELETED);
 
-	/* Release the previous lock, if any. */
+	/* Release the previous lock, if any; the current lock is retained. */
 	if (copy.lock != LOCK_INVALID)
-		(void)__BT_TLPUT(dbp, copy.lock);
+		(void)__BT_TLPUT(dbc, copy.lock);
 
 	/*
 	 * Discard any pages pinned in the tree and their locks, except for
@@ -662,23 +1039,26 @@ split:		/* Acquire a copy of a key from the page. */
 	 * we have to adjust the stack as necessary.  If there was only a
 	 * single page on the stack, we don't have to free further stack pages.
 	 */
+	if (stack && BT_STK_POP(cp) != NULL)
+		(void)__bam_stkrel(dbc, 0);
 
-	if (stack && BT_STK_POP(t) != NULL)
-		(void)__bam_stkrel(dbp);
-
+	/* Release the current page. */
 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
 		goto err;
 
 	if (0) {
 err:		/* Discard any pinned pages. */
 		if (stack)
-			(void)__bam_stkrel(dbp);
+			(void)__bam_stkrel(dbc, 0);
 		else
-			DISCARD(dbp, cp);
+			DISCARD(dbc, cp);
 		*cp = copy;
 	}
 
-	PUTHANDLE(dbp);
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
+
 	return (ret);
 }
 
@@ -687,19 +1067,22 @@ err:		/* Discard any pinned pages. */
  *	Return the first record.
  */
 static int
-__bam_c_first(dbp, cp)
-	DB *dbp;
+__bam_c_first(dbc, cp)
+	DBC *dbc;
 	CURSOR *cp;
 {
+	DB *dbp;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/* Walk down the left-hand side of the tree. */
 	for (pgno = PGNO_ROOT;;) {
 		if ((ret =
-		    __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 			return (ret);
 
 		/* If we find a leaf page, we're done. */
@@ -707,28 +1090,22 @@ __bam_c_first(dbp, cp)
 			break;
 
 		pgno = GET_BINTERNAL(cp->page, 0)->pgno;
-		DISCARD(dbp, cp);
+		DISCARD(dbc, cp);
 	}
 
 	cp->pgno = cp->page->pgno;
 	cp->indx = 0;
 	cp->dpgno = PGNO_INVALID;
 
-	/* If it's an empty page or a deleted record, go to the next one. */
-	if (NUM_ENT(cp->page) == 0 ||
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type))
-		if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
-			return (ret);
-
-	/* If it's a duplicate reference, go to the first entry. */
-	if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0)
+	/* Check for duplicates. */
+	if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
 		return (ret);
 
-	/* If it's a deleted record, go to the next one. */
-	if (cp->dpgno != PGNO_INVALID &&
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type))
-		if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
+	/* If on an empty page or a deleted record, move to the next one. */
+	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
+		if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
 			return (ret);
+
 	return (0);
 }
 
@@ -737,19 +1114,22 @@ __bam_c_first(dbp, cp)
  *	Return the last record.
  */
 static int
-__bam_c_last(dbp, cp)
-	DB *dbp;
+__bam_c_last(dbc, cp)
+	DBC *dbc;
 	CURSOR *cp;
 {
+	DB *dbp;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/* Walk down the right-hand side of the tree. */
 	for (pgno = PGNO_ROOT;;) {
 		if ((ret =
-		    __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 			return (ret);
 
 		/* If we find a leaf page, we're done. */
@@ -758,28 +1138,22 @@ __bam_c_last(dbp, cp)
 
 		pgno =
 		    GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
-		DISCARD(dbp, cp);
+		DISCARD(dbc, cp);
 	}
 
 	cp->pgno = cp->page->pgno;
 	cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
 	cp->dpgno = PGNO_INVALID;
 
-	/* If it's an empty page or a deleted record, go to the previous one. */
-	if (NUM_ENT(cp->page) == 0 ||
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type))
-		if ((ret = __bam_c_prev(dbp, cp)) != 0)
-			return (ret);
-
-	/* If it's a duplicate reference, go to the last entry. */
-	if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0)
+	/* Check for duplicates. */
+	if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
 		return (ret);
 
-	/* If it's a deleted record, go to the previous one. */
-	if (cp->dpgno != PGNO_INVALID &&
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type))
-		if ((ret = __bam_c_prev(dbp, cp)) != 0)
+	/* If on an empty page or a deleted record, move to the previous one. */
+	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
+		if ((ret = __bam_c_prev(dbc, cp)) != 0)
 			return (ret);
+
 	return (0);
 }
 
@@ -788,15 +1162,18 @@ __bam_c_last(dbp, cp)
  *	Move to the next record.
  */
 static int
-__bam_c_next(dbp, cp, initial_move)
-	DB *dbp;
+__bam_c_next(dbc, cp, initial_move)
+	DBC *dbc;
 	CURSOR *cp;
 	int initial_move;
 {
+	DB *dbp;
 	db_indx_t adjust, indx;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * We're either moving through a page of duplicates or a btree leaf
 	 * page.
@@ -812,9 +1189,9 @@ __bam_c_next(dbp, cp, initial_move)
 	}
 	if (cp->page == NULL) {
 		if ((ret =
-		    __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 			return (ret);
 	}
 
@@ -832,15 +1209,13 @@ __bam_c_next(dbp, cp, initial_move)
 		indx += adjust;
 	for (;;) {
 		if (indx >= NUM_ENT(cp->page)) {
-			pgno = cp->page->next_pgno;
-			DISCARD(dbp, cp);
-
 			/*
 			 * If we're in a btree leaf page, we've reached the end
 			 * of the tree.  If we've reached the end of a page of
 			 * duplicates, continue from the btree leaf page where
 			 * we found this page of duplicates.
 			 */
+			pgno = cp->page->next_pgno;
 			if (pgno == PGNO_INVALID) {
 				/* If in a btree leaf page, it's EOF. */
 				if (cp->dpgno == PGNO_INVALID)
@@ -855,20 +1230,18 @@ __bam_c_next(dbp, cp, initial_move)
 			} else
 				indx = 0;
 
-			if ((ret = __bam_lget(dbp,
+			DISCARD(dbc, cp);
+			if ((ret = __bam_lget(dbc,
 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 				return (ret);
-			if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+			if ((ret =
+			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 				return (ret);
 			continue;
 		}
 
 		/* Ignore deleted records. */
-		if (dbp->type == DB_BTREE &&
-		    ((cp->dpgno == PGNO_INVALID &&
-		    B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) ||
-		    (cp->dpgno != PGNO_INVALID &&
-		    B_DISSET(GET_BKEYDATA(cp->page, indx)->type)))) {
+		if (IS_DELETED(cp, indx)) {
 			indx += adjust;
 			continue;
 		}
@@ -882,8 +1255,7 @@ __bam_c_next(dbp, cp, initial_move)
 			cp->pgno = cp->page->pgno;
 			cp->indx = indx;
 
-			if ((ret =
-			    __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0)
+			if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
 				return (ret);
 			if (cp->dpgno != PGNO_INVALID) {
 				indx = cp->dindx;
@@ -904,14 +1276,17 @@ __bam_c_next(dbp, cp, initial_move)
  *	Move to the previous record.
  */
 static int
-__bam_c_prev(dbp, cp)
-	DB *dbp;
+__bam_c_prev(dbc, cp)
+	DBC *dbc;
 	CURSOR *cp;
 {
+	DB *dbp;
 	db_indx_t indx, adjust;
 	db_pgno_t pgno;
 	int ret, set_indx;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * We're either moving through a page of duplicates or a btree leaf
 	 * page.
@@ -927,9 +1302,9 @@ __bam_c_prev(dbp, cp)
 	}
 	if (cp->page == NULL) {
 		if ((ret =
-		    __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 			return (ret);
 	}
 
@@ -941,15 +1316,13 @@ __bam_c_prev(dbp, cp)
 	 */
 	for (;;) {
 		if (indx == 0) {
-			pgno = cp->page->prev_pgno;
-			DISCARD(dbp, cp);
-
 			/*
 			 * If we're in a btree leaf page, we've reached the
 			 * beginning of the tree.  If we've reached the first
 			 * of a page of duplicates, continue from the btree
 			 * leaf page where we found this page of duplicates.
 			 */
+			pgno = cp->page->prev_pgno;
 			if (pgno == PGNO_INVALID) {
 				/* If in a btree leaf page, it's SOF. */
 				if (cp->dpgno == PGNO_INVALID)
@@ -965,10 +1338,12 @@ __bam_c_prev(dbp, cp)
 			} else
 				set_indx = 1;
 
-			if ((ret = __bam_lget(dbp,
+			DISCARD(dbc, cp);
+			if ((ret = __bam_lget(dbc,
 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
 				return (ret);
-			if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+			if ((ret =
+			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 				return (ret);
 
 			if (set_indx)
@@ -979,11 +1354,7 @@ __bam_c_prev(dbp, cp)
 
 		/* Ignore deleted records. */
 		indx -= adjust;
-		if (dbp->type == DB_BTREE &&
-		    ((cp->dpgno == PGNO_INVALID &&
-		    B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) ||
-		    (cp->dpgno != PGNO_INVALID &&
-		    B_DISSET(GET_BKEYDATA(cp->page, indx)->type))))
+		if (IS_DELETED(cp, indx))
 			continue;
 
 		/*
@@ -995,8 +1366,7 @@ __bam_c_prev(dbp, cp)
 			cp->pgno = cp->page->pgno;
 			cp->indx = indx;
 
-			if ((ret =
-			    __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0)
+			if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
 				return (ret);
 			if (cp->dpgno != PGNO_INVALID) {
 				indx = cp->dindx + O_INDX;
@@ -1017,499 +1387,261 @@ __bam_c_prev(dbp, cp)
  *	Move to a specified record.
  */
 static int
-__bam_c_search(dbp, cp, key, flags, isrecno, exactp)
-	DB *dbp;
+__bam_c_search(dbc, cp, key, flags, exactp)
+	DBC *dbc;
 	CURSOR *cp;
 	const DBT *key;
 	u_int32_t flags;
-	int isrecno, *exactp;
+	int *exactp;
 {
 	BTREE *t;
+	DB *dbp;
+	DB_LOCK lock;
+	PAGE *h;
 	db_recno_t recno;
-	int needexact, ret;
+	db_indx_t indx;
+	u_int32_t sflags;
+	int cmp, needexact, ret;
 
+	dbp = dbc->dbp;
 	t = dbp->internal;
-	needexact = *exactp;
 
-	/*
-	 * Find any matching record; the search function pins the page.  Make
-	 * sure it's a valid key (__bam_search may return an index just past
-	 * the end of a page) and return it.
-	 */
-	if (isrecno) {
-		if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0)
+	/* Find an entry in the database. */
+	switch (flags) {
+	case DB_SET_RECNO:
+		if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
 			return (ret);
-		ret = __bam_rsearch(dbp, &recno, flags, 1, exactp);
-	} else
-		ret = __bam_search(dbp, key, flags, 1, NULL, exactp);
+		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
+		needexact = *exactp = 1;
+		ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
+		break;
+	case DB_SET:
+	case DB_GET_BOTH:
+		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
+		needexact = *exactp = 1;
+		goto search;
+	case DB_SET_RANGE:
+		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
+		needexact = *exactp = 0;
+		goto search;
+	case DB_KEYFIRST:
+		sflags = S_KEYFIRST;
+		goto fast_search;
+	case DB_KEYLAST:
+		sflags = S_KEYLAST;
+fast_search:	needexact = *exactp = 0;
+		/*
+		 * If the application has a history of inserting into the first
+		 * or last pages of the database, we check those pages first to
+		 * avoid doing a full search.
+		 *
+		 * Record numbers can't be fast-tracked, the entire tree has to
+		 * be locked.
+		 */
+		h = NULL;
+		lock = LOCK_INVALID;
+		if (F_ISSET(dbp, DB_BT_RECNUM))
+			goto search;
+
+		/* Check if the application has a history of sorted input. */
+		if (t->bt_lpgno == PGNO_INVALID)
+			goto search;
+
+		/*
+		 * Lock and retrieve the page on which we did the last insert.
+		 * It's okay if it doesn't exist, or if it's not the page type
+		 * we expected, it just means that the world changed.
+		 */
+		if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
+			goto fast_miss;
+		if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
+			goto fast_miss;
+		if (TYPE(h) != P_LBTREE)
+			goto fast_miss;
+		if (NUM_ENT(h) == 0)
+			goto fast_miss;
+
+		/*
+		 * What we do here is test to see if we're at the beginning or
+		 * end of the tree and if the new item sorts before/after the
+		 * first/last page entry.  We don't try and catch inserts into
+		 * the middle of the tree (although we could, as long as there
+		 * were two keys on the page and we saved both the index and
+		 * the page number of the last insert).
+		 */
+		if (h->next_pgno == PGNO_INVALID) {
+			indx = NUM_ENT(h) - P_INDX;
+			if ((cmp =
+			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
+				goto try_begin;
+			if (cmp > 0) {
+				indx += P_INDX;
+				goto fast_hit;
+			}
+
+			/*
+			 * Found a duplicate.  If doing DB_KEYLAST, we're at
+			 * the correct position, otherwise, move to the first
+			 * of the duplicates.
+			 */
+			if (flags == DB_KEYLAST)
+				goto fast_hit;
+			for (;
+			    indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
+			    indx -= P_INDX)
+				;
+			goto fast_hit;
+		}
+try_begin:	if (h->prev_pgno == PGNO_INVALID) {
+			indx = 0;
+			if ((cmp =
+			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
+				goto fast_miss;
+			if (cmp < 0)
+				goto fast_hit;
+			/*
+			 * Found a duplicate.  If doing DB_KEYFIRST, we're at
+			 * the correct position, otherwise, move to the last
+			 * of the duplicates.
+			 */
+			if (flags == DB_KEYFIRST)
+				goto fast_hit;
+			for (;
+			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+			    h->inp[indx] == h->inp[indx + P_INDX];
+			    indx += P_INDX)
+				;
+			goto fast_hit;
+		}
+		goto fast_miss;
+
+fast_hit:	/* Set the exact match flag, we may have found a duplicate. */
+		*exactp = cmp == 0;
+
+		/* Enter the entry in the stack. */
+		BT_STK_CLR(cp);
+		BT_STK_ENTER(cp, h, indx, lock, ret);
+		break;
+
+fast_miss:	if (h != NULL)
+			(void)memp_fput(dbp->mpf, h, 0);
+		if (lock != LOCK_INVALID)
+			(void)__BT_LPUT(dbc, lock);
+
+search:		ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
+		break;
+	default:				/* XXX: Impossible. */
+		abort();
+		/* NOTREACHED */
+	}
 	if (ret != 0)
 		return (ret);
 
-	cp->page = t->bt_csp->page;
-	cp->pgno = cp->page->pgno;
-	cp->indx = t->bt_csp->indx;
-	cp->lock = t->bt_csp->lock;
-	cp->dpgno = PGNO_INVALID;
-
 	/*
-	 * If we have an exact match, make sure that we're not looking at a
-	 * chain of duplicates -- if so, move to an entry in that chain.
+	 * Initialize the cursor to reference it.  This has to be done
+	 * before we return (even with DB_NOTFOUND) because we have to
+	 * free the page(s) we locked in __bam_search.
 	 */
-	if (*exactp) {
-		if ((ret = __bam_ovfl_chk(dbp,
-		    cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST))) != 0)
-			return (ret);
-	} else
-		if (needexact)
-			return (DB_NOTFOUND);
-
-	/* If past the end of a page, find the next entry. */
-	if (cp->indx == NUM_ENT(cp->page) &&
-	    (ret = __bam_c_next(dbp, cp, 0)) != 0)
-		return (ret);
+	cp->page = cp->csp->page;
+	cp->pgno = cp->csp->page->pgno;
+	cp->indx = cp->csp->indx;
+	cp->lock = cp->csp->lock;
+	cp->dpgno = PGNO_INVALID;
 
-	/* If it's a deleted record, go to the next or previous one. */
-	if (cp->dpgno != PGNO_INVALID &&
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) {
-		if (flags == S_KEYLAST) {
-			if ((ret = __bam_c_prev(dbp, cp)) != 0)
-				return (ret);
-		} else
-			if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
-				return (ret);
-	}
 	/*
-	 * If we don't specify an exact match (the DB_KEYFIRST/DB_KEYLAST or
-	 * DB_SET_RANGE flags were set) __bam_search() may return a deleted
-	 * item.  For DB_KEYFIRST/DB_KEYLAST, we don't care since we're only
-	 * using it for a tree position.  For DB_SET_RANGE, we're returning
-	 * the key, so we have to adjust it.
+	 * If we inserted a key into the first or last slot of the tree,
+	 * remember where it was so we can do it more quickly next time.
 	 */
-	if (LF_ISSET(S_DELNO) && cp->dpgno == PGNO_INVALID &&
-	    B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type))
-		if ((ret = __bam_c_next(dbp, cp, 0)) != 0)
-			return (ret);
+	if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
+		t->bt_lpgno =
+		    ((cp->page->next_pgno == PGNO_INVALID &&
+		    cp->indx >= NUM_ENT(cp->page)) ||
+		    (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
+		    cp->pgno : PGNO_INVALID;
+
+	/* If we need an exact match and didn't find one, we're done. */
+	if (needexact && *exactp == 0)
+		return (DB_NOTFOUND);
 
 	return (0);
 }
 
 /*
- * __bam_ovfl_chk --
- *	Check for an overflow record, and if found, move to the correct
- *	record.
+ * __bam_dup --
+ *	Check for an off-page duplicates entry, and if found, move to the
+ *	first or last entry.
  *
- * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int));
+ * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
  */
 int
-__bam_ovfl_chk(dbp, cp, indx, to_end)
-	DB *dbp;
+__bam_dup(dbc, cp, indx, last_dup)
+	DBC *dbc;
 	CURSOR *cp;
 	u_int32_t indx;
-	int to_end;
+	int last_dup;
 {
 	BOVERFLOW *bo;
+	DB *dbp;
 	db_pgno_t pgno;
 	int ret;
 
-	/* Check for an overflow entry. */
-	bo = GET_BOVERFLOW(cp->page, indx);
-	if (B_TYPE(bo->type) != B_DUPLICATE)
-		return (0);
+	dbp = dbc->dbp;
 
 	/*
-	 * If we find one, go to the duplicates page, and optionally move
-	 * to the last record on that page.
+	 * Check for an overflow entry.  If we find one, move to the
+	 * duplicates page, and optionally move to the last record on
+	 * that page.
 	 *
-	 * XXX
+	 * !!!
 	 * We don't lock duplicates pages, we've already got the correct
 	 * lock on the main page.
 	 */
+	bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
+	if (B_TYPE(bo->type) != B_DUPLICATE)
+		return (0);
+
 	pgno = bo->pgno;
 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
 		return (ret);
 	cp->page = NULL;
-	if (to_end) {
-		if ((ret = __db_dend(dbp, pgno, &cp->page)) != 0)
+	if (last_dup) {
+		if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
 			return (ret);
 		indx = NUM_ENT(cp->page) - O_INDX;
 	} else {
-		if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
 			return (ret);
 		indx = 0;
 	}
 
-	/* Update the duplicate entry in the cursor. */
+	/* Update the cursor's duplicate information. */
 	cp->dpgno = cp->page->pgno;
 	cp->dindx = indx;
 
 	return (0);
 }
 
-#ifdef DEBUG
-/*
- * __bam_cprint --
- *	Display the current btree cursor list.
- *
- * PUBLIC: int __bam_cprint __P((DB *));
- */
-int
-__bam_cprint(dbp)
-	DB *dbp;
-{
-	CURSOR *cp;
-	DBC *dbc;
-
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-		fprintf(stderr,
-		    "%#0x: page: %lu index: %lu dpage %lu dindex: %lu",
-		    (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx,
-		    (u_long)cp->dpgno, (u_long)cp->dindx);
-		if (F_ISSET(cp, C_DELETED))
-			fprintf(stderr, "(deleted)");
-		fprintf(stderr, "\n");
-	}
-	CURSOR_TEARDOWN(dbp);
-
-	return (0);
-}
-#endif /* DEBUG */
-
-/*
- * __bam_ca_delete --
- * 	Check if any of the cursors refer to the item we are about to delete,
- *	returning the number of cursors that refer to the item in question.
- *
- * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int));
- */
-int
-__bam_ca_delete(dbp, pgno, indx, curs, key_delete)
-	DB *dbp;
-	db_pgno_t pgno;
-	u_int32_t indx;
-	CURSOR *curs;
-	int key_delete;
-{
-	DBC *dbc;
-	CURSOR *cp;
-	int count;		/* !!!: Has to contain max number of cursors. */
-
-	/*
-	 * Adjust the cursors.  We don't have to review the cursors for any
-	 * process other than the current one, because we have the page write
-	 * locked at this point, and any other process had better be using a
-	 * different locker ID, meaning that only cursors in our process can
-	 * be on the page.
-	 *
-	 * It's possible for multiple cursors within the thread to have write
-	 * locks on the same page, but, cursors within a thread must be single
-	 * threaded, so all we're locking here is the cursor linked list.
-	 */
-	CURSOR_SETUP(dbp);
-	for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-
-		/*
-		 * Optionally, a cursor passed in is the one initiating the
-		 * delete, so we don't want to count it or set its deleted
-		 * flag.  Otherwise, if a cursor refers to the item, then we
-		 * set its deleted flag.
-		 */
-		if (curs == cp)
-			continue;
-
-		/*
-		 * If we're deleting the key itself and not just one of its
-		 * duplicates, repoint the cursor to the main-page key/data
-		 * pair, everything else is about to be discarded.
-		 */
-		if (key_delete || cp->dpgno == PGNO_INVALID) {
-			if (cp->pgno == pgno && cp->indx == indx) {
-				cp->dpgno = PGNO_INVALID;
-				++count;
-				F_SET(cp, C_DELETED);
-			}
-		} else
-			if (cp->dpgno == pgno && cp->dindx == indx) {
-				++count;
-				F_SET(cp, C_DELETED);
-			}
-	}
-	CURSOR_TEARDOWN(dbp);
-
-	return (count);
-}
-
-/*
- * __bam_ca_di --
- *	Adjust the cursors during a delete or insert.
- *
- * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int));
- */
-void
-__bam_ca_di(dbp, pgno, indx, adjust)
-	DB *dbp;
-	db_pgno_t pgno;
-	u_int32_t indx;
-	int adjust;
-{
-	CURSOR *cp;
-	DBC *dbc;
-
-	/* Recno is responsible for its own adjustments. */
-	if (dbp->type == DB_RECNO)
-		return;
-
-	/*
-	 * Adjust the cursors.  See the comment in __bam_ca_delete().
-	 */
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-		if (cp->pgno == pgno && cp->indx >= indx)
-			cp->indx += adjust;
-		if (cp->dpgno == pgno && cp->dindx >= indx)
-			cp->dindx += adjust;
-	}
-	CURSOR_TEARDOWN(dbp);
-}
-
-/*
- * __bam_ca_dup --
- *	Adjust the cursors when moving data items to a duplicates page.
- *
- * PUBLIC: void __bam_ca_dup __P((DB *,
- * PUBLIC:    db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t));
- */
-void
-__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti)
-	DB *dbp;
-	db_pgno_t fpgno, tpgno;
-	u_int32_t first, fi, ti;
-{
-	CURSOR *cp;
-	DBC *dbc;
-
-	/*
-	 * Adjust the cursors.  See the comment in __bam_ca_delete().
-	 *
-	 * No need to test duplicates, this only gets called when moving
-	 * leaf page data items onto a duplicates page.
-	 */
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-		/*
-		 * Ignore matching entries that have already been moved,
-		 * we move from the same location on the leaf page more
-		 * than once.
-		 */
-		if (cp->dpgno == PGNO_INVALID &&
-		    cp->pgno == fpgno && cp->indx == fi) {
-			cp->indx = first;
-			cp->dpgno = tpgno;
-			cp->dindx = ti;
-		}
-	}
-	CURSOR_TEARDOWN(dbp);
-}
-
-/*
- * __bam_ca_move --
- *	Adjust the cursors when moving data items to another page.
- *
- * PUBLIC: void __bam_ca_move __P((DB *, db_pgno_t, db_pgno_t));
- */
-void
-__bam_ca_move(dbp, fpgno, tpgno)
-	DB *dbp;
-	db_pgno_t fpgno, tpgno;
-{
-	CURSOR *cp;
-	DBC *dbc;
-
-	/* Recno is responsible for its own adjustments. */
-	if (dbp->type == DB_RECNO)
-		return;
-
-	/*
-	 * Adjust the cursors.  See the comment in __bam_ca_delete().
-	 *
-	 * No need to test duplicates, this only gets called when copying
-	 * over the root page with a leaf or internal page.
-	 */
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-		if (cp->pgno == fpgno)
-			cp->pgno = tpgno;
-	}
-	CURSOR_TEARDOWN(dbp);
-}
-
-/*
- * __bam_ca_replace --
- * 	Check if any of the cursors refer to the item we are about to replace.
- *	If so, their flags should be changed from deleted to replaced.
- *
- * PUBLIC: void __bam_ca_replace
- * PUBLIC:    __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg));
- */
-void
-__bam_ca_replace(dbp, pgno, indx, pass)
-	DB *dbp;
-	db_pgno_t pgno;
-	u_int32_t indx;
-	ca_replace_arg pass;
-{
-	CURSOR *cp;
-	DBC *dbc;
-
-	/*
-	 * Adjust the cursors.  See the comment in __bam_ca_delete().
-	 *
-	 * Find any cursors that have logically deleted a record we're about
-	 * to overwrite.
-	 *
-	 * Pass == REPLACE_SETUP:
-	 *	Set C_REPLACE_SETUP so we can find the cursors again.
-	 *
-	 * Pass == REPLACE_SUCCESS:
-	 *	Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the
-	 *	overwrite was successful.
-	 *
-	 * Pass == REPLACE_FAILED:
-	 *	Clear C_REPLACE_SETUP, the overwrite failed.
-	 *
-	 * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value
-	 * for the cursor as it may have been changed by other cursor update
-	 * routines as the item was deleted/inserted.
-	 */
-	CURSOR_SETUP(dbp);
-	switch (pass) {
-	case REPLACE_SETUP:			/* Setup. */
-		for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-		    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-			cp = (CURSOR *)dbc->internal;
-			if ((cp->pgno == pgno && cp->indx == indx) ||
-			    (cp->dpgno == pgno && cp->dindx == indx))
-				F_SET(cp, C_REPLACE_SETUP);
-		}
-		break;
-	case REPLACE_SUCCESS:			/* Overwrite succeeded. */
-		for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-		    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-			cp = (CURSOR *)dbc->internal;
-			if (F_ISSET(cp, C_REPLACE_SETUP)) {
-				if (cp->dpgno == pgno)
-					cp->dindx = indx;
-				if (cp->pgno == pgno)
-					cp->indx = indx;
-				F_SET(cp, C_REPLACE);
-				F_CLR(cp, C_DELETED | C_REPLACE_SETUP);
-			}
-		}
-		break;
-	case REPLACE_FAILED:			/* Overwrite failed. */
-		for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-		    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-			cp = (CURSOR *)dbc->internal;
-			if (F_ISSET(cp, C_REPLACE_SETUP)) {
-				if (cp->dpgno == pgno)
-					cp->dindx = indx;
-				if (cp->pgno == pgno)
-					cp->indx = indx;
-				F_CLR(cp, C_REPLACE_SETUP);
-			}
-		}
-		break;
-	}
-	CURSOR_TEARDOWN(dbp);
-}
-
-/*
- * __bam_ca_split --
- *	Adjust the cursors when splitting a page.
- *
- * PUBLIC: void __bam_ca_split __P((DB *,
- * PUBLIC:    db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
- */
-void
-__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft)
-	DB *dbp;
-	db_pgno_t ppgno, lpgno, rpgno;
-	u_int32_t split_indx;
-	int cleft;
-{
-	DBC *dbc;
-	CURSOR *cp;
-
-	/* Recno is responsible for its own adjustments. */
-	if (dbp->type == DB_RECNO)
-		return;
-
-	/*
-	 * Adjust the cursors.  See the comment in __bam_ca_delete().
-	 *
-	 * If splitting the page that a cursor was on, the cursor has to be
-	 * adjusted to point to the same record as before the split.  Most
-	 * of the time we don't adjust pointers to the left page, because
-	 * we're going to copy its contents back over the original page.  If
-	 * the cursor is on the right page, it is decremented by the number of
-	 * records split to the left page.
-	 */
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (CURSOR *)dbc->internal;
-		if (cp->pgno == ppgno) {
-			if (cp->indx < split_indx) {
-				if (cleft)
-					cp->pgno = lpgno;
-			} else {
-				cp->pgno = rpgno;
-				cp->indx -= split_indx;
-			}
-		}
-		if (cp->dpgno == ppgno) {
-			if (cp->dindx < split_indx) {
-				if (cleft)
-					cp->dpgno = lpgno;
-			} else {
-				cp->dpgno = rpgno;
-				cp->dindx -= split_indx;
-			}
-		}
-	}
-	CURSOR_TEARDOWN(dbp);
-}
-
 /*
  * __bam_c_physdel --
  *	Actually do the cursor deletion.
  */
 static int
-__bam_c_physdel(dbp, cp, h)
-	DB *dbp;
+__bam_c_physdel(dbc, cp, h)
+	DBC *dbc;
 	CURSOR *cp;
 	PAGE *h;
 {
 	enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
 	BOVERFLOW bo;
-	BTREE *t;
+	DB *dbp;
 	DBT dbt;
 	DB_LOCK lock;
 	db_indx_t indx;
 	db_pgno_t pgno, next_pgno, prev_pgno;
 	int delete_page, local_page, ret;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
+
 	delete_page = ret = 0;
 
 	/* Figure out what we're deleting. */
@@ -1522,20 +1654,37 @@ __bam_c_physdel(dbp, cp, h)
 	}
 
 	/*
-	 * If the item is referenced by another cursor, leave it up to that
-	 * cursor to do the delete.
+	 * If the item is referenced by another cursor, set that cursor's
+	 * delete flag and leave it up to that cursor to do the delete.
+	 *
+	 * !!!
+	 * This test for > 0 is tricky.  There are two ways that we can
+	 * be called here.  Either we are closing the cursor or we've moved
+	 * off the page with the deleted entry.  In the first case, we've
+	 * already removed the cursor from the active queue, so we won't see
+	 * it in __bam_ca_delete. In the second case, it will be on a different
+	 * item, so we won't bother with it in __bam_ca_delete.
 	 */
-	if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0)
+	if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
 		return (0);
 
 	/*
+	 * If this is concurrent DB, upgrade the lock if necessary.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
+	    (ret = lock_get(dbp->dbenv->lk_info,
+	    dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+	    &dbc->mylock)) != 0)
+		return (EAGAIN);
+
+	/*
 	 * If we don't already have the page locked, get it and delete the
 	 * items.
 	 */
 	if ((h == NULL || h->pgno != pgno)) {
-		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 			return (ret);
 		local_page = 1;
 	} else
@@ -1581,7 +1730,7 @@ __bam_c_physdel(dbp, cp, h)
 			cmd = DELETE_ITEM;
 
 			/* Delete the duplicate. */
-			if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0)
+			if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
 				goto err;
 
 			/*
@@ -1610,7 +1759,7 @@ __bam_c_physdel(dbp, cp, h)
 		if (local_page) {
 			if (h != NULL)
 				(void)memp_fput(dbp->mpf, h, 0);
-			(void)__BT_TLPUT(dbp, lock);
+			(void)__BT_TLPUT(dbc, lock);
 			local_page = 0;
 		}
 
@@ -1619,10 +1768,10 @@ __bam_c_physdel(dbp, cp, h)
 
 		/* Acquire the parent page and switch the index to its entry. */
 		if ((ret =
-		    __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
+		    __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
 			goto err;
-		if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) {
-			(void)__BT_TLPUT(dbp, lock);
+		if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
+			(void)__BT_TLPUT(dbc, lock);
 			goto err;
 		}
 		local_page = 1;
@@ -1641,12 +1790,12 @@ __bam_c_physdel(dbp, cp, h)
 		 */
 		indx += O_INDX;
 		bo = *GET_BOVERFLOW(h, indx);
-		(void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE);
+		(void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
 		bo.pgno = next_pgno;
 		memset(&dbt, 0, sizeof(dbt));
 		dbt.data = &bo;
 		dbt.size = BOVERFLOW_SIZE;
-		(void)__db_pitem(dbp, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
+		(void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
 		(void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
 		goto done;
 	}
@@ -1661,7 +1810,7 @@ btd:	/*
 	 * set them is because we're (potentially) about to do a reverse split,
 	 * which would make our saved page information useless.
 	 *
-	 * XXX
+	 * !!!
 	 * The following operations to delete a page might deadlock.  I think
 	 * that's OK.  The problem is if we're deleting an item because we're
 	 * closing cursors because we've already deadlocked and want to call
@@ -1680,37 +1829,44 @@ btd:	/*
 	/*
 	 * Do a normal btree delete.
 	 *
-	 * XXX
+	 * !!!
 	 * Delete the key item first, otherwise the duplicate checks in
 	 * __bam_ditem() won't work!
 	 */
-	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
 		goto err;
-	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
 		goto err;
 
 	/* Discard any remaining locks/pages. */
 	if (local_page) {
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_TLPUT(dbp, lock);
+		(void)__BT_TLPUT(dbc, lock);
 		local_page = 0;
 	}
 
 	/* Delete the page if it was emptied. */
 	if (delete_page)
-		ret = __bam_dpage(dbp, &dbt);
+		ret = __bam_dpage(dbc, &dbt);
 
 err:
 done:	if (delete_page)
-		__db_free(dbt.data);
+		__os_free(dbt.data, dbt.size);
 
 	if (local_page) {
-		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_TLPUT(dbp, lock);
+		/*
+		 * It's possible for h to be NULL, as __db_drem may have
+		 * been relinking pages by the time that it deadlocked.
+		 */
+		if (h != NULL)
+			(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_TLPUT(dbc, lock);
 	}
 
-	if (ret == 0)
-		++t->lstat.bt_deleted;
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
+
 	return (ret);
 }
 
@@ -1719,22 +1875,24 @@ done:	if (delete_page)
  *	Acquire a full stack for a cursor.
  */
 static int
-__bam_c_getstack(dbp, cp)
-	DB *dbp;
+__bam_c_getstack(dbc, cp)
+	DBC *dbc;
 	CURSOR *cp;
 {
+	DB *dbp;
 	DBT dbt;
 	PAGE *h;
 	db_pgno_t pgno;
 	int exact, ret;
 
-	ret = 0;
+	dbp = dbc->dbp;
 	h = NULL;
 	memset(&dbt, 0, sizeof(DBT));
+	ret = 0;
 
 	/* Get the page with the current item on it. */
 	pgno = cp->pgno;
-	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 		return (ret);
 
 	/* Get a copy of a key from the page. */
@@ -1744,12 +1902,12 @@ __bam_c_getstack(dbp, cp)
 
 	/* Get a write-locked stack for that page. */
 	exact = 0;
-	ret = __bam_search(dbp, &dbt, S_KEYFIRST, 1, NULL, &exact);
+	ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
 
 	/* We no longer need the key or the page. */
 err:	if (h != NULL)
 		(void)memp_fput(dbp->mpf, h, 0);
 	if (dbt.data != NULL)
-		__db_free(dbt.data);
+		__os_free(dbt.data, dbt.size);
 	return (ret);
 }
diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c
index 7e71037e46..d623bd8a6f 100644
--- a/db2/btree/bt_delete.c
+++ b/db2/btree/bt_delete.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_delete.c	10.31 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_delete.c	10.43 (Sleepycat) 12/7/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -60,8 +60,6 @@ static const char sccsid[] = "@(#)bt_delete.c	10.31 (Sleepycat) 5/6/98";
 #include "db_page.h"
 #include "btree.h"
 
-static int __bam_dpages __P((DB *, BTREE *));
-
 /*
  * __bam_delete --
  *	Delete the items referenced by a key.
@@ -69,182 +67,67 @@ static int __bam_dpages __P((DB *, BTREE *));
  * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
  */
 int
-__bam_delete(argdbp, txn, key, flags)
-	DB *argdbp;
-	DB_TXN *txn;
-	DBT *key;
-	u_int32_t flags;
-{
-	BTREE *t;
+__bam_delete(dbp, txn, key, flags)
 	DB *dbp;
-	PAGE *h;
-	db_indx_t cnt, i, indx;
-	int dpage, exact, ret, stack;
-
-	DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags);
-
-	stack = 0;
-
-	/* Check for invalid flags. */
-	if ((ret = __db_delchk(argdbp,
-	    key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0)
-		return (ret);
-
-	GETHANDLE(argdbp, txn, &dbp, ret);
-	t = dbp->internal;
-
-	/* Search the tree for the key; delete only deletes exact matches. */
-	if ((ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact)) != 0)
-		goto err;
-	stack = 1;
-	h = t->bt_csp->page;
-	indx = t->bt_csp->indx;
-
-	/* Delete the key/data pair, including any on-or-off page duplicates. */
-	for (cnt = 1, i = indx;; ++cnt)
-		if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx])
-			break;
-	for (; cnt > 0; --cnt, ++t->lstat.bt_deleted)
-		if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) {
-			/*
-			 * XXX
-			 * Delete the key item first, otherwise the duplicate
-			 * checks in __bam_ditem() won't work!
-			 */
-			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
-				goto err;
-			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
-				goto err;
-		} else {
-			B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
-			indx += P_INDX;
-		}
-
-	/* If we're using record numbers, update internal page record counts. */
-	if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0)
-		goto err;
-
-	/* If the page is now empty, delete it. */
-	dpage = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT;
-
-	__bam_stkrel(dbp);
-	stack = 0;
-
-	ret = dpage ? __bam_dpage(dbp, key) : 0;
-
-err:	if (stack)
-		__bam_stkrel(dbp);
-	PUTHANDLE(dbp);
-	return (ret);
-}
-
-/*
- * __ram_delete --
- *	Delete the items referenced by a key.
- *
- * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
- */
-int
-__ram_delete(argdbp, txn, key, flags)
-	DB *argdbp;
 	DB_TXN *txn;
 	DBT *key;
 	u_int32_t flags;
 {
-	BKEYDATA bk;
-	BTREE *t;
-	DB *dbp;
-	DBT hdr, data;
-	PAGE *h;
-	db_indx_t indx;
-	db_recno_t recno;
-	int exact, ret, stack;
+	DBC *dbc;
+	DBT data;
+	u_int32_t f_init, f_next;
+	int ret, t_ret;
 
-	stack = 0;
+	DB_PANIC_CHECK(dbp);
 
 	/* Check for invalid flags. */
-	if ((ret = __db_delchk(argdbp,
-	    key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0)
+	if ((ret =
+	    __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
 		return (ret);
 
-	GETHANDLE(argdbp, txn, &dbp, ret);
-	t = dbp->internal;
-
-	/* Check the user's record number and fill in as necessary. */
-	if ((ret = __ram_getno(argdbp, key, &recno, 0)) != 0)
-		goto err;
-
-	/* Search the tree for the key; delete only deletes exact matches. */
-	if ((ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact)) != 0)
-		goto err;
-	if (!exact) {
-		ret = DB_NOTFOUND;
-		goto err;
-	}
-
-	h = t->bt_csp->page;
-	indx = t->bt_csp->indx;
-	stack = 1;
+	/* Allocate a cursor. */
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+		return (ret);
 
-	/* If the record has already been deleted, we couldn't have found it. */
-	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
-		ret = DB_KEYEMPTY;
-		goto done;
-	}
+	DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags);
 
 	/*
-	 * If we're not renumbering records, replace the record with a marker
-	 * and return.
+	 * Walk a cursor through the key/data pairs, deleting as we go.  Set
+	 * the DB_DBT_USERMEM flag, as this might be a threaded application
+	 * and the flags checking will catch us.  We don't actually want the
+	 * keys or data, so request a partial of length 0.
 	 */
-	if (!F_ISSET(dbp, DB_RE_RENUMBER)) {
-		if ((ret = __bam_ditem(dbp, h, indx)) != 0)
-			goto err;
-
-		B_TSET(bk.type, B_KEYDATA, 1);
-		bk.len = 0;
-		memset(&hdr, 0, sizeof(hdr));
-		hdr.data = &bk;
-		hdr.size = SSZA(BKEYDATA, data);
-		memset(&data, 0, sizeof(data));
-		data.data = (char *)"";
-		data.size = 0;
-		if ((ret = __db_pitem(dbp,
-		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
-			goto err;
-
-		++t->lstat.bt_deleted;
-		goto done;
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+	/* If locking, set read-modify-write flag. */
+	f_init = DB_SET;
+	f_next = DB_NEXT_DUP;
+	if (dbp->dbenv != NULL && dbp->dbenv->lk_info != NULL) {
+		f_init |= DB_RMW;
+		f_next |= DB_RMW;
 	}
 
-	/* Delete the item. */
-	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+	/* Walk through the set of key/data pairs, deleting as we go. */
+	if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0)
 		goto err;
-
-	++t->lstat.bt_deleted;
-	if (t->bt_recno != NULL)
-		F_SET(t->bt_recno, RECNO_MODIFIED);
-
-	/* Adjust the counts. */
-	__bam_adjust(dbp, t, -1);
-
-	/* Adjust the cursors. */
-	__ram_ca(dbp, recno, CA_DELETE);
-
-	/*
-	 * If the page is now empty, delete it -- we have the whole tree
-	 * locked, so there are no preparations to make.  Else, release
-	 * the pages.
-	 */
-	if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
-		stack = 0;
-		ret = __bam_dpages(dbp, t);
+	for (;;) {
+		if ((ret = dbc->c_del(dbc, 0)) != 0)
+			goto err;
+		if ((ret = dbc->c_get(dbc, key, &data, f_next)) != 0) {
+			if (ret == DB_NOTFOUND) {
+				ret = 0;
+				break;
+			}
+			goto err;
+		}
 	}
 
-done:
-err:	if (stack)
-		__bam_stkrel(dbp);
+err:	/* Discard the cursor. */
+	if ((t_ret = dbc->c_close(dbc)) != 0 &&
+	    (ret == 0 || ret == DB_NOTFOUND))
+		ret = t_ret;
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
@@ -252,20 +135,23 @@ err:	if (stack)
  * __bam_ditem --
  *	Delete one or more entries from a page.
  *
- * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t));
+ * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
  */
 int
-__bam_ditem(dbp, h, indx)
-	DB *dbp;
+__bam_ditem(dbc, h, indx)
+	DBC *dbc;
 	PAGE *h;
 	u_int32_t indx;
 {
 	BINTERNAL *bi;
 	BKEYDATA *bk;
 	BOVERFLOW *bo;
+	DB *dbp;
 	u_int32_t nbytes;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	switch (TYPE(h)) {
 	case P_IBTREE:
 		bi = GET_BINTERNAL(h, indx);
@@ -304,7 +190,7 @@ __bam_ditem(dbp, h, indx)
 			 */
 			if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
 			    h->inp[indx] == h->inp[indx + P_INDX])
-				return (__bam_adjindx(dbp,
+				return (__bam_adjindx(dbc,
 				    h, indx, indx + O_INDX, 0));
 			/*
 			 * Check for a duplicate before us on the page.  It
@@ -312,7 +198,7 @@ __bam_ditem(dbp, h, indx)
 			 * after the data item for the purposes of this one.
 			 */
 			if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
-				return (__bam_adjindx(dbp,
+				return (__bam_adjindx(dbc,
 				    h, indx, indx - P_INDX, 0));
 		}
 		/* FALLTHROUGH */
@@ -327,11 +213,11 @@ __bam_ditem(dbp, h, indx)
 offpage:		/* Delete duplicate/offpage chains. */
 			if (B_TYPE(bo->type) == B_DUPLICATE) {
 				if ((ret =
-				    __db_ddup(dbp, bo->pgno, __bam_free)) != 0)
+				    __db_ddup(dbc, bo->pgno, __bam_free)) != 0)
 					return (ret);
 			} else
 				if ((ret =
-				    __db_doff(dbp, bo->pgno, __bam_free)) != 0)
+				    __db_doff(dbc, bo->pgno, __bam_free)) != 0)
 					return (ret);
 			break;
 		case B_KEYDATA:
@@ -346,7 +232,7 @@ offpage:		/* Delete duplicate/offpage chains. */
 	}
 
 	/* Delete the item. */
-	if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0)
+	if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
 		return (ret);
 
 	/* Mark the page dirty. */
@@ -357,21 +243,24 @@ offpage:		/* Delete duplicate/offpage chains. */
  * __bam_adjindx --
  *	Adjust an index on the page.
  *
- * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int));
+ * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
  */
 int
-__bam_adjindx(dbp, h, indx, indx_copy, is_insert)
-	DB *dbp;
+__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
+	DBC *dbc;
 	PAGE *h;
 	u_int32_t indx, indx_copy;
 	int is_insert;
 {
+	DB *dbp;
 	db_indx_t copy;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/* Log the change. */
-	if (DB_LOGGING(dbp) &&
-	    (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h),
+	if (DB_LOGGING(dbc) &&
+	    (ret = __bam_adj_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
 	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy,
 	    (u_int32_t)is_insert)) != 0)
 		return (ret);
@@ -402,22 +291,24 @@ __bam_adjindx(dbp, h, indx, indx_copy, is_insert)
  * __bam_dpage --
  *	Delete a page from the tree.
  *
- * PUBLIC: int __bam_dpage __P((DB *, const DBT *));
+ * PUBLIC: int __bam_dpage __P((DBC *, const DBT *));
  */
 int
-__bam_dpage(dbp, key)
-	DB *dbp;
+__bam_dpage(dbc, key)
+	DBC *dbc;
 	const DBT *key;
 {
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	DB_LOCK lock;
 	PAGE *h;
 	db_pgno_t pgno;
 	int level;		/* !!!: has to hold number of tree levels. */
 	int exact, ret;
 
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 	ret = 0;
-	t = dbp->internal;
 
 	/*
 	 * The locking protocol is that we acquire locks by walking down the
@@ -433,40 +324,40 @@ __bam_dpage(dbp, key)
 	for (level = LEAFLEVEL;; ++level) {
 		/* Acquire a page and its parent, locked. */
 		if ((ret =
-		    __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0)
+		    __bam_search(dbc, key, S_WRPAIR, level, NULL, &exact)) != 0)
 			return (ret);
 
 		/*
 		 * If we reach the root or the page isn't going to be empty
 		 * when we delete one record, quit.
 		 */
-		h = t->bt_csp[-1].page;
+		h = cp->csp[-1].page;
 		if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1)
 			break;
 
 		/* Release the two locked pages. */
-		(void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0);
-		(void)__BT_TLPUT(dbp, t->bt_csp[-1].lock);
-		(void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0);
-		(void)__BT_TLPUT(dbp, t->bt_csp[0].lock);
+		(void)memp_fput(dbp->mpf, cp->csp[-1].page, 0);
+		(void)__BT_TLPUT(dbc, cp->csp[-1].lock);
+		(void)memp_fput(dbp->mpf, cp->csp[0].page, 0);
+		(void)__BT_TLPUT(dbc, cp->csp[0].lock);
 	}
 
 	/*
 	 * Leave the stack pointer one after the last entry, we may be about
 	 * to push more items on the stack.
 	 */
-	++t->bt_csp;
+	++cp->csp;
 
 	/*
-	 * t->bt_csp[-2].page is the top page, which we're not going to delete,
-	 * and t->bt_csp[-1].page is the first page we are going to delete.
+	 * cp->csp[-2].page is the top page, which we're not going to delete,
+	 * and cp->csp[-1].page is the first page we are going to delete.
 	 *
 	 * Walk down the chain, acquiring the rest of the pages until we've
 	 * retrieved the leaf page.  If we find any pages that aren't going
 	 * to be emptied by the delete, someone else added something while we
 	 * were walking the tree, and we discontinue the delete.
 	 */
-	for (h = t->bt_csp[-1].page;;) {
+	for (h = cp->csp[-1].page;;) {
 		if (ISLEAF(h)) {
 			if (NUM_ENT(h) != 0)
 				goto release;
@@ -482,45 +373,53 @@ __bam_dpage(dbp, key)
 		pgno = TYPE(h) == P_IBTREE ?
 		    GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno;
 
-		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
-			goto release;
-		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
 			goto release;
-		BT_STK_PUSH(t, h, 0, lock, ret);
-		if (ret != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 			goto release;
+		BT_STK_PUSH(cp, h, 0, lock, ret);
 	}
 
-	BT_STK_POP(t);
-	return (__bam_dpages(dbp, t));
+	/* Adjust back to reference the last page on the stack. */
+	BT_STK_POP(cp);
+
+	/* Delete the pages. */
+	return (__bam_dpages(dbc));
 
 release:
+	/* Adjust back to reference the last page on the stack. */
+	BT_STK_POP(cp);
+
 	/* Discard any locked pages and return. */
-	BT_STK_POP(t);
-	__bam_stkrel(dbp);
+	__bam_stkrel(dbc, 0);
+
 	return (ret);
 }
 
 /*
  * __bam_dpages --
  *	Delete a set of locked pages.
+ *
+ * PUBLIC: int __bam_dpages __P((DBC *));
  */
-static int
-__bam_dpages(dbp, t)
-	DB *dbp;
-	BTREE *t;
+int
+__bam_dpages(dbc)
+	DBC *dbc;
 {
+	CURSOR *cp;
+	DB *dbp;
 	DBT a, b;
-	DB_LOCK lock;
+	DB_LOCK c_lock, p_lock;
 	EPG *epg;
-	PAGE *h;
+	PAGE *child, *parent;
+	db_indx_t nitems;
 	db_pgno_t pgno;
 	db_recno_t rcnt;
-	int ret;
-
-	COMPQUIET(rcnt, 0);
+	int done, ret;
 
-	epg = t->bt_sp;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+	epg = cp->sp;
 
 	/*
 	 * !!!
@@ -533,45 +432,107 @@ __bam_dpages(dbp, t)
 	 * that we can never again access by walking down the tree.  So, before
 	 * we unlink the subtree, we relink the leaf page chain.
 	 */
-	if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0)
+	if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0)
 		goto release;
 
 	/*
-	 * We have the entire stack of deletable pages locked.  Start from the
-	 * top of the tree and move to the bottom, as it's better to release
-	 * the inner pages as soon as possible.
+	 * We have the entire stack of deletable pages locked.
+	 *
+	 * Delete the highest page in the tree's reference to the underlying
+	 * stack of pages.  Then, release that page, letting the rest of the
+	 * tree get back to business.
 	 */
-	if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0)
-		goto release;
+	if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) {
+release:	(void)__bam_stkrel(dbc, 0);
+		return (ret);
+	}
+
+	pgno = epg->page->pgno;
+	nitems = NUM_ENT(epg->page);
+
+	(void)memp_fput(dbp->mpf, epg->page, 0);
+	(void)__BT_TLPUT(dbc, epg->lock);
+
+	/*
+	 * Free the rest of the stack of pages.
+	 *
+	 * !!!
+	 * Don't bother checking for errors.  We've unlinked the subtree from
+	 * the tree, and there's no possibility of recovery outside of doing
+	 * TXN rollback.
+	 */
+	while (++epg <= cp->csp) {
+		/*
+		 * Delete page entries so they will be restored as part of
+		 * recovery.
+		 */
+		if (NUM_ENT(epg->page) != 0)
+			(void)__bam_ditem(dbc, epg->page, epg->indx);
+
+		(void)__bam_free(dbc, epg->page);
+		(void)__BT_TLPUT(dbc, epg->lock);
+	}
+	BT_STK_CLR(cp);
+
+	/*
+	 * Try and collapse the tree a level -- this is only applicable
+	 * if we've deleted the next-to-last element from the root page.
+	 *
+	 * There are two cases when collapsing a tree.
+	 *
+	 * If we've just deleted the last item from the root page, there is no
+	 * further work to be done.  The code above has emptied the root page
+	 * and freed all pages below it.
+	 */
+	if (pgno != PGNO_ROOT || nitems != 1)
+		return (0);
 
 	/*
-	 * If we just deleted the last or next-to-last item from the root page,
-	 * the tree can collapse a level.  Write lock the last page referenced
+	 * If we just deleted the next-to-last item from the root page, the
+	 * tree can collapse one or more levels.  While there remains only a
+	 * single item on the root page, write lock the last page referenced
 	 * by the root page and copy it over the root page.  If we can't get a
-	 * write lock, that's okay, the tree just remains a level deeper than
-	 * we'd like.
+	 * write lock, that's okay, the tree just stays deeper than we'd like.
 	 */
-	h = epg->page;
-	if (h->pgno == PGNO_ROOT && NUM_ENT(h) <= 1) {
-		pgno = TYPE(epg->page) == P_IBTREE ?
-		    GET_BINTERNAL(epg->page, 0)->pgno :
-		    GET_RINTERNAL(epg->page, 0)->pgno;
-		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
-			goto release;
-		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
-			goto release;
+	for (done = 0; !done;) {
+		/* Initialize. */
+		parent = child = NULL;
+		p_lock = c_lock = LOCK_INVALID;
+
+		/* Lock the root. */
+		pgno = PGNO_ROOT;
+		if ((ret =
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &p_lock)) != 0)
+			goto stop;
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0)
+			goto stop;
+
+		if (NUM_ENT(parent) != 1 ||
+		    (TYPE(parent) != P_IBTREE && TYPE(parent) != P_IRECNO))
+			goto stop;
+
+		pgno = TYPE(parent) == P_IBTREE ?
+		    GET_BINTERNAL(parent, 0)->pgno :
+		    GET_RINTERNAL(parent, 0)->pgno;
+
+		/* Lock the child page. */
+		if ((ret =
+		    __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &c_lock)) != 0)
+			goto stop;
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0)
+			goto stop;
 
 		/* Log the change. */
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			memset(&a, 0, sizeof(a));
-			a.data = h;
+			a.data = child;
 			a.size = dbp->pgsize;
 			memset(&b, 0, sizeof(b));
-			b.data = P_ENTRY(epg->page, 0);
+			b.data = P_ENTRY(parent, 0);
 			b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
-			__bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn,
-			   &h->lsn, 0, dbp->log_fileid, h->pgno, &a,
-			   RE_NREC(epg->page), &b, &epg->page->lsn);
+			__bam_rsplit_log(dbp->dbenv->lg_info, dbc->txn,
+			   &child->lsn, 0, dbp->log_fileid, child->pgno, &a,
+			   RE_NREC(parent), &b, &parent->lsn);
 		}
 
 		/*
@@ -579,69 +540,50 @@ __bam_dpages(dbp, t)
 		 *
 		 * One fixup -- if the tree has record numbers and we're not
 		 * converting to a leaf page, we have to preserve the total
-		 * record count.
+		 * record count.  Note that we are about to overwrite everything
+		 * on the parent, including its LSN.  This is actually OK,
+		 * because the above log message, which describes this update,
+		 * stores its LSN on the child page.  When the child is copied
+		 * to the parent, the correct LSN is going to be copied into
+		 * place in the parent.
 		 */
-		if (TYPE(h) == P_IRECNO ||
-		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
-			rcnt = RE_NREC(epg->page);
-		memcpy(epg->page, h, dbp->pgsize);
-		epg->page->pgno = PGNO_ROOT;
-		if (TYPE(h) == P_IRECNO ||
-		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
-			RE_NREC_SET(epg->page, rcnt);
-		(void)memp_fset(dbp->mpf, epg->page, DB_MPOOL_DIRTY);
+		COMPQUIET(rcnt, 0);
+		if (TYPE(child) == P_IRECNO ||
+		    (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			rcnt = RE_NREC(parent);
+		memcpy(parent, child, dbp->pgsize);
+		parent->pgno = PGNO_ROOT;
+		if (TYPE(child) == P_IRECNO ||
+		    (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			RE_NREC_SET(parent, rcnt);
+
+		/* Mark the pages dirty. */
+		memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY);
+		memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY);
+
+		/* Adjust the cursors. */
+		__bam_ca_rsplit(dbp, child->pgno, PGNO_ROOT);
 
 		/*
 		 * Free the page copied onto the root page and discard its
 		 * lock.  (The call to __bam_free() discards our reference
 		 * to the page.)
-		 *
-		 * It's possible that the reverse split we're doing involves
-		 * pages from the stack of pages we're deleting.  Don't free
-		 * the page twice.
 		 */
-		 if (h->pgno == (epg + 1)->page->pgno)
-			(void)memp_fput(dbp->mpf, h, 0);
-		else {
-			(void)__bam_free(dbp, h);
-			++t->lstat.bt_freed;
-		}
-		(void)__BT_TLPUT(dbp, lock);
+		(void)__bam_free(dbc, child);
+		child = NULL;
 
-		/* Adjust the cursors. */
-		__bam_ca_move(dbp, h->pgno, PGNO_ROOT);
+		if (0) {
+stop:			done = 1;
+		}
+		if (p_lock != LOCK_INVALID)
+			(void)__BT_TLPUT(dbc, p_lock);
+		if (parent != NULL)
+			memp_fput(dbp->mpf, parent, 0);
+		if (c_lock != LOCK_INVALID)
+			(void)__BT_TLPUT(dbc, c_lock);
+		if (child != NULL)
+			memp_fput(dbp->mpf, child, 0);
 	}
 
-	/* Release the top page in the subtree. */
-	(void)memp_fput(dbp->mpf, epg->page, 0);
-	(void)__BT_TLPUT(dbp, epg->lock);
-
-	/*
-	 * Free the rest of the pages.
-	 *
-	 * XXX
-	 * Don't bother checking for errors.  We've unlinked the subtree from
-	 * the tree, and there's no possibility of recovery.
-	 */
-	while (++epg <= t->bt_csp) {
-		/*
-		 * XXX
-		 * Why do we need to do this?  Isn't the page already empty?
-		 */
-		if (NUM_ENT(epg->page) != 0)
-			(void)__bam_ditem(dbp, epg->page, epg->indx);
-
-		(void)__bam_free(dbp, epg->page);
-		(void)__BT_TLPUT(dbp, epg->lock);
-		++t->lstat.bt_freed;
-	}
 	return (0);
-
-release:
-	/* Discard any remaining pages and return. */
-	for (; epg <= t->bt_csp; ++epg) {
-		(void)memp_fput(dbp->mpf, epg->page, 0);
-		(void)__BT_TLPUT(dbp, epg->lock);
-	}
-	return (ret);
 }
diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c
index f5974ec61e..a89cfccb97 100644
--- a/db2/btree/bt_open.c
+++ b/db2/btree/bt_open.c
@@ -47,17 +47,9 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_open.c	10.27 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_open.c	10.39 (Sleepycat) 11/21/98";
 #endif /* not lint */
 
-/*
- * Implementation of btree access method for 4.4BSD.
- *
- * The design here was originally based on that of the btree access method
- * used in the Postgres database system at UC Berkeley.  This implementation
- * is wholly independent of the Postgres code.
- */
-
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
@@ -70,40 +62,34 @@ static const char sccsid[] = "@(#)bt_open.c	10.27 (Sleepycat) 5/6/98";
 #include "db_page.h"
 #include "btree.h"
 
-static int __bam_keyalloc __P((BTREE *));
-static int __bam_setmeta __P((DB *, BTREE *));
-
 /*
  * __bam_open --
  *	Open a btree.
  *
- * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *));
+ * PUBLIC: int __bam_open __P((DB *, DB_INFO *));
  */
 int
-__bam_open(dbp, type, dbinfo)
+__bam_open(dbp, dbinfo)
 	DB *dbp;
-	DBTYPE type;
 	DB_INFO *dbinfo;
 {
 	BTREE *t;
 	int ret;
 
-	/* Allocate the btree internal structure. */
-	if ((t = (BTREE *)__db_calloc(1, sizeof(BTREE))) == NULL)
-		return (ENOMEM);
-
-	t->bt_sp = t->bt_csp = t->bt_stack;
-	t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]);
-
-	if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) &&
-	    (ret = __bam_keyalloc(t)) != 0)
-		goto err;
+	/* Allocate and initialize the private btree structure. */
+	if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0)
+		return (ret);
+	dbp->internal = t;
 
 	/*
 	 * Intention is to make sure all of the user's selections are okay
 	 * here and then use them without checking.
 	 */
-	if (dbinfo != NULL) {
+	if (dbinfo == NULL) {
+		t->bt_minkey = DEFMINKEYPAGE;
+		t->bt_compare = __bam_defcmp;
+		t->bt_prefix = __bam_defpfx;
+	} else {
 		/* Minimum number of keys per page. */
 		if (dbinfo->bt_minkey == 0)
 			t->bt_minkey = DEFMINKEYPAGE;
@@ -126,152 +112,125 @@ __bam_open(dbp, type, dbinfo)
 		 * If no comparison, use default comparison.  If no comparison
 		 * and no prefix, use default prefix.  (We can't default the
 		 * prefix if the user supplies a comparison routine; shortening
-		 * the keys may break their comparison algorithm.)
+		 * the keys may break their comparison algorithm.  We don't
+		 * permit the user to specify a prefix routine if they didn't
+		 * also specify a comparison routine; they can't know enough
+		 * about our comparison routine to get it right.)
 		 */
-		t->bt_compare = dbinfo->bt_compare == NULL ?
-		    __bam_defcmp : dbinfo->bt_compare;
-		t->bt_prefix = dbinfo->bt_prefix == NULL ?
-		    (dbinfo->bt_compare == NULL ?
-		    __bam_defpfx : NULL) : dbinfo->bt_prefix;
-	} else {
-		t->bt_minkey = DEFMINKEYPAGE;
-		t->bt_compare = __bam_defcmp;
-		t->bt_prefix = __bam_defpfx;
+		if ((t->bt_compare = dbinfo->bt_compare) == NULL) {
+			if (dbinfo->bt_prefix != NULL)
+				goto einval;
+			t->bt_compare = __bam_defcmp;
+			t->bt_prefix = __bam_defpfx;
+		} else
+			t->bt_prefix = dbinfo->bt_prefix;
 	}
 
-	/* Initialize the remaining fields of the DB. */
-	dbp->type = type;
-	dbp->internal = t;
-	dbp->cursor = __bam_cursor;
+	/* Initialize the remaining fields/methods of the DB. */
+	dbp->am_close = __bam_close;
 	dbp->del = __bam_delete;
-	dbp->get = __bam_get;
-	dbp->put = __bam_put;
 	dbp->stat = __bam_stat;
-	dbp->sync = __bam_sync;
-
-	/*
-	 * The btree data structure requires that at least two key/data pairs
-	 * can fit on a page, but other than that there's no fixed requirement.
-	 * Translate the minimum number of items into the bytes a key/data pair
-	 * can use before being placed on an overflow page.  We calculate for
-	 * the worst possible alignment by assuming every item requires the
-	 * maximum alignment for padding.
-	 *
-	 * Recno uses the btree bt_ovflsize value -- it's close enough.
-	 */
-	t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX)
-	    - (BKEYDATA_PSIZE(0) + ALIGN(1, 4));
 
-	/* Create a root page if new tree. */
-	if ((ret = __bam_setmeta(dbp, t)) != 0)
+	/* Start up the tree. */
+	if ((ret = __bam_read_root(dbp)) != 0)
 		goto err;
 
+	/* Set the overflow page size. */
+	__bam_setovflsize(dbp);
+
 	return (0);
 
 einval:	ret = EINVAL;
 
-err:	if (t != NULL) {
-		/* If we allocated room for key/data return, discard it. */
-		if (t->bt_rkey.data != NULL)
-			__db_free(t->bt_rkey.data);
-
-		FREE(t, sizeof(BTREE));
-	}
+err:	__os_free(t, sizeof(BTREE));
 	return (ret);
 }
 
 /*
- * __bam_bdup --
- *	Create a BTREE handle for a threaded DB handle.
+ * __bam_close --
+ *	Close a btree.
  *
- * PUBLIC: int __bam_bdup __P((DB *, DB *));
+ * PUBLIC: int __bam_close __P((DB *));
  */
 int
-__bam_bdup(orig, new)
-	DB *orig, *new;
+__bam_close(dbp)
+	DB *dbp;
 {
-	BTREE *t, *ot;
-	int ret;
-
-	ot = orig->internal;
-
-	if ((t = (BTREE *)__db_calloc(1, sizeof(*t))) == NULL)
-		return (ENOMEM);
-
-	/*
-	 * !!!
-	 * Ignore the cursor queue, only the first DB has attached cursors.
-	 */
+	__os_free(dbp->internal, sizeof(BTREE));
+	dbp->internal = NULL;
 
-	t->bt_sp = t->bt_csp = t->bt_stack;
-	t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]);
+	return (0);
+}
 
-	if ((orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) &&
-	    (ret = __bam_keyalloc(t)) != 0) {
-		FREE(t, sizeof(*t));
-		return (ret);
-	}
+/*
+ * __bam_setovflsize --
+ *
+ * PUBLIC: void __bam_setovflsize __P((DB *));
+ */
+void
+__bam_setovflsize(dbp)
+	DB *dbp;
+{
+	BTREE *t;
 
-	t->bt_maxkey = ot->bt_maxkey;
-	t->bt_minkey = ot->bt_minkey;
-	t->bt_compare = ot->bt_compare;
-	t->bt_prefix = ot->bt_prefix;
-	t->bt_ovflsize = ot->bt_ovflsize;
+	t = dbp->internal;
 
 	/*
 	 * !!!
-	 * The entire RECNO structure is shared.  If it breaks, the application
-	 * was misusing it to start with.
+	 * Correction for recno, which doesn't know anything about minimum
+	 * keys per page.
 	 */
-	t->bt_recno = ot->bt_recno;
-
-	new->internal = t;
-
-	return (0);
-}
+	if (t->bt_minkey == 0)
+		t->bt_minkey = DEFMINKEYPAGE;
 
-/*
- * __bam_keyalloc --
- *	Allocate return memory for recno keys.
- */
-static int
-__bam_keyalloc(t)
-	BTREE *t;
-{
 	/*
-	 * Recno keys are always the same size, and we don't want to have
-	 * to check for space on each return.  Allocate it now.
+	 * The btree data structure requires that at least two key/data pairs
+	 * can fit on a page, but other than that there's no fixed requirement.
+	 * Translate the minimum number of items into the bytes a key/data pair
+	 * can use before being placed on an overflow page.  We calculate for
+	 * the worst possible alignment by assuming every item requires the
+	 * maximum alignment for padding.
+	 *
+	 * Recno uses the btree bt_ovflsize value -- it's close enough.
 	 */
-	if ((t->bt_rkey.data = (void *)__db_malloc(sizeof(db_recno_t))) == NULL)
-		return (ENOMEM);
-	t->bt_rkey.ulen = sizeof(db_recno_t);
-	return (0);
+	t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX)
+	    - (BKEYDATA_PSIZE(0) + ALIGN(1, 4));
 }
 
 /*
- * __bam_setmeta --
+ * __bam_read_root --
  *	Check (and optionally create) a tree.
+ *
+ * PUBLIC: int __bam_read_root __P((DB *));
  */
-static int
-__bam_setmeta(dbp, t)
+int
+__bam_read_root(dbp)
 	DB *dbp;
-	BTREE *t;
 {
 	BTMETA *meta;
-	PAGE *root;
+	BTREE *t;
+	DBC *dbc;
 	DB_LOCK metalock, rootlock;
+	PAGE *root;
 	db_pgno_t pgno;
-	int ret;
+	int ret, t_ret;
+
+	ret = 0;
+	t = dbp->internal;
+
+	/* Get a cursor. */
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
 
 	/* Get, and optionally create the metadata page. */
 	pgno = PGNO_METADATA;
 	if ((ret =
-	    __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0)
-		return (ret);
+	    __bam_lget(dbc, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0)
+		goto err;
 	if ((ret =
-	    __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) {
-		(void)__BT_LPUT(dbp, metalock);
-		return (ret);
+	    memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0) {
+		(void)__BT_LPUT(dbc, metalock);
+		goto err;
 	}
 
 	/*
@@ -284,8 +243,8 @@ __bam_setmeta(dbp, t)
 		t->bt_minkey = meta->minkey;
 
 		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
-		(void)__BT_LPUT(dbp, metalock);
-		return (0);
+		(void)__BT_LPUT(dbc, metalock);
+		goto done;
 	}
 
 	/* Initialize the tree structure metadata information. */
@@ -308,16 +267,16 @@ __bam_setmeta(dbp, t)
 		F_SET(meta, BTM_RECNUM);
 	if (F_ISSET(dbp, DB_RE_RENUMBER))
 		F_SET(meta, BTM_RENUMBER);
-	memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN);
+	memcpy(meta->uid, dbp->fileid, DB_FILE_ID_LEN);
 
 	/* Create and initialize a root page. */
 	pgno = PGNO_ROOT;
 	if ((ret =
-	    __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0)
-		return (ret);
-	if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) {
-		(void)__BT_LPUT(dbp, rootlock);
-		return (ret);
+	    __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0)
+		goto err;
+	if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &root)) != 0) {
+		(void)__BT_LPUT(dbc, rootlock);
+		goto err;
 	}
 	P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID,
 	    PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
@@ -325,9 +284,9 @@ __bam_setmeta(dbp, t)
 
 	/* Release the metadata and root pages. */
 	if ((ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0)
-		return (ret);
+		goto err;
 	if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0)
-		return (ret);
+		goto err;
 
 	/*
 	 * Flush the metadata and root pages to disk -- since the user can't
@@ -341,8 +300,11 @@ __bam_setmeta(dbp, t)
 		ret = EINVAL;
 
 	/* Release the locks. */
-	(void)__BT_LPUT(dbp, metalock);
-	(void)__BT_LPUT(dbp, rootlock);
+	(void)__BT_LPUT(dbc, metalock);
+	(void)__BT_LPUT(dbc, rootlock);
 
+err:
+done:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
 	return (ret);
 }
diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c
index 87f2811398..6ccd68a5ab 100644
--- a/db2/btree/bt_page.c
+++ b/db2/btree/bt_page.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_page.c	10.12 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_page.c	10.17 (Sleepycat) 1/3/99";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -65,45 +65,47 @@ static const char sccsid[] = "@(#)bt_page.c	10.12 (Sleepycat) 5/6/98";
  * __bam_new --
  *	Get a new page, preferably from the freelist.
  *
- * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **));
+ * PUBLIC: int __bam_new __P((DBC *, u_int32_t, PAGE **));
  */
 int
-__bam_new(dbp, type, pagepp)
-	DB *dbp;
+__bam_new(dbc, type, pagepp)
+	DBC *dbc;
 	u_int32_t type;
 	PAGE **pagepp;
 {
 	BTMETA *meta;
+	DB *dbp;
 	DB_LOCK metalock;
 	PAGE *h;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
 	meta = NULL;
 	h = NULL;
 	metalock = LOCK_INVALID;
 
 	pgno = PGNO_METADATA;
-	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0)
 		goto err;
-	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0)
 		goto err;
 
 	if (meta->free == PGNO_INVALID) {
-		if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_NEW, &h)) != 0)
 			goto err;
 		ZERO_LSN(h->lsn);
 		h->pgno = pgno;
 	} else {
 		pgno = meta->free;
-		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 			goto err;
 		meta->free = h->next_pgno;
 	}
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
-		if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn,
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbc->txn,
 		    &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn,
 		    h->pgno, (u_int32_t)type, meta->free)) != 0)
 			goto err;
@@ -111,7 +113,7 @@ __bam_new(dbp, type, pagepp)
 	}
 
 	(void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
-	(void)__BT_TLPUT(dbp, metalock);
+	(void)__BT_TLPUT(dbc, metalock);
 
 	P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
 	*pagepp = h;
@@ -122,28 +124,45 @@ err:	if (h != NULL)
 	if (meta != NULL)
 		(void)memp_fput(dbp->mpf, meta, 0);
 	if (metalock != LOCK_INVALID)
-		(void)__BT_TLPUT(dbp, metalock);
+		(void)__BT_TLPUT(dbc, metalock);
 	return (ret);
 }
 
 /*
+ * __bam_lput --
+ *	The standard lock put call.
+ *
+ * PUBLIC: int __bam_lput __P((DBC *, DB_LOCK));
+ */
+int
+__bam_lput(dbc, lock)
+	DBC *dbc;
+	DB_LOCK lock;
+{
+	return (__BT_LPUT(dbc, lock));
+}
+
+/*
  * __bam_free --
  *	Add a page to the head of the freelist.
  *
- * PUBLIC: int __bam_free __P((DB *, PAGE *));
+ * PUBLIC: int __bam_free __P((DBC *, PAGE *));
  */
 int
-__bam_free(dbp, h)
-	DB *dbp;
+__bam_free(dbc, h)
+	DBC *dbc;
 	PAGE *h;
 {
 	BTMETA *meta;
+	DB *dbp;
 	DBT ldbt;
 	DB_LOCK metalock;
 	db_pgno_t pgno;
 	u_int32_t dirty_flag;
 	int ret, t_ret;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * Retrieve the metadata page and insert the page at the head of
 	 * the free list.  If either the lock get or page get routines
@@ -152,23 +171,23 @@ __bam_free(dbp, h)
 	 */
 	dirty_flag = 0;
 	pgno = PGNO_METADATA;
-	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0)
 		goto err;
-	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) {
-		(void)__BT_TLPUT(dbp, metalock);
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) {
+		(void)__BT_TLPUT(dbc, metalock);
 		goto err;
 	}
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		memset(&ldbt, 0, sizeof(ldbt));
 		ldbt.data = h;
 		ldbt.size = P_OVERHEAD;
 		if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info,
-		    dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno,
+		    dbc->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno,
 		    &meta->lsn, &ldbt, meta->free)) != 0) {
 			(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
-			(void)__BT_TLPUT(dbp, metalock);
+			(void)__BT_TLPUT(dbc, metalock);
 			return (ret);
 		}
 		LSN(h) = LSN(meta);
@@ -182,7 +201,7 @@ __bam_free(dbp, h)
 	{ db_pgno_t __pgno; DB_LSN __lsn;
 		__pgno = h->pgno;
 		__lsn = h->lsn;
-		memset(h, 0xff, dbp->pgsize);
+		memset(h, 0xdb, dbp->pgsize);
 		h->pgno = __pgno;
 		h->lsn = __lsn;
 	}
@@ -194,7 +213,7 @@ __bam_free(dbp, h)
 
 	/* Discard the metadata page. */
 	ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
-	if ((t_ret = __BT_TLPUT(dbp, metalock)) != 0)
+	if ((t_ret = __BT_TLPUT(dbc, metalock)) != 0)
 		ret = t_ret;
 
 	/* Discard the caller's page reference. */
@@ -212,19 +231,21 @@ err:	if ((t_ret = memp_fput(dbp->mpf, h, dirty_flag)) != 0 && ret == 0)
 #ifdef DEBUG
 /*
  * __bam_lt --
- *	Print out the list of currently held locks.
+ *	Print out the list of locks currently held by a cursor.
  *
- * PUBLIC: int __bam_lt __P((DB *));
+ * PUBLIC: int __bam_lt __P((DBC *));
  */
 int
-__bam_lt(dbp)
-	DB *dbp;
+__bam_lt(dbc)
+	DBC *dbc;
 {
+	DB *dbp;
 	DB_LOCKREQ req;
 
+	dbp = dbc->dbp;
 	if (F_ISSET(dbp, DB_AM_LOCKING)) {
 		req.op = DB_LOCK_DUMP;
-		lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &req, 1, NULL);
+		lock_vec(dbp->dbenv->lk_info, dbc->locker, 0, &req, 1, NULL);
 	}
 	return (0);
 }
@@ -234,27 +255,29 @@ __bam_lt(dbp)
  * __bam_lget --
  *	The standard lock get call.
  *
- * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
+ * PUBLIC: int __bam_lget
+ * PUBLIC:    __P((DBC *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
  */
 int
-__bam_lget(dbp, do_couple, pgno, mode, lockp)
-	DB *dbp;
+__bam_lget(dbc, do_couple, pgno, mode, lockp)
+	DBC *dbc;
 	int do_couple;
 	db_pgno_t pgno;
 	db_lockmode_t mode;
 	DB_LOCK *lockp;
 {
+	DB *dbp;
 	DB_LOCKREQ couple[2];
-	u_int32_t locker;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	if (!F_ISSET(dbp, DB_AM_LOCKING)) {
 		*lockp = LOCK_INVALID;
 		return (0);
 	}
 
-	locker = dbp->txn == NULL ? dbp->locker : dbp->txn->txnid;
-	dbp->lock.pgno = pgno;
+	dbc->lock.pgno = pgno;
 
 	/*
 	 * If the object not currently locked, acquire the lock and return,
@@ -263,54 +286,32 @@ __bam_lget(dbp, do_couple, pgno, mode, lockp)
 	 */
 	if (do_couple) {
 		couple[0].op = DB_LOCK_GET;
-		couple[0].obj = &dbp->lock_dbt;
+		couple[0].obj = &dbc->lock_dbt;
 		couple[0].mode = mode;
 		couple[1].op = DB_LOCK_PUT;
 		couple[1].lock = *lockp;
 
-		ret = lock_vec(dbp->dbenv->lk_info, locker, 0, couple, 2, NULL);
+		if (dbc->txn == NULL)
+			ret = lock_vec(dbp->dbenv->lk_info,
+			    dbc->locker, 0, couple, 2, NULL);
+		else
+			ret = lock_tvec(dbp->dbenv->lk_info,
+			    dbc->txn, 0, couple, 2, NULL);
 		if (ret != 0) {
 			/* If we fail, discard the lock we held. */
-			__bam_lput(dbp, *lockp);
+			__BT_LPUT(dbc, *lockp);
 
 			return (ret < 0 ? EAGAIN : ret);
 		}
 		*lockp = couple[0].lock;
 	} else {
-		 ret = lock_get(dbp->dbenv->lk_info,
-		     locker, 0, &dbp->lock_dbt, mode, lockp);
+		if (dbc->txn == NULL)
+			ret = lock_get(dbp->dbenv->lk_info,
+			    dbc->locker, 0, &dbc->lock_dbt, mode, lockp);
+		else
+			ret = lock_tget(dbp->dbenv->lk_info,
+			    dbc->txn, 0, &dbc->lock_dbt, mode, lockp);
 		 return (ret < 0 ? EAGAIN : ret);
 	}
 	return (0);
 }
-
-/*
- * __bam_lput --
- *	The standard lock put call.
- *
- * PUBLIC: int __bam_lput __P((DB *, DB_LOCK));
- */
-int
-__bam_lput(dbp, lock)
-	DB *dbp;
-	DB_LOCK lock;
-{
-	return (__BT_LPUT(dbp, lock));
-}
-
-/*
- * __bam_pget --
- *	The standard page get call.
- *
- * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, u_int32_t));
- */
-int
-__bam_pget(dbp, hp, pgnop, mpool_flags)
-	DB *dbp;
-	PAGE **hp;
-	db_pgno_t *pgnop;
-	u_int32_t mpool_flags;
-{
-	return (memp_fget((dbp)->mpf,
-	    pgnop, mpool_flags, hp) == 0 ? 0 : __db_pgerr(dbp, *pgnop));
-}
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
index a93faac98c..0d7a69889a 100644
--- a/db2/btree/bt_put.c
+++ b/db2/btree/bt_put.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_put.c	10.45 (Sleepycat) 5/25/98";
+static const char sccsid[] = "@(#)bt_put.c	10.54 (Sleepycat) 12/6/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -61,372 +61,23 @@ static const char sccsid[] = "@(#)bt_put.c	10.45 (Sleepycat) 5/25/98";
 #include "db_page.h"
 #include "btree.h"
 
-static int __bam_fixed __P((BTREE *, DBT *));
-static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *));
-static int __bam_lookup __P((DB *, DBT *, int *));
-static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
-static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *));
-static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_fixed __P((DBC *, DBT *));
+static int __bam_ndup __P((DBC *, PAGE *, u_int32_t));
+static int __bam_ovput __P((DBC *, PAGE *, u_int32_t, DBT *));
+static int __bam_partial __P((DBC *,
+    DBT *, PAGE *, u_int32_t, u_int32_t, u_int32_t));
 static u_int32_t __bam_partsize __P((DBT *, PAGE *, u_int32_t));
 
 /*
- * __bam_put --
- *	Add a new key/data pair or replace an existing pair (btree).
- *
- * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
- */
-int
-__bam_put(argdbp, txn, key, data, flags)
-	DB *argdbp;
-	DB_TXN *txn;
-	DBT *key, *data;
-	u_int32_t flags;
-{
-	BTREE *t;
-	CURSOR c;
-	DB *dbp;
-	PAGE *h;
-	db_indx_t indx;
-	u_int32_t iitem_flags, insert_flags;
-	int exact, isdeleted, newkey, ret, stack;
-
-	DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags);
-
-	/* Check flags. */
-	if ((ret = __db_putchk(argdbp, key, data, flags,
-	    F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0)
-		return (ret);
-
-	GETHANDLE(argdbp, txn, &dbp, ret);
-	t = dbp->internal;
-
-retry:	/*
-	 * Find the location at which to insert.  The call to __bam_lookup
-	 * leaves the returned page pinned.
-	 */
-	if ((ret = __bam_lookup(dbp, key, &exact)) != 0) {
-		PUTHANDLE(dbp);
-		return (ret);
-	}
-	h = t->bt_csp->page;
-	indx = t->bt_csp->indx;
-	stack = 1;
-
-	/*
-	 * If DB_NOOVERWRITE is set and there's an identical key in the tree,
-	 * return an error unless the data item has already been marked for
-	 * deletion, or, all the remaining data items have already been marked
-	 * for deletion in the case of duplicates.  If all the data items have
-	 * been marked for deletion, we do a replace, otherwise, it has to be
-	 * a set of duplicates, and we simply append a new one to the set.
-	 */
-	isdeleted = 0;
-	if (exact) {
-		if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0)
-			goto err;
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
-		else
-			if (flags == DB_NOOVERWRITE) {
-				ret = DB_KEYEXIST;
-				goto err;
-			}
-	}
-
-	/*
-	 * If we're inserting into the first or last page of the tree,
-	 * remember where we did it so we can do fast lookup next time.
-	 *
-	 * XXX
-	 * Does reverse order still work (did it ever!?!?)
-	 */
-	t->bt_lpgno =
-	    h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ?
-	    h->pgno : PGNO_INVALID;
-
-	/*
-	 * Select the arguments for __bam_iitem() and do the insert.  If the
-	 * key is an exact match, we're either adding a new duplicate at the
-	 * end of the duplicate set, or we're replacing the data item with a
-	 * new data item.  If the key isn't an exact match, we're inserting
-	 * a new key/data pair, before the search location.
-	 */
-	newkey = dbp->type == DB_BTREE && !exact;
-	if (exact) {
-		if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) {
-			/*
-			 * Make sure that we're not looking at a page of
-			 * duplicates -- if so, move to the last entry on
-			 * that page.
-			 */
-			c.page = h;
-			c.pgno = h->pgno;
-			c.indx = indx;
-			c.dpgno = PGNO_INVALID;
-			c.dindx = 0;
-			if ((ret =
-			    __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0)
-				goto err;
-			if (c.dpgno != PGNO_INVALID) {
-				/*
-				 * XXX
-				 * The __bam_ovfl_chk() routine memp_fput() the
-				 * current page and acquired a new one, but did
-				 * not do anything about the lock we're holding.
-				 */
-				t->bt_csp->page = h = c.page;
-				indx = c.dindx;
-			}
-			insert_flags = DB_AFTER;
-		} else
-			insert_flags = DB_CURRENT;
-	} else
-		insert_flags = DB_BEFORE;
-
-	/*
-	 * The pages we're using may be modified by __bam_iitem(), so make
-	 * sure we reset the stack.
-	 */
-	iitem_flags = 0;
-	if (newkey)
-		iitem_flags |= BI_NEWKEY;
-	if (isdeleted)
-		iitem_flags |= BI_DOINCR;
-	ret = __bam_iitem(dbp, &h, &indx, key, data, insert_flags, iitem_flags);
-	t->bt_csp->page = h;
-	t->bt_csp->indx = indx;
-
-	switch (ret) {
-	case 0:
-		/* Done.  Clean up the cursor. */
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS);
-		break;
-	case DB_NEEDSPLIT:
-		/*
-		 * We have to split the page.  Back out the cursor setup,
-		 * discard the stack of pages, and do the split.
-		 */
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
-
-		(void)__bam_stkrel(dbp);
-		stack = 0;
-
-		if ((ret = __bam_split(dbp, key)) != 0)
-			break;
-
-		goto retry;
-		/* NOTREACHED */
-	default:
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
-		break;
-	}
-
-err:	if (stack)
-		(void)__bam_stkrel(dbp);
-
-	PUTHANDLE(dbp);
-	return (ret);
-}
-
-/*
- * __bam_isdeleted --
- *	Return if the only remaining data item for the element has been
- *	deleted.
- */
-static int
-__bam_isdeleted(dbp, h, indx, isdeletedp)
-	DB *dbp;
-	PAGE *h;
-	u_int32_t indx;
-	int *isdeletedp;
-{
-	BKEYDATA *bk;
-	db_pgno_t pgno;
-	int ret;
-
-	*isdeletedp = 1;
-	for (;;) {
-		bk = GET_BKEYDATA(h, indx + O_INDX);
-		switch (B_TYPE(bk->type)) {
-		case B_KEYDATA:
-		case B_OVERFLOW:
-			if (!B_DISSET(bk->type)) {
-				*isdeletedp = 0;
-				return (0);
-			}
-			break;
-		case B_DUPLICATE:
-			/*
-			 * If the data item referencing the off-page duplicates
-			 * is flagged as deleted, we're done.  Else, we have to
-			 * walk the chain of duplicate pages.
-			 */
-			if (B_DISSET(bk->type))
-				return (0);
-			goto dupchk;
-		default:
-			return (__db_pgfmt(dbp, h->pgno));
-		}
-
-		/*
-		 * If there are no more on-page duplicate items, then every
-		 * data item for this key must have been deleted.
-		 */
-		if (indx + P_INDX >= (u_int32_t)NUM_ENT(h))
-			return (0);
-		if (h->inp[indx] != h->inp[indx + P_INDX])
-			return (0);
-
-		/* Check the next item. */
-		indx += P_INDX;
-	}
-	/* NOTREACHED */
-
-dupchk:	/* Check a chain of duplicate pages. */
-	pgno = ((BOVERFLOW *)bk)->pgno;
-	for (;;) {
-		/* Acquire the next page in the duplicate chain. */
-		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
-			return (ret);
-
-		/* Check each item for a delete flag. */
-		for (indx = 0; indx < NUM_ENT(h); ++indx)
-			if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) {
-				*isdeletedp = 0;
-				goto done;
-			}
-		/*
-		 * If we reach the end of the duplicate pages, then every
-		 * item we reviewed must have been deleted.
-		 */
-		if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
-			goto done;
-
-		(void)memp_fput(dbp->mpf, h, 0);
-	}
-	/* NOTREACHED */
-
-done:	(void)memp_fput(dbp->mpf, h, 0);
-	return (0);
-}
-
-/*
- * __bam_lookup --
- *	Find the right location in the tree for the key.
- */
-static int
-__bam_lookup(dbp, key, exactp)
-	DB *dbp;
-	DBT *key;
-	int *exactp;
-{
-	BTREE *t;
-	DB_LOCK lock;
-	EPG e;
-	PAGE *h;
-	db_indx_t indx;
-	int cmp, ret;
-
-	t = dbp->internal;
-	h = NULL;
-
-	/*
-	 * Record numbers can't be fast-tracked, we have to lock the entire
-	 * tree.
-	 */
-	if (F_ISSET(dbp, DB_BT_RECNUM))
-		goto slow;
-
-	/* Check to see if we've been seeing sorted input. */
-	if (t->bt_lpgno == PGNO_INVALID)
-		goto slow;
-
-	/*
-	 * Retrieve the page on which we did the last insert.  It's okay if
-	 * it doesn't exist, or if it's not the page type we expect, it just
-	 * means that the world changed.
-	 */
-	if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
-		goto miss;
-	if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) {
-		(void)__BT_LPUT(dbp, lock);
-		goto miss;
-	}
-	if (TYPE(h) != P_LBTREE)
-		goto miss;
-	if (NUM_ENT(h) == 0)
-		goto miss;
-
-	/*
-	 * We have to be at the end or beginning of the tree to know that
-	 * we're inserting in a sort order.  If that's the case and we're
-	 * in the right order in comparison to the first/last key/data pair,
-	 * we have the right position.
-	 */
-	if (h->next_pgno == PGNO_INVALID) {
-		e.page = h;
-		e.indx = NUM_ENT(h) - P_INDX;
-		if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) {
-			if (cmp > 0)
-				e.indx += P_INDX;
-			goto fast;
-		}
-	}
-	if (h->prev_pgno == PGNO_INVALID) {
-		e.page = h;
-		e.indx = 0;
-		if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) {
-			/*
-			 * We're doing a put, so we want to insert as the last
-			 * of any set of duplicates.
-			 */
-			if (cmp == 0) {
-				for (indx = 0;
-				    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
-				    h->inp[indx] == h->inp[indx + P_INDX];
-				    indx += P_INDX)
-					;
-				e.indx = indx;
-			}
-			goto fast;
-		}
-	}
-	goto miss;
-
-	/* Set the exact match flag in case we've already inserted this key. */
-fast:	*exactp = cmp == 0;
-
-	/* Enter the entry in the stack. */
-	BT_STK_CLR(t);
-	BT_STK_ENTER(t, e.page, e.indx, lock, ret);
-	if (ret != 0)
-		return (ret);
-
-	++t->lstat.bt_cache_hit;
-	return (0);
-
-miss:	++t->lstat.bt_cache_miss;
-	if (h != NULL) {
-		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_LPUT(dbp, lock);
-	}
-
-slow:	return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp));
-}
-
-/*
  * __bam_iitem --
  *	Insert an item into the tree.
  *
- * PUBLIC: int __bam_iitem __P((DB *,
+ * PUBLIC: int __bam_iitem __P((DBC *,
  * PUBLIC:    PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t));
  */
 int
-__bam_iitem(dbp, hp, indxp, key, data, op, flags)
-	DB *dbp;
+__bam_iitem(dbc, hp, indxp, key, data, op, flags)
+	DBC *dbc;
 	PAGE **hp;
 	db_indx_t *indxp;
 	DBT *key, *data;
@@ -434,6 +85,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 {
 	BTREE *t;
 	BKEYDATA *bk;
+	DB *dbp;
 	DBT tdbt;
 	PAGE *h;
 	db_indx_t indx, nbytes;
@@ -442,6 +94,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 
 	COMPQUIET(bk, NULL);
 
+	dbp = dbc->dbp;
 	t = dbp->internal;
 	h = *hp;
 	indx = *indxp;
@@ -473,21 +126,21 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 			default:
 				return (__db_pgfmt(dbp, h->pgno));
 			}
-			if ((ret = __db_ditem(dbp, *hp, *indxp, nbytes)) != 0)
+			if ((ret = __db_ditem(dbc, *hp, *indxp, nbytes)) != 0)
 				return (ret);
 		}
 
 		/* Put the new/replacement item onto the page. */
-		if ((ret = __db_dput(dbp, data, hp, indxp, __bam_new)) != 0)
+		if ((ret = __db_dput(dbc, data, hp, indxp, __bam_new)) != 0)
 			return (ret);
 
 		goto done;
 	}
 
 	/* Handle fixed-length records: build the real record. */
-	if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) {
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->recno->re_len) {
 		tdbt = *data;
-		if ((ret = __bam_fixed(t, &tdbt)) != 0)
+		if ((ret = __bam_fixed(dbc, &tdbt)) != 0)
 			return (ret);
 		data = &tdbt;
 	}
@@ -554,7 +207,8 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 	/* Handle partial puts: build the real record. */
 	if (F_ISSET(data, DB_DBT_PARTIAL)) {
 		tdbt = *data;
-		if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0)
+		if ((ret = __bam_partial(dbc,
+		    &tdbt, h, indx, data_size, flags)) != 0)
 			return (ret);
 		data = &tdbt;
 	}
@@ -583,10 +237,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 
 		/* Add the key. */
 		if (bigkey) {
-			if ((ret = __bam_ovput(dbp, h, indx, key)) != 0)
+			if ((ret = __bam_ovput(dbc, h, indx, key)) != 0)
 				return (ret);
 		} else
-			if ((ret = __db_pitem(dbp, h, indx,
+			if ((ret = __db_pitem(dbc, h, indx,
 			    BKEYDATA_SIZE(key->size), NULL, key)) != 0)
 				return (ret);
 		++indx;
@@ -598,7 +252,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 				 * Adjust the cursor and copy in the key for
 				 * the duplicate.
 				 */
-				if ((ret = __bam_adjindx(dbp,
+				if ((ret = __bam_adjindx(dbc,
 				    h, indx + P_INDX, indx, 1)) != 0)
 					return (ret);
 
@@ -620,7 +274,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 				 * the duplicate.
 				 */
 				if ((ret =
-				    __bam_adjindx(dbp, h, indx, indx, 1)) != 0)
+				    __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
 					return (ret);
 
 				++indx;
@@ -639,7 +293,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 			 * delete and then re-add the item.
 			 */
 			if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
-				if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+				if ((ret = __bam_ditem(dbc, h, indx)) != 0)
 					return (ret);
 				break;
 			}
@@ -654,7 +308,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 
 	/* Add the data. */
 	if (bigdata) {
-		if ((ret = __bam_ovput(dbp, h, indx, data)) != 0)
+		if ((ret = __bam_ovput(dbc, h, indx, data)) != 0)
 			return (ret);
 	} else {
 		BKEYDATA __bk;
@@ -665,12 +319,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 			__bk.len = data->size;
 			__hdr.data = &__bk;
 			__hdr.size = SSZA(BKEYDATA, data);
-			ret = __db_pitem(dbp, h, indx,
+			ret = __db_pitem(dbc, h, indx,
 			    BKEYDATA_SIZE(data->size), &__hdr, data);
 		} else if (replace)
-			ret = __bam_ritem(dbp, h, indx, data);
+			ret = __bam_ritem(dbc, h, indx, data);
 		else
-			ret = __db_pitem(dbp, h, indx,
+			ret = __db_pitem(dbc, h, indx,
 			    BKEYDATA_SIZE(data->size), NULL, data);
 		if (ret != 0)
 			return (ret);
@@ -686,7 +340,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 	 */
 	if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
 		--indx;
-		if ((ret = __bam_ndup(dbp, h, indx)) != 0)
+		if ((ret = __bam_ndup(dbc, h, indx)) != 0)
 			return (ret);
 	}
 
@@ -700,14 +354,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 done:	if (LF_ISSET(BI_DOINCR) ||
 	    (op != DB_CURRENT &&
 	    (F_ISSET(dbp, DB_BT_RECNUM) || dbp->type == DB_RECNO)))
-		if ((ret = __bam_adjust(dbp, t, 1)) != 0)
+		if ((ret = __bam_adjust(dbc, 1)) != 0)
 			return (ret);
 
 	/* If we've modified a recno file, set the flag */
-	if (t->bt_recno != NULL)
-		F_SET(t->bt_recno, RECNO_MODIFIED);
-
-	++t->lstat.bt_added;
+	if (t->recno != NULL)
+		F_SET(t->recno, RECNO_MODIFIED);
 
 	return (ret);
 }
@@ -770,7 +422,7 @@ __bam_partsize(data, h, indx)
 	memset(&__hdr, 0, sizeof(__hdr));				\
 	__hdr.data = &bo;						\
 	__hdr.size = BOVERFLOW_SIZE;					\
-	if ((ret = __db_pitem(dbp,					\
+	if ((ret = __db_pitem(dbc,					\
 	    h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0)		\
 		return (ret);						\
 } while (0)
@@ -780,8 +432,8 @@ __bam_partsize(data, h, indx)
  *	Build an overflow item and put it on the page.
  */
 static int
-__bam_ovput(dbp, h, indx, item)
-	DB *dbp;
+__bam_ovput(dbc, h, indx, item)
+	DBC *dbc;
 	PAGE *h;
 	u_int32_t indx;
 	DBT *item;
@@ -789,10 +441,12 @@ __bam_ovput(dbp, h, indx, item)
 	BOVERFLOW bo;
 	int ret;
 
+	UMRW(bo.unused1);
 	B_TSET(bo.type, B_OVERFLOW, 0);
-	bo.tlen = item->size;
-	if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0)
+	UMRW(bo.unused2);
+	if ((ret = __db_poff(dbc, item, &bo.pgno, __bam_new)) != 0)
 		return (ret);
+	bo.tlen = item->size;
 
 	OVPUT(h, indx, bo);
 
@@ -803,22 +457,25 @@ __bam_ovput(dbp, h, indx, item)
  * __bam_ritem --
  *	Replace an item on a page.
  *
- * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *));
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
  */
 int
-__bam_ritem(dbp, h, indx, data)
-	DB *dbp;
+__bam_ritem(dbc, h, indx, data)
+	DBC *dbc;
 	PAGE *h;
 	u_int32_t indx;
 	DBT *data;
 {
 	BKEYDATA *bk;
+	DB *dbp;
 	DBT orig, repl;
 	db_indx_t cnt, lo, ln, min, off, prefix, suffix;
 	int32_t nbytes;
 	int ret;
 	u_int8_t *p, *t;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * Replace a single item onto a page.  The logic figuring out where
 	 * to insert and whether it fits is handled in the caller.  All we do
@@ -827,7 +484,7 @@ __bam_ritem(dbp, h, indx, data)
 	bk = GET_BKEYDATA(h, indx);
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		/*
 		 * We might as well check to see if the two data items share
 		 * a common prefix and suffix -- it can save us a lot of log
@@ -851,7 +508,7 @@ __bam_ritem(dbp, h, indx, data)
 		orig.size = bk->len - (prefix + suffix);
 		repl.data = (u_int8_t *)data->data + prefix;
 		repl.size = data->size - (prefix + suffix);
-		if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn,
+		if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
 		    (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
 		    &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
@@ -907,18 +564,21 @@ __bam_ritem(dbp, h, indx, data)
  *	If it should, create it.
  */
 static int
-__bam_ndup(dbp, h, indx)
-	DB *dbp;
+__bam_ndup(dbc, h, indx)
+	DBC *dbc;
 	PAGE *h;
 	u_int32_t indx;
 {
 	BKEYDATA *bk;
 	BOVERFLOW bo;
+	DB *dbp;
 	DBT hdr;
 	PAGE *cp;
 	db_indx_t cnt, cpindx, first, sz;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
 		indx -= P_INDX;
 	for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
@@ -941,7 +601,7 @@ __bam_ndup(dbp, h, indx)
 		return (0);
 
 	/* Get a new page. */
-	if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0)
+	if ((ret = __bam_new(dbc, P_DUPLICATE, &cp)) != 0)
 		return (ret);
 
 	/*
@@ -957,7 +617,7 @@ __bam_ndup(dbp, h, indx)
 		hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
 		    BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
 		if ((ret =
-		    __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0)
+		    __db_pitem(dbc, cp, cpindx, hdr.size, &hdr, NULL)) != 0)
 			goto err;
 
 		/*
@@ -970,18 +630,20 @@ __bam_ndup(dbp, h, indx)
 		    PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx);
 
 		/* Delete the data item. */
-		if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0)
+		if ((ret = __db_ditem(dbc, h, indx, hdr.size)) != 0)
 			goto err;
 
 		/* Delete all but the first reference to the key. */
 		if (--cnt == 0)
 			break;
-		if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0)
+		if ((ret = __bam_adjindx(dbc, h, indx, first, 0)) != 0)
 			goto err;
 	}
 
 	/* Put in a new data item that points to the duplicates page. */
+	UMRW(bo.unused1);
 	B_TSET(bo.type, B_DUPLICATE, 0);
+	UMRW(bo.unused2);
 	bo.pgno = cp->pgno;
 	bo.tlen = 0;
 
@@ -989,7 +651,7 @@ __bam_ndup(dbp, h, indx)
 
 	return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY));
 
-err:	(void)__bam_free(dbp, cp);
+err:	(void)__bam_free(dbc, cp);
 	return (ret);
 }
 
@@ -998,13 +660,16 @@ err:	(void)__bam_free(dbp, cp);
  *	Build the real record for a fixed length put.
  */
 static int
-__bam_fixed(t, dbt)
-	BTREE *t;
+__bam_fixed(dbc, dbt)
+	DBC *dbc;
 	DBT *dbt;
 {
+	DB *dbp;
 	RECNO *rp;
+	int ret;
 
-	rp = t->bt_recno;
+	dbp = dbc->dbp;
+	rp = ((BTREE *)dbp->internal)->recno;
 
 	/*
 	 * If database contains fixed-length records, and the record is long,
@@ -1018,29 +683,27 @@ __bam_fixed(t, dbt)
 	 * short.  Pad it out.  We use the record data return memory, it's
 	 * only a short-term use.
 	 */
-	if (t->bt_rdata.ulen < rp->re_len) {
-		t->bt_rdata.data = t->bt_rdata.data == NULL ?
-		    (void *)__db_malloc(rp->re_len) :
-		    (void *)__db_realloc(t->bt_rdata.data, rp->re_len);
-		if (t->bt_rdata.data == NULL) {
-			t->bt_rdata.ulen = 0;
-			return (ENOMEM);
+	if (dbc->rdata.ulen < rp->re_len) {
+		 if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
+			dbc->rdata.ulen = 0;
+			dbc->rdata.data = NULL;
+			return (ret);
 		}
-		t->bt_rdata.ulen = rp->re_len;
+		dbc->rdata.ulen = rp->re_len;
 	}
-	memcpy(t->bt_rdata.data, dbt->data, dbt->size);
-	memset((u_int8_t *)t->bt_rdata.data + dbt->size,
+	memcpy(dbc->rdata.data, dbt->data, dbt->size);
+	memset((u_int8_t *)dbc->rdata.data + dbt->size,
 	    rp->re_pad, rp->re_len - dbt->size);
 
 	/*
 	 * Clean up our flags and other information just in case, and
 	 * change the caller's DBT to reference our created record.
 	 */
-	t->bt_rdata.size = rp->re_len;
-	t->bt_rdata.dlen = 0;
-	t->bt_rdata.doff = 0;
-	t->bt_rdata.flags = 0;
-	*dbt = t->bt_rdata;
+	dbc->rdata.size = rp->re_len;
+	dbc->rdata.dlen = 0;
+	dbc->rdata.doff = 0;
+	dbc->rdata.flags = 0;
+	*dbt = dbc->rdata;
 
 	return (0);
 }
@@ -1050,15 +713,15 @@ __bam_fixed(t, dbt)
  *	Build the real record for a partial put.
  */
 static int
-__bam_partial(dbp, dbt, h, indx, nbytes)
-	DB *dbp;
+__bam_partial(dbc, dbt, h, indx, nbytes, flags)
+	DBC *dbc;
 	DBT *dbt;
 	PAGE *h;
-	u_int32_t indx, nbytes;
+	u_int32_t indx, nbytes, flags;
 {
-	BTREE *t;
 	BKEYDATA *bk, tbk;
 	BOVERFLOW *bo;
+	DB *dbp;
 	DBT copy;
 	u_int32_t len, tlen;
 	u_int8_t *p;
@@ -1066,18 +729,34 @@ __bam_partial(dbp, dbt, h, indx, nbytes)
 
 	COMPQUIET(bo, NULL);
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
 
 	/* We use the record data return memory, it's only a short-term use. */
-	if (t->bt_rdata.ulen < nbytes) {
-		t->bt_rdata.data = t->bt_rdata.data == NULL ?
-		    (void *)__db_malloc(nbytes) :
-		    (void *)__db_realloc(t->bt_rdata.data, nbytes);
-		if (t->bt_rdata.data == NULL) {
-			t->bt_rdata.ulen = 0;
-			return (ENOMEM);
+	if (dbc->rdata.ulen < nbytes) {
+		 if ((ret = __os_realloc(&dbc->rdata.data, nbytes)) != 0) {
+			dbc->rdata.ulen = 0;
+			dbc->rdata.data = NULL;
+			return (ret);
 		}
-		t->bt_rdata.ulen = nbytes;
+		dbc->rdata.ulen = nbytes;
+	}
+
+	/*
+	 * We use nul bytes for any part of the record that isn't specified;
+	 * get it over with.
+	 */
+	memset(dbc->rdata.data, 0, nbytes);
+
+	/*
+	 * In the next clauses, we need to do three things: a) set p to point
+	 * to the place at which to copy the user's data, b) set tlen to the
+	 * total length of the record, not including the bytes contributed by
+	 * the user, and c) copy any valid data from an existing record.
+	 */
+	if (LF_ISSET(BI_NEWKEY)) {
+		tlen = dbt->doff;
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+		goto ucopy;
 	}
 
 	/* Find the current record. */
@@ -1089,13 +768,6 @@ __bam_partial(dbp, dbt, h, indx, nbytes)
 		B_TSET(bk->type, B_KEYDATA, 0);
 		bk->len = 0;
 	}
-
-	/*
-	 * We use nul bytes for any part of the record that isn't specified,
-	 * get it over with.
-	 */
-	memset(t->bt_rdata.data, 0, nbytes);
-
 	if (B_TYPE(bk->type) == B_OVERFLOW) {
 		/*
 		 * In the case of an overflow record, we shift things around
@@ -1103,12 +775,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes)
 		 */
 		memset(&copy, 0, sizeof(copy));
 		if ((ret = __db_goff(dbp, &copy, bo->tlen,
-		    bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0)
+		    bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
 			return (ret);
 
 		/* Skip any leading data from the original record. */
 		tlen = dbt->doff;
-		p = (u_int8_t *)t->bt_rdata.data + dbt->doff;
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
 
 		/*
 		 * Copy in any trailing data from the original record.
@@ -1127,20 +799,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes)
 				memmove(p + dbt->size, p + dbt->dlen, len);
 			tlen += len;
 		}
-
-		/* Copy in the application provided data. */
-		memcpy(p, dbt->data, dbt->size);
-		tlen += dbt->size;
 	} else {
 		/* Copy in any leading data from the original record. */
-		memcpy(t->bt_rdata.data,
+		memcpy(dbc->rdata.data,
 		    bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
 		tlen = dbt->doff;
-		p = (u_int8_t *)t->bt_rdata.data + dbt->doff;
-
-		/* Copy in the application provided data. */
-		memcpy(p, dbt->data, dbt->size);
-		tlen += dbt->size;
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
 
 		/* Copy in any trailing data from the original record. */
 		len = dbt->doff + dbt->dlen;
@@ -1150,11 +814,18 @@ __bam_partial(dbp, dbt, h, indx, nbytes)
 		}
 	}
 
+ucopy:	/*
+	 * Copy in the application provided data -- p and tlen must have been
+	 * initialized above.
+	 */
+	memcpy(p, dbt->data, dbt->size);
+	tlen += dbt->size;
+
 	/* Set the DBT to reference our new record. */
-	t->bt_rdata.size = tlen;
-	t->bt_rdata.dlen = 0;
-	t->bt_rdata.doff = 0;
-	t->bt_rdata.flags = 0;
-	*dbt = t->bt_rdata;
+	dbc->rdata.size = tlen;
+	dbc->rdata.dlen = 0;
+	dbc->rdata.doff = 0;
+	dbc->rdata.flags = 0;
+	*dbt = dbc->rdata;
 	return (0);
 }
diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c
index fe33825ec4..de6b3b7d0e 100644
--- a/db2/btree/bt_rec.c
+++ b/db2/btree/bt_rec.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_rec.c	10.21 (Sleepycat) 4/28/98";
+static const char sccsid[] = "@(#)bt_rec.c	10.28 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -45,7 +45,8 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info)
 	BTMETA *meta;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	db_pgno_t pgno;
 	int cmp_n, cmp_p, modified, ret;
 
@@ -101,7 +102,6 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info)
 		modified = 1;
 	}
 	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
 		(void)memp_fput(mpf, meta, 0);
 		goto out;
 	}
@@ -121,12 +121,10 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info)
 		meta->free = argp->pgno;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
 	ret = 0;
 
 out:	REC_CLOSE;
@@ -149,7 +147,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info)
 {
 	__bam_pg_free_args *argp;
 	BTMETA *meta;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	db_pgno_t pgno;
@@ -192,10 +191,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info)
 
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 	/*
 	 * Fix up the metadata page.  If we're redoing or undoing the operation
@@ -224,10 +221,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info)
 		meta->lsn = argp->meta_lsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 done:	*lsnp = argp->prev_lsn;
 	ret = 0;
@@ -251,7 +246,8 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__bam_split_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
 	db_pgno_t pgno;
@@ -310,12 +306,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 			goto done;
 
 		/* Allocate and initialize new left/right child pages. */
-		if ((_lp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL ||
-		    (_rp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL) {
-			ret = ENOMEM;
-			__db_err(file_dbp->dbenv, "%s", strerror(ret));
+		if ((ret = __os_malloc(file_dbp->pgsize, NULL, &_lp)) != 0 ||
+		    (ret = __os_malloc(file_dbp->pgsize, NULL, &_rp)) != 0)
 			goto out;
-		}
 		if (rootsplit) {
 			P_INIT(_lp, file_dbp->pgsize, argp->left,
 			    PGNO_INVALID,
@@ -352,7 +345,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 			memcpy(lp, _lp, file_dbp->pgsize);
 			lp->lsn = *lsnp;
 			if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
-				goto fatal;
+				goto out;
 			lp = NULL;
 		}
 
@@ -367,7 +360,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 			memcpy(rp, _rp, file_dbp->pgsize);
 			rp->lsn = *lsnp;
 			if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
-				goto fatal;
+				goto out;
 			rp = NULL;
 		}
 
@@ -392,7 +385,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 			    __bam_total(_lp) + __bam_total(_rp) : 0);
 			pp->lsn = *lsnp;
 			if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
-				goto fatal;
+				goto out;
 			pp = NULL;
 		}
 
@@ -412,9 +405,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 			if (log_compare(&LSN(np), &argp->nlsn) == 0) {
 				PREV_PGNO(np) = argp->right;
 				np->lsn = *lsnp;
-				if ((ret = memp_fput(mpf,
-				    np, DB_MPOOL_DIRTY)) != 0)
-					goto fatal;
+				if ((ret =
+				    memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0)
+					goto out;
 				np = NULL;
 			}
 		}
@@ -433,7 +426,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info)
 		if (log_compare(lsnp, &LSN(pp)) == 0) {
 			memcpy(pp, argp->pg.data, argp->pg.size);
 			if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
-				goto fatal;
+				goto out;
 			pp = NULL;
 		}
 
@@ -451,7 +444,7 @@ lrundo:		if ((rootsplit && lp != NULL) || rp != NULL) {
 				lp->lsn = argp->llsn;
 				if ((ret =
 				    memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
-					goto fatal;
+					goto out;
 				lp = NULL;
 			}
 			if (rp != NULL &&
@@ -459,7 +452,7 @@ lrundo:		if ((rootsplit && lp != NULL) || rp != NULL) {
 				rp->lsn = argp->rlsn;
 				if ((ret =
 				    memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
-					goto fatal;
+					goto out;
 				rp = NULL;
 			}
 		}
@@ -481,7 +474,7 @@ lrundo:		if ((rootsplit && lp != NULL) || rp != NULL) {
 				PREV_PGNO(np) = argp->left;
 				np->lsn = argp->nlsn;
 				if (memp_fput(mpf, np, DB_MPOOL_DIRTY))
-					goto fatal;
+					goto out;
 				np = NULL;
 			}
 		}
@@ -490,9 +483,6 @@ lrundo:		if ((rootsplit && lp != NULL) || rp != NULL) {
 done:	*lsnp = argp->prev_lsn;
 	ret = 0;
 
-	if (0) {
-fatal:		(void)__db_panic(file_dbp);
-	}
 out:	/* Free any pages that weren't dirtied. */
 	if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0)
 		ret = t_ret;
@@ -505,9 +495,9 @@ out:	/* Free any pages that weren't dirtied. */
 
 	/* Free any allocated space. */
 	if (_lp != NULL)
-		__db_free(_lp);
+		__os_free(_lp, file_dbp->pgsize);
 	if (_rp != NULL)
-		__db_free(_rp);
+		__os_free(_rp, file_dbp->pgsize);
 
 	REC_CLOSE;
 }
@@ -528,7 +518,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__bam_rsplit_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	db_pgno_t pgno;
@@ -558,16 +549,14 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info)
 		P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT,
 		    argp->nrec, PGNO_INVALID, pagep->level + 1,
 		    file_dbp->type == DB_BTREE ? P_IBTREE : P_IRECNO);
-		if ((ret = __db_pitem(file_dbp, pagep, 0,
+		if ((ret = __db_pitem(dbc, pagep, 0,
 		    argp->rootent.size, &argp->rootent, NULL)) != 0)
 			goto out;
 		pagep->lsn = argp->rootlsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 	/*
 	 * Fix the page copied over the root page.  It's possible that the
@@ -592,10 +581,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info)
 		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 done:	*lsnp = argp->prev_lsn;
 	ret = 0;
@@ -619,7 +606,8 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__bam_adj_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int cmp_n, cmp_p, modified, ret;
@@ -640,7 +628,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info)
 	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
 	if (cmp_p == 0 && redo) {
 		/* Need to redo update described. */
-		if ((ret = __bam_adjindx(file_dbp,
+		if ((ret = __bam_adjindx(dbc,
 		    pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
 			goto err;
 
@@ -648,7 +636,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info)
 		modified = 1;
 	} else if (cmp_n == 0 && !redo) {
 		/* Need to undo update described. */
-		if ((ret = __bam_adjindx(file_dbp,
+		if ((ret = __bam_adjindx(dbc,
 		    pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
 			goto err;
 
@@ -684,7 +672,8 @@ __bam_cadjust_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__bam_cadjust_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int cmp_n, cmp_p, modified, ret;
@@ -760,7 +749,8 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__bam_cdel_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int cmp_n, cmp_p, modified, ret;
@@ -781,13 +771,19 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info)
 	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
 	if (cmp_p == 0 && redo) {
 		/* Need to redo update described. */
-		B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type);
+		if (pagep->type == P_DUPLICATE)
+			B_DSET(GET_BKEYDATA(pagep, argp->indx)->type);
+		else
+			B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type);
 
 		LSN(pagep) = *lsnp;
 		modified = 1;
 	} else if (cmp_n == 0 && !redo) {
 		/* Need to undo update described. */
-		B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type);
+		if (pagep->type == P_DUPLICATE)
+			B_DCLR(GET_BKEYDATA(pagep, argp->indx)->type);
+		else
+			B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type);
 
 		LSN(pagep) = argp->lsn;
 		modified = 1;
@@ -818,7 +814,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info)
 {
 	__bam_repl_args *argp;
 	BKEYDATA *bk;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DBT dbt;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
@@ -848,10 +845,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info)
 		 */
 		memset(&dbt, 0, sizeof(dbt));
 		dbt.size = argp->prefix + argp->suffix + argp->repl.size;
-		if ((dbt.data = __db_malloc(dbt.size)) == NULL) {
-			ret = ENOMEM;
+		if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0)
 			goto err;
-		}
 		p = dbt.data;
 		memcpy(p, bk->data, argp->prefix);
 		p += argp->prefix;
@@ -859,8 +854,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info)
 		p += argp->repl.size;
 		memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
 
-		ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt);
-		__db_free(dbt.data);
+		ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
+		__os_free(dbt.data, dbt.size);
 		if (ret != 0)
 			goto err;
 
@@ -874,10 +869,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info)
 		 */
 		memset(&dbt, 0, sizeof(dbt));
 		dbt.size = argp->prefix + argp->suffix + argp->orig.size;
-		if ((dbt.data = __db_malloc(dbt.size)) == NULL) {
-			ret = ENOMEM;
+		if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0)
 			goto err;
-		}
 		p = dbt.data;
 		memcpy(p, bk->data, argp->prefix);
 		p += argp->prefix;
@@ -885,8 +878,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info)
 		p += argp->orig.size;
 		memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
 
-		ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt);
-		__db_free(dbt.data);
+		ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
+		__os_free(dbt.data, dbt.size);
 		if (ret != 0)
 			goto err;
 
diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c
index 38dbbd1c55..c69877ff7f 100644
--- a/db2/btree/bt_recno.c
+++ b/db2/btree/bt_recno.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_recno.c	10.37 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)bt_recno.c	10.53 (Sleepycat) 12/11/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -22,64 +22,89 @@ static const char sccsid[] = "@(#)bt_recno.c	10.37 (Sleepycat) 5/23/98";
 #include "db_int.h"
 #include "db_page.h"
 #include "btree.h"
-
-static int __ram_add __P((DB *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
-static int __ram_c_close __P((DBC *));
-static int __ram_c_del __P((DBC *, u_int32_t));
-static int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
-static int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
-static int __ram_fmap __P((DB *, db_recno_t));
-static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-static int __ram_iget __P((DB *, DBT *, DBT *));
+#include "db_ext.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+static int __ram_fmap __P((DBC *, db_recno_t));
+static int __ram_i_delete __P((DBC *));
 static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
 static int __ram_source __P((DB *, RECNO *, const char *));
 static int __ram_sync __P((DB *, u_int32_t));
-static int __ram_update __P((DB *, db_recno_t, int));
-static int __ram_vmap __P((DB *, db_recno_t));
-static int __ram_writeback __P((DB *));
+static int __ram_update __P((DBC *, db_recno_t, int));
+static int __ram_vmap __P((DBC *, db_recno_t));
+static int __ram_writeback __P((DBC *));
 
 /*
- * If we're renumbering records, then we have to detect in the cursor that a
- * record was deleted, and adjust the cursor as necessary.  If not renumbering
- * records, then we can detect this by looking at the actual record, so we
- * ignore the cursor delete flag.
+ * In recno, there are two meanings to the on-page "deleted" flag.  If we're
+ * re-numbering records, it means the record was implicitly created.  We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag.  This is different from the B+tree code.
+ * It also maintains whether the cursor references a deleted record in the
+ * cursor, and it doesn't always check the on-page value.
  */
 #define	CD_SET(dbp, cp) {						\
 	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
-		F_SET(cp, CR_DELETED);					\
+		F_SET(cp, C_DELETED);					\
 }
 #define	CD_CLR(dbp, cp) {						\
 	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
-		F_CLR(cp, CR_DELETED);					\
+		F_CLR(cp, C_DELETED);					\
 }
 #define	CD_ISSET(dbp, cp)						\
-	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED))
+	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED))
 
 /*
  * __ram_open --
  *	Recno open function.
  *
- * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *));
+ * PUBLIC: int __ram_open __P((DB *, DB_INFO *));
  */
 int
-__ram_open(dbp, type, dbinfo)
+__ram_open(dbp, dbinfo)
 	DB *dbp;
-	DBTYPE type;
 	DB_INFO *dbinfo;
 {
 	BTREE *t;
+	DBC *dbc;
 	RECNO *rp;
-	int ret;
-
-	COMPQUIET(type, DB_RECNO);
+	int ret, t_ret;
 
-	ret = 0;
+	/* Allocate and initialize the private btree structure. */
+	if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0)
+		return (ret);
+	dbp->internal = t;
+	__bam_setovflsize(dbp);
 
-	/* Allocate and initialize the private RECNO structure. */
-	if ((rp = (RECNO *)__db_calloc(1, sizeof(*rp))) == NULL)
-		return (ENOMEM);
+	/* Allocate and initialize the private recno structure. */
+	if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0)
+		return (ret);
+	/* Link in the private recno structure. */
+	t->recno = rp;
 
-	if (dbinfo != NULL) {
+	/*
+	 * Intention is to make sure all of the user's selections are okay
+	 * here and then use them without checking.
+	 */
+	if (dbinfo == NULL) {
+		rp->re_delim = '\n';
+		rp->re_pad = ' ';
+		rp->re_fd = -1;
+		F_SET(rp, RECNO_EOF);
+	} else {
 		/*
 		 * If the user specified a source tree, open it and map it in.
 		 *
@@ -111,31 +136,40 @@ __ram_open(dbp, type, dbinfo)
 			}
 		} else
 			rp->re_len = 0;
-	} else {
-		rp->re_delim = '\n';
-		rp->re_pad = ' ';
-		rp->re_fd = -1;
-		F_SET(rp, RECNO_EOF);
 	}
 
-	/* Open the underlying btree. */
-	if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0)
-		goto err;
-
-	/* Set the routines necessary to make it look like a recno tree. */
-	dbp->cursor = __ram_cursor;
+	/* Initialize the remaining fields/methods of the DB. */
+	dbp->am_close = __ram_close;
 	dbp->del = __ram_delete;
-	dbp->get = __ram_get;
 	dbp->put = __ram_put;
+	dbp->stat = __bam_stat;
 	dbp->sync = __ram_sync;
 
-	/* Link in the private recno structure. */
-	((BTREE *)dbp->internal)->bt_recno = rp;
+	/* Start up the tree. */
+	if ((ret = __bam_read_root(dbp)) != 0)
+		goto err;
+
+	/* Set the overflow page size. */
+	__bam_setovflsize(dbp);
 
 	/* If we're snapshotting an underlying source file, do it now. */
-	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT))
-		if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND)
+	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) {
+		/* Allocate a cursor. */
+		if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+			goto err;
+
+		/* Do the snapshot. */
+		if ((ret = __ram_update(dbc,
+		    DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
+			ret = 0;
+
+		/* Discard the cursor. */
+		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+		if (ret != 0)
 			goto err;
+	}
 
 	return (0);
 
@@ -145,143 +179,169 @@ err:	/* If we mmap'd a source file, discard it. */
 
 	/* If we opened a source file, discard it. */
 	if (rp->re_fd != -1)
-		(void)__db_close(rp->re_fd);
+		(void)__os_close(rp->re_fd);
 	if (rp->re_source != NULL)
-		FREES(rp->re_source);
-
-	/* If we allocated room for key/data return, discard it. */
-	t = dbp->internal;
-	if (t != NULL && t->bt_rkey.data != NULL)
-		__db_free(t->bt_rkey.data);
+		__os_freestr(rp->re_source);
 
-	FREE(rp, sizeof(*rp));
+	__os_free(rp, sizeof(*rp));
 
 	return (ret);
 }
 
 /*
- * __ram_cursor --
- *	Recno db->cursor function.
- *
- * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **));
+ * __ram_delete --
+ *	Recno db->del function.
  */
-int
-__ram_cursor(dbp, txn, dbcp)
+static int
+__ram_delete(dbp, txn, key, flags)
 	DB *dbp;
 	DB_TXN *txn;
-	DBC **dbcp;
+	DBT *key;
+	u_int32_t flags;
 {
-	RCURSOR *cp;
+	CURSOR *cp;
 	DBC *dbc;
+	db_recno_t recno;
+	int ret, t_ret;
 
-	DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0);
-
-	if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL)
-		return (ENOMEM);
-	if ((cp = (RCURSOR *)__db_calloc(1, sizeof(RCURSOR))) == NULL) {
-		__db_free(dbc);
-		return (ENOMEM);
-	}
-
-	cp->dbc = dbc;
-	cp->recno = RECNO_OOB;
-
-	dbc->dbp = dbp;
-	dbc->txn = txn;
-	dbc->internal = cp;
-	dbc->c_close = __ram_c_close;
-	dbc->c_del = __ram_c_del;
-	dbc->c_get = __ram_c_get;
-	dbc->c_put = __ram_c_put;
-
-	/*
-	 * All cursors are queued from the master DB structure.  Add the
-	 * cursor to that queue.
-	 */
-	CURSOR_SETUP(dbp);
-	TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links);
-	CURSOR_TEARDOWN(dbp);
+	DB_PANIC_CHECK(dbp);
 
-	*dbcp = dbc;
-	return (0);
-}
+	/* Check for invalid flags. */
+	if ((ret = __db_delchk(dbp,
+	    key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+		return (ret);
 
-/*
- * __ram_get --
- *	Recno db->get function.
- */
-static int
-__ram_get(argdbp, txn, key, data, flags)
-	DB *argdbp;
-	DB_TXN *txn;
-	DBT *key, *data;
-	u_int32_t flags;
-{
-	DB *dbp;
-	int ret;
+	/* Acquire a cursor. */
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+		return (ret);
 
-	DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags);
+	DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
 
-	/* Check for invalid flags. */
-	if ((ret = __db_getchk(argdbp, key, data, flags)) != 0)
-		return (ret);
+	/* Check the user's record number and fill in as necessary. */
+	if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+		goto err;
 
-	GETHANDLE(argdbp, txn, &dbp, ret);
+	/* Do the delete. */
+	cp = dbc->internal;
+	cp->recno = recno;
+	ret = __ram_i_delete(dbc);
 
-	ret = __ram_iget(dbp, key, data);
+	/* Release the cursor. */
+err:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
 /*
- * __ram_iget --
- *	Internal ram get function, called for both standard and cursor
- *	get after the flags have been checked.
+ * __ram_i_delete --
+ *	Internal version of recno delete, called by __ram_delete and
+ *	__ram_c_del.
  */
 static int
-__ram_iget(dbp, key, data)
-	DB *dbp;
-	DBT *key, *data;
+__ram_i_delete(dbc)
+	DBC *dbc;
 {
+	BKEYDATA bk;
 	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
+	DBT hdr, data;
 	PAGE *h;
 	db_indx_t indx;
-	db_recno_t recno;
 	int exact, ret, stack;
 
-	stack = 0;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 	t = dbp->internal;
+	stack = 0;
 
-	/* Check the user's record number and fill in as necessary. */
-	if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0)
-		goto done;
+	/*
+	 * If this is CDB and this isn't a write cursor, then it's an error.
+	 * If it is a write cursor, but we don't yet hold the write lock, then
+	 * we need to upgrade to the write lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		/* Make sure it's a valid update cursor. */
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+		if (F_ISSET(dbc, DBC_RMW) &&
+		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
+		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+		    &dbc->mylock)) != 0)
+			return (EAGAIN);
+	}
 
-	/* Search the tree for the record. */
-	if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0)
-		goto done;
-	if (!exact)
-		return (DB_NOTFOUND);
+	/* Search the tree for the key; delete only deletes exact matches. */
+	if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
+		goto err;
+	if (!exact) {
+		ret = DB_NOTFOUND;
+		goto err;
+	}
 	stack = 1;
 
-	h = t->bt_csp->page;
-	indx = t->bt_csp->indx;
+	h = cp->csp->page;
+	indx = cp->csp->indx;
 
-	/* If the record has already been deleted, we couldn't have found it. */
+	/*
+	 * If re-numbering records, the on-page deleted flag can only mean
+	 * that this record was implicitly created.  Applications aren't
+	 * permitted to delete records they never created; return an error.
+	 *
+	 * If not re-numbering records, the on-page deleted flag means that
+	 * this record was implicitly created, or, was deleted at some time.
+	 * The former is an error because applications aren't permitted to
+	 * delete records they never created; the latter is an error because
+	 * if the record was "deleted", we could never have found it.
+	 */
 	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
 		ret = DB_KEYEMPTY;
-		goto done;
+		goto err;
 	}
 
-	/* Return the data item. */
-	ret = __db_ret(dbp,
-	    h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen);
-	++t->lstat.bt_get;
+	if (F_ISSET(dbp, DB_RE_RENUMBER)) {
+		/* Delete the item, adjust the counts, adjust the cursors. */
+		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+			goto err;
+		__bam_adjust(dbc, -1);
+		__ram_ca(dbp, cp->recno, CA_DELETE);
+
+		/*
+		 * If the page is empty, delete it.   The whole tree is locked
+		 * so there are no preparations to make.
+		 */
+		if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
+			stack = 0;
+			ret = __bam_dpages(dbc);
+		}
+	} else {
+		/* Use a delete/put pair to replace the record with a marker. */
+		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+			goto err;
+
+		B_TSET(bk.type, B_KEYDATA, 1);
+		bk.len = 0;
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.data = &bk;
+		hdr.size = SSZA(BKEYDATA, data);
+		memset(&data, 0, sizeof(data));
+		data.data = (char *)"";
+		data.size = 0;
+		if ((ret = __db_pitem(dbc,
+		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+			goto err;
+	}
+	F_SET(t->recno, RECNO_MODIFIED);
 
-done:	/* Discard the stack. */
-	if (stack)
-		__bam_stkrel(dbp);
+err:	if (stack)
+		__bam_stkrel(dbc, 0);
 
+	/* If we upgraded the CDB lock upon entry; downgrade it now. */
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
 	return (ret);
 }
 
@@ -290,46 +350,50 @@ done:	/* Discard the stack. */
  *	Recno db->put function.
  */
 static int
-__ram_put(argdbp, txn, key, data, flags)
-	DB *argdbp;
+__ram_put(dbp, txn, key, data, flags)
+	DB *dbp;
 	DB_TXN *txn;
 	DBT *key, *data;
 	u_int32_t flags;
 {
-	BTREE *t;
-	DB *dbp;
+	DBC *dbc;
 	db_recno_t recno;
-	int ret;
+	int ret, t_ret;
 
-	DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags);
+	DB_PANIC_CHECK(dbp);
 
 	/* Check for invalid flags. */
-	if ((ret = __db_putchk(argdbp,
-	    key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0)) != 0)
+	if ((ret = __db_putchk(dbp,
+	    key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
+		return (ret);
+
+	/* Allocate a cursor. */
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
 		return (ret);
 
-	GETHANDLE(argdbp, txn, &dbp, ret);
+	DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
 
 	/*
 	 * If we're appending to the tree, make sure we've read in all of
 	 * the backing source file.  Otherwise, check the user's record
 	 * number and fill in as necessary.
 	 */
-	ret = LF_ISSET(DB_APPEND) ?
-	    __ram_snapshot(dbp) : __ram_getno(dbp, key, &recno, 1);
+	ret = flags == DB_APPEND ?
+	    __ram_update(dbc, DB_MAX_RECORDS, 0) :
+	    __ram_getno(dbc, key, &recno, 1);
 
 	/* Add the record. */
 	if (ret == 0)
-		ret = __ram_add(dbp, &recno, data, flags, 0);
+		ret = __ram_add(dbc, &recno, data, flags, 0);
 
-	/* If we're appending to the tree, we have to return the record. */
-	if (ret == 0 && LF_ISSET(DB_APPEND)) {
-		t = dbp->internal;
-		ret = __db_retcopy(key, &recno, sizeof(recno),
-		    &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc);
-	}
+	/* Discard the cursor. */
+	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Return the record number if we're appending to the tree. */
+	if (ret == 0 && flags == DB_APPEND)
+		*(db_recno_t *)key->data = recno;
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
@@ -338,23 +402,35 @@ __ram_put(argdbp, txn, key, data, flags)
  *	Recno db->sync function.
  */
 static int
-__ram_sync(argdbp, flags)
-	DB *argdbp;
+__ram_sync(dbp, flags)
+	DB *dbp;
 	u_int32_t flags;
 {
-	DB *dbp;
-	int ret;
+	DBC *dbc;
+	int ret, t_ret;
 
-	DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags);
+	/*
+	 * Sync the underlying btree.
+	 *
+	 * !!!
+	 * We don't need to do a panic check or flags check, the "real"
+	 * sync function does all that for us.
+	 */
+	if ((ret = __db_sync(dbp, flags)) != 0)
+		return (ret);
 
-	/* Sync the underlying btree. */
-	if ((ret = __bam_sync(argdbp, flags)) != 0)
+	/* Allocate a cursor. */
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
 		return (ret);
 
+	DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags);
+
 	/* Copy back the backing source file. */
-	GETHANDLE(argdbp, NULL, &dbp, ret);
-	ret = __ram_writeback(dbp);
-	PUTHANDLE(dbp);
+	ret = __ram_writeback(dbc);
+
+	/* Discard the cursor. */
+	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
 
 	return (ret);
 }
@@ -366,14 +442,12 @@ __ram_sync(argdbp, flags)
  * PUBLIC: int __ram_close __P((DB *));
  */
 int
-__ram_close(argdbp)
-	DB *argdbp;
+__ram_close(dbp)
+	DB *dbp;
 {
 	RECNO *rp;
 
-	DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0);
-
-	rp = ((BTREE *)argdbp->internal)->bt_recno;
+	rp = ((BTREE *)dbp->internal)->recno;
 
 	/* Close any underlying mmap region. */
 	if (rp->re_smap != NULL)
@@ -381,136 +455,133 @@ __ram_close(argdbp)
 
 	/* Close any backing source file descriptor. */
 	if (rp->re_fd != -1)
-		(void)__db_close(rp->re_fd);
+		(void)__os_close(rp->re_fd);
 
 	/* Free any backing source file name. */
 	if (rp->re_source != NULL)
-		FREES(rp->re_source);
+		__os_freestr(rp->re_source);
 
 	/* Free allocated memory. */
-	FREE(rp, sizeof(RECNO));
-	((BTREE *)argdbp->internal)->bt_recno = NULL;
+	__os_free(rp, sizeof(RECNO));
+	((BTREE *)dbp->internal)->recno = NULL;
 
 	/* Close the underlying btree. */
-	return (__bam_close(argdbp));
-}
-
-/*
- * __ram_c_close --
- *	Recno cursor->close function.
- */
-static int
-__ram_c_close(dbc)
-	DBC *dbc;
-{
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_close", NULL, NULL, 0);
-
-	return (__ram_c_iclose(dbc->dbp, dbc));
-}
-
-/*
- * __ram_c_iclose --
- *	Close a single cursor -- internal version.
- *
- * PUBLIC: int __ram_c_iclose __P((DB *, DBC *));
- */
-int
-__ram_c_iclose(dbp, dbc)
-	DB *dbp;
-	DBC *dbc;
-{
-	/* Remove the cursor from the queue. */
-	CURSOR_SETUP(dbp);
-	TAILQ_REMOVE(&dbp->curs_queue, dbc, links);
-	CURSOR_TEARDOWN(dbp);
-
-	/* Discard the structures. */
-	FREE(dbc->internal, sizeof(RCURSOR));
-	FREE(dbc, sizeof(DBC));
-
-	return (0);
+	return (__bam_close(dbp));
 }
 
 /*
  * __ram_c_del --
  *	Recno cursor->c_del function.
+ *
+ * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t));
  */
-static int
+int
 __ram_c_del(dbc, flags)
 	DBC *dbc;
 	u_int32_t flags;
 {
-	DBT key;
-	RCURSOR *cp;
+	CURSOR *cp;
+	DB *dbp;
 	int ret;
 
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_del", NULL, NULL, flags);
-
+	dbp = dbc->dbp;
 	cp = dbc->internal;
 
+	DB_PANIC_CHECK(dbp);
+
 	/* Check for invalid flags. */
-	if ((ret = __db_cdelchk(dbc->dbp, flags,
-	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
+	if ((ret = __db_cdelchk(dbp, flags,
+	    F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
 		return (ret);
 
-	/* If already deleted, return failure. */
-	if (CD_ISSET(dbc->dbp, cp))
-		return (DB_KEYEMPTY);
+	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags);
 
-	/* Build a normal delete request. */
-	memset(&key, 0, sizeof(key));
-	key.data = &cp->recno;
-	key.size = sizeof(db_recno_t);
-	if ((ret = __ram_delete(dbc->dbp, dbc->txn, &key, 0)) == 0)
-		CD_SET(dbc->dbp, cp);
+	/*
+	 * If we are running CDB, this had better be either a write
+	 * cursor or an immediate writer.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB))
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
 
-	return (ret);
+	/*
+	 * The semantics of cursors during delete are as follows: if record
+	 * numbers are mutable (DB_RE_RENUMBER is set), deleting a record
+	 * causes the cursor to automatically point to the record immediately
+	 * following.  In this case it is possible to use a single cursor for
+	 * repeated delete operations, without intervening operations.
+	 *
+	 * If record numbers are not mutable, then records are replaced with
+	 * a marker containing a delete flag.  If the record referenced by
+	 * this cursor has already been deleted, we will detect that as part
+	 * of the delete operation, and fail.
+	 */
+	return (__ram_i_delete(dbc));
 }
 
 /*
  * __ram_c_get --
  *	Recno cursor->c_get function.
+ *
+ * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
  */
-static int
+int
 __ram_c_get(dbc, key, data, flags)
 	DBC *dbc;
 	DBT *key, *data;
 	u_int32_t flags;
 {
-	BTREE *t;
+	CURSOR *cp, copy;
 	DB *dbp;
-	RCURSOR *cp, copy;
-	int ret;
-
-	DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get",
-	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL,
-	    NULL, flags);
+	PAGE *h;
+	db_indx_t indx;
+	int exact, ret, stack, tmp_rmw;
 
-	cp = dbc->internal;
 	dbp = dbc->dbp;
+	cp = dbc->internal;
+
+	DB_PANIC_CHECK(dbp);
 
 	/* Check for invalid flags. */
 	if ((ret = __db_cgetchk(dbc->dbp,
 	    key, data, flags, cp->recno != RECNO_OOB)) != 0)
 		return (ret);
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
-	t = dbp->internal;
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	tmp_rmw = 0;
+	if (LF_ISSET(DB_RMW)) {
+		if (!F_ISSET(dbp, DB_AM_CDB)) {
+			tmp_rmw = 1;
+			F_SET(dbc, DBC_RMW);
+		}
+		LF_CLR(DB_RMW);
+	}
+
+	DEBUG_LREAD(dbc, dbc->txn, "ram_c_get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
 
 	/* Initialize the cursor for a new retrieval. */
 	copy = *cp;
 
 retry:	/* Update the record number. */
+	stack = 0;
 	switch (flags) {
 	case DB_CURRENT:
-		if (CD_ISSET(dbp, cp)) {
-			PUTHANDLE(dbp);
-			return (DB_KEYEMPTY);
-		}
+		/*
+		 * If record numbers are mutable: if we just deleted a record,
+		 * there is no action necessary, we return the record following
+		 * the deleted item by virtue of renumbering the tree.
+		 */
 		break;
 	case DB_NEXT:
+		/*
+		 * If record numbers are mutable: if we just deleted a record,
+		 * we have to avoid incrementing the record number so that we
+		 * return the right record by virtue of renumbering the tree.
+		 */
 		if (CD_ISSET(dbp, cp))
 			break;
+
 		if (cp->recno != RECNO_OOB) {
 			++cp->recno;
 			break;
@@ -522,86 +593,133 @@ retry:	/* Update the record number. */
 		break;
 	case DB_PREV:
 		if (cp->recno != RECNO_OOB) {
-			if (cp->recno == 1)
-				return (DB_NOTFOUND);
+			if (cp->recno == 1) {
+				ret = DB_NOTFOUND;
+				goto err;
+			}
 			--cp->recno;
 			break;
 		}
 		/* FALLTHROUGH */
 	case DB_LAST:
 		flags = DB_PREV;
-		if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND)
+		if (((ret = __ram_update(dbc,
+		    DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
 			goto err;
-		if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0)
+		if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
 			goto err;
-		if (cp->recno == 0)
-			return (DB_NOTFOUND);
+		if (cp->recno == 0) {
+			ret = DB_NOTFOUND;
+			goto err;
+		}
 		break;
 	case DB_SET:
 	case DB_SET_RANGE:
-		if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0)
+		if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
 			goto err;
 		break;
 	}
 
-	/*
-	 * Return the key if the user didn't give us one, and then pass it
-	 * into __ram_iget().
-	 */
+	/* Return the key if the user didn't give us one. */
 	if (flags != DB_SET && flags != DB_SET_RANGE &&
 	    (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno),
-	    &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0)
-		return (ret);
+	    &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0)
+		goto err;
 
-	/*
-	 * The cursor was reset, so the delete adjustment is no
-	 * longer necessary.
-	 */
-	CD_CLR(dbp, cp);
+	/* Search the tree for the record. */
+	if ((ret = __bam_rsearch(dbc, &cp->recno,
+	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0)
+		goto err;
+	stack = 1;
+	if (!exact) {
+		ret = DB_NOTFOUND;
+		goto err;
+	}
+	h = cp->csp->page;
+	indx = cp->csp->indx;
 
 	/*
-	 * Retrieve the record.
-	 *
-	 * Skip any keys that don't really exist.
+	 * If re-numbering records, the on-page deleted flag means this record
+	 * was implicitly created.  If not re-numbering records, the on-page
+	 * deleted flag means this record was implicitly created, or, it was
+	 * deleted at some time.  Regardless, we skip such records if doing
+	 * cursor next/prev operations, and fail if the application requested
+	 * them explicitly.
 	 */
-	if ((ret = __ram_iget(dbp, key, data)) != 0)
-		if (ret == DB_KEYEMPTY &&
-		    (flags == DB_NEXT || flags == DB_PREV))
+	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
+		if (flags == DB_NEXT || flags == DB_PREV) {
+			(void)__bam_stkrel(dbc, 0);
 			goto retry;
+		}
+		ret = DB_KEYEMPTY;
+		goto err;
+	}
+
+	/* Return the data item. */
+	if ((ret = __db_ret(dbp,
+	    h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
+		goto err;
+
+	/* The cursor was reset, no further delete adjustment is necessary. */
+	CD_CLR(dbp, cp);
+
+err:	if (stack)
+		(void)__bam_stkrel(dbc, 0);
+
+	/* Release temporary lock upgrade. */
+	if (tmp_rmw)
+		F_CLR(dbc, DBC_RMW);
 
-err:	if (ret != 0)
+	if (ret != 0)
 		*cp = copy;
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
 /*
  * __ram_c_put --
  *	Recno cursor->c_put function.
+ *
+ * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
  */
-static int
+int
 __ram_c_put(dbc, key, data, flags)
 	DBC *dbc;
 	DBT *key, *data;
 	u_int32_t flags;
 {
-	BTREE *t;
-	RCURSOR *cp, copy;
+	CURSOR *cp, copy;
 	DB *dbp;
 	int exact, ret;
 	void *arg;
 
-	DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags);
-
+	dbp = dbc->dbp;
 	cp = dbc->internal;
 
+	DB_PANIC_CHECK(dbp);
+
 	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
 	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
 		return (ret);
 
-	GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret);
-	t = dbp->internal;
+	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags);
+
+	/*
+	 * If we are running CDB, this had better be either a write
+	 * cursor or an immediate writer.  If it's a regular writer,
+	 * that means we have an IWRITE lock and we need to upgrade
+	 * it to a write lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+		if (F_ISSET(dbc, DBC_RMW) &&
+		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
+		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+		    &dbc->mylock)) != 0)
+			return (EAGAIN);
+	}
 
 	/* Initialize the cursor for a new retrieval. */
 	copy = *cp;
@@ -614,23 +732,23 @@ __ram_c_put(dbc, key, data, flags)
 	 */
 	if (0) {
 split:		arg = &cp->recno;
-		if ((ret = __bam_split(dbp, arg)) != 0)
+		if ((ret = __bam_split(dbc, arg)) != 0)
 			goto err;
 	}
 
-	if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0)
+	if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
 		goto err;
 	if (!exact) {
 		ret = DB_NOTFOUND;
 		goto err;
 	}
-	if ((ret = __bam_iitem(dbp, &t->bt_csp->page,
-	    &t->bt_csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) {
-		if ((ret = __bam_stkrel(dbp)) != 0)
+	if ((ret = __bam_iitem(dbc, &cp->csp->page,
+	    &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) {
+		if ((ret = __bam_stkrel(dbc, 0)) != 0)
 			goto err;
 		goto split;
 	}
-	if ((ret = __bam_stkrel(dbp)) != 0)
+	if ((ret = __bam_stkrel(dbc, 0)) != 0)
 		goto err;
 
 	switch (flags) {
@@ -650,16 +768,16 @@ split:		arg = &cp->recno;
 		break;
 	}
 
-	/*
-	 * The cursor was reset, so the delete adjustment is no
-	 * longer necessary.
-	 */
+	/* The cursor was reset, no further delete adjustment is necessary. */
 	CD_CLR(dbp, cp);
 
-err:	if (ret != 0)
+err:	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
+
+	if (ret != 0)
 		*cp = copy;
 
-	PUTHANDLE(dbp);
 	return (ret);
 }
 
@@ -675,20 +793,22 @@ __ram_ca(dbp, recno, op)
 	db_recno_t recno;
 	ca_recno_arg op;
 {
+	CURSOR *cp;
 	DBC *dbc;
-	RCURSOR *cp;
 
 	/*
 	 * Adjust the cursors.  See the comment in __bam_ca_delete().
 	 */
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
 	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (RCURSOR *)dbc->internal;
+		cp = dbc->internal;
 		switch (op) {
 		case CA_DELETE:
 			if (recno > cp->recno)
 				--cp->recno;
+			if (recno == cp->recno)
+				CD_SET(dbp, cp);
 			break;
 		case CA_IAFTER:
 			if (recno > cp->recno)
@@ -700,51 +820,27 @@ __ram_ca(dbp, recno, op)
 			break;
 		}
 	}
-	CURSOR_TEARDOWN(dbp);
+	DB_THREAD_UNLOCK(dbp);
 }
 
-#ifdef DEBUG
-/*
- * __ram_cprint --
- *	Display the current recno cursor list.
- *
- * PUBLIC: int __ram_cprint __P((DB *));
- */
-int
-__ram_cprint(dbp)
-	DB *dbp;
-{
-	DBC *dbc;
-	RCURSOR *cp;
-
-	CURSOR_SETUP(dbp);
-	for (dbc = TAILQ_FIRST(&dbp->curs_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		cp = (RCURSOR *)dbc->internal;
-		fprintf(stderr,
-		    "%#0x: recno: %lu\n", (u_int)cp, (u_long)cp->recno);
-	}
-	CURSOR_TEARDOWN(dbp);
-
-	return (0);
-}
-#endif /* DEBUG */
-
 /*
  * __ram_getno --
  *	Check the user's record number, and make sure we've seen it.
  *
- * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int));
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
  */
 int
-__ram_getno(dbp, key, rep, can_create)
-	DB *dbp;
+__ram_getno(dbc, key, rep, can_create)
+	DBC *dbc;
 	const DBT *key;
 	db_recno_t *rep;
 	int can_create;
 {
+	DB *dbp;
 	db_recno_t recno;
 
+	dbp = dbc->dbp;
+
 	/* Check the user's record number. */
 	if ((recno = *(db_recno_t *)key->data) == 0) {
 		__db_err(dbp->dbenv, "illegal record number of 0");
@@ -754,24 +850,11 @@ __ram_getno(dbp, key, rep, can_create)
 		*rep = recno;
 
 	/*
-	 * Btree can neither create records or read them in.  Recno can
+	 * Btree can neither create records nor read them in.  Recno can
 	 * do both, see if we can find the record.
 	 */
 	return (dbp->type == DB_RECNO ?
-	    __ram_update(dbp, recno, can_create) : 0);
-}
-
-/*
- * __ram_snapshot --
- *	Read in any remaining records from the backing input file.
- *
- * PUBLIC: int __ram_snapshot __P((DB *));
- */
-int
-__ram_snapshot(dbp)
-	DB *dbp;
-{
-	return (__ram_update(dbp, DB_MAX_RECORDS, 0));
+	    __ram_update(dbc, recno, can_create) : 0);
 }
 
 /*
@@ -779,18 +862,20 @@ __ram_snapshot(dbp)
  *	Ensure the tree has records up to and including the specified one.
  */
 static int
-__ram_update(dbp, recno, can_create)
-	DB *dbp;
+__ram_update(dbc, recno, can_create)
+	DBC *dbc;
 	db_recno_t recno;
 	int can_create;
 {
 	BTREE *t;
+	DB *dbp;
 	RECNO *rp;
 	db_recno_t nrecs;
 	int ret;
 
+	dbp = dbc->dbp;
 	t = dbp->internal;
-	rp = t->bt_recno;
+	rp = t->recno;
 
 	/*
 	 * If we can't create records and we've read the entire backing input
@@ -803,12 +888,12 @@ __ram_update(dbp, recno, can_create)
 	 * If we haven't seen this record yet, try to get it from the original
 	 * file.
 	 */
-	if ((ret = __bam_nrecs(dbp, &nrecs)) != 0)
+	if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
 		return (ret);
 	if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) {
-		if ((ret = rp->re_irec(dbp, recno)) != 0)
+		if ((ret = rp->re_irec(dbc, recno)) != 0)
 			return (ret);
-		if ((ret = __bam_nrecs(dbp, &nrecs)) != 0)
+		if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
 			return (ret);
 	}
 
@@ -819,28 +904,27 @@ __ram_update(dbp, recno, can_create)
 	if (!can_create || recno <= nrecs + 1)
 		return (0);
 
-	t->bt_rdata.dlen = 0;
-	t->bt_rdata.doff = 0;
-	t->bt_rdata.flags = 0;
+	dbc->rdata.dlen = 0;
+	dbc->rdata.doff = 0;
+	dbc->rdata.flags = 0;
 	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
-		if (t->bt_rdata.ulen < rp->re_len) {
-			t->bt_rdata.data = t->bt_rdata.data == NULL ?
-			    (void *)__db_malloc(rp->re_len) :
-			    (void *)__db_realloc(t->bt_rdata.data, rp->re_len);
-			if (t->bt_rdata.data == NULL) {
-				t->bt_rdata.ulen = 0;
-				return (ENOMEM);
+		if (dbc->rdata.ulen < rp->re_len) {
+			if ((ret =
+			    __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
+				dbc->rdata.ulen = 0;
+				dbc->rdata.data = NULL;
+				return (ret);
 			}
-			t->bt_rdata.ulen = rp->re_len;
+			dbc->rdata.ulen = rp->re_len;
 		}
-		t->bt_rdata.size = rp->re_len;
-		memset(t->bt_rdata.data, rp->re_pad, rp->re_len);
+		dbc->rdata.size = rp->re_len;
+		memset(dbc->rdata.data, rp->re_pad, rp->re_len);
 	} else
-		t->bt_rdata.size = 0;
+		dbc->rdata.size = 0;
 
 	while (recno > ++nrecs)
-		if ((ret = __ram_add(dbp,
-		    &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0)
+		if ((ret = __ram_add(dbc,
+		    &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
 			return (ret);
 	return (0);
 }
@@ -859,6 +943,11 @@ __ram_source(dbp, rp, fname)
 	u_int32_t bytes, mbytes, oflags;
 	int ret;
 
+	/*
+	 * !!!
+	 * The caller has full responsibility for cleaning up on error --
+	 * (it has to anyway, in case it fails after this routine succeeds).
+	 */
 	if ((ret = __db_appname(dbp->dbenv,
 	    DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0)
 		return (ret);
@@ -867,7 +956,7 @@ __ram_source(dbp, rp, fname)
 	if ((ret =
 	    __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) {
 		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
-		goto err;
+		return (ret);
 	}
 
 	/*
@@ -878,10 +967,10 @@ __ram_source(dbp, rp, fname)
 	 * compiler will perpetrate, doing the comparison in a portable way is
 	 * flatly impossible.  Hope that mmap fails if the file is too large.
 	 */
-	if ((ret = __db_ioinfo(rp->re_source,
+	if ((ret = __os_ioinfo(rp->re_source,
 	    rp->re_fd, &mbytes, &bytes, NULL)) != 0) {
 		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
-		goto err;
+		return (ret);
 	}
 	if (mbytes == 0 && bytes == 0) {
 		F_SET(rp, RECNO_EOF);
@@ -891,14 +980,11 @@ __ram_source(dbp, rp, fname)
 	size = mbytes * MEGABYTE + bytes;
 	if ((ret = __db_mapfile(rp->re_source,
 	    rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0)
-		goto err;
+		return (ret);
 	rp->re_cmap = rp->re_smap;
 	rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size);
 	rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ?  __ram_fmap : __ram_vmap;
 	return (0);
-
-err:	FREES(rp->re_source)
-	return (ret);
 }
 
 /*
@@ -906,17 +992,19 @@ err:	FREES(rp->re_source)
  *	Rewrite the backing file.
  */
 static int
-__ram_writeback(dbp)
-	DB *dbp;
+__ram_writeback(dbc)
+	DBC *dbc;
 {
-	RECNO *rp;
+	DB *dbp;
 	DBT key, data;
+	RECNO *rp;
 	db_recno_t keyno;
 	ssize_t nw;
 	int fd, ret, t_ret;
 	u_int8_t delim, *pad;
 
-	rp = ((BTREE *)dbp->internal)->bt_recno;
+	dbp = dbc->dbp;
+	rp = ((BTREE *)dbp->internal)->recno;
 
 	/* If the file wasn't modified, we're done. */
 	if (!F_ISSET(rp, RECNO_MODIFIED))
@@ -931,7 +1019,7 @@ __ram_writeback(dbp)
 	/*
 	 * Read any remaining records into the tree.
 	 *
-	 * XXX
+	 * !!!
 	 * This is why we can't support transactions when applications specify
 	 * backing (re_source) files.  At this point we have to read in the
 	 * rest of the records from the file so that we can write all of the
@@ -946,7 +1034,8 @@ __ram_writeback(dbp)
 	 * protecting the backing source file, i.e. mpool would have to know
 	 * about it, and we don't want to go there.
 	 */
-	if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND)
+	if ((ret =
+	    __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
 		return (ret);
 
 	/*
@@ -962,7 +1051,7 @@ __ram_writeback(dbp)
 
 	/* Get rid of any backing file descriptor, just on GP's. */
 	if (rp->re_fd != -1) {
-		(void)__db_close(rp->re_fd);
+		(void)__os_close(rp->re_fd);
 		rp->re_fd = -1;
 	}
 
@@ -990,10 +1079,8 @@ __ram_writeback(dbp)
 	 */
 	delim = rp->re_delim;
 	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
-		if ((pad = (u_int8_t *)__db_malloc(rp->re_len)) == NULL) {
-			ret = ENOMEM;
+		if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0)
 			goto err;
-		}
 		memset(pad, rp->re_pad, rp->re_len);
 	} else
 		COMPQUIET(pad, NULL);
@@ -1001,7 +1088,7 @@ __ram_writeback(dbp)
 		switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
 		case 0:
 			if ((ret =
-			    __db_write(fd, data.data, data.size, &nw)) != 0)
+			    __os_write(fd, data.data, data.size, &nw)) != 0)
 				goto err;
 			if (nw != (ssize_t)data.size) {
 				ret = EIO;
@@ -1011,7 +1098,7 @@ __ram_writeback(dbp)
 		case DB_KEYEMPTY:
 			if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
 				if ((ret =
-				    __db_write(fd, pad, rp->re_len, &nw)) != 0)
+				    __os_write(fd, pad, rp->re_len, &nw)) != 0)
 					goto err;
 				if (nw != (ssize_t)rp->re_len) {
 					ret = EIO;
@@ -1024,7 +1111,7 @@ __ram_writeback(dbp)
 			goto done;
 		}
 		if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) {
-			if ((ret = __db_write(fd, &delim, 1, &nw)) != 0)
+			if ((ret = __os_write(fd, &delim, 1, &nw)) != 0)
 				goto err;
 			if (nw != 1) {
 				ret = EIO;
@@ -1035,7 +1122,7 @@ __ram_writeback(dbp)
 
 err:
 done:	/* Close the file descriptor. */
-	if ((t_ret = __db_close(fd)) != 0 || ret == 0)
+	if ((t_ret = __os_close(fd)) != 0 || ret == 0)
 		ret = t_ret;
 
 	if (ret == 0)
@@ -1048,11 +1135,11 @@ done:	/* Close the file descriptor. */
  *	Get fixed length records from a file.
  */
 static int
-__ram_fmap(dbp, top)
-	DB *dbp;
+__ram_fmap(dbc, top)
+	DBC *dbc;
 	db_recno_t top;
 {
-	BTREE *t;
+	DB *dbp;
 	DBT data;
 	RECNO *rp;
 	db_recno_t recno;
@@ -1060,24 +1147,23 @@ __ram_fmap(dbp, top)
 	u_int8_t *sp, *ep, *p;
 	int ret;
 
-	if ((ret = __bam_nrecs(dbp, &recno)) != 0)
+	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
 		return (ret);
 
-	t = dbp->internal;
-	rp = t->bt_recno;
-	if (t->bt_rdata.ulen < rp->re_len) {
-		t->bt_rdata.data = t->bt_rdata.data == NULL ?
-		    (void *)__db_malloc(rp->re_len) :
-		    (void *)__db_realloc(t->bt_rdata.data, rp->re_len);
-		if (t->bt_rdata.data == NULL) {
-			t->bt_rdata.ulen = 0;
-			return (ENOMEM);
+	dbp = dbc->dbp;
+	rp = ((BTREE *)(dbp->internal))->recno;
+
+	if (dbc->rdata.ulen < rp->re_len) {
+		if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
+			dbc->rdata.ulen = 0;
+			dbc->rdata.data = NULL;
+			return (ret);
 		}
-		t->bt_rdata.ulen = rp->re_len;
+		dbc->rdata.ulen = rp->re_len;
 	}
 
 	memset(&data, 0, sizeof(data));
-	data.data = t->bt_rdata.data;
+	data.data = dbc->rdata.data;
 	data.size = rp->re_len;
 
 	sp = (u_int8_t *)rp->re_cmap;
@@ -1088,7 +1174,7 @@ __ram_fmap(dbp, top)
 			return (DB_NOTFOUND);
 		}
 		len = rp->re_len;
-		for (p = t->bt_rdata.data;
+		for (p = dbc->rdata.data;
 		    sp < ep && len > 0; *p++ = *sp++, --len)
 			;
 
@@ -1108,7 +1194,7 @@ __ram_fmap(dbp, top)
 				memset(p, rp->re_pad, len);
 
 			++recno;
-			if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0)
+			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
 				return (ret);
 		}
 		++rp->re_last;
@@ -1122,21 +1208,19 @@ __ram_fmap(dbp, top)
  *	Get variable length records from a file.
  */
 static int
-__ram_vmap(dbp, top)
-	DB *dbp;
+__ram_vmap(dbc, top)
+	DBC *dbc;
 	db_recno_t top;
 {
-	BTREE *t;
 	DBT data;
 	RECNO *rp;
 	db_recno_t recno;
 	u_int8_t *sp, *ep;
 	int delim, ret;
 
-	t = dbp->internal;
-	rp = t->bt_recno;
+	rp = ((BTREE *)(dbc->dbp->internal))->recno;
 
-	if ((ret = __bam_nrecs(dbp, &recno)) != 0)
+	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
 		return (ret);
 
 	memset(&data, 0, sizeof(data));
@@ -1163,7 +1247,7 @@ __ram_vmap(dbp, top)
 		if (rp->re_last >= recno) {
 			data.size = sp - (u_int8_t *)data.data;
 			++recno;
-			if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0)
+			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
 				return (ret);
 		}
 		++rp->re_last;
@@ -1178,40 +1262,47 @@ __ram_vmap(dbp, top)
  *	Add records into the tree.
  */
 static int
-__ram_add(dbp, recnop, data, flags, bi_flags)
-	DB *dbp;
+__ram_add(dbc, recnop, data, flags, bi_flags)
+	DBC *dbc;
 	db_recno_t *recnop;
 	DBT *data;
 	u_int32_t flags, bi_flags;
 {
 	BKEYDATA *bk;
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	PAGE *h;
 	db_indx_t indx;
 	int exact, isdeleted, ret, stack;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 
 retry:	/* Find the slot for insertion. */
-	if ((ret = __bam_rsearch(dbp, recnop,
-	    S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0)
+	if ((ret = __bam_rsearch(dbc, recnop,
+	    S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
 		return (ret);
-	h = t->bt_csp->page;
-	indx = t->bt_csp->indx;
+	h = cp->csp->page;
+	indx = cp->csp->indx;
 	stack = 1;
 
 	/*
+	 * If re-numbering records, the on-page deleted flag means this record
+	 * was implicitly created.  If not re-numbering records, the on-page
+	 * deleted flag means this record was implicitly created, or, it was
+	 * deleted at some time.
+	 *
 	 * If DB_NOOVERWRITE is set and the item already exists in the tree,
-	 * return an error unless the item has been marked for deletion.
+	 * return an error unless the item was either marked for deletion or
+	 * only implicitly created.
 	 */
 	isdeleted = 0;
 	if (exact) {
 		bk = GET_BKEYDATA(h, indx);
-		if (B_DISSET(bk->type)) {
+		if (B_DISSET(bk->type))
 			isdeleted = 1;
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
-		} else
-			if (LF_ISSET(DB_NOOVERWRITE)) {
+		else
+			if (flags == DB_NOOVERWRITE) {
 				ret = DB_KEYEXIST;
 				goto err;
 			}
@@ -1224,40 +1315,42 @@ retry:	/* Find the slot for insertion. */
 	 * match, we're inserting a new key/data pair, before the search
 	 * location.
 	 */
-	switch (ret = __bam_iitem(dbp,
+	switch (ret = __bam_iitem(dbc,
 	    &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
 	case 0:
 		/*
-		 * Done.  Clean up the cursor and adjust the internal page
-		 * counts.
+		 * Don't adjust anything.
+		 *
+		 * If we inserted a record, no cursors need adjusting because
+		 * the only new record it's possible to insert is at the very
+		 * end of the tree.  The necessary adjustments to the internal
+		 * page counts were made by __bam_iitem().
+		 *
+		 * If we overwrote a record, no cursors need adjusting because
+		 * future DBcursor->get calls will simply return the underlying
+		 * record (there's no adjustment made for the DB_CURRENT flag
+		 * when a cursor get operation immediately follows a cursor
+		 * delete operation, and the normal adjustment for the DB_NEXT
+		 * flag is still correct).
 		 */
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS);
 		break;
 	case DB_NEEDSPLIT:
-		/*
-		 * We have to split the page.  Back out the cursor setup,
-		 * discard the stack of pages, and do the split.
-		 */
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
-
-		(void)__bam_stkrel(dbp);
+		/* Discard the stack of pages and split the page. */
+		(void)__bam_stkrel(dbc, 0);
 		stack = 0;
 
-		if ((ret = __bam_split(dbp, recnop)) != 0)
-			break;
+		if ((ret = __bam_split(dbc, recnop)) != 0)
+			goto err;
 
 		goto retry;
 		/* NOTREACHED */
 	default:
-		if (isdeleted)
-			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
-		break;
+		goto err;
 	}
 
+
 err:	if (stack)
-		__bam_stkrel(dbp);
+		__bam_stkrel(dbc, 0);
 
 	return (ret);
 }
diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c
index caa6b3515e..8efe4059a8 100644
--- a/db2/btree/bt_rsearch.c
+++ b/db2/btree/bt_rsearch.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_rsearch.c	10.15 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_rsearch.c	10.21 (Sleepycat) 12/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -59,39 +59,37 @@ static const char sccsid[] = "@(#)bt_rsearch.c	10.15 (Sleepycat) 5/6/98";
  * __bam_rsearch --
  *	Search a btree for a record number.
  *
- * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *));
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
  */
 int
-__bam_rsearch(dbp, recnop, flags, stop, exactp)
-	DB *dbp;
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+	DBC *dbc;
 	db_recno_t *recnop;
 	u_int32_t flags;
 	int stop, *exactp;
 {
 	BINTERNAL *bi;
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	DB_LOCK lock;
 	PAGE *h;
 	RINTERNAL *ri;
 	db_indx_t indx, top;
 	db_pgno_t pg;
 	db_recno_t i, recno, total;
-	int isappend, ret, stack;
+	int ret, stack;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 
-	/*
-	 * We test for groups of flags, S_APPEND is the only one that can be
-	 * OR'd into the set.  Clear it now so that the tests for equality
-	 * will work.
-	 */
-	if ((isappend = LF_ISSET(S_APPEND)) != 0)
-		LF_CLR(S_APPEND);
+	BT_STK_CLR(cp);
 
 	/*
 	 * There are several ways we search a btree tree.  The flags argument
 	 * specifies if we're acquiring read or write locks and if we are
-	 * locking pairs of pages.  See btree.h for more details.
+	 * locking pairs of pages.  In addition, if we're adding or deleting
+	 * an item, we have to lock the entire tree, regardless.  See btree.h
+	 * for more details.
 	 *
 	 * If write-locking pages, we need to know whether or not to acquire a
 	 * write lock on a page before getting it.  This depends on how deep it
@@ -102,15 +100,36 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 	 * Retrieve the root page.
 	 */
 	pg = PGNO_ROOT;
-	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT,
-	    flags == S_INSERT || flags == S_DELETE ?
-	    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
+	stack = LF_ISSET(S_STACK);
+	if ((ret = __bam_lget(dbc,
+	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 		return (ret);
-	if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-		(void)__BT_LPUT(dbp, lock);
+	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+		(void)__BT_LPUT(dbc, lock);
 		return (ret);
 	}
-	total = RE_NREC(h);
+
+	/*
+	 * Decide if we need to save this page; if we do, write lock it.
+	 * We deliberately don't lock-couple on this call.  If the tree
+	 * is tiny, i.e., one page, and two threads are busily updating
+	 * the root page, we're almost guaranteed deadlocks galore, as
+	 * each one gets a read lock and then blocks the other's attempt
+	 * for a write lock.
+	 */
+	if (!stack &&
+	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+		(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_LPUT(dbc, lock);
+		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
+			return (ret);
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+			(void)__BT_LPUT(dbc, lock);
+			return (ret);
+		}
+		stack = 1;
+	}
 
 	/*
 	 * If appending to the tree, set the record number now -- we have the
@@ -124,7 +143,8 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 	 * for the record immediately after the last record in the tree, so do
 	 * a fast check now.
 	 */
-	if (isappend) {
+	total = RE_NREC(h);
+	if (LF_ISSET(S_APPEND)) {
 		*exactp = 0;
 		*recnop = recno = total + 1;
 	} else {
@@ -133,33 +153,14 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 			*exactp = 1;
 		else {
 			*exactp = 0;
-			if (!PAST_END_OK(flags) || recno > total + 1) {
+			if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
 				(void)memp_fput(dbp->mpf, h, 0);
-				(void)__BT_LPUT(dbp, lock);
+				(void)__BT_LPUT(dbc, lock);
 				return (DB_NOTFOUND);
 			}
 		}
 	}
 
-	/* Decide if we're building a stack based on the operation. */
-	BT_STK_CLR(t);
-	stack = flags == S_DELETE || flags == S_INSERT;
-
-	/*
-	 * Decide if we need to save this page; if we do, write lock it, and
-	 * start to build a stack.
-	 */
-	if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) {
-		(void)memp_fput(dbp->mpf, h, 0);
-		if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
-			return (ret);
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-			(void)__BT_LPUT(dbp, lock);
-			return (ret);
-		}
-		stack = 1;
-	}
-
 	/*
 	 * !!!
 	 * Record numbers in the tree are 0-based, but the recno is
@@ -177,7 +178,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 			 * not exist if there are enough deleted records in the
 			 * page.
 			 */
-			if (recno <= NUM_ENT(h))
+			if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX)
 				for (i = recno - 1;; --i) {
 					if (B_DISSET(GET_BKEYDATA(h,
 					    i * P_INDX + O_INDX)->type))
@@ -185,10 +186,10 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 					if (i == 0)
 						break;
 				}
-			if (recno > NUM_ENT(h)) {
+			if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) {
 				*exactp = 0;
-				if (!PAST_END_OK(flags) ||
-				    recno > (db_recno_t)(NUM_ENT(h) + 1)) {
+				if (!LF_ISSET(S_PAST_EOF) || recno >
+				    (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) {
 					ret = DB_NOTFOUND;
 					goto err;
 				}
@@ -197,7 +198,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 
 			/* Correct from 1-based to 0-based for a page offset. */
 			--recno;
-			BT_STK_ENTER(t, h, recno * P_INDX, lock, ret);
+			BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret);
 			return (ret);
 		case P_IBTREE:
 			for (indx = 0, top = NUM_ENT(h);;) {
@@ -213,7 +214,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 
 			/* Correct from 1-based to 0-based for a page offset. */
 			--recno;
-			BT_STK_ENTER(t, h, recno, lock, ret);
+			BT_STK_ENTER(cp, h, recno, lock, ret);
 			return (ret);
 		case P_IRECNO:
 			for (indx = 0, top = NUM_ENT(h);;) {
@@ -232,42 +233,42 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 		if (stack) {
 			/* Return if this is the lowest page wanted. */
 			if (LF_ISSET(S_PARENT) && stop == h->level) {
-				BT_STK_ENTER(t, h, indx, lock, ret);
+				BT_STK_ENTER(cp, h, indx, lock, ret);
 				return (ret);
 			}
-			BT_STK_PUSH(t, h, indx, lock, ret);
-			if (ret)
+			BT_STK_PUSH(cp, h, indx, lock, ret);
+			if (ret != 0)
 				goto err;
 
-			if ((ret = __bam_lget(dbp, 0, pg,
-			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
-			    &lock)) != 0)
+			if ((ret =
+			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
 				goto err;
 		} else {
-			(void)memp_fput(dbp->mpf, h, 0);
-
 			/*
 			 * Decide if we want to return a pointer to the next
 			 * page in the stack.  If we do, write lock it and
 			 * never unlock it.
 			 */
-			if (LF_ISSET(S_PARENT) &&
-			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1))
+			if ((LF_ISSET(S_PARENT) &&
+			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+			    (h->level - 1) == LEAFLEVEL)
 				stack = 1;
 
-			if ((ret = __bam_lget(dbp, 1, pg,
-			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
-			    &lock)) != 0)
+			(void)memp_fput(dbp->mpf, h, 0);
+
+			if ((ret =
+			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
+			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 				goto err;
 		}
 
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
 			goto err;
 	}
 	/* NOTREACHED */
 
-err:	BT_STK_POP(t);
-	__bam_stkrel(dbp);
+err:	BT_STK_POP(cp);
+	__bam_stkrel(dbc, 0);
 	return (ret);
 }
 
@@ -275,25 +276,29 @@ err:	BT_STK_POP(t);
  * __bam_adjust --
  *	Adjust the tree after adding or deleting a record.
  *
- * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int32_t));
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
  */
 int
-__bam_adjust(dbp, t, adjust)
-	DB *dbp;
-	BTREE *t;
+__bam_adjust(dbc, adjust)
+	DBC *dbc;
 	int32_t adjust;
 {
+	CURSOR *cp;
+	DB *dbp;
 	EPG *epg;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+
 	/* Update the record counts for the tree. */
-	for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) {
+	for (epg = cp->sp; epg <= cp->csp; ++epg) {
 		h = epg->page;
 		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
-			if (DB_LOGGING(dbp) &&
+			if (DB_LOGGING(dbc) &&
 			    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
-			    dbp->txn, &LSN(h), 0, dbp->log_fileid,
+			    dbc->txn, &LSN(h), 0, dbp->log_fileid,
 			    PGNO(h), &LSN(h), (u_int32_t)epg->indx,
 			    adjust, 1)) != 0)
 				return (ret);
@@ -317,28 +322,31 @@ __bam_adjust(dbp, t, adjust)
  * __bam_nrecs --
  *	Return the number of records in the tree.
  *
- * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *));
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
  */
 int
-__bam_nrecs(dbp, rep)
-	DB *dbp;
+__bam_nrecs(dbc, rep)
+	DBC *dbc;
 	db_recno_t *rep;
 {
+	DB *dbp;
 	DB_LOCK lock;
 	PAGE *h;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	pgno = PGNO_ROOT;
-	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
 		return (ret);
-	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 		return (ret);
 
 	*rep = RE_NREC(h);
 
 	(void)memp_fput(dbp->mpf, h, 0);
-	(void)__BT_TLPUT(dbp, lock);
+	(void)__BT_TLPUT(dbc, lock);
 
 	return (0);
 }
diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c
index 09ce46d90a..1f439a4261 100644
--- a/db2/btree/bt_search.c
+++ b/db2/btree/bt_search.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_search.c	10.15 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_search.c	10.25 (Sleepycat) 12/16/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -65,38 +65,41 @@ static const char sccsid[] = "@(#)bt_search.c	10.15 (Sleepycat) 5/6/98";
  * __bam_search --
  *	Search a btree for a key.
  *
- * PUBLIC: int __bam_search __P((DB *,
+ * PUBLIC: int __bam_search __P((DBC *,
  * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
  */
 int
-__bam_search(dbp, key, flags, stop, recnop, exactp)
-	DB *dbp;
+__bam_search(dbc, key, flags, stop, recnop, exactp)
+	DBC *dbc;
 	const DBT *key;
 	u_int32_t flags;
 	int stop, *exactp;
 	db_recno_t *recnop;
 {
 	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	DB_LOCK lock;
-	EPG cur;
 	PAGE *h;
 	db_indx_t base, i, indx, lim;
 	db_pgno_t pg;
 	db_recno_t recno;
 	int cmp, jump, ret, stack;
 
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 	t = dbp->internal;
 	recno = 0;
 
-	BT_STK_CLR(t);
+	BT_STK_CLR(cp);
 
 	/*
 	 * There are several ways we search a btree tree.  The flags argument
 	 * specifies if we're acquiring read or write locks, if we position
 	 * to the first or last item in a set of duplicates, if we return
-	 * deleted items, and if we are locking pairs of pages.  See btree.h
-	 * for more details.  In addition, if we're doing record numbers, we
-	 * have to lock the entire tree regardless.
+	 * deleted items, and if we are locking pairs of pages.  In addition,
+	 * if we're modifying record numbers, we have to lock the entire tree
+	 * regardless.  See btree.h for more details.
 	 *
 	 * If write-locking pages, we need to know whether or not to acquire a
 	 * write lock on a page before getting it.  This depends on how deep it
@@ -108,11 +111,11 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
 	 */
 	pg = PGNO_ROOT;
 	stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK);
-	if ((ret = __bam_lget(dbp,
+	if ((ret = __bam_lget(dbc,
 	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 		return (ret);
-	if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-		(void)__BT_LPUT(dbp, lock);
+	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+		(void)__BT_LPUT(dbc, lock);
 		return (ret);
 	}
 
@@ -128,14 +131,13 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
 	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
 	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_LPUT(dbp, lock);
-		if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
+		(void)__BT_LPUT(dbc, lock);
+		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
 			return (ret);
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-			(void)__BT_LPUT(dbp, lock);
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+			(void)__BT_LPUT(dbc, lock);
 			return (ret);
 		}
-
 		stack = 1;
 	}
 
@@ -147,12 +149,12 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
 		 * per page item.  If we find an exact match on a leaf page,
 		 * we're done.
 		 */
-		cur.page = h;
 		jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
 		for (base = 0,
 		    lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) {
-			cur.indx = indx = base + ((lim >> 1) * jump);
-			if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) {
+			indx = base + ((lim >> 1) * jump);
+			if ((cmp =
+			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) {
 				if (TYPE(h) == P_LBTREE)
 					goto match;
 				goto next;
@@ -184,7 +186,7 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
 			 * to find an undeleted record.  This is handled in the
 			 * __bam_c_search() routine.
 			 */
-			BT_STK_ENTER(t, h, base, lock, ret);
+			BT_STK_ENTER(cp, h, base, lock, ret);
 			return (ret);
 		}
 
@@ -208,39 +210,39 @@ next:		pg = GET_BINTERNAL(h, indx)->pgno;
 		if (stack) {
 			/* Return if this is the lowest page wanted. */
 			if (LF_ISSET(S_PARENT) && stop == h->level) {
-				BT_STK_ENTER(t, h, indx, lock, ret);
+				BT_STK_ENTER(cp, h, indx, lock, ret);
 				return (ret);
 			}
-			BT_STK_PUSH(t, h, indx, lock, ret);
+			BT_STK_PUSH(cp, h, indx, lock, ret);
 			if (ret != 0)
 				goto err;
 
 			if ((ret =
-			    __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
+			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
 				goto err;
 		} else {
-			(void)memp_fput(dbp->mpf, h, 0);
-
 			/*
-			 * Decide if we want to return a pointer to the next
-			 * page in the stack.  If we do, write lock it and
-			 * never unlock it.
+			 * Decide if we want to return a reference to the next
+			 * page in the return stack.  If so, lock it and never
+			 * unlock it.
 			 */
 			if ((LF_ISSET(S_PARENT) &&
 			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
 			    (h->level - 1) == LEAFLEVEL)
 				stack = 1;
 
+			(void)memp_fput(dbp->mpf, h, 0);
+
 			if ((ret =
-			    __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ?
+			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
 			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 				goto err;
 		}
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
 			goto err;
 	}
-
 	/* NOTREACHED */
+
 match:	*exactp = 1;
 
 	/*
@@ -288,17 +290,17 @@ match:	*exactp = 1;
 			goto notfound;
 	}
 
-	BT_STK_ENTER(t, h, indx, lock, ret);
+	BT_STK_ENTER(cp, h, indx, lock, ret);
 	return (ret);
 
 notfound:
 	(void)memp_fput(dbp->mpf, h, 0);
-	(void)__BT_LPUT(dbp, lock);
+	(void)__BT_LPUT(dbc, lock);
 	ret = DB_NOTFOUND;
 
-err:	if (t->bt_csp > t->bt_sp) {
-		BT_STK_POP(t);
-		__bam_stkrel(dbp);
+err:	if (cp->csp > cp->sp) {
+		BT_STK_POP(cp);
+		__bam_stkrel(dbc, 0);
 	}
 	return (ret);
 }
@@ -307,20 +309,35 @@ err:	if (t->bt_csp > t->bt_sp) {
  * __bam_stkrel --
  *	Release all pages currently held in the stack.
  *
- * PUBLIC: int __bam_stkrel __P((DB *));
+ * PUBLIC: int __bam_stkrel __P((DBC *, int));
  */
 int
-__bam_stkrel(dbp)
-	DB *dbp;
+__bam_stkrel(dbc, nolocks)
+	DBC *dbc;
+	int nolocks;
 {
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	EPG *epg;
 
-	t = dbp->internal;
-	for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) {
-		(void)memp_fput(dbp->mpf, epg->page, 0);
-		(void)__BT_TLPUT(dbp, epg->lock);
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+
+	/* Release inner pages first. */
+	for (epg = cp->sp; epg <= cp->csp; ++epg) {
+		if (epg->page != NULL)
+			(void)memp_fput(dbp->mpf, epg->page, 0);
+		if (epg->lock != LOCK_INVALID) {
+			if (nolocks)
+				(void)__BT_LPUT(dbc, epg->lock);
+			else
+				(void)__BT_TLPUT(dbc, epg->lock);
+		}
 	}
+
+	/* Clear the stack, all pages have been released. */
+	BT_STK_CLR(cp);
+
 	return (0);
 }
 
@@ -328,24 +345,25 @@ __bam_stkrel(dbp)
  * __bam_stkgrow --
  *	Grow the stack.
  *
- * PUBLIC: int __bam_stkgrow __P((BTREE *));
+ * PUBLIC: int __bam_stkgrow __P((CURSOR *));
  */
 int
-__bam_stkgrow(t)
-	BTREE *t;
+__bam_stkgrow(cp)
+	CURSOR *cp;
 {
 	EPG *p;
 	size_t entries;
+	int ret;
 
-	entries = t->bt_esp - t->bt_sp;
+	entries = cp->esp - cp->sp;
 
-	if ((p = (EPG *)__db_calloc(entries * 2, sizeof(EPG))) == NULL)
-		return (ENOMEM);
-	memcpy(p, t->bt_sp, entries * sizeof(EPG));
-	if (t->bt_sp != t->bt_stack)
-		FREE(t->bt_sp, entries * sizeof(EPG));
-	t->bt_sp = p;
-	t->bt_csp = p + entries;
-	t->bt_esp = p + entries * 2;
+	if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0)
+		return (ret);
+	memcpy(p, cp->sp, entries * sizeof(EPG));
+	if (cp->sp != cp->stack)
+		__os_free(cp->sp, entries * sizeof(EPG));
+	cp->sp = p;
+	cp->csp = p + entries;
+	cp->esp = p + entries * 2;
 	return (0);
 }
diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c
index da9417c781..1d8e926d85 100644
--- a/db2/btree/bt_split.c
+++ b/db2/btree/bt_split.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_split.c	10.23 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)bt_split.c	10.33 (Sleepycat) 10/13/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -59,27 +59,31 @@ static const char sccsid[] = "@(#)bt_split.c	10.23 (Sleepycat) 5/23/98";
 #include "db_page.h"
 #include "btree.h"
 
-static int __bam_page __P((DB *, EPG *, EPG *));
-static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *));
-static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int));
-static int __bam_root __P((DB *, EPG *));
+static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
+static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
 
 /*
  * __bam_split --
  *	Split a page.
  *
- * PUBLIC: int __bam_split __P((DB *, void *));
+ * PUBLIC: int __bam_split __P((DBC *, void *));
  */
 int
-__bam_split(dbp, arg)
-	DB *dbp;
+__bam_split(dbc, arg)
+	DBC *dbc;
 	void *arg;
 {
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	enum { UP, DOWN } dir;
 	int exact, level, ret;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 
 	/*
 	 * The locking protocol we use to avoid deadlock to acquire locks by
@@ -113,15 +117,16 @@ __bam_split(dbp, arg)
 		 * Acquire a page and its parent, locked.
 		 */
 		if ((ret = (dbp->type == DB_BTREE ?
-		    __bam_search(dbp, arg, S_WRPAIR, level, NULL, &exact) :
-		    __bam_rsearch(dbp,
+		    __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
+		    __bam_rsearch(dbc,
 		        (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
 			return (ret);
 
 		/* Split the page. */
-		ret = t->bt_csp[0].page->pgno == PGNO_ROOT ?
-		    __bam_root(dbp, &t->bt_csp[0]) :
-		    __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]);
+		ret = cp->csp[0].page->pgno == PGNO_ROOT ?
+		    __bam_root(dbc, &cp->csp[0]) :
+		    __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+		BT_STK_CLR(cp);
 
 		switch (ret) {
 		case 0:
@@ -155,15 +160,16 @@ __bam_split(dbp, arg)
  *	Split the root page of a btree.
  */
 static int
-__bam_root(dbp, cp)
-	DB *dbp;
+__bam_root(dbc, cp)
+	DBC *dbc;
 	EPG *cp;
 {
-	BTREE *t;
+	DB *dbp;
 	PAGE *lp, *rp;
+	db_indx_t split;
 	int ret;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
 
 	/* Yeah, right. */
 	if (cp->page->level >= MAXBTREELEVEL) {
@@ -173,8 +179,8 @@ __bam_root(dbp, cp)
 
 	/* Create new left and right pages for the split. */
 	lp = rp = NULL;
-	if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 ||
-	    (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0)
+	if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 ||
+	    (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
 		goto err;
 	P_INIT(lp, dbp->pgsize, lp->pgno,
 	    PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
@@ -184,18 +190,18 @@ __bam_root(dbp, cp)
 	    cp->page->level, TYPE(cp->page));
 
 	/* Split the page. */
-	if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0)
+	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
 		goto err;
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		DBT __a;
 		DB_LSN __lsn;
 		memset(&__a, 0, sizeof(__a));
 		__a.data = cp->page;
 		__a.size = dbp->pgsize;
 		ZERO_LSN(__lsn);
-		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn,
+		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
 		    PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
 		    &__a)) != 0)
@@ -205,26 +211,27 @@ __bam_root(dbp, cp)
 
 	/* Clean up the new root page. */
 	if ((ret = (dbp->type == DB_RECNO ?
-	    __ram_root(dbp, cp->page, lp, rp) :
-	    __bam_broot(dbp, cp->page, lp, rp))) != 0)
+	    __ram_root(dbc, cp->page, lp, rp) :
+	    __bam_broot(dbc, cp->page, lp, rp))) != 0)
 		goto err;
 
+	/* Adjust any cursors.  Do it last so we don't have to undo it. */
+	__bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
+
 	/* Success -- write the real pages back to the store. */
 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
-	(void)__BT_TLPUT(dbp, cp->lock);
+	(void)__BT_TLPUT(dbc, cp->lock);
 	(void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
 
-	++t->lstat.bt_split;
-	++t->lstat.bt_rootsplit;
 	return (0);
 
 err:	if (lp != NULL)
-		(void)__bam_free(dbp, lp);
+		(void)__bam_free(dbc, lp);
 	if (rp != NULL)
-		(void)__bam_free(dbp, rp);
+		(void)__bam_free(dbc, rp);
 	(void)memp_fput(dbp->mpf, cp->page, 0);
-	(void)__BT_TLPUT(dbp, cp->lock);
+	(void)__BT_TLPUT(dbc, cp->lock);
 	return (ret);
 }
 
@@ -233,19 +240,22 @@ err:	if (lp != NULL)
  *	Split the non-root page of a btree.
  */
 static int
-__bam_page(dbp, pp, cp)
-	DB *dbp;
+__bam_page(dbc, pp, cp)
+	DBC *dbc;
 	EPG *pp, *cp;
 {
+	DB *dbp;
 	DB_LOCK tplock;
 	PAGE *lp, *rp, *tp;
+	db_indx_t split;
 	int ret;
 
+	dbp = dbc->dbp;
 	lp = rp = tp = NULL;
 	ret = -1;
 
 	/* Create new right page for the split. */
-	if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0)
+	if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
 		goto err;
 	P_INIT(rp, dbp->pgsize, rp->pgno,
 	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
@@ -253,13 +263,8 @@ __bam_page(dbp, pp, cp)
 	    cp->page->level, TYPE(cp->page));
 
 	/* Create new left page for the split. */
-	if ((lp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) {
-		ret = ENOMEM;
+	if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0)
 		goto err;
-	}
-#ifdef DIAGNOSTIC
-	memset(lp, 0xff, dbp->pgsize);
-#endif
 	P_INIT(lp, dbp->pgsize, cp->page->pgno,
 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : cp->page->prev_pgno,
 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : rp->pgno,
@@ -276,7 +281,7 @@ __bam_page(dbp, pp, cp)
 	 * change, we swap the original and the allocated left page after the
 	 * split.
 	 */
-	if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0)
+	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
 		goto err;
 
 	/*
@@ -293,19 +298,19 @@ __bam_page(dbp, pp, cp)
 	 * the page we're splitting.
 	 */
 	if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
-		if ((ret = __bam_lget(dbp,
+		if ((ret = __bam_lget(dbc,
 		    0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
 			goto err;
-		if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0)
 			goto err;
 	}
 
 	/* Insert the new pages into the parent page. */
-	if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0)
+	if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0)
 		goto err;
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		DBT __a;
 		DB_LSN __lsn;
 		memset(&__a, 0, sizeof(__a));
@@ -313,7 +318,7 @@ __bam_page(dbp, pp, cp)
 		__a.size = dbp->pgsize;
 		if (tp == NULL)
 			ZERO_LSN(__lsn);
-		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn,
+		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
 		    &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
 		    &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
 		    tp == NULL ? 0 : PGNO(tp),
@@ -329,56 +334,69 @@ __bam_page(dbp, pp, cp)
 	memcpy(cp->page, lp, LOFFSET(lp));
 	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
 	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
-	FREE(lp, dbp->pgsize);
+	__os_free(lp, dbp->pgsize);
 	lp = NULL;
 
 	/* Finish the next-page link. */
 	if (tp != NULL)
 		tp->prev_pgno = rp->pgno;
 
+	/* Adjust any cursors.  Do so last so we don't have to undo it. */
+	__bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0);
+
 	/* Success -- write the real pages back to the store. */
 	(void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
-	(void)__BT_TLPUT(dbp, pp->lock);
+	(void)__BT_TLPUT(dbc, pp->lock);
 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
-	(void)__BT_TLPUT(dbp, cp->lock);
+	(void)__BT_TLPUT(dbc, cp->lock);
 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
 	if (tp != NULL) {
 		(void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
-		(void)__BT_TLPUT(dbp, tplock);
+		(void)__BT_TLPUT(dbc, tplock);
 	}
 	return (0);
 
 err:	if (lp != NULL)
-		FREE(lp, dbp->pgsize);
+		__os_free(lp, dbp->pgsize);
 	if (rp != NULL)
-		(void)__bam_free(dbp, rp);
+		(void)__bam_free(dbc, rp);
 	if (tp != NULL) {
 		(void)memp_fput(dbp->mpf, tp, 0);
-		(void)__BT_TLPUT(dbp, tplock);
+		if (ret == DB_NEEDSPLIT)
+			(void)__BT_LPUT(dbc, tplock);
+		else
+			(void)__BT_TLPUT(dbc, tplock);
 	}
 	(void)memp_fput(dbp->mpf, pp->page, 0);
-	(void)__BT_TLPUT(dbp, pp->lock);
+	if (ret == DB_NEEDSPLIT)
+		(void)__BT_LPUT(dbc, pp->lock);
+	else
+		(void)__BT_TLPUT(dbc, pp->lock);
 	(void)memp_fput(dbp->mpf, cp->page, 0);
-	(void)__BT_TLPUT(dbp, cp->lock);
+	if (ret == DB_NEEDSPLIT)
+		(void)__BT_LPUT(dbc, cp->lock);
+	else
+		(void)__BT_TLPUT(dbc, cp->lock);
 	return (ret);
 }
 
 /*
  * __bam_broot --
  *	Fix up the btree root page after it has been split.
- *
- * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *));
  */
-int
-__bam_broot(dbp, rootp, lp, rp)
-	DB *dbp;
+static int
+__bam_broot(dbc, rootp, lp, rp)
+	DBC *dbc;
 	PAGE *rootp, *lp, *rp;
 {
 	BINTERNAL bi, *child_bi;
 	BKEYDATA *child_bk;
+	DB *dbp;
 	DBT hdr, data;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * If the root page was a leaf page, change it into an internal page.
 	 * We copy the key we split on (but not the key's data, in the case of
@@ -405,7 +423,7 @@ __bam_broot(dbp, rootp, lp, rp)
 	hdr.data = &bi;
 	hdr.size = SSZA(BINTERNAL, data);
 	if ((ret =
-	    __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
+	    __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
 		return (ret);
 
 	switch (TYPE(rp)) {
@@ -424,13 +442,13 @@ __bam_broot(dbp, rootp, lp, rp)
 		hdr.size = SSZA(BINTERNAL, data);
 		data.data = child_bi->data;
 		data.size = child_bi->len;
-		if ((ret = __db_pitem(dbp, rootp, 1,
+		if ((ret = __db_pitem(dbc, rootp, 1,
 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
 			return (ret);
 
 		/* Increment the overflow ref count. */
 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
-			if ((ret = __db_ovref(dbp,
+			if ((ret = __db_ovref(dbc,
 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
 				return (ret);
 		break;
@@ -450,7 +468,7 @@ __bam_broot(dbp, rootp, lp, rp)
 			hdr.size = SSZA(BINTERNAL, data);
 			data.data = child_bk->data;
 			data.size = child_bk->len;
-			if ((ret = __db_pitem(dbp, rootp, 1,
+			if ((ret = __db_pitem(dbc, rootp, 1,
 			    BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
 				return (ret);
 			break;
@@ -467,13 +485,13 @@ __bam_broot(dbp, rootp, lp, rp)
 			hdr.size = SSZA(BINTERNAL, data);
 			data.data = child_bk;
 			data.size = BOVERFLOW_SIZE;
-			if ((ret = __db_pitem(dbp, rootp, 1,
+			if ((ret = __db_pitem(dbc, rootp, 1,
 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
 				return (ret);
 
 			/* Increment the overflow ref count. */
 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
-				if ((ret = __db_ovref(dbp,
+				if ((ret = __db_ovref(dbc,
 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
 					return (ret);
 			break;
@@ -490,18 +508,19 @@ __bam_broot(dbp, rootp, lp, rp)
 /*
  * __ram_root --
  *	Fix up the recno root page after it has been split.
- *
- * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *));
  */
-int
-__ram_root(dbp, rootp, lp, rp)
-	DB *dbp;
+static int
+__ram_root(dbc, rootp, lp, rp)
+	DBC *dbc;
 	PAGE *rootp, *lp, *rp;
 {
+	DB *dbp;
 	DBT hdr;
 	RINTERNAL ri;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/* Initialize the page. */
 	P_INIT(rootp, dbp->pgsize,
 	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
@@ -514,12 +533,12 @@ __ram_root(dbp, rootp, lp, rp)
 	/* Insert the left and right keys, set the header information. */
 	ri.pgno = lp->pgno;
 	ri.nrecs = __bam_total(lp);
-	if ((ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+	if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
 		return (ret);
 	RE_NREC_SET(rootp, ri.nrecs);
 	ri.pgno = rp->pgno;
 	ri.nrecs = __bam_total(rp);
-	if ((ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+	if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
 		return (ret);
 	RE_NREC_ADJ(rootp, ri.nrecs);
 	return (0);
@@ -530,14 +549,15 @@ __ram_root(dbp, rootp, lp, rp)
  *	Insert a new key into a parent page, completing the split.
  */
 static int
-__bam_pinsert(dbp, parent, lchild, rchild)
-	DB *dbp;
+__bam_pinsert(dbc, parent, lchild, rchild)
+	DBC *dbc;
 	EPG *parent;
 	PAGE *lchild, *rchild;
 {
 	BINTERNAL bi, *child_bi;
 	BKEYDATA *child_bk, *tmp_bk;
 	BTREE *t;
+	DB *dbp;
 	DBT a, b, hdr, data;
 	PAGE *ppage;
 	RINTERNAL ri;
@@ -546,6 +566,7 @@ __bam_pinsert(dbp, parent, lchild, rchild)
 	u_int32_t n, nbytes, nksize;
 	int ret;
 
+	dbp = dbc->dbp;
 	t = dbp->internal;
 	ppage = parent->page;
 
@@ -600,13 +621,13 @@ __bam_pinsert(dbp, parent, lchild, rchild)
 		memset(&data, 0, sizeof(data));
 		data.data = child_bi->data;
 		data.size = child_bi->len;
-		if ((ret = __db_pitem(dbp, ppage, off,
+		if ((ret = __db_pitem(dbc, ppage, off,
 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
 			return (ret);
 
 		/* Increment the overflow ref count. */
 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
-			if ((ret = __db_ovref(dbp,
+			if ((ret = __db_ovref(dbc,
 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
 				return (ret);
 		break;
@@ -630,10 +651,9 @@ __bam_pinsert(dbp, parent, lchild, rchild)
 			b.size = child_bk->len;
 			b.data = child_bk->data;
 			nksize = t->bt_prefix(&a, &b);
-			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) {
-				t->lstat.bt_pfxsaved += nbytes - n;
+			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
 				nbytes = n;
-			} else
+			else
 noprefix:			nksize = child_bk->len;
 
 			if (P_FREESPACE(ppage) < nbytes)
@@ -650,7 +670,7 @@ noprefix:			nksize = child_bk->len;
 			memset(&data, 0, sizeof(data));
 			data.data = child_bk->data;
 			data.size = nksize;
-			if ((ret = __db_pitem(dbp, ppage, off,
+			if ((ret = __db_pitem(dbc, ppage, off,
 			    BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
 				return (ret);
 			break;
@@ -672,13 +692,13 @@ noprefix:			nksize = child_bk->len;
 			memset(&data, 0, sizeof(data));
 			data.data = child_bk;
 			data.size = BOVERFLOW_SIZE;
-			if ((ret = __db_pitem(dbp, ppage, off,
+			if ((ret = __db_pitem(dbc, ppage, off,
 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
 				return (ret);
 
 			/* Increment the overflow ref count. */
 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
-				if ((ret = __db_ovref(dbp,
+				if ((ret = __db_ovref(dbc,
 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
 					return (ret);
 			break;
@@ -699,7 +719,7 @@ noprefix:			nksize = child_bk->len;
 		hdr.size = RINTERNAL_SIZE;
 		ri.pgno = rchild->pgno;
 		ri.nrecs = nrecs;
-		if ((ret = __db_pitem(dbp,
+		if ((ret = __db_pitem(dbc,
 		    ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
 			return (ret);
 		break;
@@ -710,9 +730,9 @@ noprefix:			nksize = child_bk->len;
 	/* Adjust the parent page's left page record count. */
 	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
 		/* Log the change. */
-		if (DB_LOGGING(dbp) &&
+		if (DB_LOGGING(dbc) &&
 		    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(ppage), 0, dbp->log_fileid,
+		    dbc->txn, &LSN(ppage), 0, dbp->log_fileid,
 		    PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
 		    -(int32_t)nrecs, (int32_t)0)) != 0)
 			return (ret);
@@ -732,18 +752,18 @@ noprefix:			nksize = child_bk->len;
  *	Do the real work of splitting the page.
  */
 static int
-__bam_psplit(dbp, cp, lp, rp, cleft)
-	DB *dbp;
+__bam_psplit(dbc, cp, lp, rp, splitret)
+	DBC *dbc;
 	EPG *cp;
 	PAGE *lp, *rp;
-	int cleft;
+	db_indx_t *splitret;
 {
-	BTREE *t;
+	DB *dbp;
 	PAGE *pp;
 	db_indx_t half, nbytes, off, splitp, top;
 	int adjust, cnt, isbigkey, ret;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
 	pp = cp->page;
 	adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
 
@@ -762,11 +782,8 @@ __bam_psplit(dbp, cp, lp, rp, cleft)
 	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
 		off = adjust;
 
-	++t->lstat.bt_split;
-	if (off != 0) {
-		++t->lstat.bt_fastsplit;
+	if (off != 0)
 		goto sort;
-	}
 
 	/*
 	 * Split the data to the left and right pages.  Try not to split on
@@ -887,8 +904,7 @@ sort:	splitp = off;
 	if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
 		return (ret);
 
-	/* Adjust the cursors. */
-	__bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft);
+	*splitret = splitp;
 	return (0);
 }
 
diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c
index 2236434b38..855ef40bbd 100644
--- a/db2/btree/bt_stat.c
+++ b/db2/btree/bt_stat.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_stat.c	10.17 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)bt_stat.c	10.27 (Sleepycat) 11/25/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -22,8 +22,6 @@ static const char sccsid[] = "@(#)bt_stat.c	10.17 (Sleepycat) 4/26/98";
 #include "db_page.h"
 #include "btree.h"
 
-static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *));
-
 /*
  * __bam_stat --
  *	Gather/print the btree statistics
@@ -31,62 +29,62 @@ static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *));
  * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
  */
 int
-__bam_stat(argdbp, spp, db_malloc, flags)
-	DB *argdbp;
+__bam_stat(dbp, spp, db_malloc, flags)
+	DB *dbp;
 	void *spp;
 	void *(*db_malloc) __P((size_t));
 	u_int32_t flags;
 {
 	BTMETA *meta;
 	BTREE *t;
-	DB *dbp;
+	DBC *dbc;
 	DB_BTREE_STAT *sp;
 	DB_LOCK lock;
 	PAGE *h;
 	db_pgno_t lastpgno, pgno;
-	int ret;
+	int ret, t_ret;
 
-	DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags);
+	DB_PANIC_CHECK(dbp);
 
 	/* Check for invalid flags. */
-	if ((ret = __db_statchk(argdbp, flags)) != 0)
+	if ((ret = __db_statchk(dbp, flags)) != 0)
 		return (ret);
 
-	if (spp == NULL)
-		return (0);
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags);
 
-	GETHANDLE(argdbp, NULL, &dbp, ret);
 	t = dbp->internal;
 
+	if (spp == NULL)
+		return (0);
+
 	/* Allocate and clear the structure. */
-	if ((sp = db_malloc == NULL ?
-	    (DB_BTREE_STAT *)__db_malloc(sizeof(*sp)) :
-	    (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) {
-		ret = ENOMEM;
+	if ((ret = __os_malloc(sizeof(*sp), db_malloc, &sp)) != 0)
 		goto err;
-	}
 	memset(sp, 0, sizeof(*sp));
 
 	/* If the app just wants the record count, make it fast. */
-	if (LF_ISSET(DB_RECORDCOUNT)) {
+	if (flags == DB_RECORDCOUNT) {
 		pgno = PGNO_ROOT;
-		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
 			goto err;
-		if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&h)) != 0)
 			goto err;
 
 		sp->bt_nrecs = RE_NREC(h);
 
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_LPUT(dbp, lock);
+		(void)__BT_LPUT(dbc, lock);
 		goto done;
 	}
 
 	/* Get the meta-data page. */
 	pgno = PGNO_METADATA;
-	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
 		goto err;
-	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0)
 		goto err;
 
 	/* Translate the metadata flags. */
@@ -110,24 +108,13 @@ __bam_stat(argdbp, spp, db_malloc, flags)
 	/* Get the page size from the DB. */
 	sp->bt_pagesize = dbp->pgsize;
 
-	/* Initialize counters with the meta-data page information. */
-	__bam_add_rstat(&meta->stat, sp);
-
-	/*
-	 * Add in the local information from this handle.
-	 *
-	 * !!!
-	 * This is a bit odd, but it gets us closer to the truth.
-	 */
-	__bam_add_rstat(&t->lstat, sp);
-
 	/* Walk the free list, counting pages. */
 	for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) {
 		++sp->bt_free;
 
-		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
 			(void)memp_fput(dbp->mpf, meta, 0);
-			(void)__BT_TLPUT(dbp, lock);
+			(void)__BT_TLPUT(dbc, lock);
 			goto err;
 		}
 		pgno = h->next_pgno;
@@ -136,7 +123,7 @@ __bam_stat(argdbp, spp, db_malloc, flags)
 
 	/* Discard the meta-data page. */
 	(void)memp_fput(dbp->mpf, meta, 0);
-	(void)__BT_TLPUT(dbp, lock);
+	(void)__BT_TLPUT(dbc, lock);
 
 	/* Determine the last page of the database. */
 	if ((ret = memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &h)) != 0)
@@ -145,10 +132,10 @@ __bam_stat(argdbp, spp, db_malloc, flags)
 
 	/* Get the root page. */
 	pgno = PGNO_ROOT;
-	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0)
 		goto err;
-	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
-		(void)__BT_LPUT(dbp, lock);
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
+		(void)__BT_LPUT(dbc, lock);
 		goto err;
 	}
 
@@ -185,19 +172,19 @@ __bam_stat(argdbp, spp, db_malloc, flags)
 			break;
 		default:
 			(void)memp_fput(dbp->mpf, h, 0);
-			(void)__BT_LPUT(dbp, lock);
+			(void)__BT_LPUT(dbc, lock);
 			return (__db_pgfmt(dbp, pgno));
 		}
 
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)__BT_LPUT(dbp, lock);
+		(void)__BT_LPUT(dbc, lock);
 
 		if (++pgno > lastpgno)
 			break;
-		if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock))
+		if (__bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock))
 			break;
 		if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) {
-			(void)__BT_LPUT(dbp, lock);
+			(void)__BT_LPUT(dbc, lock);
 			break;
 		}
 	}
@@ -205,50 +192,7 @@ __bam_stat(argdbp, spp, db_malloc, flags)
 done:	*(DB_BTREE_STAT **)spp = sp;
 	ret = 0;
 
-err:	PUTHANDLE(dbp);
+err:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
 	return (ret);
 }
-
-/*
- * __bam_add_mstat --
- *	Add the local statistics to the meta-data page statistics.
- *
- * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *));
- */
-void
-__bam_add_mstat(from, to)
-	DB_BTREE_LSTAT *from;
-	DB_BTREE_LSTAT *to;
-{
-	to->bt_freed += from->bt_freed;
-	to->bt_pfxsaved += from->bt_pfxsaved;
-	to->bt_split += from->bt_split;
-	to->bt_rootsplit += from->bt_rootsplit;
-	to->bt_fastsplit += from->bt_fastsplit;
-	to->bt_added += from->bt_added;
-	to->bt_deleted += from->bt_deleted;
-	to->bt_get += from->bt_get;
-	to->bt_cache_hit += from->bt_cache_hit;
-	to->bt_cache_miss += from->bt_cache_miss;
-}
-
-/*
- * __bam_add_rstat --
- *	Add the local statistics to the returned statistics.
- */
-static void
-__bam_add_rstat(from, to)
-	DB_BTREE_LSTAT *from;
-	DB_BTREE_STAT *to;
-{
-	to->bt_freed += from->bt_freed;
-	to->bt_pfxsaved += from->bt_pfxsaved;
-	to->bt_split += from->bt_split;
-	to->bt_rootsplit += from->bt_rootsplit;
-	to->bt_fastsplit += from->bt_fastsplit;
-	to->bt_added += from->bt_added;
-	to->bt_deleted += from->bt_deleted;
-	to->bt_get += from->bt_get;
-	to->bt_cache_hit += from->bt_cache_hit;
-	to->bt_cache_miss += from->bt_cache_miss;
-}
diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c
index 75eadb1d62..95ea76e2cd 100644
--- a/db2/btree/btree_auto.c
+++ b/db2/btree/btree_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "btree.h"
@@ -43,8 +42,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_pg_alloc;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -55,8 +53,8 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(ptype)
 	    + sizeof(next);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -90,7 +88,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -135,7 +133,7 @@ __bam_pg_alloc_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tptype: %lu\n", (u_long)argp->ptype);
 	printf("\tnext: %lu\n", (u_long)argp->next);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -149,11 +147,12 @@ __bam_pg_alloc_read(recbuf, argpp)
 {
 	__bam_pg_alloc_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_pg_alloc_args *)__db_malloc(sizeof(__bam_pg_alloc_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_pg_alloc_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -206,8 +205,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_pg_free;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -217,8 +215,8 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*meta_lsn)
 	    + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
 	    + sizeof(next);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -255,7 +253,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -306,7 +304,7 @@ __bam_pg_free_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\n");
 	printf("\tnext: %lu\n", (u_long)argp->next);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -320,11 +318,12 @@ __bam_pg_free_read(recbuf, argpp)
 {
 	__bam_pg_free_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_pg_free_args *)__db_malloc(sizeof(__bam_pg_free_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_pg_free_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -383,8 +382,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_split;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -398,8 +396,8 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(npgno)
 	    + sizeof(*nlsn)
 	    + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -450,7 +448,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -507,7 +505,7 @@ __bam_split_print(notused1, dbtp, lsnp, notused2, notused3)
 	}
 	printf("\n");
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -521,11 +519,12 @@ __bam_split_read(recbuf, argpp)
 {
 	__bam_split_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_split_args *)__db_malloc(sizeof(__bam_split_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_split_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -587,8 +586,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_rsplit;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -599,8 +597,8 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(nrec)
 	    + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size)
 	    + sizeof(*rootlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -647,7 +645,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -707,7 +705,7 @@ __bam_rsplit_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\trootlsn: [%lu][%lu]\n",
 	    (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -721,11 +719,12 @@ __bam_rsplit_read(recbuf, argpp)
 {
 	__bam_rsplit_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_rsplit_args *)__db_malloc(sizeof(__bam_rsplit_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_rsplit_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -782,8 +781,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_adj;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -794,8 +792,8 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(indx)
 	    + sizeof(indx_copy)
 	    + sizeof(is_insert);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -826,7 +824,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -870,7 +868,7 @@ __bam_adj_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy);
 	printf("\tis_insert: %lu\n", (u_long)argp->is_insert);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -884,11 +882,12 @@ __bam_adj_read(recbuf, argpp)
 {
 	__bam_adj_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_adj_args *)__db_malloc(sizeof(__bam_adj_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_adj_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -941,8 +940,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_cadjust;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -953,8 +951,8 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(indx)
 	    + sizeof(adjust)
 	    + sizeof(total);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -985,7 +983,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1029,7 +1027,7 @@ __bam_cadjust_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tadjust: %ld\n", (long)argp->adjust);
 	printf("\ttotal: %ld\n", (long)argp->total);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1043,11 +1041,12 @@ __bam_cadjust_read(recbuf, argpp)
 {
 	__bam_cadjust_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_cadjust_args *)__db_malloc(sizeof(__bam_cadjust_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_cadjust_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1097,8 +1096,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_cdel;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1107,8 +1105,8 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(*lsn)
 	    + sizeof(indx);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1135,7 +1133,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1177,7 +1175,7 @@ __bam_cdel_print(notused1, dbtp, lsnp, notused2, notused3)
 	    (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
 	printf("\tindx: %lu\n", (u_long)argp->indx);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1191,11 +1189,12 @@ __bam_cdel_read(recbuf, argpp)
 {
 	__bam_cdel_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_cdel_args *)__db_malloc(sizeof(__bam_cdel_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_cdel_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1250,8 +1249,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_bam_repl;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1265,8 +1263,8 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size)
 	    + sizeof(prefix)
 	    + sizeof(suffix);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1319,7 +1317,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1382,7 +1380,7 @@ __bam_repl_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tprefix: %lu\n", (u_long)argp->prefix);
 	printf("\tsuffix: %lu\n", (u_long)argp->suffix);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1396,11 +1394,12 @@ __bam_repl_read(recbuf, argpp)
 {
 	__bam_repl_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__bam_repl_args *)__db_malloc(sizeof(__bam_repl_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__bam_repl_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
diff --git a/db2/common/db_appinit.c b/db2/common/db_appinit.c
index 6ec007be0a..e02b1a872d 100644
--- a/db2/common/db_appinit.c
+++ b/db2/common/db_appinit.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_appinit.c	10.52 (Sleepycat) 6/2/98";
+static const char sccsid[] = "@(#)db_appinit.c	10.66 (Sleepycat) 12/7/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -16,7 +16,6 @@ static const char sccsid[] = "@(#)db_appinit.c	10.52 (Sleepycat) 6/2/98";
 
 #include <ctype.h>
 #include <errno.h>
-#include <signal.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -34,10 +33,22 @@ static const char sccsid[] = "@(#)db_appinit.c	10.52 (Sleepycat) 6/2/98";
 
 static int __db_home __P((DB_ENV *, const char *, u_int32_t));
 static int __db_parse __P((DB_ENV *, char *));
-static int __db_tmp_dir __P((DB_ENV *, u_int32_t));
 static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, int *));
 
 /*
+ * Conflict matrix for concurrent db access (cdb), installed as the
+ * environment's lk_conflicts table when DB_INIT_CDB is specified.  It
+ * uses the db_rw_conflict lock modes plus IW, used for write cursors.
+ */
+static u_int8_t const db_cdb_conflicts[] = {
+	/*		N   R   W  IW */
+	/*    N */	0,  0,  0,  0,
+	/*    R */	0,  0,  1,  0,
+	/*    W */	0,  1,  1,  1,
+	/*   IW */	0,  0,  1,  1
+};
+
+/*
  * db_version --
  *	Return version information.
  */
@@ -70,21 +81,24 @@ db_appinit(db_home, db_config, dbenv, flags)
 	char * const *p;
 	char *lp, buf[MAXPATHLEN * 2];
 
+	fp = NULL;
+
 	/* Validate arguments. */
 	if (dbenv == NULL)
 		return (EINVAL);
 
-
 #ifdef HAVE_SPINLOCKS
 #define	OKFLAGS								\
-   (DB_CREATE | DB_NOMMAP | DB_THREAD | DB_INIT_LOCK | DB_INIT_LOG |	\
-    DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER |	\
-    DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+    (DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG |		\
+    DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_NOMMAP |	\
+    DB_RECOVER | DB_RECOVER_FATAL | DB_THREAD | DB_TXN_NOSYNC |		\
+    DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
 #else
 #define	OKFLAGS								\
-   (DB_CREATE | DB_NOMMAP | DB_INIT_LOCK | DB_INIT_LOG |		\
-    DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER |	\
-    DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+    (DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG |		\
+    DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_NOMMAP |	\
+    DB_RECOVER | DB_RECOVER_FATAL | DB_TXN_NOSYNC |			\
+    DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
 #endif
 	if ((ret = __db_fchk(dbenv, "db_appinit", flags, OKFLAGS)) != 0)
 		return (ret);
@@ -97,8 +111,6 @@ db_appinit(db_home, db_config, dbenv, flags)
 	if (LF_ISSET(DB_THREAD))
 		F_SET(dbenv, DB_ENV_THREAD);
 
-	fp = NULL;
-
 	/* Set the database home. */
 	if ((ret = __db_home(dbenv, db_home, flags)) != 0)
 		goto err;
@@ -127,8 +139,17 @@ db_appinit(db_home, db_config, dbenv, flags)
 		(void)strcat(buf, CONFIG_NAME);
 		if ((fp = fopen(buf, "r")) != NULL) {
 			while (fgets(buf, sizeof(buf), fp) != NULL) {
-				if ((lp = strchr(buf, '\n')) != NULL)
-					*lp = '\0';
+				if ((lp = strchr(buf, '\n')) == NULL) {
+					__db_err(dbenv,
+					    "%s: line too long", CONFIG_NAME);
+					ret = EINVAL;
+					goto err;
+				}
+				*lp = '\0';
+				if (buf[0] == '\0' ||
+				    buf[0] == '#' || isspace(buf[0]))
+					continue;
+
 				if ((ret = __db_parse(dbenv, buf)) != 0)
 					goto err;
 			}
@@ -138,11 +159,14 @@ db_appinit(db_home, db_config, dbenv, flags)
 	}
 
 	/* Set up the tmp directory path. */
-	if (dbenv->db_tmp_dir == NULL &&
-	    (ret = __db_tmp_dir(dbenv, flags)) != 0)
+	if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(dbenv, flags)) != 0)
 		goto err;
 
-	/* Indicate that the path names have been set. */
+	/*
+	 * Flag that the structure has been initialized by the application.
+	 * Note, this must be set before calling into the subsystems as it
+	 * is used when we're doing file naming.
+	 */
 	F_SET(dbenv, DB_ENV_APPINIT);
 
 	/*
@@ -166,6 +190,18 @@ db_appinit(db_home, db_config, dbenv, flags)
 	 * Default permissions are read-write for both owner and group.
 	 */
 	mode = __db_omode("rwrw--");
+	if (LF_ISSET(DB_INIT_CDB)) {
+		if (LF_ISSET(DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN)) {
+			ret = EINVAL;
+			goto err;
+		}
+		F_SET(dbenv, DB_ENV_CDB);
+		dbenv->lk_conflicts = db_cdb_conflicts;
+		dbenv->lk_modes = DB_LOCK_RW_N + 1;
+		if ((ret = lock_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD),
+		    mode, dbenv, &dbenv->lk_info)) != 0)
+			goto err;
+	}
 	if (LF_ISSET(DB_INIT_LOCK) && (ret = lock_open(NULL,
 	    LF_ISSET(DB_CREATE | DB_THREAD),
 	    mode, dbenv, &dbenv->lk_info)) != 0)
@@ -232,28 +268,32 @@ db_appexit(dbenv)
 	if (dbenv->tx_info && (t_ret = txn_close(dbenv->tx_info)) != 0)
 		if (ret == 0)
 			ret = t_ret;
-	if (dbenv->mp_info && (t_ret = memp_close(dbenv->mp_info)) != 0)
+	if (dbenv->lg_info && (t_ret = log_close(dbenv->lg_info)) != 0)
 		if (ret == 0)
 			ret = t_ret;
-	if (dbenv->lg_info && (t_ret = log_close(dbenv->lg_info)) != 0)
+	if (dbenv->mp_info && (t_ret = memp_close(dbenv->mp_info)) != 0)
 		if (ret == 0)
 			ret = t_ret;
 	if (dbenv->lk_info && (t_ret = lock_close(dbenv->lk_info)) != 0)
 		if (ret == 0)
 			ret = t_ret;
 
+	/* Clear initialized flag (after subsystems, it affects naming). */
+	F_CLR(dbenv, DB_ENV_APPINIT);
+
 	/* Free allocated memory. */
 	if (dbenv->db_home != NULL)
-		FREES(dbenv->db_home);
+		__os_freestr(dbenv->db_home);
 	if ((p = dbenv->db_data_dir) != NULL) {
 		for (; *p != NULL; ++p)
-			FREES(*p);
-		FREE(dbenv->db_data_dir, dbenv->data_cnt * sizeof(char **));
+			__os_freestr(*p);
+		__os_free(dbenv->db_data_dir,
+		    dbenv->data_cnt * sizeof(char **));
 	}
 	if (dbenv->db_log_dir != NULL)
-		FREES(dbenv->db_log_dir);
+		__os_freestr(dbenv->db_log_dir);
 	if (dbenv->db_tmp_dir != NULL)
-		FREES(dbenv->db_tmp_dir);
+		__os_freestr(dbenv->db_tmp_dir);
 
 	return (ret);
 }
@@ -261,7 +301,7 @@ db_appexit(dbenv)
 #define	DB_ADDSTR(str) {						\
 	if ((str) != NULL) {						\
 		/* If leading slash, start over. */			\
-		if (__db_abspath(str)) {				\
+		if (__os_abspath(str)) {				\
 			p = start;					\
 			slash = 0;					\
 		}							\
@@ -317,10 +357,9 @@ __db_appname(dbenv, appname, dir, file, tmp_oflags, fdp, namep)
 	 * path, we're done.  If the directory is, simply append the file and
 	 * return.
 	 */
-	if (file != NULL && __db_abspath(file))
-		return ((*namep =
-		    (char *)__db_strdup(file)) == NULL ? ENOMEM : 0);
-	if (dir != NULL && __db_abspath(dir)) {
+	if (file != NULL && __os_abspath(file))
+		return (__os_strdup(file, namep));
+	if (dir != NULL && __os_abspath(dir)) {
 		a = dir;
 		goto done;
 	}
@@ -417,7 +456,7 @@ retry:	switch (appname) {
 	if (0) {
 tmp:		if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_APPINIT)) {
 			memset(&etmp, 0, sizeof(etmp));
-			if ((ret = __db_tmp_dir(&etmp, DB_USE_ENVIRON)) != 0)
+			if ((ret = __os_tmpdir(&etmp, DB_USE_ENVIRON)) != 0)
 				return (ret);
 			tmp_free = 1;
 			a = etmp.db_tmp_dir;
@@ -437,12 +476,11 @@ done:	len =
 	 * name.
 	 */
 #define	DB_TRAIL	"XXXXXX"
-	if ((start =
-	    (char *)__db_malloc(len + sizeof(DB_TRAIL) + 10)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
+	if ((ret =
+	    __os_malloc(len + sizeof(DB_TRAIL) + 10, NULL, &start)) != 0) {
 		if (tmp_free)
-			FREES(etmp.db_tmp_dir);
-		return (ENOMEM);
+			__os_freestr(etmp.db_tmp_dir);
+		return (ret);
 	}
 
 	slash = 0;
@@ -452,28 +490,32 @@ done:	len =
 	DB_ADDSTR(file);
 	*p = '\0';
 
+	/* Discard any space allocated to find the temp directory. */
+	if (tmp_free) {
+		__os_freestr(etmp.db_tmp_dir);
+		tmp_free = 0;
+	}
+
 	/*
 	 * If we're opening a data file, see if it exists.  If it does,
 	 * return it, otherwise, try and find another one to open.
 	 */
-	if (data_entry != -1 && __db_exists(start, NULL) != 0) {
-		FREES(start);
+	if (data_entry != -1 && __os_exists(start, NULL) != 0) {
+		__os_freestr(start);
 		a = b = c = NULL;
 		goto retry;
 	}
 
-	/* Discard any space allocated to find the temp directory. */
-	if (tmp_free)
-		FREES(etmp.db_tmp_dir);
-
 	/* Create the file if so requested. */
 	if (tmp_create &&
 	    (ret = __db_tmp_open(dbenv, tmp_oflags, start, fdp)) != 0) {
-		FREES(start);
+		__os_freestr(start);
 		return (ret);
 	}
 
-	if (namep != NULL)
+	if (namep == NULL)
+		__os_freestr(start);
+	else
 		*namep = start;
 	return (0);
 }
@@ -511,11 +553,7 @@ __db_home(dbenv, db_home, flags)
 	if (p == NULL)
 		return (0);
 
-	if ((dbenv->db_home = (char *)__db_strdup(p)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
-	return (0);
+	return (__os_strdup(p, &dbenv->db_home));
 }
 
 /*
@@ -530,152 +568,73 @@ __db_parse(dbenv, s)
 	int ret;
 	char *local_s, *name, *value, **p, *tp;
 
-	ret = 0;
-
 	/*
 	 * We need to strdup the argument in case the caller passed us
 	 * static data.
 	 */
-	if ((local_s = (char *)__db_strdup(s)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_strdup(s, &local_s)) != 0)
+		return (ret);
 
-	tp = local_s;
-	while ((name = strsep(&tp, " \t")) != NULL && *name == '\0')
+	/*
+	 * Name/value pairs are parsed as two white-space separated strings.
+	 * Leading and trailing white-space is trimmed from the value, but
+	 * it may contain embedded white-space.  Note: we use the isspace(3)
+	 * macro because it's more portable, but that means that you can use
+	 * characters like form-feed to separate the strings.
+	 */
+	name = local_s;
+	for (tp = name; *tp != '\0' && !isspace(*tp); ++tp)
+		;
+	if (*tp == '\0' || tp == name)
+		goto illegal;
+	*tp = '\0';
+	for (++tp; isspace(*tp); ++tp)
 		;
-	if (name == NULL)
+	if (*tp == '\0')
 		goto illegal;
-	while ((value = strsep(&tp, " \t")) != NULL && *value == '\0')
+	value = tp;
+	for (++tp; *tp != '\0'; ++tp)
+		;
+	for (--tp; isspace(*tp); --tp)
 		;
-	if (value == NULL) {
+	if (tp == value) {
 illegal:	ret = EINVAL;
 		__db_err(dbenv, "illegal name-value pair: %s", s);
 		goto err;
 	}
+	*++tp = '\0';
 
 #define	DATA_INIT_CNT	20			/* Start with 20 data slots. */
 	if (!strcmp(name, "DB_DATA_DIR")) {
 		if (dbenv->db_data_dir == NULL) {
-			if ((dbenv->db_data_dir =
-			    (char **)__db_calloc(DATA_INIT_CNT,
-			    sizeof(char **))) == NULL)
-				goto nomem;
+			if ((ret = __os_calloc(DATA_INIT_CNT,
+			    sizeof(char **), &dbenv->db_data_dir)) != 0)
+				goto err;
 			dbenv->data_cnt = DATA_INIT_CNT;
 		} else if (dbenv->data_next == dbenv->data_cnt - 1) {
 			dbenv->data_cnt *= 2;
-			if ((dbenv->db_data_dir =
-			    (char **)__db_realloc(dbenv->db_data_dir,
-			    dbenv->data_cnt * sizeof(char **))) == NULL)
-				goto nomem;
+			if ((ret = __os_realloc(&dbenv->db_data_dir,
+			    dbenv->data_cnt * sizeof(char **))) != 0)
+				goto err;
 		}
 		p = &dbenv->db_data_dir[dbenv->data_next++];
 	} else if (!strcmp(name, "DB_LOG_DIR")) {
 		if (dbenv->db_log_dir != NULL)
-			FREES(dbenv->db_log_dir);
+			__os_freestr(dbenv->db_log_dir);
 		p = &dbenv->db_log_dir;
 	} else if (!strcmp(name, "DB_TMP_DIR")) {
 		if (dbenv->db_tmp_dir != NULL)
-			FREES(dbenv->db_tmp_dir);
+			__os_freestr(dbenv->db_tmp_dir);
 		p = &dbenv->db_tmp_dir;
 	} else
 		goto err;
 
-	if ((*p = (char *)__db_strdup(value)) == NULL) {
-nomem:		ret = ENOMEM;
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-	}
+	ret = __os_strdup(value, p);
 
-err:	FREES(local_s);
+err:	__os_freestr(local_s);
 	return (ret);
 }
 
-#ifdef macintosh
-#include <TFileSpec.h>
-
-static char *sTempFolder;
-#endif
-
-/*
- * tmp --
- *	Set the temporary directory path.
- */
-static int
-__db_tmp_dir(dbenv, flags)
-	DB_ENV *dbenv;
-	u_int32_t flags;
-{
-	static const char * list[] = {	/* Ordered: see db_appinit(3). */
-		"/var/tmp",
-		"/usr/tmp",
-		"/temp",		/* WIN32. */
-		"/tmp",
-		"C:/temp",		/* WIN32. */
-		"C:/tmp",		/* WIN32. */
-		NULL
-	};
-	const char **lp, *p;
-
-	/* Use the environment if it's permitted and initialized. */
-	p = NULL;
-#ifdef HAVE_GETEUID
-	if (LF_ISSET(DB_USE_ENVIRON) ||
-	    (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0)) {
-#else
-	if (LF_ISSET(DB_USE_ENVIRON)) {
-#endif
-		if ((p = getenv("TMPDIR")) != NULL && p[0] == '\0') {
-			__db_err(dbenv, "illegal TMPDIR environment variable");
-			return (EINVAL);
-		}
-		/* WIN32 */
-		if (p == NULL && (p = getenv("TEMP")) != NULL && p[0] == '\0') {
-			__db_err(dbenv, "illegal TEMP environment variable");
-			return (EINVAL);
-		}
-		/* WIN32 */
-		if (p == NULL && (p = getenv("TMP")) != NULL && p[0] == '\0') {
-			__db_err(dbenv, "illegal TMP environment variable");
-			return (EINVAL);
-		}
-		/* Macintosh */
-		if (p == NULL &&
-		    (p = getenv("TempFolder")) != NULL && p[0] == '\0') {
-			__db_err(dbenv,
-			    "illegal TempFolder environment variable");
-			return (EINVAL);
-		}
-	}
-
-#ifdef macintosh
-	/* Get the path to the temporary folder. */
-	if (p == NULL) {
-		FSSpec spec;
-
-		if (!Special2FSSpec(kTemporaryFolderType,
-		    kOnSystemDisk, 0, &spec)) {
-			p = FSp2FullPath(&spec);
-			sTempFolder = __db_malloc(strlen(p) + 1);
-			strcpy(sTempFolder, p);
-			p = sTempFolder;
-		}
-	}
-#endif
-
-	/* Step through the list looking for a possibility. */
-	if (p == NULL)
-		for (lp = list; *lp != NULL; ++lp)
-			if (__db_exists(p = *lp, NULL) == 0)
-				break;
-
-	if (p == NULL)
-		return (0);
-
-	if ((dbenv->db_tmp_dir = (char *)__db_strdup(p)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
-	return (0);
-}
-
 /*
  * __db_tmp_open --
  *	Create a temporary file.
@@ -687,9 +646,6 @@ __db_tmp_open(dbenv, flags, path, fdp)
 	char *path;
 	int *fdp;
 {
-#ifdef HAVE_SIGFILLSET
-	sigset_t set, oset;
-#endif
 	u_long pid;
 	int mode, isdir, ret;
 	const char *p;
@@ -699,7 +655,7 @@ __db_tmp_open(dbenv, flags, path, fdp)
 	 * Check the target directory; if you have six X's and it doesn't
 	 * exist, this runs for a *very* long time.
 	 */
-	if ((ret = __db_exists(path, &isdir)) != 0) {
+	if ((ret = __os_exists(path, &isdir)) != 0) {
 		__db_err(dbenv, "%s: %s", path, strerror(ret));
 		return (ret);
 	}
@@ -738,27 +694,9 @@ __db_tmp_open(dbenv, flags, path, fdp)
 	LF_SET(DB_CREATE | DB_EXCL);
 	mode = __db_omode("rw----");
 
-	/*
-	 * Try to open a file.  We block every signal we can get our hands
-	 * on so that, if we're interrupted at the wrong time, the temporary
-	 * file isn't left around -- of course, if we drop core in-between
-	 * the calls we'll hang forever, but that's probably okay.  ;-}
-	 */
-#ifdef HAVE_SIGFILLSET
-	if (LF_ISSET(DB_TEMPORARY))
-		(void)sigfillset(&set);
-#endif
+	/* Loop, trying to open a file. */
 	for (;;) {
-#ifdef HAVE_SIGFILLSET
-		if (LF_ISSET(DB_TEMPORARY))
-			(void)sigprocmask(SIG_BLOCK, &set, &oset);
-#endif
-		ret = __db_open(path, flags, flags, mode, fdp);
-#ifdef HAVE_SIGFILLSET
-		if (LF_ISSET(DB_TEMPORARY))
-			(void)sigprocmask(SIG_SETMASK, &oset, NULL);
-#endif
-		if (ret == 0)
+		if ((ret = __db_open(path, flags, flags, mode, fdp)) == 0)
 			return (0);
 
 		/*
diff --git a/db2/common/db_apprec.c b/db2/common/db_apprec.c
index 7f0cb3a212..5e8fec4659 100644
--- a/db2/common/db_apprec.c
+++ b/db2/common/db_apprec.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_apprec.c	10.30 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)db_apprec.c	10.33 (Sleepycat) 10/5/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -44,7 +44,8 @@ __db_apprec(dbenv, flags)
 {
 	DBT data;
 	DB_LOG *lp;
-	DB_LSN ckp_lsn, first_lsn, lsn;
+	DB_LSN ckp_lsn, first_lsn, lsn, open_lsn;
+	__txn_ckp_args *ckp_args;
 	time_t now;
 	u_int32_t is_thread;
 	int ret;
@@ -65,10 +66,16 @@ __db_apprec(dbenv, flags)
 
 	/*
 	 * Recovery is done in three passes:
+	 * Pass #0:
+	 *	We need to find the position from which we will open files.
+	 *	We need to open files beginning with the next-to-last
+	 *	checkpoint because we might have crashed after writing the
+	 * 	last checkpoint record, but before having written out all
+	 *	the open file information.
 	 * Pass #1:
-	 *	Read forward through the log from the last checkpoint to the
-	 *	end of the log, opening and closing files so that at the end
-	 *	of the log we have the "current" set of files open.
+	 *	Read forward through the log from the second to last checkpoint
+	 *	opening and closing files so that at the end of the log we have
+	 *	the "current" set of files open.
 	 * Pass #2:
 	 *	Read backward through the log undoing any uncompleted TXNs.
 	 *	If doing catastrophic recovery, we read to the beginning of
@@ -84,33 +91,50 @@ __db_apprec(dbenv, flags)
 	 */
 
 	/*
-	 * Find the last checkpoint in the log.  This is the point from which
-	 * we want to begin pass #1 (the TXN_OPENFILES pass).
+	 * Find the second to last checkpoint in the log.  This is the point
+	 * from which we want to begin pass #1 (the TXN_OPENFILES pass).
 	 */
 	memset(&data, 0, sizeof(data));
+	ckp_args = NULL;
+
 	if ((ret = log_get(lp, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) {
 		/*
 		 * If we don't find a checkpoint, start from the beginning.
 		 * If that fails, we're done.  Note, we do not require that
 		 * there be log records if we're performing recovery.
 		 */
-		if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) {
+first:		if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) {
 			if (ret == DB_NOTFOUND)
 				ret = 0;
 			else
 				__db_err(dbenv, "First log record not found");
 			goto out;
 		}
-	}
+		open_lsn = ckp_lsn;
+	} else if ((ret = __txn_ckp_read(data.data, &ckp_args)) != 0) {
+		__db_err(dbenv, "Invalid checkpoint record at [%ld][%ld]\n",
+		    (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+		goto out;
+	} else if (IS_ZERO_LSN(ckp_args->last_ckp) ||
+		(ret = log_get(lp, &ckp_args->last_ckp, &data, DB_SET)) != 0)
+		goto first;
+	else
+		open_lsn = ckp_args->last_ckp;
 
 	/*
 	 * Now, ckp_lsn is either the lsn of the last checkpoint or the lsn
-	 * of the first record in the log.  Begin the TXN_OPENFILES pass from
-	 * that lsn, and proceed to the end of the log.
+	 * of the first record in the log.  Open_lsn is the second to last
+	 * checkpoint or the beginning of the log; begin the TXN_OPENFILES
+	 * pass from that lsn, and proceed to the end of the log.
 	 */
-	lsn = ckp_lsn;
+	lsn = open_lsn;
 	for (;;) {
-		ret = __db_dispatch(lp, &data, &lsn, TXN_OPENFILES, txninfo);
+		if (dbenv->tx_recover != NULL)
+			ret = dbenv->tx_recover(lp,
+			    &data, &lsn, TXN_OPENFILES, txninfo);
+		else
+			ret = __db_dispatch(lp,
+			    &data, &lsn, TXN_OPENFILES, txninfo);
 		if (ret != 0 && ret != DB_TXN_CKP)
 			goto msgerr;
 		if ((ret = log_get(lp, &lsn, &data, DB_NEXT)) != 0) {
@@ -148,8 +172,12 @@ __db_apprec(dbenv, flags)
 	for (ret = log_get(lp, &lsn, &data, DB_LAST);
 	    ret == 0 && log_compare(&lsn, &first_lsn) > 0;
 	    ret = log_get(lp, &lsn, &data, DB_PREV)) {
-		ret = __db_dispatch(lp,
-		    &data, &lsn, TXN_BACKWARD_ROLL, txninfo);
+		if (dbenv->tx_recover != NULL)
+			ret = dbenv->tx_recover(lp,
+			    &data, &lsn, TXN_BACKWARD_ROLL, txninfo);
+		else
+			ret = __db_dispatch(lp,
+			    &data, &lsn, TXN_BACKWARD_ROLL, txninfo);
 		if (ret != 0) {
 			if (ret != DB_TXN_CKP)
 				goto msgerr;
@@ -165,7 +193,12 @@ __db_apprec(dbenv, flags)
 	 */
 	for (ret = log_get(lp, &lsn, &data, DB_NEXT);
 	    ret == 0; ret = log_get(lp, &lsn, &data, DB_NEXT)) {
-		ret = __db_dispatch(lp, &data, &lsn, TXN_FORWARD_ROLL, txninfo);
+		if (dbenv->tx_recover != NULL)
+			ret = dbenv->tx_recover(lp,
+			    &data, &lsn, TXN_FORWARD_ROLL, txninfo);
+		else
+			ret = __db_dispatch(lp,
+			    &data, &lsn, TXN_FORWARD_ROLL, txninfo);
 		if (ret != 0) {
 			if (ret != DB_TXN_CKP)
 				goto msgerr;
@@ -207,6 +240,8 @@ msgerr:		__db_err(dbenv, "Recovery function for LSN %lu %lu failed",
 
 out:	F_SET(lp, is_thread);
 	__db_txnlist_end(txninfo);
+	if (ckp_args != NULL)
+		__os_free(ckp_args, sizeof(*ckp_args));
 
 	return (ret);
 }
diff --git a/db2/common/db_err.c b/db2/common/db_err.c
index 98a414279e..e935ddfcc5 100644
--- a/db2/common/db_err.c
+++ b/db2/common/db_err.c
@@ -8,13 +8,15 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_err.c	10.25 (Sleepycat) 5/2/98";
+static const char sccsid[] = "@(#)db_err.c	10.42 (Sleepycat) 11/24/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <stdio.h>
+#include <string.h>
 
 #ifdef __STDC__
 #include <stdarg.h>
@@ -24,10 +26,67 @@ static const char sccsid[] = "@(#)db_err.c	10.25 (Sleepycat) 5/2/98";
 #endif
 
 #include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+#include "log.h"
+#include "log_ext.h"
+#include "mp.h"
+#include "mp_ext.h"
+#include "txn.h"
+#include "txn_ext.h"
 #include "common_ext.h"
+#include "clib_ext.h"
 
-static int __db_keyempty __P((const DB_ENV *));
-static int __db_rdonly __P((const DB_ENV *, const char *));
+/*
+ * __db_fchk --
+ *	Return 0 if flags has no bits outside ok_flags, else __db_ferr().
+ *
+ * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t));
+ */
+int
+__db_fchk(dbenv, name, flags, ok_flags)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags, ok_flags;
+{
+	return (flags & ~ok_flags ?  __db_ferr(dbenv, name, 0) : 0);
+}
+
+/*
+ * __db_fcchk --
+ *	Return __db_ferr() if flags has bits from both flag1 and flag2.
+ *
+ * PUBLIC: int __db_fcchk
+ * PUBLIC:    __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__db_fcchk(dbenv, name, flags, flag1, flag2)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags, flag1, flag2;
+{
+	return ((flags & flag1) &&
+	    (flags & flag2) ?  __db_ferr(dbenv, name, 1) : 0);
+}
+
+/*
+ * __db_ferr --
+ *	Report an illegal flag (or flag combination) and return EINVAL.
+ *
+ * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int));
+ */
+int
+__db_ferr(dbenv, name, iscombo)
+	const DB_ENV *dbenv;
+	const char *name;
+	int iscombo;
+{
+	__db_err(dbenv, "illegal flag %sspecified to %s",
+	    iscombo ? "combination " : "", name);
+	return (EINVAL);
+}
 
 /*
  * __db_err --
@@ -55,561 +114,98 @@ __db_err(dbenv, fmt, va_alist)
 	if (dbenv == NULL)
 		return;
 
+	if (dbenv->db_errcall != NULL) {
 #ifdef __STDC__
-	va_start(ap, fmt);
+         	va_start(ap, fmt);
 #else
-	va_start(ap);
+	        va_start(ap);
 #endif
-	if (dbenv->db_errcall != NULL) {
 		(void)vsnprintf(errbuf, sizeof(errbuf), fmt, ap);
 		dbenv->db_errcall(dbenv->db_errpfx, errbuf);
+		va_end(ap);
 	}
 	if (dbenv->db_errfile != NULL) {
 		if (dbenv->db_errpfx != NULL)
 			(void)fprintf(dbenv->db_errfile, "%s: ",
 			    dbenv->db_errpfx);
+#ifdef __STDC__
+         	va_start(ap, fmt);
+#else
+	        va_start(ap);
+#endif
 		(void)vfprintf(dbenv->db_errfile, fmt, ap);
 		(void)fprintf(dbenv->db_errfile, "\n");
 		(void)fflush(dbenv->db_errfile);
+		va_end(ap);
 	}
-	va_end(ap);
-}
-
-/*
- * XXX
- * Provide ANSI C prototypes for the panic functions.  Some compilers, (e.g.,
- * MS VC 4.2) get upset if they aren't here, even though the K&R declaration
- * appears before the assignment in the __db__panic() call.
- */
-static int __db_ecursor __P((DB *, DB_TXN *, DBC **));
-static int __db_edel __P((DB *, DB_TXN *, DBT *, u_int32_t));
-static int __db_efd __P((DB *, int *));
-static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-static int __db_estat __P((DB *, void *, void *(*)(size_t), u_int32_t));
-static int __db_esync __P((DB *, u_int32_t));
-
-/*
- * __db_ecursor --
- *	After-panic cursor routine.
- */
-static int
-__db_ecursor(a, b, c)
-	DB *a;
-	DB_TXN *b;
-	DBC **c;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, NULL);
-	COMPQUIET(c, NULL);
-
-	return (EPERM);
-}
-
-/*
- * __db_edel --
- *	After-panic delete routine.
- */
-static int
-__db_edel(a, b, c, d)
-	DB *a;
-	DB_TXN *b;
-	DBT *c;
-	u_int32_t d;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, NULL);
-	COMPQUIET(c, NULL);
-	COMPQUIET(d, 0);
-
-	return (EPERM);
 }
 
 /*
- * __db_efd --
- *	After-panic fd routine.
- */
-static int
-__db_efd(a, b)
-	DB *a;
-	int *b;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, NULL);
-
-	return (EPERM);
-}
-
-/*
- * __db_egp --
- *	After-panic get/put routine.
- */
-static int
-__db_egp(a, b, c, d, e)
-	DB *a;
-	DB_TXN *b;
-	DBT *c, *d;
-	u_int32_t e;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, NULL);
-	COMPQUIET(c, NULL);
-	COMPQUIET(d, NULL);
-	COMPQUIET(e, 0);
-
-	return (EPERM);
-}
-
-/*
- * __db_estat --
- *	After-panic stat routine.
- */
-static int
-__db_estat(a, b, c, d)
-	DB *a;
-	void *b;
-	void *(*c) __P((size_t));
-	u_int32_t d;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, NULL);
-	COMPQUIET(c, NULL);
-	COMPQUIET(d, 0);
-
-	return (EPERM);
-}
-
-/*
- * __db_esync --
- *	After-panic sync routine.
- */
-static int
-__db_esync(a, b)
-	DB *a;
-	u_int32_t b;
-{
-	COMPQUIET(a, NULL);
-	COMPQUIET(b, 0);
-
-	return (EPERM);
-}
-
-/*
- * __db_panic --
- *	Lock out the tree due to unrecoverable error.
+ * __db_pgerr --
+ *	Error when unable to retrieve a specified page.
  *
- * PUBLIC: int __db_panic __P((DB *));
+ * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t));
  */
 int
-__db_panic(dbp)
+__db_pgerr(dbp, pgno)
 	DB *dbp;
+	db_pgno_t pgno;
 {
 	/*
-	 * XXX
-	 * We should shut down all of the process's cursors, too.
-	 *
-	 * We should call mpool and have it shut down the file, so we get
-	 * other processes sharing this file as well.
-	 *
-	 *	Chaos reigns within.
-	 *	Reflect, repent, and reboot.
-	 *	Order shall return.
+	 * Three things are certain:
+	 * Death, taxes, and lost data.
+	 * Guess which has occurred.
 	 */
-	dbp->cursor = __db_ecursor;
-	dbp->del = __db_edel;
-	dbp->fd = __db_efd;
-	dbp->get = __db_egp;
-	dbp->put = __db_egp;
-	dbp->stat = __db_estat;
-	dbp->sync = __db_esync;
-
-	return (EPERM);
+	__db_err(dbp->dbenv,
+	    "unable to create/retrieve page %lu", (u_long)pgno);
+	return (__db_panic(dbp->dbenv, EIO));
 }
 
-/* Check for invalid flags. */
-#undef	DB_CHECK_FLAGS
-#define	DB_CHECK_FLAGS(dbenv, name, flags, ok_flags)			\
-	if ((flags) & ~(ok_flags))					\
-		return (__db_ferr(dbenv, name, 0));
-/* Check for invalid flag combinations. */
-#undef	DB_CHECK_FCOMBO
-#define	DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2)		\
-	if ((flags) & (flag1) && (flags) & (flag2))			\
-		return (__db_ferr(dbenv, name, 1));
-
 /*
- * __db_fchk --
- *	General flags checking routine.
+ * __db_pgfmt --
+ *	Error when a page has the wrong format.
  *
- * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t));
+ * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t));
  */
 int
-__db_fchk(dbenv, name, flags, ok_flags)
-	DB_ENV *dbenv;
-	const char *name;
-	u_int32_t flags, ok_flags;
+__db_pgfmt(dbp, pgno)
+	DB *dbp;
+	db_pgno_t pgno;
 {
-	DB_CHECK_FLAGS(dbenv, name, flags, ok_flags);
-	return (0);
+	__db_err(dbp->dbenv,
+	    "page %lu: illegal page type or format", (u_long)pgno);
+	return (__db_panic(dbp->dbenv, EINVAL));
 }
 
 /*
- * __db_fcchk --
- *	General combination flags checking routine.
+ * __db_panic --
+ *	Lock out the tree due to unrecoverable error.
  *
- * PUBLIC: int __db_fcchk
- * PUBLIC:    __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+ * PUBLIC: int __db_panic __P((DB_ENV *, int));
  */
 int
-__db_fcchk(dbenv, name, flags, flag1, flag2)
+__db_panic(dbenv, errval)
 	DB_ENV *dbenv;
-	const char *name;
-	u_int32_t flags, flag1, flag2;
+	int errval;
 {
-	DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2);
-	return (0);
-}
+	if (dbenv != NULL) {
+		dbenv->db_panic = errval;
 
-/*
- * __db_cdelchk --
- *	Common cursor delete argument checking routine.
- *
- * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int));
- */
-int
-__db_cdelchk(dbp, flags, isrdonly, isvalid)
-	const DB *dbp;
-	u_int32_t flags;
-	int isrdonly, isvalid;
-{
-	/* Check for changes to a read-only tree. */
-	if (isrdonly)
-		return (__db_rdonly(dbp->dbenv, "c_del"));
+		(void)__log_panic(dbenv);
+		(void)__memp_panic(dbenv);
+		(void)__lock_panic(dbenv);
+		(void)__txn_panic(dbenv);
 
-	/* Check for invalid dbc->c_del() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "c_del", flags, 0);
-
-	/*
-	 * The cursor must be initialized, return -1 for an invalid cursor,
-	 * otherwise 0.
-	 */
-	return (isvalid ? 0 : EINVAL);
-}
+		__db_err(dbenv, "PANIC: %s", strerror(errval));
 
-/*
- * __db_cgetchk --
- *	Common cursor get argument checking routine.
- *
- * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
- */
-int
-__db_cgetchk(dbp, key, data, flags, isvalid)
-	const DB *dbp;
-	DBT *key, *data;
-	u_int32_t flags;
-	int isvalid;
-{
-	int key_einval, key_flags;
-
-	key_flags = key_einval = 0;
-
-	/* Check for invalid dbc->c_get() function flags. */
-	switch (flags) {
-	case DB_CURRENT:
-	case DB_FIRST:
-	case DB_LAST:
-	case DB_NEXT:
-	case DB_PREV:
-		key_flags = 1;
-		break;
-	case DB_SET_RANGE:
-		key_einval = key_flags = 1;
-		break;
-	case DB_SET:
-		key_einval = 1;
-		break;
-	case DB_GET_RECNO:
-		if (!F_ISSET(dbp, DB_BT_RECNUM))
-			goto err;
-		break;
-	case DB_SET_RECNO:
-		if (!F_ISSET(dbp, DB_BT_RECNUM))
-			goto err;
-		key_einval = key_flags = 1;
-		break;
-	default:
-err:		return (__db_ferr(dbp->dbenv, "c_get", 0));
+		if (dbenv->db_paniccall != NULL)
+			dbenv->db_paniccall(dbenv, errval);
 	}
 
-	/* Check for invalid key/data flags. */
-	if (key_flags)
-		DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags,
-		    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
-	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-
-	/* Check dbt's for valid flags when multi-threaded. */
-	if (F_ISSET(dbp, DB_AM_THREAD)) {
-		if (!F_ISSET(data, DB_DBT_USERMEM | DB_DBT_MALLOC))
-			return (__db_ferr(dbp->dbenv, "threaded data", 1));
-		if (key_flags &&
-		    !F_ISSET(key, DB_DBT_USERMEM | DB_DBT_MALLOC))
-			return (__db_ferr(dbp->dbenv, "threaded key", 1));
-	}
-
-	/* Check for missing keys. */
-	if (key_einval && (key->data == NULL || key->size == 0))
-		return (__db_keyempty(dbp->dbenv));
-
 	/*
-	 * The cursor must be initialized for DB_CURRENT, return -1 for an
-	 * invalid cursor, otherwise 0.
+	 * Chaos reigns within.
+	 * Reflect, repent, and reboot.
+	 * Order shall return.
 	 */
-	return (isvalid || flags != DB_CURRENT ? 0 : EINVAL);
-}
-
-/*
- * __db_cputchk --
- *	Common cursor put argument checking routine.
- *
- * PUBLIC: int __db_cputchk __P((const DB *,
- * PUBLIC:    const DBT *, DBT *, u_int32_t, int, int));
- */
-int
-__db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
-	const DB *dbp;
-	const DBT *key;
-	DBT *data;
-	u_int32_t flags;
-	int isrdonly, isvalid;
-{
-	int key_einval, key_flags;
-
-	/* Check for changes to a read-only tree. */
-	if (isrdonly)
-		return (__db_rdonly(dbp->dbenv, "c_put"));
-
-	/* Check for invalid dbc->c_put() function flags. */
-	key_einval = key_flags = 0;
-	switch (flags) {
-	case DB_AFTER:
-	case DB_BEFORE:
-		if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER))
-			goto err;
-		if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP))
-			goto err;
-		break;
-	case DB_CURRENT:
-		break;
-	case DB_KEYFIRST:
-	case DB_KEYLAST:
-		if (dbp->type == DB_RECNO)
-			goto err;
-		key_einval = key_flags = 1;
-		break;
-	default:
-err:		return (__db_ferr(dbp->dbenv, "c_put", 0));
-	}
-
-	/* Check for invalid key/data flags. */
-	if (key_flags)
-		DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags,
-		    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
-	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-
-	/* Check for missing keys. */
-	if (key_einval && (key->data == NULL || key->size == 0))
-		return (__db_keyempty(dbp->dbenv));
-
-	/*
-	 * The cursor must be initialized for anything other than DB_KEYFIRST
-	 * and DB_KEYLAST, return -1 for an invalid cursor, otherwise 0.
-	 */
-	return (isvalid ||
-	    (flags != DB_KEYFIRST && flags != DB_KEYLAST) ? 0 : EINVAL);
-}
-
-/*
- * __db_delchk --
- *	Common delete argument checking routine.
- *
- * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
- */
-int
-__db_delchk(dbp, key, flags, isrdonly)
-	const DB *dbp;
-	DBT *key;
-	u_int32_t flags;
-	int isrdonly;
-{
-	/* Check for changes to a read-only tree. */
-	if (isrdonly)
-		return (__db_rdonly(dbp->dbenv, "delete"));
-
-	/* Check for invalid db->del() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "delete", flags, 0);
-
-	/* Check for missing keys. */
-	if (key->data == NULL || key->size == 0)
-		return (__db_keyempty(dbp->dbenv));
-
-	return (0);
-}
-
-/*
- * __db_getchk --
- *	Common get argument checking routine.
- *
- * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
- */
-int
-__db_getchk(dbp, key, data, flags)
-	const DB *dbp;
-	const DBT *key;
-	DBT *data;
-	u_int32_t flags;
-{
-	/* Check for invalid db->get() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv,
-	    "get", flags, F_ISSET(dbp, DB_BT_RECNUM) ? DB_SET_RECNO : 0);
-
-	/* Check for invalid key/data flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0);
-	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
-	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-	DB_CHECK_FCOMBO(dbp->dbenv,
-	    "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM);
-	if (F_ISSET(dbp, DB_AM_THREAD) &&
-	    !F_ISSET(data, DB_DBT_MALLOC | DB_DBT_USERMEM))
-		return (__db_ferr(dbp->dbenv, "threaded data", 1));
-
-	/* Check for missing keys. */
-	if (key->data == NULL || key->size == 0)
-		return (__db_keyempty(dbp->dbenv));
-
-	return (0);
-}
-
-/*
- * __db_putchk --
- *	Common put argument checking routine.
- *
- * PUBLIC: int __db_putchk
- * PUBLIC:    __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
- */
-int
-__db_putchk(dbp, key, data, flags, isrdonly, isdup)
-	const DB *dbp;
-	DBT *key;
-	const DBT *data;
-	u_int32_t flags;
-	int isrdonly, isdup;
-{
-	/* Check for changes to a read-only tree. */
-	if (isrdonly)
-		return (__db_rdonly(dbp->dbenv, "put"));
-
-	/* Check for invalid db->put() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "put", flags,
-	    DB_NOOVERWRITE | (dbp->type == DB_RECNO ? DB_APPEND : 0));
-
-	/* Check for invalid key/data flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0);
-	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
-	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
-	DB_CHECK_FCOMBO(dbp->dbenv,
-	    "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM);
-
-	/* Check for missing keys. */
-	if (key->data == NULL || key->size == 0)
-		return (__db_keyempty(dbp->dbenv));
-
-	/* Check for partial puts in the presence of duplicates. */
-	if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) {
-		__db_err(dbp->dbenv,
-"a partial put in the presence of duplicates requires a cursor operation");
-		return (EINVAL);
-	}
-
-	return (0);
-}
-
-/*
- * __db_statchk --
- *	Common stat argument checking routine.
- *
- * PUBLIC: int __db_statchk __P((const DB *, u_int32_t));
- */
-int
-__db_statchk(dbp, flags)
-	const DB *dbp;
-	u_int32_t flags;
-{
-	/* Check for invalid db->stat() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "stat", flags, DB_RECORDCOUNT);
-
-	if (LF_ISSET(DB_RECORDCOUNT) &&
-	    dbp->type == DB_BTREE && !F_ISSET(dbp, DB_BT_RECNUM))
-		return (__db_ferr(dbp->dbenv, "stat", 0));
-
-	return (0);
-}
-
-/*
- * __db_syncchk --
- *	Common sync argument checking routine.
- *
- * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t));
- */
-int
-__db_syncchk(dbp, flags)
-	const DB *dbp;
-	u_int32_t flags;
-{
-	/* Check for invalid db->sync() function flags. */
-	DB_CHECK_FLAGS(dbp->dbenv, "sync", flags, 0);
-
-	return (0);
-}
-
-/*
- * __db_ferr --
- *	Common flag errors.
- *
- * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int));
- */
-int
-__db_ferr(dbenv, name, iscombo)
-	const DB_ENV *dbenv;
-	const char *name;
-	int iscombo;
-{
-	__db_err(dbenv, "illegal flag %sspecified to %s",
-	    iscombo ? "combination " : "", name);
-	return (EINVAL);
-}
-
-/*
- * __db_rdonly --
- *	Common readonly message.
- */
-static int
-__db_rdonly(dbenv, name)
-	const DB_ENV *dbenv;
-	const char *name;
-{
-	__db_err(dbenv, "%s: attempt to modify a read-only tree", name);
-	return (EACCES);
-}
-
-/*
- * __db_keyempty --
- *	Common missing or empty key value message.
- */
-static int
-__db_keyempty(dbenv)
-	const DB_ENV *dbenv;
-{
-	__db_err(dbenv, "missing or empty key value specified");
-	return (EINVAL);
+	return (DB_RUNRECOVERY);
 }
diff --git a/db2/common/db_region.c b/db2/common/db_region.c
index 284af6176a..12abfa524d 100644
--- a/db2/common/db_region.c
+++ b/db2/common/db_region.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_region.c	10.46 (Sleepycat) 5/26/98";
+static const char sccsid[] = "@(#)db_region.c	10.53 (Sleepycat) 11/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -46,7 +46,7 @@ __db_rattach(infop)
 	ret = retry_cnt = 0;
 
 	/* Round off the requested size to the next page boundary. */
-	DB_ROUNDOFF(infop->size);
+	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
 
 	/* Some architectures have hard limits on the maximum region size. */
 #ifdef DB_REGIONSIZE_MAX
@@ -61,7 +61,7 @@ loop:	infop->addr = NULL;
 	infop->fd = -1;
 	infop->segid = INVALID_SEGID;
 	if (infop->name != NULL) {
-		FREES(infop->name);
+		__os_freestr(infop->name);
 		infop->name = NULL;
 	}
 	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
@@ -74,6 +74,11 @@ loop:	infop->addr = NULL;
 	 * (Theoretically, we could probably get a file descriptor to lock
 	 * other types of shared regions, but I don't see any reason to
 	 * bother.)
+	 *
+	 * Since we may be using shared memory regions, e.g., shmget(2),
+	 * and not mmap of regular files, the backing file may be only a
+	 * few tens of bytes in length.  So, this depends on the ability
+	 * to fcntl lock file offsets much larger than the physical file.
 	 */
 	malloc_possible = 0;
 #endif
@@ -91,15 +96,16 @@ loop:	infop->addr = NULL;
 	 * than either anonymous memory or a shared file.
 	 */
 	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
-		if ((infop->addr = __db_malloc(infop->size)) == NULL)
-			return (ENOMEM);
+		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
+			return (ret);
 
 		/*
-		 * It's sometimes significantly faster to page-fault in all
-		 * of the region's pages before we run the application, as
-		 * we can see fairly nasty side-effects when we page-fault
-		 * while holding various locks, i.e., the lock takes a long
-		 * time, and other threads convoy behind the lock holder.
+		 * It's sometimes significantly faster to page-fault in all of
+		 * the region's pages before we run the application, as we see
+		 * nasty side-effects when we page-fault while holding various
+		 * locks, i.e., the lock takes a long time to acquire because
+		 * of the underlying page fault, and the other threads convoy
+		 * behind the lock holder.
 		 */
 		if (DB_GLOBAL(db_region_init))
 			for (p = infop->addr;
@@ -159,7 +165,7 @@ loop:	infop->addr = NULL;
 	 *    3. Memory backed by a regular file (mmap(2)).
 	 *
 	 * We instantiate a backing file in all cases, which contains at least
-	 * the RLAYOUT structure, and in case #4, contains the actual region.
+	 * the RLAYOUT structure, and in case #3, contains the actual region.
 	 * This is necessary for a couple of reasons:
 	 *
 	 * First, the mpool region uses temporary files to name regions, and
@@ -218,7 +224,7 @@ loop:	infop->addr = NULL;
 		 * And yes, this makes me want to take somebody and kill them,
 		 * but I can't think of any other solution.
 		 */
-		if ((ret = __db_ioinfo(infop->name,
+		if ((ret = __os_ioinfo(infop->name,
 		    infop->fd, &mbytes, &bytes, NULL)) != 0)
 			goto errmsg;
 		size = mbytes * MEGABYTE + bytes;
@@ -233,7 +239,7 @@ loop:	infop->addr = NULL;
 			if (size < sizeof(RLAYOUT))
 				goto retry;
 			if ((ret =
-			    __db_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
+			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
 				goto retry;
 			if (rl.valid != DB_REGIONMAGIC)
 				goto retry;
@@ -284,6 +290,7 @@ loop:	infop->addr = NULL;
 		} else
 			goto err;
 	}
+
 region_init:
 	/*
 	 * Initialize the common region information.
@@ -321,6 +328,7 @@ region_init:
 		rlp->refcnt = 1;
 		rlp->size = infop->size;
 		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
+		rlp->panic = 0;
 		rlp->segid = infop->segid;
 		rlp->flags = 0;
 		if (F_ISSET(infop, REGION_ANONYMOUS))
@@ -347,13 +355,19 @@ region_init:
 		 * the file.
 		 */
 		if (F_ISSET(infop, REGION_ANONYMOUS)) {
-			if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
+			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
 				goto err;
 			if ((ret =
-			    __db_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
+			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
 				goto err;
 		}
 	} else {
+		/* Check to see if the region has had catastrophic failure. */
+		if (rlp->panic) {
+			ret = DB_RUNRECOVERY;
+			goto err;
+		}
+
 		/*
 		 * Check the valid flag to ensure the region is initialized.
 		 * If the valid flag has not been set, the mutex may not have
@@ -380,18 +394,6 @@ region_init:
 		}
 
 		/*
-		 * Problem #2: We want a bigger region than has previously been
-		 * created.  Detected by checking if the region is smaller than
-		 * our caller requested.  If it is, we grow the region, (which
-		 * does the detach and re-attach for us).
-		 */
-		if (grow_region != 0 &&
-		    (ret = __db_rgrow(infop, grow_region)) != 0) {
-			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
-			goto err;
-		}
-
-		/*
 		 * Problem #3: when we checked the size of the file, it was
 		 * still growing as part of creation.  Detected by the fact
 		 * that infop->size isn't the same size as the region.
@@ -419,16 +421,16 @@ retry:		/* Discard the region. */
 
 		/* Discard the backing file. */
 		if (infop->fd != -1) {
-			(void)__db_close(infop->fd);
+			(void)__os_close(infop->fd);
 			infop->fd = -1;
 
 			if (F_ISSET(infop, REGION_CREATED))
-				(void)__db_unlink(infop->name);
+				(void)__os_unlink(infop->name);
 		}
 
 		/* Discard the name. */
 		if (infop->name != NULL) {
-			FREES(infop->name);
+			__os_freestr(infop->name);
 			infop->name = NULL;
 		}
 
@@ -438,7 +440,7 @@ retry:		/* Discard the region. */
 		 */
 		if (ret == 0) {
 			if (++retry_cnt <= 3) {
-				__db_sleep(retry_cnt * 2, 0);
+				__os_sleep(retry_cnt * 2, 0);
 				goto loop;
 			}
 			ret = EAGAIN;
@@ -481,10 +483,11 @@ retry:		/* Discard the region. */
 			F_SET(infop, REGION_REMOVED);
 			F_CLR(infop, REGION_CANGROW);
 
-			(void)__db_close(infop->fd);
-			(void)__db_unlink(infop->name);
+			(void)__os_close(infop->fd);
+			(void)__os_unlink(infop->name);
 		}
 	}
+
 	return (ret);
 }
 
@@ -514,7 +517,7 @@ __db_rdetach(infop)
 	 * action required is freeing the memory.
 	 */
 	if (F_ISSET(infop, REGION_MALLOC)) {
-		__db_free(infop->addr);
+		__os_free(infop->addr, 0);
 		goto done;
 	}
 
@@ -549,7 +552,7 @@ __db_rdetach(infop)
 	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
 
 	/* Close the backing file descriptor. */
-	(void)__db_close(infop->fd);
+	(void)__os_close(infop->fd);
 	infop->fd = -1;
 
 	/* Discard our mapping of the region. */
@@ -561,13 +564,13 @@ __db_rdetach(infop)
 		if ((t_ret =
 		    __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
 			ret = t_ret;
-		if ((t_ret = __db_unlink(infop->name) != 0) && ret == 0)
+		if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
 			ret = t_ret;
 	}
 
 done:	/* Discard the name. */
 	if (infop->name != NULL) {
-		FREES(infop->name);
+		__os_freestr(infop->name);
 		infop->name = NULL;
 	}
 
@@ -629,8 +632,8 @@ __db_runlink(infop, force)
 	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
 	 * back the region.  If that file doesn't exist, we're done.
 	 */
-	if (__db_exists(name, NULL) != 0) {
-		FREES(name);
+	if (__os_exists(name, NULL) != 0) {
+		__os_freestr(name);
 		return (0);
 	}
 
@@ -641,12 +644,12 @@ __db_runlink(infop, force)
 	 */
 	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
 		goto errmsg;
-	if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
+	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
 		goto errmsg;
 	size = mbytes * MEGABYTE + bytes;
 
 	if (size <= sizeof(RLAYOUT)) {
-		if ((ret = __db_read(fd, &rl, sizeof(rl), &nr)) != 0)
+		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
 			goto errmsg;
 		if (rl.valid != DB_REGIONMAGIC) {
 			__db_err(infop->dbenv,
@@ -673,16 +676,16 @@ __db_runlink(infop, force)
 	 * because some architectures (e.g., Win32) won't unlink a file if
 	 * open file descriptors remain.
 	 */
-	(void)__db_close(fd);
-	if ((t_ret = __db_unlink(name)) != 0 && ret == 0)
+	(void)__os_close(fd);
+	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
 		ret = t_ret;
 
 	if (0) {
 errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
-err:		(void)__db_close(fd);
+err:		(void)__os_close(fd);
 	}
 
-	FREES(name);
+	__os_freestr(name);
 	return (ret);
 }
 
@@ -715,7 +718,7 @@ __db_rgrow(infop, new_size)
 	 * determine the additional space required.
 	 */
 	rlp = (RLAYOUT *)infop->addr;
-	DB_ROUNDOFF(new_size);
+	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
 	increment = new_size - rlp->size;
 
 	if ((ret = __db_growregion(infop, increment)) != 0)
@@ -745,7 +748,7 @@ __db_growregion(infop, increment)
 	char buf[DB_VMPAGESIZE];
 
 	/* Seek to the end of the region. */
-	if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
+	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
 		goto err;
 
 	/* Write nuls to the new bytes. */
@@ -760,7 +763,7 @@ __db_growregion(infop, increment)
 		/* Extend the region by writing each new page. */
 		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
 			if ((ret =
-			    __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
+			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
 				goto err;
 			if (nw != sizeof(buf))
 				goto eio;
@@ -776,36 +779,44 @@ __db_growregion(infop, increment)
 		 */
 		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
 		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
-		if ((ret = __db_seek(infop->fd,
+		if ((ret = __os_seek(infop->fd,
 		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
 			goto err;
-		if ((ret = __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
+		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
 			goto err;
 		if (nw != sizeof(buf))
 			goto eio;
 
 		/*
-		 * It's sometimes significantly faster to page-fault in all
-		 * of the region's pages before we run the application, as
-		 * we can see fairly nasty side-effects when we page-fault
-		 * while holding various locks, i.e., the lock takes a long
-		 * time, and other threads convoy behind the lock holder.
+		 * It's sometimes significantly faster to page-fault in all of
+		 * the region's pages before we run the application, as we see
+		 * nasty side-effects when we page-fault while holding various
+		 * locks, i.e., the lock takes a long time to acquire because
+		 * of the underlying page fault, and the other threads convoy
+		 * behind the lock holder.
+		 *
+		 * We also use REGION_INIT to guarantee that there is enough
+		 * disk space for the region, so we also write a byte to each
+		 * page.  Reading the byte is insufficient as some systems
+		 * (e.g., Solaris) do not instantiate disk pages to satisfy
+		 * a read, and so we don't know if there is enough disk space
+		 * or not.
 		 */
 		if (DB_GLOBAL(db_region_init)) {
 			pages = increment / MEGABYTE;
 			relative = increment % MEGABYTE;
-			if ((ret = __db_seek(infop->fd,
+			if ((ret = __os_seek(infop->fd,
 			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
 				goto err;
 
-			/* Read a byte from each page. */
+			/* Write a byte to each page. */
 			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
 				if ((ret =
-				    __db_read(infop->fd, buf, 1, &nr)) != 0)
+				    __os_write(infop->fd, buf, 1, &nr)) != 0)
 					goto err;
 				if (nr != 1)
 					goto eio;
-				if ((ret = __db_seek(infop->fd,
+				if ((ret = __os_seek(infop->fd,
 				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
 					goto err;
 			}
diff --git a/db2/common/db_salloc.c b/db2/common/db_salloc.c
index c02d7e18e9..d58b79f3c4 100644
--- a/db2/common/db_salloc.c
+++ b/db2/common/db_salloc.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_salloc.c	10.13 (Sleepycat) 5/10/98";
+static const char sccsid[] = "@(#)db_salloc.c	10.14 (Sleepycat) 11/16/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -170,7 +170,7 @@ __db_shalloc_free(regionp, ptr)
 
 	/* Trash the returned memory. */
 #ifdef DIAGNOSTIC
-	memset(ptr, 0xff, free_size);
+	memset(ptr, 0xdb, free_size);
 #endif
 
 	/*
diff --git a/db2/db.h b/db2/db.h
index e1f5c72044..b04c8971c9 100644
--- a/db2/db.h
+++ b/db2/db.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db.h.src	10.131 (Sleepycat) 6/2/98
+ *	@(#)db.h	10.174 (Sleepycat) 1/3/99
  */
 
 #ifndef _DB_H_
@@ -56,34 +56,20 @@
  * We also provide the standard u_int, u_long etc., if they're not provided
  * by the system.
  */
-#ifndef	__BIT_TYPES_DEFINED__
-#define	__BIT_TYPES_DEFINED__
-
-
-
-
-
-#endif
-
-
-
-
-
 
 #define	DB_VERSION_MAJOR	2
-#define	DB_VERSION_MINOR	4
-#define	DB_VERSION_PATCH	14
-#define	DB_VERSION_STRING	"Sleepycat Software: DB 2.4.14: (6/2/98)"
+#define	DB_VERSION_MINOR	7
+#define	DB_VERSION_PATCH	5
+#define	DB_VERSION_STRING	"Sleepycat Software: Berkeley DB 2.7.5: (04/18/99)"
 
 typedef	u_int32_t	db_pgno_t;	/* Page number type. */
 typedef	u_int16_t	db_indx_t;	/* Page offset type. */
 #define	DB_MAX_PAGES	0xffffffff	/* >= # of pages in a file */
 
 typedef	u_int32_t	db_recno_t;	/* Record number type. */
-typedef size_t		DB_LOCK;	/* Object returned by lock manager. */
 #define	DB_MAX_RECORDS	0xffffffff	/* >= # of records in a tree */
 
-#define	DB_FILE_ID_LEN		20	/* DB file ID length. */
+typedef size_t		DB_LOCK;	/* Object returned by lock manager. */
 
 /* Forward structure declarations, so applications get type checking. */
 struct __db;		typedef struct __db DB;
@@ -93,6 +79,7 @@ struct __db;		typedef struct __db DB;
 struct __db_bt_stat;	typedef struct __db_bt_stat DB_BTREE_STAT;
 struct __db_dbt;	typedef struct __db_dbt DBT;
 struct __db_env;	typedef struct __db_env DB_ENV;
+struct __db_ilock;	typedef struct __db_ilock DB_LOCK_ILOCK;
 struct __db_info;	typedef struct __db_info DB_INFO;
 struct __db_lock_stat;	typedef struct __db_lock_stat DB_LOCK_STAT;
 struct __db_lockregion;	typedef struct __db_lockregion DB_LOCKREGION;
@@ -121,8 +108,7 @@ struct __db_dbt {
 	u_int32_t dlen;			/* RO: get/put record length. */
 	u_int32_t doff;			/* RO: get/put record offset. */
 
-#define	DB_DBT_INTERNAL	0x01		/* Perform any mallocs using regular
-					   malloc, not the user's malloc. */
+#define	DB_DBT_INTERNAL	0x01		/* Ignore user's malloc (internal). */
 #define	DB_DBT_MALLOC	0x02		/* Return in allocated memory. */
 #define	DB_DBT_PARTIAL	0x04		/* Partial put/get. */
 #define	DB_DBT_USERMEM	0x08		/* Return in user's memory. */
@@ -130,38 +116,36 @@ struct __db_dbt {
 };
 
 /*
- * DB internal configuration.
+ * DB run-time interface configuration.
  *
  * There are a set of functions that the application can replace with its
  * own versions, and some other knobs which can be turned at run-time.
  */
-#define	DB_FUNC_CALLOC	 1	/* DELETED: ANSI C calloc. */
-#define	DB_FUNC_CLOSE	 2		/* POSIX 1003.1 close. */
-#define	DB_FUNC_DIRFREE	 3		/* DB: free directory list. */
-#define	DB_FUNC_DIRLIST	 4		/* DB: create directory list. */
-#define	DB_FUNC_EXISTS	 5		/* DB: return if file exists. */
-#define	DB_FUNC_FREE	 6		/* ANSI C free. */
-#define	DB_FUNC_FSYNC	 7		/* POSIX 1003.1 fsync. */
-#define	DB_FUNC_IOINFO	 8		/* DB: return file I/O information. */
-#define	DB_FUNC_MALLOC	 9		/* ANSI C malloc. */
-#define	DB_FUNC_MAP	10		/* DB: map file into shared memory. */
-#define	DB_FUNC_OPEN	11		/* POSIX 1003.1 open. */
-#define	DB_FUNC_READ	12		/* POSIX 1003.1 read. */
-#define	DB_FUNC_REALLOC	13		/* ANSI C realloc. */
+#define	DB_FUNC_CLOSE	 1		/* POSIX 1003.1 close. */
+#define	DB_FUNC_DIRFREE	 2		/* DB: free directory list. */
+#define	DB_FUNC_DIRLIST	 3		/* DB: create directory list. */
+#define	DB_FUNC_EXISTS	 4		/* DB: return if file exists. */
+#define	DB_FUNC_FREE	 5		/* ANSI C free. */
+#define	DB_FUNC_FSYNC	 6		/* POSIX 1003.1 fsync. */
+#define	DB_FUNC_IOINFO	 7		/* DB: return file I/O information. */
+#define	DB_FUNC_MALLOC	 8		/* ANSI C malloc. */
+#define	DB_FUNC_MAP	 9		/* DB: map file into shared memory. */
+#define	DB_FUNC_OPEN	10		/* POSIX 1003.1 open. */
+#define	DB_FUNC_READ	11		/* POSIX 1003.1 read. */
+#define	DB_FUNC_REALLOC	12		/* ANSI C realloc. */
+#define	DB_FUNC_RUNLINK	13		/* DB: remove a shared region. */
 #define	DB_FUNC_SEEK	14		/* POSIX 1003.1 lseek. */
 #define	DB_FUNC_SLEEP	15		/* DB: sleep secs/usecs. */
-#define	DB_FUNC_STRDUP	16	/* DELETED: DB: strdup(3). */
-#define	DB_FUNC_UNLINK	17		/* POSIX 1003.1 unlink. */
-#define	DB_FUNC_UNMAP	18		/* DB: unmap shared memory file. */
-#define	DB_FUNC_WRITE	19		/* POSIX 1003.1 write. */
-#define	DB_FUNC_YIELD	20		/* DB: yield thread to scheduler. */
-#define	DB_TSL_SPINS	21		/* DB: initialize spin count. */
-#define	DB_FUNC_RUNLINK	22		/* DB: remove a shared region. */
-#define	DB_REGION_ANON	23		/* DB: anonymous, unnamed regions. */
-#define	DB_REGION_INIT	24		/* DB: page-fault regions in create. */
-#define	DB_REGION_NAME	25		/* DB: anonymous, named regions. */
-#define	DB_MUTEXLOCKS	26		/* DB: turn off all mutex locks. */
-#define	DB_PAGEYIELD	27		/* DB: yield the CPU on pool get. */
+#define	DB_FUNC_UNLINK	16		/* POSIX 1003.1 unlink. */
+#define	DB_FUNC_UNMAP	17		/* DB: unmap shared memory file. */
+#define	DB_FUNC_WRITE	18		/* POSIX 1003.1 write. */
+#define	DB_FUNC_YIELD	19		/* DB: yield thread to scheduler. */
+#define	DB_MUTEXLOCKS	20		/* DB: turn off all mutex locks. */
+#define	DB_PAGEYIELD	21		/* DB: yield the CPU on pool get. */
+#define	DB_REGION_ANON	22		/* DB: anonymous, unnamed regions. */
+#define	DB_REGION_INIT	23		/* DB: page-fault regions in create. */
+#define	DB_REGION_NAME	24		/* DB: anonymous, named regions. */
+#define	DB_TSL_SPINS	25		/* DB: initialize spin count. */
 
 /*
  * Database configuration and initialization.
@@ -177,29 +161,18 @@ struct __db_dbt {
  * Flags understood by db_appinit(3).
  */
 /*			      0x000007	   COMMON MASK. */
-#define	DB_INIT_LOCK	      0x000008	/* Initialize locking. */
-#define	DB_INIT_LOG	      0x000010	/* Initialize logging. */
-#define	DB_INIT_MPOOL	      0x000020	/* Initialize mpool. */
-#define	DB_INIT_TXN	      0x000040	/* Initialize transactions. */
-#define	DB_MPOOL_PRIVATE      0x000080	/* Mpool: private memory pool. */
-#define	__UNUSED_100	      0x000100
+#define	DB_INIT_CDB	      0x000008	/* Concurrent Access Methods. */
+#define	DB_INIT_LOCK	      0x000010	/* Initialize locking. */
+#define	DB_INIT_LOG	      0x000020	/* Initialize logging. */
+#define	DB_INIT_MPOOL	      0x000040	/* Initialize mpool. */
+#define	DB_INIT_TXN	      0x000080	/* Initialize transactions. */
+#define	DB_MPOOL_PRIVATE      0x000100	/* Mpool: private memory pool. */
 #define	DB_RECOVER	      0x000200	/* Run normal recovery. */
 #define	DB_RECOVER_FATAL      0x000400	/* Run catastrophic recovery. */
 #define	DB_TXN_NOSYNC	      0x000800	/* Do not sync log on commit. */
 #define	DB_USE_ENVIRON	      0x001000	/* Use the environment. */
 #define	DB_USE_ENVIRON_ROOT   0x002000	/* Use the environment if root. */
 
-/* CURRENTLY UNUSED LOCK FLAGS. */
-#define	DB_TXN_LOCK_2PL	      0x000000	/* Two-phase locking. */
-#define	DB_TXN_LOCK_OPTIMIST  0x000000	/* Optimistic locking. */
-#define	DB_TXN_LOCK_MASK      0x000000	/* Lock flags mask. */
-
-/* CURRENTLY UNUSED LOG FLAGS. */
-#define	DB_TXN_LOG_REDO	      0x000000	/* Redo-only logging. */
-#define	DB_TXN_LOG_UNDO	      0x000000	/* Undo-only logging. */
-#define	DB_TXN_LOG_UNDOREDO   0x000000	/* Undo/redo write-ahead logging. */
-#define	DB_TXN_LOG_MASK	      0x000000	/* Log flags mask. */
-
 /*
  * Flags understood by db_open(3).
  *
@@ -207,23 +180,22 @@ struct __db_dbt {
  * DB_SEQUENTIAL is currently internal, but may be exported some day.
  */
 /*			      0x000007	   COMMON MASK. */
-/*			      0x003fff	   ALREADY USED. */
-#define	__UNUSED_4000	      0x004000
-#define	DB_EXCL		      0x008000	/* O_EXCL: exclusive open. */
-#define	DB_RDONLY	      0x010000	/* O_RDONLY: read-only. */
-#define	DB_SEQUENTIAL	      0x020000	/* Indicate sequential access. */
-#define	DB_TEMPORARY	      0x040000	/* Remove on last close. */
-#define	DB_TRUNCATE	      0x080000	/* O_TRUNCATE: replace existing DB. */
+/*			      0x001fff	   ALREADY USED. */
+#define	DB_EXCL		      0x002000	/* O_EXCL: exclusive open (internal). */
+#define	DB_RDONLY	      0x004000	/* O_RDONLY: read-only. */
+#define	DB_SEQUENTIAL	      0x008000	/* Sequential access (internal). */
+#define	DB_TEMPORARY	      0x010000	/* Remove on last close (internal). */
+#define	DB_TRUNCATE	      0x020000	/* O_TRUNCATE: replace existing DB. */
 
 /*
  * Deadlock detector modes; used in the DBENV structure to configure the
  * locking subsystem.
  */
-#define	DB_LOCK_NORUN		0x0
-#define	DB_LOCK_DEFAULT		0x1	/* Default policy. */
-#define	DB_LOCK_OLDEST		0x2	/* Abort oldest transaction. */
-#define	DB_LOCK_RANDOM		0x3	/* Abort random transaction. */
-#define	DB_LOCK_YOUNGEST	0x4	/* Abort youngest transaction. */
+#define	DB_LOCK_NORUN		0
+#define	DB_LOCK_DEFAULT		1	/* Default policy. */
+#define	DB_LOCK_OLDEST		2	/* Abort oldest transaction. */
+#define	DB_LOCK_RANDOM		3	/* Abort random transaction. */
+#define	DB_LOCK_YOUNGEST	4	/* Abort youngest transaction. */
 
 struct __db_env {
 	int		 db_lorder;	/* Byte order. */
@@ -233,6 +205,8 @@ struct __db_env {
 	FILE		*db_errfile;	/* Error message file stream. */
 	const char	*db_errpfx;	/* Error message prefix. */
 	int		 db_verbose;	/* Generate debugging messages. */
+	int		 db_panic;	/* Panic flag, callback function. */
+	void (*db_paniccall) __P((DB_ENV *, int));
 
 	/* User paths. */
 	char		*db_home;	/* Database home. */
@@ -245,7 +219,7 @@ struct __db_env {
 
 	/* Locking. */
 	DB_LOCKTAB	*lk_info;	/* Return from lock_open(). */
-	u_int8_t	*lk_conflicts;	/* Two dimensional conflict matrix. */
+	const u_int8_t	*lk_conflicts;	/* Two dimensional conflict matrix. */
 	u_int32_t	 lk_modes;	/* Number of lock modes in table. */
 	u_int32_t	 lk_max;	/* Maximum number of locks. */
 	u_int32_t	 lk_detect;	/* Deadlock detect on all conflicts. */
@@ -265,9 +239,25 @@ struct __db_env {
 	int (*tx_recover)		/* Dispatch function for recovery. */
 	    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 
+	/*
+	 * XA support.
+	 *
+	 * !!!
+	 * Explicit representations of structures in queue.h.
+	 *
+	 * TAILQ_ENTRY(__db_env);
+	 */
+	struct {
+		struct __db_env *tqe_next;
+		struct __db_env **tqe_prev;
+	} links;
+	int		 xa_rmid;	/* XA Resource Manager ID. */
+	DB_TXN		*xa_txn;	/* XA Current transaction. */
+
 #define	DB_ENV_APPINIT		0x01	/* Paths initialized by db_appinit(). */
-#define	DB_ENV_STANDALONE	0x02	/* Test: freestanding environment. */
-#define	DB_ENV_THREAD		0x04	/* DB_ENV is multi-threaded. */
+#define	DB_ENV_CDB		0x02	/* Concurrent DB product. */
+#define	DB_ENV_STANDALONE	0x04	/* Test: freestanding environment. */
+#define	DB_ENV_THREAD		0x08	/* DB_ENV is multi-threaded. */
 	u_int32_t	 flags;		/* Flags. */
 };
 
@@ -275,7 +265,7 @@ struct __db_env {
  * Access methods.
  *******************************************************/
 /*
- * XXX
+ * !!!
  * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
  */
 typedef enum {
@@ -304,6 +294,8 @@ struct __db_info {
 
 					/* Local heap allocation. */
 	void *(*db_malloc) __P((size_t));
+	int (*dup_compare)		/* Duplicate compare function. */
+	    __P((const DBT *, const DBT *));
 
 	/* Btree access method. */
 	u_int32_t	 bt_maxkey;	/* Maximum keys per page. */
@@ -327,44 +319,51 @@ struct __db_info {
 
 #define	DB_DELIMITER		0x0001	/* Recno: re_delim set. */
 #define	DB_DUP			0x0002	/* Btree, Hash: duplicate keys. */
-#define	DB_FIXEDLEN		0x0004	/* Recno: fixed-length records. */
-#define	DB_PAD			0x0008	/* Recno: re_pad set. */
-#define	DB_RECNUM		0x0010	/* Btree: record numbers. */
-#define	DB_RENUMBER		0x0020	/* Recno: renumber on insert/delete. */
-#define	DB_SNAPSHOT		0x0040	/* Recno: snapshot the input. */
+#define	DB_DUPSORT		0x0004	/* Btree, Hash: sorted duplicates. */
+#define	DB_FIXEDLEN		0x0008	/* Recno: fixed-length records. */
+#define	DB_PAD			0x0010	/* Recno: re_pad set. */
+#define	DB_RECNUM		0x0020	/* Btree: record numbers. */
+#define	DB_RENUMBER		0x0040	/* Recno: renumber on insert/delete. */
+#define	DB_SNAPSHOT		0x0080	/* Recno: snapshot the input. */
 	u_int32_t	 flags;
 };
 
 /*
- * DB access method and cursor operation codes.  These are implemented as
- * bit fields for future flexibility, but currently only a single one may
- * be specified to any function.
+ * DB access method and cursor operation values.  Each value is an operation
+ * code to which additional bit flags are added.
  */
-#define	DB_AFTER	0x000001	/* c_put() */
-#define	DB_APPEND	0x000002	/* put() */
-#define	DB_BEFORE	0x000004	/* c_put() */
-#define	DB_CHECKPOINT	0x000008	/* log_put(), log_get() */
-#define	DB_CURRENT	0x000010	/* c_get(), c_put(), log_get() */
-#define	DB_FIRST	0x000020	/* c_get(), log_get() */
-#define	DB_FLUSH	0x000040	/* log_put() */
-#define	DB_GET_RECNO	0x000080	/* get(), c_get() */
-#define	DB_KEYFIRST	0x000100	/* c_put() */
-#define	DB_KEYLAST	0x000200	/* c_put() */
-#define	DB_LAST		0x000400	/* c_get(), log_get() */
-#define	DB_NEXT		0x000800	/* c_get(), log_get() */
-#define	DB_NOOVERWRITE	0x001000	/* put() */
-#define	DB_NOSYNC	0x002000	/* close() */
-#define	DB_PREV		0x004000	/* c_get(), log_get() */
-#define	DB_RECORDCOUNT	0x008000	/* stat() */
-#define	DB_SET		0x010000	/* c_get(), log_get() */
-#define	DB_SET_RANGE	0x020000	/* c_get() */
-#define	DB_SET_RECNO	0x040000	/* c_get() */
-#define	DB_CURLSN	0x080000	/* log_put() */
+#define	DB_AFTER	 1		/* c_put() */
+#define	DB_APPEND	 2		/* put() */
+#define	DB_BEFORE	 3		/* c_put() */
+#define	DB_CHECKPOINT	 4		/* log_put(), log_get() */
+#define	DB_CURLSN	 5		/* log_put() */
+#define	DB_CURRENT	 6		/* c_get(), c_put(), log_get() */
+#define	DB_FIRST	 7		/* c_get(), log_get() */
+#define	DB_FLUSH	 8		/* log_put() */
+#define	DB_GET_BOTH	 9		/* get(), c_get() */
+#define	DB_GET_RECNO	10		/* c_get() */
+#define	DB_JOIN_ITEM	11		/* c_get(); do not do primary lookup */
+#define	DB_KEYFIRST	12		/* c_put() */
+#define	DB_KEYLAST	13		/* c_put() */
+#define	DB_LAST		14		/* c_get(), log_get() */
+#define	DB_NEXT		15		/* c_get(), log_get() */
+#define	DB_NEXT_DUP	16		/* c_get() */
+#define	DB_NOOVERWRITE	17		/* put() */
+#define	DB_NOSYNC	18		/* close() */
+#define	DB_PREV		19		/* c_get(), log_get() */
+#define	DB_RECORDCOUNT	20		/* stat() */
+#define	DB_SET		21		/* c_get(), log_get() */
+#define	DB_SET_RANGE	22		/* c_get() */
+#define	DB_SET_RECNO	23		/* get(), c_get() */
+#define	DB_WRITELOCK	24		/* cursor() (internal) */
+
+#define	DB_OPFLAGS_MASK	0x1f		/* Mask for operations flags. */
+#define	DB_RMW		0x80000000	/* Acquire write flag immediately. */
 
 /*
  * DB (user visible) error return codes.
  *
- * XXX
+ * !!!
  * Changes to any of the user visible error return codes must be reflected
  * in java/src/com/sleepycat/db/Db.java.
  */
@@ -376,93 +375,84 @@ struct __db_info {
 #define	DB_LOCK_NOTGRANTED	( -5)	/* Lock unavailable, no-wait set. */
 #define	DB_LOCK_NOTHELD		( -6)	/* Lock not held by locker. */
 #define	DB_NOTFOUND		( -7)	/* Key/data pair not found (EOF). */
+#define	DB_RUNRECOVERY		( -8)	/* Panic return. */
 
 /* DB (private) error return codes. */
-#define	DB_DELETED		( -8)	/* Recovery file marked deleted. */
-#define	DB_NEEDSPLIT		( -9)	/* Page needs to be split. */
-#define	DB_REGISTERED		(-10)	/* Entry was previously registered. */
+#define	DB_DELETED		( -9)	/* Recovery file marked deleted. */
+#define	DB_NEEDSPLIT		(-10)	/* Page needs to be split. */
 #define	DB_SWAPBYTES		(-11)	/* Database needs byte swapping. */
-#define DB_TXN_CKP		(-12)	/* Encountered ckp record in log. */
+#define	DB_TXN_CKP		(-12)	/* Encountered ckp record in log. */
 
-struct __db_ilock {			/* Internal DB access method lock. */
-	db_pgno_t	pgno;		/* Page being locked. */
-					/* File id. */
-	u_int8_t	fileid[DB_FILE_ID_LEN];
-};
+#define	DB_FILE_ID_LEN		20	/* DB file ID length. */
 
 /* DB access method description structure. */
 struct __db {
 	void	*mutexp;		/* Synchronization for free threading */
+
+					/* Documented, returned information. */
 	DBTYPE	 type;			/* DB access method. */
+	int	 byteswapped;		/* Database byte order is swapped. */
+
 	DB_ENV	*dbenv;			/* DB_ENV structure. */
 	DB_ENV	*mp_dbenv;		/* DB_ENV for local mpool creation. */
 
-	DB	*master;		/* Original DB created by db_open. */
 	void	*internal;		/* Access method private. */
 
 	DB_MPOOL	*mp;		/* The access method's mpool. */
 	DB_MPOOLFILE	*mpf;		/* The access method's mpool file. */
 
 	/*
-	 * XXX
+	 * !!!
 	 * Explicit representations of structures in queue.h.
 	 *
-	 * TAILQ_HEAD(curs_queue, __dbc);
+	 * TAILQ_HEAD(free_queue, __dbc);
+	 * TAILQ_HEAD(active_queue, __dbc);
 	 */
 	struct {
 		struct __dbc *tqh_first;
 		struct __dbc **tqh_last;
-	} curs_queue;
-
-	/*
-	 * XXX
-	 * Explicit representations of structures in queue.h.
-	 *
-	 * LIST_HEAD(handleq, __db);
-	 * LIST_ENTRY(__db);
-	 */
-	struct {
-		struct __db *lh_first;
-	} handleq;			/* List of handles for this DB. */
+	} free_queue;
 	struct {
-		struct __db *le_next;
-		struct __db **le_prev;
-	} links;			/* Links for the handle list. */
+		struct __dbc *tqh_first;
+		struct __dbc **tqh_last;
+	} active_queue;
 
+	u_int8_t  fileid[DB_FILE_ID_LEN]; /* Uniquely identify this file for
+					     locking. */
 	u_int32_t log_fileid;		/* Logging file id. */
-
-	DB_TXN	 *txn;			/* Current transaction. */
-	u_int32_t locker;		/* Default process' locker id. */
-	DBT	  lock_dbt;		/* DBT referencing lock. */
-	struct __db_ilock lock;		/* Lock. */
-
 	size_t	  pgsize;		/* Logical page size of file. */
 
 					/* Local heap allocation. */
 	void *(*db_malloc) __P((size_t));
+	int (*dup_compare)		/* Duplicate compare function. */
+	    __P((const DBT *, const DBT *));
+	u_int32_t (*h_hash)		/* Hash function. */
+	    __P((const void *, u_int32_t));
 
 					/* Functions. */
+	int (*am_close)	__P((DB *));
 	int (*close)	__P((DB *, u_int32_t));
-	int (*cursor)	__P((DB *, DB_TXN *, DBC **));
+	int (*cursor)	__P((DB *, DB_TXN *, DBC **, u_int32_t));
 	int (*del)	__P((DB *, DB_TXN *, DBT *, u_int32_t));
 	int (*fd)	__P((DB *, int *));
 	int (*get)	__P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+	int (*join)	__P((DB *, DBC **, u_int32_t, DBC **));
 	int (*put)	__P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
 	int (*stat)	__P((DB *, void *, void *(*)(size_t), u_int32_t));
 	int (*sync)	__P((DB *, u_int32_t));
 
-#define	DB_AM_DUP	0x000001	/* DB_DUP (internal). */
-#define	DB_AM_INMEM	0x000002	/* In-memory; no sync on close. */
-#define	DB_AM_LOCKING	0x000004	/* Perform locking. */
-#define	DB_AM_LOGGING	0x000008	/* Perform logging. */
-#define	DB_AM_MLOCAL	0x000010	/* Database memory pool is local. */
-#define	DB_AM_PGDEF	0x000020	/* Page size was defaulted. */
-#define	DB_AM_RDONLY	0x000040	/* Database is readonly. */
-#define	DB_AM_RECOVER	0x000080	/* In recovery (do not log or lock). */
+#define	DB_AM_CDB	0x000001	/* Concurrent Access Methods. */
+#define	DB_AM_DUP	0x000002	/* DB_DUP (internal). */
+#define	DB_AM_INMEM	0x000004	/* In-memory; no sync on close. */
+#define	DB_AM_LOCKING	0x000008	/* Perform locking. */
+#define	DB_AM_LOGGING	0x000010	/* Perform logging. */
+#define	DB_AM_MLOCAL	0x000020	/* Database memory pool is local. */
+#define	DB_AM_PGDEF	0x000040	/* Page size was defaulted. */
+#define	DB_AM_RDONLY	0x000080	/* Database is readonly. */
 #define	DB_AM_SWAP	0x000100	/* Pages need to be byte-swapped. */
 #define	DB_AM_THREAD	0x000200	/* DB is multi-threaded. */
-#define	DB_BT_RECNUM	0x000400	/* DB_RECNUM (internal) */
-#define	DB_HS_DIRTYMETA 0x000800	/* Hash: Metadata page modified. */
+#define	DB_BT_RECNUM	0x000400	/* DB_RECNUM (internal). */
+#define	DB_DBM_ERROR	0x000800	/* Error in DBM/NDBM database. */
 #define	DB_RE_DELIMITER	0x001000	/* DB_DELIMITER (internal). */
 #define	DB_RE_FIXEDLEN	0x002000	/* DB_FIXEDLEN (internal). */
 #define	DB_RE_PAD	0x004000	/* DB_PAD (internal). */
@@ -471,13 +461,18 @@ struct __db {
 	u_int32_t flags;
 };
 
+struct __db_ilock {			/* Internal DB access method lock. */
+	db_pgno_t pgno;			/* Page being locked. */
+	u_int8_t fileid[DB_FILE_ID_LEN];/* File id. */
+};
+
 /* Cursor description structure. */
 struct __dbc {
 	DB *dbp;			/* Related DB access method. */
 	DB_TXN	 *txn;			/* Associated transaction. */
 
 	/*
-	 * XXX
+	 * !!!
 	 * Explicit representations of structures in queue.h.
 	 *
 	 * TAILQ_ENTRY(__dbc);
@@ -487,12 +482,30 @@ struct __dbc {
 		struct __dbc **tqe_prev;
 	} links;
 
+	u_int32_t lid;			/* Default process' locker id. */
+	u_int32_t locker;		/* Locker for this operation. */
+	DBT	  lock_dbt;		/* DBT referencing lock. */
+	DB_LOCK_ILOCK lock;		/* Object to be locked. */
+	DB_LOCK	mylock;			/* Lock held on this cursor. */
+
+	DBT rkey;			/* Returned key. */
+	DBT rdata;			/* Returned data. */
+
+	int (*c_am_close) __P((DBC *));
+	int (*c_am_destroy) __P((DBC *));
+	int (*c_close) __P((DBC *));
+	int (*c_del) __P((DBC *, u_int32_t));
+	int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t));
+	int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t));
+
 	void	 *internal;		/* Access method private. */
 
-	int (*c_close)	__P((DBC *));
-	int (*c_del)	__P((DBC *, u_int32_t));
-	int (*c_get)	__P((DBC *, DBT *, DBT *, u_int32_t));
-	int (*c_put)	__P((DBC *, DBT *, DBT *, u_int32_t));
+#define	DBC_CONTINUE	0x001		/* Continue dup search: next item. */
+#define	DBC_KEYSET	0x002		/* Continue dup search: current item. */
+#define	DBC_RECOVER	0x004		/* In recovery (do not log or lock). */
+#define	DBC_RMW		0x008		/* Acquire write flag in read op. */
+#define	DBC_WRITER	0x010		/* Cursor immediately writing (CDB). */
+	u_int32_t flags;
 };
 
 /* Btree/recno statistics structure. */
@@ -510,24 +523,36 @@ struct __db_bt_stat {
 	u_int32_t bt_dup_pg;		/* Duplicate pages. */
 	u_int32_t bt_over_pg;		/* Overflow pages. */
 	u_int32_t bt_free;		/* Pages on the free list. */
-	u_int32_t bt_freed;		/* Pages freed for reuse. */
 	u_int32_t bt_int_pgfree;	/* Bytes free in internal pages. */
 	u_int32_t bt_leaf_pgfree;	/* Bytes free in leaf pages. */
 	u_int32_t bt_dup_pgfree;	/* Bytes free in duplicate pages. */
 	u_int32_t bt_over_pgfree;	/* Bytes free in overflow pages. */
-	u_int32_t bt_pfxsaved;		/* Bytes saved by prefix compression. */
-	u_int32_t bt_split;		/* Total number of splits. */
-	u_int32_t bt_rootsplit;		/* Root page splits. */
-	u_int32_t bt_fastsplit;		/* Fast splits. */
-	u_int32_t bt_added;		/* Items added. */
-	u_int32_t bt_deleted;		/* Items deleted. */
-	u_int32_t bt_get;		/* Items retrieved. */
-	u_int32_t bt_cache_hit;		/* Hits in fast-insert code. */
-	u_int32_t bt_cache_miss;	/* Misses in fast-insert code. */
 	u_int32_t bt_magic;		/* Magic number. */
 	u_int32_t bt_version;		/* Version number. */
 };
 
+/* Hash statistics structure. */
+struct __db_h_stat {
+	u_int32_t hash_accesses;	/* Number of accesses to this table. */
+	u_int32_t hash_collisions;	/* Number of collisions on search. */
+	u_int32_t hash_expansions;	/* Number of times we added a bucket. */
+	u_int32_t hash_overflows;	/* Number of overflow pages. */
+	u_int32_t hash_bigpages;	/* Number of big key/data pages. */
+	u_int32_t hash_dup;		/* Number of dup pages. */
+	u_int32_t hash_free;		/* Pages on the free list. */
+	u_int32_t hash_bfree;		/* Bytes free on bucket pages. */
+	u_int32_t hash_dup_free;	/* Bytes free on duplicate pages. */
+	u_int32_t hash_big_bfree;	/* Bytes free on big item pages. */
+	u_int32_t hash_buckets;		/* Number of hash buckets. */
+	u_int32_t hash_put;		/* Number of puts. */
+	u_int32_t hash_deleted;		/* Number of deletes. */
+	u_int32_t hash_get;		/* Number of gets. */
+	u_int32_t hash_magic;		/* Magic number. */
+	u_int32_t hash_version;		/* Version number. */
+	u_int32_t hash_pagesize;	/* Page size. */
+	u_int32_t hash_nrecs;		/* Number of records. */
+};
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -538,6 +563,8 @@ int   db_open __P((const char *,
 	  DBTYPE, u_int32_t, int, DB_ENV *, DB_INFO *, DB **));
 int   db_value_set __P((int, int));
 char *db_version __P((int *, int *, int *));
+int   db_xa_open __P((const char *,
+	  DBTYPE, u_int32_t, int, DB_INFO *, DB **));
 #if defined(__cplusplus)
 }
 #endif
@@ -548,8 +575,10 @@ char *db_version __P((int *, int *, int *));
 #define	DB_LOCKVERSION	1
 #define	DB_LOCKMAGIC	0x090193
 
-/* Flag values for lock_vec(). */
+/* Flag values for lock_vec(), lock_get(). */
 #define	DB_LOCK_NOWAIT		0x01	/* Don't wait on unavailable lock. */
+#define	DB_LOCK_UPGRADE		0x02	/* Upgrade an existing lock instead
+					   of granting a new one (internal). */
 
 /* Flag values for lock_detect(). */
 #define	DB_LOCK_CONFLICT	0x01	/* Run on any conflict. */
@@ -557,12 +586,13 @@ char *db_version __P((int *, int *, int *));
 /*
  * Request types.
  *
- * XXX
+ * !!!
  * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
  */
 typedef enum {
 	DB_LOCK_DUMP=0,			/* Display held locks. */
 	DB_LOCK_GET,			/* Get the lock. */
+	DB_LOCK_INHERIT,		/* Pass locks to parent. */
 	DB_LOCK_PUT,			/* Release the lock. */
 	DB_LOCK_PUT_ALL,		/* Release locker's locks. */
 	DB_LOCK_PUT_OBJ			/* Release locker's locks on obj. */
@@ -571,15 +601,20 @@ typedef enum {
 /*
  * Simple R/W lock modes and for multi-granularity intention locking.
  *
- * XXX
+ * !!!
+ * These values are NOT random, as they are used as an index into the lock
+ * conflicts arrays, i.e., DB_LOCK_IWRITE must be == 3, and DB_LOCK_IREAD
+ * must be == 4.
+ *
+ * !!!
  * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
  */
 typedef enum {
 	DB_LOCK_NG=0,			/* Not granted. */
 	DB_LOCK_READ,			/* Shared/read. */
 	DB_LOCK_WRITE,			/* Exclusive/write. */
-	DB_LOCK_IREAD,			/* Intent to share/read. */
 	DB_LOCK_IWRITE,			/* Intent exclusive/write. */
+	DB_LOCK_IREAD,			/* Intent to share/read. */
 	DB_LOCK_IWR			/* Intent to read and write. */
 } db_lockmode_t;
 
@@ -647,10 +682,14 @@ int	  lock_id __P((DB_LOCKTAB *, u_int32_t *));
 int	  lock_open __P((const char *,
 	    u_int32_t, int, DB_ENV *, DB_LOCKTAB **));
 int	  lock_put __P((DB_LOCKTAB *, DB_LOCK));
+int	  lock_tget __P((DB_LOCKTAB *,
+	    DB_TXN *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
 int	  lock_stat __P((DB_LOCKTAB *, DB_LOCK_STAT **, void *(*)(size_t)));
 int	  lock_unlink __P((const char *, int, DB_ENV *));
 int	  lock_vec __P((DB_LOCKTAB *,
 	    u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+int	  lock_tvec __P((DB_LOCKTAB *,
+	    DB_TXN *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
 #if defined(__cplusplus)
 }
 #endif
@@ -890,6 +929,7 @@ typedef struct {
  * 4BSD replaced the dbm interface with ndbm, and are not support here.
  */
 #define	dbminit(a)	__db_dbm_init(a)
+#define	dbmclose	__db_dbm_close
 #if !defined(__cplusplus)
 #define	delete(a)	__db_dbm_delete(a)
 #endif
@@ -902,12 +942,13 @@ typedef struct {
 #if defined(__cplusplus)
 extern "C" {
 #endif
-int	 __db_dbm_init __P((char *));
-int	 __db_dbm_delete __P((datum));
+int	 __db_dbm_close __P((void));
 int	 __db_dbm_dbrdonly __P((void));
+int	 __db_dbm_delete __P((datum));
 int	 __db_dbm_dirf __P((void));
 datum	 __db_dbm_fetch __P((datum));
 datum	 __db_dbm_firstkey __P((void));
+int	 __db_dbm_init __P((char *));
 datum	 __db_dbm_nextkey __P((datum));
 int	 __db_dbm_pagf __P((void));
 int	 __db_dbm_store __P((datum, datum));
diff --git a/db2/db/db.c b/db2/db/db.c
index 70c6c5443b..2b4c270324 100644
--- a/db2/db/db.c
+++ b/db2/db/db.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db.c	10.57 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db.c	10.75 (Sleepycat) 12/3/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -67,9 +67,6 @@ static const char sccsid[] = "@(#)db.c	10.57 (Sleepycat) 5/7/98";
 #include "db_am.h"
 #include "common_ext.h"
 
-static int db_close __P((DB *, u_int32_t));
-static int db_fd __P((DB *, int *));
-
 /*
  * If the metadata page has the flag set, set the local flag.  If the page
  * does NOT have the flag set, return EINVAL if the user's dbinfo argument
@@ -87,11 +84,6 @@ static int db_fd __P((DB *, int *));
 		}							\
 }
 
-#ifdef _LIBC
-#define db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) \
-  __nss_db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
-#endif
-
 /*
  * db_open --
  *	Main library interface to the DB access methods.
@@ -141,9 +133,10 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 
 		/*
 		 * Specifying a cachesize to db_open(3), after creating an
-		 * environment, is a common mistake.
+		 * environment with DB_INIT_MPOOL, is a common mistake.
 		 */
-		if (dbinfo != NULL && dbinfo->db_cachesize != 0) {
+		if (dbenv->mp_info != NULL &&
+		    dbinfo != NULL && dbinfo->db_cachesize != 0) {
 			__db_err(dbenv,
 			    "cachesize will be ignored if environment exists");
 			return (EINVAL);
@@ -156,12 +149,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	real_name = NULL;
 
 	/* Allocate the DB structure, reference the DB_ENV structure. */
-	if ((dbp = (DB *)__db_calloc(1, sizeof(DB))) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(DB), &dbp)) != 0)
+		return (ret);
 	dbp->dbenv = dbenv;
 
+	/* Miscellaneous initialization: cursor queues, method table. */
+	TAILQ_INIT(&dbp->free_queue);
+	TAILQ_INIT(&dbp->active_queue);
+	if ((ret = __db_init_wrapper(dbp)) != 0)
+		goto err;
+
 	/* Convert the db_open(3) flags. */
 	if (LF_ISSET(DB_RDONLY))
 		F_SET(dbp, DB_AM_RDONLY);
@@ -192,21 +189,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	}
 
 	/*
-	 * Always set the master and initialize the queues, so we can
-	 * use these fields without checking the thread bit.
-	 */
-	dbp->master = dbp;
-	LIST_INIT(&dbp->handleq);
-	LIST_INSERT_HEAD(&dbp->handleq, dbp, links);
-	TAILQ_INIT(&dbp->curs_queue);
-
-	/*
 	 * Set based on the dbenv fields, although no logging or transactions
 	 * are possible for temporary files.
 	 */
 	if (dbenv != NULL) {
-		if (dbenv->lk_info != NULL)
-			F_SET(dbp, DB_AM_LOCKING);
+		if (dbenv->lk_info != NULL) {
+			if (F_ISSET(dbenv, DB_ENV_CDB))
+				F_SET(dbp, DB_AM_CDB);
+			else
+				F_SET(dbp, DB_AM_LOCKING);
+		}
 		if (fname != NULL && dbenv->lg_info != NULL)
 			F_SET(dbp, DB_AM_LOGGING);
 	}
@@ -215,9 +207,29 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	if (dbinfo == NULL) {
 		dbp->pgsize = 0;
 		dbp->db_malloc = NULL;
+		dbp->dup_compare = NULL;
 	} else {
+		/*
+		 * We don't want anything that's not a power-of-2, as we rely
+		 * on that for alignment of various types on the pages.
+		 */
+		if ((dbp->pgsize = dbinfo->db_pagesize) != 0 &&
+		    (u_int32_t)1 << __db_log2(dbp->pgsize) != dbp->pgsize) {
+			__db_err(dbenv, "page sizes must be a power-of-2");
+			goto einval;
+		}
 		dbp->pgsize = dbinfo->db_pagesize;
 		dbp->db_malloc = dbinfo->db_malloc;
+		if (F_ISSET(dbinfo, DB_DUPSORT)) {
+			if (F_ISSET(dbinfo, DB_DUP))
+				dbp->dup_compare = dbinfo->dup_compare == NULL ?
+				    __bam_defcmp : dbinfo->dup_compare;
+			else {
+				__db_err(dbenv, "DB_DUPSORT requires DB_DUP");
+				goto einval;
+			}
+			F_CLR(dbinfo, DB_DUPSORT);
+		}
 	}
 
 	/* Fill in the default file mode. */
@@ -235,6 +247,7 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 		default:
 			goto err;
 		}
+	dbp->byteswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
 
 	/*
 	 * If we have a file name, try and read the first page, figure out
@@ -289,7 +302,7 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 		 * sizes, we limit the default pagesize to 16K.
 		 */
 		if (dbp->pgsize == 0) {
-			if ((ret = __db_ioinfo(real_name,
+			if ((ret = __os_ioinfo(real_name,
 			    fd, NULL, NULL, &iopsize)) != 0) {
 				__db_err(dbenv,
 				    "%s: %s", real_name, strerror(ret));
@@ -299,6 +312,14 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 				iopsize = 512;
 			if (iopsize > 16 * 1024)
 				iopsize = 16 * 1024;
+
+			/*
+			 * Sheer paranoia, but we don't want anything that's
+			 * not a power-of-2, as we rely on that for alignment
+			 * of various types on the pages.
+			 */
+			DB_ROUNDOFF(iopsize, 512);
+
 			dbp->pgsize = iopsize;
 			F_SET(dbp, DB_AM_PGDEF);
 		}
@@ -308,11 +329,11 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 		 * that the meta-data for all access methods fits in 512
 		 * bytes, and that no database will be smaller than that.
 		 */
-		if ((ret = __db_read(fd, mbuf, sizeof(mbuf), &nr)) != 0)
+		if ((ret = __os_read(fd, mbuf, sizeof(mbuf), &nr)) != 0)
 			goto err;
 
 		/* The fd is no longer needed. */
-		(void)__db_close(fd);
+		(void)__os_close(fd);
 		fd = -1;
 
 		if (nr != sizeof(mbuf)) {
@@ -337,7 +358,7 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 			 */
 			if (retry_cnt++ < 3 &&
 			    !LF_ISSET(DB_CREATE | DB_TRUNCATE)) {
-				__db_sleep(1, 0);
+				__os_sleep(1, 0);
 				goto open_retry;
 			}
 			if (type == DB_UNKNOWN) {
@@ -396,7 +417,7 @@ retry:		switch (((BTMETA *)mbuf)->magic) {
 
 			/* Copy the file's unique id. */
 			need_fileid = 0;
-			memcpy(dbp->lock.fileid, btm->uid, DB_FILE_ID_LEN);
+			memcpy(dbp->fileid, btm->uid, DB_FILE_ID_LEN);
 			break;
 		case DB_HASHMAGIC:
 			if (type != DB_HASH && type != DB_UNKNOWN)
@@ -425,7 +446,7 @@ retry:		switch (((BTMETA *)mbuf)->magic) {
 
 			/* Copy the file's unique id. */
 			need_fileid = 0;
-			memcpy(dbp->lock.fileid, hashm->uid, DB_FILE_ID_LEN);
+			memcpy(dbp->fileid, hashm->uid, DB_FILE_ID_LEN);
 			break;
 		default:
 			if (swapped) {
@@ -489,11 +510,9 @@ empty:	/*
 		F_SET(dbp, DB_AM_MLOCAL);
 
 		if (dbenv == NULL) {
-			if ((dbp->mp_dbenv =
-			    (DB_ENV *)__db_calloc(sizeof(DB_ENV), 1)) == NULL) {
-				ret = ENOMEM;
+			if ((ret = __os_calloc(1,
+			    sizeof(DB_ENV), &dbp->mp_dbenv)) != 0)
 				goto err;
-			}
 
 			envp = dbp->mp_dbenv;
 			restore = 0;
@@ -554,20 +573,20 @@ empty:	/*
 	 */
 	if (need_fileid) {
 		if (fname == NULL) {
-			memset(dbp->lock.fileid, 0, DB_FILE_ID_LEN);
+			memset(dbp->fileid, 0, DB_FILE_ID_LEN);
 			if (F_ISSET(dbp, DB_AM_LOCKING) &&
 			    (ret = lock_id(dbenv->lk_info,
-			    (u_int32_t *)dbp->lock.fileid)) != 0)
+			    (u_int32_t *)dbp->fileid)) != 0)
 				goto err;
 		} else
-			if ((ret = __db_fileid(dbenv,
-			    real_name, 1, dbp->lock.fileid)) != 0)
+			if ((ret = __os_fileid(dbenv,
+			    real_name, 1, dbp->fileid)) != 0)
 				goto err;
 	}
 
 	/* No further use for the real name. */
 	if (real_name != NULL)
-		FREES(real_name);
+		__os_freestr(real_name);
 	real_name = NULL;
 
 	/*
@@ -595,7 +614,7 @@ empty:	/*
 	memset(&finfo, 0, sizeof(finfo));
 	finfo.ftype = ftype;
 	finfo.pgcookie = &pgcookie;
-	finfo.fileid = dbp->lock.fileid;
+	finfo.fileid = dbp->fileid;
 	finfo.lsn_offset = 0;
 	finfo.clear_len = DB_PAGE_CLEAR_LEN;
 	if ((ret = memp_fopen(dbp->mp, fname,
@@ -605,12 +624,21 @@ empty:	/*
 
 	/*
 	 * XXX
-	 * Truly spectacular layering violation.  We need a per-thread mutex
-	 * that lives in shared memory (thanks, HP-UX!) and so we acquire a
-	 * pointer to the mpool one.
+	 * We need a per-thread mutex that lives in shared memory -- HP-UX
+	 * can't allocate mutexes in malloc'd memory.  Allocate it from the
+	 * mpool's shared memory region, since that is the only region that
+	 * is guaranteed to exist.
 	 */
-	if (F_ISSET(dbp, DB_AM_THREAD))
-		dbp->mutexp = dbp->mpf->mutexp;
+	if (F_ISSET(dbp, DB_AM_THREAD)) {
+		if ((ret = __memp_reg_alloc(dbp->mp,
+		    sizeof(db_mutex_t), NULL, &dbp->mutexp)) != 0)
+			goto err;
+		/*
+		 * Since we only get here if DB_THREAD was specified, we know
+		 * we have spinlocks and no file offset argument is needed.
+		 */
+		(void)__db_mutex_init(dbp->mutexp, 0);
+	}
 
 	/* Get a log file id. */
 	if (F_ISSET(dbp, DB_AM_LOGGING) &&
@@ -618,18 +646,6 @@ empty:	/*
 	    dbp, fname, type, &dbp->log_fileid)) != 0)
 		goto err;
 
-	/*
-	 * Get a locker id for this DB, and build the lock cookie: the first
-	 * db_pgno_t bytes are the page number, the next N bytes are the file
-	 * id.
-	 */
-	if (F_ISSET(dbp, DB_AM_LOCKING)) {
-		if ((ret = lock_id(dbenv->lk_info, &dbp->locker)) != 0)
-			goto err;
-		dbp->lock_dbt.size = sizeof(dbp->lock);
-		dbp->lock_dbt.data = &dbp->lock;
-	}
-
 	/* Call the real open function. */
 	switch (type) {
 	case DB_BTREE:
@@ -639,7 +655,7 @@ empty:	/*
 		if (dbinfo != NULL && (ret = __db_fcchk(dbenv,
 		    "db_open", dbinfo->flags, DB_DUP, DB_RECNUM)) != 0)
 			goto err;
-		if ((ret = __bam_open(dbp, type, dbinfo)) != 0)
+		if ((ret = __bam_open(dbp, dbinfo)) != 0)
 			goto err;
 		break;
 	case DB_HASH:
@@ -655,24 +671,20 @@ empty:	/*
 		if (dbinfo != NULL && (ret = __db_fchk(dbenv,
 		    "db_open", dbinfo->flags, DB_INFO_FLAGS)) != 0)
 			goto err;
-		if ((ret = __ram_open(dbp, type, dbinfo)) != 0)
+		if ((ret = __ram_open(dbp, dbinfo)) != 0)
 			goto err;
 		break;
 	default:
 		abort();
 	}
 
-	/* Call a local close routine. */
-	dbp->close = db_close;
-	dbp->fd = db_fd;
-
 	*dbpp = dbp;
 	return (0);
 
 einval:	ret = EINVAL;
 err:	/* Close the file descriptor. */
 	if (fd != -1)
-		(void)__db_close(fd);
+		(void)__os_close(fd);
 
 	/* Discard the log file id. */
 	if (dbp->log_fileid != 0)
@@ -688,90 +700,60 @@ err:	/* Close the file descriptor. */
 
 	/* If we allocated a DB_ENV, discard it. */
 	if (dbp->mp_dbenv != NULL)
-		FREE(dbp->mp_dbenv, sizeof(DB_ENV));
+		__os_free(dbp->mp_dbenv, sizeof(DB_ENV));
 
 	if (real_name != NULL)
-		FREES(real_name);
+		__os_freestr(real_name);
 	if (dbp != NULL)
-		FREE(dbp, sizeof(DB));
+		__os_free(dbp, sizeof(DB));
 
 	return (ret);
 }
 
-#ifdef _LIBC
-# undef db_open
-weak_alias (__nss_db_open, db_open)
-#endif
-
 /*
- * db_close --
+ * __db_close --
  *	Close a DB tree.
+ *
+ * PUBLIC: int __db_close __P((DB *, u_int32_t));
  */
-static int
-db_close(dbp, flags)
+int
+__db_close(dbp, flags)
 	DB *dbp;
 	u_int32_t flags;
 {
 	DBC *dbc;
-	DB *tdbp;
 	int ret, t_ret;
 
+	DB_PANIC_CHECK(dbp);
+
 	/* Validate arguments. */
-	if ((ret = __db_fchk(dbp->dbenv, "db_close", flags, DB_NOSYNC)) != 0)
+	if ((ret = __db_closechk(dbp, flags)) != 0)
 		return (ret);
 
 	/* Sync the underlying file. */
-	if (!LF_ISSET(DB_NOSYNC) &&
+	if (flags != DB_NOSYNC &&
 	    (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0)
 		ret = t_ret;
 
 	/*
-	 * Call the underlying access method close routine for all the
-	 * cursors and handles.
+	 * Go through the active cursors and call the cursor recycle routine,
+	 * which resolves pending operations and moves the cursors onto the
+	 * free list.  Then, walk the free list and call the cursor destroy
+	 * routine.
 	 */
-	for (tdbp = LIST_FIRST(&dbp->handleq);
-	    tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) {
-		while ((dbc = TAILQ_FIRST(&tdbp->curs_queue)) != NULL)
-			switch (tdbp->type) {
-			case DB_BTREE:
-				if ((t_ret =
-				    __bam_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			case DB_HASH:
-				if ((t_ret =
-				    __ham_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			case DB_RECNO:
-				if ((t_ret =
-				    __ram_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			default:
-				abort();
-			}
-
-		switch (tdbp->type) {
-		case DB_BTREE:
-			if ((t_ret = __bam_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		case DB_HASH:
-			if ((t_ret = __ham_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		case DB_RECNO:
-			if ((t_ret = __ram_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		default:
-			abort();
-		}
-	}
+	while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+		if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+	/* Call the access-method-specific close function. */
+	if ((t_ret = dbp->am_close(dbp)) != 0 && ret == 0)
+		ret = t_ret;
 
 	/* Sync the memory pool. */
-	if (!LF_ISSET(DB_NOSYNC) && (t_ret = memp_fsync(dbp->mpf)) != 0 &&
+	if (flags != DB_NOSYNC && (t_ret = memp_fsync(dbp->mpf)) != 0 &&
 	    t_ret != DB_INCOMPLETE && ret == 0)
 		ret = t_ret;
 
@@ -788,91 +770,12 @@ db_close(dbp, flags)
 	if (F_ISSET(dbp, DB_AM_LOGGING))
 		(void)log_unregister(dbp->dbenv->lg_info, dbp->log_fileid);
 
-	/* Discard the lock cookie for all handles. */
-	for (tdbp = LIST_FIRST(&dbp->handleq);
-	    tdbp != NULL; tdbp = LIST_NEXT(tdbp, links))
-		if (F_ISSET(tdbp, DB_AM_LOCKING)) {
-#ifdef DEBUG
-			DB_LOCKREQ request;
-
-			/*
-			 * If we're running tests, display any locks currently
-			 * held.  It's possible that some applications may hold
-			 * locks for long periods, e.g., conference room locks,
-			 * but the DB tests should never close holding locks.
-			 */
-			request.op = DB_LOCK_DUMP;
-			if ((t_ret = lock_vec(tdbp->dbenv->lk_info,
-			    tdbp->locker, 0, &request, 1, NULL)) != 0 &&
-			    ret == 0)
-				ret = EAGAIN;
-#endif
-		}
-
 	/* If we allocated a DB_ENV, discard it. */
 	if (dbp->mp_dbenv != NULL)
-		FREE(dbp->mp_dbenv, sizeof(DB_ENV));
+		__os_free(dbp->mp_dbenv, sizeof(DB_ENV));
 
-	/* Free all of the DB's. */
-	LIST_REMOVE(dbp, links);
-	while ((tdbp = LIST_FIRST(&dbp->handleq)) != NULL) {
-		LIST_REMOVE(tdbp, links);
-		FREE(tdbp, sizeof(*tdbp));
-	}
-	FREE(dbp, sizeof(*dbp));
+	/* Free the DB. */
+	__os_free(dbp, sizeof(*dbp));
 
 	return (ret);
 }
-
-/*
- * db_fd --
- *	Return a file descriptor for flock'ing.
- */
-static int
-db_fd(dbp, fdp)
-        DB *dbp;
-	int *fdp;
-{
-	/*
-	 * XXX
-	 * Truly spectacular layering violation.
-	 */
-	return (__mp_xxx_fd(dbp->mpf, fdp));
-}
-
-/*
- * __db_pgerr --
- *	Error when unable to retrieve a specified page.
- *
- * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t));
- */
-int
-__db_pgerr(dbp, pgno)
-	DB *dbp;
-	db_pgno_t pgno;
-{
-	/*
-	 * Three things are certain:
-	 * Death, taxes, and lost data.
-	 * Guess which has occurred.
-	 */
-	__db_err(dbp->dbenv,
-	    "unable to create/retrieve page %lu", (u_long)pgno);
-	return (__db_panic(dbp));
-}
-
-/*
- * __db_pgfmt --
- *	Error when a page has the wrong format.
- *
- * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t));
- */
-int
-__db_pgfmt(dbp, pgno)
-	DB *dbp;
-	db_pgno_t pgno;
-{
-	__db_err(dbp->dbenv,
-	    "page %lu: illegal page type or format", (u_long)pgno);
-	return (__db_panic(dbp));
-}
diff --git a/db2/db/db.src b/db2/db/db.src
index 91d8b390a1..26557e10ac 100644
--- a/db2/db/db.src
+++ b/db2/db/db.src
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db.src	10.6 (Sleepycat) 4/28/98
+ *	@(#)db.src	10.8 (Sleepycat) 9/20/98
  */
 
 PREFIX	db
@@ -98,6 +98,7 @@ END
 /*
  * relink -- Handles relinking around a page.
  *
+ * opcode:	indicates if this is an addpage or delete page
  * pgno:	the page being changed.
  * lsn		the page's original lsn.
  * prev:	the previous page.
@@ -106,6 +107,7 @@ END
  * lsn_next:	the previous page's original lsn.
  */
 BEGIN relink
+ARG	opcode		u_int32_t	lu
 ARG	fileid		u_int32_t	lu
 ARG	pgno		db_pgno_t	lu
 POINTER	lsn		DB_LSN *	lu
@@ -148,12 +150,3 @@ DBT	key		DBT		s
 DBT	data		DBT		s
 ARG	arg_flags	u_int32_t	lu
 END
-
-/*
- * noop -- do nothing, but get an LSN.
- */
-BEGIN noop
-ARG	fileid		u_int32_t	lu
-ARG	pgno		db_pgno_t	lu
-POINTER	prevlsn		DB_LSN *	lu
-END
diff --git a/db2/db/db_am.c b/db2/db/db_am.c
new file mode 100644
index 0000000000..e02ad57f53
--- /dev/null
+++ b/db2/db/db_am.c
@@ -0,0 +1,430 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_am.c	10.15 (Sleepycat) 12/30/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "btree.h"
+#include "hash.h"
+#include "db_am.h"
+#include "db_ext.h"
+
+static int __db_c_close __P((DBC *));
+static int __db_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t));
+static int __db_fd __P((DB *, int *));
+static int __db_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __db_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+
+/*
+ * __db_init_wrapper --
+ *	Fill in the DB handle methods that are shared by all access methods.
+ *
+ * PUBLIC: int __db_init_wrapper __P((DB *));
+ */
+int
+__db_init_wrapper(dbp)
+	DB *dbp;
+{
+	/* !!! del and stat have no generic form; the access method sets them. */
+	dbp->stat = NULL;
+	dbp->del = NULL;
+	dbp->sync = __db_sync;
+	dbp->put = __db_put;
+	dbp->join = __db_join;
+	dbp->get = __db_get;
+	dbp->fd = __db_fd;
+	dbp->cursor = __db_cursor;
+	dbp->close = __db_close;
+	return (0);
+}
+
+/*
+ * __db_cursor --
+ *	Hand back a cursor in *dbcp, recycled from free_queue or newly built.
+ */
+static int
+__db_cursor(dbp, txn, dbcp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBC **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc, *adbc;
+	int ret;
+	db_lockmode_t mode;
+	u_int32_t op;
+
+	DB_PANIC_CHECK(dbp);
+
+	/* Take one from the free list if it's available. */
+	DB_THREAD_LOCK(dbp);
+	if ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+		TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+	else {
+		DB_THREAD_UNLOCK(dbp);
+		/* Nothing to recycle; allocate with the lock released. */
+		if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0)
+			return (ret);
+
+		dbc->dbp = dbp;
+		dbc->c_close = __db_c_close;
+
+		/* Set up locking information. */
+		if (F_ISSET(dbp, DB_AM_LOCKING | DB_AM_CDB)) {
+ 			/*
+ 			 * If we are not threaded, then there is no need to
+ 			 * create new locker ids.  We know that no one else
+ 			 * is running concurrently using this DB, so we can
+ 			 * take a peek at any cursors on the active queue.
+ 			 */
+ 			if (!F_ISSET(dbp, DB_AM_THREAD) &&
+ 			    (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+ 				dbc->lid = adbc->lid;
+ 			else
+ 				if ((ret = lock_id(dbp->dbenv->lk_info,
+ 				    &dbc->lid)) != 0)
+ 					goto err;
+ 
+			memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN);
+			if (F_ISSET(dbp, DB_AM_CDB)) {
+				dbc->lock_dbt.size = DB_FILE_ID_LEN;
+				dbc->lock_dbt.data = dbc->lock.fileid;
+			} else {
+				dbc->lock_dbt.size = sizeof(dbc->lock);
+				dbc->lock_dbt.data = &dbc->lock;
+			}
+		}
+		/* Access method specific initialization, chosen by type. */
+		switch (dbp->type) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bam_c_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __ham_c_init(dbc)) != 0)
+				goto err;
+			break;
+		default:
+			ret = EINVAL;
+			goto err;
+		}
+		/* Retake the lock; both branches leave here holding it. */
+		DB_THREAD_LOCK(dbp);
+	}
+	/* Lock as the transaction if one was given, else as the cursor. */
+	if ((dbc->txn = txn) == NULL)
+		dbc->locker = dbc->lid;
+	else
+		dbc->locker = txn->txnid;
+
+	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/*
+	 * If this is the concurrent DB product, then we do all locking
+	 * in the interface, which is right here.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		op = LF_ISSET(DB_OPFLAGS_MASK);
+		mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE :
+		    (LF_ISSET(DB_RMW) ? DB_LOCK_IWRITE : DB_LOCK_READ);
+		if ((ret = lock_get(dbp->dbenv->lk_info, dbc->locker, 0,
+		    &dbc->lock_dbt, mode, &dbc->mylock)) != 0) {
+			(void)__db_c_close(dbc);
+			return (EAGAIN);	/* NOTE(review): ret is dropped. */
+		}
+		if (LF_ISSET(DB_RMW))
+			F_SET(dbc, DBC_RMW);
+		if (op == DB_WRITELOCK)
+			F_SET(dbc, DBC_WRITER);
+	}
+
+	*dbcp = dbc;
+	return (0);
+	/* Reached only from the allocation branch, with the lock released. */
+err:	__os_free(dbc, sizeof(*dbc));
+	return (ret);
+}
+
+/*
+ * __db_c_close --
+ *	Close the cursor (recycle for later use).  Returns 0 or the first
+ *	error reported while closing.
+ */
+static int
+__db_c_close(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+
+	DB_PANIC_CHECK(dbp);
+	ret = 0;
+
+	/*
+	 * Remove the cursor from the active queue first.  We cannot release
+	 * the lock until after we've called the access method specific
+	 * routine, since btrees may have pending deletes.
+	 */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/* Call the access specific cursor close routine; keep its error. */
+	if ((t_ret = dbc->c_am_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release the lock, preserving any earlier error. */
+	if (F_ISSET(dbp, DB_AM_CDB) && dbc->mylock != LOCK_INVALID) {
+		if ((t_ret = lock_put(dbp->dbenv->lk_info,
+		    dbc->mylock)) != 0 && ret == 0)
+			ret = t_ret;
+		dbc->mylock = LOCK_INVALID;
+	}
+
+	/* Clean up the cursor. */
+	dbc->flags = 0;
+
+#ifdef DEBUG
+	/*
+	 * Check for leftover locks, unless we're running with transactions.
+	 *
+	 * If we're running tests, display any locks currently held.  It's
+	 * possible that some applications may hold locks for long periods,
+	 * e.g., conference room locks, but the DB tests should never close
+	 * holding locks.
+	 */
+	if (F_ISSET(dbp, DB_AM_LOCKING) && dbc->lid == dbc->locker) {
+		DB_LOCKREQ request;
+
+		request.op = DB_LOCK_DUMP;
+		if ((t_ret = lock_vec(dbp->dbenv->lk_info,
+		    dbc->locker, 0, &request, 1, NULL)) != 0 && ret == 0)
+			ret = EAGAIN;
+	}
+#endif
+	/* Move the cursor to the free queue. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __db_cprint --
+ *	Display the current cursor list on stderr, one active cursor per line.
+ *
+ * PUBLIC: int __db_cprint __P((DB *));
+ */
+int
+__db_cprint(dbp)
+	DB *dbp;
+{
+	static const FN fn[] = {
+		{ DBC_RECOVER, 	"recover" },
+		{ DBC_RMW, 	"read-modify-write" },
+		{ 0 },
+	};
+	DBC *dbc;
+
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		/* Use %p: a (u_int) cast truncates pointers wider than int. */
+		fprintf(stderr,
+		    "%p: dbp: %p txn: %p lid: %lu locker: %lu",
+		    (void *)dbc, (void *)dbc->dbp, (void *)dbc->txn,
+		    (u_long)dbc->lid, (u_long)dbc->locker);
+		__db_prflags(dbc->flags, fn, stderr);
+		fprintf(stderr, "\n");
+	}
+	DB_THREAD_UNLOCK(dbp);
+	return (0);
+}
+#endif /* DEBUG */
+
+/*
+ * __db_c_destroy --
+ *	Destroy a cursor on the free queue; returns 0 or c_am_destroy's error.
+ *
+ * PUBLIC: int __db_c_destroy __P((DBC *));
+ */
+int
+__db_c_destroy(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	int ret;
+
+	dbp = dbc->dbp;
+
+	/* Remove the cursor from the free queue. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/* Call the access specific cursor destroy routine, if one is set. */
+	ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc);
+
+	/* Free up allocated memory, even when that routine failed. */
+	if (dbc->rkey.data != NULL)
+		__os_free(dbc->rkey.data, dbc->rkey.ulen);
+	if (dbc->rdata.data != NULL)
+		__os_free(dbc->rdata.data, dbc->rdata.ulen);
+	__os_free(dbc, sizeof(*dbc));
+
+	return (ret);
+}
+
+/*
+ * __db_fd --
+ *	Return a file descriptor for flock'ing.
+ */
+static int
+__db_fd(dbp, fdp)
+	DB *dbp;
+	int *fdp;
+{
+	int ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	/* XXX: Truly spectacular layering violation, straight to dbp->mpf. */
+	ret = __mp_xxx_fd(dbp->mpf, fdp);
+	return (ret);
+}
+
+/*
+ * __db_get --
+ *	Return a key/data pair, using a cursor opened and closed in this call.
+ */
+static int
+__db_get(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	int ret, t_ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	ret = __db_getchk(dbp, key, data, flags);
+	if (ret == 0)
+		ret = dbp->cursor(dbp, txn, &dbc, 0);
+	if (ret != 0)
+		return (ret);
+
+	DEBUG_LREAD(dbc, txn, "__db_get", key, NULL, flags);
+
+	/* With no operation flag given, look the key up with DB_SET. */
+	if (flags == 0 || flags == DB_RMW)
+		flags |= DB_SET;
+	ret = dbc->c_get(dbc, key, data, flags);
+
+	t_ret = __db_c_close(dbc);
+	return (ret == 0 ? t_ret : ret);	/* First error wins. */
+}
+
+/*
+ * __db_put --
+ *	Store a key/data pair; DB_NOOVERWRITE gives DB_KEYEXIST on a hit.
+ */
+static int
+__db_put(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT tdata;
+	int ret, t_ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	if ((ret = __db_putchk(dbp, key, data,
+	    flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0)
+		return (ret);
+
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags);
+
+	if (flags == DB_NOOVERWRITE) {
+		/*
+		 * Probe with DB_DBT_USERMEM (threaded flag checks require it)
+		 * and a zero-length partial, since the data itself is unwanted.
+		 * Only a missing or empty key lets the put proceed.
+		 */
+		memset(&tdata, 0, sizeof(tdata));
+		F_SET(&tdata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+		if ((ret = dbc->c_get(dbc, key, &tdata, DB_SET | DB_RMW)) == 0)
+			ret = DB_KEYEXIST;
+		else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY)
+			ret = 0;
+	}
+	if (ret == 0)
+		ret = dbc->c_put(dbc, key, data, DB_KEYLAST);
+
+	if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_sync --
+ *	Write the database's cached pages back to its file.
+ *
+ * PUBLIC: int __db_sync __P((DB *, u_int32_t));
+ */
+int
+__db_sync(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	int ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	ret = __db_syncchk(dbp, flags);
+	if (ret != 0)
+		return (ret);
+
+	/* A file that could not have been modified needs no flush. */
+	if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY))
+		return (0);
+
+	/* Flush dirty pages; DB_INCOMPLETE is not reported as a failure. */
+	ret = memp_fsync(dbp->mpf);
+
+	return (ret == DB_INCOMPLETE ? 0 : ret);
+}
diff --git a/db2/db/db_auto.c b/db2/db/db_auto.c
index 5203e0a94c..e3dba23c8b 100644
--- a/db2/db/db_auto.c
+++ b/db2/db/db_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "db_am.h"
@@ -46,8 +45,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_addrem;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -60,8 +58,8 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size)
 	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
 	    + sizeof(*pagelsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -112,7 +110,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -174,7 +172,7 @@ __db_addrem_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tpagelsn: [%lu][%lu]\n",
 	    (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -188,11 +186,12 @@ __db_addrem_read(recbuf, argpp)
 {
 	__db_addrem_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_addrem_args *)__db_malloc(sizeof(__db_addrem_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_addrem_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -253,8 +252,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_split;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -264,8 +262,8 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(u_int32_t) + (pageimage == NULL ? 0 : pageimage->size)
 	    + sizeof(*pagelsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -302,7 +300,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -353,7 +351,7 @@ __db_split_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tpagelsn: [%lu][%lu]\n",
 	    (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -367,11 +365,12 @@ __db_split_read(recbuf, argpp)
 {
 	__db_split_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_split_args *)__db_malloc(sizeof(__db_split_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_split_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -430,8 +429,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_big;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -445,8 +443,8 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*pagelsn)
 	    + sizeof(*prevlsn)
 	    + sizeof(*nextlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -497,7 +495,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -554,7 +552,7 @@ __db_big_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tnextlsn: [%lu][%lu]\n",
 	    (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -568,11 +566,12 @@ __db_big_read(recbuf, argpp)
 {
 	__db_big_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_big_args *)__db_malloc(sizeof(__db_big_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_big_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -630,8 +629,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_ovref;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -640,8 +638,8 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(adjust)
 	    + sizeof(*lsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -668,7 +666,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -710,7 +708,7 @@ __db_ovref_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tlsn: [%lu][%lu]\n",
 	    (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -724,11 +722,12 @@ __db_ovref_read(recbuf, argpp)
 {
 	__db_ovref_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_ovref_args *)__db_malloc(sizeof(__db_ovref_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_ovref_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -752,16 +751,17 @@ __db_ovref_read(recbuf, argpp)
 /*
  * PUBLIC: int __db_relink_log
  * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
- * PUBLIC:     u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t,
- * PUBLIC:     DB_LSN *, db_pgno_t, DB_LSN *));
+ * PUBLIC:     u_int32_t, u_int32_t, db_pgno_t, DB_LSN *,
+ * PUBLIC:     db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *));
  */
 int __db_relink_log(logp, txnid, ret_lsnp, flags,
-	fileid, pgno, lsn, prev, lsn_prev, next,
-	lsn_next)
+	opcode, fileid, pgno, lsn, prev, lsn_prev,
+	next, lsn_next)
 	DB_LOG *logp;
 	DB_TXN *txnid;
 	DB_LSN *ret_lsnp;
 	u_int32_t flags;
+	u_int32_t opcode;
 	u_int32_t fileid;
 	db_pgno_t pgno;
 	DB_LSN * lsn;
@@ -779,12 +779,12 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_relink;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
 	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(opcode)
 	    + sizeof(fileid)
 	    + sizeof(pgno)
 	    + sizeof(*lsn)
@@ -792,8 +792,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*lsn_prev)
 	    + sizeof(next)
 	    + sizeof(*lsn_next);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -802,6 +802,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	bp += sizeof(txn_num);
 	memcpy(bp, lsnp, sizeof(DB_LSN));
 	bp += sizeof(DB_LSN);
+	memcpy(bp, &opcode, sizeof(opcode));
+	bp += sizeof(opcode);
 	memcpy(bp, &fileid, sizeof(fileid));
 	bp += sizeof(fileid);
 	memcpy(bp, &pgno, sizeof(pgno));
@@ -832,7 +834,7 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -868,6 +870,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3)
 	    (u_long)argp->txnid->txnid,
 	    (u_long)argp->prev_lsn.file,
 	    (u_long)argp->prev_lsn.offset);
+	printf("\topcode: %lu\n", (u_long)argp->opcode);
 	printf("\tfileid: %lu\n", (u_long)argp->fileid);
 	printf("\tpgno: %lu\n", (u_long)argp->pgno);
 	printf("\tlsn: [%lu][%lu]\n",
@@ -879,7 +882,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tlsn_next: [%lu][%lu]\n",
 	    (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -893,11 +896,12 @@ __db_relink_read(recbuf, argpp)
 {
 	__db_relink_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_relink_args *)__db_malloc(sizeof(__db_relink_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_relink_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -906,6 +910,8 @@ __db_relink_read(recbuf, argpp)
 	bp += sizeof(argp->txnid->txnid);
 	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
 	bp += sizeof(DB_LSN);
+	memcpy(&argp->opcode, bp, sizeof(argp->opcode));
+	bp += sizeof(argp->opcode);
 	memcpy(&argp->fileid, bp, sizeof(argp->fileid));
 	bp += sizeof(argp->fileid);
 	memcpy(&argp->pgno, bp, sizeof(argp->pgno));
@@ -951,8 +957,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_addpage;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -962,8 +967,8 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*lsn)
 	    + sizeof(nextpgno)
 	    + sizeof(*nextlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -995,7 +1000,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1039,7 +1044,7 @@ __db_addpage_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tnextlsn: [%lu][%lu]\n",
 	    (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1053,11 +1058,12 @@ __db_addpage_read(recbuf, argpp)
 {
 	__db_addpage_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_addpage_args *)__db_malloc(sizeof(__db_addpage_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_addpage_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1108,8 +1114,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_debug;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1119,8 +1124,8 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (key == NULL ? 0 : key->size)
 	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size)
 	    + sizeof(arg_flags);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1170,7 +1175,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1236,7 +1241,7 @@ __db_debug_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\n");
 	printf("\targ_flags: %lu\n", (u_long)argp->arg_flags);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1250,11 +1255,12 @@ __db_debug_read(recbuf, argpp)
 {
 	__db_debug_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_debug_args *)__db_malloc(sizeof(__db_debug_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_debug_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1284,143 +1290,6 @@ __db_debug_read(recbuf, argpp)
 }
 
 /*
- * PUBLIC: int __db_noop_log
- * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
- * PUBLIC:     u_int32_t, db_pgno_t, DB_LSN *));
- */
-int __db_noop_log(logp, txnid, ret_lsnp, flags,
-	fileid, pgno, prevlsn)
-	DB_LOG *logp;
-	DB_TXN *txnid;
-	DB_LSN *ret_lsnp;
-	u_int32_t flags;
-	u_int32_t fileid;
-	db_pgno_t pgno;
-	DB_LSN * prevlsn;
-{
-	DBT logrec;
-	DB_LSN *lsnp, null_lsn;
-	u_int32_t rectype, txn_num;
-	int ret;
-	u_int8_t *bp;
-
-	rectype = DB_db_noop;
-	txn_num = txnid == NULL ? 0 : txnid->txnid;
-	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
-		lsnp = &null_lsn;
-	} else
-		lsnp = &txnid->last_lsn;
-	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
-	    + sizeof(fileid)
-	    + sizeof(pgno)
-	    + sizeof(*prevlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
-
-	bp = logrec.data;
-	memcpy(bp, &rectype, sizeof(rectype));
-	bp += sizeof(rectype);
-	memcpy(bp, &txn_num, sizeof(txn_num));
-	bp += sizeof(txn_num);
-	memcpy(bp, lsnp, sizeof(DB_LSN));
-	bp += sizeof(DB_LSN);
-	memcpy(bp, &fileid, sizeof(fileid));
-	bp += sizeof(fileid);
-	memcpy(bp, &pgno, sizeof(pgno));
-	bp += sizeof(pgno);
-	if (prevlsn != NULL)
-		memcpy(bp, prevlsn, sizeof(*prevlsn));
-	else
-		memset(bp, 0, sizeof(*prevlsn));
-	bp += sizeof(*prevlsn);
-#ifdef DIAGNOSTIC
-	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
-		fprintf(stderr, "Error in log record length");
-#endif
-	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
-	if (txnid != NULL)
-		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
-	return (ret);
-}
-
-/*
- * PUBLIC: int __db_noop_print
- * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
- */
-int
-__db_noop_print(notused1, dbtp, lsnp, notused2, notused3)
-	DB_LOG *notused1;
-	DBT *dbtp;
-	DB_LSN *lsnp;
-	int notused2;
-	void *notused3;
-{
-	__db_noop_args *argp;
-	u_int32_t i;
-	u_int ch;
-	int ret;
-
-	i = 0;
-	ch = 0;
-	notused1 = NULL;
-	notused2 = 0;
-	notused3 = NULL;
-
-	if ((ret = __db_noop_read(dbtp->data, &argp)) != 0)
-		return (ret);
-	printf("[%lu][%lu]db_noop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
-	    (u_long)lsnp->file,
-	    (u_long)lsnp->offset,
-	    (u_long)argp->type,
-	    (u_long)argp->txnid->txnid,
-	    (u_long)argp->prev_lsn.file,
-	    (u_long)argp->prev_lsn.offset);
-	printf("\tfileid: %lu\n", (u_long)argp->fileid);
-	printf("\tpgno: %lu\n", (u_long)argp->pgno);
-	printf("\tprevlsn: [%lu][%lu]\n",
-	    (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
-	printf("\n");
-	__db_free(argp);
-	return (0);
-}
-
-/*
- * PUBLIC: int __db_noop_read __P((void *, __db_noop_args **));
- */
-int
-__db_noop_read(recbuf, argpp)
-	void *recbuf;
-	__db_noop_args **argpp;
-{
-	__db_noop_args *argp;
-	u_int8_t *bp;
-
-	argp = (__db_noop_args *)__db_malloc(sizeof(__db_noop_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
-	argp->txnid = (DB_TXN *)&argp[1];
-	bp = recbuf;
-	memcpy(&argp->type, bp, sizeof(argp->type));
-	bp += sizeof(argp->type);
-	memcpy(&argp->txnid->txnid,  bp, sizeof(argp->txnid->txnid));
-	bp += sizeof(argp->txnid->txnid);
-	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
-	bp += sizeof(DB_LSN);
-	memcpy(&argp->fileid, bp, sizeof(argp->fileid));
-	bp += sizeof(argp->fileid);
-	memcpy(&argp->pgno, bp, sizeof(argp->pgno));
-	bp += sizeof(argp->pgno);
-	memcpy(&argp->prevlsn, bp,  sizeof(argp->prevlsn));
-	bp += sizeof(argp->prevlsn);
-	*argpp = argp;
-	return (0);
-}
-
-/*
  * PUBLIC: int __db_init_print __P((DB_ENV *));
  */
 int
@@ -1450,9 +1319,6 @@ __db_init_print(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __db_debug_print, DB_db_debug)) != 0)
 		return (ret);
-	if ((ret = __db_add_recovery(dbenv,
-	    __db_noop_print, DB_db_noop)) != 0)
-		return (ret);
 	return (0);
 }
 
@@ -1486,9 +1352,6 @@ __db_init_recover(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __db_debug_recover, DB_db_debug)) != 0)
 		return (ret);
-	if ((ret = __db_add_recovery(dbenv,
-	    __db_noop_recover, DB_db_noop)) != 0)
-		return (ret);
 	return (0);
 }
 
diff --git a/db2/db/db_dispatch.c b/db2/db/db_dispatch.c
index 8645948614..616d08c3ff 100644
--- a/db2/db/db_dispatch.c
+++ b/db2/db/db_dispatch.c
@@ -43,13 +43,14 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_dispatch.c	10.14 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)db_dispatch.c	10.20 (Sleepycat) 10/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <shqueue.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
@@ -61,6 +62,7 @@ static const char sccsid[] = "@(#)db_dispatch.c	10.14 (Sleepycat) 5/3/98";
 #include "db_am.h"
 #include "common_ext.h"
 #include "log_auto.h"
+#include "txn.h"
 #include "txn_auto.h"
 
 /*
@@ -148,27 +150,16 @@ __db_add_recovery(dbenv, func, ndx)
 	u_int32_t ndx;
 {
 	u_int32_t i;
+	int ret;
 
-	/* Check if function is already registered. */
-	if (dispatch_table && ndx < dispatch_size &&
-	    dispatch_table[ndx] != 0 && dispatch_table[ndx] != func)
-		return (DB_REGISTERED);
+	COMPQUIET(dbenv, NULL);		/* !!!: not currently used. */
 
 	/* Check if we have to grow the table. */
 	if (ndx >= dispatch_size) {
-		if (dispatch_table == NULL)
-			dispatch_table = (int (**)
-			 __P((DB_LOG *, DBT *, DB_LSN *, int, void *)))
-			 __db_malloc(DB_user_BEGIN * sizeof(dispatch_table[0]));
-		else
-			dispatch_table = (int (**)
-			    __P((DB_LOG *, DBT *, DB_LSN *, int, void *)))
-			    __db_realloc(dispatch_table, (DB_user_BEGIN +
-			    dispatch_size) * sizeof(dispatch_table[0]));
-		if (dispatch_table == NULL) {
-			__db_err(dbenv, "%s", strerror(ENOMEM));
-			return (ENOMEM);
-		}
+		if ((ret = __os_realloc(&dispatch_table,
+		    (DB_user_BEGIN + dispatch_size) *
+		    sizeof(dispatch_table[0]))) != 0)
+			return (ret);
 		for (i = dispatch_size,
 		    dispatch_size += DB_user_BEGIN; i < dispatch_size; ++i)
 			dispatch_table[i] = NULL;
@@ -189,9 +180,10 @@ __db_txnlist_init(retp)
 	void *retp;
 {
 	DB_TXNHEAD *headp;
+	int ret;
 
-	if ((headp = (DB_TXNHEAD *)__db_malloc(sizeof(DB_TXNHEAD))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(sizeof(DB_TXNHEAD), NULL, &headp)) != 0)
+		return (ret);
 
 	LIST_INIT(&headp->head);
 	headp->maxid = 0;
@@ -214,9 +206,10 @@ __db_txnlist_add(listp, txnid)
 {
 	DB_TXNHEAD *hp;
 	DB_TXNLIST *elp;
+	int ret;
 
-	if ((elp = (DB_TXNLIST *)__db_malloc(sizeof(DB_TXNLIST))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+		return (ret);
 
 	elp->txnid = txnid;
 	hp = (DB_TXNHEAD *)listp;
@@ -269,9 +262,9 @@ __db_txnlist_end(listp)
 	hp = (DB_TXNHEAD *)listp;
 	while ((p = LIST_FIRST(&hp->head)) != LIST_END(&hp->head)) {
 		LIST_REMOVE(p, links);
-		__db_free(p);
+		__os_free(p, 0);
 	}
-	__db_free(listp);
+	__os_free(listp, sizeof(DB_TXNHEAD));
 }
 
 /*
diff --git a/db2/db/db_dup.c b/db2/db/db_dup.c
index 6379fc1729..2673bbcd61 100644
--- a/db2/db/db_dup.c
+++ b/db2/db/db_dup.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_dup.c	10.18 (Sleepycat) 5/31/98";
+static const char sccsid[] = "@(#)db_dup.c	10.35 (Sleepycat) 12/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -23,25 +23,25 @@ static const char sccsid[] = "@(#)db_dup.c	10.18 (Sleepycat) 5/31/98";
 #include "btree.h"
 #include "db_am.h"
 
-static int __db_addpage __P((DB *,
-    PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
-static int __db_dsplit __P((DB *,
-    PAGE **, db_indx_t *, u_int32_t, int (*)(DB *, u_int32_t, PAGE **)));
+static int __db_addpage __P((DBC *,
+    PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **)));
+static int __db_dsplit __P((DBC *,
+    PAGE **, db_indx_t *, u_int32_t, int (*)(DBC *, u_int32_t, PAGE **)));
 
 /*
  * __db_dput --
  *	Put a duplicate item onto a duplicate page at the given index.
  *
- * PUBLIC: int __db_dput __P((DB *,
- * PUBLIC:    DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
+ * PUBLIC: int __db_dput __P((DBC *, DBT *,
+ * PUBLIC:    PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **)));
  */
 int
-__db_dput(dbp, dbt, pp, indxp, newfunc)
-	DB *dbp;
+__db_dput(dbc, dbt, pp, indxp, newfunc)
+	DBC *dbc;
 	DBT *dbt;
 	PAGE **pp;
 	db_indx_t *indxp;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
 	BOVERFLOW bo;
 	DBT *data_dbtp, hdr_dbt, *hdr_dbtp;
@@ -54,10 +54,12 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	 * We need some access method independent threshold for when we put
 	 * a duplicate item onto an overflow page.
 	 */
-	if (dbt->size > 0.25 * dbp->pgsize) {
-		if ((ret = __db_poff(dbp, dbt, &pgno, newfunc)) != 0)
+	if (dbt->size > 0.25 * dbc->dbp->pgsize) {
+		if ((ret = __db_poff(dbc, dbt, &pgno, newfunc)) != 0)
 			return (ret);
+		UMRW(bo.unused1);
 		B_TSET(bo.type, B_OVERFLOW, 0);
+		UMRW(bo.unused2);
 		bo.tlen = dbt->size;
 		bo.pgno = pgno;
 		hdr_dbt.data = &bo;
@@ -75,11 +77,14 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	pagep = *pp;
 	if (size > P_FREESPACE(pagep)) {
 		if (*indxp == NUM_ENT(*pp) && NEXT_PGNO(*pp) == PGNO_INVALID)
-			ret = __db_addpage(dbp, pp, indxp, newfunc);
+			ret = __db_addpage(dbc, pp, indxp, newfunc);
 		else
-			ret = __db_dsplit(dbp, pp, indxp, isize, newfunc);
+			ret = __db_dsplit(dbc, pp, indxp, isize, newfunc);
 		if (ret != 0)
-			/* XXX: Pages not returned to free list. */
+			/*
+			 * XXX
+			 * Pages not returned to free list.
+			 */
 			return (ret);
 		pagep = *pp;
 	}
@@ -88,11 +93,11 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	 * Now, pagep references the page on which to insert and indx is the
 	 * the location to insert.
 	 */
-	if ((ret = __db_pitem(dbp,
+	if ((ret = __db_pitem(dbc,
 	    pagep, (u_int32_t)*indxp, isize, hdr_dbtp, data_dbtp)) != 0)
 		return (ret);
 
-	(void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY);
+	(void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY);
 	return (0);
 }
 
@@ -100,15 +105,15 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
  * __db_drem --
  *	Remove a duplicate at the given index on the given page.
  *
- * PUBLIC: int __db_drem __P((DB *,
- * PUBLIC:    PAGE **, u_int32_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_drem __P((DBC *,
+ * PUBLIC:    PAGE **, u_int32_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_drem(dbp, pp, indx, freefunc)
-	DB *dbp;
+__db_drem(dbc, pp, indx, freefunc)
+	DBC *dbc;
 	PAGE **pp;
 	u_int32_t indx;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
 	PAGE *pagep;
 	int ret;
@@ -117,12 +122,12 @@ __db_drem(dbp, pp, indx, freefunc)
 
 	/* Check if we are freeing a big item. */
 	if (B_TYPE(GET_BKEYDATA(pagep, indx)->type) == B_OVERFLOW) {
-		if ((ret = __db_doff(dbp,
+		if ((ret = __db_doff(dbc,
 		    GET_BOVERFLOW(pagep, indx)->pgno, freefunc)) != 0)
 			return (ret);
-		ret = __db_ditem(dbp, pagep, indx, BOVERFLOW_SIZE);
+		ret = __db_ditem(dbc, pagep, indx, BOVERFLOW_SIZE);
 	} else
-		ret = __db_ditem(dbp, pagep, indx,
+		ret = __db_ditem(dbc, pagep, indx,
 		    BKEYDATA_SIZE(GET_BKEYDATA(pagep, indx)->len));
 	if (ret != 0)
 		return (ret);
@@ -137,12 +142,12 @@ __db_drem(dbp, pp, indx, freefunc)
 		 * !!!
 		 * __db_relink will set the dirty bit for us.
 		 */
-		if ((ret = __db_relink(dbp, pagep, pp, 0)) != 0)
+		if ((ret = __db_relink(dbc, DB_REM_PAGE, pagep, pp, 0)) != 0)
 			return (ret);
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} else
-		(void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY);
+		(void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY);
 
 	return (0);
 }
@@ -151,32 +156,41 @@ __db_drem(dbp, pp, indx, freefunc)
  * __db_dend --
  *	Find the last page in a set of offpage duplicates.
  *
- * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **));
+ * PUBLIC: int __db_dend __P((DBC *, db_pgno_t, PAGE **));
  */
 int
-__db_dend(dbp, pgno, pagep)
-	DB *dbp;
+__db_dend(dbc, pgno, pp)
+	DBC *dbc;
 	db_pgno_t pgno;
-	PAGE **pagep;
+	PAGE **pp;
 {
+	DB *dbp;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * This implements DB_KEYLAST.  The last page is returned in pp; pgno
 	 * should be the page number of the first page of the duplicate chain.
+	 *
+	 * *pp may be non-NULL -- if given a valid page use it.
 	 */
+	if (*pp != NULL)
+		goto started;
 	for (;;) {
-		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
 			return (ret);
 		}
+started:	h = *pp;
+
 		if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
 			break;
-		(void)memp_fput(dbp->mpf, h, 0);
-	}
 
-	*pagep = h;
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+	}
 	return (0);
 }
 
@@ -191,41 +205,44 @@ __db_dend(dbp, pgno, pagep)
  *	the page on which the insert should happen, not yet put.
  */
 static int
-__db_dsplit(dbp, hp, indxp, size, newfunc)
-	DB *dbp;
+__db_dsplit(dbc, hp, indxp, size, newfunc)
+	DBC *dbc;
 	PAGE **hp;
 	db_indx_t *indxp;
 	u_int32_t size;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
 	PAGE *h, *np, *tp;
 	BKEYDATA *bk;
 	DBT page_dbt;
+	DB *dbp;
+	size_t pgsize;
 	db_indx_t halfbytes, i, indx, lastsum, nindex, oindex, s, sum;
-	int did_indx, ret;
+	int did_indx, ret, t_ret;
 
 	h = *hp;
 	indx = *indxp;
+	ret = 0;
+	dbp = dbc->dbp;
+	pgsize = dbp->pgsize;
 
 	/* Create a temporary page to do compaction onto. */
-	if ((tp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL)
-		return (ENOMEM);
-#ifdef DIAGNOSTIC
-	memset(tp, 0xff, dbp->pgsize);
-#endif
+	if ((ret = __os_malloc(pgsize, NULL, &tp)) != 0)
+		return (ret);
+
 	/* Create new page for the split. */
-	if ((ret = newfunc(dbp, P_DUPLICATE, &np)) != 0) {
-		FREE(tp, dbp->pgsize);
+	if ((ret = newfunc(dbc, P_DUPLICATE, &np)) != 0) {
+		__os_free(tp, pgsize);
 		return (ret);
 	}
 
-	P_INIT(np, dbp->pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0,
+	P_INIT(np, pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0,
 	    P_DUPLICATE);
-	P_INIT(tp, dbp->pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0,
+	P_INIT(tp, pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0,
 	    P_DUPLICATE);
 
 	/* Figure out the split point */
-	halfbytes = (dbp->pgsize - HOFFSET(h)) / 2;
+	halfbytes = (pgsize - HOFFSET(h)) / 2;
 	did_indx = 0;
 	for (sum = 0, lastsum = 0, i = 0; i < NUM_ENT(h); i++) {
 		if (i == indx) {
@@ -237,7 +254,6 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 				    (db_indx_t)(sum - halfbytes)) {
 					*hp = np;
 					*indxp = 0;
-					i--;
 				} else
 					*indxp = i;
 				break;
@@ -252,29 +268,28 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 
 		if (lastsum < halfbytes && sum >= halfbytes) {
 			/* We've crossed the halfway point. */
-			if ((db_indx_t)(halfbytes - lastsum) <
-			    (db_indx_t)(sum - halfbytes))
-				i--;
+			if ((db_indx_t)(sum - halfbytes) <
+			    (db_indx_t)(halfbytes - lastsum))
+				i++;
 			break;
 		}
 	}
-
 	/*
 	 * Check if we have set the return values of the index pointer and
 	 * page pointer.
 	 */
 	if (!did_indx) {
 		*hp = np;
-		*indxp = indx - i - 1;
+		*indxp = indx - i;
 	}
 
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		page_dbt.size = dbp->pgsize;
 		page_dbt.data = h;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid,
+		    dbc->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid,
 		    PGNO(h), &page_dbt, &LSN(h))) != 0) {
-			FREE(tp, dbp->pgsize);
+			__os_free(tp, pgsize);
 			return (ret);
 		}
 		LSN(tp) = LSN(h);
@@ -283,12 +298,12 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	/*
 	 * If it's a btree, adjust the cursors.
 	 *
-	 * i is the index of the last element to stay on the page.
+	 * i is the index of the first element to move onto the new page.
 	 */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
-		__bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i + 1, 0);
+	if (dbp->type == DB_BTREE)
+		__bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i, 0);
 
-	for (nindex = 0, oindex = i + 1; oindex < NUM_ENT(h); oindex++) {
+	for (nindex = 0, oindex = i; oindex < NUM_ENT(h); oindex++) {
 		bk = GET_BKEYDATA(h, oindex);
 		if (B_TYPE(bk->type) == B_KEYDATA)
 			s = BKEYDATA_SIZE(bk->len);
@@ -304,7 +319,7 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	 * Now do data compaction by copying the remaining stuff onto the
 	 * temporary page and then copying it back to the real page.
 	 */
-	for (nindex = 0, oindex = 0; oindex <= i; oindex++) {
+	for (nindex = 0, oindex = 0; oindex < i; oindex++) {
 		bk = GET_BKEYDATA(h, oindex);
 		if (B_TYPE(bk->type) == B_KEYDATA)
 			s = BKEYDATA_SIZE(bk->len);
@@ -324,59 +339,73 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	 */
 	memcpy(h, tp, LOFFSET(tp));
 	memcpy((u_int8_t *)h + HOFFSET(tp),
-	    (u_int8_t *)tp + HOFFSET(tp), dbp->pgsize - HOFFSET(tp));
-	FREE(tp, dbp->pgsize);
+	    (u_int8_t *)tp + HOFFSET(tp), pgsize - HOFFSET(tp));
+	__os_free(tp, pgsize);
 
-	if (DB_LOGGING(dbp)) {
-		page_dbt.size = dbp->pgsize;
+	if (DB_LOGGING(dbc)) {
+		/*
+		 * XXX
+		 * If either of these fails, are we leaving pages pinned?
+		 * Yes, but it seems like this happens in error case.
+		 */
+		page_dbt.size = pgsize;
 		page_dbt.data = h;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid,
+		    dbc->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid,
 		    PGNO(h), &page_dbt, &LSN(h))) != 0)
 			return (ret);
 
-		page_dbt.size = dbp->pgsize;
+		page_dbt.size = pgsize;
 		page_dbt.data = np;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid,
+		    dbc->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid,
 		    PGNO(np),  &page_dbt, &LSN(np))) != 0)
 			return (ret);
 	}
 
 	/*
+	 * Finally, if there was a next page after the page being
+	 * split, fix its prev pointer.
+	 */
+	if (np->next_pgno != PGNO_INVALID)
+	    ret = __db_relink(dbc, DB_ADD_PAGE, np, NULL, 1);
+
+	/*
 	 * Figure out if the location we're interested in is on the new
 	 * page, and if so, reset the callers' pointer.  Push the other
 	 * page back to the store.
 	 */
 	if (*hp == h)
-		ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
+		t_ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
 	else
-		ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
+		t_ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
 
-	return (ret);
+	return (ret != 0 ? ret : t_ret);
 }
 
 /*
  * __db_ditem --
  *	Remove an item from a page.
  *
- * PUBLIC:  int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t));
+ * PUBLIC:  int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
  */
 int
-__db_ditem(dbp, pagep, indx, nbytes)
-	DB *dbp;
+__db_ditem(dbc, pagep, indx, nbytes)
+	DBC *dbc;
 	PAGE *pagep;
 	u_int32_t indx, nbytes;
 {
+	DB *dbp;
 	DBT ldbt;
 	db_indx_t cnt, offset;
 	int ret;
 	u_int8_t *from;
 
-	if (DB_LOGGING(dbp)) {
+	dbp = dbc->dbp;
+	if (DB_LOGGING(dbc)) {
 		ldbt.data = P_ENTRY(pagep, indx);
 		ldbt.size = nbytes;
-		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn,
+		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep),
 		    (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
 			return (ret);
@@ -413,7 +442,7 @@ __db_ditem(dbp, pagep, indx, nbytes)
 		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
 
 	/* If it's a btree, adjust the cursors. */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
+	if (dbp->type == DB_BTREE)
 		__bam_ca_di(dbp, PGNO(pagep), indx, -1);
 
 	return (0);
@@ -424,16 +453,17 @@ __db_ditem(dbp, pagep, indx, nbytes)
  *	Put an item on a page.
  *
  * PUBLIC: int __db_pitem
- * PUBLIC:     __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ * PUBLIC:     __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
  */
 int
-__db_pitem(dbp, pagep, indx, nbytes, hdr, data)
-	DB *dbp;
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+	DBC *dbc;
 	PAGE *pagep;
 	u_int32_t indx;
 	u_int32_t nbytes;
 	DBT *hdr, *data;
 {
+	DB *dbp;
 	BKEYDATA bk;
 	DBT thdr;
 	int ret;
@@ -456,8 +486,9 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
 	 * the passed in header sizes must be adjusted for the structure's
 	 * placeholder for the trailing variable-length data field.
 	 */
-	if (DB_LOGGING(dbp))
-		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn,
+	dbp = dbc->dbp;
+	if (DB_LOGGING(dbc))
+		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep),
 		    (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0)
 			return (ret);
@@ -485,7 +516,7 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
 		memcpy(p + hdr->size, data->data, data->size);
 
 	/* If it's a btree, adjust the cursors. */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
+	if (dbp->type == DB_BTREE)
 		__bam_ca_di(dbp, PGNO(pagep), indx, 1);
 
 	return (0);
@@ -495,14 +526,16 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
  * __db_relink --
  *	Relink around a deleted page.
  *
- * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int));
+ * PUBLIC: int __db_relink __P((DBC *, u_int32_t, PAGE *, PAGE **, int));
  */
 int
-__db_relink(dbp, pagep, new_next, needlock)
-	DB *dbp;
+__db_relink(dbc, add_rem, pagep, new_next, needlock)
+	DBC *dbc;
+	u_int32_t add_rem;
 	PAGE *pagep, **new_next;
 	int needlock;
 {
+	DB *dbp;
 	PAGE *np, *pp;
 	DB_LOCK npl, ppl;
 	DB_LSN *nlsnp, *plsnp;
@@ -512,10 +545,15 @@ __db_relink(dbp, pagep, new_next, needlock)
 	np = pp = NULL;
 	npl = ppl = LOCK_INVALID;
 	nlsnp = plsnp = NULL;
+	dbp = dbc->dbp;
 
-	/* Retrieve and lock the two pages. */
+	/*
+	 * Retrieve and lock the one/two pages.  For a remove, we may need
+	 * two pages (the before and after).  For an add, we only need one
+	 * because the split took care of the prev.
+	 */
 	if (pagep->next_pgno != PGNO_INVALID) {
-		if (needlock && (ret = __bam_lget(dbp,
+		if (needlock && (ret = __bam_lget(dbc,
 		    0, pagep->next_pgno, DB_LOCK_WRITE, &npl)) != 0)
 			goto err;
 		if ((ret = memp_fget(dbp->mpf,
@@ -525,8 +563,8 @@ __db_relink(dbp, pagep, new_next, needlock)
 		}
 		nlsnp = &np->lsn;
 	}
-	if (pagep->prev_pgno != PGNO_INVALID) {
-		if (needlock && (ret = __bam_lget(dbp,
+	if (add_rem == DB_REM_PAGE && pagep->prev_pgno != PGNO_INVALID) {
+		if (needlock && (ret = __bam_lget(dbc,
 		    0, pagep->prev_pgno, DB_LOCK_WRITE, &ppl)) != 0)
 			goto err;
 		if ((ret = memp_fget(dbp->mpf,
@@ -538,9 +576,10 @@ __db_relink(dbp, pagep, new_next, needlock)
 	}
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
-		if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbp->txn,
-		    &pagep->lsn, 0, dbp->log_fileid, pagep->pgno, &pagep->lsn,
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbc->txn,
+		    &pagep->lsn, 0, add_rem, dbp->log_fileid,
+		    pagep->pgno, &pagep->lsn,
 		    pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0)
 			goto err;
 		if (np != NULL)
@@ -558,7 +597,10 @@ __db_relink(dbp, pagep, new_next, needlock)
 	 * set to NULL.
 	 */
 	if (np != NULL) {
-		np->prev_pgno = pagep->prev_pgno;
+		if (add_rem == DB_ADD_PAGE)
+			np->prev_pgno = pagep->pgno;
+		else
+			np->prev_pgno = pagep->prev_pgno;
 		if (new_next == NULL)
 			ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
 		else {
@@ -568,7 +610,7 @@ __db_relink(dbp, pagep, new_next, needlock)
 		if (ret != 0)
 			goto err;
 		if (needlock)
-			(void)__bam_lput(dbp, npl);
+			(void)__bam_lput(dbc, npl);
 	} else if (new_next != NULL)
 		*new_next = NULL;
 
@@ -577,18 +619,18 @@ __db_relink(dbp, pagep, new_next, needlock)
 		if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0)
 			goto err;
 		if (needlock)
-			(void)__bam_lput(dbp, ppl);
+			(void)__bam_lput(dbc, ppl);
 	}
 	return (0);
 
 err:	if (np != NULL)
 		(void)memp_fput(dbp->mpf, np, 0);
 	if (needlock && npl != LOCK_INVALID)
-		(void)__bam_lput(dbp, npl);
+		(void)__bam_lput(dbc, npl);
 	if (pp != NULL)
 		(void)memp_fput(dbp->mpf, pp, 0);
 	if (needlock && ppl != LOCK_INVALID)
-		(void)__bam_lput(dbp, ppl);
+		(void)__bam_lput(dbc, ppl);
 	return (ret);
 }
 
@@ -596,34 +638,37 @@ err:	if (np != NULL)
  * __db_ddup --
  *	Delete an offpage chain of duplicates.
  *
- * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_ddup __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_ddup(dbp, pgno, freefunc)
-	DB *dbp;
+__db_ddup(dbc, pgno, freefunc)
+	DBC *dbc;
 	db_pgno_t pgno;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
+	DB *dbp;
 	PAGE *pagep;
 	DBT tmp_dbt;
 	int ret;
 
+	dbp = dbc->dbp;
 	do {
 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
 			return (ret);
 		}
 
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = pagep;
 			tmp_dbt.size = dbp->pgsize;
-			if ((ret = __db_split_log(dbp->dbenv->lg_info, dbp->txn,
-			    &LSN(pagep), 0, DB_SPLITOLD, dbp->log_fileid,
-			    PGNO(pagep), &tmp_dbt, &LSN(pagep))) != 0)
+			if ((ret = __db_split_log(dbp->dbenv->lg_info,
+			    dbc->txn, &LSN(pagep), 0, DB_SPLITOLD,
+			    dbp->log_fileid, PGNO(pagep), &tmp_dbt,
+			    &LSN(pagep))) != 0)
 				return (ret);
 		}
 		pgno = pagep->next_pgno;
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} while (pgno != PGNO_INVALID);
 
@@ -636,21 +681,23 @@ __db_ddup(dbp, pgno, freefunc)
  *	current page.
  */
 static int
-__db_addpage(dbp, hp, indxp, newfunc)
-	DB *dbp;
+__db_addpage(dbc, hp, indxp, newfunc)
+	DBC *dbc;
 	PAGE **hp;
 	db_indx_t *indxp;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
+	DB *dbp;
 	PAGE *newpage;
 	int ret;
 
-	if ((ret = newfunc(dbp, P_DUPLICATE, &newpage)) != 0)
+	dbp = dbc->dbp;
+	if ((ret = newfunc(dbc, P_DUPLICATE, &newpage)) != 0)
 		return (ret);
 
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		if ((ret = __db_addpage_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(*hp), 0, dbp->log_fileid,
+		    dbc->txn, &LSN(*hp), 0, dbp->log_fileid,
 		    PGNO(*hp), &LSN(*hp), PGNO(newpage), &LSN(newpage))) != 0) {
 			return (ret);
 		}
@@ -666,3 +713,235 @@ __db_addpage(dbp, hp, indxp, newfunc)
 	*indxp = 0;
 	return (0);
 }
+
+/*
+ * __db_dsearch --
+ *	Search a set of duplicates for the proper position for a new duplicate.
+ *
+ *	+ pgno is the page number of the page on which to begin searching.
+ * 	  Since we can continue duplicate searches, it might not be the first
+ * 	  page.
+ *
+ * 	+ If we are continuing a search, then *pp may be non-NULL in which
+ * 	  case we do not have to retrieve the page.
+ *
+ *	+ If we are continuing a search, then *indxp contains the index
+ * 	  on pgno at which we should begin the search.
+ *
+ * 	NOTE: if there is no comparison function, then continuing is
+ * 	meaningless, and *pp should always be NULL and *indxp will be
+ *	ignored.
+ *
+ *	3 return values:
+ *
+ *	+ pp is the returned page pointer of where this element should go.
+ *	+ indxp is the returned index on that page
+ *	+ cmpp is the returned final comparison result.
+ *
+ * PUBLIC: int __db_dsearch __P((DBC *,
+ * PUBLIC:     int, DBT *, db_pgno_t, db_indx_t *, PAGE **, int *));
+ */
+int
+__db_dsearch(dbc, is_insert, dbt, pgno, indxp, pp, cmpp)
+	DBC *dbc;
+	int is_insert, *cmpp;
+	DBT *dbt;
+	db_pgno_t pgno;
+	db_indx_t *indxp;
+	PAGE **pp;
+{
+	DB *dbp;
+	PAGE *h;
+	db_indx_t base, indx, lim, save_indx;
+	db_pgno_t save_pgno;
+	int ret;
+
+	dbp = dbc->dbp;
+
+	if (dbp->dup_compare == NULL) {
+		/*
+		 * We may have been given a valid page, but we may not be
+		 * able to use it.  The problem is that the application is
+		 * doing a join and we're trying to continue the search,
+		 * but since the items aren't sorted, we can't.  Discard
+		 * the page if it's not the one we're going to start with
+		 * anyway.
+		 */
+		if (*pp != NULL && (*pp)->pgno != pgno) {
+			if ((ret = memp_fput(dbp->mpf, *pp, 0)) != 0)
+				return (ret);
+			*pp = NULL;
+		}
+
+		/*
+		 * If no duplicate function is specified, just go to the end
+		 * of the duplicate set.
+		 */
+		if (is_insert) {
+			if ((ret = __db_dend(dbc, pgno, pp)) != 0)
+				return (ret);
+			*indxp = NUM_ENT(*pp);
+			return (0);
+		}
+
+		/*
+		 * We are looking for a specific duplicate, so do a linear
+		 * search.
+		 */
+		if (*pp != NULL)
+			goto nocmp_started;
+		for (;;) {
+			if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+				goto pg_err;
+nocmp_started:		h = *pp;
+
+			for (*indxp = 0; *indxp < NUM_ENT(h); ++*indxp) {
+				if ((*cmpp = __bam_cmp(dbp,
+				    dbt, h, *indxp, __bam_defcmp)) != 0)
+					continue;
+				/*
+				 * The duplicate may have already been deleted,
+				 * if it's a btree page, in which case we skip
+				 * it.
+				 */
+				if (dbp->type == DB_BTREE &&
+				    B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+					continue;
+
+				return (0);
+			}
+
+			if ((pgno = h->next_pgno) == PGNO_INVALID)
+				break;
+
+			if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+				return (ret);
+		}
+		*cmpp = 1;			/* We didn't succeed... */
+		return (0);
+	}
+
+	/*
+	 * We have a comparison routine, i.e., the duplicates are sorted.
+	 * Walk through the chain of duplicates, checking the last entry
+	 * on each page to decide if it's the page we want to search.
+	 *
+	 * *pp may be non-NULL -- if we were given a valid page (e.g., are
+	 * in mid-search), then use the provided page.
+	 */
+	if (*pp != NULL)
+		goto cmp_started;
+	for (;;) {
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+cmp_started:	h = *pp;
+
+		if ((pgno = h->next_pgno) == PGNO_INVALID || __bam_cmp(dbp,
+		    dbt, h, h->entries - 1, dbp->dup_compare) <= 0)
+			break;
+		/*
+		 * Even when continuing a search, make sure we don't skip
+		 * entries on a new page
+		 */
+		*indxp = 0;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+	}
+
+	/* Next, do a binary search on the page. */
+	base = F_ISSET(dbc, DBC_CONTINUE) ? *indxp : 0;
+	for (lim = NUM_ENT(h) - base; lim != 0; lim >>= 1) {
+		indx = base + (lim >> 1);
+		if ((*cmpp = __bam_cmp(dbp,
+		    dbt, h, indx, dbp->dup_compare)) == 0) {
+			*indxp = indx;
+
+			if (dbp->type != DB_BTREE ||
+			    !B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+			goto check_delete;
+		}
+		if (*cmpp > 0) {
+			base = indx + 1;
+			lim--;
+		}
+	}
+
+	/*
+	 * Base references the smallest index larger than the supplied DBT's
+	 * data item, potentially both 0 and NUM_ENT.
+	 */
+	*indxp = base;
+	return (0);
+
+check_delete:
+	/*
+	 * The duplicate may have already been deleted, if it's a btree page,
+	 * in which case we wander around, hoping to find an entry that hasn't
+	 * been deleted.  First, wander in a forwardly direction.
+	 */
+	save_pgno = (*pp)->pgno;
+	save_indx = *indxp;
+	for (++*indxp;;) {
+		for (; *indxp < NUM_ENT(h); ++*indxp) {
+			if ((*cmpp = __bam_cmp(dbp,
+			    dbt, h, *indxp, dbp->dup_compare)) != 0)
+				goto check_delete_rev;
+
+			if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+		}
+		if ((pgno = h->next_pgno) == PGNO_INVALID)
+			break;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+
+		*indxp = 0;
+	}
+
+check_delete_rev:
+	/* Go back to where we started, and wander in a backwardly direction. */
+	if (h->pgno != save_pgno) {
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+		if ((ret = memp_fget(dbp->mpf, &save_pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+	}
+
+	for (;;) {
+		while (*indxp > 0) {
+			--*indxp;
+			if ((*cmpp = __bam_cmp(dbp,
+			    dbt, h, *indxp, dbp->dup_compare)) != 0)
+				goto check_delete_fail;
+
+			if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+		}
+		if ((pgno = h->prev_pgno) == PGNO_INVALID)
+			break;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+
+		*indxp = NUM_ENT(h);
+	}
+
+check_delete_fail:
+	*cmpp = 1;			/* We didn't succeed... */
+	return (0);
+
+pg_err:	__db_pgerr(dbp, pgno);
+	return (ret);
+}
diff --git a/db2/db/db_iface.c b/db2/db/db_iface.c
new file mode 100644
index 0000000000..4ebf3ba019
--- /dev/null
+++ b/db2/db/db_iface.c
@@ -0,0 +1,488 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_iface.c	10.40 (Sleepycat) 12/19/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_auto.h"
+#include "db_ext.h"
+#include "common_ext.h"
+
+static int __db_keyempty __P((const DB_ENV *));
+static int __db_rdonly __P((const DB_ENV *, const char *));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+
+/*
+ * __db_cdelchk --
+ *	Common cursor delete argument checking routine.
+ *
+ * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int));
+ */
+int
+__db_cdelchk(dbp, flags, isrdonly, isvalid)
+	const DB *dbp;
+	u_int32_t flags;
+	int isrdonly, isvalid;
+{
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "c_del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DBcursor->c_del", 0));
+	}
+
+	/*
+	 * The cursor must be initialized; return EINVAL for an invalid
+	 * cursor, otherwise 0.
+	 */
+	return (isvalid ? 0 : EINVAL);
+}
+
+/*
+ * __db_cgetchk --
+ *	Common cursor get argument checking routine.
+ *
+ * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
+ */
+int
+__db_cgetchk(dbp, key, data, flags, isvalid)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+	int isvalid;
+{
+	int key_einval, key_flags, ret;
+
+	key_einval = key_flags = 0;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_RMW);
+	switch (flags) {
+	case DB_NEXT_DUP:
+		if (dbp->type == DB_RECNO)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_CURRENT:
+	case DB_FIRST:
+	case DB_LAST:
+	case DB_NEXT:
+	case DB_PREV:
+		key_flags = 1;
+		break;
+	case DB_GET_BOTH:
+	case DB_SET_RANGE:
+		key_einval = key_flags = 1;
+		break;
+	case DB_SET:
+		key_einval = 1;
+		break;
+	case DB_GET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		key_einval = key_flags = 1;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
+	/*
+	 * The cursor must be initialized for DB_CURRENT; return EINVAL for
+	 * an invalid cursor, otherwise 0.
+	 */
+	return (isvalid || flags != DB_CURRENT ? 0 : EINVAL);
+}
+
+/*
+ * __db_cputchk --
+ *	Common cursor put argument checking routine.
+ *
+ * PUBLIC: int __db_cputchk __P((const DB *,
+ * PUBLIC:    const DBT *, DBT *, u_int32_t, int, int));
+ */
+int
+__db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int32_t flags;
+	int isrdonly, isvalid;
+{
+	int key_einval, key_flags, ret;
+
+	key_einval = key_flags = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "c_put"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_AFTER:
+	case DB_BEFORE:
+		if (dbp->dup_compare != NULL)
+			goto err;
+		if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER))
+			goto err;
+		if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP))
+			goto err;
+		break;
+	case DB_CURRENT:
+		/*
+		 * If there is a comparison function, doing a DB_CURRENT
+		 * must not change the part of the data item that is used
+		 * for the comparison.
+		 */
+		break;
+	case DB_KEYFIRST:
+	case DB_KEYLAST:
+		if (dbp->type == DB_RECNO)
+			goto err;
+		key_einval = key_flags = 1;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DBcursor->c_put", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
+	/*
+	 * The cursor must be initialized for anything other than DB_KEYFIRST
+	 * and DB_KEYLAST; return EINVAL for an invalid cursor, otherwise 0.
+	 */
+	return (isvalid ||
+	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? 0 : EINVAL);
+}
+
+/*
+ * __db_closechk --
+ *	DB->close flag check.
+ *
+ * PUBLIC: int __db_closechk __P((const DB *, u_int32_t));
+ */
+int
+__db_closechk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	if (flags != 0 && flags != DB_NOSYNC)
+		return (__db_ferr(dbp->dbenv, "DB->close", 0));
+
+	return (0);
+}
+
+/*
+ * __db_delchk --
+ *	Common delete argument checking routine.
+ *
+ * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
+ */
+int
+__db_delchk(dbp, key, flags, isrdonly)
+	const DB *dbp;
+	DBT *key;
+	u_int32_t flags;
+	int isrdonly;
+{
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "delete"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DB->del", 0));
+	}
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	return (0);
+}
+
+/*
+ * __db_getchk --
+ *	Common get argument checking routine.
+ *
+ * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
+ */
+int
+__db_getchk(dbp, key, data, flags)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	int ret;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_RMW);
+	switch (flags) {
+	case 0:
+	case DB_GET_BOTH:
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, flags == DB_SET_RECNO)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	return (0);
+}
+
+/*
+ * __db_joinchk --
+ *	Common join argument checking routine.
+ *
+ * PUBLIC: int __db_joinchk __P((const DB *, u_int32_t));
+ */
+int
+__db_joinchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	if (flags != 0)
+		return (__db_ferr(dbp->dbenv, "DB->join", 0));
+
+	return (0);
+}
+
+/*
+ * __db_putchk --
+ *	Common put argument checking routine.
+ *
+ * PUBLIC: int __db_putchk
+ * PUBLIC:    __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
+ */
+int
+__db_putchk(dbp, key, data, flags, isrdonly, isdup)
+	const DB *dbp;
+	DBT *key;
+	const DBT *data;
+	u_int32_t flags;
+	int isrdonly, isdup;
+{
+	int ret;
+
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "put"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+	case DB_NOOVERWRITE:
+		break;
+	case DB_APPEND:
+		if (dbp->type != DB_RECNO)
+			goto err;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->put", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	/* Check for partial puts in the presence of duplicates. */
+	if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) {
+		__db_err(dbp->dbenv,
+"a partial put in the presence of duplicates requires a cursor operation");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_statchk --
+ *	Common stat argument checking routine.
+ *
+ * PUBLIC: int __db_statchk __P((const DB *, u_int32_t));
+ */
+int
+__db_statchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	case DB_RECORDCOUNT:
+		if (dbp->type == DB_RECNO)
+			break;
+		if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_BT_RECNUM))
+			break;
+		goto err;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->stat", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_syncchk --
+ *	Common sync argument checking routine.
+ *
+ * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t));
+ */
+int
+__db_syncchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DB->sync", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __dbt_ferr --
+ *	Check a DBT for flag errors.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+	const DB *dbp;
+	const char *name;
+	const DBT *dbt;
+	int check_thread;
+{
+	int ret;
+
+	/*
+	 * Check for invalid DBT flags.  We allow any of the flags to be
+	 * specified to any DB or DBcursor call so that applications can
+	 * set DB_DBT_MALLOC when retrieving a data item from a secondary
+	 * database and then specify that same DBT as a key to a primary
+	 * database, without having to clear flags.
+	 */
+	if ((ret = __db_fchk(dbp->dbenv, name, dbt->flags,
+	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0)
+		return (ret);
+	if ((ret = __db_fcchk(dbp->dbenv, name,
+	    dbt->flags, DB_DBT_MALLOC, DB_DBT_USERMEM)) != 0)
+		return (ret);
+
+	if (check_thread && F_ISSET(dbp, DB_AM_THREAD) &&
+	    !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_USERMEM)) {
+		__db_err(dbp->dbenv,
+		    "missing flag thread flag for %s DBT", name);
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_eopnotsup --
+ *	Common operation not supported message.
+ *
+ * PUBLIC: int __db_eopnotsup __P((const DB_ENV *));
+ */
+int
+__db_eopnotsup(dbenv)
+	const DB_ENV *dbenv;
+{
+	__db_err(dbenv, "operation not supported");
+#ifdef EOPNOTSUPP
+	return (EOPNOTSUPP);
+#else
+	return (EINVAL);
+#endif
+}
+
+/*
+ * __db_keyempty --
+ *	Common missing or empty key value message.
+ */
+static int
+__db_keyempty(dbenv)
+	const DB_ENV *dbenv;
+{
+	__db_err(dbenv, "missing or empty key value specified");
+	return (EINVAL);
+}
+
+/*
+ * __db_rdonly --
+ *	Common readonly message.
+ */
+static int
+__db_rdonly(dbenv, name)
+	const DB_ENV *dbenv;
+	const char *name;
+{
+	__db_err(dbenv, "%s: attempt to modify a read-only tree", name);
+	return (EACCES);
+}
diff --git a/db2/db/db_join.c b/db2/db/db_join.c
new file mode 100644
index 0000000000..a4051c20b0
--- /dev/null
+++ b/db2/db/db_join.c
@@ -0,0 +1,271 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_join.c	10.10 (Sleepycat) 10/9/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_join.h"
+#include "db_am.h"
+#include "common_ext.h"
+
+static int __db_join_close __P((DBC *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * This is the duplicate-assisted join functionality.  Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * XXX
+ * Right now we do not maintain the number of duplicates so we do
+ * not optimize the join.  If the caller does, then best performance
+ * will be achieved by putting the cursor with the smallest cardinality
+ * first.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ *	This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join.  While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ * The curslist is a NULL-terminated array of existing, initialized
+ * cursors and primary is the DB of the primary file.  The data item that
+ * joins all the cursors in the curslist is used as the key into the
+ * primary and that key and data are returned.  When no more items are
+ * left in the join set, the c_next operation off the join cursor will
+ * return DB_NOTFOUND.
+ *
+ * On success 0 is returned and *dbcp holds the new join cursor, which
+ * keeps its own copy of the cursor array (the caller's array is not
+ * retained).  On failure a non-zero error is returned, nothing is left
+ * allocated and *dbcp is not written.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, u_int32_t, DBC **));
+ */
+int
+__db_join(primary, curslist, flags, dbcp)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	JOIN_CURSOR *jc;
+	int i, ncurs, ret;
+
+	DB_PANIC_CHECK(primary);
+
+	if ((ret = __db_joinchk(primary, flags)) != 0)
+		return (ret);
+
+	/* A join needs at least one cursor. */
+	if (curslist == NULL || curslist[0] == NULL)
+		return (EINVAL);
+
+	dbc = NULL;
+	jc = NULL;
+
+	if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0)
+		goto err;
+
+	if ((ret = __os_calloc(1, sizeof(JOIN_CURSOR), &jc)) != 0)
+		goto err;
+
+	/* Scratch key buffer; the get method grows it on ENOMEM. */
+	if ((ret = __os_malloc(256, NULL, &jc->j_key.data)) != 0)
+		goto err;
+	jc->j_key.ulen = 256;
+	F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+	/*
+	 * Count the caller's cursors with a local.  jc->j_curslist must
+	 * never point into the caller's array: if the allocation below
+	 * fails, the error path would otherwise free memory we don't own.
+	 */
+	for (ncurs = 0; curslist[ncurs] != NULL; ncurs++)
+		;
+	if ((ret = __os_calloc((size_t)ncurs + 1,
+	    sizeof(DBC *), &jc->j_curslist)) != 0)
+		goto err;
+
+	/*
+	 * Copy the cursors; the zero-filled allocation supplies the NULL
+	 * terminator.  Every cursor but the first is flagged DBC_KEYSET.
+	 */
+	for (i = 0; i < ncurs; i++) {
+		if (i != 0)
+			F_SET(curslist[i], DBC_KEYSET);
+		jc->j_curslist[i] = curslist[i];
+	}
+
+	dbc->c_close = __db_join_close;
+	dbc->c_del = __db_join_del;
+	dbc->c_get = __db_join_get;
+	dbc->c_put = __db_join_put;
+	dbc->internal = jc;
+	dbc->dbp = primary;
+	jc->j_init = 1;
+	jc->j_primary = primary;
+
+	*dbcp = dbc;
+
+	return (0);
+
+	/*
+	 * The cursor array is the last allocation that can fail, so it is
+	 * never held when we arrive here; only the key buffer and the two
+	 * structures may need to be released.
+	 */
+err:	if (jc != NULL) {
+		if (jc->j_key.data != NULL)
+			__os_free(jc->j_key.data, jc->j_key.ulen);
+		__os_free(jc, sizeof(JOIN_CURSOR));
+	}
+	if (dbc != NULL)
+		__os_free(dbc, sizeof(DBC));
+	return (ret);
+}
+
+/*
+ * __db_join_put --
+ *	c_put method installed on join cursors.  Storing through a join
+ *	cursor is not supported: after the panic check the arguments are
+ *	ignored and EINVAL is returned.
+ */
+static int
+__db_join_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DB_PANIC_CHECK(dbc->dbp);
+
+	/* Presumably quiets unused-parameter warnings -- macro not shown. */
+	COMPQUIET(key, NULL);
+	COMPQUIET(data, NULL);
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+}
+
+/*
+ * __db_join_del --
+ *	c_del method installed on join cursors.  Deleting through a join
+ *	cursor is not supported: after the panic check the flags are
+ *	ignored and EINVAL is returned.
+ */
+static int
+__db_join_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB_PANIC_CHECK(dbc->dbp);
+
+	/* Presumably quiets unused-parameter warnings -- macro not shown. */
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+}
+
+/*
+ * __db_join_get --
+ *	c_get method installed on join cursors.
+ *
+ *	Steps the first cursor through its duplicates and, for each one,
+ *	probes every remaining cursor with DB_GET_BOTH.  The first duplicate
+ *	found by all of them is left in key.  With DB_JOIN_ITEM that is all
+ *	that is returned; otherwise key is looked up in the primary and the
+ *	result is returned in data.
+ *
+ *	The only operation accepted is DB_JOIN_ITEM (or none), optionally
+ *	combined with DB_RMW.  Returns 0 on a match, the first cursor's
+ *	error (e.g. DB_NOTFOUND) when its duplicates are exhausted, or the
+ *	error from any underlying cursor, allocation or primary get.
+ */
+static int
+__db_join_get(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC **cpp;
+	JOIN_CURSOR *jc;
+	int ret;
+	u_int32_t operation;
+
+	dbp = dbc->dbp;
+
+	DB_PANIC_CHECK(dbp);
+
+	operation = LF_ISSET(DB_OPFLAGS_MASK);
+	if (operation != 0 && operation != DB_JOIN_ITEM)
+		return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
+
+	LF_CLR(DB_OPFLAGS_MASK);
+	if ((ret =
+	    __db_fchk(dbp->dbenv, "DBcursor->c_get", flags, DB_RMW)) != 0)
+		return (ret);
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	/*
+	 * Fetch from the first cursor: its current item on the first call,
+	 * its next duplicate afterwards.  The secondary's key lands in our
+	 * private j_key buffer and its data item in the caller's key DBT.
+	 * ENOMEM is treated as "j_key too small": double it and retry.
+	 */
+retry:
+	ret = jc->j_curslist[0]->c_get(jc->j_curslist[0],
+	    &jc->j_key, key, jc->j_init ? DB_CURRENT : DB_NEXT_DUP);
+
+	if (ret == ENOMEM) {
+		jc->j_key.ulen <<= 1;
+		if ((ret = __os_realloc(&jc->j_key.data, jc->j_key.ulen)) != 0)
+			return (ret);
+		goto retry;
+	}
+	if (ret != 0)
+		return (ret);
+
+	jc->j_init = 0;
+	do {
+		/*
+		 * We have the first element; now look for it in the
+		 * other cursors.
+		 */
+		for (cpp = jc->j_curslist + 1; *cpp != NULL; cpp++) {
+retry2:			if ((ret = ((*cpp)->c_get)(*cpp,
+			    &jc->j_key, key, DB_GET_BOTH)) == DB_NOTFOUND)
+				break;
+			if (ret == ENOMEM) {
+				jc->j_key.ulen <<= 1;
+				if ((ret = __os_realloc(&jc->j_key.data,
+				    jc->j_key.ulen)) != 0)
+					return (ret);
+				goto retry2;
+			}
+
+			/*
+			 * Anything else that isn't success is a real
+			 * failure.  Return it now: if we kept going, the
+			 * next cursor's result would overwrite ret and the
+			 * error could be reported as a match.
+			 */
+			if (ret != 0)
+				return (ret);
+
+			if (F_ISSET(*cpp, DBC_KEYSET)) {
+				F_CLR(*cpp, DBC_KEYSET);
+				F_SET(*cpp, DBC_CONTINUE);
+			}
+		}
+
+		/*
+		 * If we got out of here with ret != 0, then we failed to
+		 * find the duplicate in one of the files, so we go on to
+		 * the next item in the outermost relation. If ret was
+		 * equal to 0, then we've got something to return.
+		 */
+		if (ret == 0)
+			break;
+	} while ((ret = jc->j_curslist[0]->c_get(jc->j_curslist[0],
+	    &jc->j_key, key,  DB_NEXT_DUP)) == 0);
+
+	/*
+	 * If ret != 0 here, we've exhausted the first file.  Otherwise,
+	 * key and data are set and we need to do the lookup on the
+	 * primary.
+	 */
+	if (ret != 0)
+		return (ret);
+
+	if (operation == DB_JOIN_ITEM)
+		return (0);
+	else
+		return ((jc->j_primary->get)(jc->j_primary,
+		    jc->j_curslist[0]->txn, key, data, 0));
+}
+
+/*
+ * __db_join_close --
+ *	c_close method installed on join cursors.  Strips the join
+ *	bookkeeping flags from the joined cursors, then releases the join
+ *	cursor's private cursor array, key buffer and structures.  The
+ *	joined cursors themselves are not closed.  Returns 0.
+ */
+static int
+__db_join_close(dbc)
+	DBC *dbc;
+{
+	DBC **cpp;
+	JOIN_CURSOR *jc;
+
+	DB_PANIC_CHECK(dbc->dbp);
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	/* Walk the NULL-terminated array, undoing the flags we set. */
+	for (cpp = jc->j_curslist; *cpp != NULL; cpp++)
+		F_CLR(*cpp, DBC_CONTINUE | DBC_KEYSET);
+
+	__os_free(jc->j_curslist, 0);
+	__os_free(jc->j_key.data, jc->j_key.ulen);
+	__os_free(jc, sizeof(JOIN_CURSOR));
+	__os_free(dbc, sizeof(DBC));
+
+	return (0);
+}
diff --git a/db2/db/db_overflow.c b/db2/db/db_overflow.c
index d28740dcbe..0efcc9de7f 100644
--- a/db2/db/db_overflow.c
+++ b/db2/db/db_overflow.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_overflow.c	10.11 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db_overflow.c	10.21 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -60,6 +60,7 @@ static const char sccsid[] = "@(#)db_overflow.c	10.11 (Sleepycat) 5/7/98";
 #include "db_int.h"
 #include "db_page.h"
 #include "db_am.h"
+#include "common_ext.h"
 
 /*
  * Big key/data code.
@@ -106,29 +107,20 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
 		needed = tlen;
 	}
 
-	/*
-	 * Allocate any necessary memory.
-	 *
-	 * XXX: Never allocate 0 bytes;
-	 */
+	/* Allocate any necessary memory. */
 	if (F_ISSET(dbt, DB_DBT_USERMEM)) {
 		if (needed > dbt->ulen) {
 			dbt->size = needed;
 			return (ENOMEM);
 		}
 	} else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
-		dbt->data = dbp->db_malloc == NULL ?
-		    (void *)__db_malloc(needed + 1) :
-		    (void *)dbp->db_malloc(needed + 1);
-		if (dbt->data == NULL)
-			return (ENOMEM);
+		if ((ret =
+		    __os_malloc(needed, dbp->db_malloc, &dbt->data)) != 0)
+			return (ret);
 	} else if (*bpsz == 0 || *bpsz < needed) {
-		*bpp = (*bpp == NULL ?
-		    (void *)__db_malloc(needed + 1) :
-		    (void *)__db_realloc(*bpp, needed + 1));
-		if (*bpp == NULL)
-			return (ENOMEM);
-		*bpsz = needed + 1;
+		if ((ret = __os_realloc(bpp, needed)) != 0)
+			return (ret);
+		*bpsz = needed;
 		dbt->data = *bpp;
 	} else
 		dbt->data = *bpp;
@@ -168,16 +160,17 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
  * __db_poff --
  *	Put an offpage item.
  *
- * PUBLIC: int __db_poff __P((DB *, const DBT *, db_pgno_t *,
- * PUBLIC:     int (*)(DB *, u_int32_t, PAGE **)));
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *,
+ * PUBLIC:     int (*)(DBC *, u_int32_t, PAGE **)));
  */
 int
-__db_poff(dbp, dbt, pgnop, newfunc)
-	DB *dbp;
+__db_poff(dbc, dbt, pgnop, newfunc)
+	DBC *dbc;
 	const DBT *dbt;
 	db_pgno_t *pgnop;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
+	DB *dbp;
 	PAGE *pagep, *lastp;
 	DB_LSN new_lsn, null_lsn;
 	DBT tmp_dbt;
@@ -191,6 +184,7 @@ __db_poff(dbp, dbt, pgnop, newfunc)
 	 * number of bytes we get for pages we fill completely with a single
 	 * item.
 	 */
+	dbp = dbc->dbp;
 	pagespace = P_MAXSPACE(dbp->pgsize);
 
 	lastp = NULL;
@@ -208,13 +202,13 @@ __db_poff(dbp, dbt, pgnop, newfunc)
 		 * the item onto the page.  If sz is less than pagespace, we
 		 * have a partial record.
 		 */
-		if ((ret = newfunc(dbp, P_OVERFLOW, &pagep)) != 0)
+		if ((ret = newfunc(dbc, P_OVERFLOW, &pagep)) != 0)
 			return (ret);
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = p;
 			tmp_dbt.size = pagespace;
 			ZERO_LSN(null_lsn);
-			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn,
+			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn,
 			    &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid,
 			    PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID,
 			    PGNO_INVALID, &tmp_dbt, &LSN(pagep),
@@ -256,24 +250,26 @@ __db_poff(dbp, dbt, pgnop, newfunc)
  * __db_ovref --
  *	Increment/decrement the reference count on an overflow page.
  *
- * PUBLIC: int __db_ovref __P((DB *, db_pgno_t, int32_t));
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t, int32_t));
  */
 int
-__db_ovref(dbp, pgno, adjust)
-	DB *dbp;
+__db_ovref(dbc, pgno, adjust)
+	DBC *dbc;
 	db_pgno_t pgno;
 	int32_t adjust;
 {
+	DB *dbp;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
 		(void)__db_pgerr(dbp, pgno);
 		return (ret);
 	}
 
-	if (DB_LOGGING(dbp))
-		if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbp->txn,
+	if (DB_LOGGING(dbc))
+		if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(h), 0, dbp->log_fileid, h->pgno, adjust,
 		    &LSN(h))) != 0)
 			return (ret);
@@ -287,19 +283,21 @@ __db_ovref(dbp, pgno, adjust)
  * __db_doff --
  *	Delete an offpage chain of overflow pages.
  *
- * PUBLIC: int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_doff(dbp, pgno, freefunc)
-	DB *dbp;
+__db_doff(dbc, pgno, freefunc)
+	DBC *dbc;
 	db_pgno_t pgno;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
+	DB *dbp;
 	PAGE *pagep;
 	DB_LSN null_lsn;
 	DBT tmp_dbt;
 	int ret;
 
+	dbp = dbc->dbp;
 	do {
 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
@@ -312,21 +310,21 @@ __db_doff(dbp, pgno, freefunc)
 		 */
 		if (TYPE(pagep) == P_OVERFLOW && OV_REF(pagep) > 1) {
 			(void)memp_fput(dbp->mpf, pagep, 0);
-			return (__db_ovref(dbp, pgno, -1));
+			return (__db_ovref(dbc, pgno, -1));
 		}
 
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD;
 			tmp_dbt.size = OV_LEN(pagep);
 			ZERO_LSN(null_lsn);
-			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn,
+			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn,
 			    &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid,
 			    PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep),
 			    &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0)
 				return (ret);
 		}
 		pgno = pagep->next_pgno;
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} while (pgno != PGNO_INVALID);
 
@@ -339,44 +337,71 @@ __db_doff(dbp, pgno, freefunc)
  *
  * Given a starting page number and a key, return <0, 0, >0 to indicate if the
  * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing chunk at a time comparison unless the user has
+ * specified a comparison function.  In this case, we need to materialize
+ * the entire object and call their comparison routine.
  *
- * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t));
+ * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC:     int (*)(const DBT *, const DBT *), int *));
  */
 int
-__db_moff(dbp, dbt, pgno)
+__db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp)
 	DB *dbp;
 	const DBT *dbt;
 	db_pgno_t pgno;
+	u_int32_t tlen;
+	int (*cmpfunc) __P((const DBT *, const DBT *)), *cmpp;
 {
 	PAGE *pagep;
-	u_int32_t cmp_bytes, key_left;
+	DBT local_dbt;
+	void *buf;
+	u_int32_t bufsize, cmp_bytes, key_left;
 	u_int8_t *p1, *p2;
 	int ret;
 
+	/*
+	 * If there is a user-specified comparison function, build a
+	 * contiguous copy of the key, and call it.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_dbt, 0, sizeof(local_dbt));
+		buf = NULL;
+		bufsize = 0;
+
+		if ((ret = __db_goff(dbp,
+		    &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+			return (ret);
+		*cmpp = cmpfunc(&local_dbt, dbt);
+		__os_free(buf, bufsize);
+		return (0);
+	}
+
 	/* While there are both keys to compare. */
-	for (ret = 0, p1 = dbt->data,
+	for (*cmpp = 0, p1 = dbt->data,
 	    key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
-		if (memp_fget(dbp->mpf, &pgno, 0, &pagep) != 0) {
-			(void)__db_pgerr(dbp, pgno);
-			return (0);	/* No system error return. */
-		}
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0)
+			return (ret);
 
 		cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
 		key_left -= cmp_bytes;
 		for (p2 =
 		    (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2)
 			if (*p1 != *p2) {
-				ret = (long)*p1 - (long)*p2;
+				*cmpp = (long)*p1 - (long)*p2;
 				break;
 			}
 		pgno = NEXT_PGNO(pagep);
-		(void)memp_fput(dbp->mpf, pagep, 0);
-		if (ret != 0)
+		if ((ret = memp_fput(dbp->mpf, pagep, 0)) != 0)
 			return (ret);
+		if (*cmpp != 0)
+			return (0);
 	}
 	if (key_left > 0)		/* DBT is longer than page key. */
-		return (-1);
-	if (pgno != PGNO_INVALID)	/* DBT is shorter than page key. */
-		return (1);
+		*cmpp = -1;
+	else if (pgno != PGNO_INVALID)	/* DBT is shorter than page key. */
+		*cmpp = 1;
+	else
+		*cmpp = 0;
+
 	return (0);
 }
diff --git a/db2/db/db_pr.c b/db2/db/db_pr.c
index a294cdd135..7f4364c6e1 100644
--- a/db2/db/db_pr.c
+++ b/db2/db/db_pr.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_pr.c	10.29 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)db_pr.c	10.40 (Sleepycat) 11/22/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -126,11 +126,10 @@ __db_prdb(dbp)
 		{ DB_AM_MLOCAL,		"local mpool" },
 		{ DB_AM_PGDEF,		"default page size" },
 		{ DB_AM_RDONLY,		"read-only" },
-		{ DB_AM_RECOVER,	"recover" },
 		{ DB_AM_SWAP,		"needswap" },
 		{ DB_AM_THREAD,		"thread" },
-		{ DB_BT_RECNUM,		"btree:records" },
-		{ DB_HS_DIRTYMETA,	"hash:dirty-meta" },
+		{ DB_BT_RECNUM,		"btree:recnum" },
+		{ DB_DBM_ERROR,		"dbm/ndbm error" },
 		{ DB_RE_DELIMITER,	"recno:delimiter" },
 		{ DB_RE_FIXEDLEN,	"recno:fixed-length" },
 		{ DB_RE_PAD,		"recno:pad" },
@@ -178,42 +177,55 @@ __db_prbtree(dbp)
 	static const FN mfn[] = {
 		{ BTM_DUP,	"duplicates" },
 		{ BTM_RECNO,	"recno" },
-		{ BTM_RECNUM,	"btree:records" },
+		{ BTM_RECNUM,	"btree:recnum" },
 		{ BTM_FIXEDLEN,	"recno:fixed-length" },
 		{ BTM_RENUMBER,	"recno:renumber" },
 		{ 0 },
 	};
+	DBC *dbc;
 	BTMETA *mp;
 	BTREE *t;
-	EPG *epg;
 	FILE *fp;
 	PAGE *h;
 	RECNO *rp;
 	db_pgno_t i;
-	int ret;
+	int cnt, ret;
+	const char *sep;
 
 	t = dbp->internal;
 	fp = __db_prinit(NULL);
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
 
 	(void)fprintf(fp, "%s\nOn-page metadata:\n", DB_LINE);
 
 	i = PGNO_METADATA;
-	if ((ret = __bam_pget(dbp, (PAGE **)&mp, &i, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &i, 0, (PAGE **)&mp)) != 0) {
+		(void)dbc->c_close(dbc);
 		return (ret);
+	}
 
+	fprintf(fp, "lsn.file: %lu lsn.offset: %lu\n",
+	    (u_long)LSN(mp).file, (u_long)LSN(mp).offset);
 	(void)fprintf(fp, "magic %#lx\n", (u_long)mp->magic);
 	(void)fprintf(fp, "version %#lx\n", (u_long)mp->version);
 	(void)fprintf(fp, "pagesize %lu\n", (u_long)mp->pagesize);
 	(void)fprintf(fp, "maxkey: %lu minkey: %lu\n",
 	    (u_long)mp->maxkey, (u_long)mp->minkey);
 
-	(void)fprintf(fp, "free %lu", (u_long)mp->free);
-	for (i = mp->free; i != PGNO_INVALID;) {
-		if ((ret = __bam_pget(dbp, &h, &i, 0)) != 0)
+	(void)fprintf(fp, "free list: %lu", (u_long)mp->free);
+	for (i = mp->free, cnt = 0, sep = ", "; i != PGNO_INVALID;) {
+		if ((ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0)
 			return (ret);
 		i = h->next_pgno;
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)fprintf(fp, ", %lu", (u_long)i);
+		(void)fprintf(fp, "%s%lu", sep, (u_long)i);
+		if (++cnt % 10 == 0) {
+			(void)fprintf(fp, "\n");
+			cnt = 0;
+			sep = "";
+		} else
+			sep = ", ";
 	}
 	(void)fprintf(fp, "\n");
 
@@ -227,7 +239,7 @@ __db_prbtree(dbp)
 	    (u_long)t->bt_maxkey, (u_long)t->bt_minkey);
 	(void)fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n",
 	    (u_long)t->bt_compare, (u_long)t->bt_prefix);
-	if ((rp = t->bt_recno) != NULL) {
+	if ((rp = t->recno) != NULL) {
 		(void)fprintf(fp,
 		    "re_delim: %#lx re_pad: %#lx re_len: %lu re_source: %s\n",
 		    (u_long)rp->re_delim, (u_long)rp->re_pad,
@@ -238,13 +250,9 @@ __db_prbtree(dbp)
 		    (u_long)rp->re_cmap, (u_long)rp->re_smap,
 		    (u_long)rp->re_emap, (u_long)rp->re_msize);
 	}
-	(void)fprintf(fp, "stack:");
-	for (epg = t->bt_stack; epg < t->bt_sp; ++epg)
-		(void)fprintf(fp, " %lu", (u_long)epg->page->pgno);
-	(void)fprintf(fp, "\n");
 	(void)fprintf(fp, "ovflsize: %lu\n", (u_long)t->bt_ovflsize);
 	(void)fflush(fp);
-	return (0);
+	return (dbc->c_close(dbc));
 }
 
 /*
@@ -258,51 +266,50 @@ __db_prhash(dbp)
 	DB *dbp;
 {
 	FILE *fp;
-	HTAB *t;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	int i, put_page, ret;
 	db_pgno_t pgno;
 
-	t = dbp->internal;
-
 	fp = __db_prinit(NULL);
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	fprintf(fp, "\thash_accesses    %lu\n", (u_long)t->hash_accesses);
-	fprintf(fp, "\thash_collisions  %lu\n", (u_long)t->hash_collisions);
-	fprintf(fp, "\thash_expansions  %lu\n", (u_long)t->hash_expansions);
-	fprintf(fp, "\thash_overflows 	%lu\n", (u_long)t->hash_overflows);
-	fprintf(fp, "\thash_bigpages    %lu\n", (u_long)t->hash_bigpages);
-	fprintf(fp, "\n");
-
-	if (t->hdr == NULL) {
+	/*
+	 * In this case,  hcp->hdr will never be null, if we decide
+	 * to pass dbc's to this routine instead, then it could be.
+	 */
+	if (hcp->hdr == NULL) {
 		pgno = PGNO_METADATA;
-		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &t->hdr)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &hcp->hdr)) != 0)
 			return (ret);
 		put_page = 1;
 	} else
 		put_page = 0;
 
-	fprintf(fp, "\tmagic      %#lx\n", (u_long)t->hdr->magic);
-	fprintf(fp, "\tversion    %lu\n", (u_long)t->hdr->version);
-	fprintf(fp, "\tpagesize   %lu\n", (u_long)t->hdr->pagesize);
-	fprintf(fp, "\tovfl_point %lu\n", (u_long)t->hdr->ovfl_point);
-	fprintf(fp, "\tlast_freed %lu\n", (u_long)t->hdr->last_freed);
-	fprintf(fp, "\tmax_bucket %lu\n", (u_long)t->hdr->max_bucket);
-	fprintf(fp, "\thigh_mask  %#lx\n", (u_long)t->hdr->high_mask);
-	fprintf(fp, "\tlow_mask   %#lx\n", (u_long)t->hdr->low_mask);
-	fprintf(fp, "\tffactor    %lu\n", (u_long)t->hdr->ffactor);
-	fprintf(fp, "\tnelem      %lu\n", (u_long)t->hdr->nelem);
-	fprintf(fp, "\th_charkey  %#lx\n", (u_long)t->hdr->h_charkey);
+	fprintf(fp, "\tmagic      %#lx\n", (u_long)hcp->hdr->magic);
+	fprintf(fp, "\tversion    %lu\n", (u_long)hcp->hdr->version);
+	fprintf(fp, "\tpagesize   %lu\n", (u_long)hcp->hdr->pagesize);
+	fprintf(fp, "\tovfl_point %lu\n", (u_long)hcp->hdr->ovfl_point);
+	fprintf(fp, "\tlast_freed %lu\n", (u_long)hcp->hdr->last_freed);
+	fprintf(fp, "\tmax_bucket %lu\n", (u_long)hcp->hdr->max_bucket);
+	fprintf(fp, "\thigh_mask  %#lx\n", (u_long)hcp->hdr->high_mask);
+	fprintf(fp, "\tlow_mask   %#lx\n", (u_long)hcp->hdr->low_mask);
+	fprintf(fp, "\tffactor    %lu\n", (u_long)hcp->hdr->ffactor);
+	fprintf(fp, "\tnelem      %lu\n", (u_long)hcp->hdr->nelem);
+	fprintf(fp, "\th_charkey  %#lx\n", (u_long)hcp->hdr->h_charkey);
 
 	for (i = 0; i < NCACHED; i++)
-		fprintf(fp, "%lu ", (u_long)t->hdr->spares[i]);
+		fprintf(fp, "%lu ", (u_long)hcp->hdr->spares[i]);
 	fprintf(fp, "\n");
 
 	(void)fflush(fp);
 	if (put_page) {
-		(void)memp_fput(dbp->mpf, (PAGE *)t->hdr, 0);
-		t->hdr = NULL;
+		(void)memp_fput(dbp->mpf, (PAGE *)hcp->hdr, 0);
+		hcp->hdr = NULL;
 	}
-	return (0);
+	return (dbc->c_close(dbc));
 }
 
 /*
@@ -318,22 +325,18 @@ __db_prtree(mpf, all)
 {
 	PAGE *h;
 	db_pgno_t i;
-	int ret, t_ret;
 
 	if (set_psize == PSIZE_BOUNDARY)
 		__db_psize(mpf);
 
-	ret = 0;
 	for (i = PGNO_ROOT;; ++i) {
-		if ((ret = memp_fget(mpf, &i, 0, &h)) != 0)
+		if (memp_fget(mpf, &i, 0, &h) != 0)
 			break;
-		if (TYPE(h) != P_INVALID)
-			if ((t_ret = __db_prpage(h, all)) != 0 && ret == 0)
-				ret = t_ret;
+		(void)__db_prpage(h, all);
 		(void)memp_fput(mpf, h, 0);
 	}
 	(void)fflush(__db_prinit(NULL));
-	return (ret);
+	return (0);
 }
 
 /*
@@ -425,8 +428,7 @@ __db_prpage(h, all)
 	    (TYPE(h) == P_LRECNO && h->pgno == PGNO_ROOT))
 		fprintf(fp, " total records: %4lu", (u_long)RE_NREC(h));
 	fprintf(fp, "\n");
-	if (TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO ||
-	    TYPE(h) == P_DUPLICATE || TYPE(h) == P_OVERFLOW)
+	if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO)
 		fprintf(fp, "    prev: %4lu next: %4lu",
 		    (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
 	if (TYPE(h) == P_IBTREE || TYPE(h) == P_LBTREE)
diff --git a/db2/db/db_rec.c b/db2/db/db_rec.c
index 1ef6f18e61..7f577b5855 100644
--- a/db2/db/db_rec.c
+++ b/db2/db/db_rec.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_rec.c	10.16 (Sleepycat) 4/28/98";
+static const char sccsid[] = "@(#)db_rec.c	10.19 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -40,7 +40,8 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_addrem_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -57,9 +58,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -73,7 +72,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	    (cmp_n == 0 && !redo && argp->opcode == DB_REM_DUP)) {
 
 		/* Need to redo an add, or undo a delete. */
-		if ((ret = __db_pitem(file_dbp, pagep, argp->indx, argp->nbytes,
+		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
 		    argp->hdr.size == 0 ? NULL : &argp->hdr,
 		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
 			goto out;
@@ -83,7 +82,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	} else if ((cmp_n == 0 && !redo && argp->opcode == DB_ADD_DUP) ||
 	    (cmp_p == 0 && redo && argp->opcode == DB_REM_DUP)) {
 		/* Need to undo an add, or redo a delete. */
-		if ((ret = __db_ditem(file_dbp,
+		if ((ret = __db_ditem(dbc,
 		    pagep, argp->indx, argp->nbytes)) != 0)
 			goto out;
 		change = DB_MPOOL_DIRTY;
@@ -96,8 +95,11 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 			LSN(pagep) = argp->pagelsn;
 	}
 
-	if ((ret = memp_fput(mpf, pagep, change)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -114,7 +116,8 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_split_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int change, cmp_n, cmp_p, ret;
@@ -130,9 +133,7 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -169,8 +170,11 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 		LSN(pagep) = argp->pagelsn;
 		change = DB_MPOOL_DIRTY;
 	}
-	if ((ret = memp_fput(mpf, pagep, change)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -187,7 +191,8 @@ __db_big_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_big_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -209,7 +214,7 @@ __db_big_recover(logp, dbtp, lsnp, redo, info)
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
-			goto out;
+				goto out;
 	}
 
 	/*
@@ -299,9 +304,7 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 				 * so we would not have to undo anything.  In
 				 * this case, don't bother creating a page.
 				 */
-				*lsnp = argp->prev_lsn;
-				ret = 0;
-				goto out;
+				goto done;
 			} else
 				if ((ret = memp_fget(mpf, &argp->next_pgno,
 				    DB_MPOOL_CREATE, &pagep)) != 0)
@@ -323,7 +326,8 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 			goto out;
 	}
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -343,7 +347,8 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_ovref_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int modified, ret;
@@ -370,8 +375,11 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info)
 		pagep->lsn = argp->lsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -392,17 +400,20 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_relink_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
-	int modified, ret;
+	int cmp_n, cmp_p, modified, ret;
 
 	REC_PRINT(__db_relink_print);
 	REC_INTRO(__db_relink_read);
 
 	/*
-	 * There are three pages we need to check -- the page, and the
-	 * previous and next pages, if they existed.
+	 * There are up to three pages we need to check -- the page, and the
+	 * previous and next pages, if they existed.  For a page add operation,
+	 * the current page is the result of a split and is being recovered
+	 * elsewhere, so all we need do is recover the next page.
 	 */
 	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
 		if (redo) {
@@ -411,6 +422,9 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 		}
 		goto next;
 	}
+	if (argp->opcode == DB_ADD_PAGE)
+		goto next;
+
 	modified = 0;
 	if (log_compare(&LSN(pagep), &argp->lsn) == 0 && redo) {
 		/* Redo the relink. */
@@ -424,10 +438,8 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 		pagep->lsn = argp->lsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 next:	if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
 		if (redo) {
@@ -437,23 +449,27 @@ next:	if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
 		goto prev;
 	}
 	modified = 0;
-	if (log_compare(&LSN(pagep), &argp->lsn_next) == 0 && redo) {
-		/* Redo the relink. */
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn_next);
+	if ((argp->opcode == DB_REM_PAGE && cmp_p == 0 && redo) ||
+	    (argp->opcode == DB_ADD_PAGE && cmp_n == 0 && !redo)) {
+		/* Redo the remove or undo the add. */
 		pagep->prev_pgno = argp->prev;
 
 		pagep->lsn = *lsnp;
 		modified = 1;
-	} else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) {
-		/* Undo the relink. */
+	} else if ((argp->opcode == DB_REM_PAGE && cmp_n == 0 && !redo) ||
+	    (argp->opcode == DB_ADD_PAGE && cmp_p == 0 && redo)) {
+		/* Undo the remove or redo the add. */
 		pagep->prev_pgno = argp->pgno;
 
 		pagep->lsn = argp->lsn_next;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
+	if (argp->opcode == DB_ADD_PAGE)
+		goto done;
 
 prev:	if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
 		if (redo) {
@@ -476,10 +492,8 @@ prev:	if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
 		pagep->lsn = argp->lsn_prev;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void) __db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 done:	*lsnp = argp->prev_lsn;
 	ret = 0;
@@ -500,7 +514,8 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_addpage_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -541,8 +556,7 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->nextpgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -563,11 +577,13 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 		LSN(pagep) = argp->nextlsn;
 		change = DB_MPOOL_DIRTY;
 	}
-	ret = memp_fput(mpf, pagep, change);
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
-out:	if (ret == 0)
-		*lsnp = argp->prev_lsn;
-	REC_CLOSE;
+out:	REC_CLOSE;
 }
 
 /*
@@ -598,46 +614,3 @@ __db_debug_recover(logp, dbtp, lsnp, redo, info)
 
 	REC_NOOP_CLOSE;
 }
-
-/*
- * __db_noop_recover --
- *	Recovery function for noop.
- *
- * PUBLIC: int __db_noop_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
- */
-int
-__db_noop_recover(logp, dbtp, lsnp, redo, info)
-	DB_LOG *logp;
-	DBT *dbtp;
-	DB_LSN *lsnp;
-	int redo;
-	void *info;
-{
-	__db_noop_args *argp;
-	DB *file_dbp, *mdbp;
-	DB_MPOOLFILE *mpf;
-	PAGE *pagep;
-	u_int32_t change;
-	int cmp_n, cmp_p, ret;
-
-	REC_PRINT(__db_noop_print);
-	REC_INTRO(__db_noop_read);
-
-	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0)
-		goto out;
-
-	cmp_n = log_compare(lsnp, &LSN(pagep));
-	cmp_p = log_compare(&LSN(pagep), &argp->prevlsn);
-	change = 0;
-	if (cmp_p == 0 && redo) {
-		LSN(pagep) = *lsnp;
-		change = DB_MPOOL_DIRTY;
-	} else if (cmp_n == 0 && !redo) {
-		LSN(pagep) = argp->prevlsn;
-		change = DB_MPOOL_DIRTY;
-	}
-	*lsnp = argp->prev_lsn;
-	ret = memp_fput(mpf, pagep, change);
-
-out:	REC_CLOSE;
-}
diff --git a/db2/db/db_ret.c b/db2/db/db_ret.c
index 9d9b599ad6..9f0d0ecf8d 100644
--- a/db2/db/db_ret.c
+++ b/db2/db/db_ret.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_ret.c	10.13 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db_ret.c	10.16 (Sleepycat) 10/4/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -93,6 +93,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	u_int32_t *memsize;
 	void *(*db_malloc) __P((size_t));
 {
+	int ret;
+
 	/* If returning a partial record, reset the length. */
 	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
 		data = (u_int8_t *)data + dbt->doff;
@@ -120,9 +122,6 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	 * guarantees consistency, i.e., the application can always free memory
 	 * without concern as to how many bytes of the record were requested.
 	 *
-	 * XXX
-	 * Never allocate 0 bytes, it's known to make malloc/realloc unhappy.
-	 *
 	 * Use the memory specified by the application: DB_DBT_USERMEM.
 	 *
 	 * !!!
@@ -130,11 +129,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	 * memory pointer is allowed to be NULL.
 	 */
 	if (F_ISSET(dbt, DB_DBT_MALLOC)) {
-		dbt->data = db_malloc == NULL ?
-		    (void *)__db_malloc(len) :
-		    (void *)db_malloc(len + 1);
-		if (dbt->data == NULL)
-			return (ENOMEM);
+		if ((ret = __os_malloc(len, db_malloc, &dbt->data)) != 0)
+			return (ret);
 	} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
 		if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
 			return (ENOMEM);
@@ -142,12 +138,9 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 		return (EINVAL);
 	} else {
 		if (len != 0 && (*memsize == 0 || *memsize < len)) {
-			*memp = *memp == NULL ?
-			    (void *)__db_malloc(len) :
-			    (void *)__db_realloc(*memp, len);
-			if (*memp == NULL) {
+			if ((ret = __os_realloc(memp, len)) != 0) {
 				*memsize = 0;
-				return (ENOMEM);
+				return (ret);
 			}
 			*memsize = len;
 		}
diff --git a/db2/db/db_thread.c b/db2/db/db_thread.c
deleted file mode 100644
index 73e2a51286..0000000000
--- a/db2/db/db_thread.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- */
-
-#include "config.h"
-
-#ifndef lint
-static const char sccsid[] = "@(#)db_thread.c	8.15 (Sleepycat) 4/26/98";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <errno.h>
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-
-static int __db_getlockid __P((DB *, DB *));
-
-/*
- * __db_gethandle --
- *	Called by db access method routines when the DB_THREAD flag is set.
- *	This routine returns a handle, either an existing handle from the
- *	chain of handles, or creating one if necessary.
- *
- * PUBLIC: int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **));
- */
-int
-__db_gethandle(dbp, am_func, dbpp)
-	DB *dbp, **dbpp;
-	int (*am_func) __P((DB *, DB *));
-{
-	DB *ret_dbp;
-	int ret, t_ret;
-
-	if ((ret = __db_mutex_lock((db_mutex_t *)dbp->mutexp, -1)) != 0)
-		return (ret);
-
-	if ((ret_dbp = LIST_FIRST(&dbp->handleq)) != NULL)
-		/* Simply take one off the list. */
-		LIST_REMOVE(ret_dbp, links);
-	else {
-		/* Allocate a new handle. */
-		if ((ret_dbp = (DB *)__db_malloc(sizeof(*dbp))) == NULL) {
-			ret = ENOMEM;
-			goto err;
-		}
-		memcpy(ret_dbp, dbp, sizeof(*dbp));
-		ret_dbp->internal = NULL;
-		TAILQ_INIT(&ret_dbp->curs_queue);
-
-		/* Set the locker, the lock structure and the lock DBT. */
-		if ((ret = __db_getlockid(dbp, ret_dbp)) != 0)
-			goto err;
-
-		/* Finally, call the access method specific dup function. */
-		if ((ret = am_func(dbp, ret_dbp)) != 0)
-			goto err;
-	}
-
-	*dbpp = ret_dbp;
-
-	if (0) {
-err:		if (ret_dbp != NULL)
-			FREE(ret_dbp, sizeof(*ret_dbp));
-	}
-	if ((t_ret =
-	    __db_mutex_unlock((db_mutex_t *)dbp->mutexp, -1)) != 0 && ret == 0)
-		ret = t_ret;
-	return (ret);
-}
-
-/*
- * __db_puthandle --
- *	Return a DB handle to the pool for later use.
- *
- * PUBLIC: int __db_puthandle __P((DB *));
- */
-int
-__db_puthandle(dbp)
-	DB *dbp;
-{
-	DB *master;
-	int ret;
-
-	master = dbp->master;
-	if ((ret = __db_mutex_lock((db_mutex_t *)master->mutexp, -1)) != 0)
-		return (ret);
-
-	LIST_INSERT_HEAD(&master->handleq, dbp, links);
-
-	return (__db_mutex_unlock((db_mutex_t *)master->mutexp, -1));
-}
-
-/*
- * __db_getlockid --
- *	Create a new locker ID and copy the file lock information from
- *	the old DB into the new one.
- */
-static int
-__db_getlockid(dbp, new_dbp)
-	DB *dbp, *new_dbp;
-{
-	int ret;
-
-	if (F_ISSET(dbp, DB_AM_LOCKING)) {
-		if ((ret = lock_id(dbp->dbenv->lk_info, &new_dbp->locker)) != 0)
-			return (ret);
-		memcpy(new_dbp->lock.fileid, dbp->lock.fileid, DB_FILE_ID_LEN);
-		new_dbp->lock_dbt.size = sizeof(new_dbp->lock);
-		new_dbp->lock_dbt.data = &new_dbp->lock;
-	}
-	return (0);
-}
diff --git a/db2/db185/db185.c b/db2/db185/db185.c
index 893dfa3c7f..739ada83d0 100644
--- a/db2/db185/db185.c
+++ b/db2/db185/db185.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db185.c	8.17 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db185.c	8.21 (Sleepycat) 11/22/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -28,6 +28,10 @@ static const char sccsid[] = "@(#)db185.c	8.17 (Sleepycat) 5/7/98";
 #include "db185_int.h"
 #include "common_ext.h"
 
+#ifndef STDERR_FILENO
+#define	STDERR_FILENO	2
+#endif
+
 static int db185_close __P((DB185 *));
 static int db185_del __P((const DB185 *, const DBT185 *, u_int));
 static int db185_fd __P((const DB185 *));
@@ -37,7 +41,7 @@ static int db185_seq __P((const DB185 *, DBT185 *, DBT185 *, u_int));
 static int db185_sync __P((const DB185 *, u_int));
 
 DB185 *
-__dbopen(file, oflags, mode, type, openinfo)
+dbopen(file, oflags, mode, type, openinfo)
 	const char *file;
 	int oflags, mode;
 	DBTYPE type;
@@ -49,9 +53,10 @@ __dbopen(file, oflags, mode, type, openinfo)
 	DB *dbp;
 	DB185 *db185p;
 	DB_INFO dbinfo, *dbinfop;
-	int s_errno;
+	ssize_t nw;
+	int fd, s_errno;
 
-	if ((db185p = (DB185 *)__db_calloc(1, sizeof(DB185))) == NULL)
+	if ((errno = __os_calloc(1, sizeof(DB185), &db185p)) != 0)
 		return (NULL);
 	dbinfop = NULL;
 	memset(&dbinfo, 0, sizeof(dbinfo));
@@ -93,7 +98,8 @@ __dbopen(file, oflags, mode, type, openinfo)
 			dbinfop->h_ffactor = hi->ffactor;
 			dbinfop->h_nelem = hi->nelem;
 			dbinfop->db_cachesize = hi->cachesize;
-			dbinfop->h_hash = hi->hash;
+			dbinfop->h_hash = (u_int32_t (*)
+			    __P((const void *, u_int32_t)))hi->hash;
 			dbinfop->db_lorder = hi->lorder;
 		}
 
@@ -127,14 +133,15 @@ __dbopen(file, oflags, mode, type, openinfo)
 		 * that in DB 2.0, so do that cast.
 		 */
 		if (file != NULL) {
-			if (oflags & O_CREAT && __db_exists(file, NULL) != 0)
-				(void)__os_close(__os_open(file, oflags, mode));
+			if (oflags & O_CREAT && __os_exists(file, NULL) != 0)
+				if (__os_open(file, oflags, mode, &fd) == 0)
+					(void)__os_close(fd);
 			dbinfop->re_source = (char *)file;
-			file = NULL;
 
 			if (O_RDONLY)
 				oflags &= ~O_RDONLY;
 			oflags |= O_RDWR;
+			file = NULL;
 		}
 
 		if ((ri = openinfo) != NULL) {
@@ -144,7 +151,8 @@ __dbopen(file, oflags, mode, type, openinfo)
 			 */
 #define	BFMSG	"DB: DB 1.85's recno bfname field is not supported.\n"
 			if (ri->bfname != NULL) {
-				(void)__os_write(2, BFMSG, sizeof(BFMSG) - 1);
+				(void)__os_write(STDERR_FILENO,
+				    BFMSG, sizeof(BFMSG) - 1, &nw);
 				goto einval;
 			}
 
@@ -196,27 +204,26 @@ __dbopen(file, oflags, mode, type, openinfo)
 	 */
 	if ((errno = db_open(file,
 	    type, __db_oflags(oflags), mode, NULL, dbinfop, &dbp)) != 0) {
-		__db_free(db185p);
+		__os_free(db185p, sizeof(DB185));
 		return (NULL);
 	}
 
 	/* Create the cursor used for sequential ops. */
-	if ((errno = dbp->cursor(dbp, NULL, &((DB185 *)db185p)->dbc)) != 0) {
+	if ((errno = dbp->cursor(dbp, NULL, &((DB185 *)db185p)->dbc, 0)) != 0) {
 		s_errno = errno;
 		(void)dbp->close(dbp, 0);
-		__db_free(db185p);
-		__set_errno(s_errno);
+		__os_free(db185p, sizeof(DB185));
+		errno = s_errno;
 		return (NULL);
 	}
 
 	db185p->internal = dbp;
 	return (db185p);
 
-einval:	__db_free(db185p);
-	__set_errno(EINVAL);
+einval:	__os_free(db185p, sizeof(DB185));
+	errno = EINVAL;
 	return (NULL);
 }
-weak_alias (__dbopen, dbopen)
 
 static int
 db185_close(db185p)
@@ -226,9 +233,9 @@ db185_close(db185p)
 
 	dbp = (DB *)db185p->internal;
 
-	__set_errno(dbp->close(dbp, 0));
+	errno = dbp->close(dbp, 0);
 
-	__db_free(db185p);
+	__os_free(db185p, sizeof(DB185));
 
 	return (errno == 0 ? 0 : -1);
 }
@@ -251,9 +258,9 @@ db185_del(db185p, key185, flags)
 	if (flags & ~R_CURSOR)
 		goto einval;
 	if (flags & R_CURSOR)
-		__set_errno(db185p->dbc->c_del(db185p->dbc, 0));
+		errno = db185p->dbc->c_del(db185p->dbc, 0);
 	else
-		__set_errno(dbp->del(dbp, NULL, &key, 0));
+		errno = dbp->del(dbp, NULL, &key, 0);
 
 	switch (errno) {
 	case 0:
@@ -263,7 +270,7 @@ db185_del(db185p, key185, flags)
 	}
 	return (-1);
 
-einval:	__set_errno(EINVAL);
+einval:	errno = EINVAL;
 	return (-1);
 }
 
@@ -276,7 +283,7 @@ db185_fd(db185p)
 
 	dbp = (DB *)db185p->internal;
 
-	return ((__set_errno(dbp->fd(dbp, &fd))) == 0 ? fd : -1);
+	return ((errno = dbp->fd(dbp, &fd)) == 0 ? fd : -1);
 }
 
 static int
@@ -301,7 +308,7 @@ db185_get(db185p, key185, data185, flags)
 	if (flags)
 		goto einval;
 
-	switch (__set_errno(dbp->get(dbp, NULL, &key, &data, 0))) {
+	switch (errno = dbp->get(dbp, NULL, &key, &data, 0)) {
 	case 0:
 		data185->data = data.data;
 		data185->size = data.size;
@@ -311,7 +318,7 @@ db185_get(db185p, key185, data185, flags)
 	}
 	return (-1);
 
-einval:	__set_errno(EINVAL);
+einval:	errno = EINVAL;
 	return (-1);
 }
 
@@ -338,46 +345,46 @@ db185_put(db185p, key185, data185, flags)
 
 	switch (flags) {
 	case 0:
-		__set_errno(dbp->put(dbp, NULL, &key, &data, 0));
+		errno = dbp->put(dbp, NULL, &key, &data, 0);
 		break;
 	case R_CURSOR:
-		__set_errno(
-		    db185p->dbc->c_put(db185p->dbc, &key, &data, DB_CURRENT));
+		errno =
+		    db185p->dbc->c_put(db185p->dbc, &key, &data, DB_CURRENT);
 		break;
 	case R_IAFTER:
 	case R_IBEFORE:
 		if (dbp->type != DB_RECNO)
 			goto einval;
 
-		if ((__set_errno(dbp->cursor(dbp, NULL, &dbcp_put))) != 0)
+		if ((errno = dbp->cursor(dbp, NULL, &dbcp_put, 0)) != 0)
 			return (-1);
-		if ((__set_errno(
-		    dbcp_put->c_get(dbcp_put, &key, &data, DB_SET))) != 0) {
+		if ((errno =
+		    dbcp_put->c_get(dbcp_put, &key, &data, DB_SET)) != 0) {
 			s_errno = errno;
 			(void)dbcp_put->c_close(dbcp_put);
-			__set_errno(s_errno);
+			errno = s_errno;
 			return (-1);
 		}
 		memset(&data, 0, sizeof(data));
 		data.data = data185->data;
 		data.size = data185->size;
-		__set_errno(dbcp_put->c_put(dbcp_put,
-		    &key, &data, flags == R_IAFTER ? DB_AFTER : DB_BEFORE));
+		errno = dbcp_put->c_put(dbcp_put,
+		    &key, &data, flags == R_IAFTER ? DB_AFTER : DB_BEFORE);
 		s_errno = errno;
 		(void)dbcp_put->c_close(dbcp_put);
-		__set_errno(s_errno);
+		errno = s_errno;
 		break;
 	case R_NOOVERWRITE:
-		__set_errno(dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE));
+		errno = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE);
 		break;
 	case R_SETCURSOR:
 		if (dbp->type != DB_BTREE && dbp->type != DB_RECNO)
 			goto einval;
 
-		if ((__set_errno(dbp->put(dbp, NULL, &key, &data, 0))) != 0)
+		if ((errno = dbp->put(dbp, NULL, &key, &data, 0)) != 0)
 			break;
-		__set_errno(db185p->dbc->c_get(db185p->dbc,
-					       &key, &data, DB_SET_RANGE));
+		errno =
+		    db185p->dbc->c_get(db185p->dbc, &key, &data, DB_SET_RANGE);
 		break;
 	default:
 		goto einval;
@@ -393,7 +400,7 @@ db185_put(db185p, key185, data185, flags)
 	}
 	return (-1);
 
-einval:	__set_errno(EINVAL);
+einval:	errno = EINVAL;
 	return (-1);
 }
 
@@ -438,8 +445,7 @@ db185_seq(db185p, key185, data185, flags)
 	default:
 		goto einval;
 	}
-	switch (__set_errno(db185p->dbc->c_get(db185p->dbc,
-					       &key, &data, flags))) {
+	switch (errno = db185p->dbc->c_get(db185p->dbc, &key, &data, flags)) {
 	case 0:
 		key185->data = key.data;
 		key185->size = key.size;
@@ -451,7 +457,7 @@ db185_seq(db185p, key185, data185, flags)
 	}
 	return (-1);
 
-einval:	__set_errno(EINVAL);
+einval:	errno = EINVAL;
 	return (-1);
 }
 
@@ -461,6 +467,7 @@ db185_sync(db185p, flags)
 	u_int flags;
 {
 	DB *dbp;
+	ssize_t nw;
 
 	dbp = (DB *)db185p->internal;
 
@@ -473,14 +480,14 @@ db185_sync(db185p, flags)
 		 * We can't support the R_RECNOSYNC flag.
 		 */
 #define	RSMSG	"DB: DB 1.85's R_RECNOSYNC sync flag is not supported.\n"
-		(void)__os_write(2, RSMSG, sizeof(RSMSG) - 1);
+		(void)__os_write(STDERR_FILENO, RSMSG, sizeof(RSMSG) - 1, &nw);
 		goto einval;
 	default:
 		goto einval;
 	}
 
-	return ((__set_errno(dbp->sync(dbp, 0))) == 0 ? 0 : -1);
+	return ((errno = dbp->sync(dbp, 0)) == 0 ? 0 : -1);
 
-einval:	__set_errno(EINVAL);
+einval:	errno = EINVAL;
 	return (-1);
 }
diff --git a/db2/db_185.h b/db2/db_185.h
index 0be51f5074..a928ca8fd5 100644
--- a/db2/db_185.h
+++ b/db2/db_185.h
@@ -65,11 +65,11 @@
 
 #ifndef	__BIT_TYPES_DEFINED__
 #define	__BIT_TYPES_DEFINED__
-
-
-
-
-
+@u_int8_decl@
+@int16_decl@
+@u_int16_decl@
+@int32_decl@
+@u_int32_decl@
 #endif
 
 /*
diff --git a/db2/db_int.h b/db2/db_int.h
index 92a3817764..0016240e70 100644
--- a/db2/db_int.h
+++ b/db2/db_int.h
@@ -4,14 +4,15 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db_int.h.src	10.62 (Sleepycat) 5/23/98
+ *	@(#)db_int.h	10.77 (Sleepycat) 1/3/99
  */
 
 #ifndef _DB_INTERNAL_H_
 #define	_DB_INTERNAL_H_
 
-#include <db.h>				/* Standard DB include file. */
+#include "db.h"				/* Standard DB include file. */
 #include "queue.h"
+#include "shqueue.h"
 
 /*******************************************************
  * General purpose constants and macros.
@@ -75,27 +76,7 @@
 #define	R_ADDR(base, offset)	((void *)((u_int8_t *)((base)->addr) + offset))
 #define	R_OFFSET(base, p)	((u_int8_t *)(p) - (u_int8_t *)(base)->addr)
 
-/* Free and free-string macros that overwrite memory. */
-#ifdef DIAGNOSTIC
-#undef	FREE
-#define	FREE(p, len) {							\
-	memset(p, 0xff, len);						\
-	__db_free(p);							\
-}
-#undef	FREES
-#define	FREES(p) {							\
-	FREE(p, strlen(p));						\
-}
-#else
-#undef	FREE
-#define	FREE(p, len) {							\
-	__db_free(p);							\
-}
-#undef	FREES
-#define	FREES(p) {							\
-	__db_free(p);							\
-}
-#endif
+#define	DB_DEFAULT	0x000000	/* No flag was specified. */
 
 /* Structure used to print flag values. */
 typedef struct __fn {
@@ -111,25 +92,29 @@ typedef struct __fn {
 #define	LF_CLR(f)	(flags &= ~(f))
 #define	LF_ISSET(f)	(flags & (f))
 
+/*
+ * Panic check:
+ * All interfaces check the panic flag; if it's set, the tree is dead.
+ */
+#define	DB_PANIC_CHECK(dbp) {						\
+	if ((dbp)->dbenv != NULL && (dbp)->dbenv->db_panic != 0)	\
+		return (DB_RUNRECOVERY);				\
+}
+
 /* Display separator string. */
 #undef	DB_LINE
 #define	DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
 
-/* Global variables. */
-typedef struct __db_globals {
-	int db_mutexlocks;		/* DB_MUTEXLOCKS */
-	int db_region_anon;		/* DB_REGION_ANON, DB_REGION_NAME */
-	int db_region_init;		/* DB_REGION_INIT */
-	int db_tsl_spins;		/* DB_TSL_SPINS */
-	int db_pageyield;		/* DB_PAGEYIELD */
-} DB_GLOBALS;
-extern	DB_GLOBALS	__db_global_values;
-#define	DB_GLOBAL(v)	__db_global_values.v
-
 /* Unused, or not-used-yet variable.  "Shut that bloody compiler up!" */
 #define	COMPQUIET(n, v)	(n) = (v)
 
 /*
+ * Purify and similar run-time tools complain about uninitialized reads/writes
+ * for structure fields whose only purpose is padding.
+ */
+#define	UMRW(v)		(v) = 0
+
+/*
  * Win16 needs specific syntax on callback functions.  Nobody else cares.
  */
 #ifndef	DB_CALLBACK
@@ -155,8 +140,6 @@ extern	DB_GLOBALS	__db_global_values;
  *******************************************************/
 typedef unsigned char tsl_t;
 
-
-
 /*
  * !!!
  * Various systems require different alignments for mutexes (the worst we've
@@ -204,21 +187,6 @@ typedef struct _db_mutex_t {
 	if (F_ISSET(dbp, DB_AM_THREAD))					\
 	    (void)__db_mutex_unlock((db_mutex_t *)(dbp)->mutexp, -1);
 
-/* Btree/recno local statistics structure. */
-struct __db_bt_lstat;	typedef struct __db_bt_lstat DB_BTREE_LSTAT;
-struct __db_bt_lstat {
-	u_int32_t bt_freed;		/* Pages freed for reuse. */
-	u_int32_t bt_pfxsaved;		/* Bytes saved by prefix compression. */
-	u_int32_t bt_split;		/* Total number of splits. */
-	u_int32_t bt_rootsplit;		/* Root page splits. */
-	u_int32_t bt_fastsplit;		/* Fast splits. */
-	u_int32_t bt_added;		/* Items added. */
-	u_int32_t bt_deleted;		/* Items deleted. */
-	u_int32_t bt_get;		/* Items retrieved. */
-	u_int32_t bt_cache_hit;		/* Hits in fast-insert code. */
-	u_int32_t bt_cache_miss;	/* Misses in fast-insert code. */
-};
-
 /*******************************************************
  * Environment.
  *******************************************************/
@@ -250,6 +218,7 @@ typedef struct _rlayout {
 	int	   majver;		/* Major version number. */
 	int	   minver;		/* Minor version number. */
 	int	   patch;		/* Patch version number. */
+	int	   panic;		/* Region is dead. */
 #define	INVALID_SEGID	-1
 	int	   segid;		/* shmget(2) ID, or Win16 segment ID. */
 
@@ -262,9 +231,9 @@ typedef struct _rlayout {
  * we don't make the underlying VM unhappy.
  */
 #define	DB_VMPAGESIZE	(4 * 1024)
-#define	DB_ROUNDOFF(i) {						\
-	(i) += DB_VMPAGESIZE - 1;					\
-	(i) -= (i) % DB_VMPAGESIZE;					\
+#define	DB_ROUNDOFF(n, round) {						\
+	(n) += (round) - 1;						\
+	(n) -= (n) % (round);						\
 }
 
 /*
@@ -292,6 +261,7 @@ struct __db_reginfo {
 					   and mmap(2) is being used to map it
 					   into our address space. */
 	int	    segid;		/* shmget(2) ID, or Win16 segment ID. */
+	void	   *wnt_handle;		/* Win/NT HANDLE. */
 
 					/* Shared flags. */
 /*				0x0001	COMMON MASK with RLAYOUT structure. */
@@ -334,8 +304,8 @@ typedef struct __dbpginfo {
 #define	IS_ZERO_LSN(LSN)	((LSN).file == 0)
 
 /* Test if we need to log a change. */
-#define	DB_LOGGING(dbp)							\
-	(F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER))
+#define	DB_LOGGING(dbc)							\
+	(F_ISSET((dbc)->dbp, DB_AM_LOGGING) && !F_ISSET(dbc, DBC_RECOVER))
 
 #ifdef DIAGNOSTIC
 /*
@@ -350,30 +320,30 @@ typedef struct __dbpginfo {
  * A data
  * F flags
  */
-#define	LOG_OP(D, T, O, K, A, F) {					\
+#define	LOG_OP(C, T, O, K, A, F) {					\
 	DB_LSN _lsn;							\
 	DBT _op;							\
-	if (DB_LOGGING((D))) {						\
+	if (DB_LOGGING((C))) {						\
 		memset(&_op, 0, sizeof(_op));				\
 		_op.data = O;						\
 		_op.size = strlen(O) + 1;				\
-		(void)__db_debug_log((D)->dbenv->lg_info,		\
-		    T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F);	\
+		(void)__db_debug_log((C)->dbp->dbenv->lg_info,		\
+		    T, &_lsn, 0, &_op, (C)->dbp->log_fileid, K, A, F);	\
 	}								\
 }
 #ifdef DEBUG_ROP
-#define	DEBUG_LREAD(D, T, O, K, A, F)	LOG_OP(D, T, O, K, A, F)
+#define	DEBUG_LREAD(C, T, O, K, A, F)	LOG_OP(C, T, O, K, A, F)
 #else
-#define	DEBUG_LREAD(D, T, O, K, A, F)
+#define	DEBUG_LREAD(C, T, O, K, A, F)
 #endif
 #ifdef DEBUG_WOP
-#define	DEBUG_LWRITE(D, T, O, K, A, F)	LOG_OP(D, T, O, K, A, F)
+#define	DEBUG_LWRITE(C, T, O, K, A, F)	LOG_OP(C, T, O, K, A, F)
 #else
-#define	DEBUG_LWRITE(D, T, O, K, A, F)
+#define	DEBUG_LWRITE(C, T, O, K, A, F)
 #endif
 #else
-#define	DEBUG_LREAD(D, T, O, K, A, F)
-#define	DEBUG_LWRITE(D, T, O, K, A, F)
+#define	DEBUG_LREAD(C, T, O, K, A, F)
+#define	DEBUG_LWRITE(C, T, O, K, A, F)
 #endif /* DIAGNOSTIC */
 
 /*******************************************************
@@ -393,10 +363,45 @@ struct __db_txn {
 	DB_LSN		last_lsn;	/* Lsn of last log write. */
 	u_int32_t	txnid;		/* Unique transaction id. */
 	size_t		off;		/* Detail structure within region. */
-	TAILQ_ENTRY(__db_txn) links;
+	TAILQ_ENTRY(__db_txn) links;	/* Links transactions off manager. */
+	TAILQ_HEAD(__kids, __db_txn) kids; /* Child transactions. */
+	TAILQ_ENTRY(__db_txn) klinks;	/* Links child transactions. */
+
+#define	TXN_MALLOC	0x01		/* Structure allocated by TXN system. */
+	u_int32_t	flags;
+};
+
+/*******************************************************
+ * Global variables.
+ *******************************************************/
+/*
+ * !!!
+ * Initialized in os/os_config.c, don't change this unless you change it
+ * as well.
+ */
+
+struct __rmname {
+	char *dbhome;
+	int rmid;
+	TAILQ_ENTRY(__rmname) links;
 };
 
-#include "os_func.h"
+typedef struct __db_globals {
+	int db_mutexlocks;		/* DB_MUTEXLOCKS */
+	int db_pageyield;		/* DB_PAGEYIELD */
+	int db_region_anon;		/* DB_REGION_ANON, DB_REGION_NAME */
+	int db_region_init;		/* DB_REGION_INIT */
+	int db_tsl_spins;		/* DB_TSL_SPINS */
+					/* XA: list of opened environments. */
+	TAILQ_HEAD(__db_envq, __db_env) db_envq;
+					/* XA: list of id to dbhome mappings. */
+	TAILQ_HEAD(__db_nameq, __rmname) db_nameq;
+} DB_GLOBALS;
+
+extern	DB_GLOBALS	__db_global_values;
+#define	DB_GLOBAL(v)	__db_global_values.v
+
+#include "os.h"
 #include "os_ext.h"
 
 #endif /* !_DB_INTERNAL_H_ */
diff --git a/db2/dbm/dbm.c b/db2/dbm/dbm.c
index 261fe81ff2..5bcb53f023 100644
--- a/db2/dbm/dbm.c
+++ b/db2/dbm/dbm.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)dbm.c	10.16 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)dbm.c	10.23 (Sleepycat) 11/22/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -89,6 +89,16 @@ __db_dbm_init(file)
 }
 weak_alias (__db_dbm_init, dbminit)
 
+int
+__db_dbm_close()
+{
+	if (__cur_db != NULL) {
+		dbm_close(__cur_db);
+		__cur_db = NULL;
+	}
+	return (0);
+}
+
 datum
 __db_dbm_fetch(key)
 	datum key;
@@ -140,16 +150,11 @@ int
 __db_dbm_delete(key)
 	datum key;
 {
-	int ret;
-
 	if (__cur_db == NULL) {
 		__db_no_open();
 		return (-1);
 	}
-	ret = dbm_delete(__cur_db, key);
-	if (ret == 0)
-		ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0);
-	return (ret);
+	return (dbm_delete(__cur_db, key));
 }
 weak_alias (__db_dbm_delete, delete)
 
@@ -157,16 +162,11 @@ int
 __db_dbm_store(key, dat)
 	datum key, dat;
 {
-	int ret;
-
 	if (__cur_db == NULL) {
 		__db_no_open();
 		return (-1);
 	}
-	ret = dbm_store(__cur_db, key, dat, DBM_REPLACE);
-	if (ret == 0)
-		ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0);
-	return (ret);
+	return (dbm_store(__cur_db, key, dat, DBM_REPLACE));
 }
 weak_alias (__db_dbm_store, store)
 
@@ -192,7 +192,9 @@ __db_ndbm_open(file, oflags, mode)
 	int oflags, mode;
 {
 	DB *dbp;
+	DBC *dbc;
 	DB_INFO dbinfo;
+	int sv_errno;
 	char path[MAXPATHLEN];
 
 	memset(&dbinfo, 0, sizeof(dbinfo));
@@ -215,7 +217,15 @@ __db_ndbm_open(file, oflags, mode)
 	if ((errno = db_open(path,
 	    DB_HASH, __db_oflags(oflags), mode, NULL, &dbinfo, &dbp)) != 0)
 		return (NULL);
-	return ((DBM *)dbp);
+
+	if ((errno = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) {
+		sv_errno = errno;
+		(void)dbp->close(dbp, 0);
+		errno = sv_errno;
+		return (NULL);
+	}
+
+	return ((DBM *)dbc);
 }
 weak_alias (__db_ndbm_open, dbm_open)
 
@@ -224,10 +234,14 @@ weak_alias (__db_ndbm_open, dbm_open)
  *	Nothing.
  */
 void
-__db_ndbm_close(db)
-	DBM *db;
+__db_ndbm_close(dbm)
+	DBM *dbm;
 {
-	(void)db->close(db, 0);
+	DBC *dbc;
+
+	dbc = (DBC *)dbm;
+
+	(void)dbc->dbp->close(dbc->dbp, 0);
 }
 weak_alias (__db_ndbm_close, dbm_close)
 
@@ -237,25 +251,39 @@ weak_alias (__db_ndbm_close, dbm_close)
  *	NULL on failure
  */
 datum
-__db_ndbm_fetch(db, key)
-	DBM *db;
+__db_ndbm_fetch(dbm, key)
+	DBM *dbm;
 	datum key;
 {
+	DBC *dbc;
 	DBT _key, _data;
 	datum data;
 	int ret;
 
+	dbc = (DBC *)dbm;
+
 	memset(&_key, 0, sizeof(DBT));
 	memset(&_data, 0, sizeof(DBT));
 	_key.size = key.dsize;
 	_key.data = key.dptr;
-	if ((ret = db->get((DB *)db, NULL, &_key, &_data, 0)) == 0) {
+
+	/*
+	 * Note that we can't simply use the dbc we have to do a c_get/SET,
+	 * because that cursor is the one used for sequential iteration and
+	 * it has to remain stable in the face of intervening gets and puts.
+	 */
+	if ((ret = dbc->dbp->get(dbc->dbp, NULL, &_key, &_data, 0)) == 0) {
 		data.dptr = _data.data;
 		data.dsize = _data.size;
 	} else {
 		data.dptr = NULL;
 		data.dsize = 0;
-		__set_errno (ret == DB_NOTFOUND ? ENOENT : ret);
+		if (ret == DB_NOTFOUND)
+			errno = ENOENT;
+		else {
+			errno = ret;
+			F_SET(dbc->dbp, DB_DBM_ERROR);
+		}
 	}
 	return (data);
 }
@@ -267,30 +295,31 @@ weak_alias (__db_ndbm_fetch, dbm_fetch)
  *	NULL on failure
  */
 datum
-__db_ndbm_firstkey(db)
-	DBM *db;
+__db_ndbm_firstkey(dbm)
+	DBM *dbm;
 {
+	DBC *dbc;
 	DBT _key, _data;
 	datum key;
 	int ret;
 
-	DBC *cp;
-
-	if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL)
-		if ((errno = db->cursor(db, NULL, &cp)) != 0) {
-			memset(&key, 0, sizeof(key));
-			return (key);
-		}
+	dbc = (DBC *)dbm;
 
 	memset(&_key, 0, sizeof(DBT));
 	memset(&_data, 0, sizeof(DBT));
-	if ((ret = (cp->c_get)(cp, &_key, &_data, DB_FIRST)) == 0) {
+
+	if ((ret = dbc->c_get(dbc, &_key, &_data, DB_FIRST)) == 0) {
 		key.dptr = _key.data;
 		key.dsize = _key.size;
 	} else {
 		key.dptr = NULL;
 		key.dsize = 0;
-		__set_errno (ret == DB_NOTFOUND ? ENOENT : ret);
+		if (ret == DB_NOTFOUND)
+			errno = ENOENT;
+		else {
+			errno = ret;
+			F_SET(dbc->dbp, DB_DBM_ERROR);
+		}
 	}
 	return (key);
 }
@@ -302,29 +331,31 @@ weak_alias (__db_ndbm_firstkey, dbm_firstkey)
  *	NULL on failure
  */
 datum
-__db_ndbm_nextkey(db)
-	DBM *db;
+__db_ndbm_nextkey(dbm)
+	DBM *dbm;
 {
-	DBC *cp;
+	DBC *dbc;
 	DBT _key, _data;
 	datum key;
 	int ret;
 
-	if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL)
-		if ((errno = db->cursor(db, NULL, &cp)) != 0) {
-			memset(&key, 0, sizeof(key));
-			return (key);
-		}
+	dbc = (DBC *)dbm;
 
 	memset(&_key, 0, sizeof(DBT));
 	memset(&_data, 0, sizeof(DBT));
-	if ((ret = (cp->c_get)(cp, &_key, &_data, DB_NEXT)) == 0) {
+
+	if ((ret = dbc->c_get(dbc, &_key, &_data, DB_NEXT)) == 0) {
 		key.dptr = _key.data;
 		key.dsize = _key.size;
 	} else {
 		key.dptr = NULL;
 		key.dsize = 0;
-		__set_errno (ret == DB_NOTFOUND ? ENOENT : ret);
+		if (ret == DB_NOTFOUND)
+			errno = ENOENT;
+		else {
+			errno = ret;
+			F_SET(dbc->dbp, DB_DBM_ERROR);
+		}
 	}
 	return (key);
 }
@@ -336,19 +367,29 @@ weak_alias (__db_ndbm_nextkey, dbm_nextkey)
  *	<0 failure
  */
 int
-__db_ndbm_delete(db, key)
-	DBM *db;
+__db_ndbm_delete(dbm, key)
+	DBM *dbm;
 	datum key;
 {
+	DBC *dbc;
 	DBT _key;
 	int ret;
 
+	dbc = (DBC *)dbm;
+
 	memset(&_key, 0, sizeof(DBT));
 	_key.data = key.dptr;
 	_key.size = key.dsize;
-	if ((ret = (((DB *)db)->del)((DB *)db, NULL, &_key, 0)) == 0)
+
+	if ((ret = dbc->dbp->del(dbc->dbp, NULL, &_key, 0)) == 0)
 		return (0);
-	errno = ret == DB_NOTFOUND ? ENOENT : ret;
+
+	if (ret == DB_NOTFOUND)
+		errno = ENOENT;
+	else {
+		errno = ret;
+		F_SET(dbc->dbp, DB_DBM_ERROR);
+	}
 	return (-1);
 }
 weak_alias (__db_ndbm_delete, dbm_delete)
@@ -360,49 +401,59 @@ weak_alias (__db_ndbm_delete, dbm_delete)
  *	 1 if DBM_INSERT and entry exists
  */
 int
-__db_ndbm_store(db, key, data, flags)
-	DBM *db;
+__db_ndbm_store(dbm, key, data, flags)
+	DBM *dbm;
 	datum key, data;
 	int flags;
 {
+	DBC *dbc;
 	DBT _key, _data;
 	int ret;
 
+	dbc = (DBC *)dbm;
+
 	memset(&_key, 0, sizeof(DBT));
-	memset(&_data, 0, sizeof(DBT));
 	_key.data = key.dptr;
 	_key.size = key.dsize;
+
+	memset(&_data, 0, sizeof(DBT));
 	_data.data = data.dptr;
 	_data.size = data.dsize;
-	if ((ret = db->put((DB *)db, NULL,
+
+	if ((ret = dbc->dbp->put(dbc->dbp, NULL,
 	    &_key, &_data, flags == DBM_INSERT ? DB_NOOVERWRITE : 0)) == 0)
 		return (0);
+
 	if (ret == DB_KEYEXIST)
 		return (1);
+
 	errno = ret;
+	F_SET(dbc->dbp, DB_DBM_ERROR);
 	return (-1);
 }
 weak_alias (__db_ndbm_store, dbm_store)
 
 int
-__db_ndbm_error(db)
-	DBM *db;
+__db_ndbm_error(dbm)
+	DBM *dbm;
 {
-	HTAB *hp;
+	DBC *dbc;
 
-	hp = (HTAB *)db->internal;
-	return (hp->local_errno);
+	dbc = (DBC *)dbm;
+
+	return (F_ISSET(dbc->dbp, DB_DBM_ERROR));
 }
 weak_alias (__db_ndbm_error, dbm_error)
 
 int
-__db_ndbm_clearerr(db)
-	DBM *db;
+__db_ndbm_clearerr(dbm)
+	DBM *dbm;
 {
-	HTAB *hp;
+	DBC *dbc;
+
+	dbc = (DBC *)dbm;
 
-	hp = (HTAB *)db->internal;
-	hp->local_errno = 0;
+	F_CLR(dbc->dbp, DB_DBM_ERROR);
 	return (0);
 }
 weak_alias (__db_ndbm_clearerr, dbm_clearerr)
@@ -413,10 +464,14 @@ weak_alias (__db_ndbm_clearerr, dbm_clearerr)
  *	0 if not read-only
  */
 int
-__db_ndbm_rdonly(db)
-	DBM *db;
+__db_ndbm_rdonly(dbm)
+	DBM *dbm;
 {
-	return (F_ISSET((DB *)db, DB_AM_RDONLY) ? 1 : 0);
+	DBC *dbc;
+
+	dbc = (DBC *)dbm;
+
+	return (F_ISSET(dbc->dbp, DB_AM_RDONLY) ? 1 : 0);
 }
 
 /*
@@ -426,23 +481,23 @@ __db_ndbm_rdonly(db)
  * and picked one to use at random.
  */
 int
-__db_ndbm_dirfno(db)
-	DBM *db;
+__db_ndbm_dirfno(dbm)
+	DBM *dbm;
 {
-	int fd;
-
-	(void)db->fd(db, &fd);
-	return (fd);
+	return (dbm_pagfno(dbm));
 }
 weak_alias (__db_ndbm_dirfno, dbm_dirfno)
 
 int
-__db_ndbm_pagfno(db)
-	DBM *db;
+__db_ndbm_pagfno(dbm)
+	DBM *dbm;
 {
+	DBC *dbc;
 	int fd;
 
-	(void)db->fd(db, &fd);
+	dbc = (DBC *)dbm;
+
+	(void)dbc->dbp->fd(dbc->dbp, &fd);
 	return (fd);
 }
 weak_alias (__db_ndbm_pagfno, dbm_pagfno)
diff --git a/db2/hash/hash.c b/db2/hash/hash.c
index 0265f19659..0d202fce20 100644
--- a/db2/hash/hash.c
+++ b/db2/hash/hash.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)hash.c	10.45 (Sleepycat) 5/11/98";
+static const char sccsid[] = "@(#)hash.c	10.63 (Sleepycat) 12/11/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -64,23 +64,23 @@ static const char sccsid[] = "@(#)hash.c	10.45 (Sleepycat) 5/11/98";
 #include "db_am.h"
 #include "db_ext.h"
 #include "hash.h"
+#include "btree.h"
 #include "log.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
 
 static int  __ham_c_close __P((DBC *));
 static int  __ham_c_del __P((DBC *, u_int32_t));
+static int  __ham_c_destroy __P((DBC *));
 static int  __ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
 static int  __ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
-static int  __ham_c_init __P((DB *, DB_TXN *, DBC **));
-static int  __ham_cursor __P((DB *, DB_TXN *, DBC **));
 static int  __ham_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
-static int  __ham_dup_return __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
-static int  __ham_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-static void __ham_init_htab __P((HTAB *, u_int32_t, u_int32_t));
-static int  __ham_lookup __P((HTAB *,
-		HASH_CURSOR *, const DBT *, u_int32_t, db_lockmode_t));
-static int  __ham_overwrite __P((HTAB *, HASH_CURSOR *, DBT *));
-static int  __ham_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-static int  __ham_sync __P((DB *, u_int32_t));
+static int  __ham_dup_return __P((DBC *, DBT *, u_int32_t));
+static int  __ham_expand_table __P((DBC *));
+static void __ham_init_htab __P((DBC *, u_int32_t, u_int32_t));
+static int  __ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t));
+static int  __ham_overwrite __P((DBC *, DBT *));
 
 /************************** INTERFACE ROUTINES ***************************/
 /* OPEN/CLOSE */
@@ -96,65 +96,53 @@ __ham_open(dbp, dbinfo)
 	DB_INFO *dbinfo;
 {
 	DB_ENV *dbenv;
-	DBC *curs;
-	HTAB *hashp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	int file_existed, ret;
 
+	dbc = NULL;
 	dbenv = dbp->dbenv;
 
-	if ((hashp = (HTAB *)__db_calloc(1, sizeof(HTAB))) == NULL)
-		return (ENOMEM);
-	hashp->dbp = dbp;
-
 	/* Set the hash function if specified by the user. */
 	if (dbinfo != NULL && dbinfo->h_hash != NULL)
-		hashp->hash = dbinfo->h_hash;
+		dbp->h_hash = dbinfo->h_hash;
 
 	/*
-	 * Initialize the remaining fields of the dbp.  The type, close and
-	 * fd functions are all set in db_open.
+	 * Initialize the remaining fields of the dbp.  The only function
+	 * that differs from the default set is __ham_stat().
 	 */
-	dbp->internal = hashp;
-	dbp->cursor = __ham_cursor;
+	dbp->internal = NULL;
+	dbp->am_close = __ham_close;
 	dbp->del = __ham_delete;
-	dbp->get = __ham_get;
-	dbp->put = __ham_put;
-	dbp->sync = __ham_sync;
-
-	/* If locking is turned on, lock the meta data page. */
-	if (F_ISSET(dbp, DB_AM_LOCKING)) {
-		dbp->lock.pgno = BUCKET_INVALID;
-		if ((ret = lock_get(dbenv->lk_info, dbp->locker,
-		    0, &dbp->lock_dbt, DB_LOCK_READ, &hashp->hlock)) != 0) {
-			if (ret < 0)
-				ret = EAGAIN;
-			goto out;
-		}
-	}
+	dbp->stat = __ham_stat;
+
+	/* Get a cursor we can use for the rest of this function. */
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		goto out;
+
+	hcp = (HASH_CURSOR *)dbc->internal;
+	GET_META(dbp, hcp, ret);
+	if (ret != 0)
+		goto out;
 
 	/*
-	 * Now, we can try to read the meta-data page and figure out
-	 * if we set up locking and get the meta-data page properly.
 	 * If this is a new file, initialize it, and put it back dirty.
 	 */
-	if ((ret = __ham_get_page(hashp->dbp, 0, (PAGE **)&hashp->hdr)) != 0)
-		goto out;
 
-	/* Initialize the hashp structure */
-	if (hashp->hdr->magic == DB_HASHMAGIC) {
+	/* Initialize the hdr structure */
+	if (hcp->hdr->magic == DB_HASHMAGIC) {
 		file_existed = 1;
 		/* File exists, verify the data in the header. */
-		if (hashp->hash == NULL)
-			hashp->hash =
-			    hashp->hdr->version < 5 ? __ham_func4 : __ham_func5;
-		if (hashp->hash(CHARKEY, sizeof(CHARKEY)) !=
-		    hashp->hdr->h_charkey) {
-			__db_err(hashp->dbp->dbenv,
-			    "hash: incompatible hash function");
+		if (dbp->h_hash == NULL)
+			dbp->h_hash =
+			    hcp->hdr->version < 5 ? __ham_func4 : __ham_func5;
+		if (dbp->h_hash(CHARKEY, sizeof(CHARKEY)) !=
+		    hcp->hdr->h_charkey) {
+			__db_err(dbp->dbenv, "hash: incompatible hash function");
 			ret = EINVAL;
 			goto out;
 		}
-		if (F_ISSET(hashp->hdr, DB_HASH_DUP))
+		if (F_ISSET(hcp->hdr, DB_HASH_DUP))
 			F_SET(dbp, DB_AM_DUP);
 	} else {
 		/*
@@ -163,59 +151,27 @@ __ham_open(dbp, dbinfo)
 		 */
 		file_existed = 0;
 		if (F_ISSET(dbp, DB_AM_LOCKING) &&
-		    ((ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0 ||
-		    (ret = lock_get(dbenv->lk_info, dbp->locker, 0,
-		        &dbp->lock_dbt, DB_LOCK_WRITE, &hashp->hlock)) != 0)) {
+		    ((ret = lock_put(dbenv->lk_info, hcp->hlock)) != 0 ||
+		    (ret = lock_get(dbenv->lk_info, dbc->locker, 0,
+		        &dbc->lock_dbt, DB_LOCK_WRITE, &hcp->hlock)) != 0)) {
 			if (ret < 0)
 				ret = EAGAIN;
 			goto out;
 		}
 
-		__ham_init_htab(hashp,
-		    dbinfo != NULL ? dbinfo->h_nelem : 0,
+		__ham_init_htab(dbc, dbinfo != NULL ? dbinfo->h_nelem : 0,
 		    dbinfo != NULL ? dbinfo->h_ffactor : 0);
 		if (F_ISSET(dbp, DB_AM_DUP))
-			F_SET(hashp->hdr, DB_HASH_DUP);
-		if ((ret = __ham_dirty_page(hashp, (PAGE *)hashp->hdr)) != 0)
+			F_SET(hcp->hdr, DB_HASH_DUP);
+		if ((ret = __ham_dirty_page(dbp, (PAGE *)hcp->hdr)) != 0)
 			goto out;
 	}
 
-	/* Initialize the default cursor. */
-	__ham_c_init(dbp, NULL, &curs);
-	TAILQ_INSERT_TAIL(&dbp->curs_queue, curs, links);
-
-	/* Allocate memory for our split buffer. */
-	if ((hashp->split_buf = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) {
-		ret = ENOMEM;
-		goto out;
-	}
-
-#ifdef NO_STATISTICS_FOR_DB_ERR
-	__db_err(dbp->dbenv,
-	    "%s%lx\n%s%ld\n%s%ld\n%s%ld\n%s%ld\n%s0x%lx\n%s0x%lx\n%s%ld\n%s%ld\n%s0x%lx",
-	    "TABLE POINTER   ", (long)hashp,
-	    "BUCKET SIZE     ", (long)hashp->hdr->pagesize,
-	    "FILL FACTOR     ", (long)hashp->hdr->ffactor,
-	    "MAX BUCKET      ", (long)hashp->hdr->max_bucket,
-	    "OVFL POINT      ", (long)hashp->hdr->ovfl_point,
-	    "LAST FREED      ", (long)hashp->hdr->last_freed,
-	    "HIGH MASK       ", (long)hashp->hdr->high_mask,
-	    "LOW  MASK       ", (long)hashp->hdr->low_mask,
-	    "NELEM           ", (long)hashp->hdr->nelem,
-	    "FLAGS           ", (long)hashp->hdr->flags);
-#endif
-
 	/* Release the meta data page */
-	(void)__ham_put_page(hashp->dbp, (PAGE *)hashp->hdr, 0);
-	if (F_ISSET(dbp, DB_AM_LOCKING) &&
-	    (ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0) {
-		if (ret < 0)
-			ret = EAGAIN;
+	RELEASE_META(dbp, hcp);
+	if ((ret  = dbc->c_close(dbc)) != 0)
 		goto out;
-	}
 
-	hashp->hlock = 0;
-	hashp->hdr = NULL;
 	/* Sync the file so that we know that the meta data goes to disk. */
 	if (!file_existed && (ret = dbp->sync(dbp, 0)) != 0)
 		goto out;
@@ -232,27 +188,8 @@ int
 __ham_close(dbp)
 	DB *dbp;
 {
-	HTAB *hashp;
-	int ret, t_ret;
-
-	DEBUG_LWRITE(dbp, NULL, "ham_close", NULL, NULL, 0);
-	hashp = (HTAB *)dbp->internal;
-	ret = 0;
-
-	/* Free the split page. */
-	if (hashp->split_buf)
-		FREE(hashp->split_buf, dbp->pgsize);
-
-	if (hashp->hdr && (t_ret = __ham_put_page(hashp->dbp,
-	    (PAGE *)hashp->hdr, 0)) != 0 && ret == 0)
-		ret = t_ret;
-	if (hashp->hlock && (t_ret = lock_put(hashp->dbp->dbenv->lk_info,
-	    hashp->hlock)) != 0 && ret == 0)
-		ret = t_ret;
-
-	FREE(hashp, sizeof(HTAB));
-	dbp->internal = NULL;
-	return (ret);
+	COMPQUIET(dbp, NULL);
+	return (0);
 }
 
 /************************** LOCAL CREATION ROUTINES **********************/
@@ -260,408 +197,204 @@ __ham_close(dbp)
  * Returns 0 on No Error
  */
 static void
-__ham_init_htab(hashp, nelem, ffactor)
-	HTAB *hashp;
+__ham_init_htab(dbc, nelem, ffactor)
+	DBC *dbc;
 	u_int32_t nelem, ffactor;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	int32_t l2, nbuckets;
 
-	memset(hashp->hdr, 0, sizeof(HASHHDR));
-	hashp->hdr->ffactor = ffactor;
-	hashp->hdr->pagesize = hashp->dbp->pgsize;
-	ZERO_LSN(hashp->hdr->lsn);
-	hashp->hdr->magic = DB_HASHMAGIC;
-	hashp->hdr->version = DB_HASHVERSION;
-	if (hashp->hash == NULL)
-		hashp->hash =
-		    hashp->hdr->version < 5 ? __ham_func4 : __ham_func5;
-	hashp->hdr->h_charkey = hashp->hash(CHARKEY, sizeof(CHARKEY));
-	if (nelem != 0 && hashp->hdr->ffactor != 0) {
-		nelem = (nelem - 1) / hashp->hdr->ffactor + 1;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	memset(hcp->hdr, 0, sizeof(HASHHDR));
+	hcp->hdr->ffactor = ffactor;
+	hcp->hdr->pagesize = dbp->pgsize;
+	ZERO_LSN(hcp->hdr->lsn);
+	hcp->hdr->magic = DB_HASHMAGIC;
+	hcp->hdr->version = DB_HASHVERSION;
+
+	if (dbp->h_hash == NULL)
+		dbp->h_hash = hcp->hdr->version < 5 ? __ham_func4 : __ham_func5;
+	hcp->hdr->h_charkey = dbp->h_hash(CHARKEY, sizeof(CHARKEY));
+	if (nelem != 0 && hcp->hdr->ffactor != 0) {
+		nelem = (nelem - 1) / hcp->hdr->ffactor + 1;
 		l2 = __db_log2(nelem > 2 ? nelem : 2);
 	} else
 		l2 = 2;
 
 	nbuckets = 1 << l2;
 
-	hashp->hdr->ovfl_point = l2;
-	hashp->hdr->last_freed = PGNO_INVALID;
+	hcp->hdr->ovfl_point = l2;
+	hcp->hdr->last_freed = PGNO_INVALID;
 
-	hashp->hdr->max_bucket = hashp->hdr->high_mask = nbuckets - 1;
-	hashp->hdr->low_mask = (nbuckets >> 1) - 1;
-	memcpy(hashp->hdr->uid, hashp->dbp->lock.fileid, DB_FILE_ID_LEN);
+	hcp->hdr->max_bucket = hcp->hdr->high_mask = nbuckets - 1;
+	hcp->hdr->low_mask = (nbuckets >> 1) - 1;
+	memcpy(hcp->hdr->uid, dbp->fileid, DB_FILE_ID_LEN);
 }
 
-/********************** DESTROY/CLOSE ROUTINES ************************/
-
-
-/*
- * Write modified pages to disk
- *
- * Returns:
- *	 0 == OK
- *	-1 ERROR
- */
 static int
-__ham_sync(dbp, flags)
-	DB *dbp;
-	u_int32_t flags;
-{
-	int ret;
-
-	DEBUG_LWRITE(dbp, NULL, "ham_sync", NULL, NULL, flags);
-	if ((ret = __db_syncchk(dbp, flags)) != 0)
-		return (ret);
-	if (F_ISSET(dbp, DB_AM_RDONLY))
-		return (0);
-
-	if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE)
-		ret = 0;
-
-	return (ret);
-}
-
-/*******************************SEARCH ROUTINES *****************************/
-/*
- * All the access routines return
- *
- * Returns:
- *	 0 on SUCCESS
- *	 1 to indicate an external ERROR (i.e. key not found, etc)
- *	-1 to indicate an internal ERROR (i.e. out of memory, etc)
- */
-
-static int
-__ham_get(dbp, txn, key, data, flags)
+__ham_delete(dbp, txn, key, flags)
 	DB *dbp;
 	DB_TXN *txn;
 	DBT *key;
-	DBT *data;
 	u_int32_t flags;
 {
-	DB *ldbp;
-	HTAB *hashp;
+	DBC *dbc;
 	HASH_CURSOR *hcp;
-	int ret, t_ret;
+	int ret, tret;
 
-	DEBUG_LREAD(dbp, txn, "ham_get", key, NULL, flags);
-	if ((ret = __db_getchk(dbp, key, data, flags)) != 0)
-		return (ret);
+	DB_PANIC_CHECK(dbp);
 
-	ldbp = dbp;
-	if (F_ISSET(dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0)
+	if ((ret =
+	    __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
 		return (ret);
 
-	hashp = (HTAB *)ldbp->internal;
-	SET_LOCKER(ldbp, txn);
-	GET_META(ldbp, hashp);
-
-	hashp->hash_accesses++;
-	hcp = (HASH_CURSOR *)TAILQ_FIRST(&ldbp->curs_queue)->internal;
-	if ((ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ)) == 0) {
-		if (F_ISSET(hcp, H_OK))
-			ret = __ham_dup_return(hashp, hcp, data, DB_FIRST);
-		else /* Key was not found */
-			ret = DB_NOTFOUND;
-	}
-
-	if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0)
-		ret = t_ret;
-	RELEASE_META(ldbp, hashp);
-	if (F_ISSET(dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
-	return (ret);
-}
-
-static int
-__ham_put(dbp, txn, key, data, flags)
-	DB *dbp;
-	DB_TXN *txn;
-	DBT *key;
-	DBT *data;
-	u_int32_t flags;
-{
-	DB *ldbp;
-	DBT tmp_val, *myval;
-	HASH_CURSOR *hcp;
-	HTAB *hashp;
-	u_int32_t nbytes;
-	int ret, t_ret;
-
-	DEBUG_LWRITE(dbp, txn, "ham_put", key, data, flags);
-	if ((ret = __db_putchk(dbp, key, data,
-	    flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0)
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
 		return (ret);
 
-	ldbp = dbp;
-	if (F_ISSET(dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0)
-		return (ret);
+	DEBUG_LWRITE(dbc, txn, "ham_delete", key, NULL, flags);
 
-	hashp = (HTAB *)ldbp->internal;
-	SET_LOCKER(ldbp, txn);
-	GET_META(ldbp, hashp);
-	hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal;
-
-	nbytes = (ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE :
-	    HKEYDATA_PSIZE(key->size)) +
-	    (ISBIG(hashp, data->size) ? HOFFPAGE_PSIZE :
-	    HKEYDATA_PSIZE(data->size));
-
-	hashp->hash_accesses++;
-	ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE);
-
-	if (ret == DB_NOTFOUND) {
-		ret = 0;
-		if (hcp->seek_found_page != PGNO_INVALID &&
-		    hcp->seek_found_page != hcp->pgno) {
-			if ((ret = __ham_item_done(hashp, hcp, 0)) != 0)
-				goto out;
-			hcp->pgno = hcp->seek_found_page;
-			hcp->bndx = NDX_INVALID;
-		}
+	hcp = (HASH_CURSOR *)dbc->internal;
+	GET_META(dbp, hcp, ret);
+	if (ret != 0)
+		goto out;
 
-		if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
-			/*
-			 * Doing a partial put, but the key does not exist
-			 * and we are not beginning the write at 0.  We
-			 * must create a data item padded up to doff and
-			 * then write the new bytes represented by val.
-			 */
-			ret = __ham_init_dbt(&tmp_val, data->size + data->doff,
-			    &hcp->big_data, &hcp->big_datalen);
-			if (ret == 0) {
-				memset(tmp_val.data, 0, data->doff);
-				memcpy((u_int8_t *)tmp_val.data + data->doff,
-				    data->data, data->size);
-				myval = &tmp_val;
-			}
-		} else
-			myval = (DBT *)data;
-
-		if (ret == 0)
-			ret = __ham_add_el(hashp, hcp, key, myval, H_KEYDATA);
-	} else if (ret == 0 && F_ISSET(hcp, H_OK)) {
-		if (flags == DB_NOOVERWRITE)
-			ret = DB_KEYEXIST;
-		else if (F_ISSET(ldbp, DB_AM_DUP))
-			ret = __ham_add_dup(hashp, hcp, data, DB_KEYLAST);
+	hcp->stats.hash_deleted++;
+	if ((ret = __ham_lookup(dbc, key, 0, DB_LOCK_WRITE)) == 0) {
+		if (F_ISSET(hcp, H_OK))
+			ret = __ham_del_pair(dbc, 1);
 		else
-			ret = __ham_overwrite(hashp, hcp, data);
-	}
-
-	/* Free up all the cursor pages. */
-	if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0)
-		ret = t_ret;
-	/* Now check if we have to grow. */
-out:	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
-		ret = __ham_expand_table(hashp);
-		F_CLR(hcp, H_EXPAND);
+			ret = DB_NOTFOUND;
 	}
 
-	if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0)
-		ret = t_ret;
-	RELEASE_META(ldbp, hashp);
-	if (F_ISSET(dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
+	RELEASE_META(dbp, hcp);
+out:	if ((tret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = tret;
 	return (ret);
 }
 
-static int
-__ham_cursor(dbp, txnid, dbcp)
-	DB *dbp;
-	DB_TXN *txnid;
-	DBC **dbcp;
-{
+/* ****************** CURSORS ********************************** */
+/*
+ * __ham_c_init --
+ *	Initialize the hash-specific portion of a cursor.
+ *
+ * PUBLIC: int __ham_c_init __P((DBC *));
+ */
+int
+__ham_c_init(dbc)
+	DBC *dbc;
+  {
+	HASH_CURSOR *new_curs;
 	int ret;
 
-	DEBUG_LWRITE(dbp, txnid, "ham_cursor", NULL, NULL, 0);
-	if ((ret = __ham_c_init(dbp, txnid, dbcp)) != 0)
+	if ((ret = __os_calloc(1, sizeof(struct cursor_t), &new_curs)) != 0)
+		return (ret);
+	if ((ret =
+	    __os_malloc(dbc->dbp->pgsize, NULL, &new_curs->split_buf)) != 0) {
+		__os_free(new_curs, sizeof(*new_curs));
 		return (ret);
-
-	DB_THREAD_LOCK(dbp);
-	TAILQ_INSERT_TAIL(&dbp->curs_queue, *dbcp, links);
-	DB_THREAD_UNLOCK(dbp);
-	return (ret);
-}
-
-static int
-__ham_c_init(dbp, txnid, dbcp)
-	DB *dbp;
-	DB_TXN *txnid;
-	DBC **dbcp;
-{
-	DBC *db_curs;
-	HASH_CURSOR *new_curs;
-
-	if ((db_curs = (DBC *)__db_calloc(sizeof(DBC), 1)) == NULL)
-		return (ENOMEM);
-
-	if ((new_curs =
-	    (HASH_CURSOR *)__db_calloc(sizeof(struct cursor_t), 1)) == NULL) {
-		FREE(db_curs, sizeof(DBC));
-		return (ENOMEM);
 	}
 
-	db_curs->internal = new_curs;
-	db_curs->c_close = __ham_c_close;
-	db_curs->c_del = __ham_c_del;
-	db_curs->c_get = __ham_c_get;
-	db_curs->c_put = __ham_c_put;
-	db_curs->txn = txnid;
-	db_curs->dbp = dbp;
+	new_curs->dbc = dbc;
+
+	dbc->internal = new_curs;
+	dbc->c_am_close = __ham_c_close;
+	dbc->c_am_destroy = __ham_c_destroy;
+	dbc->c_del = __ham_c_del;
+	dbc->c_get = __ham_c_get;
+	dbc->c_put = __ham_c_put;
 
-	new_curs->db_cursor = db_curs;
 	__ham_item_init(new_curs);
 
-	if (dbcp != NULL)
-		*dbcp = db_curs;
 	return (0);
 }
 
+/*
+ * __ham_c_close --
+ *	Close down the cursor from a single use.
+ */
 static int
-__ham_delete(dbp, txn, key, flags)
-	DB *dbp;
-	DB_TXN *txn;
-	DBT *key;
-	u_int32_t flags;
-{
-	DB *ldbp;
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
-	int ret, t_ret;
-
-	DEBUG_LWRITE(dbp, txn, "ham_delete", key, NULL, flags);
-	if ((ret =
-	    __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
-		return (ret);
-
-	ldbp = dbp;
-	if (F_ISSET(dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0)
-		return (ret);
-	hashp = (HTAB *)ldbp->internal;
-	SET_LOCKER(ldbp, txn);
-	GET_META(ldbp, hashp);
-	hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal;
-
-	hashp->hash_accesses++;
-	if ((ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_WRITE)) == 0) {
-		if (F_ISSET(hcp, H_OK))
-			ret = __ham_del_pair(hashp, hcp, 1);
-		else
-			ret = DB_NOTFOUND;
-	}
-
-	if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0)
-		ret = t_ret;
-	RELEASE_META(ldbp, hashp);
-	if (F_ISSET(dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
-	return (ret);
-}
-
-/* ****************** CURSORS ********************************** */
-static int
-__ham_c_close(cursor)
-	DBC *cursor;
+__ham_c_close(dbc)
+	DBC *dbc;
 {
-	DB  *ldbp;
 	int ret;
 
-	DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_close", NULL, NULL, 0);
-	/*
-	 * If the pagep, dpagep, and lock fields of the cursor are all NULL,
-	 * then there really isn't a need to get a handle here.  However,
-	 * the normal case is that at least one of those fields is non-NULL,
-	 * and putting those checks in here would couple the ham_item_done
-	 * functionality with cursor close which would be pretty disgusting.
-	 * Instead, we pay the overhead here of always getting the handle.
-	 */
-	ldbp = cursor->dbp;
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0)
+	if ((ret = __ham_item_done(dbc, 0)) != 0)
 		return (ret);
 
-	ret = __ham_c_iclose(ldbp, cursor);
-
-	if (F_ISSET(ldbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
-	return (ret);
+	__ham_item_init((HASH_CURSOR *)dbc->internal);
+	return (0);
 }
+
 /*
- * __ham_c_iclose --
- *
- * Internal cursor close routine; assumes it is being passed the correct
- * handle, rather than getting and putting a handle.
- *
- * PUBLIC: int __ham_c_iclose __P((DB *, DBC *));
+ * __ham_c_destroy --
+ *	Cleanup the access method private part of a cursor.
  */
-int
-__ham_c_iclose(dbp, dbc)
-	DB *dbp;
+static int
+__ham_c_destroy(dbc)
 	DBC *dbc;
 {
 	HASH_CURSOR *hcp;
-	HTAB *hashp;
-	int ret;
 
-	hashp = (HTAB *)dbp->internal;
 	hcp = (HASH_CURSOR *)dbc->internal;
-	ret = __ham_item_done(hashp, hcp, 0);
-
-	if (hcp->big_key)
-		FREE(hcp->big_key, hcp->big_keylen);
-	if (hcp->big_data)
-		FREE(hcp->big_data, hcp->big_datalen);
+	if (hcp->split_buf != NULL)
+		__os_free(hcp->split_buf, dbc->dbp->pgsize);
+	__os_free(hcp, sizeof(HASH_CURSOR));
 
-	/*
-	 * All cursors (except the default ones) are linked off the master.
-	 * Therefore, when we close the cursor, we have to remove it from
-	 * the master, not the local one.
-	 * XXX I am always removing from the master; what about local cursors?
-	 */
-	DB_THREAD_LOCK(dbc->dbp);
-	TAILQ_REMOVE(&dbc->dbp->curs_queue, dbc, links);
-	DB_THREAD_UNLOCK(dbc->dbp);
-
-	FREE(hcp, sizeof(HASH_CURSOR));
-	FREE(dbc, sizeof(DBC));
-
-	return (ret);
+	return (0);
 }
 
 static int
-__ham_c_del(cursor, flags)
-	DBC *cursor;
+__ham_c_del(dbc, flags)
+	DBC *dbc;
 	u_int32_t flags;
 {
-	DB *ldbp;
+	DB *dbp;
+	DBT repldbt;
 	HASH_CURSOR *hcp;
 	HASH_CURSOR save_curs;
-	HTAB *hashp;
 	db_pgno_t ppgno, chg_pgno;
 	int ret, t_ret;
 
-	DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_del", NULL, NULL, flags);
-	ldbp = cursor->dbp;
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0)
-		return (ret);
-	hashp = (HTAB *)ldbp->internal;
-	hcp = (HASH_CURSOR *)cursor->internal;
-	save_curs = *hcp;
-	if ((ret = __db_cdelchk(ldbp, flags,
-	    F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
+	DEBUG_LWRITE(dbc, dbc->txn, "ham_c_del", NULL, NULL, flags);
+	dbp = dbc->dbp;
+	DB_PANIC_CHECK(dbp);
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	if ((ret = __db_cdelchk(dbc->dbp, flags,
+	    F_ISSET(dbc->dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
 		return (ret);
+
 	if (F_ISSET(hcp, H_DELETED))
 		return (DB_NOTFOUND);
 
-	SET_LOCKER(hashp->dbp, cursor->txn);
-	GET_META(hashp->dbp, hashp);
-	hashp->hash_accesses++;
-	if ((ret = __ham_get_cpage(hashp, hcp, DB_LOCK_WRITE)) != 0)
+	/*
+	 * If we are in the concurrent DB product and this cursor
+	 * is not a write cursor, then this request is invalid.
+	 * If it is a simple write cursor, then we need to upgrade its
+	 * lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		/* Make sure it's a valid update cursor. */
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+		if (F_ISSET(dbc, DBC_RMW) &&
+		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
+		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+		    &dbc->mylock)) != 0)
+			return (EAGAIN);
+	}
+
+	GET_META(dbp, hcp, ret);
+	if (ret != 0)
+		return (ret);
+
+	SAVE_CURSOR(hcp, &save_curs);
+	hcp->stats.hash_deleted++;
+
+	if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
 		goto out;
 	if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID) {
 		/*
@@ -695,20 +428,20 @@ __ham_c_del(cursor, flags)
 
 		/* Remove item from duplicate page. */
 		chg_pgno = hcp->dpgno;
-		if ((ret = __db_drem(hashp->dbp,
+		if ((ret = __db_drem(dbc,
 		    &hcp->dpagep, hcp->dndx, __ham_del_page)) != 0)
 			goto out;
 
 		if (hcp->dpagep == NULL) {
 			if (ppgno != PGNO_INVALID) {		/* Case 3 */
 				hcp->dpgno = ppgno;
-				if ((ret = __ham_get_cpage(hashp, hcp,
+				if ((ret = __ham_get_cpage(dbc,
 				    DB_LOCK_READ)) != 0)
 					goto out;
 				hcp->dndx = NUM_ENT(hcp->dpagep);
 				F_SET(hcp, H_DELETED);
 			} else {				/* Case 4 */
-				ret = __ham_del_pair(hashp, hcp, 1);
+				ret = __ham_del_pair(dbc, 1);
 				hcp->dpgno = PGNO_INVALID;
 				/*
 				 * Delpair updated the cursor queue, so we
@@ -723,6 +456,15 @@ __ham_c_del(cursor, flags)
 				memcpy(HOFFDUP_PGNO(P_ENTRY(hcp->pagep,
 				    H_DATAINDEX(hcp->bndx))),
 				    &hcp->dpgno, sizeof(db_pgno_t));
+			/*
+			 * We need to put the master page here, because
+			 * although we have a duplicate page, the master
+			 * page is dirty, and ham_item_done assumes that
+			 * if you have a duplicate page, it's the only one
+			 * that can be dirty.
+			 */
+			ret = __ham_put_page(dbp, hcp->pagep, 1);
+			hcp->pagep = NULL;
 			F_SET(hcp, H_DELETED);
 		} else						/* Case 1 */
 			F_SET(hcp, H_DELETED);
@@ -730,17 +472,17 @@ __ham_c_del(cursor, flags)
 			__ham_c_update(hcp, chg_pgno, 0, 0, 1);
 	} else if (F_ISSET(hcp, H_ISDUP)) {			/* on page */
 		if (hcp->dup_off == 0 && DUP_SIZE(hcp->dup_len) ==
-		    LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx))
-			ret = __ham_del_pair(hashp, hcp, 1);
+		    LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx))
+			ret = __ham_del_pair(dbc, 1);
 		else {
-			DBT repldbt;
-
 			repldbt.flags = 0;
 			F_SET(&repldbt, DB_DBT_PARTIAL);
 			repldbt.doff = hcp->dup_off;
 			repldbt.dlen = DUP_SIZE(hcp->dup_len);
 			repldbt.size = 0;
-			ret = __ham_replpair(hashp, hcp, &repldbt, 0);
+			repldbt.data =
+			    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
+			ret = __ham_replpair(dbc, &repldbt, 0);
 			hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
 			F_SET(hcp, H_DELETED);
 			__ham_c_update(hcp, hcp->pgno,
@@ -749,48 +491,53 @@ __ham_c_del(cursor, flags)
 
 	} else
 		/* Not a duplicate */
-normal:		ret = __ham_del_pair(hashp, hcp, 1);
+normal:		ret = __ham_del_pair(dbc, 1);
 
-out:	if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0)
+out:	if ((t_ret = __ham_item_done(dbc, ret == 0)) != 0 && ret == 0)
 		ret = t_ret;
-	if (ret != 0)
-		*hcp = save_curs;
-	RELEASE_META(hashp->dbp, hashp);
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
+	RELEASE_META(dbp, hcp);
+	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
 	return (ret);
 }
 
 static int
-__ham_c_get(cursor, key, data, flags)
-	DBC *cursor;
+__ham_c_get(dbc, key, data, flags)
+	DBC *dbc;
 	DBT *key;
 	DBT *data;
 	u_int32_t flags;
 {
-	DB *ldbp;
-	HTAB *hashp;
+	DB *dbp;
 	HASH_CURSOR *hcp, save_curs;
+	db_lockmode_t lock_type;
 	int get_key, ret, t_ret;
 
-	DEBUG_LREAD(cursor->dbp, cursor->txn, "ham_c_get",
+	DEBUG_LREAD(dbc, dbc->txn, "ham_c_get",
 	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL,
 	    NULL, flags);
-	ldbp = cursor->dbp;
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0)
-		return (ret);
-	hashp = (HTAB *)(ldbp->internal);
-	hcp = (HASH_CURSOR *)cursor->internal;
-	save_curs = *hcp;
+
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	DB_PANIC_CHECK(dbp);
+	SAVE_CURSOR(hcp, &save_curs);
 	if ((ret =
-	    __db_cgetchk(hashp->dbp, key, data, flags, IS_VALID(hcp))) != 0)
+	    __db_cgetchk(dbp, key, data, flags, IS_VALID(hcp))) != 0)
 		return (ret);
 
-	SET_LOCKER(hashp->dbp, cursor->txn);
-	GET_META(hashp->dbp, hashp);
-	hashp->hash_accesses++;
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	if (LF_ISSET(DB_RMW)) {
+		lock_type = DB_LOCK_WRITE;
+		LF_CLR(DB_RMW);
+	} else
+		lock_type = DB_LOCK_READ;
 
+	GET_META(dbp, hcp, ret);
+	if (ret != 0)
+		return (ret);
+	hcp->stats.hash_get++;
 	hcp->seek_size = 0;
 
 	ret = 0;
@@ -798,24 +545,39 @@ __ham_c_get(cursor, key, data, flags)
 	switch (flags) {
 	case DB_PREV:
 		if (hcp->bucket != BUCKET_INVALID) {
-			ret = __ham_item_prev(hashp, hcp, DB_LOCK_READ);
+			ret = __ham_item_prev(dbc, lock_type);
 			break;
 		}
 		/* FALLTHROUGH */
 	case DB_LAST:
-		ret = __ham_item_last(hashp, hcp, DB_LOCK_READ);
+		ret = __ham_item_last(dbc, lock_type);
 		break;
 	case DB_FIRST:
-		ret = __ham_item_first(hashp, hcp, DB_LOCK_READ);
+		ret = __ham_item_first(dbc, lock_type);
+		break;
+	case DB_NEXT_DUP:
+		if (hcp->bucket == BUCKET_INVALID)
+			ret = EINVAL;
+		else {
+			F_SET(hcp, H_DUPONLY);
+			ret = __ham_item_next(dbc, lock_type);
+		}
 		break;
 	case DB_NEXT:
 		if (hcp->bucket == BUCKET_INVALID)
 			hcp->bucket = 0;
-		ret = __ham_item_next(hashp, hcp, DB_LOCK_READ);
+		ret = __ham_item_next(dbc, lock_type);
 		break;
 	case DB_SET:
 	case DB_SET_RANGE:
-		ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ);
+	case DB_GET_BOTH:
+		if (F_ISSET(dbc, DBC_CONTINUE)) {
+			F_SET(hcp, H_DUPONLY);
+			ret = __ham_item_next(dbc, lock_type);
+		} else if (F_ISSET(dbc, DBC_KEYSET))
+			ret = __ham_item(dbc, lock_type);
+		else
+			ret = __ham_lookup(dbc, key, 0, lock_type);
 		get_key = 0;
 		break;
 	case DB_CURRENT:
@@ -824,7 +586,7 @@ __ham_c_get(cursor, key, data, flags)
 			goto out;
 		}
 
-		ret = __ham_item(hashp, hcp, DB_LOCK_READ);
+		ret = __ham_item(dbc, lock_type);
 		break;
 	}
 
@@ -837,12 +599,12 @@ __ham_c_get(cursor, key, data, flags)
 			goto out1;
 		else if (F_ISSET(hcp, H_OK)) {
 			/* Get the key. */
-			if (get_key && (ret = __db_ret(hashp->dbp, hcp->pagep,
-			    H_KEYINDEX(hcp->bndx), key, &hcp->big_key,
-			    &hcp->big_keylen)) != 0)
+			if (get_key && (ret = __db_ret(dbp, hcp->pagep,
+			    H_KEYINDEX(hcp->bndx), key, &dbc->rkey.data,
+			    &dbc->rkey.size)) != 0)
 				goto out1;
 
-			ret = __ham_dup_return(hashp, hcp, data, flags);
+			ret = __ham_dup_return(dbc, data, flags);
 			break;
 		} else if (!F_ISSET(hcp, H_NOMORE)) {
 			abort();
@@ -855,7 +617,7 @@ __ham_c_get(cursor, key, data, flags)
 		switch (flags) {
 			case DB_LAST:
 			case DB_PREV:
-				ret = __ham_item_done(hashp, hcp, 0);
+				ret = __ham_item_done(dbc, 0);
 				if (hcp->bucket == 0) {
 					ret = DB_NOTFOUND;
 					goto out1;
@@ -863,24 +625,24 @@ __ham_c_get(cursor, key, data, flags)
 				hcp->bucket--;
 				hcp->bndx = NDX_INVALID;
 				if (ret == 0)
-					ret = __ham_item_prev(hashp,
-					    hcp, DB_LOCK_READ);
+					ret = __ham_item_prev(dbc, lock_type);
 				break;
 			case DB_FIRST:
 			case DB_NEXT:
-				ret = __ham_item_done(hashp, hcp, 0);
+				ret = __ham_item_done(dbc, 0);
 				hcp->bndx = NDX_INVALID;
 				hcp->bucket++;
 				hcp->pgno = PGNO_INVALID;
 				hcp->pagep = NULL;
-				if (hcp->bucket > hashp->hdr->max_bucket) {
+				if (hcp->bucket > hcp->hdr->max_bucket) {
 					ret = DB_NOTFOUND;
 					goto out1;
 				}
 				if (ret == 0)
-					ret = __ham_item_next(hashp,
-					    hcp, DB_LOCK_READ);
+					ret = __ham_item_next(dbc, lock_type);
 				break;
+			case DB_GET_BOTH:
+			case DB_NEXT_DUP:
 			case DB_SET:
 			case DB_SET_RANGE:
 				/* Key not found. */
@@ -888,85 +650,137 @@ __ham_c_get(cursor, key, data, flags)
 				goto out1;
 		}
 	}
-out1:	if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0)
+out1:	if ((t_ret = __ham_item_done(dbc, 0)) != 0 && ret == 0)
 		ret = t_ret;
-out:	if (ret)
-		*hcp = save_curs;
-	RELEASE_META(hashp->dbp, hashp);
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
+out:	RELEASE_META(dbp, hcp);
+	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);
 	return (ret);
 }
 
 static int
-__ham_c_put(cursor, key, data, flags)
-	DBC *cursor;
+__ham_c_put(dbc, key, data, flags)
+	DBC *dbc;
 	DBT *key;
 	DBT *data;
 	u_int32_t flags;
 {
-	DB *ldbp;
+	DB *dbp;
+	DBT tmp_val, *myval;
 	HASH_CURSOR *hcp, save_curs;
-	HTAB *hashp;
 	u_int32_t nbytes;
 	int ret, t_ret;
 
-	DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_put",
+	dbp = dbc->dbp;
+	DB_PANIC_CHECK(dbp);
+	DEBUG_LWRITE(dbc, dbc->txn, "ham_c_put",
 	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
 	    data, flags);
-	ldbp = cursor->dbp;
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD) &&
-	    (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0)
-		return (ret);
-	hashp = (HTAB *)(ldbp->internal);
-	hcp = (HASH_CURSOR *)cursor->internal;
-	save_curs = *hcp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	if ((ret = __db_cputchk(hashp->dbp, key, data, flags,
-	    F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
+	if ((ret = __db_cputchk(dbp, key, data, flags,
+	    F_ISSET(dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
 		return (ret);
-	if (F_ISSET(hcp, H_DELETED))
+
+	if (F_ISSET(hcp, H_DELETED) &&
+	    flags != DB_KEYFIRST && flags != DB_KEYLAST)
 		return (DB_NOTFOUND);
 
-	SET_LOCKER(hashp->dbp, cursor->txn);
-	GET_META(hashp->dbp, hashp);
-	ret = 0;
+	/*
+	 * If we are in the concurrent DB product and this cursor
+	 * is not a write cursor, then this request is invalid.
+	 * If it is a simple write cursor, then we need to upgrade its
+	 * lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		/* Make sure it's a valid update cursor. */
+		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
+			return (EINVAL);
+
+		if (F_ISSET(dbc, DBC_RMW) &&
+		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
+		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
+		    &dbc->mylock)) != 0)
+			return (EAGAIN);
+	}
+
+	GET_META(dbp, hcp, ret);
+	if (ret != 0)
+		return (ret);
+
+	SAVE_CURSOR(hcp, &save_curs);
+	hcp->stats.hash_put++;
 
 	switch (flags) {
 	case DB_KEYLAST:
 	case DB_KEYFIRST:
-		nbytes = (ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE :
+		nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
 		    HKEYDATA_PSIZE(key->size)) +
-		    (ISBIG(hashp, data->size) ? HOFFPAGE_PSIZE :
+		    (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
 		    HKEYDATA_PSIZE(data->size));
-		ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE);
+		if ((ret = __ham_lookup(dbc,
+		    key, nbytes, DB_LOCK_WRITE)) == DB_NOTFOUND) {
+			ret = 0;
+			if (hcp->seek_found_page != PGNO_INVALID &&
+			    hcp->seek_found_page != hcp->pgno) {
+				if ((ret = __ham_item_done(dbc, 0)) != 0)
+					goto out;
+				hcp->pgno = hcp->seek_found_page;
+				hcp->bndx = NDX_INVALID;
+			}
+
+			if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
+				/*
+				 * A partial put, but the key does not exist
+				 * and we are not beginning the write at 0.
+				 * We must create a data item padded up to doff
+				 * and then write the new bytes represented by
+				 * val.
+				 */
+				if ((ret = __ham_init_dbt(&tmp_val,
+				    data->size + data->doff,
+				    &dbc->rdata.data, &dbc->rdata.size)) == 0) {
+					memset(tmp_val.data, 0, data->doff);
+					memcpy((u_int8_t *)tmp_val.data +
+					    data->doff, data->data, data->size);
+					myval = &tmp_val;
+				}
+			} else
+				myval = (DBT *)data;
+
+			if (ret == 0)
+				ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
+			goto done;
+		}
 		break;
 	case DB_BEFORE:
 	case DB_AFTER:
 	case DB_CURRENT:
-		ret = __ham_item(hashp, hcp, DB_LOCK_WRITE);
+		ret = __ham_item(dbc, DB_LOCK_WRITE);
 		break;
 	}
 
 	if (ret == 0) {
-		if (flags == DB_CURRENT && !F_ISSET(ldbp, DB_AM_DUP))
-			ret = __ham_overwrite(hashp, hcp, data);
+		if ((flags == DB_CURRENT && !F_ISSET(hcp, H_ISDUP)) ||
+		    ((flags == DB_KEYFIRST || flags == DB_KEYLAST) &&
+		    !F_ISSET(dbp, DB_AM_DUP)))
+			ret = __ham_overwrite(dbc, data);
 		else
-			ret = __ham_add_dup(hashp, hcp, data, flags);
+			ret = __ham_add_dup(dbc, data, flags);
 	}
 
-	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
-		ret = __ham_expand_table(hashp);
+done:	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
+		ret = __ham_expand_table(dbc);
 		F_CLR(hcp, H_EXPAND);
 	}
 
-	if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0)
+	if ((t_ret = __ham_item_done(dbc, ret == 0)) != 0 && ret == 0)
 		ret = t_ret;
-	if (ret != 0)
-		*hcp = save_curs;
-	RELEASE_META(hashp->dbp, hashp);
-	if (F_ISSET(cursor->dbp, DB_AM_THREAD))
-		__db_puthandle(ldbp);
+
+out:	RELEASE_META(dbp, hcp);
+	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);
+	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
+		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
+		    DB_LOCK_IWRITE, 0);
 	return (ret);
 }
 
@@ -974,19 +788,21 @@ __ham_c_put(cursor, key, data, flags)
 
 /*
  * __ham_expand_table --
- *
- * PUBLIC: int __ham_expand_table __P((HTAB *));
  */
-int
-__ham_expand_table(hashp)
-	HTAB *hashp;
+static int
+__ham_expand_table(dbc)
+	DBC *dbc;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DB_LSN new_lsn;
 	u_int32_t old_bucket, new_bucket, spare_ndx;
 	int ret;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	ret = 0;
-	DIRTY_META(hashp, ret);
+	DIRTY_META(dbp, hcp, ret);
 	if (ret)
 		return (ret);
 
@@ -999,78 +815,78 @@ __ham_expand_table(hashp)
 	 * see what the log of one greater than that is; here we have to
 	 * look at the log of max + 2.  VERY NASTY STUFF.
 	 */
-	if (__db_log2(hashp->hdr->max_bucket + 2) > hashp->hdr->ovfl_point) {
+	if (__db_log2(hcp->hdr->max_bucket + 2) > hcp->hdr->ovfl_point) {
 		/*
 		 * We are about to shift the split point.  Make sure that
 		 * if the next doubling is going to be big (more than 8
 		 * pages), we have some extra pages around.
 		 */
-		if (hashp->hdr->max_bucket + 1 >= 8 &&
-		    hashp->hdr->spares[hashp->hdr->ovfl_point] <
-		    hashp->hdr->spares[hashp->hdr->ovfl_point - 1] +
-		    hashp->hdr->ovfl_point + 1)
-			__ham_init_ovflpages(hashp);
+		if (hcp->hdr->max_bucket + 1 >= 8 &&
+		    hcp->hdr->spares[hcp->hdr->ovfl_point] <
+		    hcp->hdr->spares[hcp->hdr->ovfl_point - 1] +
+		    hcp->hdr->ovfl_point + 1)
+			__ham_init_ovflpages(dbc);
 	}
 
 	/* Now we can log the meta-data split. */
-	if (DB_LOGGING(hashp->dbp)) {
-		if ((ret = __ham_splitmeta_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-		    hashp->dbp->log_fileid,
-		    hashp->hdr->max_bucket, hashp->hdr->ovfl_point,
-		    hashp->hdr->spares[hashp->hdr->ovfl_point],
-		    &hashp->hdr->lsn)) != 0)
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __ham_splitmeta_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, dbp->log_fileid,
+		    hcp->hdr->max_bucket, hcp->hdr->ovfl_point,
+		    hcp->hdr->spares[hcp->hdr->ovfl_point],
+		    &hcp->hdr->lsn)) != 0)
 			return (ret);
 
-		hashp->hdr->lsn = new_lsn;
+		hcp->hdr->lsn = new_lsn;
 	}
 
-	hashp->hash_expansions++;
-	new_bucket = ++hashp->hdr->max_bucket;
-	old_bucket = (hashp->hdr->max_bucket & hashp->hdr->low_mask);
+	hcp->stats.hash_expansions++;
+	new_bucket = ++hcp->hdr->max_bucket;
+	old_bucket = (hcp->hdr->max_bucket & hcp->hdr->low_mask);
 
 	/*
 	 * If the split point is increasing, copy the current contents
 	 * of the spare split bucket to the next bucket.
 	 */
-	spare_ndx = __db_log2(hashp->hdr->max_bucket + 1);
-	if (spare_ndx > hashp->hdr->ovfl_point) {
-		hashp->hdr->spares[spare_ndx] =
-		    hashp->hdr->spares[hashp->hdr->ovfl_point];
-		hashp->hdr->ovfl_point = spare_ndx;
+	spare_ndx = __db_log2(hcp->hdr->max_bucket + 1);
+	if (spare_ndx > hcp->hdr->ovfl_point) {
+		hcp->hdr->spares[spare_ndx] =
+		    hcp->hdr->spares[hcp->hdr->ovfl_point];
+		hcp->hdr->ovfl_point = spare_ndx;
 	}
 
-	if (new_bucket > hashp->hdr->high_mask) {
+	if (new_bucket > hcp->hdr->high_mask) {
 		/* Starting a new doubling */
-		hashp->hdr->low_mask = hashp->hdr->high_mask;
-		hashp->hdr->high_mask = new_bucket | hashp->hdr->low_mask;
+		hcp->hdr->low_mask = hcp->hdr->high_mask;
+		hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
 	}
 
-	if (BUCKET_TO_PAGE(hashp, new_bucket) > MAX_PAGES(hashp)) {
-		__db_err(hashp->dbp->dbenv,
+	if (BUCKET_TO_PAGE(hcp, new_bucket) > MAX_PAGES(hcp)) {
+		__db_err(dbp->dbenv,
 		    "hash: Cannot allocate new bucket.  Pages exhausted.");
 		return (ENOSPC);
 	}
 
 	/* Relocate records to the new bucket */
-	return (__ham_split_page(hashp, old_bucket, new_bucket));
+	return (__ham_split_page(dbc, old_bucket, new_bucket));
 }
 
 /*
- * PUBLIC: u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t));
+ * PUBLIC: u_int32_t __ham_call_hash __P((HASH_CURSOR *, u_int8_t *, int32_t));
  */
 u_int32_t
-__ham_call_hash(hashp, k, len)
-	HTAB *hashp;
+__ham_call_hash(hcp, k, len)
+	HASH_CURSOR *hcp;
 	u_int8_t *k;
 	int32_t len;
 {
 	u_int32_t n, bucket;
 
-	n = (u_int32_t)hashp->hash(k, len);
-	bucket = n & hashp->hdr->high_mask;
-	if (bucket > hashp->hdr->max_bucket)
-		bucket = bucket & hashp->hdr->low_mask;
+	n = (u_int32_t)(hcp->dbc->dbp->h_hash(k, len));
+
+	bucket = n & hcp->hdr->high_mask;
+	if (bucket > hcp->hdr->max_bucket)
+		bucket = bucket & hcp->hdr->low_mask;
 	return (bucket);
 }
 
@@ -1079,31 +895,36 @@ __ham_call_hash(hashp, k, len)
  * everything held by the cursor.
  */
 static int
-__ham_dup_return(hashp, hcp, val, flags)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_dup_return(dbc, val, flags)
+	DBC *dbc;
 	DBT *val;
 	u_int32_t flags;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	PAGE *pp;
 	DBT *myval, tmp_val;
 	db_indx_t ndx;
 	db_pgno_t pgno;
+	u_int32_t off, tlen;
 	u_int8_t *hk, type;
-	int ret;
+	int cmp, ret;
 	db_indx_t len;
 
 	/* Check for duplicate and return the first one. */
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	ndx = H_DATAINDEX(hcp->bndx);
 	type = HPAGE_TYPE(hcp->pagep, ndx);
 	pp = hcp->pagep;
 	myval = val;
 
 	/*
-	 * There are 3 cases:
+	 * There are 4 cases:
 	 * 1. We are not in duplicate, simply call db_ret.
 	 * 2. We are looking at keys and stumbled onto a duplicate.
 	 * 3. We are in the middle of a duplicate set. (ISDUP set)
+	 * 4. This is a duplicate and we need to return a specific item.
 	 */
 
 	/*
@@ -1115,7 +936,7 @@ __ham_dup_return(hashp, hcp, val, flags)
 		if (type == H_DUPLICATE) {
 			F_SET(hcp, H_ISDUP);
 			hcp->dup_tlen = LEN_HDATA(hcp->pagep,
-			    hashp->hdr->pagesize, hcp->bndx);
+			    hcp->hdr->pagesize, hcp->bndx);
 			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
 			if (flags == DB_LAST || flags == DB_PREV) {
 				hcp->dndx = 0;
@@ -1141,18 +962,63 @@ __ham_dup_return(hashp, hcp, val, flags)
 			memcpy(&pgno, HOFFDUP_PGNO(P_ENTRY(hcp->pagep, ndx)),
 			    sizeof(db_pgno_t));
 			if (flags == DB_LAST || flags == DB_PREV) {
-				if ((ret = __db_dend(hashp->dbp,
+				if ((ret = __db_dend(dbc,
 				    pgno, &hcp->dpagep)) != 0)
 					return (ret);
 				hcp->dpgno = PGNO(hcp->dpagep);
 				hcp->dndx = NUM_ENT(hcp->dpagep) - 1;
-			} else if ((ret = __ham_next_cpage(hashp,
-			    hcp, pgno, 0, H_ISDUP)) != 0)
+			} else if ((ret = __ham_next_cpage(dbc,
+			    pgno, 0, H_ISDUP)) != 0)
 				return (ret);
 		}
 	}
 
 	/*
+	 * If we are retrieving a specific key/data pair, then we
+	 * may need to adjust the cursor before returning data.
+	 */
+	if (flags == DB_GET_BOTH) {
+		if (F_ISSET(hcp, H_ISDUP)) {
+			if (hcp->dpgno != PGNO_INVALID) {
+				if ((ret = __db_dsearch(dbc, 0, val,
+				    hcp->dpgno, &hcp->dndx, &hcp->dpagep, &cmp))
+				    != 0)
+					return (ret);
+				if (cmp == 0)
+					hcp->dpgno = PGNO(hcp->dpagep);
+			} else {
+				__ham_dsearch(dbc, val, &off, &cmp);
+				hcp->dup_off = off;
+			}
+		} else {
+			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
+			if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
+				memcpy(&tlen,
+				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+				memcpy(&pgno,
+				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+				if ((ret = __db_moff(dbp, val,
+				    pgno, tlen, dbp->dup_compare, &cmp)) != 0)
+					return (ret);
+			} else {
+				/*
+				 * We do not zero tmp_val since the comparison
+				 * routines may only look at data and size.
+				 */
+				tmp_val.data = HKEYDATA_DATA(hk);
+				tmp_val.size = LEN_HDATA(hcp->pagep,
+				    dbp->pgsize, hcp->bndx);
+				cmp = dbp->dup_compare == NULL ?
+				    __bam_defcmp(&tmp_val, val) :
+				    dbp->dup_compare(&tmp_val, val);
+			}
+		}
+
+		if (cmp != 0)
+			return (DB_NOTFOUND);
+	}
+
+	/*
 	 * Now, everything is initialized, grab a duplicate if
 	 * necessary.
 	 */
@@ -1162,14 +1028,34 @@ __ham_dup_return(hashp, hcp, val, flags)
 			ndx = hcp->dndx;
 		} else {
 			/*
-			 * Copy the DBT in case we are retrieving into
-			 * user memory and we need the parameters for
-			 * it.
+			 * Copy the DBT in case we are retrieving into user
+			 * memory and we need the parameters for it.  If the
+			 * user requested a partial, then we need to adjust
+			 * the user's parameters to get the partial of the
+			 * duplicate which is itself a partial.
 			 */
 			memcpy(&tmp_val, val, sizeof(*val));
-			F_SET(&tmp_val, DB_DBT_PARTIAL);
-			tmp_val.dlen = hcp->dup_len;
-			tmp_val.doff = hcp->dup_off + sizeof(db_indx_t);
+			if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
+				/*
+				 * Take the user's length unless it would go
+				 * beyond the end of the duplicate.
+				 */
+				if (tmp_val.doff + hcp->dup_off > hcp->dup_len)
+					tmp_val.dlen = 0;
+				else if (tmp_val.dlen + tmp_val.doff >
+				    hcp->dup_len)
+					tmp_val.dlen =
+					    hcp->dup_len - tmp_val.doff;
+
+				/*
+				 * Calculate the new offset.
+				 */
+				tmp_val.doff += hcp->dup_off;
+			} else {
+				F_SET(&tmp_val, DB_DBT_PARTIAL);
+				tmp_val.dlen = hcp->dup_len;
+				tmp_val.doff = hcp->dup_off + sizeof(db_indx_t);
+			}
 			myval = &tmp_val;
 		}
 	}
@@ -1178,8 +1064,8 @@ __ham_dup_return(hashp, hcp, val, flags)
 	 * Finally, if we had a duplicate, pp, ndx, and myval should be
 	 * set appropriately.
 	 */
-	if ((ret = __db_ret(hashp->dbp, pp, ndx, myval, &hcp->big_data,
-	    &hcp->big_datalen)) != 0)
+	if ((ret = __db_ret(dbp, pp, ndx, myval, &dbc->rdata.data,
+	    &dbc->rdata.size)) != 0)
 		return (ret);
 
 	/*
@@ -1193,16 +1079,17 @@ __ham_dup_return(hashp, hcp, val, flags)
 }
 
 static int
-__ham_overwrite(hashp, hcp, nval)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_overwrite(dbc, nval)
+	DBC *dbc;
 	DBT *nval;
 {
+	HASH_CURSOR *hcp;
 	DBT *myval, tmp_val;
 	u_int8_t *hk;
 
-	if (F_ISSET(hashp->dbp, DB_AM_DUP))
-		return (__ham_add_dup(hashp, hcp, nval, DB_KEYLAST));
+	hcp = (HASH_CURSOR *)dbc->internal;
+	if (F_ISSET(dbc->dbp, DB_AM_DUP))
+		return (__ham_add_dup(dbc, nval, DB_KEYLAST));
 	else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
 		/* Put/overwrite */
 		memcpy(&tmp_val, nval, sizeof(*nval));
@@ -1214,12 +1101,12 @@ __ham_overwrite(hashp, hcp, nval)
 			    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
 		else
 			tmp_val.dlen = LEN_HDATA(hcp->pagep,
-			    hashp->hdr->pagesize,hcp->bndx);
+			    hcp->hdr->pagesize,hcp->bndx);
 		myval = &tmp_val;
 	} else /* Regular partial put */
 		myval = nval;
 
-	return (__ham_replpair(hashp, hcp, myval, 0));
+	return (__ham_replpair(dbc, myval, 0));
 }
 
 /*
@@ -1232,29 +1119,32 @@ __ham_overwrite(hashp, hcp, nval)
  * non of the cursor pointer field are valid.
  */
 static int
-__ham_lookup(hashp, hcp, key, sought, mode)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_lookup(dbc, key, sought, mode)
+	DBC *dbc;
 	const DBT *key;
 	u_int32_t sought;
 	db_lockmode_t mode;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	db_pgno_t pgno;
 	u_int32_t tlen;
 	int match, ret, t_ret;
 	u_int8_t *hk;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	/*
 	 * Set up cursor so that we're looking for space to add an item
 	 * as we cycle through the pages looking for the key.
 	 */
-	if ((ret = __ham_item_reset(hashp, hcp)) != 0)
+	if ((ret = __ham_item_reset(dbc)) != 0)
 		return (ret);
 	hcp->seek_size = sought;
 
-	hcp->bucket = __ham_call_hash(hashp, (u_int8_t *)key->data, key->size);
+	hcp->bucket = __ham_call_hash(hcp, (u_int8_t *)key->data, key->size);
 	while (1) {
-		if ((ret = __ham_item_next(hashp, hcp, mode)) != 0)
+		if ((ret = __ham_item_next(dbc, mode)) != 0)
 			return (ret);
 
 		if (F_ISSET(hcp, H_NOMORE))
@@ -1267,7 +1157,9 @@ __ham_lookup(hashp, hcp, key, sought, mode)
 			if (tlen == key->size) {
 				memcpy(&pgno,
 				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
-				match = __db_moff(hashp->dbp, key, pgno);
+				if ((ret = __db_moff(dbp,
+				    key, pgno, tlen, NULL, &match)) != 0)
+					return (ret);
 				if (match == 0) {
 					F_SET(hcp, H_OK);
 					return (0);
@@ -1276,7 +1168,7 @@ __ham_lookup(hashp, hcp, key, sought, mode)
 			break;
 		case H_KEYDATA:
 			if (key->size == LEN_HKEY(hcp->pagep,
-			    hashp->hdr->pagesize, hcp->bndx) &&
+			    hcp->hdr->pagesize, hcp->bndx) &&
 			    memcmp(key->data,
 			    HKEYDATA_DATA(hk), key->size) == 0) {
 				F_SET(hcp, H_OK);
@@ -1289,9 +1181,9 @@ __ham_lookup(hashp, hcp, key, sought, mode)
 			 * These are errors because keys are never
 			 * duplicated, only data items are.
 			 */
-			return (__db_pgfmt(hashp->dbp, PGNO(hcp->pagep)));
+			return (__db_pgfmt(dbp, PGNO(hcp->pagep)));
 		}
-		hashp->hash_collisions++;
+		hcp->stats.hash_collisions++;
 	}
 
 	/*
@@ -1301,7 +1193,7 @@ __ham_lookup(hashp, hcp, key, sought, mode)
 	if (sought != 0)
 		return (ret);
 
-	if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0)
+	if ((t_ret = __ham_item_done(dbc, 0)) != 0 && ret == 0)
 		ret = t_ret;
 	return (ret);
 }
@@ -1318,12 +1210,13 @@ __ham_init_dbt(dbt, size, bufp, sizep)
 	void **bufp;
 	u_int32_t *sizep;
 {
+	int ret;
+
 	memset(dbt, 0, sizeof(*dbt));
 	if (*sizep < size) {
-		if ((*bufp = (void *)(*bufp == NULL ?
-		    __db_malloc(size) : __db_realloc(*bufp, size))) == NULL) {
+		if ((ret = __os_realloc(bufp, size)) != 0) {
 			*sizep = 0;
-			return (ENOMEM);
+			return (ret);
 		}
 		*sizep = size;
 	}
@@ -1352,8 +1245,8 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup)
 	u_int32_t len;
 	int add, is_dup;
 {
+	DB *dbp;
 	DBC *cp;
-	HTAB *hp;
 	HASH_CURSOR *lcp;
 	int page_deleted;
 
@@ -1379,10 +1272,10 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup)
 		page_deleted =
 		    chg_pgno != PGNO_INVALID && chg_pgno != hcp->dpgno;
 
-	hp = hcp->db_cursor->dbp->master->internal;
-	DB_THREAD_LOCK(hp->dbp);
+	dbp = hcp->dbc->dbp;
+	DB_THREAD_LOCK(dbp);
 
-	for (cp = TAILQ_FIRST(&hp->dbp->curs_queue); cp != NULL;
+	for (cp = TAILQ_FIRST(&dbp->active_queue); cp != NULL;
 	    cp = TAILQ_NEXT(cp, links)) {
 		if (cp->internal == hcp)
 			continue;
@@ -1440,43 +1333,5 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup)
 			}
 		}
 	}
-	DB_THREAD_UNLOCK(hp->dbp);
-}
-
-/*
- * __ham_hdup --
- *	This function gets called when we create a duplicate handle for a
- *	threaded DB.  It should create the private part of the DB structure.
- *
- * PUBLIC: int  __ham_hdup __P((DB *, DB *));
- */
-int
-__ham_hdup(orig, new)
-	DB *orig, *new;
-{
-	DBC *curs;
-	HTAB *hashp;
-	int ret;
-
-	if ((hashp = (HTAB *)__db_malloc(sizeof(HTAB))) == NULL)
-		return (ENOMEM);
-
-	new->internal = hashp;
-
-	hashp->dbp = new;
-	hashp->hlock = 0;
-	hashp->hdr = NULL;
-	hashp->hash = ((HTAB *)orig->internal)->hash;
-	if ((hashp->split_buf = (PAGE *)__db_malloc(orig->pgsize)) == NULL)
-		return (ENOMEM);
-	hashp->local_errno = 0;
-	hashp->hash_accesses = 0;
-	hashp->hash_collisions = 0;
-	hashp->hash_expansions = 0;
-	hashp->hash_overflows = 0;
-	hashp->hash_bigpages = 0;
-	/* Initialize the cursor queue. */
-	ret = __ham_c_init(new, NULL, &curs);
-	TAILQ_INSERT_TAIL(&new->curs_queue, curs, links);
-	return (ret);
+	DB_THREAD_UNLOCK(dbp);
 }
diff --git a/db2/hash/hash_auto.c b/db2/hash/hash_auto.c
index 41b1ebed01..94a1dff6ed 100644
--- a/db2/hash/hash_auto.c
+++ b/db2/hash/hash_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "hash.h"
@@ -46,8 +45,7 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_insdel;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -59,8 +57,8 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*pagelsn)
 	    + sizeof(u_int32_t) + (key == NULL ? 0 : key->size)
 	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -109,7 +107,7 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -170,7 +168,7 @@ __ham_insdel_print(notused1, dbtp, lsnp, notused2, notused3)
 	}
 	printf("\n");
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -184,11 +182,12 @@ __ham_insdel_read(recbuf, argpp)
 {
 	__ham_insdel_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_insdel_args *)__db_malloc(sizeof(__ham_insdel_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_insdel_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -250,8 +249,7 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_newpage;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -264,8 +262,8 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*pagelsn)
 	    + sizeof(next_pgno)
 	    + sizeof(*nextlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -306,7 +304,7 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -354,7 +352,7 @@ __ham_newpage_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tnextlsn: [%lu][%lu]\n",
 	    (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -368,11 +366,12 @@ __ham_newpage_read(recbuf, argpp)
 {
 	__ham_newpage_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_newpage_args *)__db_malloc(sizeof(__ham_newpage_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_newpage_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -428,8 +427,7 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_splitmeta;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -439,8 +437,8 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(ovflpoint)
 	    + sizeof(spares)
 	    + sizeof(*metalsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -469,7 +467,7 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -512,7 +510,7 @@ __ham_splitmeta_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tmetalsn: [%lu][%lu]\n",
 	    (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -526,11 +524,12 @@ __ham_splitmeta_read(recbuf, argpp)
 {
 	__ham_splitmeta_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_splitmeta_args *)__db_malloc(sizeof(__ham_splitmeta_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_splitmeta_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -581,8 +580,7 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_splitdata;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -592,8 +590,8 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(u_int32_t) + (pageimage == NULL ? 0 : pageimage->size)
 	    + sizeof(*pagelsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -630,7 +628,7 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -681,7 +679,7 @@ __ham_splitdata_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tpagelsn: [%lu][%lu]\n",
 	    (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -695,11 +693,12 @@ __ham_splitdata_read(recbuf, argpp)
 {
 	__ham_splitdata_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_splitdata_args *)__db_malloc(sizeof(__ham_splitdata_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_splitdata_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -756,8 +755,7 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_replace;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -770,8 +768,8 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (olditem == NULL ? 0 : olditem->size)
 	    + sizeof(u_int32_t) + (newitem == NULL ? 0 : newitem->size)
 	    + sizeof(makedup);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -822,7 +820,7 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -884,7 +882,7 @@ __ham_replace_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\n");
 	printf("\tmakedup: %lu\n", (u_long)argp->makedup);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -898,11 +896,12 @@ __ham_replace_read(recbuf, argpp)
 {
 	__ham_replace_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_replace_args *)__db_malloc(sizeof(__ham_replace_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_replace_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -968,8 +967,7 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_newpgno;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -983,8 +981,8 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(new_type)
 	    + sizeof(*pagelsn)
 	    + sizeof(*metalsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1024,7 +1022,7 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1072,7 +1070,7 @@ __ham_newpgno_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tmetalsn: [%lu][%lu]\n",
 	    (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1086,11 +1084,12 @@ __ham_newpgno_read(recbuf, argpp)
 {
 	__ham_newpgno_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_newpgno_args *)__db_malloc(sizeof(__ham_newpgno_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_newpgno_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1149,8 +1148,7 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_ovfl;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1161,8 +1159,8 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(free_pgno)
 	    + sizeof(ovflpoint)
 	    + sizeof(*metalsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1193,7 +1191,7 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1237,7 +1235,7 @@ __ham_ovfl_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tmetalsn: [%lu][%lu]\n",
 	    (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1251,11 +1249,12 @@ __ham_ovfl_read(recbuf, argpp)
 {
 	__ham_ovfl_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_ovfl_args *)__db_malloc(sizeof(__ham_ovfl_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_ovfl_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1312,8 +1311,7 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_ham_copypage;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1326,8 +1324,8 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(nnext_pgno)
 	    + sizeof(*nnextlsn)
 	    + sizeof(u_int32_t) + (page == NULL ? 0 : page->size);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1376,7 +1374,7 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1432,7 +1430,7 @@ __ham_copypage_print(notused1, dbtp, lsnp, notused2, notused3)
 	}
 	printf("\n");
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1446,11 +1444,12 @@ __ham_copypage_read(recbuf, argpp)
 {
 	__ham_copypage_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__ham_copypage_args *)__db_malloc(sizeof(__ham_copypage_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__ham_copypage_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
diff --git a/db2/hash/hash_debug.c b/db2/hash/hash_debug.c
deleted file mode 100644
index 232906ae34..0000000000
--- a/db2/hash/hash_debug.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- */
-/*
- * Copyright (c) 1995
- *	The President and Fellows of Harvard University.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Jeremy Rassen.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#ifndef lint
-static const char sccsid[] = "@(#)hash_debug.c	10.6 (Sleepycat) 5/7/98";
-#endif /* not lint */
-
-#ifdef DEBUG
-/*
- * PACKAGE:  hashing
- *
- * DESCRIPTION:
- *	Debug routines.
- *
- * ROUTINES:
- *
- * External
- *	__dump_bucket
- */
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "hash.h"
-
-/*
- * __ham_dump_bucket --
- *
- * PUBLIC: #ifdef DEBUG
- * PUBLIC: void __ham_dump_bucket __P((HTAB *, u_int32_t));
- * PUBLIC: #endif
- */
-void
-__ham_dump_bucket(hashp, bucket)
-	HTAB *hashp;
-	u_int32_t bucket;
-{
-	PAGE *p;
-	db_pgno_t pgno;
-
-	for (pgno = BUCKET_TO_PAGE(hashp, bucket); pgno != PGNO_INVALID;) {
-		if (memp_fget(hashp->dbp->mpf, &pgno, 0, &p) != 0)
-			break;
-		(void)__db_prpage(p, 1);
-		pgno = p->next_pgno;
-		(void)memp_fput(hashp->dbp->mpf, p, 0);
-	}
-}
-#endif /* DEBUG */
diff --git a/db2/hash/hash_dup.c b/db2/hash/hash_dup.c
index ba248ddb17..bb3466428d 100644
--- a/db2/hash/hash_dup.c
+++ b/db2/hash/hash_dup.c
@@ -42,7 +42,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)hash_dup.c	10.14 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)hash_dup.c	10.27 (Sleepycat) 12/6/98";
 #endif /* not lint */
 
 /*
@@ -61,15 +61,17 @@ static const char sccsid[] = "@(#)hash_dup.c	10.14 (Sleepycat) 5/7/98";
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
+#include <errno.h>
 #include <string.h>
 #endif
 
 #include "db_int.h"
 #include "db_page.h"
 #include "hash.h"
+#include "btree.h"
 
-static int __ham_check_move __P((HTAB *, HASH_CURSOR *, int32_t));
-static int __ham_dup_convert __P((HTAB *, HASH_CURSOR *));
+static int __ham_check_move __P((DBC *, int32_t));
+static int __ham_dup_convert __P((DBC *));
 static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *));
 
 /*
@@ -85,26 +87,29 @@ static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *));
  * Case 4: The element is large enough to push the duplicate set onto a
  *	   separate page.
  *
- * PUBLIC: int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
+ * PUBLIC: int __ham_add_dup __P((DBC *, DBT *, u_int32_t));
  */
 int
-__ham_add_dup(hashp, hcp, nval, flags)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_add_dup(dbc, nval, flags)
+	DBC *dbc;
 	DBT *nval;
 	u_int32_t flags;
 {
-	DBT pval, tmp_val;
+	DB *dbp;
+	HASH_CURSOR *hcp;
+	DBT dbt, pval, tmp_val;
 	u_int32_t del_len, new_size;
-	int ret;
+	int cmp, ret;
 	u_int8_t *hk;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	if (flags == DB_CURRENT && hcp->dpgno == PGNO_INVALID)
 		del_len = hcp->dup_len;
 	else
 		del_len = 0;
 
-	if ((ret = __ham_check_move(hashp, hcp,
+	if ((ret = __ham_check_move(dbc,
 	    (int32_t)DUP_SIZE(nval->size) - (int32_t)del_len)) != 0)
 		return (ret);
 
@@ -117,7 +122,7 @@ __ham_add_dup(hashp, hcp, nval, flags)
 	 */
 	hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
 	new_size = DUP_SIZE(nval->size) - del_len + LEN_HKEYDATA(hcp->pagep,
-	    hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx));
+	    hcp->hdr->pagesize, H_DATAINDEX(hcp->bndx));
 
 	/*
 	 * We convert to off-page duplicates if the item is a big item,
@@ -125,10 +130,10 @@ __ham_add_dup(hashp, hcp, nval, flags)
 	 * if there isn't enough room on this page to add the next item.
 	 */
 	if (HPAGE_PTYPE(hk) != H_OFFDUP &&
-	    (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hashp, new_size) ||
+	    (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hcp, new_size) ||
 	    DUP_SIZE(nval->size) - del_len > P_FREESPACE(hcp->pagep))) {
 
-		if ((ret = __ham_dup_convert(hashp, hcp)) != 0)
+		if ((ret = __ham_dup_convert(dbc)) != 0)
 			return (ret);
 		else
 			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
@@ -140,30 +145,44 @@ __ham_add_dup(hashp, hcp, nval, flags)
 			HPAGE_PTYPE(hk) = H_DUPLICATE;
 			pval.flags = 0;
 			pval.data = HKEYDATA_DATA(hk);
-			pval.size = LEN_HDATA(hcp->pagep, hashp->hdr->pagesize,
+			pval.size = LEN_HDATA(hcp->pagep, dbp->pgsize,
 			    hcp->bndx);
 			if ((ret =
-			    __ham_make_dup(&pval, &tmp_val, &hcp->big_data,
-			    &hcp->big_datalen)) != 0 || (ret =
-			    __ham_replpair(hashp, hcp, &tmp_val, 1)) != 0)
+			    __ham_make_dup(&pval, &tmp_val, &dbc->rdata.data,
+			    &dbc->rdata.size)) != 0 || (ret =
+			    __ham_replpair(dbc, &tmp_val, 1)) != 0)
 				return (ret);
 		}
 
 		/* Now make the new entry a duplicate. */
 		if ((ret = __ham_make_dup(nval,
-		    &tmp_val, &hcp->big_data, &hcp->big_datalen)) != 0)
+		    &tmp_val, &dbc->rdata.data, &dbc->rdata.size)) != 0)
 			return (ret);
 
 		tmp_val.dlen = 0;
 		switch (flags) {			/* On page. */
 		case DB_KEYFIRST:
-			tmp_val.doff = 0;
-			break;
 		case DB_KEYLAST:
-			tmp_val.doff = LEN_HDATA(hcp->pagep,
-			    hashp->hdr->pagesize, hcp->bndx);
+			if (dbp->dup_compare != NULL)
+				__ham_dsearch(dbc, nval, &tmp_val.doff, &cmp);
+			else if (flags == DB_KEYFIRST)
+				tmp_val.doff = 0;
+			else
+				tmp_val.doff = LEN_HDATA(hcp->pagep,
+				    hcp->hdr->pagesize, hcp->bndx);
 			break;
 		case DB_CURRENT:
+			/*
+			 * If we have a sort function, we need to verify that
+			 * the new item sorts identically to the old item.
+			 */
+			if (dbp->dup_compare != NULL) {
+				dbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep,
+				    hcp->bndx)) + hcp->dup_off;
+				dbt.size = DUP_SIZE(hcp->dup_len);
+				if (dbp->dup_compare(nval, &dbt) != 0)
+					return (EINVAL);
+			}
 			tmp_val.doff = hcp->dup_off;
 			tmp_val.dlen = DUP_SIZE(hcp->dup_len);
 			break;
@@ -175,9 +194,9 @@ __ham_add_dup(hashp, hcp, nval, flags)
 			break;
 		}
 		/* Add the duplicate. */
-		ret = __ham_replpair(hashp, hcp, &tmp_val, 0);
+		ret = __ham_replpair(dbc, &tmp_val, 0);
 		if (ret == 0)
-			ret = __ham_dirty_page(hashp, hcp->pagep);
+			ret = __ham_dirty_page(dbp, hcp->pagep);
 		__ham_c_update(hcp, hcp->pgno, tmp_val.size, 1, 1);
 		return (ret);
 	}
@@ -190,27 +209,48 @@ __ham_add_dup(hashp, hcp, nval, flags)
 
 	switch (flags) {
 	case DB_KEYFIRST:
+		if (dbp->dup_compare != NULL)
+			goto sorted_dups;
 		/*
 		 * The only way that we are already on a dup page is
 		 * if we just converted the on-page representation.
 		 * In that case, we've only got one page of duplicates.
 		 */
 		if (hcp->dpagep == NULL && (ret =
-		    __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
+		    __db_dend(dbc, hcp->dpgno, &hcp->dpagep)) != 0)
 			return (ret);
 		hcp->dndx = 0;
 		break;
 	case DB_KEYLAST:
-		if (hcp->dpagep == NULL && (ret =
-		    __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
-			return (ret);
-		hcp->dpgno = PGNO(hcp->dpagep);
-		hcp->dndx = NUM_ENT(hcp->dpagep);
+		if (dbp->dup_compare != NULL) {
+sorted_dups:		if ((ret = __db_dsearch(dbc, 1, nval,
+			    hcp->dpgno, &hcp->dndx, &hcp->dpagep, &cmp)) != 0)
+				return (ret);
+			if (cmp == 0)
+				hcp->dpgno = PGNO(hcp->dpagep);
+		} else {
+			if (hcp->dpagep == NULL && (ret =
+			    __db_dend(dbc, hcp->dpgno, &hcp->dpagep)) != 0)
+				return (ret);
+			hcp->dpgno = PGNO(hcp->dpagep);
+			hcp->dndx = NUM_ENT(hcp->dpagep);
+		}
 		break;
 	case DB_CURRENT:
-		if ((ret = __db_ditem(hashp->dbp, hcp->dpagep, hcp->dndx,
-		    BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep, hcp->dndx)->len)))
-		    != 0)
+		if (dbp->dup_compare != NULL && __bam_cmp(dbp,
+		    nval, hcp->dpagep, hcp->dndx, dbp->dup_compare) != 0)
+			return (EINVAL);
+		switch (GET_BKEYDATA(hcp->dpagep, hcp->dndx)->type) {
+		case B_KEYDATA:
+			del_len = BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep,
+			    hcp->dndx)->len);
+			break;
+		case B_OVERFLOW:
+			del_len = BOVERFLOW_SIZE;
+			break;
+		}
+		if ((ret =
+		    __db_ditem(dbc, hcp->dpagep, hcp->dndx, del_len)) != 0)
 			return (ret);
 		break;
 	case DB_BEFORE:	/* The default behavior is correct. */
@@ -220,7 +260,7 @@ __ham_add_dup(hashp, hcp, nval, flags)
 		break;
 	}
 
-	ret = __db_dput(hashp->dbp,
+	ret = __db_dput(dbc,
 	    nval, &hcp->dpagep, &hcp->dndx, __ham_overflow_page);
 	hcp->pgno = PGNO(hcp->pagep);
 	__ham_c_update(hcp, hcp->pgno, nval->size, 1, 1);
@@ -231,22 +271,25 @@ __ham_add_dup(hashp, hcp, nval, flags)
  * Convert an on-page set of duplicates to an offpage set of duplicates.
  */
 static int
-__ham_dup_convert(hashp, hcp)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_dup_convert(dbc)
+	DBC *dbc;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	BOVERFLOW bo;
 	DBT dbt;
 	HOFFPAGE ho;
-	db_indx_t dndx, len;
+	db_indx_t dndx, i, len, off;
 	int ret;
 	u_int8_t *p, *pend;
 
 	/*
 	 * Create a new page for the duplicates.
 	 */
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	if ((ret =
-	    __ham_overflow_page(hashp->dbp, P_DUPLICATE, &hcp->dpagep)) != 0)
+	    __ham_overflow_page(dbc, P_DUPLICATE, &hcp->dpagep)) != 0)
 		return (ret);
 	hcp->dpagep->type = P_DUPLICATE;
 	hcp->dpgno = PGNO(hcp->dpagep);
@@ -254,67 +297,80 @@ __ham_dup_convert(hashp, hcp)
 	/*
 	 * Now put the duplicates onto the new page.
 	 */
+	dndx = 0;
 	dbt.flags = 0;
 	switch (HPAGE_PTYPE(H_PAIRDATA(hcp->pagep, hcp->bndx))) {
 	case H_KEYDATA:
 		/* Simple case, one key on page; move it to dup page. */
-		dndx = 0;
 		dbt.size =
-		    LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx);
+		    LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx);
 		dbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
-		ret = __db_pitem(hashp->dbp, hcp->dpagep,
+		ret = __db_pitem(dbc, hcp->dpagep,
 		    (u_int32_t)dndx, BKEYDATA_SIZE(dbt.size), NULL, &dbt);
 		if (ret == 0)
-			__ham_dirty_page(hashp, hcp->dpagep);
+			__ham_dirty_page(dbp, hcp->dpagep);
 		break;
 	case H_OFFPAGE:
 		/* Simple case, one key on page; move it to dup page. */
-		dndx = 0;
 		memcpy(&ho,
 		    P_ENTRY(hcp->pagep, H_DATAINDEX(hcp->bndx)), HOFFPAGE_SIZE);
+		UMRW(bo.unused1);
 		B_TSET(bo.type, ho.type, 0);
+		UMRW(bo.unused2);
 		bo.pgno = ho.pgno;
 		bo.tlen = ho.tlen;
 		dbt.size = BOVERFLOW_SIZE;
 		dbt.data = &bo;
 
-		ret = __db_pitem(hashp->dbp, hcp->dpagep,
+		ret = __db_pitem(dbc, hcp->dpagep,
 		   (u_int32_t)dndx, dbt.size, &dbt, NULL);
 		if (ret == 0)
-			__ham_dirty_page(hashp, hcp->dpagep);
+			__ham_dirty_page(dbp, hcp->dpagep);
 		break;
 	case H_DUPLICATE:
 		p = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
 		pend = p +
-		    LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx);
+		    LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx);
 
-		for (dndx = 0; p < pend; dndx++) {
+		/*
+		 * We need to maintain the duplicate cursor position.
+		 * Keep track of where we are in the duplicate set via
+		 * the offset, and when it matches the one in the cursor,
+		 * set the off-page duplicate cursor index to the current
+		 * index.
+		 */
+		for (off = 0, i = 0; p < pend; i++) {
+			if (off == hcp->dup_off)
+				dndx = i;
 			memcpy(&len, p, sizeof(db_indx_t));
 			dbt.size = len;
 			p += sizeof(db_indx_t);
 			dbt.data = p;
 			p += len + sizeof(db_indx_t);
-			ret = __db_dput(hashp->dbp, &dbt,
-			    &hcp->dpagep, &dndx, __ham_overflow_page);
+			off += len + 2 * sizeof(db_indx_t);
+			ret = __db_dput(dbc, &dbt,
+			    &hcp->dpagep, &i, __ham_overflow_page);
 			if (ret != 0)
 				break;
 		}
 		break;
 	default:
-		ret = __db_pgfmt(hashp->dbp, (u_long)hcp->pgno);
+		ret = __db_pgfmt(dbp, (u_long)hcp->pgno);
+		break;
 	}
 	if (ret == 0) {
 		/*
 		 * Now attach this to the source page in place of
 		 * the old duplicate item.
 		 */
-		__ham_move_offpage(hashp, hcp->pagep,
+		__ham_move_offpage(dbc, hcp->pagep,
 		    (u_int32_t)H_DATAINDEX(hcp->bndx), hcp->dpgno);
 
 		/* Can probably just do a "put" here. */
-		ret = __ham_dirty_page(hashp, hcp->pagep);
+		ret = __ham_dirty_page(dbp, hcp->pagep);
+		hcp->dndx = dndx;
 	} else {
-		(void)__ham_del_page(hashp->dbp, hcp->dpagep);
+		(void)__ham_del_page(dbc, hcp->dpagep);
 		hcp->dpagep = NULL;
 	}
 	return (ret);
@@ -354,11 +410,12 @@ __ham_make_dup(notdup, duplicate, bufp, sizep)
 }
 
 static int
-__ham_check_move(hashp, hcp, add_len)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_check_move(dbc, add_len)
+	DBC *dbc;
 	int32_t add_len;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DBT k, d;
 	DB_LSN new_lsn;
 	PAGE *next_pagep;
@@ -367,6 +424,8 @@ __ham_check_move(hashp, hcp, add_len)
 	u_int8_t *hk;
 	int ret;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	/*
 	 * Check if we can do whatever we need to on this page.  If not,
 	 * then we'll have to move the current element to a new page.
@@ -381,7 +440,7 @@ __ham_check_move(hashp, hcp, add_len)
 		return (0);
 
 	old_len =
-	    LEN_HITEM(hcp->pagep, hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx));
+	    LEN_HITEM(hcp->pagep, hcp->hdr->pagesize, H_DATAINDEX(hcp->bndx));
 	new_datalen = old_len - HKEYDATA_SIZE(0) + add_len;
 
 	/*
@@ -392,11 +451,11 @@ __ham_check_move(hashp, hcp, add_len)
 	 *    threshold, but the new data won't fit on the page.
 	 * If neither of these is true, then we can return.
 	 */
-	if (ISBIG(hashp, new_datalen) && (old_len > HOFFDUP_SIZE ||
+	if (ISBIG(hcp, new_datalen) && (old_len > HOFFDUP_SIZE ||
 	    HOFFDUP_SIZE - old_len <= P_FREESPACE(hcp->pagep)))
 		return (0);
 
-	if (!ISBIG(hashp, new_datalen) &&
+	if (!ISBIG(hcp, new_datalen) &&
 	    add_len <= (int32_t)P_FREESPACE(hcp->pagep))
 		return (0);
 
@@ -405,18 +464,18 @@ __ham_check_move(hashp, hcp, add_len)
 	 * Check if there are more pages in the chain.
 	 */
 
-	new_datalen = ISBIG(hashp, new_datalen) ?
+	new_datalen = ISBIG(hcp, new_datalen) ?
 	    HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen);
 
 	next_pagep = NULL;
 	for (next_pgno = NEXT_PGNO(hcp->pagep); next_pgno != PGNO_INVALID;
 	    next_pgno = NEXT_PGNO(next_pagep)) {
 		if (next_pagep != NULL &&
-		    (ret = __ham_put_page(hashp->dbp, next_pagep, 0)) != 0)
+		    (ret = __ham_put_page(dbp, next_pagep, 0)) != 0)
 			return (ret);
 
 		if ((ret =
-		    __ham_get_page(hashp->dbp, next_pgno, &next_pagep)) != 0)
+		    __ham_get_page(dbp, next_pgno, &next_pagep)) != 0)
 			return (ret);
 
 		if (P_FREESPACE(next_pagep) >= new_datalen)
@@ -424,17 +483,17 @@ __ham_check_move(hashp, hcp, add_len)
 	}
 
 	/* No more pages, add one. */
-	if (next_pagep == NULL &&
-	    (ret = __ham_add_ovflpage(hashp, hcp->pagep, 0, &next_pagep)) != 0)
+	if (next_pagep == NULL && (ret = __ham_add_ovflpage(dbc,
+	    hcp->pagep, 0, &next_pagep)) != 0)
 		return (ret);
 
 	/* Add new page at the end of the chain. */
-	if (P_FREESPACE(next_pagep) < new_datalen &&
-	    (ret = __ham_add_ovflpage(hashp, next_pagep, 1, &next_pagep)) != 0)
+	if (P_FREESPACE(next_pagep) < new_datalen && (ret =
+	    __ham_add_ovflpage(dbc, next_pagep, 1, &next_pagep)) != 0)
 		return (ret);
 
 	/* Copy the item to the new page. */
-	if (DB_LOGGING(hashp->dbp)) {
+	if (DB_LOGGING(hcp->dbc)) {
 		rectype = PUTPAIR;
 		k.flags = 0;
 		d.flags = 0;
@@ -447,7 +506,7 @@ __ham_check_move(hashp, hcp, add_len)
 			k.data =
 			    HKEYDATA_DATA(H_PAIRKEY(hcp->pagep, hcp->bndx));
 			k.size = LEN_HKEY(hcp->pagep,
-			    hashp->hdr->pagesize, hcp->bndx);
+			    hcp->hdr->pagesize, hcp->bndx);
 		}
 
 		if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
@@ -458,13 +517,13 @@ __ham_check_move(hashp, hcp, add_len)
 			d.data =
 			    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
 			d.size = LEN_HDATA(hcp->pagep,
-			    hashp->hdr->pagesize, hcp->bndx);
+			    hcp->hdr->pagesize, hcp->bndx);
 		}
 
 
-		if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
-		    hashp->dbp->log_fileid, PGNO(next_pagep),
+		if ((ret = __ham_insdel_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, rectype,
+		    dbp->log_fileid, PGNO(next_pagep),
 		    (u_int32_t)H_NUMPAIRS(next_pagep), &LSN(next_pagep),
 		    &k, &d)) != 0)
 			return (ret);
@@ -473,13 +532,15 @@ __ham_check_move(hashp, hcp, add_len)
 		LSN(next_pagep) = new_lsn;	/* Structure assignment. */
 	}
 
-	__ham_copy_item(hashp, hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep);
-	__ham_copy_item(hashp, hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep);
+	__ham_copy_item(dbp->pgsize,
+	    hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep);
+	__ham_copy_item(dbp->pgsize,
+	    hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep);
 
 	/* Now delete the pair from the current page. */
-	ret = __ham_del_pair(hashp, hcp, 0);
+	ret = __ham_del_pair(dbc, 0);
 
-	(void)__ham_put_page(hashp->dbp, hcp->pagep, 1);
+	(void)__ham_put_page(dbp, hcp->pagep, 1);
 	hcp->pagep = next_pagep;
 	hcp->pgno = PGNO(hcp->pagep);
 	hcp->bndx = H_NUMPAIRS(hcp->pagep) - 1;
@@ -488,19 +549,25 @@ __ham_check_move(hashp, hcp, add_len)
 }
 
 /*
- * Replace an onpage set of duplicates with the OFFDUP structure that
- * references the duplicate page.
- * XXX This is really just a special case of __onpage_replace; we should
+ * __ham_move_offpage --
+ *	Replace an onpage set of duplicates with the OFFDUP structure
+ *	that references the duplicate page.
+ *
+ * XXX
+ * This is really just a special case of __onpage_replace; we should
  * probably combine them.
- * PUBLIC: void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t));
+ *
+ * PUBLIC: void __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t));
  */
 void
-__ham_move_offpage(hashp, pagep, ndx, pgno)
-	HTAB *hashp;
+__ham_move_offpage(dbc, pagep, ndx, pgno)
+	DBC *dbc;
 	PAGE *pagep;
 	u_int32_t ndx;
 	db_pgno_t pgno;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DBT new_dbt;
 	DBT old_dbt;
 	HOFFDUP od;
@@ -508,22 +575,27 @@ __ham_move_offpage(hashp, pagep, ndx, pgno)
 	int32_t shrink;
 	u_int8_t *src;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	od.type = H_OFFDUP;
+	UMRW(od.unused[0]);
+	UMRW(od.unused[1]);
+	UMRW(od.unused[2]);
 	od.pgno = pgno;
 
-	if (DB_LOGGING(hashp->dbp)) {
+	if (DB_LOGGING(dbc)) {
 		new_dbt.data = &od;
 		new_dbt.size = HOFFDUP_SIZE;
 		old_dbt.data = P_ENTRY(pagep, ndx);
-		old_dbt.size = LEN_HITEM(pagep, hashp->hdr->pagesize, ndx);
-		(void)__ham_replace_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &LSN(pagep), 0,
-		    hashp->dbp->log_fileid, PGNO(pagep), (u_int32_t)ndx,
-		    &LSN(pagep), -1, &old_dbt, &new_dbt, 0);
+		old_dbt.size = LEN_HITEM(pagep, hcp->hdr->pagesize, ndx);
+		(void)__ham_replace_log(dbp->dbenv->lg_info,
+		    dbc->txn, &LSN(pagep), 0, dbp->log_fileid,
+		    PGNO(pagep), (u_int32_t)ndx, &LSN(pagep), -1,
+		    &old_dbt, &new_dbt, 0);
 	}
 
 	shrink =
-	    LEN_HITEM(pagep, hashp->hdr->pagesize, ndx) - HOFFDUP_SIZE;
+	    LEN_HITEM(pagep, hcp->hdr->pagesize, ndx) - HOFFDUP_SIZE;
 
 	if (shrink != 0) {
 		/* Copy data. */
@@ -539,3 +611,46 @@ __ham_move_offpage(hashp, pagep, ndx, pgno)
 	/* Now copy the offdup entry onto the page. */
 	memcpy(P_ENTRY(pagep, ndx), &od, HOFFDUP_SIZE);
 }
+
+/*
+ * __ham_dsearch:
+ *	Locate a particular duplicate in a duplicate set.
+ *
+ * PUBLIC: void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *));
+ */
+void
+__ham_dsearch(dbc, dbt, offp, cmpp)
+	DBC *dbc;
+	DBT *dbt;
+	u_int32_t *offp;
+	int *cmpp;
+{
+	DB *dbp;
+	HASH_CURSOR *hcp;
+	DBT cur;
+	db_indx_t i, len;
+	int (*func) __P((const DBT *, const DBT *));
+	u_int8_t *data;
+
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	if (dbp->dup_compare == NULL)
+		func = __bam_defcmp;
+	else
+		func = dbp->dup_compare;
+
+	i = F_ISSET(dbc, DBC_CONTINUE) ? hcp->dup_off: 0;
+	data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) + i;
+	while (i < LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx)) {
+		memcpy(&len, data, sizeof(db_indx_t));
+		data += sizeof(db_indx_t);
+		cur.data = data;
+		cur.size = (u_int32_t)len;
+		*cmpp = func(dbt, &cur);
+		if (*cmpp == 0 || (*cmpp < 0 && dbp->dup_compare != NULL))
+			break;
+		i += len + 2 * sizeof(db_indx_t);
+		data += len + sizeof(db_indx_t);
+	}
+	*offp = i;
+}
diff --git a/db2/hash/hash_page.c b/db2/hash/hash_page.c
index 5b3463947b..3419c1215c 100644
--- a/db2/hash/hash_page.c
+++ b/db2/hash/hash_page.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)hash_page.c	10.40 (Sleepycat) 6/2/98";
+static const char sccsid[] = "@(#)hash_page.c	10.55 (Sleepycat) 1/3/99";
 #endif /* not lint */
 
 /*
@@ -77,107 +77,118 @@ static const char sccsid[] = "@(#)hash_page.c	10.40 (Sleepycat) 6/2/98";
 #include "db_page.h"
 #include "hash.h"
 
-static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t));
+static int __ham_lock_bucket __P((DBC *, db_lockmode_t));
 
 #ifdef DEBUG_SLOW
-static void  __account_page(HTAB *, db_pgno_t, int);
+static void  __account_page(DB *, db_pgno_t, int);
 #endif
 
 /*
- * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t));
  */
 int
-__ham_item(hashp, cursorp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	db_pgno_t next_pgno;
 	int ret;
 
-	if (F_ISSET(cursorp, H_DELETED))
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	if (F_ISSET(hcp, H_DELETED))
 		return (EINVAL);
-	F_CLR(cursorp, H_OK | H_NOMORE);
+	F_CLR(hcp, H_OK | H_NOMORE);
 
 	/* Check if we need to get a page for this cursor. */
-	if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+	if ((ret = __ham_get_cpage(dbc, mode)) != 0)
 		return (ret);
 
 	/* Check if we are looking for space in which to insert an item. */
-	if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID
-	    && cursorp->seek_size < P_FREESPACE(cursorp->pagep))
-		cursorp->seek_found_page = cursorp->pgno;
+	if (hcp->seek_size && hcp->seek_found_page == PGNO_INVALID
+	    && hcp->seek_size < P_FREESPACE(hcp->pagep))
+		hcp->seek_found_page = hcp->pgno;
 
 	/* Check if we need to go on to the next page. */
-	if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID)
+	if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno == PGNO_INVALID)
 		/*
 		 * ISDUP is set, and offset is at the beginning of the datum.
 		 * We need to grab the length of the datum, then set the datum
 		 * pointer to be the beginning of the datum.
 		 */
-		memcpy(&cursorp->dup_len,
-		    HKEYDATA_DATA(H_PAIRDATA(cursorp->pagep, cursorp->bndx)) +
-		    cursorp->dup_off, sizeof(db_indx_t));
-	else if (F_ISSET(cursorp, H_ISDUP)) {
+		memcpy(&hcp->dup_len,
+		    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) +
+		    hcp->dup_off, sizeof(db_indx_t));
+	else if (F_ISSET(hcp, H_ISDUP)) {
 		/* Make sure we're not about to run off the page. */
-		if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp,
-		    cursorp->dpgno, &cursorp->dpagep)) != 0)
+		if (hcp->dpagep == NULL && (ret = __ham_get_page(dbp,
+		    hcp->dpgno, &hcp->dpagep)) != 0)
 			return (ret);
 
-		if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) {
-			if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) {
-				if ((ret = __ham_put_page(hashp->dbp,
-				    cursorp->dpagep, 0)) != 0)
+		if (hcp->dndx >= NUM_ENT(hcp->dpagep)) {
+			if (NEXT_PGNO(hcp->dpagep) == PGNO_INVALID) {
+				if (F_ISSET(hcp, H_DUPONLY)) {
+					F_CLR(hcp, H_OK);
+					F_SET(hcp, H_NOMORE);
+					return (0);
+				}
+				if ((ret = __ham_put_page(dbp,
+				    hcp->dpagep, 0)) != 0)
 					return (ret);
-				F_CLR(cursorp, H_ISDUP);
-				cursorp->dpagep = NULL;
-				cursorp->dpgno = PGNO_INVALID;
-				cursorp->dndx = NDX_INVALID;
-				cursorp->bndx++;
-			} else if ((ret = __ham_next_cpage(hashp, cursorp,
-			    NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
+				F_CLR(hcp, H_ISDUP);
+				hcp->dpagep = NULL;
+				hcp->dpgno = PGNO_INVALID;
+				hcp->dndx = NDX_INVALID;
+				hcp->bndx++;
+			} else if ((ret = __ham_next_cpage(dbc,
+			    NEXT_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
 				return (ret);
 		}
 	}
 
-	if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) {
+	if (hcp->bndx >= (db_indx_t)H_NUMPAIRS(hcp->pagep)) {
 		/* Fetch next page. */
-		if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) {
-			F_SET(cursorp, H_NOMORE);
-			if (cursorp->dpagep != NULL &&
-			    (ret = __ham_put_page(hashp->dbp,
-			    cursorp->dpagep, 0)) != 0)
+		if (NEXT_PGNO(hcp->pagep) == PGNO_INVALID) {
+			F_SET(hcp, H_NOMORE);
+			if (hcp->dpagep != NULL &&
+			    (ret = __ham_put_page(dbp, hcp->dpagep, 0)) != 0)
 				return (ret);
-			cursorp->dpgno = PGNO_INVALID;
+			hcp->dpgno = PGNO_INVALID;
 			return (DB_NOTFOUND);
 		}
-		next_pgno = NEXT_PGNO(cursorp->pagep);
-		cursorp->bndx = 0;
-		if ((ret = __ham_next_cpage(hashp,
-		    cursorp, next_pgno, 0, 0)) != 0)
+		next_pgno = NEXT_PGNO(hcp->pagep);
+		hcp->bndx = 0;
+		if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
 			return (ret);
 	}
 
-	F_SET(cursorp, H_OK);
+	F_SET(hcp, H_OK);
 	return (0);
 }
 
 /*
- * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *));
+ * PUBLIC: int __ham_item_reset __P((DBC *));
  */
 int
-__ham_item_reset(hashp, cursorp)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_reset(dbc)
+	DBC *dbc;
 {
+	HASH_CURSOR *hcp;
+	DB *dbp;
 	int ret;
 
-	if (cursorp->pagep)
-		ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0);
-	else
-		ret = 0;
-
-	__ham_item_init(cursorp);
+	ret = 0;
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	if (hcp->pagep != NULL)
+		ret = __ham_put_page(dbp, hcp->pagep, 0);
+	if (ret == 0 && hcp->dpagep != NULL)
+		ret = __ham_put_page(dbp, hcp->dpagep, 0);
+
+	__ham_item_init(hcp);
 	return (ret);
 }
 
@@ -185,57 +196,67 @@ __ham_item_reset(hashp, cursorp)
  * PUBLIC: void __ham_item_init __P((HASH_CURSOR *));
  */
 void
-__ham_item_init(cursorp)
-	HASH_CURSOR *cursorp;
+__ham_item_init(hcp)
+	HASH_CURSOR *hcp;
 {
-	cursorp->pagep = NULL;
-	cursorp->bucket = BUCKET_INVALID;
-	cursorp->lock = 0;
-	cursorp->bndx = NDX_INVALID;
-	cursorp->pgno = PGNO_INVALID;
-	cursorp->dpgno = PGNO_INVALID;
-	cursorp->dndx = NDX_INVALID;
-	cursorp->dpagep = NULL;
-	cursorp->flags = 0;
-	cursorp->seek_size = 0;
-	cursorp->seek_found_page = PGNO_INVALID;
+	/*
+	 * If this cursor still holds any locks, we must
+	 * release them if we are not running with transactions.
+	 */
+	if (hcp->lock && hcp->dbc->txn == NULL)
+	    (void)lock_put(hcp->dbc->dbp->dbenv->lk_info, hcp->lock);
+
+	/*
+	 * The following fields must *not* be initialized here
+	 * because they may have meaning across inits.
+	 * 	hlock, hdr, split_buf, stats
+	 */
+	hcp->bucket = BUCKET_INVALID;
+	hcp->lbucket = BUCKET_INVALID;
+	hcp->lock = 0;
+	hcp->pagep = NULL;
+	hcp->pgno = PGNO_INVALID;
+	hcp->bndx = NDX_INVALID;
+	hcp->dpagep = NULL;
+	hcp->dpgno = PGNO_INVALID;
+	hcp->dndx = NDX_INVALID;
+	hcp->dup_off = 0;
+	hcp->dup_len = 0;
+	hcp->dup_tlen = 0;
+	hcp->seek_size = 0;
+	hcp->seek_found_page = PGNO_INVALID;
+	hcp->flags = 0;
 }
 
 /*
- * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int));
+ * PUBLIC: int __ham_item_done __P((DBC *, int));
  */
 int
-__ham_item_done(hashp, cursorp, dirty)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_done(dbc, dirty)
+	DBC *dbc;
 	int dirty;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	int ret, t_ret;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	t_ret = ret = 0;
 
-	if (cursorp->pagep)
-		ret = __ham_put_page(hashp->dbp, cursorp->pagep,
-		    dirty && cursorp->dpagep == NULL);
-	cursorp->pagep = NULL;
+	if (hcp->pagep)
+		ret = __ham_put_page(dbp, hcp->pagep,
+		    dirty && hcp->dpagep == NULL);
+	hcp->pagep = NULL;
 
-	if (cursorp->dpagep)
-		t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty);
-	cursorp->dpagep = NULL;
+	if (hcp->dpagep)
+		t_ret = __ham_put_page(dbp, hcp->dpagep, dirty);
+	hcp->dpagep = NULL;
 
 	if (ret == 0 && t_ret != 0)
 		ret = t_ret;
 
 	/*
-	 * If we are running with transactions, then we must
-	 * not relinquish locks explicitly.
-	 */
-	if (cursorp->lock && hashp->dbp->txn == NULL)
-	    t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock);
-	cursorp->lock = 0;
-
-
-	/*
 	 * We don't throw out the page number since we might want to
 	 * continue getting on this page.
 	 */
@@ -245,40 +266,42 @@ __ham_item_done(hashp, cursorp, dirty)
 /*
  * Returns the last item in a bucket.
  *
- * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t));
  */
 int
-__ham_item_last(hashp, cursorp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_last(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	HASH_CURSOR *hcp;
 	int ret;
 
-	if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
+	hcp = (HASH_CURSOR *)dbc->internal;
+	if ((ret = __ham_item_reset(dbc)) != 0)
 		return (ret);
 
-	cursorp->bucket = hashp->hdr->max_bucket;
-	F_SET(cursorp, H_OK);
-	return (__ham_item_prev(hashp, cursorp, mode));
+	hcp->bucket = hcp->hdr->max_bucket;
+	F_SET(hcp, H_OK);
+	return (__ham_item_prev(dbc, mode));
 }
 
 /*
- * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t));
  */
 int
-__ham_item_first(hashp, cursorp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_first(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	HASH_CURSOR *hcp;
 	int ret;
 
-	if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
+	hcp = (HASH_CURSOR *)dbc->internal;
+	if ((ret = __ham_item_reset(dbc)) != 0)
 		return (ret);
-	F_SET(cursorp, H_OK);
-	cursorp->bucket = 0;
-	return (__ham_item_next(hashp, cursorp, mode));
+	F_SET(hcp, H_OK);
+	hcp->bucket = 0;
+	return (__ham_item_next(dbc, mode));
 }
 
 /*
@@ -287,17 +310,20 @@ __ham_item_first(hashp, cursorp, mode)
  *	bigkeys, just returns the page number and index of the bigkey
  *	pointer pair.
  *
- * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t));
  */
 int
-__ham_item_prev(hashp, cursorp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_prev(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	db_pgno_t next_pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	/*
 	 * There are N cases for backing up in a hash file.
 	 * Case 1: In the middle of a page, no duplicates, just dec the index.
@@ -307,52 +333,56 @@ __ham_item_prev(hashp, cursorp, mode)
 	 * Case 4: At the beginning of a page; go to previous page.
 	 * Case 5: At the beginning of a bucket; go to prev bucket.
 	 */
-	F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED);
+	F_CLR(hcp, H_OK | H_NOMORE | H_DELETED);
 
 	/*
 	 * First handle the duplicates.  Either you'll get the key here
 	 * or you'll exit the duplicate set and drop into the code below
 	 * to handle backing up through keys.
 	 */
-	if (F_ISSET(cursorp, H_ISDUP)) {
-		if (cursorp->dpgno == PGNO_INVALID) {
+	if (F_ISSET(hcp, H_ISDUP)) {
+		if (hcp->dpgno == PGNO_INVALID) {
 			/* Duplicates are on-page. */
-			if (cursorp->dup_off != 0) {
-				if ((ret = __ham_get_cpage(hashp,
-				    cursorp, mode)) != 0)
+			if (hcp->dup_off != 0) {
+				if ((ret = __ham_get_cpage(dbc, mode)) != 0)
 					return (ret);
 				else {
 					HASH_CURSOR *h;
-					h = cursorp;
+					h = hcp;
 					memcpy(&h->dup_len, HKEYDATA_DATA(
 					    H_PAIRDATA(h->pagep, h->bndx))
 					    + h->dup_off - sizeof(db_indx_t),
 					    sizeof(db_indx_t));
-					cursorp->dup_off -=
-					    DUP_SIZE(cursorp->dup_len);
-					cursorp->dndx--;
-					return (__ham_item(hashp,
-					    cursorp, mode));
+					hcp->dup_off -=
+					    DUP_SIZE(hcp->dup_len);
+					hcp->dndx--;
+					return (__ham_item(dbc, mode));
 				}
 			}
-		} else if (cursorp->dndx > 0) {	/* Duplicates are off-page. */
-			cursorp->dndx--;
-			return (__ham_item(hashp, cursorp, mode));
-		} else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+		} else if (hcp->dndx > 0) {	/* Duplicates are off-page. */
+			hcp->dndx--;
+			return (__ham_item(dbc, mode));
+		} else if ((ret = __ham_get_cpage(dbc, mode)) != 0)
 			return (ret);
-		else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) {
-			F_CLR(cursorp, H_ISDUP); /* End of dups */
-			cursorp->dpgno = PGNO_INVALID;
-			if (cursorp->dpagep != NULL)
-				(void)__ham_put_page(hashp->dbp,
-				    cursorp->dpagep, 0);
-			cursorp->dpagep = NULL;
-		} else if ((ret = __ham_next_cpage(hashp, cursorp,
-		    PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
+		else if (PREV_PGNO(hcp->dpagep) == PGNO_INVALID) {
+			if (F_ISSET(hcp, H_DUPONLY)) {
+				F_CLR(hcp, H_OK);
+				F_SET(hcp, H_NOMORE);
+				return (0);
+			} else {
+				F_CLR(hcp, H_ISDUP); /* End of dups */
+				hcp->dpgno = PGNO_INVALID;
+				if (hcp->dpagep != NULL)
+					(void)__ham_put_page(dbp,
+					    hcp->dpagep, 0);
+				hcp->dpagep = NULL;
+			}
+		} else if ((ret = __ham_next_cpage(dbc,
+		    PREV_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
 			return (ret);
 		else {
-			cursorp->dndx = NUM_ENT(cursorp->pagep) - 1;
-			return (__ham_item(hashp, cursorp, mode));
+			hcp->dndx = NUM_ENT(hcp->pagep) - 1;
+			return (__ham_item(dbc, mode));
 		}
 	}
 
@@ -362,95 +392,123 @@ __ham_item_prev(hashp, cursorp, mode)
 	 * midpage, beginning of page, beginning of bucket.
 	 */
 
-	if (cursorp->bndx == 0) { 		/* Beginning of page. */
-		if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+	if (F_ISSET(hcp, H_DUPONLY)) {
+		F_CLR(hcp, H_OK);
+		F_SET(hcp, H_NOMORE);
+		return (0);
+	}
+
+	if (hcp->bndx == 0) { 		/* Beginning of page. */
+		if ((ret = __ham_get_cpage(dbc, mode)) != 0)
 			return (ret);
-		cursorp->pgno = PREV_PGNO(cursorp->pagep);
-		if (cursorp->pgno == PGNO_INVALID) {
+		hcp->pgno = PREV_PGNO(hcp->pagep);
+		if (hcp->pgno == PGNO_INVALID) {
 			/* Beginning of bucket. */
-			F_SET(cursorp, H_NOMORE);
+			F_SET(hcp, H_NOMORE);
 			return (DB_NOTFOUND);
-		} else if ((ret = __ham_next_cpage(hashp,
-		    cursorp, cursorp->pgno, 0, 0)) != 0)
+		} else if ((ret =
+		    __ham_next_cpage(dbc, hcp->pgno, 0, 0)) != 0)
 			return (ret);
 		else
-			cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
+			hcp->bndx = H_NUMPAIRS(hcp->pagep);
 	}
 
 	/*
 	 * Either we've got the cursor set up to be decremented, or we
 	 * have to find the end of a bucket.
 	 */
-	if (cursorp->bndx == NDX_INVALID) {
-		if (cursorp->pagep == NULL)
-			next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket);
+	if (hcp->bndx == NDX_INVALID) {
+		if (hcp->pagep == NULL)
+			next_pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
 		else
 			goto got_page;
 
 		do {
-			if ((ret = __ham_next_cpage(hashp,
-			    cursorp, next_pgno, 0, 0)) != 0)
+			if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
 				return (ret);
-got_page:		next_pgno = NEXT_PGNO(cursorp->pagep);
-			cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
+got_page:		next_pgno = NEXT_PGNO(hcp->pagep);
+			hcp->bndx = H_NUMPAIRS(hcp->pagep);
 		} while (next_pgno != PGNO_INVALID);
 
-		if (cursorp->bndx == 0) {
+		if (hcp->bndx == 0) {
 			/* Bucket was empty. */
-			F_SET(cursorp, H_NOMORE);
+			F_SET(hcp, H_NOMORE);
 			return (DB_NOTFOUND);
 		}
 	}
 
-	cursorp->bndx--;
+	hcp->bndx--;
 
-	return (__ham_item(hashp, cursorp, mode));
+	return (__ham_item(dbc, mode));
 }
 
 /*
  * Sets the cursor to the next key/data pair on a page.
  *
- * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t));
  */
 int
-__ham_item_next(hashp, cursorp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_item_next(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	HASH_CURSOR *hcp;
+
+	hcp = (HASH_CURSOR *)dbc->internal;
 	/*
 	 * Deleted on-page duplicates are a weird case. If we delete the last
 	 * one, then our cursor is at the very end of a duplicate set and
 	 * we actually need to go on to the next key.
 	 */
-	if (F_ISSET(cursorp, H_DELETED)) {
-		if (cursorp->bndx != NDX_INVALID &&
-		    F_ISSET(cursorp, H_ISDUP) &&
-		    cursorp->dpgno == PGNO_INVALID &&
-		    cursorp->dup_tlen == cursorp->dup_off) {
-			F_CLR(cursorp, H_ISDUP);
-			cursorp->dpgno = PGNO_INVALID;
-			cursorp->bndx++;
+	if (F_ISSET(hcp, H_DELETED)) {
+		if (hcp->bndx != NDX_INVALID &&
+		    F_ISSET(hcp, H_ISDUP) &&
+		    hcp->dpgno == PGNO_INVALID &&
+		    hcp->dup_tlen == hcp->dup_off) {
+			if (F_ISSET(hcp, H_DUPONLY)) {
+				F_CLR(hcp, H_OK);
+				F_SET(hcp, H_NOMORE);
+				return (0);
+			} else {
+				F_CLR(hcp, H_ISDUP);
+				hcp->dpgno = PGNO_INVALID;
+				hcp->bndx++;
+			}
+		} else if (!F_ISSET(hcp, H_ISDUP) &&
+		    F_ISSET(hcp, H_DUPONLY)) {
+			F_CLR(hcp, H_OK);
+			F_SET(hcp, H_NOMORE);
+			return (0);
 		}
-		F_CLR(cursorp, H_DELETED);
-	} else if (cursorp->bndx == NDX_INVALID) {
-		cursorp->bndx = 0;
-		cursorp->dpgno = PGNO_INVALID;
-		F_CLR(cursorp, H_ISDUP);
-	} else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID)
-		cursorp->dndx++;
-	else if (F_ISSET(cursorp, H_ISDUP)) {
-		cursorp->dndx++;
-		cursorp->dup_off += DUP_SIZE(cursorp->dup_len);
-		if (cursorp->dup_off >= cursorp->dup_tlen) {
-			F_CLR(cursorp, H_ISDUP);
-			cursorp->dpgno = PGNO_INVALID;
-			cursorp->bndx++;
+		F_CLR(hcp, H_DELETED);
+	} else if (hcp->bndx == NDX_INVALID) {
+		hcp->bndx = 0;
+		hcp->dpgno = PGNO_INVALID;
+		F_CLR(hcp, H_ISDUP);
+	} else if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID)
+		hcp->dndx++;
+	else if (F_ISSET(hcp, H_ISDUP)) {
+		if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >=
+		    hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) {
+			F_CLR(hcp, H_OK);
+			F_SET(hcp, H_NOMORE);
+			return (0);
+		}
+		hcp->dndx++;
+		hcp->dup_off += DUP_SIZE(hcp->dup_len);
+		if (hcp->dup_off >= hcp->dup_tlen) {
+			F_CLR(hcp, H_ISDUP);
+			hcp->dpgno = PGNO_INVALID;
+			hcp->bndx++;
 		}
+	} else if (F_ISSET(hcp, H_DUPONLY)) {
+		F_CLR(hcp, H_OK);
+		F_SET(hcp, H_NOMORE);
+		return (0);
 	} else
-		cursorp->bndx++;
+		hcp->bndx++;
 
-	return (__ham_item(hashp, cursorp, mode));
+	return (__ham_item(dbc, mode));
 }
 
 /*
@@ -537,18 +595,15 @@ __ham_reputpair(p, psize, ndx, key, data)
 
 
 /*
- * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *, int));
- *
- * XXX
- * TODO: if the item is an offdup, delete the other pages and then remove
- * the pair. If the offpage page is 0, then you can just remove the pair.
+ * PUBLIC: int __ham_del_pair __P((DBC *, int));
  */
 int
-__ham_del_pair(hashp, cursorp, reclaim_page)
-	HTAB *hashp;
-	HASH_CURSOR *cursorp;
+__ham_del_pair(dbc, reclaim_page)
+	DBC *dbc;
 	int reclaim_page;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DBT data_dbt, key_dbt;
 	DB_ENV *dbenv;
 	DB_LSN new_lsn, *n_lsn, tmp_lsn;
@@ -557,13 +612,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 	db_pgno_t chg_pgno, pgno;
 	int ret, tret;
 
-	dbenv = hashp->dbp->dbenv;
-	ndx = cursorp->bndx;
-	if (cursorp->pagep == NULL && (ret =
-	    __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0)
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	dbenv = dbp->dbenv;
+	ndx = hcp->bndx;
+	if (hcp->pagep == NULL &&
+	    (ret = __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
 		return (ret);
 
-	p = cursorp->pagep;
+	p = hcp->pagep;
 
 	/*
 	 * We optimize for the normal case which is when neither the key nor
@@ -576,7 +634,7 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 	if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) {
 		memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))),
 		    sizeof(db_pgno_t));
-		ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
+		ret = __db_doff(dbc, pgno, __ham_del_page);
 	}
 
 	if (ret == 0)
@@ -585,14 +643,14 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 			memcpy(&pgno,
 			    HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
 			    sizeof(db_pgno_t));
-			ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
+			ret = __db_doff(dbc, pgno, __ham_del_page);
 			break;
 		case H_OFFDUP:
 			memcpy(&pgno,
 			    HOFFDUP_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
 			    sizeof(db_pgno_t));
-			ret = __db_ddup(hashp->dbp, pgno, __ham_del_page);
-			F_CLR(cursorp, H_ISDUP);
+			ret = __db_ddup(dbc, pgno, __ham_del_page);
+			F_CLR(hcp, H_ISDUP);
 			break;
 		case H_DUPLICATE:
 			/*
@@ -600,7 +658,7 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 			 * we had better clear the flag so that we update the
 			 * cursor appropriately.
 			 */
-			F_CLR(cursorp, H_ISDUP);
+			F_CLR(hcp, H_ISDUP);
 			break;
 		}
 
@@ -608,17 +666,17 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		return (ret);
 
 	/* Now log the delete off this page. */
-	if (DB_LOGGING(hashp->dbp)) {
+	if (DB_LOGGING(dbc)) {
 		key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx));
 		key_dbt.size =
-		    LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx));
+		    LEN_HITEM(p, hcp->hdr->pagesize, H_KEYINDEX(ndx));
 		data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx));
 		data_dbt.size =
-		    LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx));
+		    LEN_HITEM(p, hcp->hdr->pagesize, H_DATAINDEX(ndx));
 
 		if ((ret = __ham_insdel_log(dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR,
-		    hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
+		    dbc->txn, &new_lsn, 0, DELPAIR,
+		    dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
 		    &LSN(p), &key_dbt, &data_dbt)) != 0)
 			return (ret);
 
@@ -626,15 +684,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		LSN(p) = new_lsn;
 	}
 
-	__ham_dpair(hashp->dbp, p, ndx);
+	__ham_dpair(dbp, p, ndx);
 
 	/*
-	 * If we are locking, we will not maintain this.
-	 * XXXX perhaps we can retain incremental numbers and apply them
+	 * If we are locking, we will not maintain this, because it is
+	 * a hot spot.
+	 * XXX perhaps we can retain incremental numbers and apply them
 	 * later.
 	 */
-	if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
-		--hashp->hdr->nelem;
+	if (!F_ISSET(dbp, DB_AM_LOCKING))
+		--hcp->hdr->nelem;
 
 	/*
 	 * If we need to reclaim the page, then check if the page is empty.
@@ -653,25 +712,25 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		 * are more pages in the chain.
 		 */
 		if ((ret =
-		    __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0)
+		    __ham_get_page(dbp, NEXT_PGNO(p), &n_pagep)) != 0)
 			return (ret);
 
 		if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
 			if ((ret =
-			    __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep),
+			    __ham_get_page(dbp, NEXT_PGNO(n_pagep),
 			    &nn_pagep)) != 0) {
-				(void) __ham_put_page(hashp->dbp, n_pagep, 0);
+				(void) __ham_put_page(dbp, n_pagep, 0);
 				return (ret);
 			}
 		}
 
-		if (DB_LOGGING(hashp->dbp)) {
+		if (DB_LOGGING(dbc)) {
 			key_dbt.data = n_pagep;
-			key_dbt.size = hashp->hdr->pagesize;
+			key_dbt.size = hcp->hdr->pagesize;
 			if ((ret = __ham_copypage_log(dbenv->lg_info,
-			    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-			    hashp->dbp->log_fileid, PGNO(p), &LSN(p),
-			    PGNO(n_pagep), &LSN(n_pagep), NEXT_PGNO(n_pagep),
+			    dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(p),
+			    &LSN(p), PGNO(n_pagep), &LSN(n_pagep),
+			    NEXT_PGNO(n_pagep),
 			    NEXT_PGNO(n_pagep) == PGNO_INVALID ? NULL :
 			    &LSN(nn_pagep), &key_dbt)) != 0)
 				return (ret);
@@ -684,12 +743,12 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		}
 		if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
 			PREV_PGNO(nn_pagep) = PGNO(p);
-			(void)__ham_put_page(hashp->dbp, nn_pagep, 1);
+			(void)__ham_put_page(dbp, nn_pagep, 1);
 		}
 
 		tmp_pgno = PGNO(p);
 		tmp_lsn = LSN(p);
-		memcpy(p, n_pagep, hashp->hdr->pagesize);
+		memcpy(p, n_pagep, hcp->hdr->pagesize);
 		PGNO(p) = tmp_pgno;
 		LSN(p) = tmp_lsn;
 		PREV_PGNO(p) = PGNO_INVALID;
@@ -697,25 +756,25 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		/*
 		 * Cursor is advanced to the beginning of the next page.
 		 */
-		cursorp->bndx = 0;
-		cursorp->pgno = PGNO(p);
-		F_SET(cursorp, H_DELETED);
+		hcp->bndx = 0;
+		hcp->pgno = PGNO(p);
+		F_SET(hcp, H_DELETED);
 		chg_pgno = PGNO(p);
-		if ((ret = __ham_dirty_page(hashp, p)) != 0 ||
-		    (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0)
+		if ((ret = __ham_dirty_page(dbp, p)) != 0 ||
+		    (ret = __ham_del_page(dbc, n_pagep)) != 0)
 			return (ret);
 	} else if (reclaim_page &&
 	    NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) {
 		PAGE *n_pagep, *p_pagep;
 
 		if ((ret =
-		    __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0)
+		    __ham_get_page(dbp, PREV_PGNO(p), &p_pagep)) != 0)
 			return (ret);
 
 		if (NEXT_PGNO(p) != PGNO_INVALID) {
-			if ((ret = __ham_get_page(hashp->dbp,
+			if ((ret = __ham_get_page(dbp,
 			    NEXT_PGNO(p), &n_pagep)) != 0) {
-				(void)__ham_put_page(hashp->dbp, p_pagep, 0);
+				(void)__ham_put_page(dbp, p_pagep, 0);
 				return (ret);
 			}
 			n_lsn = &LSN(n_pagep);
@@ -728,10 +787,10 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		if (n_pagep != NULL)
 			PREV_PGNO(n_pagep) = PGNO(p_pagep);
 
-		if (DB_LOGGING(hashp->dbp)) {
+		if (DB_LOGGING(dbc)) {
 			if ((ret = __ham_newpage_log(dbenv->lg_info,
-			    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL,
-			    hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
+			    dbc->txn, &new_lsn, 0, DELOVFL,
+			    dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
 			    PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
 				return (ret);
 
@@ -741,21 +800,21 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 				LSN(n_pagep) = new_lsn;
 			LSN(p) = new_lsn;
 		}
-		cursorp->pgno = NEXT_PGNO(p);
-		cursorp->bndx = 0;
+		hcp->pgno = NEXT_PGNO(p);
+		hcp->bndx = 0;
 		/*
 		 * Since we are about to delete the cursor page and we have
 		 * just moved the cursor, we need to make sure that the
 		 * old page pointer isn't left hanging around in the cursor.
 		 */
-		cursorp->pagep = NULL;
+		hcp->pagep = NULL;
 		chg_pgno = PGNO(p);
-		ret = __ham_del_page(hashp->dbp, p);
-		if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 &&
+		ret = __ham_del_page(dbc, p);
+		if ((tret = __ham_put_page(dbp, p_pagep, 1)) != 0 &&
 		    ret == 0)
 			ret = tret;
 		if (n_pagep != NULL &&
-		    (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 &&
+		    (tret = __ham_put_page(dbp, n_pagep, 1)) != 0 &&
 		    ret == 0)
 			ret = tret;
 		if (ret != 0)
@@ -766,19 +825,19 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
 		 * so that we update the cursor correctly on the next call
 		 * to next.
 		 */
-		F_SET(cursorp, H_DELETED);
-		chg_pgno = cursorp->pgno;
-		ret = __ham_dirty_page(hashp, p);
+		F_SET(hcp, H_DELETED);
+		chg_pgno = hcp->pgno;
+		ret = __ham_dirty_page(dbp, p);
 	}
-	__ham_c_update(cursorp, chg_pgno, 0, 0, 0);
+	__ham_c_update(hcp, chg_pgno, 0, 0, 0);
 
 	/*
 	 * Since we just deleted a pair from the master page, anything
-	 * in cursorp->dpgno should be cleared.
+	 * in hcp->dpgno should be cleared.
 	 */
-	cursorp->dpgno = PGNO_INVALID;
+	hcp->dpgno = PGNO_INVALID;
 
-	F_CLR(cursorp, H_OK);
+	F_CLR(hcp, H_OK);
 	return (ret);
 }
 
@@ -787,15 +846,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page)
  *	Given the key data indicated by the cursor, replace part/all of it
  *	according to the fields in the dbt.
  *
- * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
+ * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t));
  */
 int
-__ham_replpair(hashp, hcp, dbt, make_dup)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_replpair(dbc, dbt, make_dup)
+	DBC *dbc;
 	DBT *dbt;
 	u_int32_t make_dup;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DBT old_dbt, tdata, tmp;
 	DB_LSN	new_lsn;
 	int32_t change;			/* XXX: Possible overflow. */
@@ -814,6 +874,8 @@ __ham_replpair(hashp, hcp, dbt, make_dup)
 	 * be the common case).  We handle case 3 as a delete and
 	 * add.
 	 */
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 
 	/*
 	 * We need to compute the number of bytes that we are adding or
@@ -833,7 +895,7 @@ __ham_replpair(hashp, hcp, dbt, make_dup)
 		memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
 	else
 		len = LEN_HKEYDATA(hcp->pagep,
-		    hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx));
+		    dbp->pgsize, H_DATAINDEX(hcp->bndx));
 
 	if (dbt->doff + dbt->dlen > len)
 		change += dbt->doff + dbt->dlen - len;
@@ -854,41 +916,39 @@ __ham_replpair(hashp, hcp, dbt, make_dup)
 		tmp.flags = 0;
 		F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL);
 		if ((ret =
-		    __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
-		    &tmp, &hcp->big_key, &hcp->big_keylen)) != 0)
+		    __db_ret(dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
+		    &tmp, &dbc->rkey.data, &dbc->rkey.size)) != 0)
 			return (ret);
 
 		if (dbt->doff == 0 && dbt->dlen == len) {
-			ret = __ham_del_pair(hashp, hcp, 0);
+			ret = __ham_del_pair(dbc, 0);
 			if (ret == 0)
-			    ret = __ham_add_el(hashp,
-			        hcp, &tmp, dbt, H_KEYDATA);
+			    ret = __ham_add_el(dbc, &tmp, dbt, H_KEYDATA);
 		} else {					/* Case B */
 			type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
 			    HPAGE_PTYPE(hk) : H_KEYDATA;
 			tdata.flags = 0;
 			F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL);
 
-			if ((ret = __db_ret(hashp->dbp, hcp->pagep,
-			    H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data,
-			    &hcp->big_datalen)) != 0)
+			if ((ret = __db_ret(dbp, hcp->pagep,
+			    H_DATAINDEX(hcp->bndx), &tdata, &dbc->rdata.data,
+			    &dbc->rdata.size)) != 0)
 				goto err;
 
 			/* Now we can delete the item. */
-			if ((ret = __ham_del_pair(hashp, hcp, 0)) != 0) {
-				__db_free(tdata.data);
+			if ((ret = __ham_del_pair(dbc, 0)) != 0) {
+				__os_free(tdata.data, tdata.size);
 				goto err;
 			}
 
 			/* Now shift old data around to make room for new. */
 			if (change > 0) {
-				tdata.data = (void *)__db_realloc(tdata.data,
-				    tdata.size + change);
+				 if ((ret = __os_realloc(&tdata.data,
+				     tdata.size + change)) != 0)
+					return (ret);
 				memset((u_int8_t *)tdata.data + tdata.size,
 				    0, change);
 			}
-			if (tdata.data == NULL)
-				return (ENOMEM);
 			end = (u_int8_t *)tdata.data + tdata.size;
 
 			src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
@@ -902,10 +962,10 @@ __ham_replpair(hashp, hcp, dbt, make_dup)
 			tdata.size += change;
 
 			/* Now add the pair. */
-			ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type);
-			__db_free(tdata.data);
+			ret = __ham_add_el(dbc, &tmp, &tdata, type);
+			__os_free(tdata.data, tdata.size);
 		}
-err:		__db_free(tmp.data);
+err:		__os_free(tmp.data, tmp.size);
 		return (ret);
 	}
 
@@ -921,12 +981,11 @@ err:		__db_free(tmp.data);
 	 * all the parameters here.  Then log the call before moving
 	 * anything around.
 	 */
-	if (DB_LOGGING(hashp->dbp)) {
+	if (DB_LOGGING(dbc)) {
 		old_dbt.data = beg;
 		old_dbt.size = dbt->dlen;
-		if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-		    hashp->dbp->log_fileid, PGNO(hcp->pagep),
+		if ((ret = __ham_replace_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(hcp->pagep),
 		    (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep),
 		    (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0)
 			return (ret);
@@ -934,7 +993,7 @@ err:		__db_free(tmp.data);
 		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
 	}
 
-	__ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize,
+	__ham_onpage_replace(hcp->pagep, dbp->pgsize,
 	    (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt);
 
 	return (0);
@@ -997,13 +1056,15 @@ __ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt)
 }
 
 /*
- * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t));
+ * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
  */
 int
-__ham_split_page(hashp, obucket, nbucket)
-	HTAB *hashp;
+__ham_split_page(dbc, obucket, nbucket)
+	DBC *dbc;
 	u_int32_t obucket, nbucket;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DBT key, page_dbt;
 	DB_ENV *dbenv;
 	DB_LSN new_lsn;
@@ -1014,33 +1075,34 @@ __ham_split_page(hashp, obucket, nbucket)
 	int ret, tret;
 	void *big_buf;
 
-	dbenv = hashp->dbp->dbenv;
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbenv = dbp->dbenv;
 	temp_pagep = old_pagep = new_pagep = NULL;
 
-	bucket_pgno = BUCKET_TO_PAGE(hashp, obucket);
-	if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0)
+	bucket_pgno = BUCKET_TO_PAGE(hcp, obucket);
+	if ((ret = __ham_get_page(dbp, bucket_pgno, &old_pagep)) != 0)
 		return (ret);
-	if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH,
+	if ((ret = __ham_new_page(dbp, BUCKET_TO_PAGE(hcp, nbucket), P_HASH,
 	    &new_pagep)) != 0)
 		goto err;
 
-	temp_pagep = hashp->split_buf;
-	memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize);
+	temp_pagep = hcp->split_buf;
+	memcpy(temp_pagep, old_pagep, hcp->hdr->pagesize);
 
-	if (DB_LOGGING(hashp->dbp)) {
-		page_dbt.size = hashp->hdr->pagesize;
+	if (DB_LOGGING(dbc)) {
+		page_dbt.size = hcp->hdr->pagesize;
 		page_dbt.data = old_pagep;
 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-		    hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep),
-		    &page_dbt, &LSN(old_pagep))) != 0)
+		    dbc->txn, &new_lsn, 0, dbp->log_fileid, SPLITOLD,
+		    PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0)
 			goto err;
 	}
 
-	P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
+	P_INIT(old_pagep, hcp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
 	    PGNO_INVALID, 0, P_HASH);
 
-	if (DB_LOGGING(hashp->dbp))
+	if (DB_LOGGING(dbc))
 		LSN(old_pagep) = new_lsn;	/* Structure assignment. */
 
 	big_len = 0;
@@ -1049,11 +1111,11 @@ __ham_split_page(hashp, obucket, nbucket)
 	while (temp_pagep != NULL) {
 		for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) {
 			if ((ret =
-			    __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n),
+			    __db_ret(dbp, temp_pagep, H_KEYINDEX(n),
 			    &key, &big_buf, &big_len)) != 0)
 				goto err;
 
-			if (__ham_call_hash(hashp, key.data, key.size)
+			if (__ham_call_hash(hcp, key.data, key.size)
 			    == obucket)
 				pp = &old_pagep;
 			else
@@ -1064,59 +1126,59 @@ __ham_split_page(hashp, obucket, nbucket)
 			 * page to store the key/data pair.
 			 */
 
-			len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
+			len = LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
 			    H_DATAINDEX(n)) +
-			    LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
+			    LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
 			    H_KEYINDEX(n)) +
 			    2 * sizeof(db_indx_t);
 
 			if (P_FREESPACE(*pp) < len) {
-				if (DB_LOGGING(hashp->dbp)) {
-					page_dbt.size = hashp->hdr->pagesize;
+				if (DB_LOGGING(dbc)) {
+					page_dbt.size = hcp->hdr->pagesize;
 					page_dbt.data = *pp;
 					if ((ret = __ham_splitdata_log(
-					    dbenv->lg_info,
-					    (DB_TXN *)hashp->dbp->txn,
-					    &new_lsn, 0,
-					    hashp->dbp->log_fileid, SPLITNEW,
-					    PGNO(*pp), &page_dbt,
+					    dbenv->lg_info, dbc->txn,
+					    &new_lsn, 0, dbp->log_fileid,
+					    SPLITNEW, PGNO(*pp), &page_dbt,
 					    &LSN(*pp))) != 0)
 						goto err;
 					LSN(*pp) = new_lsn;
 				}
-				if ((ret = __ham_add_ovflpage(hashp,
-				    *pp, 1, pp)) != 0)
+				if ((ret =
+				    __ham_add_ovflpage(dbc, *pp, 1, pp)) != 0)
 					goto err;
 			}
-			__ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp);
-			__ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp);
+			__ham_copy_item(dbp->pgsize,
+			    temp_pagep, H_KEYINDEX(n), *pp);
+			__ham_copy_item(dbp->pgsize,
+			    temp_pagep, H_DATAINDEX(n), *pp);
 		}
 		next_pgno = NEXT_PGNO(temp_pagep);
 
 		/* Clear temp_page; if it's a link overflow page, free it. */
 		if (PGNO(temp_pagep) != bucket_pgno && (ret =
-		    __ham_del_page(hashp->dbp, temp_pagep)) != 0)
+		    __ham_del_page(dbc, temp_pagep)) != 0)
 			goto err;
 
 		if (next_pgno == PGNO_INVALID)
 			temp_pagep = NULL;
 		else if ((ret =
-		    __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0)
+		    __ham_get_page(dbp, next_pgno, &temp_pagep)) != 0)
 			goto err;
 
-		if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) {
-			page_dbt.size = hashp->hdr->pagesize;
+		if (temp_pagep != NULL && DB_LOGGING(dbc)) {
+			page_dbt.size = hcp->hdr->pagesize;
 			page_dbt.data = temp_pagep;
 			if ((ret = __ham_splitdata_log(dbenv->lg_info,
-			    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-			    hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep),
+			    dbc->txn, &new_lsn, 0, dbp->log_fileid,
+			    SPLITOLD, PGNO(temp_pagep),
 			    &page_dbt, &LSN(temp_pagep))) != 0)
 				goto err;
 			LSN(temp_pagep) = new_lsn;
 		}
 	}
 	if (big_buf != NULL)
-		__db_free(big_buf);
+		__os_free(big_buf, big_len);
 
 	/*
 	 * If the original bucket spanned multiple pages, then we've got
@@ -1124,42 +1186,41 @@ __ham_split_page(hashp, obucket, nbucket)
 	 * should be deleted.
 	 */
 	if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
-	    (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0)
+	    (ret = __ham_del_page(dbc, temp_pagep)) != 0)
 		goto err;
 
 	/*
 	 * Write new buckets out.
 	 */
-	if (DB_LOGGING(hashp->dbp)) {
-		page_dbt.size = hashp->hdr->pagesize;
+	if (DB_LOGGING(dbc)) {
+		page_dbt.size = hcp->hdr->pagesize;
 		page_dbt.data = old_pagep;
 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-		    hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep),
+		   dbc->txn, &new_lsn, 0, dbp->log_fileid,
+		   SPLITNEW, PGNO(old_pagep),
 		    &page_dbt, &LSN(old_pagep))) != 0)
 			goto err;
 		LSN(old_pagep) = new_lsn;
 
 		page_dbt.data = new_pagep;
 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
-		    hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep),
-		    &page_dbt, &LSN(new_pagep))) != 0)
+		    dbc->txn, &new_lsn, 0, dbp->log_fileid,
+		    SPLITNEW, PGNO(new_pagep), &page_dbt, &LSN(new_pagep))) != 0)
 			goto err;
 		LSN(new_pagep) = new_lsn;
 	}
-	ret = __ham_put_page(hashp->dbp, old_pagep, 1);
-	if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 &&
+	ret = __ham_put_page(dbp, old_pagep, 1);
+	if ((tret = __ham_put_page(dbp, new_pagep, 1)) != 0 &&
 	    ret == 0)
 		ret = tret;
 
 	if (0) {
 err:		if (old_pagep != NULL)
-			(void)__ham_put_page(hashp->dbp, old_pagep, 1);
+			(void)__ham_put_page(dbp, old_pagep, 1);
 		if (new_pagep != NULL)
-			(void)__ham_put_page(hashp->dbp, new_pagep, 1);
+			(void)__ham_put_page(dbp, new_pagep, 1);
 		if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
-			(void)__ham_put_page(hashp->dbp, temp_pagep, 1);
+			(void)__ham_put_page(dbp, temp_pagep, 1);
 	}
 	return (ret);
 }
@@ -1171,16 +1232,16 @@ err:		if (old_pagep != NULL)
  * to which we just added something.  This allows us to link overflow
  * pages and return the new page having correctly put the last page.
  *
- * PUBLIC: int __ham_add_el
- * PUBLIC:    __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, int));
+ * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, int));
  */
 int
-__ham_add_el(hashp, hcp, key, val, type)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_add_el(dbc, key, val, type)
+	DBC *dbc;
 	const DBT *key, *val;
 	int type;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	const DBT *pkey, *pdata;
 	DBT key_dbt, data_dbt;
 	DB_LSN new_lsn;
@@ -1190,17 +1251,19 @@ __ham_add_el(hashp, hcp, key, val, type)
 	int do_expand, is_keybig, is_databig, ret;
 	int key_type, data_type;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	do_expand = 0;
 
-	if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp,
+	if (hcp->pagep == NULL && (ret = __ham_get_page(dbp,
 	    hcp->seek_found_page != PGNO_INVALID ?  hcp->seek_found_page :
 	    hcp->pgno, &hcp->pagep)) != 0)
 		return (ret);
 
 	key_size = HKEYDATA_PSIZE(key->size);
 	data_size = HKEYDATA_PSIZE(val->size);
-	is_keybig = ISBIG(hashp, key->size);
-	is_databig = ISBIG(hashp, val->size);
+	is_keybig = ISBIG(hcp, key->size);
+	is_databig = ISBIG(hcp, val->size);
 	if (is_keybig)
 		key_size = HOFFPAGE_PSIZE;
 	if (is_databig)
@@ -1220,7 +1283,7 @@ __ham_add_el(hashp, hcp, key, val, type)
 			break;
 		next_pgno = NEXT_PGNO(hcp->pagep);
 		if ((ret =
-		    __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0)
+		    __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
 			return (ret);
 	}
 
@@ -1229,7 +1292,7 @@ __ham_add_el(hashp, hcp, key, val, type)
 	 */
 	if (P_FREESPACE(hcp->pagep) < pairsize) {
 		do_expand = 1;
-		if ((ret = __ham_add_ovflpage(hashp,
+		if ((ret = __ham_add_ovflpage(dbc,
 		    hcp->pagep, 1, &hcp->pagep)) !=  0)
 			return (ret);
 		hcp->pgno = PGNO(hcp->pagep);
@@ -1241,10 +1304,13 @@ __ham_add_el(hashp, hcp, key, val, type)
 	hcp->bndx = H_NUMPAIRS(hcp->pagep);
 	F_CLR(hcp, H_DELETED);
 	if (is_keybig) {
-		if ((ret = __db_poff(hashp->dbp,
+		koff.type = H_OFFPAGE;
+		UMRW(koff.unused[0]);
+		UMRW(koff.unused[1]);
+		UMRW(koff.unused[2]);
+		if ((ret = __db_poff(dbc,
 		    key, &koff.pgno, __ham_overflow_page)) != 0)
 			return (ret);
-		koff.type = H_OFFPAGE;
 		koff.tlen = key->size;
 		key_dbt.data = &koff;
 		key_dbt.size = sizeof(koff);
@@ -1256,10 +1322,13 @@ __ham_add_el(hashp, hcp, key, val, type)
 	}
 
 	if (is_databig) {
-		if ((ret = __db_poff(hashp->dbp,
+		doff.type = H_OFFPAGE;
+		UMRW(doff.unused[0]);
+		UMRW(doff.unused[1]);
+		UMRW(doff.unused[2]);
+		if ((ret = __db_poff(dbc,
 		    val, &doff.pgno, __ham_overflow_page)) != 0)
 			return (ret);
-		doff.type = H_OFFPAGE;
 		doff.tlen = val->size;
 		data_dbt.data = &doff;
 		data_dbt.size = sizeof(doff);
@@ -1270,16 +1339,16 @@ __ham_add_el(hashp, hcp, key, val, type)
 		data_type = type;
 	}
 
-	if (DB_LOGGING(hashp->dbp)) {
+	if (DB_LOGGING(dbc)) {
 		rectype = PUTPAIR;
 		if (is_databig)
 			rectype |= PAIR_DATAMASK;
 		if (is_keybig)
 			rectype |= PAIR_KEYMASK;
 
-		if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
-		    hashp->dbp->log_fileid, PGNO(hcp->pagep),
+		if ((ret = __ham_insdel_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, rectype,
+		    dbp->log_fileid, PGNO(hcp->pagep),
 		    (u_int32_t)H_NUMPAIRS(hcp->pagep),
 		    &LSN(hcp->pagep), pkey, pdata)) != 0)
 			return (ret);
@@ -1303,11 +1372,11 @@ __ham_add_el(hashp, hcp, key, val, type)
 	/*
 	 * XXX Maybe keep incremental numbers here
 	 */
-	if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
-		hashp->hdr->nelem++;
+	if (!F_ISSET(dbp, DB_AM_LOCKING))
+		hcp->hdr->nelem++;
 
-	if (do_expand || (hashp->hdr->ffactor != 0 &&
-	    (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor))
+	if (do_expand || (hcp->hdr->ffactor != 0 &&
+	    (u_int32_t)H_NUMPAIRS(hcp->pagep) > hcp->hdr->ffactor))
 		F_SET(hcp, H_EXPAND);
 	return (0);
 }
@@ -1319,11 +1388,11 @@ __ham_add_el(hashp, hcp, key, val, type)
  * H_DUPLICATE, H_OFFDUP).  Since we log splits at a high level, we
  * do not need to do any logging here.
  *
- * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, u_int32_t, PAGE *));
+ * PUBLIC: void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *));
  */
 void
-__ham_copy_item(hashp, src_page, src_ndx, dest_page)
-	HTAB *hashp;
+__ham_copy_item(pgsize, src_page, src_ndx, dest_page)
+	size_t pgsize;
 	PAGE *src_page;
 	u_int32_t src_ndx;
 	PAGE *dest_page;
@@ -1337,7 +1406,7 @@ __ham_copy_item(hashp, src_page, src_ndx, dest_page)
 	src = P_ENTRY(src_page, src_ndx);
 
 	/* Set up space on dest. */
-	len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx);
+	len = LEN_HITEM(src_page, pgsize, src_ndx);
 	HOFFSET(dest_page) -= len;
 	dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
 	dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
@@ -1352,29 +1421,31 @@ __ham_copy_item(hashp, src_page, src_ndx, dest_page)
  *      pointer on success
  *      NULL on error
  *
- * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
+ * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **));
  */
 int
-__ham_add_ovflpage(hashp, pagep, release, pp)
-	HTAB *hashp;
+__ham_add_ovflpage(dbc, pagep, release, pp)
+	DBC *dbc;
 	PAGE *pagep;
 	int release;
 	PAGE **pp;
 {
-	DB_ENV *dbenv;
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DB_LSN new_lsn;
 	PAGE *new_pagep;
 	int ret;
 
-	dbenv = hashp->dbp->dbenv;
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0)
+	if ((ret = __ham_overflow_page(dbc, P_HASH, &new_pagep)) != 0)
 		return (ret);
 
-	if (DB_LOGGING(hashp->dbp)) {
-		if ((ret = __ham_newpage_log(dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL,
-		    hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep),
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __ham_newpage_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, PUTOVFL,
+		    dbp->log_fileid, PGNO(pagep), &LSN(pagep),
 		    PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
 			return (ret);
 
@@ -1385,78 +1456,76 @@ __ham_add_ovflpage(hashp, pagep, release, pp)
 	PREV_PGNO(new_pagep) = PGNO(pagep);
 
 	if (release)
-		ret = __ham_put_page(hashp->dbp, pagep, 1);
+		ret = __ham_put_page(dbp, pagep, 1);
 
-	hashp->hash_overflows++;
+	hcp->stats.hash_overflows++;
 	*pp = new_pagep;
 	return (ret);
 }
 
 
 /*
- * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
+ * PUBLIC: int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **));
  */
 int
-__ham_new_page(hashp, addr, type, pp)
-	HTAB *hashp;
+__ham_new_page(dbp, addr, type, pp)
+	DB *dbp;
 	u_int32_t addr, type;
 	PAGE **pp;
 {
 	PAGE *pagep;
 	int ret;
 
-	if ((ret = memp_fget(hashp->dbp->mpf,
+	if ((ret = memp_fget(dbp->mpf,
 	    &addr, DB_MPOOL_CREATE, &pagep)) != 0)
 		return (ret);
 
-#ifdef DEBUG_SLOW
-	__account_page(hashp, addr, 1);
-#endif
 	/* This should not be necessary because page-in should do it. */
-	P_INIT(pagep,
-	    hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
+	P_INIT(pagep, dbp->pgsize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
 
 	*pp = pagep;
 	return (0);
 }
 
 /*
- * PUBLIC: int __ham_del_page __P((DB *, PAGE *));
+ * PUBLIC: int __ham_del_page __P((DBC *, PAGE *));
  */
 int
-__ham_del_page(dbp, pagep)
-	DB *dbp;
+__ham_del_page(dbc, pagep)
+	DBC *dbc;
 	PAGE *pagep;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DB_LSN new_lsn;
-	HTAB *hashp;
 	int ret;
 
-	hashp = (HTAB *)dbp->internal;
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	ret = 0;
-	DIRTY_META(hashp, ret);
+	DIRTY_META(dbp, hcp, ret);
 	if (ret != 0) {
 		if (ret != EAGAIN)
-			__db_err(hashp->dbp->dbenv,
+			__db_err(dbp->dbenv,
 			    "free_ovflpage: unable to lock meta data page %s\n",
 			    strerror(ret));
 		/*
 		 * If we are going to return an error, then we should free
 		 * the page, so it doesn't stay pinned forever.
 		 */
-		(void)__ham_put_page(hashp->dbp, pagep, 0);
+		(void)__ham_put_page(dbp, pagep, 0);
 		return (ret);
 	}
 
-	if (DB_LOGGING(hashp->dbp)) {
-		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO,
-		    hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed,
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, DELPGNO,
+		    dbp->log_fileid, PGNO(pagep), hcp->hdr->last_freed,
 		    (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
-		    &LSN(pagep), &hashp->hdr->lsn)) != 0)
+		    &LSN(pagep), &hcp->hdr->lsn)) != 0)
 			return (ret);
 
-		hashp->hdr->lsn = new_lsn;
+		hcp->hdr->lsn = new_lsn;
 		LSN(pagep) = new_lsn;
 	}
 
@@ -1466,16 +1535,16 @@ __ham_del_page(dbp, pagep)
 		DB_LSN __lsn;
 		__pgno = pagep->pgno;
 		__lsn = pagep->lsn;
-		memset(pagep, 0xff, dbp->pgsize);
+		memset(pagep, 0xdb, dbp->pgsize);
 		pagep->pgno = __pgno;
 		pagep->lsn = __lsn;
 	}
 #endif
 	TYPE(pagep) = P_INVALID;
-	NEXT_PGNO(pagep) = hashp->hdr->last_freed;
-	hashp->hdr->last_freed = PGNO(pagep);
+	NEXT_PGNO(pagep) = hcp->hdr->last_freed;
+	hcp->hdr->last_freed = PGNO(pagep);
 
-	return (__ham_put_page(hashp->dbp, pagep, 1));
+	return (__ham_put_page(dbp, pagep, 1));
 }
 
 
@@ -1489,8 +1558,7 @@ __ham_put_page(dbp, pagep, is_dirty)
 	int32_t is_dirty;
 {
 #ifdef DEBUG_SLOW
-	__account_page((HTAB *)dbp->cookie,
-	    ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
+	__account_page(dbp, ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
 #endif
 	return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
 }
@@ -1499,14 +1567,14 @@ __ham_put_page(dbp, pagep, is_dirty)
  * __ham_dirty_page --
  *	Mark a page dirty.
  *
- * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *));
+ * PUBLIC: int __ham_dirty_page __P((DB *, PAGE *));
  */
 int
-__ham_dirty_page(hashp, pagep)
-	HTAB *hashp;
+__ham_dirty_page(dbp, pagep)
+	DB *dbp;
 	PAGE *pagep;
 {
-	return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY));
+	return (memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY));
 }
 
 /*
@@ -1523,31 +1591,33 @@ __ham_get_page(dbp, addr, pagep)
 	ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
 #ifdef DEBUG_SLOW
 	if (*pagep != NULL)
-		__account_page((HTAB *)dbp->internal, addr, 1);
+		__account_page(dbp, addr, 1);
 #endif
 	return (ret);
 }
 
 /*
- * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
+ * PUBLIC: int __ham_overflow_page
+ * PUBLIC:     __P((DBC *, u_int32_t, PAGE **));
  */
 int
-__ham_overflow_page(dbp, type, pp)
-	DB *dbp;
+__ham_overflow_page(dbc, type, pp)
+	DBC *dbc;
 	u_int32_t type;
 	PAGE **pp;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DB_LSN *lsnp, new_lsn;
-	HTAB *hashp;
 	PAGE *p;
 	db_pgno_t new_addr, next_free, newalloc_flag;
 	u_int32_t offset, splitnum;
 	int ret;
 
-	hashp = (HTAB *)dbp->internal;
-
 	ret = 0;
-	DIRTY_META(hashp, ret);
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	DIRTY_META(dbp, hcp, ret);
 	if (ret != 0)
 		return (ret);
 
@@ -1558,22 +1628,22 @@ __ham_overflow_page(dbp, type, pp)
 	 * after the log do we get to complete allocation of the
 	 * new page.
 	 */
-	new_addr = hashp->hdr->last_freed;
+	new_addr = hcp->hdr->last_freed;
 	if (new_addr != PGNO_INVALID) {
-		if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0)
+		if ((ret = __ham_get_page(dbp, new_addr, &p)) != 0)
 			return (ret);
 		next_free = NEXT_PGNO(p);
 		lsnp = &LSN(p);
 		newalloc_flag = 0;
 	} else {
-		splitnum = hashp->hdr->ovfl_point;
-		hashp->hdr->spares[splitnum]++;
-		offset = hashp->hdr->spares[splitnum] -
-		    (splitnum ? hashp->hdr->spares[splitnum - 1] : 0);
-		new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset);
-		if (new_addr > MAX_PAGES(hashp)) {
-			__db_err(hashp->dbp->dbenv, "hash: out of file pages");
-			hashp->hdr->spares[splitnum]--;
+		splitnum = hcp->hdr->ovfl_point;
+		hcp->hdr->spares[splitnum]++;
+		offset = hcp->hdr->spares[splitnum] -
+		    (splitnum ? hcp->hdr->spares[splitnum - 1] : 0);
+		new_addr = PGNO_OF(hcp, hcp->hdr->ovfl_point, offset);
+		if (new_addr > MAX_PAGES(hcp)) {
+			__db_err(dbp->dbenv, "hash: out of file pages");
+			hcp->hdr->spares[splitnum]--;
 			return (ENOMEM);
 		}
 		next_free = PGNO_INVALID;
@@ -1582,29 +1652,29 @@ __ham_overflow_page(dbp, type, pp)
 		newalloc_flag = 1;
 	}
 
-	if (DB_LOGGING(hashp->dbp)) {
-		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO,
-		    hashp->dbp->log_fileid, new_addr, next_free,
-		    0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0)
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, ALLOCPGNO,
+		    dbp->log_fileid, new_addr, next_free,
+		    0, newalloc_flag, type, lsnp, &hcp->hdr->lsn)) != 0)
 			return (ret);
 
-		hashp->hdr->lsn = new_lsn;
+		hcp->hdr->lsn = new_lsn;
 		if (lsnp != NULL)
 			*lsnp = new_lsn;
 	}
 
 	if (p != NULL) {
 		/* We just took something off the free list, initialize it. */
-		hashp->hdr->last_freed = next_free;
-		P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID,
+		hcp->hdr->last_freed = next_free;
+		P_INIT(p, hcp->hdr->pagesize, PGNO(p), PGNO_INVALID,
 		    PGNO_INVALID, 0, (u_int8_t)type);
 	} else {
 		/* Get the new page. */
-		if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0)
+		if ((ret = __ham_new_page(dbp, new_addr, type, &p)) != 0)
 			return (ret);
 	}
-	if (DB_LOGGING(hashp->dbp))
+	if (DB_LOGGING(dbc))
 		LSN(p) = new_lsn;
 
 	*pp = p;
@@ -1614,94 +1684,123 @@ __ham_overflow_page(dbp, type, pp)
 #ifdef DEBUG
 /*
  * PUBLIC: #ifdef DEBUG
- * PUBLIC: db_pgno_t __bucket_to_page __P((HTAB *, db_pgno_t));
+ * PUBLIC: db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t));
  * PUBLIC: #endif
  */
 db_pgno_t
-__bucket_to_page(hashp, n)
-	HTAB *hashp;
+__bucket_to_page(hcp, n)
+	HASH_CURSOR *hcp;
 	db_pgno_t n;
 {
 	int ret_val;
 
 	ret_val = n + 1;
 	if (n != 0)
-		ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1];
+		ret_val += hcp->hdr->spares[__db_log2(n + 1) - 1];
 	return (ret_val);
 }
 #endif
 
 /*
  * Create a bunch of overflow pages at the current split point.
- * PUBLIC: void __ham_init_ovflpages __P((HTAB *));
+ * PUBLIC: void __ham_init_ovflpages __P((DBC *));
  */
 void
-__ham_init_ovflpages(hp)
-	HTAB *hp;
+__ham_init_ovflpages(dbc)
+	DBC *dbc;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	DB_LSN new_lsn;
 	PAGE *p;
 	db_pgno_t last_pgno, new_pgno;
 	u_int32_t i, curpages, numpages;
 
-	curpages = hp->hdr->spares[hp->hdr->ovfl_point] -
-	    hp->hdr->spares[hp->hdr->ovfl_point - 1];
-	numpages = hp->hdr->ovfl_point + 1 - curpages;
-
-	last_pgno = hp->hdr->last_freed;
-	new_pgno = PGNO_OF(hp, hp->hdr->ovfl_point, curpages + 1);
-	if (DB_LOGGING(hp->dbp)) {
-		(void)__ham_ovfl_log(hp->dbp->dbenv->lg_info,
-		    (DB_TXN *)hp->dbp->txn, &new_lsn, 0,
-		    hp->dbp->log_fileid, new_pgno,
-		    numpages, last_pgno, hp->hdr->ovfl_point, &hp->hdr->lsn);
-		hp->hdr->lsn = new_lsn;
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	curpages = hcp->hdr->spares[hcp->hdr->ovfl_point] -
+	    hcp->hdr->spares[hcp->hdr->ovfl_point - 1];
+	numpages = hcp->hdr->ovfl_point + 1 - curpages;
+
+	last_pgno = hcp->hdr->last_freed;
+	new_pgno = PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + 1);
+	if (DB_LOGGING(dbc)) {
+		(void)__ham_ovfl_log(dbp->dbenv->lg_info,
+		    dbc->txn, &new_lsn, 0, dbp->log_fileid, new_pgno,
+		    numpages, last_pgno, hcp->hdr->ovfl_point, &hcp->hdr->lsn);
+		hcp->hdr->lsn = new_lsn;
 	} else
 		ZERO_LSN(new_lsn);
 
-	hp->hdr->spares[hp->hdr->ovfl_point] += numpages;
+	hcp->hdr->spares[hcp->hdr->ovfl_point] += numpages;
 	for (i = numpages; i > 0; i--) {
-		if (__ham_new_page(hp,
-		    PGNO_OF(hp, hp->hdr->ovfl_point, curpages + i),
+		if (__ham_new_page(dbp,
+		    PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + i),
 		    P_INVALID, &p) != 0)
 			break;
 		LSN(p) = new_lsn;
 		NEXT_PGNO(p) = last_pgno;
 		last_pgno = PGNO(p);
-		(void)__ham_put_page(hp->dbp, p, 1);
+		(void)__ham_put_page(dbp, p, 1);
 	}
-	hp->hdr->last_freed = last_pgno;
+	hcp->hdr->last_freed = last_pgno;
 }
 
 /*
- * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t));
  */
 int
-__ham_get_cpage(hashp, hcp, mode)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_get_cpage(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	int ret;
 
-	if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) &&
-	    (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0)
-		return (ret);
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	/*
+	 * There are three cases with respect to buckets and locks.  If no
+	 * lock is held and we are locking, acquire the bucket lock.  If a
+	 * lock is held for the current bucket, there is nothing to do.  If
+	 * a lock is held for a different bucket, release it (subject to the
+	 * checks below) and acquire the lock for the current bucket.
+	 */
+	if (F_ISSET(dbp, DB_AM_LOCKING)) {
+		if (hcp->lock != 0 && hcp->lbucket != hcp->bucket) {
+			/*
+			 * If this is the original lock, don't release it,
+			 * because we may need to restore it upon exit.
+			 */
+			if (dbc->txn == NULL &&
+			    !F_ISSET(hcp, H_ORIGINAL) && (ret =
+			    lock_put(dbp->dbenv->lk_info, hcp->lock)) != 0)
+				return (ret);
+			F_CLR(hcp, H_ORIGINAL);
+			hcp->lock = 0;
+		}
+		if (hcp->lock == 0 && (ret = __ham_lock_bucket(dbc, mode)) != 0)
+			return (ret);
+		hcp->lbucket = hcp->bucket;
+	}
 
 	if (hcp->pagep == NULL) {
 		if (hcp->pgno == PGNO_INVALID) {
-			hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket);
+			hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
 			hcp->bndx = 0;
 		}
 
 		if ((ret =
-		    __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0)
+		    __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
 			return (ret);
 	}
 
 	if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
 		if ((ret =
-		    __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
+		    __ham_get_page(dbp, hcp->dpgno, &hcp->dpagep)) != 0)
 			return (ret);
 	return (0);
 }
@@ -1711,28 +1810,30 @@ __ham_get_cpage(hashp, hcp, mode)
  * If the flag is set to H_ISDUP, then we are talking about the
  * duplicate page, not the main page.
  *
- * PUBLIC: int __ham_next_cpage
- * PUBLIC:    __P((HTAB *, HASH_CURSOR *, db_pgno_t, int, u_int32_t));
+ * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t));
  */
 int
-__ham_next_cpage(hashp, hcp, pgno, dirty, flags)
-	HTAB *hashp;
-	HASH_CURSOR *hcp;
+__ham_next_cpage(dbc, pgno, dirty, flags)
+	DBC *dbc;
 	db_pgno_t pgno;
 	int dirty;
 	u_int32_t flags;
 {
+	DB *dbp;
+	HASH_CURSOR *hcp;
 	PAGE *p;
 	int ret;
 
+	dbp = dbc->dbp;
+	hcp = (HASH_CURSOR *)dbc->internal;
 	if (LF_ISSET(H_ISDUP) && hcp->dpagep != NULL &&
-	    (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0)
+	    (ret = __ham_put_page(dbp, hcp->dpagep, dirty)) != 0)
 		return (ret);
 	else if (!LF_ISSET(H_ISDUP) && hcp->pagep != NULL &&
-	    (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0)
+	    (ret = __ham_put_page(dbp, hcp->pagep, dirty)) != 0)
 		return (ret);
 
-	if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0)
+	if ((ret = __ham_get_page(dbp, pgno, &p)) != 0)
 		return (ret);
 
 	if (LF_ISSET(H_ISDUP)) {
@@ -1753,22 +1854,21 @@ __ham_next_cpage(hashp, hcp, pgno, dirty, flags)
  *	Get the lock on a particular bucket.
  */
 static int
-__ham_lock_bucket(dbp, hcp, mode)
-	DB *dbp;
-	HASH_CURSOR *hcp;
+__ham_lock_bucket(dbc, mode)
+	DBC *dbc;
 	db_lockmode_t mode;
 {
+	HASH_CURSOR *hcp;
 	int ret;
 
-	/*
-	 * What a way to trounce on the memory system.  It might be
-	 * worth copying the lk_info into the hashp.
-	 */
-	ret = 0;
-	dbp->lock.pgno = (db_pgno_t)(hcp->bucket);
-	ret = lock_get(dbp->dbenv->lk_info,
-	    dbp->txn == NULL ?  dbp->locker : dbp->txn->txnid, 0,
-	    &dbp->lock_dbt, mode, &hcp->lock);
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbc->lock.pgno = (db_pgno_t)(hcp->bucket);
+	if (dbc->txn == NULL)
+		ret = lock_get(dbc->dbp->dbenv->lk_info, dbc->locker, 0,
+		    &dbc->lock_dbt, mode, &hcp->lock);
+	else
+		ret = lock_tget(dbc->dbp->dbenv->lk_info, dbc->txn, 0,
+		    &dbc->lock_dbt, mode, &hcp->lock);
 
 	return (ret < 0 ? EAGAIN : ret);
 }
@@ -1827,45 +1927,3 @@ __ham_dpair(dbp, p, pndx)
 	HOFFSET(p) = HOFFSET(p) + delta;
 	NUM_ENT(p) = NUM_ENT(p) - 2;
 }
-
-#ifdef DEBUG_SLOW
-static void
-__account_page(hashp, pgno, inout)
-	HTAB *hashp;
-	db_pgno_t pgno;
-	int inout;
-{
-	static struct {
-		db_pgno_t pgno;
-		int times;
-	} list[100];
-	static int last;
-	int i, j;
-
-	if (inout == -1)			/* XXX: Kluge */
-		inout = 0;
-
-	/* Find page in list. */
-	for (i = 0; i < last; i++)
-		if (list[i].pgno == pgno)
-			break;
-	/* Not found. */
-	if (i == last) {
-		list[last].times = inout;
-		list[last].pgno = pgno;
-		last++;
-	}
-	list[i].times = inout;
-	if (list[i].times == 0) {
-		for (j = i; j < last; j++)
-			list[j] = list[j + 1];
-		last--;
-	}
-	for (i = 0; i < last; i++, list[i].times++)
-		if (list[i].times > 20 &&
-		    !__is_bitmap_pgno(hashp, list[i].pgno))
-			(void)fprintf(stderr,
-			    "Warning: pg %lu has been out for %d times\n",
-			    (u_long)list[i].pgno, list[i].times);
-}
-#endif /* DEBUG_SLOW */
diff --git a/db2/hash/hash_rec.c b/db2/hash/hash_rec.c
index 727f615828..b58f2c6eb7 100644
--- a/db2/hash/hash_rec.c
+++ b/db2/hash/hash_rec.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)hash_rec.c	10.19 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)hash_rec.c	10.22 (Sleepycat) 10/21/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -80,17 +80,19 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_insdel_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	u_int32_t op;
 	int cmp_n, cmp_p, getmeta, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_insdel_print);
 	REC_INTRO(__ham_insdel_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
 	ret = memp_fget(mpf, &argp->pgno, 0, &pagep);
 	if (ret != 0) {
@@ -101,16 +103,15 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else if ((ret = memp_fget(mpf, &argp->pgno,
 		    DB_MPOOL_CREATE, &pagep)) != 0)
 			goto out;
 	}
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, hcp, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	cmp_n = log_compare(lsnp, &LSN(pagep));
@@ -144,7 +145,7 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info)
 			    !redo || PAIR_ISDATABIG(argp->opcode) ?
 			    H_OFFPAGE : H_KEYDATA);
 		} else
-			(void) __ham_reputpair(pagep, hashp->hdr->pagesize,
+			(void) __ham_reputpair(pagep, hcp->hdr->pagesize,
 			    argp->ndx, &argp->key, &argp->data);
 
 		LSN(pagep) = redo ? *lsnp : argp->pagelsn;
@@ -163,10 +164,11 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info)
 			goto out;
 
 	/* Return the previous LSN. */
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -187,16 +189,18 @@ __ham_newpage_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_newpage_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	int cmp_n, cmp_p, change, getmeta, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_newpage_print);
 	REC_INTRO(__ham_newpage_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
 	ret = memp_fget(mpf, &argp->new_pgno, 0, &pagep);
 	if (ret != 0) {
@@ -214,8 +218,9 @@ __ham_newpage_recover(logp, dbtp, lsnp, redo, info)
 			goto out;
 	}
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	/*
@@ -289,11 +294,13 @@ ppage:	if (argp->prev_pgno != PGNO_INVALID) {
 		}
 
 		if (!change) {
-			if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0)
+			if ((ret =
+			    __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0)
 				goto out;
 		} else {
 			LSN(pagep) = redo ? *lsnp : argp->prevlsn;
-			if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0)
+			if ((ret =
+			    __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0)
 				goto out;
 		}
 	}
@@ -310,9 +317,7 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 				 * so we would not have to undo anything.  In
 				 * this case, don't bother creating a page.
 				 */
-				*lsnp = argp->prev_lsn;
-				ret = 0;
-				goto out;
+				goto done;
 			} else if ((ret =
 			    memp_fget(mpf, &argp->next_pgno,
 			    DB_MPOOL_CREATE, &pagep)) != 0)
@@ -346,10 +351,11 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 				goto out;
 		}
 	}
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -372,19 +378,21 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_replace_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
 	DBT dbt;
-	HTAB *hashp;
 	PAGE *pagep;
 	int32_t grow;
 	int change, cmp_n, cmp_p, getmeta, ret;
 	u_int8_t *hk;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_replace_print);
 	REC_INTRO(__ham_replace_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
 	ret = memp_fget(mpf, &argp->pgno, 0, &pagep);
 	if (ret != 0) {
@@ -395,16 +403,15 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else if ((ret = memp_fget(mpf, &argp->pgno,
 		    DB_MPOOL_CREATE, &pagep)) != 0)
 			goto out;
 	}
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	cmp_n = log_compare(lsnp, &LSN(pagep));
@@ -444,10 +451,11 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info)
 	if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0)
 		goto out;
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -468,19 +476,22 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_newpgno_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	int change, cmp_n, cmp_p, getmeta, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_newpgno_print);
 	REC_INTRO(__ham_newpgno_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	/*
@@ -488,34 +499,34 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info)
 	 * to update the meta data; then we need to update the page.
 	 * We'll do the meta-data first.
 	 */
-	cmp_n = log_compare(lsnp, &hashp->hdr->lsn);
-	cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn);
+	cmp_n = log_compare(lsnp, &hcp->hdr->lsn);
+	cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn);
 
 	change = 0;
 	if ((cmp_p == 0 && redo && argp->opcode == ALLOCPGNO) ||
 	    (cmp_n == 0 && !redo && argp->opcode == DELPGNO)) {
 		/* Need to redo an allocation or undo a deletion. */
-		hashp->hdr->last_freed = argp->free_pgno;
+		hcp->hdr->last_freed = argp->free_pgno;
 		if (redo && argp->old_pgno != 0) /* Must be ALLOCPGNO */
-			hashp->hdr->spares[hashp->hdr->ovfl_point]++;
+			hcp->hdr->spares[hcp->hdr->ovfl_point]++;
 		change = 1;
 	} else if (cmp_p == 0 && redo && argp->opcode == DELPGNO) {
 		/* Need to redo a deletion */
-		hashp->hdr->last_freed = argp->pgno;
+		hcp->hdr->last_freed = argp->pgno;
 		change = 1;
 	} else if (cmp_n == 0 && !redo && argp->opcode == ALLOCPGNO) {
 		/* undo an allocation. */
 		if (argp->old_pgno == 0)
-			hashp->hdr->last_freed = argp->pgno;
+			hcp->hdr->last_freed = argp->pgno;
 		else {
-			hashp->hdr->spares[hashp->hdr->ovfl_point]--;
-			hashp->hdr->last_freed = 0;
+			hcp->hdr->spares[hcp->hdr->ovfl_point]--;
+			hcp->hdr->last_freed = 0;
 		}
 		change = 1;
 	}
 	if (change) {
-		hashp->hdr->lsn = redo ? *lsnp : argp->metalsn;
-		F_SET(file_dbp, DB_HS_DIRTYMETA);
+		hcp->hdr->lsn = redo ? *lsnp : argp->metalsn;
+		F_SET(hcp, H_DIRTY);
 	}
 
 
@@ -530,9 +541,7 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else if ((ret = memp_fget(mpf, &argp->pgno,
 		    DB_MPOOL_CREATE, &pagep)) != 0)
 			goto out;
@@ -565,10 +574,11 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info)
 	if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0)
 		goto out;
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 
 }
@@ -590,19 +600,22 @@ __ham_splitmeta_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_splitmeta_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	int change, cmp_n, cmp_p, getmeta, ret;
 	u_int32_t pow;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_splitmeta_print);
 	REC_INTRO(__ham_splitmeta_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	/*
@@ -610,43 +623,45 @@ __ham_splitmeta_recover(logp, dbtp, lsnp, redo, info)
 	 * to update the meta data; then we need to update the page.
 	 * We'll do the meta-data first.
 	 */
-	cmp_n = log_compare(lsnp, &hashp->hdr->lsn);
-	cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn);
+	cmp_n = log_compare(lsnp, &hcp->hdr->lsn);
+	cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn);
 
 	change = 0;
 	if (cmp_p == 0 && redo) {
 		/* Need to redo the split information. */
-		hashp->hdr->max_bucket = argp->bucket + 1;
-		pow = __db_log2(hashp->hdr->max_bucket + 1);
-		if (pow > hashp->hdr->ovfl_point) {
-			hashp->hdr->spares[pow] =
-				hashp->hdr->spares[hashp->hdr->ovfl_point];
-			hashp->hdr->ovfl_point = pow;
+		hcp->hdr->max_bucket = argp->bucket + 1;
+		pow = __db_log2(hcp->hdr->max_bucket + 1);
+		if (pow > hcp->hdr->ovfl_point) {
+			hcp->hdr->spares[pow] =
+				hcp->hdr->spares[hcp->hdr->ovfl_point];
+			hcp->hdr->ovfl_point = pow;
 		}
-		if (hashp->hdr->max_bucket > hashp->hdr->high_mask) {
-			hashp->hdr->low_mask = hashp->hdr->high_mask;
-			hashp->hdr->high_mask =
-			    hashp->hdr->max_bucket | hashp->hdr->low_mask;
+		if (hcp->hdr->max_bucket > hcp->hdr->high_mask) {
+			hcp->hdr->low_mask = hcp->hdr->high_mask;
+			hcp->hdr->high_mask =
+			    hcp->hdr->max_bucket | hcp->hdr->low_mask;
 		}
 		change = 1;
 	} else if (cmp_n == 0 && !redo) {
 		/* Need to undo the split information. */
-		hashp->hdr->max_bucket = argp->bucket;
-		hashp->hdr->ovfl_point = argp->ovflpoint;
-		hashp->hdr->spares[hashp->hdr->ovfl_point] = argp->spares;
-		pow = 1 << __db_log2(hashp->hdr->max_bucket + 1);
-		hashp->hdr->high_mask = pow - 1;
-		hashp->hdr->low_mask = (pow >> 1) - 1;
+		hcp->hdr->max_bucket = argp->bucket;
+		hcp->hdr->ovfl_point = argp->ovflpoint;
+		hcp->hdr->spares[hcp->hdr->ovfl_point] = argp->spares;
+		pow = 1 << __db_log2(hcp->hdr->max_bucket + 1);
+		hcp->hdr->high_mask = pow - 1;
+		hcp->hdr->low_mask = (pow >> 1) - 1;
 		change = 1;
 	}
 	if (change) {
-		hashp->hdr->lsn = redo ? *lsnp : argp->metalsn;
-		F_SET(file_dbp, DB_HS_DIRTYMETA);
+		hcp->hdr->lsn = redo ? *lsnp : argp->metalsn;
+		F_SET(hcp, H_DIRTY);
 	}
-	*lsnp = argp->prev_lsn;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -665,16 +680,18 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_splitdata_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	int change, cmp_n, cmp_p, getmeta, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_splitdata_print);
 	REC_INTRO(__ham_splitdata_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
 	ret = memp_fget(mpf, &argp->pgno, 0, &pagep);
 	if (ret != 0) {
@@ -685,16 +702,15 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else if ((ret = memp_fget(mpf, &argp->pgno,
 		    DB_MPOOL_CREATE, &pagep)) != 0)
 			goto out;
 	}
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
 	cmp_n = log_compare(lsnp, &LSN(pagep));
@@ -732,10 +748,11 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info)
 	if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0)
 		goto out;
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -755,50 +772,52 @@ __ham_ovfl_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_ovfl_args *argp;
-	DB *mdbp, *file_dbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	db_pgno_t max_pgno, pgno;
 	int cmp_n, cmp_p, getmeta, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_ovfl_print);
 	REC_INTRO(__ham_ovfl_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 
-	cmp_n = log_compare(lsnp, &hashp->hdr->lsn);
-	cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn);
+	cmp_n = log_compare(lsnp, &hcp->hdr->lsn);
+	cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn);
 
 	if (cmp_p == 0 && redo) {
 		/* Redo the allocation. */
-		hashp->hdr->last_freed = argp->start_pgno;
-		hashp->hdr->spares[argp->ovflpoint] += argp->npages;
-		hashp->hdr->lsn = *lsnp;
-		F_SET(file_dbp, DB_HS_DIRTYMETA);
+		hcp->hdr->last_freed = argp->start_pgno;
+		hcp->hdr->spares[argp->ovflpoint] += argp->npages;
+		hcp->hdr->lsn = *lsnp;
+		F_SET(hcp, H_DIRTY);
 	} else if (cmp_n == 0 && !redo) {
-		hashp->hdr->last_freed = argp->free_pgno;
-		hashp->hdr->spares[argp->ovflpoint] -= argp->npages;
-		hashp->hdr->lsn = argp->metalsn;
-		F_SET(file_dbp, DB_HS_DIRTYMETA);
+		hcp->hdr->last_freed = argp->free_pgno;
+		hcp->hdr->spares[argp->ovflpoint] -= argp->npages;
+		hcp->hdr->lsn = argp->metalsn;
+		F_SET(hcp, H_DIRTY);
 	}
 
 	max_pgno = argp->start_pgno + argp->npages - 1;
 	ret = 0;
 	for (pgno = argp->start_pgno; pgno <= max_pgno; pgno++) {
-		ret = memp_fget(mpf, &pgno, 0, &pagep);
-		if (ret != 0) {
-			if (redo && (ret = memp_fget(mpf, &pgno,
-			    DB_MPOOL_CREATE, &pagep)) != 0)
-				goto out;
-			else if (!redo) {
-				(void)__ham_put_page(file_dbp, pagep, 0);
+		if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) {
+			if (!redo) {
+				ret = 0;
 				continue;
 			}
+			if ((ret = memp_fget(mpf,
+			    &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+				goto out;
 		}
 		if (redo && log_compare((const DB_LSN *)lsnp,
 		    (const DB_LSN *)&LSN(pagep)) > 0) {
@@ -816,9 +835,11 @@ __ham_ovfl_recover(logp, dbtp, lsnp, redo, info)
 			goto out;
 	}
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
 
@@ -838,19 +859,22 @@ __ham_copypage_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__ham_copypage_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	DB_MPOOLFILE *mpf;
-	HTAB *hashp;
 	PAGE *pagep;
 	int cmp_n, cmp_p, getmeta, modified, ret;
 
 	getmeta = 0;
-	hashp = NULL;				/* XXX: shut the compiler up. */
+	hcp = NULL;
 	REC_PRINT(__ham_copypage_print);
 	REC_INTRO(__ham_copypage_read);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	hashp = (HTAB *)file_dbp->internal;
-	GET_META(file_dbp, hashp);
+	GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret);
+	if (ret != 0)
+		goto out;
 	getmeta = 1;
 	modified = 0;
 
@@ -881,7 +905,7 @@ __ham_copypage_recover(logp, dbtp, lsnp, redo, info)
 		modified = 1;
 	} else if (cmp_n == 0 && !redo) {
 		/* Need to undo update described. */
-		P_INIT(pagep, hashp->hdr->pagesize, argp->pgno, PGNO_INVALID,
+		P_INIT(pagep, hcp->hdr->pagesize, argp->pgno, PGNO_INVALID,
 		    argp->next_pgno, 0, P_HASH);
 		LSN(pagep) = argp->pagelsn;
 		modified = 1;
@@ -918,10 +942,8 @@ donext:	ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep);
 		goto out;
 
 	/* Now fix up the next's next page. */
-do_nn:	if (argp->nnext_pgno == PGNO_INVALID) {
-		*lsnp = argp->prev_lsn;
-		goto out;
-	}
+do_nn:	if (argp->nnext_pgno == PGNO_INVALID)
+		goto done;
 
 	ret = memp_fget(mpf, &argp->nnext_pgno, 0, &pagep);
 	if (ret != 0) {
@@ -932,9 +954,7 @@ do_nn:	if (argp->nnext_pgno == PGNO_INVALID) {
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			ret = 0;
-			*lsnp = argp->prev_lsn;
-			goto out;
+			goto done;
 		} else if ((ret = memp_fget(mpf, &argp->nnext_pgno,
 		    DB_MPOOL_CREATE, &pagep)) != 0)
 			goto out;
@@ -957,9 +977,10 @@ do_nn:	if (argp->nnext_pgno == PGNO_INVALID) {
 	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	if (getmeta)
-		RELEASE_META(file_dbp, hashp);
+		RELEASE_META(file_dbp, hcp);
 	REC_CLOSE;
 }
diff --git a/db2/hash/hash_stat.c b/db2/hash/hash_stat.c
index b57ca0950d..1b493d5f40 100644
--- a/db2/hash/hash_stat.c
+++ b/db2/hash/hash_stat.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)hash_stat.c	10.8 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)hash_stat.c	10.12 (Sleepycat) 12/19/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -23,35 +23,22 @@ static const char sccsid[] = "@(#)hash_stat.c	10.8 (Sleepycat) 4/26/98";
 
 /*
  * __ham_stat --
- *	Gather/print the hash statistics.
+ *	Hash statistics are not implemented; returns __db_eopnotsup().
  *
- * PUBLIC: int __ham_stat __P((DB *, FILE *));
+ * PUBLIC: int __ham_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
  */
 int
-__ham_stat(dbp, fp)
+__ham_stat(dbp, spp, db_malloc, flags)
 	DB *dbp;
-	FILE *fp;
+	void *spp;
+	void *(*db_malloc) __P((size_t));
+	u_int32_t flags;
 {
-	HTAB *hashp;
-	int i;
+	COMPQUIET(spp, NULL);
+	COMPQUIET(db_malloc, NULL);
+	COMPQUIET(flags, 0);
 
-	hashp = (HTAB *)dbp->internal;
+	DB_PANIC_CHECK(dbp);
 
-	fprintf(fp, "hash: accesses %lu collisions %lu\n",
-	    hashp->hash_accesses, hashp->hash_collisions);
-	fprintf(fp, "hash: expansions %lu\n", hashp->hash_expansions);
-	fprintf(fp, "hash: overflows %lu\n", hashp->hash_overflows);
-	fprintf(fp, "hash: big key/data pages %lu\n", hashp->hash_bigpages);
-
-	SET_LOCKER(dbp, NULL);
-	GET_META(dbp, hashp);
-	fprintf(fp, "keys %lu maxp %lu\n",
-	    (u_long)hashp->hdr->nelem, (u_long)hashp->hdr->max_bucket);
-
-	for (i = 0; i < NCACHED; i++)
-		fprintf(fp,
-		    "spares[%d] = %lu\n", i, (u_long)hashp->hdr->spares[i]);
-
-	RELEASE_META(dbp, hashp);
-	return (0);
+	return (__db_eopnotsup(dbp->dbenv));
 }
diff --git a/db2/include/btree.h b/db2/include/btree.h
index 1660d331e7..b0c04b1508 100644
--- a/db2/include/btree.h
+++ b/db2/include/btree.h
@@ -43,38 +43,19 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *	@(#)btree.h	10.21 (Sleepycat) 5/23/98
+ *	@(#)btree.h	10.26 (Sleepycat) 12/16/98
  */
 
 /* Forward structure declarations. */
 struct __btree;		typedef struct __btree BTREE;
 struct __cursor;	typedef struct __cursor CURSOR;
 struct __epg;		typedef struct __epg EPG;
-struct __rcursor;	typedef struct __rcursor RCURSOR;
 struct __recno;		typedef struct __recno RECNO;
 
-#undef	DEFMINKEYPAGE			/* Minimum keys per page */
 #define	DEFMINKEYPAGE	 (2)
 
-#undef	ISINTERNAL			/* If an internal page. */
-#define	ISINTERNAL(p)	(TYPE(p) == P_IBTREE || TYPE(p) ==  P_IRECNO)
-#undef	ISLEAF				/* If a leaf page. */
-#define	ISLEAF(p)	(TYPE(p) == P_LBTREE || TYPE(p) ==  P_LRECNO)
-
-/* Allocate and discard thread structures. */
-#define	GETHANDLE(dbp, set_txn, dbpp, ret) {				\
-	if (F_ISSET(dbp, DB_AM_THREAD)) {				\
-		if ((ret = __db_gethandle(dbp, __bam_bdup, dbpp)) != 0)	\
-			return (ret);					\
-	} else								\
-		*dbpp = dbp;						\
-	*dbpp->txn = set_txn;						\
-}
-#define	PUTHANDLE(dbp) {						\
-	dbp->txn = NULL;						\
-	if (F_ISSET(dbp, DB_AM_THREAD))					\
-		__db_puthandle(dbp);					\
-}
+#define	ISINTERNAL(p)	(TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
+#define	ISLEAF(p)	(TYPE(p) == P_LBTREE || TYPE(p) == P_LRECNO)
 
 /*
  * If doing transactions we have to hold the locks associated with a data item
@@ -82,15 +63,15 @@ struct __recno;		typedef struct __recno RECNO;
  * locks associated with walking the tree.  Distinguish between the two so that
  * we don't tie up the internal pages of the tree longer than necessary.
  */
-#define	__BT_LPUT(dbp, lock)						\
-	(F_ISSET((dbp), DB_AM_LOCKING) ?				\
-	    lock_put((dbp)->dbenv->lk_info, lock) : 0)
-#define	__BT_TLPUT(dbp, lock)						\
-	(F_ISSET((dbp), DB_AM_LOCKING) && (dbp)->txn == NULL ?		\
-	    lock_put((dbp)->dbenv->lk_info, lock) : 0)
+#define	__BT_LPUT(dbc, lock)						\
+	(F_ISSET((dbc)->dbp, DB_AM_LOCKING) ?				\
+	    lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0)
+#define	__BT_TLPUT(dbc, lock)						\
+	(F_ISSET((dbc)->dbp, DB_AM_LOCKING) && (dbc)->txn == NULL ?	\
+	    lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0)
 
 /*
- * Flags to __bt_search() and __rec_search().
+ * Flags to __bam_search() and __bam_rsearch().
  *
  * Note, internal page searches must find the largest record less than key in
  * the tree so that descents work.  Leaf page searches must find the smallest
@@ -113,22 +94,19 @@ struct __recno;		typedef struct __recno RECNO;
 #define	S_EXACT		0x00400		/* Exact items only. */
 #define	S_PARENT	0x00800		/* Lock page pair. */
 #define	S_STACK		0x01000		/* Need a complete stack. */
+#define	S_PAST_EOF	0x02000		/* If doing insert search (or keyfirst
+					 * or keylast operations), or a split
+					 * on behalf of an insert, it's okay to
+					 * return an entry one past end-of-page.
+					 */
 
 #define	S_DELETE	(S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK)
 #define	S_FIND		(S_READ | S_DUPFIRST | S_DELNO)
-#define	S_INSERT	(S_WRITE | S_DUPLAST | S_STACK)
-#define	S_KEYFIRST	(S_WRITE | S_DUPFIRST | S_STACK)
-#define	S_KEYLAST	(S_WRITE | S_DUPLAST | S_STACK)
-#define	S_WRPAIR	(S_WRITE | S_DUPLAST | S_PARENT)
-
-/*
- * If doing insert search (including keyfirst or keylast operations) or a
- * split search on behalf of an insert, it's okay to return the entry one
- * past the end of the page.
- */
-#define	PAST_END_OK(f)							\
-	((f) == S_INSERT ||						\
-	(f) == S_KEYFIRST || (f) == S_KEYLAST || (f) == S_WRPAIR)
+#define	S_FIND_WR	(S_WRITE | S_DUPFIRST | S_DELNO)
+#define	S_INSERT	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
+#define	S_KEYFIRST	(S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK)
+#define	S_KEYLAST	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
+#define	S_WRPAIR	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT)
 
 /*
  * Flags to __bam_iitem().
@@ -149,23 +127,32 @@ struct __epg {
 };
 
 /*
- * All cursors are queued from the master DB structure.  Convert the user's
- * DB reference to the master DB reference.  We lock the master DB mutex
- * so that we can walk the cursor queue.  There's no race in accessing the
- * cursors, because if we're modifying a page, we have a write lock on it,
- * and therefore no other thread than the current one can have a cursor that
- * references the page.
+ * We maintain a stack of the pages that we're locking in the tree.  Btrees
+ * (currently) only save two levels of the tree at a time, so the default
+ * stack is always large enough.  Recno trees have to lock the entire tree to
+ * do inserts/deletes, however.  Grow the stack as necessary.
  */
-#define	CURSOR_SETUP(dbp) {						\
-	(dbp) = (dbp)->master;						\
-	DB_THREAD_LOCK(dbp);						\
-}
-#define	CURSOR_TEARDOWN(dbp)						\
-	DB_THREAD_UNLOCK(dbp);
+#define	BT_STK_CLR(c)							\
+	((c)->csp = (c)->sp)		/* Reset to the stack base. */
+/* Fill the entry at csp, calling __bam_stkgrow() first if csp == esp. */
+#define	BT_STK_ENTER(c, pagep, page_indx, lk, ret) do {			\
+	if ((ret =							\
+	    (c)->csp == (c)->esp ? __bam_stkgrow(c) : 0) == 0) {	\
+		(c)->csp->page = (pagep);				\
+		(c)->csp->indx = (page_indx);				\
+		(c)->csp->lock = (lk);	/* Not "lock": member name. */	\
+	}								\
+} while (0)
+/* As BT_STK_ENTER; csp advances even if ret != 0, so check ret. */
+#define	BT_STK_PUSH(c, pagep, page_indx, lk, ret) do {			\
+	BT_STK_ENTER(c, pagep, page_indx, lk, ret);			\
+	++(c)->csp;							\
+} while (0)
+/* Step back one entry; NULL if csp is already at the base (sp). */
+#define	BT_STK_POP(c)							\
+	((c)->csp == (c)->sp ? NULL : --(c)->csp)
 
 /*
- * Btree cursor.
- *
  * Arguments passed to __bam_ca_replace().
  */
 typedef enum {
@@ -173,9 +160,27 @@ typedef enum {
 	REPLACE_SUCCESS,
 	REPLACE_FAILED
 } ca_replace_arg;
+
+/* Arguments passed to __ram_ca(). */
+typedef enum {
+	CA_DELETE,
+	CA_IAFTER,
+	CA_IBEFORE
+} ca_recno_arg;
+
+#define	RECNO_OOB	0		/* Illegal record number. */
+
+/* Btree/Recno cursor. */
 struct __cursor {
 	DBC		*dbc;		/* Enclosing DBC. */
 
+	/* Per-thread information: shared by btree/recno. */
+	EPG		*sp;		/* Stack pointer. */
+	EPG	 	*csp;		/* Current stack entry. */
+	EPG		*esp;		/* End stack pointer. */
+	EPG		 stack[5];
+
+	/* Per-thread information: btree private. */
 	PAGE		*page;		/* Cursor page. */
 
 	db_pgno_t	 pgno;		/* Page. */
@@ -187,90 +192,25 @@ struct __cursor {
 	DB_LOCK		 lock;		/* Cursor read lock. */
 	db_lockmode_t	 mode;		/* Lock mode. */
 
-	/*
-	 * If a cursor record is deleted, the key/data pair has to remain on
-	 * the page so that subsequent inserts/deletes don't interrupt the
-	 * cursor progression through the file.  This results in interesting
-	 * cases when "standard" operations, e.g., dbp->put() are done in the
-	 * context of "deleted" cursors.
-	 *
-	 * C_DELETED -- The item referenced by the cursor has been "deleted"
-	 *		but not physically removed from the page.
-	 * C_REPLACE -- The "deleted" item referenced by a cursor has been
-	 *		replaced by a dbp->put(), so the cursor is no longer
-	 *		responsible for physical removal from the page.
-	 * C_REPLACE_SETUP --
-	 *		We are about to overwrite a "deleted" item, flag any
-	 *		cursors referencing it for transition to C_REPLACE
-	 *		state.
-	 */
-#define	C_DELETED	0x0001
-#define	C_REPLACE	0x0002
-#define	C_REPLACE_SETUP	0x0004
-
-	/*
-	 * Internal cursor held for DB->get; don't hold locks unless involved
-	 * in a TXN.
-	 */
-#define	C_INTERNAL	0x0008
-	u_int32_t	 flags;
-};
-
-/*
- * Recno cursor.
- *
- * Arguments passed to __ram_ca().
- */
-typedef enum {
-	CA_DELETE,
-	CA_IAFTER,
-	CA_IBEFORE
-} ca_recno_arg;
-struct __rcursor {
-	DBC		*dbc;		/* Enclosing DBC. */
-
+	/* Per-thread information: recno private. */
 	db_recno_t	 recno;		/* Current record number. */
 
 	/*
-	 * Cursors referencing "deleted" records are positioned between
-	 * two records, and so must be specially adjusted until they are
-	 * moved.
+	 * Btree:
+	 * We set a flag in the cursor structure if the underlying object has
+	 * been deleted.  It's not strictly necessary, we could get the same
+	 * information by looking at the page itself.
+	 *
+	 * Recno:
+	 * When renumbering recno databases during deletes, cursors referencing
+	 * "deleted" records end up positioned between two records, and so must
+	 * be specially adjusted on the next operation.
 	 */
-#define	CR_DELETED	0x0001		/* Record deleted. */
+#define	C_DELETED	0x0001		/* Record was deleted. */
 	u_int32_t	 flags;
 };
 
 /*
- * We maintain a stack of the pages that we're locking in the tree.  Btree's
- * (currently) only save two levels of the tree at a time, so the default
- * stack is always large enough.  Recno trees have to lock the entire tree to
- * do inserts/deletes, however.  Grow the stack as necessary.
- */
-#undef	BT_STK_CLR
-#define	BT_STK_CLR(t)							\
-	((t)->bt_csp = (t)->bt_sp)
-
-#undef	BT_STK_ENTER
-#define	BT_STK_ENTER(t, pagep, page_indx, lock, ret) do {		\
-	if ((ret =							\
-	    (t)->bt_csp == (t)->bt_esp ? __bam_stkgrow(t) : 0) == 0) {	\
-		(t)->bt_csp->page = pagep;				\
-		(t)->bt_csp->indx = page_indx;				\
-		(t)->bt_csp->lock = lock;				\
-	}								\
-} while (0)
-
-#undef	BT_STK_PUSH
-#define	BT_STK_PUSH(t, pagep, page_indx, lock, ret) do {		\
-	BT_STK_ENTER(t, pagep, page_indx, lock, ret);			\
-	++(t)->bt_csp;							\
-} while (0)
-
-#undef	BT_STK_POP
-#define	BT_STK_POP(t)							\
-	((t)->bt_csp == (t)->bt_stack ? NULL : --(t)->bt_csp)
-
-/*
  * The in-memory recno data structure.
  *
  * !!!
@@ -278,9 +218,6 @@ struct __rcursor {
  * are no transaction semantics associated with backing files, nor is there
  * any thread protection.
  */
-#undef	RECNO_OOB
-#define	RECNO_OOB	0		/* Illegal record number. */
-
 struct __recno {
 	int		 re_delim;	/* Variable-length delimiting byte. */
 	int		 re_pad;	/* Fixed-length padding byte. */
@@ -294,7 +231,7 @@ struct __recno {
 	void		*re_emap;	/* End of mapped space. */
 	size_t		 re_msize;	/* Size of mapped region. */
 					/* Recno input function. */
-	int (*re_irec) __P((DB *, db_recno_t));
+	int (*re_irec) __P((DBC *, db_recno_t));
 
 #define	RECNO_EOF	0x0001		/* EOF on backing source file. */
 #define	RECNO_MODIFIED	0x0002		/* Tree was modified. */
@@ -302,31 +239,11 @@ struct __recno {
 };
 
 /*
- * The in-memory btree data structure.
+ * The in-memory, per-tree btree data structure.
  */
 struct __btree {
-/*
- * These fields are per-thread and are initialized when the BTREE structure
- * is created.
- */
 	db_pgno_t	 bt_lpgno;	/* Last insert location. */
 
-	DBT		 bt_rkey;	/* Returned key. */
-	DBT		 bt_rdata;	/* Returned data. */
-
-	EPG		*bt_sp;		/* Stack pointer. */
-	EPG	 	*bt_csp;	/* Current stack entry. */
-	EPG		*bt_esp;	/* End stack pointer. */
-	EPG		 bt_stack[5];
-
-	RECNO		*bt_recno;	/* Private recno structure. */
-
-	DB_BTREE_LSTAT lstat;		/* Btree local statistics. */
-
-/*
- * These fields are copied from the original BTREE structure and never
- * change.
- */
 	db_indx_t 	 bt_maxkey;	/* Maximum keys per page. */
 	db_indx_t 	 bt_minkey;	/* Minimum keys per page. */
 
@@ -336,6 +253,8 @@ struct __btree {
 	    __P((const DBT *, const DBT *));
 
 	db_indx_t	 bt_ovflsize;	/* Maximum key/data on-page size. */
+
+	RECNO		*recno;		/* Private recno structure. */
 };
 
 #include "btree_auto.h"
diff --git a/db2/include/btree_ext.h b/db2/include/btree_ext.h
index b8a137364c..fbc2ed958f 100644
--- a/db2/include/btree_ext.h
+++ b/db2/include/btree_ext.h
@@ -1,45 +1,41 @@
 /* DO NOT EDIT: automatically built by dist/distrib. */
 #ifndef _btree_ext_h_
 #define _btree_ext_h_
-int __bam_close __P((DB *));
-int __bam_sync __P((DB *, u_int32_t));
-int __bam_cmp __P((DB *, const DBT *, EPG *));
+int __bam_cmp __P((DB *, const DBT *,
+   PAGE *, u_int32_t, int (*)(const DBT *, const DBT *)));
 int __bam_defcmp __P((const DBT *, const DBT *));
 size_t __bam_defpfx __P((const DBT *, const DBT *));
 int __bam_pgin __P((db_pgno_t, void *, DBT *));
 int __bam_pgout __P((db_pgno_t, void *, DBT *));
 int __bam_mswap __P((PAGE *));
-int __bam_cursor __P((DB *, DB_TXN *, DBC **));
-int __bam_c_iclose __P((DB *, DBC *));
-int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int));
 int __bam_cprint __P((DB *));
-int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int));
+int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int));
 void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int));
 void __bam_ca_dup __P((DB *,
    db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t));
-void __bam_ca_move __P((DB *, db_pgno_t, db_pgno_t));
-void __bam_ca_replace
-   __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg));
+void __bam_ca_rsplit __P((DB *, db_pgno_t, db_pgno_t));
 void __bam_ca_split __P((DB *,
    db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+int __bam_c_init __P((DBC *));
+int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
 int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
-int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
-int __bam_ditem __P((DB *, PAGE *, u_int32_t));
-int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int));
-int __bam_dpage __P((DB *, const DBT *));
-int __bam_open __P((DB *, DBTYPE, DB_INFO *));
-int __bam_bdup __P((DB *, DB *));
-int __bam_new __P((DB *, u_int32_t, PAGE **));
-int __bam_free __P((DB *, PAGE *));
-int __bam_lt __P((DB *));
-int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
-int __bam_lput __P((DB *, DB_LOCK));
-int __bam_pget __P((DB *, PAGE **, db_pgno_t *, u_int32_t));
-int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-int __bam_iitem __P((DB *,
+int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+int __bam_dpage __P((DBC *, const DBT *));
+int __bam_dpages __P((DBC *));
+int __bam_open __P((DB *, DB_INFO *));
+int __bam_close __P((DB *));
+void __bam_setovflsize __P((DB *));
+int __bam_read_root __P((DB *));
+int __bam_new __P((DBC *, u_int32_t, PAGE **));
+int __bam_lput __P((DBC *, DB_LOCK));
+int __bam_free __P((DBC *, PAGE *));
+int __bam_lt __P((DBC *));
+int __bam_lget
+   __P((DBC *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
+int __bam_iitem __P((DBC *,
    PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t));
-int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *));
+int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
 int __bam_pg_alloc_recover
   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __bam_pg_free_recover
@@ -56,28 +52,24 @@ int __bam_cdel_recover
   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __bam_repl_recover
   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
-int __ram_open __P((DB *, DBTYPE, DB_INFO *));
-int __ram_cursor __P((DB *, DB_TXN *, DBC **));
+int __ram_open __P((DB *, DB_INFO *));
 int __ram_close __P((DB *));
-int __ram_c_iclose __P((DB *, DBC *));
+int __ram_c_del __P((DBC *, u_int32_t));
+int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
+int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
 void __ram_ca __P((DB *, db_recno_t, ca_recno_arg));
-int __ram_cprint __P((DB *));
-int __ram_getno __P((DB *, const DBT *, db_recno_t *, int));
-int __ram_snapshot __P((DB *));
-int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *));
-int __bam_adjust __P((DB *, BTREE *, int32_t));
-int __bam_nrecs __P((DB *, db_recno_t *));
+int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+int __bam_adjust __P((DBC *, int32_t));
+int __bam_nrecs __P((DBC *, db_recno_t *));
 db_recno_t __bam_total __P((PAGE *));
-int __bam_search __P((DB *,
+int __bam_search __P((DBC *,
     const DBT *, u_int32_t, int, db_recno_t *, int *));
-int __bam_stkrel __P((DB *));
-int __bam_stkgrow __P((BTREE *));
-int __bam_split __P((DB *, void *));
-int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *));
-int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *));
+int __bam_stkrel __P((DBC *, int));
+int __bam_stkgrow __P((CURSOR *));
+int __bam_split __P((DBC *, void *));
 int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
 int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
-void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *));
 int __bam_pg_alloc_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
     u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t,
diff --git a/db2/include/clib_ext.h b/db2/include/clib_ext.h
index f5510a1629..2566b849ce 100644
--- a/db2/include/clib_ext.h
+++ b/db2/include/clib_ext.h
@@ -37,12 +37,6 @@ void *memcpy __P((void *, const void *, size_t));
 #ifndef HAVE_MEMMOVE
 void *memmove __P((void *, const void *, size_t));
 #endif
-#ifndef HAVE_MEMCPY
-void *memcpy __P((void *, const void *, size_t));
-#endif
-#ifndef HAVE_MEMMOVE
-void *memmove __P((void *, const void *, size_t));
-#endif
 #ifndef HAVE_RAISE
 int raise __P((int));
 #endif
diff --git a/db2/include/common_ext.h b/db2/include/common_ext.h
index 4674f9ce01..33fb0cb218 100644
--- a/db2/include/common_ext.h
+++ b/db2/include/common_ext.h
@@ -5,26 +5,18 @@ int __db_appname __P((DB_ENV *,
    APPNAME, const char *, const char *, u_int32_t, int *, char **));
 int __db_apprec __P((DB_ENV *, u_int32_t));
 int __db_byteorder __P((DB_ENV *, int));
+int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t));
+int __db_fcchk
+   __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+int __db_ferr __P((const DB_ENV *, const char *, int));
 #ifdef __STDC__
 void __db_err __P((const DB_ENV *dbenv, const char *fmt, ...));
 #else
 void __db_err();
 #endif
-int __db_panic __P((DB *));
-int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t));
-int __db_fcchk
-   __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
-int __db_cdelchk __P((const DB *, u_int32_t, int, int));
-int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
-int __db_cputchk __P((const DB *,
-   const DBT *, DBT *, u_int32_t, int, int));
-int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
-int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
-int __db_putchk
-   __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
-int __db_statchk __P((const DB *, u_int32_t));
-int __db_syncchk __P((const DB *, u_int32_t));
-int __db_ferr __P((const DB_ENV *, const char *, int));
+int __db_pgerr __P((DB *, db_pgno_t));
+int __db_pgfmt __P((DB *, db_pgno_t));
+int __db_panic __P((DB_ENV *, int));
 u_int32_t __db_log2 __P((u_int32_t));
 int __db_rattach __P((REGINFO *));
 int __db_rdetach __P((REGINFO *));
diff --git a/db2/include/db.h.src b/db2/include/db.h.src
deleted file mode 100644
index 97ad55693f..0000000000
--- a/db2/include/db.h.src
+++ /dev/null
@@ -1,994 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- *
- *	@(#)db.h.src	10.131 (Sleepycat) 6/2/98
- */
-
-#ifndef _DB_H_
-#define	_DB_H_
-
-#ifndef __NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <stdio.h>
-#endif
-
-/*
- * XXX
- * MacOS: ensure that Metrowerks C makes enumeration types int sized.
- */
-#ifdef __MWERKS__
-#pragma enumsalwaysint on
-#endif
-
-/*
- * XXX
- * Handle function prototypes and the keyword "const".  This steps on name
- * space that DB doesn't control, but all of the other solutions are worse.
- *
- * XXX
- * While Microsoft's compiler is ANSI C compliant, it doesn't have _STDC_
- * defined by default, you specify a command line flag or #pragma to turn
- * it on.  Don't do that, however, because some of Microsoft's own header
- * files won't compile.
- */
-#undef	__P
-#if defined(__STDC__) || defined(__cplusplus) || defined(_MSC_VER)
-#define	__P(protos)	protos		/* ANSI C prototypes */
-#else
-#define	const
-#define	__P(protos)	()		/* K&R C preprocessor */
-#endif
-
-/*
- * !!!
- * DB needs basic information about specifically sized types.  If they're
- * not provided by the system, typedef them here.
- *
- * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__,
- * as does BIND and Kerberos, since we don't know for sure what #include
- * files the user is using.
- *
- * !!!
- * We also provide the standard u_int, u_long etc., if they're not provided
- * by the system.
- */
-#ifndef	__BIT_TYPES_DEFINED__
-#define	__BIT_TYPES_DEFINED__
-@u_int8_decl@
-@int16_decl@
-@u_int16_decl@
-@int32_decl@
-@u_int32_decl@
-#endif
-
-@u_char_decl@
-@u_short_decl@
-@u_int_decl@
-@u_long_decl@
-
-#define	DB_VERSION_MAJOR	2
-#define	DB_VERSION_MINOR	4
-#define	DB_VERSION_PATCH	14
-#define	DB_VERSION_STRING	"Sleepycat Software: DB 2.4.14: (6/2/98)"
-
-typedef	u_int32_t	db_pgno_t;	/* Page number type. */
-typedef	u_int16_t	db_indx_t;	/* Page offset type. */
-#define	DB_MAX_PAGES	0xffffffff	/* >= # of pages in a file */
-
-typedef	u_int32_t	db_recno_t;	/* Record number type. */
-typedef size_t		DB_LOCK;	/* Object returned by lock manager. */
-#define	DB_MAX_RECORDS	0xffffffff	/* >= # of records in a tree */
-
-#define	DB_FILE_ID_LEN		20	/* DB file ID length. */
-
-/* Forward structure declarations, so applications get type checking. */
-struct __db;		typedef struct __db DB;
-#ifdef DB_DBM_HSEARCH
-			typedef struct __db DBM;
-#endif
-struct __db_bt_stat;	typedef struct __db_bt_stat DB_BTREE_STAT;
-struct __db_dbt;	typedef struct __db_dbt DBT;
-struct __db_env;	typedef struct __db_env DB_ENV;
-struct __db_info;	typedef struct __db_info DB_INFO;
-struct __db_lock_stat;	typedef struct __db_lock_stat DB_LOCK_STAT;
-struct __db_lockregion;	typedef struct __db_lockregion DB_LOCKREGION;
-struct __db_lockreq;	typedef struct __db_lockreq DB_LOCKREQ;
-struct __db_locktab;	typedef struct __db_locktab DB_LOCKTAB;
-struct __db_log;	typedef struct __db_log DB_LOG;
-struct __db_log_stat;	typedef struct __db_log_stat DB_LOG_STAT;
-struct __db_lsn;	typedef struct __db_lsn DB_LSN;
-struct __db_mpool;	typedef struct __db_mpool DB_MPOOL;
-struct __db_mpool_finfo;typedef struct __db_mpool_finfo DB_MPOOL_FINFO;
-struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT;
-struct __db_mpool_stat;	typedef struct __db_mpool_stat DB_MPOOL_STAT;
-struct __db_mpoolfile;	typedef struct __db_mpoolfile DB_MPOOLFILE;
-struct __db_txn;	typedef struct __db_txn DB_TXN;
-struct __db_txn_active;	typedef struct __db_txn_active DB_TXN_ACTIVE;
-struct __db_txn_stat;	typedef struct __db_txn_stat DB_TXN_STAT;
-struct __db_txnmgr;	typedef struct __db_txnmgr DB_TXNMGR;
-struct __db_txnregion;	typedef struct __db_txnregion DB_TXNREGION;
-struct __dbc;		typedef struct __dbc DBC;
-
-/* Key/data structure -- a Data-Base Thang. */
-struct __db_dbt {
-	void	 *data;			/* key/data */
-	u_int32_t size;			/* key/data length */
-	u_int32_t ulen;			/* RO: length of user buffer. */
-	u_int32_t dlen;			/* RO: get/put record length. */
-	u_int32_t doff;			/* RO: get/put record offset. */
-
-#define	DB_DBT_INTERNAL	0x01		/* Perform any mallocs using regular
-					   malloc, not the user's malloc. */
-#define	DB_DBT_MALLOC	0x02		/* Return in allocated memory. */
-#define	DB_DBT_PARTIAL	0x04		/* Partial put/get. */
-#define	DB_DBT_USERMEM	0x08		/* Return in user's memory. */
-	u_int32_t flags;
-};
-
-/*
- * DB internal configuration.
- *
- * There are a set of functions that the application can replace with its
- * own versions, and some other knobs which can be turned at run-time.
- */
-#define	DB_FUNC_CALLOC	 1	/* DELETED: ANSI C calloc. */
-#define	DB_FUNC_CLOSE	 2		/* POSIX 1003.1 close. */
-#define	DB_FUNC_DIRFREE	 3		/* DB: free directory list. */
-#define	DB_FUNC_DIRLIST	 4		/* DB: create directory list. */
-#define	DB_FUNC_EXISTS	 5		/* DB: return if file exists. */
-#define	DB_FUNC_FREE	 6		/* ANSI C free. */
-#define	DB_FUNC_FSYNC	 7		/* POSIX 1003.1 fsync. */
-#define	DB_FUNC_IOINFO	 8		/* DB: return file I/O information. */
-#define	DB_FUNC_MALLOC	 9		/* ANSI C malloc. */
-#define	DB_FUNC_MAP	10		/* DB: map file into shared memory. */
-#define	DB_FUNC_OPEN	11		/* POSIX 1003.1 open. */
-#define	DB_FUNC_READ	12		/* POSIX 1003.1 read. */
-#define	DB_FUNC_REALLOC	13		/* ANSI C realloc. */
-#define	DB_FUNC_SEEK	14		/* POSIX 1003.1 lseek. */
-#define	DB_FUNC_SLEEP	15		/* DB: sleep secs/usecs. */
-#define	DB_FUNC_STRDUP	16	/* DELETED: DB: strdup(3). */
-#define	DB_FUNC_UNLINK	17		/* POSIX 1003.1 unlink. */
-#define	DB_FUNC_UNMAP	18		/* DB: unmap shared memory file. */
-#define	DB_FUNC_WRITE	19		/* POSIX 1003.1 write. */
-#define	DB_FUNC_YIELD	20		/* DB: yield thread to scheduler. */
-#define	DB_TSL_SPINS	21		/* DB: initialize spin count. */
-#define	DB_FUNC_RUNLINK	22		/* DB: remove a shared region. */
-#define	DB_REGION_ANON	23		/* DB: anonymous, unnamed regions. */
-#define	DB_REGION_INIT	24		/* DB: page-fault regions in create. */
-#define	DB_REGION_NAME	25		/* DB: anonymous, named regions. */
-#define	DB_MUTEXLOCKS	26		/* DB: turn off all mutex locks. */
-#define	DB_PAGEYIELD	27		/* DB: yield the CPU on pool get. */
-
-/*
- * Database configuration and initialization.
- */
- /*
-  * Flags understood by both db_open(3) and db_appinit(3).
-  */
-#define	DB_CREATE	      0x000001	/* O_CREAT: create file as necessary. */
-#define	DB_NOMMAP	      0x000002	/* Don't mmap underlying file. */
-#define	DB_THREAD	      0x000004	/* Free-thread DB package handles. */
-
-/*
- * Flags understood by db_appinit(3).
- */
-/*			      0x000007	   COMMON MASK. */
-#define	DB_INIT_LOCK	      0x000008	/* Initialize locking. */
-#define	DB_INIT_LOG	      0x000010	/* Initialize logging. */
-#define	DB_INIT_MPOOL	      0x000020	/* Initialize mpool. */
-#define	DB_INIT_TXN	      0x000040	/* Initialize transactions. */
-#define	DB_MPOOL_PRIVATE      0x000080	/* Mpool: private memory pool. */
-#define	__UNUSED_100	      0x000100
-#define	DB_RECOVER	      0x000200	/* Run normal recovery. */
-#define	DB_RECOVER_FATAL      0x000400	/* Run catastrophic recovery. */
-#define	DB_TXN_NOSYNC	      0x000800	/* Do not sync log on commit. */
-#define	DB_USE_ENVIRON	      0x001000	/* Use the environment. */
-#define	DB_USE_ENVIRON_ROOT   0x002000	/* Use the environment if root. */
-
-/* CURRENTLY UNUSED LOCK FLAGS. */
-#define	DB_TXN_LOCK_2PL	      0x000000	/* Two-phase locking. */
-#define	DB_TXN_LOCK_OPTIMIST  0x000000	/* Optimistic locking. */
-#define	DB_TXN_LOCK_MASK      0x000000	/* Lock flags mask. */
-
-/* CURRENTLY UNUSED LOG FLAGS. */
-#define	DB_TXN_LOG_REDO	      0x000000	/* Redo-only logging. */
-#define	DB_TXN_LOG_UNDO	      0x000000	/* Undo-only logging. */
-#define	DB_TXN_LOG_UNDOREDO   0x000000	/* Undo/redo write-ahead logging. */
-#define	DB_TXN_LOG_MASK	      0x000000	/* Log flags mask. */
-
-/*
- * Flags understood by db_open(3).
- *
- * DB_EXCL and DB_TEMPORARY are internal only, and are not documented.
- * DB_SEQUENTIAL is currently internal, but may be exported some day.
- */
-/*			      0x000007	   COMMON MASK. */
-/*			      0x003fff	   ALREADY USED. */
-#define	__UNUSED_4000	      0x004000
-#define	DB_EXCL		      0x008000	/* O_EXCL: exclusive open. */
-#define	DB_RDONLY	      0x010000	/* O_RDONLY: read-only. */
-#define	DB_SEQUENTIAL	      0x020000	/* Indicate sequential access. */
-#define	DB_TEMPORARY	      0x040000	/* Remove on last close. */
-#define	DB_TRUNCATE	      0x080000	/* O_TRUNCATE: replace existing DB. */
-
-/*
- * Deadlock detector modes; used in the DBENV structure to configure the
- * locking subsystem.
- */
-#define	DB_LOCK_NORUN		0x0
-#define	DB_LOCK_DEFAULT		0x1	/* Default policy. */
-#define	DB_LOCK_OLDEST		0x2	/* Abort oldest transaction. */
-#define	DB_LOCK_RANDOM		0x3	/* Abort random transaction. */
-#define	DB_LOCK_YOUNGEST	0x4	/* Abort youngest transaction. */
-
-struct __db_env {
-	int		 db_lorder;	/* Byte order. */
-
-					/* Error message callback. */
-	void (*db_errcall) __P((const char *, char *));
-	FILE		*db_errfile;	/* Error message file stream. */
-	const char	*db_errpfx;	/* Error message prefix. */
-	int		 db_verbose;	/* Generate debugging messages. */
-
-	/* User paths. */
-	char		*db_home;	/* Database home. */
-	char		*db_log_dir;	/* Database log file directory. */
-	char		*db_tmp_dir;	/* Database tmp file directory. */
-
-	char	       **db_data_dir;	/* Database data file directories. */
-	int		 data_cnt;	/* Database data file slots. */
-	int		 data_next;	/* Next Database data file slot. */
-
-	/* Locking. */
-	DB_LOCKTAB	*lk_info;	/* Return from lock_open(). */
-	u_int8_t	*lk_conflicts;	/* Two dimensional conflict matrix. */
-	u_int32_t	 lk_modes;	/* Number of lock modes in table. */
-	u_int32_t	 lk_max;	/* Maximum number of locks. */
-	u_int32_t	 lk_detect;	/* Deadlock detect on all conflicts. */
-
-	/* Logging. */
-	DB_LOG		*lg_info;	/* Return from log_open(). */
-	u_int32_t	 lg_max;	/* Maximum file size. */
-
-	/* Memory pool. */
-	DB_MPOOL	*mp_info;	/* Return from memp_open(). */
-	size_t		 mp_mmapsize;	/* Maximum file size for mmap. */
-	size_t		 mp_size;	/* Bytes in the mpool cache. */
-
-	/* Transactions. */
-	DB_TXNMGR	*tx_info;	/* Return from txn_open(). */
-	u_int32_t	 tx_max;	/* Maximum number of transactions. */
-	int (*tx_recover)		/* Dispatch function for recovery. */
-	    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
-
-#define	DB_ENV_APPINIT		0x01	/* Paths initialized by db_appinit(). */
-#define	DB_ENV_STANDALONE	0x02	/* Test: freestanding environment. */
-#define	DB_ENV_THREAD		0x04	/* DB_ENV is multi-threaded. */
-	u_int32_t	 flags;		/* Flags. */
-};
-
-/*******************************************************
- * Access methods.
- *******************************************************/
-/*
- * XXX
- * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
- */
-typedef enum {
-	DB_BTREE=1,			/* B+tree. */
-	DB_HASH,			/* Extended Linear Hashing. */
-	DB_RECNO,			/* Fixed and variable-length records. */
-	DB_UNKNOWN			/* Figure it out on open. */
-} DBTYPE;
-
-#define	DB_BTREEVERSION	6		/* Current btree version. */
-#define	DB_BTREEOLDVER	6		/* Oldest btree version supported. */
-#define	DB_BTREEMAGIC	0x053162
-
-#define	DB_HASHVERSION	5		/* Current hash version. */
-#define	DB_HASHOLDVER	4		/* Oldest hash version supported. */
-#define	DB_HASHMAGIC	0x061561
-
-#define	DB_LOGVERSION	2		/* Current log version. */
-#define	DB_LOGOLDVER	2		/* Oldest log version supported. */
-#define	DB_LOGMAGIC	0x040988
-
-struct __db_info {
-	int		 db_lorder;	/* Byte order. */
-	size_t		 db_cachesize;	/* Underlying cache size. */
-	size_t		 db_pagesize;	/* Underlying page size. */
-
-					/* Local heap allocation. */
-	void *(*db_malloc) __P((size_t));
-
-	/* Btree access method. */
-	u_int32_t	 bt_maxkey;	/* Maximum keys per page. */
-	u_int32_t	 bt_minkey;	/* Minimum keys per page. */
-	int (*bt_compare)		/* Comparison function. */
-	    __P((const DBT *, const DBT *));
-	size_t (*bt_prefix)		/* Prefix function. */
-	    __P((const DBT *, const DBT *));
-
-	/* Hash access method. */
-	u_int32_t 	 h_ffactor;	/* Fill factor. */
-	u_int32_t	 h_nelem;	/* Number of elements. */
-	u_int32_t      (*h_hash)	/* Hash function. */
-	    __P((const void *, u_int32_t));
-
-	/* Recno access method. */
-	int		 re_pad;	/* Fixed-length padding byte. */
-	int		 re_delim;	/* Variable-length delimiting byte. */
-	u_int32_t	 re_len;	/* Length for fixed-length records. */
-	char		*re_source;	/* Source file name. */
-
-#define	DB_DELIMITER		0x0001	/* Recno: re_delim set. */
-#define	DB_DUP			0x0002	/* Btree, Hash: duplicate keys. */
-#define	DB_FIXEDLEN		0x0004	/* Recno: fixed-length records. */
-#define	DB_PAD			0x0008	/* Recno: re_pad set. */
-#define	DB_RECNUM		0x0010	/* Btree: record numbers. */
-#define	DB_RENUMBER		0x0020	/* Recno: renumber on insert/delete. */
-#define	DB_SNAPSHOT		0x0040	/* Recno: snapshot the input. */
-	u_int32_t	 flags;
-};
-
-/*
- * DB access method and cursor operation codes.  These are implemented as
- * bit fields for future flexibility, but currently only a single one may
- * be specified to any function.
- */
-#define	DB_AFTER	0x000001	/* c_put() */
-#define	DB_APPEND	0x000002	/* put() */
-#define	DB_BEFORE	0x000004	/* c_put() */
-#define	DB_CHECKPOINT	0x000008	/* log_put(), log_get() */
-#define	DB_CURRENT	0x000010	/* c_get(), c_put(), log_get() */
-#define	DB_FIRST	0x000020	/* c_get(), log_get() */
-#define	DB_FLUSH	0x000040	/* log_put() */
-#define	DB_GET_RECNO	0x000080	/* get(), c_get() */
-#define	DB_KEYFIRST	0x000100	/* c_put() */
-#define	DB_KEYLAST	0x000200	/* c_put() */
-#define	DB_LAST		0x000400	/* c_get(), log_get() */
-#define	DB_NEXT		0x000800	/* c_get(), log_get() */
-#define	DB_NOOVERWRITE	0x001000	/* put() */
-#define	DB_NOSYNC	0x002000	/* close() */
-#define	DB_PREV		0x004000	/* c_get(), log_get() */
-#define	DB_RECORDCOUNT	0x008000	/* stat() */
-#define	DB_SET		0x010000	/* c_get(), log_get() */
-#define	DB_SET_RANGE	0x020000	/* c_get() */
-#define	DB_SET_RECNO	0x040000	/* c_get() */
-#define	DB_CURLSN	0x080000	/* log_put() */
-
-/*
- * DB (user visible) error return codes.
- *
- * XXX
- * Changes to any of the user visible error return codes must be reflected
- * in java/src/com/sleepycat/db/Db.java.
- */
-#define	DB_INCOMPLETE		( -1)	/* Sync didn't finish. */
-#define	DB_KEYEMPTY		( -2)	/* The key/data pair was deleted or
-					   was never created by the user. */
-#define	DB_KEYEXIST		( -3)	/* The key/data pair already exists. */
-#define	DB_LOCK_DEADLOCK	( -4)	/* Locker killed to resolve deadlock. */
-#define	DB_LOCK_NOTGRANTED	( -5)	/* Lock unavailable, no-wait set. */
-#define	DB_LOCK_NOTHELD		( -6)	/* Lock not held by locker. */
-#define	DB_NOTFOUND		( -7)	/* Key/data pair not found (EOF). */
-
-/* DB (private) error return codes. */
-#define	DB_DELETED		( -8)	/* Recovery file marked deleted. */
-#define	DB_NEEDSPLIT		( -9)	/* Page needs to be split. */
-#define	DB_REGISTERED		(-10)	/* Entry was previously registered. */
-#define	DB_SWAPBYTES		(-11)	/* Database needs byte swapping. */
-#define DB_TXN_CKP		(-12)	/* Encountered ckp record in log. */
-
-struct __db_ilock {			/* Internal DB access method lock. */
-	db_pgno_t	pgno;		/* Page being locked. */
-					/* File id. */
-	u_int8_t	fileid[DB_FILE_ID_LEN];
-};
-
-/* DB access method description structure. */
-struct __db {
-	void	*mutexp;		/* Synchronization for free threading */
-	DBTYPE	 type;			/* DB access method. */
-	DB_ENV	*dbenv;			/* DB_ENV structure. */
-	DB_ENV	*mp_dbenv;		/* DB_ENV for local mpool creation. */
-
-	DB	*master;		/* Original DB created by db_open. */
-	void	*internal;		/* Access method private. */
-
-	DB_MPOOL	*mp;		/* The access method's mpool. */
-	DB_MPOOLFILE	*mpf;		/* The access method's mpool file. */
-
-	/*
-	 * XXX
-	 * Explicit representations of structures in queue.h.
-	 *
-	 * TAILQ_HEAD(curs_queue, __dbc);
-	 */
-	struct {
-		struct __dbc *tqh_first;
-		struct __dbc **tqh_last;
-	} curs_queue;
-
-	/*
-	 * XXX
-	 * Explicit representations of structures in queue.h.
-	 *
-	 * LIST_HEAD(handleq, __db);
-	 * LIST_ENTRY(__db);
-	 */
-	struct {
-		struct __db *lh_first;
-	} handleq;			/* List of handles for this DB. */
-	struct {
-		struct __db *le_next;
-		struct __db **le_prev;
-	} links;			/* Links for the handle list. */
-
-	u_int32_t log_fileid;		/* Logging file id. */
-
-	DB_TXN	 *txn;			/* Current transaction. */
-	u_int32_t locker;		/* Default process' locker id. */
-	DBT	  lock_dbt;		/* DBT referencing lock. */
-	struct __db_ilock lock;		/* Lock. */
-
-	size_t	  pgsize;		/* Logical page size of file. */
-
-					/* Local heap allocation. */
-	void *(*db_malloc) __P((size_t));
-
-					/* Functions. */
-	int (*close)	__P((DB *, u_int32_t));
-	int (*cursor)	__P((DB *, DB_TXN *, DBC **));
-	int (*del)	__P((DB *, DB_TXN *, DBT *, u_int32_t));
-	int (*fd)	__P((DB *, int *));
-	int (*get)	__P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-	int (*put)	__P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
-	int (*stat)	__P((DB *, void *, void *(*)(size_t), u_int32_t));
-	int (*sync)	__P((DB *, u_int32_t));
-
-#define	DB_AM_DUP	0x000001	/* DB_DUP (internal). */
-#define	DB_AM_INMEM	0x000002	/* In-memory; no sync on close. */
-#define	DB_AM_LOCKING	0x000004	/* Perform locking. */
-#define	DB_AM_LOGGING	0x000008	/* Perform logging. */
-#define	DB_AM_MLOCAL	0x000010	/* Database memory pool is local. */
-#define	DB_AM_PGDEF	0x000020	/* Page size was defaulted. */
-#define	DB_AM_RDONLY	0x000040	/* Database is readonly. */
-#define	DB_AM_RECOVER	0x000080	/* In recovery (do not log or lock). */
-#define	DB_AM_SWAP	0x000100	/* Pages need to be byte-swapped. */
-#define	DB_AM_THREAD	0x000200	/* DB is multi-threaded. */
-#define	DB_BT_RECNUM	0x000400	/* DB_RECNUM (internal) */
-#define	DB_HS_DIRTYMETA 0x000800	/* Hash: Metadata page modified. */
-#define	DB_RE_DELIMITER	0x001000	/* DB_DELIMITER (internal). */
-#define	DB_RE_FIXEDLEN	0x002000	/* DB_FIXEDLEN (internal). */
-#define	DB_RE_PAD	0x004000	/* DB_PAD (internal). */
-#define	DB_RE_RENUMBER	0x008000	/* DB_RENUMBER (internal). */
-#define	DB_RE_SNAPSHOT	0x010000	/* DB_SNAPSHOT (internal). */
-	u_int32_t flags;
-};
-
-/* Cursor description structure. */
-struct __dbc {
-	DB *dbp;			/* Related DB access method. */
-	DB_TXN	 *txn;			/* Associated transaction. */
-
-	/*
-	 * XXX
-	 * Explicit representations of structures in queue.h.
-	 *
-	 * TAILQ_ENTRY(__dbc);
-	 */
-	struct {
-		struct __dbc *tqe_next;
-		struct __dbc **tqe_prev;
-	} links;
-
-	void	 *internal;		/* Access method private. */
-
-	int (*c_close)	__P((DBC *));
-	int (*c_del)	__P((DBC *, u_int32_t));
-	int (*c_get)	__P((DBC *, DBT *, DBT *, u_int32_t));
-	int (*c_put)	__P((DBC *, DBT *, DBT *, u_int32_t));
-};
-
-/* Btree/recno statistics structure. */
-struct __db_bt_stat {
-	u_int32_t bt_flags;		/* Open flags. */
-	u_int32_t bt_maxkey;		/* Maxkey value. */
-	u_int32_t bt_minkey;		/* Minkey value. */
-	u_int32_t bt_re_len;		/* Fixed-length record length. */
-	u_int32_t bt_re_pad;		/* Fixed-length record pad. */
-	u_int32_t bt_pagesize;		/* Page size. */
-	u_int32_t bt_levels;		/* Tree levels. */
-	u_int32_t bt_nrecs;		/* Number of records. */
-	u_int32_t bt_int_pg;		/* Internal pages. */
-	u_int32_t bt_leaf_pg;		/* Leaf pages. */
-	u_int32_t bt_dup_pg;		/* Duplicate pages. */
-	u_int32_t bt_over_pg;		/* Overflow pages. */
-	u_int32_t bt_free;		/* Pages on the free list. */
-	u_int32_t bt_freed;		/* Pages freed for reuse. */
-	u_int32_t bt_int_pgfree;	/* Bytes free in internal pages. */
-	u_int32_t bt_leaf_pgfree;	/* Bytes free in leaf pages. */
-	u_int32_t bt_dup_pgfree;	/* Bytes free in duplicate pages. */
-	u_int32_t bt_over_pgfree;	/* Bytes free in overflow pages. */
-	u_int32_t bt_pfxsaved;		/* Bytes saved by prefix compression. */
-	u_int32_t bt_split;		/* Total number of splits. */
-	u_int32_t bt_rootsplit;		/* Root page splits. */
-	u_int32_t bt_fastsplit;		/* Fast splits. */
-	u_int32_t bt_added;		/* Items added. */
-	u_int32_t bt_deleted;		/* Items deleted. */
-	u_int32_t bt_get;		/* Items retrieved. */
-	u_int32_t bt_cache_hit;		/* Hits in fast-insert code. */
-	u_int32_t bt_cache_miss;	/* Misses in fast-insert code. */
-	u_int32_t bt_magic;		/* Magic number. */
-	u_int32_t bt_version;		/* Version number. */
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int   db_appinit __P((const char *, char * const *, DB_ENV *, u_int32_t));
-int   db_appexit __P((DB_ENV *));
-int   db_jump_set __P((void *, int));
-int   db_open __P((const char *,
-	  DBTYPE, u_int32_t, int, DB_ENV *, DB_INFO *, DB **));
-int   db_value_set __P((int, int));
-char *db_version __P((int *, int *, int *));
-#if defined(__cplusplus)
-}
-#endif
-
-/*******************************************************
- * Locking
- *******************************************************/
-#define	DB_LOCKVERSION	1
-#define	DB_LOCKMAGIC	0x090193
-
-/* Flag values for lock_vec(). */
-#define	DB_LOCK_NOWAIT		0x01	/* Don't wait on unavailable lock. */
-
-/* Flag values for lock_detect(). */
-#define	DB_LOCK_CONFLICT	0x01	/* Run on any conflict. */
-
-/*
- * Request types.
- *
- * XXX
- * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
- */
-typedef enum {
-	DB_LOCK_DUMP=0,			/* Display held locks. */
-	DB_LOCK_GET,			/* Get the lock. */
-	DB_LOCK_PUT,			/* Release the lock. */
-	DB_LOCK_PUT_ALL,		/* Release locker's locks. */
-	DB_LOCK_PUT_OBJ			/* Release locker's locks on obj. */
-} db_lockop_t;
-
-/*
- * Simple R/W lock modes and for multi-granularity intention locking.
- *
- * XXX
- * Changes here must be reflected in java/src/com/sleepycat/db/Db.java.
- */
-typedef enum {
-	DB_LOCK_NG=0,			/* Not granted. */
-	DB_LOCK_READ,			/* Shared/read. */
-	DB_LOCK_WRITE,			/* Exclusive/write. */
-	DB_LOCK_IREAD,			/* Intent to share/read. */
-	DB_LOCK_IWRITE,			/* Intent exclusive/write. */
-	DB_LOCK_IWR			/* Intent to read and write. */
-} db_lockmode_t;
-
-/*
- * Status of a lock.
- */
-typedef enum {
-	DB_LSTAT_ABORTED,		/* Lock belongs to an aborted txn. */
-	DB_LSTAT_ERR,			/* Lock is bad. */
-	DB_LSTAT_FREE,			/* Lock is unallocated. */
-	DB_LSTAT_HELD,			/* Lock is currently held. */
-	DB_LSTAT_NOGRANT,		/* Lock was not granted. */
-	DB_LSTAT_PENDING,		/* Lock was waiting and has been
-					 * promoted; waiting for the owner
-					 * to run and upgrade it to held. */
-	DB_LSTAT_WAITING		/* Lock is on the wait queue. */
-} db_status_t;
-
-/* Lock request structure. */
-struct __db_lockreq {
-	db_lockop_t	 op;		/* Operation. */
-	db_lockmode_t	 mode;		/* Requested mode. */
-	u_int32_t	 locker;	/* Locker identity. */
-	DBT		*obj;		/* Object being locked. */
-	DB_LOCK		 lock;		/* Lock returned. */
-};
-
-/*
- * Commonly used conflict matrices.
- *
- * Standard Read/Write (or exclusive/shared) locks.
- */
-#define	DB_LOCK_RW_N	3
-extern const u_int8_t db_rw_conflicts[];
-
-/* Multi-granularity locking. */
-#define	DB_LOCK_RIW_N	6
-extern const u_int8_t db_riw_conflicts[];
-
-struct __db_lock_stat {
-	u_int32_t st_magic;		/* Lock file magic number. */
-	u_int32_t st_version;		/* Lock file version number. */
-	u_int32_t st_maxlocks;		/* Maximum number of locks in table. */
-	u_int32_t st_nmodes;		/* Number of lock modes. */
-	u_int32_t st_numobjs;		/* Number of objects. */
-	u_int32_t st_nlockers;		/* Number of lockers. */
-	u_int32_t st_nconflicts;	/* Number of lock conflicts. */
-	u_int32_t st_nrequests;		/* Number of lock gets. */
-	u_int32_t st_nreleases;		/* Number of lock puts. */
-	u_int32_t st_ndeadlocks;	/* Number of lock deadlocks. */
-	u_int32_t st_region_wait;	/* Region lock granted after wait. */
-	u_int32_t st_region_nowait;	/* Region lock granted without wait. */
-	u_int32_t st_refcnt;		/* Region reference count. */
-	u_int32_t st_regsize;		/* Region size. */
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	  lock_close __P((DB_LOCKTAB *));
-int	  lock_detect __P((DB_LOCKTAB *, u_int32_t, u_int32_t));
-int	  lock_get __P((DB_LOCKTAB *,
-	    u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
-int	  lock_id __P((DB_LOCKTAB *, u_int32_t *));
-int	  lock_open __P((const char *,
-	    u_int32_t, int, DB_ENV *, DB_LOCKTAB **));
-int	  lock_put __P((DB_LOCKTAB *, DB_LOCK));
-int	  lock_stat __P((DB_LOCKTAB *, DB_LOCK_STAT **, void *(*)(size_t)));
-int	  lock_unlink __P((const char *, int, DB_ENV *));
-int	  lock_vec __P((DB_LOCKTAB *,
-	    u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
-#if defined(__cplusplus)
-}
-#endif
-
-/*******************************************************
- * Logging.
- *******************************************************/
-/* Flag values for log_archive(). */
-#define	DB_ARCH_ABS		0x001	/* Absolute pathnames. */
-#define	DB_ARCH_DATA		0x002	/* Data files. */
-#define	DB_ARCH_LOG		0x004	/* Log files. */
-
-/*
- * A DB_LSN has two parts, a fileid which identifies a specific file, and an
- * offset within that file.  The fileid is an unsigned 4-byte quantity that
- * uniquely identifies a file within the log directory -- currently a simple
- * counter inside the log.  The offset is also an unsigned 4-byte value.  The
- * log manager guarantees the offset is never more than 4 bytes by switching
- * to a new log file before the maximum length imposed by an unsigned 4-byte
- * offset is reached.
- */
-struct __db_lsn {
-	u_int32_t	file;		/* File ID. */
-	u_int32_t	offset;		/* File offset. */
-};
-
-/* Log statistics structure. */
-struct __db_log_stat {
-	u_int32_t st_magic;		/* Log file magic number. */
-	u_int32_t st_version;		/* Log file version number. */
-	int st_mode;			/* Log file mode. */
-	u_int32_t st_lg_max;		/* Maximum log file size. */
-	u_int32_t st_w_bytes;		/* Bytes to log. */
-	u_int32_t st_w_mbytes;		/* Megabytes to log. */
-	u_int32_t st_wc_bytes;		/* Bytes to log since checkpoint. */
-	u_int32_t st_wc_mbytes;		/* Megabytes to log since checkpoint. */
-	u_int32_t st_wcount;		/* Total syncs to the log. */
-	u_int32_t st_scount;		/* Total writes to the log. */
-	u_int32_t st_region_wait;	/* Region lock granted after wait. */
-	u_int32_t st_region_nowait;	/* Region lock granted without wait. */
-	u_int32_t st_cur_file;		/* Current log file number. */
-	u_int32_t st_cur_offset;	/* Current log file offset. */
-	u_int32_t st_refcnt;		/* Region reference count. */
-	u_int32_t st_regsize;		/* Region size. */
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	 log_archive __P((DB_LOG *, char **[], u_int32_t, void *(*)(size_t)));
-int	 log_close __P((DB_LOG *));
-int	 log_compare __P((const DB_LSN *, const DB_LSN *));
-int	 log_file __P((DB_LOG *, const DB_LSN *, char *, size_t));
-int	 log_flush __P((DB_LOG *, const DB_LSN *));
-int	 log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t));
-int	 log_open __P((const char *, u_int32_t, int, DB_ENV *, DB_LOG **));
-int	 log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
-int	 log_register __P((DB_LOG *, DB *, const char *, DBTYPE, u_int32_t *));
-int	 log_stat __P((DB_LOG *, DB_LOG_STAT **, void *(*)(size_t)));
-int	 log_unlink __P((const char *, int, DB_ENV *));
-int	 log_unregister __P((DB_LOG *, u_int32_t));
-#if defined(__cplusplus)
-}
-#endif
-
-/*******************************************************
- * Mpool
- *******************************************************/
-/* Flag values for memp_fget(). */
-#define	DB_MPOOL_CREATE		0x001	/* Create a page. */
-#define	DB_MPOOL_LAST		0x002	/* Return the last page. */
-#define	DB_MPOOL_NEW		0x004	/* Create a new page. */
-
-/* Flag values for memp_fput(), memp_fset(). */
-#define	DB_MPOOL_CLEAN		0x001	/* Clear modified bit. */
-#define	DB_MPOOL_DIRTY		0x002	/* Page is modified. */
-#define	DB_MPOOL_DISCARD	0x004	/* Don't cache the page. */
-
-/* Mpool statistics structure. */
-struct __db_mpool_stat {
-	size_t st_cachesize;		/* Cache size. */
-	u_int32_t st_cache_hit;		/* Pages found in the cache. */
-	u_int32_t st_cache_miss;	/* Pages not found in the cache. */
-	u_int32_t st_map;		/* Pages from mapped files. */
-	u_int32_t st_page_create;	/* Pages created in the cache. */
-	u_int32_t st_page_in;		/* Pages read in. */
-	u_int32_t st_page_out;		/* Pages written out. */
-	u_int32_t st_ro_evict;		/* Clean pages forced from the cache. */
-	u_int32_t st_rw_evict;		/* Dirty pages forced from the cache. */
-	u_int32_t st_hash_buckets;	/* Number of hash buckets. */
-	u_int32_t st_hash_searches;	/* Total hash chain searches. */
-	u_int32_t st_hash_longest;	/* Longest hash chain searched. */
-	u_int32_t st_hash_examined;	/* Total hash entries searched. */
-	u_int32_t st_page_clean;	/* Clean pages. */
-	u_int32_t st_page_dirty;	/* Dirty pages. */
-	u_int32_t st_page_trickle;	/* Pages written by memp_trickle. */
-	u_int32_t st_region_wait;	/* Region lock granted after wait. */
-	u_int32_t st_region_nowait;	/* Region lock granted without wait. */
-	u_int32_t st_refcnt;		/* Region reference count. */
-	u_int32_t st_regsize;		/* Region size. */
-};
-
-/* Mpool file open information structure. */
-struct __db_mpool_finfo {
-	int	   ftype;		/* File type. */
-	DBT	  *pgcookie;		/* Byte-string passed to pgin/pgout. */
-	u_int8_t  *fileid;		/* Unique file ID. */
-	int32_t	   lsn_offset;		/* LSN offset in page. */
-	u_int32_t  clear_len;		/* Cleared length on created pages. */
-};
-
-/* Mpool file statistics structure. */
-struct __db_mpool_fstat {
-	char *file_name;		/* File name. */
-	size_t st_pagesize;		/* Page size. */
-	u_int32_t st_cache_hit;		/* Pages found in the cache. */
-	u_int32_t st_cache_miss;	/* Pages not found in the cache. */
-	u_int32_t st_map;		/* Pages from mapped files. */
-	u_int32_t st_page_create;	/* Pages created in the cache. */
-	u_int32_t st_page_in;		/* Pages read in. */
-	u_int32_t st_page_out;		/* Pages written out. */
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	memp_close __P((DB_MPOOL *));
-int	memp_fclose __P((DB_MPOOLFILE *));
-int	memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
-int	memp_fopen __P((DB_MPOOL *, const char *,
-	    u_int32_t, int, size_t, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
-int	memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
-int	memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
-int	memp_fsync __P((DB_MPOOLFILE *));
-int	memp_open __P((const char *, u_int32_t, int, DB_ENV *, DB_MPOOL **));
-int	memp_register __P((DB_MPOOL *, int,
-	    int (*)(db_pgno_t, void *, DBT *),
-	    int (*)(db_pgno_t, void *, DBT *)));
-int	memp_stat __P((DB_MPOOL *,
-	    DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, void *(*)(size_t)));
-int	memp_sync __P((DB_MPOOL *, DB_LSN *));
-int	memp_trickle __P((DB_MPOOL *, int, int *));
-int	memp_unlink __P((const char *, int, DB_ENV *));
-#if defined(__cplusplus)
-}
-#endif
-
-/*******************************************************
- * Transactions.
- *******************************************************/
-#define	DB_TXNVERSION	1
-#define	DB_TXNMAGIC	0x041593
-
-/* Operations values to the tx_recover() function. */
-#define	DB_TXN_BACKWARD_ROLL	1	/* Read the log backwards. */
-#define	DB_TXN_FORWARD_ROLL	2	/* Read the log forwards. */
-#define	DB_TXN_OPENFILES	3	/* Read for open files. */
-#define	DB_TXN_REDO		4	/* Redo the operation. */
-#define	DB_TXN_UNDO		5	/* Undo the operation. */
-
-/* Internal transaction status values. */
-
-/* Transaction statistics structure. */
-struct __db_txn_active {
-	u_int32_t	txnid;		/* Transaction ID */
-	DB_LSN		lsn;		/* Lsn of the begin record */
-};
-
-struct __db_txn_stat {
-	DB_LSN	  st_last_ckp;		/* lsn of the last checkpoint */
-	DB_LSN	  st_pending_ckp;	/* last checkpoint did not finish */
-	time_t	  st_time_ckp;		/* time of last checkpoint */
-	u_int32_t st_last_txnid;	/* last transaction id given out */
-	u_int32_t st_maxtxns;	/* maximum number of active txns */
-	u_int32_t st_naborts;	/* number of aborted transactions */
-	u_int32_t st_nbegins;	/* number of begun transactions */
-	u_int32_t st_ncommits;	/* number of committed transactions */
-	u_int32_t st_nactive;	/* number of active transactions */
-	DB_TXN_ACTIVE
-		 *st_txnarray;	/* array of active transactions */
-	u_int32_t st_region_wait;	/* Region lock granted after wait. */
-	u_int32_t st_region_nowait;	/* Region lock granted without wait. */
-	u_int32_t st_refcnt;		/* Region reference count. */
-	u_int32_t st_regsize;		/* Region size. */
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	  txn_abort __P((DB_TXN *));
-int	  txn_begin __P((DB_TXNMGR *, DB_TXN *, DB_TXN **));
-int	  txn_checkpoint __P((const DB_TXNMGR *, u_int32_t, u_int32_t));
-int	  txn_commit __P((DB_TXN *));
-int	  txn_close __P((DB_TXNMGR *));
-u_int32_t txn_id __P((DB_TXN *));
-int	  txn_open __P((const char *, u_int32_t, int, DB_ENV *, DB_TXNMGR **));
-int	  txn_prepare __P((DB_TXN *));
-int	  txn_stat __P((DB_TXNMGR *, DB_TXN_STAT **, void *(*)(size_t)));
-int	  txn_unlink __P((const char *, int, DB_ENV *));
-#if defined(__cplusplus)
-}
-#endif
-
-#ifndef DB_DBM_HSEARCH
-#define	DB_DBM_HSEARCH	0		/* No historic interfaces by default. */
-#endif
-#if DB_DBM_HSEARCH != 0
-/*******************************************************
- * Dbm/Ndbm historic interfaces.
- *******************************************************/
-#define	DBM_INSERT	0		/* Flags to dbm_store(). */
-#define	DBM_REPLACE	1
-
-/*
- * The db(3) support for ndbm(3) always appends this suffix to the
- * file name to avoid overwriting the user's original database.
- */
-#define	DBM_SUFFIX	".db"
-
-#if defined(_XPG4_2)
-typedef struct {
-	char *dptr;
-	size_t dsize;
-} datum;
-#else
-typedef struct {
-	char *dptr;
-	int dsize;
-} datum;
-#endif
-
-/*
- * Translate DBM calls into DB calls so that DB doesn't step on the
- * application's name space.
- *
- * The global variables dbrdonly, dirf and pagf were not retained when
- * 4BSD replaced the dbm interface with ndbm, and are not support here.
- */
-#define	dbminit(a)	__db_dbm_init(a)
-#if !defined(__cplusplus)
-#define	delete(a)	__db_dbm_delete(a)
-#endif
-#define	fetch(a)	__db_dbm_fetch(a)
-#define	firstkey	__db_dbm_firstkey
-#define	nextkey(a)	__db_dbm_nextkey(a)
-#define	store(a, b)	__db_dbm_store(a, b)
-
-/* Prototype the DB calls. */
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	 __db_dbm_init __P((char *));
-int	 __db_dbm_delete __P((datum));
-int	 __db_dbm_dbrdonly __P((void));
-int	 __db_dbm_dirf __P((void));
-datum	 __db_dbm_fetch __P((datum));
-datum	 __db_dbm_firstkey __P((void));
-datum	 __db_dbm_nextkey __P((datum));
-int	 __db_dbm_pagf __P((void));
-int	 __db_dbm_store __P((datum, datum));
-#if defined(__cplusplus)
-}
-#endif
-
-/*
- * Translate NDBM calls into DB calls so that DB doesn't step on the
- * application's name space.
- */
-#define	dbm_clearerr(a)		__db_ndbm_clearerr(a)
-#define	dbm_close(a)		__db_ndbm_close(a)
-#define	dbm_delete(a, b)	__db_ndbm_delete(a, b)
-#define	dbm_dirfno(a)		__db_ndbm_dirfno(a)
-#define	dbm_error(a)		__db_ndbm_error(a)
-#define	dbm_fetch(a, b)		__db_ndbm_fetch(a, b)
-#define	dbm_firstkey(a)		__db_ndbm_firstkey(a)
-#define	dbm_nextkey(a)		__db_ndbm_nextkey(a)
-#define	dbm_open(a, b, c)	__db_ndbm_open(a, b, c)
-#define	dbm_pagfno(a)		__db_ndbm_pagfno(a)
-#define	dbm_rdonly(a)		__db_ndbm_rdonly(a)
-#define	dbm_store(a, b, c, d)	__db_ndbm_store(a, b, c, d)
-
-/* Prototype the DB calls. */
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	 __db_ndbm_clearerr __P((DBM *));
-void	 __db_ndbm_close __P((DBM *));
-int	 __db_ndbm_delete __P((DBM *, datum));
-int	 __db_ndbm_dirfno __P((DBM *));
-int	 __db_ndbm_error __P((DBM *));
-datum	 __db_ndbm_fetch __P((DBM *, datum));
-datum	 __db_ndbm_firstkey __P((DBM *));
-datum	 __db_ndbm_nextkey __P((DBM *));
-DBM	*__db_ndbm_open __P((const char *, int, int));
-int	 __db_ndbm_pagfno __P((DBM *));
-int	 __db_ndbm_rdonly __P((DBM *));
-int	 __db_ndbm_store __P((DBM *, datum, datum, int));
-#if defined(__cplusplus)
-}
-#endif
-
-/*******************************************************
- * Hsearch historic interface.
- *******************************************************/
-typedef enum {
-	FIND, ENTER
-} ACTION;
-
-typedef struct entry {
-	char *key;
-	char *data;
-} ENTRY;
-
-/*
- * Translate HSEARCH calls into DB calls so that DB doesn't step on the
- * application's name space.
- */
-#define	hcreate(a)	__db_hcreate(a)
-#define	hdestroy	__db_hdestroy
-#define	hsearch(a, b)	__db_hsearch(a, b)
-
-/* Prototype the DB calls. */
-#if defined(__cplusplus)
-extern "C" {
-#endif
-int	 __db_hcreate __P((size_t));
-void	 __db_hdestroy __P((void));
-ENTRY	*__db_hsearch __P((ENTRY, ACTION));
-#if defined(__cplusplus)
-}
-#endif
-#endif /* DB_DBM_HSEARCH */
-
-/*
- * XXX
- * MacOS: Reset Metrowerks C enum sizes.
- */
-#ifdef __MWERKS__
-#pragma enumsalwaysint reset
-#endif
-#endif /* !_DB_H_ */
diff --git a/db2/include/db_am.h b/db2/include/db_am.h
index 0c189244a2..fe2176d772 100644
--- a/db2/include/db_am.h
+++ b/db2/include/db_am.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db_am.h	10.9 (Sleepycat) 4/10/98
+ *	@(#)db_am.h	10.15 (Sleepycat) 11/22/98
  */
 #ifndef _DB_AM_H
 #define _DB_AM_H
@@ -16,6 +16,8 @@
 #define	DB_REM_BIG	0x40
 #define	DB_SPLITOLD	0x50
 #define	DB_SPLITNEW	0x60
+#define	DB_ADD_PAGE	0x70
+#define	DB_REM_PAGE	0x80
 
 /*
  * Standard initialization and shutdown macros for all recovery functions.
@@ -27,34 +29,31 @@
  *	int ret;
  */
 #define	REC_INTRO(func) {						\
-	file_dbp = mdbp = NULL;						\
+	file_dbp = NULL;						\
+	dbc = NULL;							\
 	if ((ret = func(dbtp->data, &argp)) != 0)			\
 		goto out;						\
-	if ((ret = __db_fileid_to_db(logp, &mdbp, argp->fileid)) != 0) {\
-		if (ret	== DB_DELETED)					\
+	if ((ret =							\
+	    __db_fileid_to_db(logp, &file_dbp, argp->fileid)) != 0) {	\
+		if (ret	== DB_DELETED) {				\
 			ret = 0;					\
+			goto done;					\
+		}							\
 		goto out;						\
 	}								\
-	if (mdbp == NULL)						\
+	if (file_dbp == NULL)						\
 		goto out;						\
-	if (F_ISSET(mdbp, DB_AM_THREAD)) {				\
-		if ((ret = __db_gethandle(mdbp,				\
-		    mdbp->type == DB_HASH ? __ham_hdup : __bam_bdup,	\
-		    &file_dbp)) != 0)					\
-			goto out;					\
-	} else								\
-		file_dbp = mdbp;					\
-	F_SET(file_dbp, DB_AM_RECOVER);					\
+	if ((ret = file_dbp->cursor(file_dbp, NULL, &dbc, 0)) != 0)	\
+		goto out;						\
+	F_SET(dbc, DBC_RECOVER);					\
 	mpf = file_dbp->mpf;						\
 }
+
 #define	REC_CLOSE {							\
 	if (argp != NULL)						\
-		__db_free(argp);					\
-	if (file_dbp != NULL) {						\
-		F_CLR(file_dbp, DB_AM_RECOVER);				\
-		if (F_ISSET(file_dbp, DB_AM_THREAD))			\
-			__db_puthandle(file_dbp);			\
-	}								\
+		__os_free(argp, sizeof(*argp));				\
+	if (dbc != NULL)						\
+		dbc->c_close(dbc);					\
 	return (ret);							\
 }
 
@@ -67,7 +66,7 @@
 }
 #define	REC_NOOP_CLOSE {						\
 	if (argp != NULL)						\
-		__db_free(argp);					\
+		__os_free(argp, sizeof(*argp));				\
 	return (ret);							\
 }
 
diff --git a/db2/include/db_auto.h b/db2/include/db_auto.h
index 1b07c748e8..0d1e43a26a 100644
--- a/db2/include/db_auto.h
+++ b/db2/include/db_auto.h
@@ -70,6 +70,7 @@ typedef struct _db_relink_args {
 	u_int32_t type;
 	DB_TXN *txnid;
 	DB_LSN prev_lsn;
+	u_int32_t	opcode;
 	u_int32_t	fileid;
 	db_pgno_t	pgno;
 	DB_LSN 	lsn;
@@ -107,16 +108,4 @@ typedef struct _db_debug_args {
 	u_int32_t	arg_flags;
 } __db_debug_args;
 
-
-#define	DB_db_noop	(DB_db_BEGIN + 8)
-
-typedef struct _db_noop_args {
-	u_int32_t type;
-	DB_TXN *txnid;
-	DB_LSN prev_lsn;
-	u_int32_t	fileid;
-	db_pgno_t	pgno;
-	DB_LSN 	prevlsn;
-} __db_noop_args;
-
 #endif
diff --git a/db2/include/db_cxx.h b/db2/include/db_cxx.h
index fc04d5d66b..f415d594b5 100644
--- a/db2/include/db_cxx.h
+++ b/db2/include/db_cxx.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db_cxx.h	10.17 (Sleepycat) 5/2/98
+ *	@(#)db_cxx.h	10.30 (Sleepycat) 11/22/98
  */
 
 #ifndef _DB_CXX_H_
@@ -49,7 +49,8 @@
 // Forward declarations
 //
 
-#include "db.h"
+#include <iostream.h>
+#include <db.h>
 
 class Db;                                        // forward
 class Dbc;                                       // forward
@@ -66,6 +67,19 @@ class Dbt;                                       // forward
 class DbTxn;                                     // forward
 class DbTxnMgr;                                  // forward
 
+// These classes are not defined here and should be invisible
+// to the user, but some compilers require forward references.
+// There is one for each use of the DEFINE_DB_CLASS macro.
+
+class DbLockTabImp;
+class DbLogImp;
+class DbMpoolImp;
+class DbMpoolFileImp;
+class DbImp;
+class DbTxnImp;
+class DbTxnMgrImp;
+
+
 ////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////
 //
@@ -175,15 +189,11 @@ private:
 
 class _exported DbLock
 {
-    friend DbLockTab;
+    friend class DbLockTab;
 
 public:
-    DbLock(u_int);
     DbLock();
 
-    u_int get_lock_id();
-    void set_lock_id(u_int);
-
     int put(DbLockTab *locktab);
 
     DbLock(const DbLock &);
@@ -194,18 +204,21 @@ protected:
     // since its contained class is not allocated by db.
     // (see comment at top)
 
+    DbLock(DB_LOCK);
     DB_LOCK lock_;
 };
 
 class _exported DbLockTab
 {
-friend DbEnv;
+    friend class DbEnv;
+
 public:
     int close();
     int detect(u_int32_t flags, int atype);
     int get(u_int32_t locker, u_int32_t flags, const Dbt *obj,
             db_lockmode_t lock_mode, DbLock *lock);
     int id(u_int32_t *idp);
+    int stat(DB_LOCK_STAT **statp, void *(*db_malloc)(size_t));
     int vec(u_int32_t locker, u_int32_t flags, DB_LOCKREQ list[],
 	    int nlist, DB_LOCKREQ **elistp);
 
@@ -244,13 +257,14 @@ private:
 
 class _exported DbLsn : protected DB_LSN
 {
-    friend DbLog;               // friendship needed to cast to base class
-    friend DbMpool;
+    friend class DbLog;          // friendship needed to cast to base class
+    friend class DbMpool;
 };
 
 class _exported DbLog
 {
-friend DbEnv;
+    friend class DbEnv;
+
 public:
     int archive(char **list[], u_int32_t flags, void *(*db_malloc)(size_t));
     int close();
@@ -300,7 +314,8 @@ private:
 
 class _exported DbMpoolFile
 {
-friend DbEnv;
+    friend class DbEnv;
+
 public:
     int close();
     int get(db_pgno_t *pgnoaddr, u_int32_t flags, void *pagep);
@@ -337,7 +352,8 @@ private:
 
 class _exported DbMpool
 {
-friend DbEnv;
+    friend class DbEnv;
+
 public:
     int close();
 
@@ -388,7 +404,8 @@ private:
 
 class _exported DbTxnMgr
 {
-friend DbEnv;
+    friend class DbEnv;
+
 public:
     int begin(DbTxn *pid, DbTxn **tid);
     int checkpoint(u_int32_t kbyte, u_int32_t min) const;
@@ -422,7 +439,8 @@ private:
 
 class _exported DbTxn
 {
-friend DbTxnMgr;
+    friend class DbTxnMgr;
+
 public:
     int abort();
     int commit();
@@ -461,90 +479,78 @@ private:
 //
 class _exported DbInfo : protected DB_INFO
 {
-    friend DbEnv;
-    friend Db;
+    friend class DbEnv;
+    friend class Db;
 
 public:
     DbInfo();
     ~DbInfo();
 
     // Byte order.
-    int	get_lorder() const;
     void set_lorder(int);
 
     // Underlying cache size.
-    size_t get_cachesize() const;
     void set_cachesize(size_t);
 
     // Underlying page size.
-    size_t get_pagesize() const;
     void set_pagesize(size_t);
 
     // Local heap allocation.
     typedef void *(*db_malloc_fcn)(size_t);
-    db_malloc_fcn get_malloc() const;
     void set_malloc(db_malloc_fcn);
 
+    // Duplicate compare function.
+    typedef int (*dup_compare_fcn)(const DBT *, const DBT *);
+    void set_dup_compare(dup_compare_fcn);
+
     ////////////////////////////////////////////////////////////////
     // Btree access method.
 
     // Maximum keys per page.
-    int	get_bt_maxkey() const;
     void set_bt_maxkey(int);
 
     // Minimum keys per page.
-    int	get_bt_minkey() const;
     void set_bt_minkey(int);
 
     // Comparison function.
     typedef int (*bt_compare_fcn)(const DBT *, const DBT *);
-    bt_compare_fcn get_bt_compare() const;
     void set_bt_compare(bt_compare_fcn);
 
     // Prefix function.
     typedef size_t (*bt_prefix_fcn)(const DBT *, const DBT *);
-    bt_prefix_fcn get_bt_prefix() const;
     void set_bt_prefix(bt_prefix_fcn);
 
     ////////////////////////////////////////////////////////////////
     // Hash access method.
 
     // Fill factor.
-    u_int32_t get_h_ffactor() const;
     void set_h_ffactor(u_int32_t);
 
     // Number of elements.
-    u_int32_t get_h_nelem() const;
     void set_h_nelem(u_int32_t);
 
     // Hash function.
     typedef u_int32_t (*h_hash_fcn)(const void *, u_int32_t);
-    h_hash_fcn get_h_hash() const;
     void set_h_hash(h_hash_fcn);
 
     ////////////////////////////////////////////////////////////////
     // Recno access method.
 
     // Fixed-length padding byte.
-    int	get_re_pad() const;
     void set_re_pad(int);
 
     // Variable-length delimiting byte.
-    int	get_re_delim() const;
     void set_re_delim(int);
 
     // Length for fixed-length records.
-    u_int32_t get_re_len() const;
     void set_re_len(u_int32_t);
 
     // Source file name.
-    char *get_re_source() const;
     void set_re_source(char *);
 
     // Note: some flags are set as side effects of calling
     // above "set" methods.
     //
-    u_int32_t get_flags() const;
     void set_flags(u_int32_t);
 
 
@@ -570,11 +576,11 @@ private:
 //
 class _exported DbEnv : protected DB_ENV
 {
-friend DbTxnMgr;
-friend DbLog;
-friend DbLockTab;
-friend DbMpool;
-friend Db;
+    friend class DbTxnMgr;
+    friend class DbLog;
+    friend class DbLockTab;
+    friend class DbMpool;
+    friend class Db;
 
 public:
 
@@ -603,6 +609,10 @@ public:
     //
     int appexit();
 
+    // Version information.  A static method so it can be obtained anytime.
+    //
+    static char *version(int *major, int *minor, int *patch);
+
     ////////////////////////////////////////////////////////////////
     // simple get/set access methods
     //
@@ -610,74 +620,41 @@ public:
     // use the default constructor along with appinit().
 
     // Byte order.
-    int	get_lorder() const;
     void set_lorder(int);
 
+    // Panic callback.
+    typedef void (*db_paniccall_fcn)(DbEnv *, int);
+    void set_paniccall(db_paniccall_fcn);
+
     // Error message callback.
     typedef void (*db_errcall_fcn)(const char *, char *);
-    db_errcall_fcn get_errcall() const;
     void set_errcall(db_errcall_fcn);
 
     // Error message file stream.
-    FILE *get_errfile() const;
     void set_errfile(FILE *);
 
     // Error message prefix.
-    const char *get_errpfx() const;
     void set_errpfx(const char *);
 
     // Generate debugging messages.
-    int get_verbose() const;
     void set_verbose(int);
 
     ////////////////////////////////////////////////////////////////
-    // User paths.
-
-    // Database home.
-    char *get_home() const;
-    void set_home(char *);
-
-    // Database log file directory.
-    char *get_log_dir() const;
-    void set_log_dir(char *);
-
-    // Database tmp file directory.
-    char *get_tmp_dir() const;
-    void set_tmp_dir(char *);
-
-    // Database data file directories.
-    char **get_data_dir() const;
-    void set_data_dir(char **);
-
-    // Database data file slots.
-    int get_data_cnt() const;
-    void set_data_cnt(int);
-
-    // Next Database data file slot.
-    int get_data_next() const;
-    void set_data_next(int);
-
-
-    ////////////////////////////////////////////////////////////////
     // Locking.
 
     // Return from lock_open().
     DbLockTab *get_lk_info() const;
 
     // Two dimensional conflict matrix.
-    u_int8_t *get_lk_conflicts() const;
     void set_lk_conflicts(u_int8_t *);
 
     // Number of lock modes in table.
-    int get_lk_modes() const;
     void set_lk_modes(int);
 
     // Maximum number of locks.
-    u_int32_t get_lk_max() const;
     void set_lk_max(u_int32_t);
 
     // Deadlock detect on every conflict.
-    u_int32_t get_lk_detect() const;
     void set_lk_detect(u_int32_t);
 
 
@@ -688,7 +665,6 @@ public:
     DbLog *get_lg_info() const;
 
     // Maximum file size.
-    u_int32_t get_lg_max() const;
     void set_lg_max(u_int32_t);
 
 
@@ -699,11 +675,9 @@ public:
     DbMpool *get_mp_info() const;
 
     // Maximum file size for mmap.
-    size_t get_mp_mmapsize() const;
     void set_mp_mmapsize(size_t);
 
     // Bytes in the mpool cache.
-    size_t get_mp_size() const;
     void set_mp_size(size_t);
 
 
@@ -714,16 +688,13 @@ public:
     DbTxnMgr *get_tx_info() const;
 
     // Maximum number of transactions.
-    u_int32_t get_tx_max() const;
     void set_tx_max(u_int32_t);
 
     // Dispatch function for recovery.
     typedef int (*tx_recover_fcn)(DB_LOG *, DBT *, DB_LSN *, int, void *);
-    tx_recover_fcn get_tx_recover() const;
     void set_tx_recover(tx_recover_fcn);
 
     // Flags.
-    u_int32_t get_flags() const;
     void set_flags(u_int32_t);
 
     ////////////////////////////////////////////////////////////////
@@ -736,7 +707,6 @@ public:
     //
     enum ErrorModel { Exception, ErrorReturn };
     void set_error_model(ErrorModel);
-    ErrorModel get_error_model() const;
 
     // If an error is detected and the error call function
     // or stream is set, a message is dispatched or printed.
@@ -747,11 +717,11 @@ public:
     // call set_error_stream() to force all errors to a C++ stream.
     // It is unwise to mix these approaches.
     //
-    class ostream* get_error_stream() const;
     void set_error_stream(class ostream*);
 
     // used internally
-    static int runtime_error(const char *caller, int err, int in_destructor = 0);
+    static int runtime_error(const char *caller, int err,
+                             int in_destructor = 0, int force_throw = 0);
 
 private:
     // We can add data to this class if needed
@@ -778,23 +748,27 @@ private:
 //
 class _exported Db
 {
-    friend DbEnv;
+    friend class DbEnv;
 
 public:
     int close(u_int32_t flags);
-    int cursor(DbTxn *txnid, Dbc **cursorp);
+    int cursor(DbTxn *txnid, Dbc **cursorp, u_int32_t flags);
     int del(DbTxn *txnid, Dbt *key, u_int32_t flags);
     int fd(int *fdp);
     int get(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags);
+    int join(Dbc **curslist, u_int32_t flags, Dbc **dbcp);
     int put(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags);
     int stat(void *sp, void *(*db_malloc)(size_t), u_int32_t flags);
     int sync(u_int32_t flags);
 
+    int get_byteswapped() const;
     DBTYPE get_type() const;
 
     static int open(const char *fname, DBTYPE type, u_int32_t flags,
                     int mode, DbEnv *dbenv, DbInfo *info, Db **dbpp);
 
+    static int xa_open(const char *fname, DBTYPE type, u_int32_t flags,
+                    int mode, DbInfo *info, Db **dbpp);
 private:
     // We can add data to this class if needed
     // since it is implemented via a pointer.
@@ -817,11 +791,11 @@ private:
 //
 class _exported Dbt : private DBT
 {
-    friend Dbc;
-    friend Db;
-    friend DbLog;
-    friend DbMpoolFile;
-    friend DbLockTab;
+    friend class Dbc;
+    friend class Db;
+    friend class DbLog;
+    friend class DbMpoolFile;
+    friend class DbLockTab;
 
 public:
 
@@ -863,7 +837,7 @@ private:
 
 class _exported Dbc : protected DBC
 {
-    friend Db;
+    friend class Db;
 
 public:
     int close();
diff --git a/db2/include/db_ext.h b/db2/include/db_ext.h
index 8a03db9f64..1ad1643bfa 100644
--- a/db2/include/db_ext.h
+++ b/db2/include/db_ext.h
@@ -1,8 +1,11 @@
 /* DO NOT EDIT: automatically built by dist/distrib. */
 #ifndef _db_ext_h_
 #define _db_ext_h_
-int __db_pgerr __P((DB *, db_pgno_t));
-int __db_pgfmt __P((DB *, db_pgno_t));
+int __db_close __P((DB *, u_int32_t));
+int __db_init_wrapper __P((DB *));
+int __db_cprint __P((DB *));
+int __db_c_destroy __P((DBC *));
+int __db_sync __P((DB *, u_int32_t));
 int __db_addrem_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
     u_int32_t, u_int32_t, db_pgno_t, u_int32_t,
@@ -33,8 +36,8 @@ int __db_ovref_print
 int __db_ovref_read __P((void *, __db_ovref_args **));
 int __db_relink_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
-    u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t,
-    DB_LSN *, db_pgno_t, DB_LSN *));
+    u_int32_t, u_int32_t, db_pgno_t, DB_LSN *,
+    db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *));
 int __db_relink_print
    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __db_relink_read __P((void *, __db_relink_args **));
@@ -52,12 +55,6 @@ int __db_debug_log
 int __db_debug_print
    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __db_debug_read __P((void *, __db_debug_args **));
-int __db_noop_log
-    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
-    u_int32_t, db_pgno_t, DB_LSN *));
-int __db_noop_print
-   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
-int __db_noop_read __P((void *, __db_noop_args **));
 int __db_init_print __P((DB_ENV *));
 int __db_init_recover __P((DB_ENV *));
 int __db_pgin __P((db_pgno_t, size_t, void *));
@@ -71,23 +68,40 @@ int __db_txnlist_find __P((void *, u_int32_t));
 void __db_txnlist_end __P((void *));
 void __db_txnlist_gen __P((void *, int));
 void __db_txnlist_print __P((void *));
-int __db_dput __P((DB *,
-   DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
-int __db_drem __P((DB *,
-   PAGE **, u_int32_t, int (*)(DB *, PAGE *)));
-int __db_dend __P((DB *, db_pgno_t, PAGE **));
- int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t));
+int __db_dput __P((DBC *, DBT *,
+   PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **)));
+int __db_drem __P((DBC *,
+   PAGE **, u_int32_t, int (*)(DBC *, PAGE *)));
+int __db_dend __P((DBC *, db_pgno_t, PAGE **));
+ int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
 int __db_pitem
-    __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
-int __db_relink __P((DB *, PAGE *, PAGE **, int));
-int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
+    __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __db_relink __P((DBC *, u_int32_t, PAGE *, PAGE **, int));
+int __db_ddup __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
+int __db_dsearch __P((DBC *,
+    int, DBT *, db_pgno_t, db_indx_t *, PAGE **, int *));
+int __db_cdelchk __P((const DB *, u_int32_t, int, int));
+int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
+int __db_cputchk __P((const DB *,
+   const DBT *, DBT *, u_int32_t, int, int));
+int __db_closechk __P((const DB *, u_int32_t));
+int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
+int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
+int __db_joinchk __P((const DB *, u_int32_t));
+int __db_putchk
+   __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
+int __db_statchk __P((const DB *, u_int32_t));
+int __db_syncchk __P((const DB *, u_int32_t));
+int __db_eopnotsup __P((const DB_ENV *));
+int __db_join __P((DB *, DBC **, u_int32_t, DBC **));
 int __db_goff __P((DB *, DBT *,
     u_int32_t, db_pgno_t, void **, u_int32_t *));
-int __db_poff __P((DB *, const DBT *, db_pgno_t *,
-    int (*)(DB *, u_int32_t, PAGE **)));
-int __db_ovref __P((DB *, db_pgno_t, int32_t));
-int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
-int __db_moff __P((DB *, const DBT *, db_pgno_t));
+int __db_poff __P((DBC *, const DBT *, db_pgno_t *,
+    int (*)(DBC *, u_int32_t, PAGE **)));
+int __db_ovref __P((DBC *, db_pgno_t, int32_t));
+int __db_doff __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
+int __db_moff __P((DB *, const DBT *, db_pgno_t, u_int32_t,
+    int (*)(const DBT *, const DBT *), int *));
 void __db_loadme __P((void));
 FILE *__db_prinit __P((FILE *));
 int __db_dump __P((DB *, char *, int));
@@ -111,11 +125,8 @@ int __db_relink_recover
 int __db_addpage_recover
    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __db_debug_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
-int __db_noop_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __db_ret __P((DB *,
    PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
 int __db_retcopy __P((DBT *,
    void *, u_int32_t, void **, u_int32_t *, void *(*)(size_t)));
-int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **));
-int __db_puthandle __P((DB *));
 #endif /* _db_ext_h_ */
diff --git a/db2/include/db_int.h.src b/db2/include/db_int.h.src
deleted file mode 100644
index d67e2c428c..0000000000
--- a/db2/include/db_int.h.src
+++ /dev/null
@@ -1,402 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- *
- *	@(#)db_int.h.src	10.62 (Sleepycat) 5/23/98
- */
-
-#ifndef _DB_INTERNAL_H_
-#define	_DB_INTERNAL_H_
-
-#include "db.h"				/* Standard DB include file. */
-#include "queue.h"
-
-/*******************************************************
- * General purpose constants and macros.
- *******************************************************/
-#define	UINT16_T_MAX	    0xffff	/* Maximum 16 bit unsigned. */
-#define	UINT32_T_MAX	0xffffffff	/* Maximum 32 bit unsigned. */
-
-#define	DB_MIN_PGSIZE	0x000200	/* Minimum page size. */
-#define	DB_MAX_PGSIZE	0x010000	/* Maximum page size. */
-
-#define	DB_MINCACHE	10		/* Minimum cached pages */
-
-#define	MEGABYTE	1048576
-
-/*
- * If we are unable to determine the underlying filesystem block size, use
- * 8K on the grounds that most OS's use less than 8K as their VM page size.
- */
-#define	DB_DEF_IOSIZE	(8 * 1024)
-
-/*
- * Aligning items to particular sizes or in pages or memory.  ALIGNP is a
- * separate macro, as we've had to cast the pointer to different integral
- * types on different architectures.
- *
- * We cast pointers into unsigned longs when manipulating them because C89
- * guarantees that u_long is the largest available integral type and further,
- * to never generate overflows.  However, neither C89 or C9X  requires that
- * any integer type be large enough to hold a pointer, although C9X created
- * the intptr_t type, which is guaranteed to hold a pointer but may or may
- * not exist.  At some point in the future, we should test for intptr_t and
- * use it where available.
- */
-#undef	ALIGNTYPE
-#define	ALIGNTYPE		u_long
-#undef	ALIGNP
-#define	ALIGNP(value, bound)	ALIGN((ALIGNTYPE)value, bound)
-#undef	ALIGN
-#define	ALIGN(value, bound)	(((value) + (bound) - 1) & ~((bound) - 1))
-
-/*
- * There are several on-page structures that are declared to have a number of
- * fields followed by a variable length array of items.  The structure size
- * without including the variable length array or the address of the first of
- * those elements can be found using SSZ.
- *
- * This macro can also be used to find the offset of a structure element in a
- * structure.  This is used in various places to copy structure elements from
- * unaligned memory references, e.g., pointers into a packed page.
- *
- * There are two versions because compilers object if you take the address of
- * an array.
- */
-#undef	SSZ
-#define SSZ(name, field)	((int)&(((name *)0)->field))
-
-#undef	SSZA
-#define SSZA(name, field)	((int)&(((name *)0)->field[0]))
-
-/* Macros to return per-process address, offsets based on shared regions. */
-#define	R_ADDR(base, offset)	((void *)((u_int8_t *)((base)->addr) + offset))
-#define	R_OFFSET(base, p)	((u_int8_t *)(p) - (u_int8_t *)(base)->addr)
-
-/* Free and free-string macros that overwrite memory. */
-#ifdef DIAGNOSTIC
-#undef	FREE
-#define	FREE(p, len) {							\
-	memset(p, 0xff, len);						\
-	__db_free(p);							\
-}
-#undef	FREES
-#define	FREES(p) {							\
-	FREE(p, strlen(p));						\
-}
-#else
-#undef	FREE
-#define	FREE(p, len) {							\
-	__db_free(p);							\
-}
-#undef	FREES
-#define	FREES(p) {							\
-	__db_free(p);							\
-}
-#endif
-
-/* Structure used to print flag values. */
-typedef struct __fn {
-	u_int32_t mask;			/* Flag value. */
-	const char *name;		/* Flag name. */
-} FN;
-
-/* Set, clear and test flags. */
-#define	F_SET(p, f)	(p)->flags |= (f)
-#define	F_CLR(p, f)	(p)->flags &= ~(f)
-#define	F_ISSET(p, f)	((p)->flags & (f))
-#define	LF_SET(f)	(flags |= (f))
-#define	LF_CLR(f)	(flags &= ~(f))
-#define	LF_ISSET(f)	(flags & (f))
-
-/* Display separator string. */
-#undef	DB_LINE
-#define	DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
-
-/* Global variables. */
-typedef struct __db_globals {
-	int db_mutexlocks;		/* DB_MUTEXLOCKS */
-	int db_region_anon;		/* DB_REGION_ANON, DB_REGION_NAME */
-	int db_region_init;		/* DB_REGION_INIT */
-	int db_tsl_spins;		/* DB_TSL_SPINS */
-	int db_pageyield;		/* DB_PAGEYIELD */
-} DB_GLOBALS;
-extern	DB_GLOBALS	__db_global_values;
-#define	DB_GLOBAL(v)	__db_global_values.v
-
-/* Unused, or not-used-yet variable.  "Shut that bloody compiler up!" */
-#define	COMPQUIET(n, v)	(n) = (v)
-
-/*
- * Win16 needs specific syntax on callback functions.  Nobody else cares.
- */
-#ifndef	DB_CALLBACK
-#define	DB_CALLBACK	/* Nothing. */
-#endif
-
-/*******************************************************
- * Files.
- *******************************************************/
- /*
-  * We use 1024 as the maximum path length.  It's too hard to figure out what
-  * the real path length is, as it was traditionally stored in <sys/param.h>,
-  * and that file isn't always available.
-  */
-#undef	MAXPATHLEN
-#define	MAXPATHLEN	1024
-
-#define	PATH_DOT	"."	/* Current working directory. */
-#define	PATH_SEPARATOR	"/"	/* Path separator character. */
-
-/*******************************************************
- * Mutex support.
- *******************************************************/
-@spin_line1@
-@spin_line2@
-@spin_line3@
-
-/*
- * !!!
- * Various systems require different alignments for mutexes (the worst we've
- * seen so far is 16-bytes on some HP architectures).  The mutex (tsl_t) must
- * be first in the db_mutex_t structure, which must itself be first in the
- * region.  This ensures the alignment is as returned by mmap(2), which should
- * be sufficient.  All other mutex users must ensure proper alignment locally.
- */
-#define	MUTEX_ALIGNMENT	@mutex_align@
-
-/*
- * The offset of a mutex in memory.
- *
- * !!!
- * Not an off_t, so backing file offsets MUST be less than 4Gb.  See the
- * off field of the db_mutex_t as well.
- */
-#define	MUTEX_LOCK_OFFSET(a, b)	((u_int32_t)((u_int8_t *)b - (u_int8_t *)a))
-
-typedef struct _db_mutex_t {
-#ifdef HAVE_SPINLOCKS
-	tsl_t	  tsl_resource;		/* Resource test and set. */
-#ifdef DIAGNOSTIC
-	u_int32_t pid;			/* Lock holder: 0 or process pid. */
-#endif
-#else
-	u_int32_t off;			/* Backing file offset. */
-	u_int32_t pid;			/* Lock holder: 0 or process pid. */
-#endif
-	u_int32_t spins;		/* Spins before block. */
-	u_int32_t mutex_set_wait;	/* Granted after wait. */
-	u_int32_t mutex_set_nowait;	/* Granted without waiting. */
-} db_mutex_t;
-
-#include "mutex_ext.h"
-
-/*******************************************************
- * Access methods.
- *******************************************************/
-/* Lock/unlock a DB thread. */
-#define	DB_THREAD_LOCK(dbp)						\
-	if (F_ISSET(dbp, DB_AM_THREAD))					\
-	    (void)__db_mutex_lock((db_mutex_t *)(dbp)->mutexp, -1);
-#define	DB_THREAD_UNLOCK(dbp)						\
-	if (F_ISSET(dbp, DB_AM_THREAD))					\
-	    (void)__db_mutex_unlock((db_mutex_t *)(dbp)->mutexp, -1);
-
-/* Btree/recno local statistics structure. */
-struct __db_bt_lstat;	typedef struct __db_bt_lstat DB_BTREE_LSTAT;
-struct __db_bt_lstat {
-	u_int32_t bt_freed;		/* Pages freed for reuse. */
-	u_int32_t bt_pfxsaved;		/* Bytes saved by prefix compression. */
-	u_int32_t bt_split;		/* Total number of splits. */
-	u_int32_t bt_rootsplit;		/* Root page splits. */
-	u_int32_t bt_fastsplit;		/* Fast splits. */
-	u_int32_t bt_added;		/* Items added. */
-	u_int32_t bt_deleted;		/* Items deleted. */
-	u_int32_t bt_get;		/* Items retrieved. */
-	u_int32_t bt_cache_hit;		/* Hits in fast-insert code. */
-	u_int32_t bt_cache_miss;	/* Misses in fast-insert code. */
-};
-
-/*******************************************************
- * Environment.
- *******************************************************/
-/* Type passed to __db_appname(). */
-typedef enum {
-	DB_APP_NONE=0,			/* No type (region). */
-	DB_APP_DATA,			/* Data file. */
-	DB_APP_LOG,			/* Log file. */
-	DB_APP_TMP			/* Temporary file. */
-} APPNAME;
-
-/*******************************************************
- * Shared memory regions.
- *******************************************************/
-/*
- * The shared memory regions share an initial structure so that the general
- * region code can handle races between the region being deleted and other
- * processes waiting on the region mutex.
- *
- * !!!
- * Note, the mutex must be the first entry in the region; see comment above.
- */
-typedef struct _rlayout {
-	db_mutex_t lock;		/* Region mutex. */
-#define	DB_REGIONMAGIC	0x120897
-	u_int32_t  valid;		/* Valid magic number. */
-	u_int32_t  refcnt;		/* Region reference count. */
-	size_t	   size;		/* Region length. */
-	int	   majver;		/* Major version number. */
-	int	   minver;		/* Minor version number. */
-	int	   patch;		/* Patch version number. */
-#define	INVALID_SEGID	-1
-	int	   segid;		/* shmget(2) ID, or Win16 segment ID. */
-
-#define	REGION_ANONYMOUS	0x01	/* Region is/should be in anon mem. */
-	u_int32_t  flags;
-} RLAYOUT;
-
-/*
- * DB creates all regions on 4K boundaries out of sheer paranoia, so that
- * we don't make the underlying VM unhappy.
- */
-#define	DB_VMPAGESIZE	(4 * 1024)
-#define	DB_ROUNDOFF(i) {						\
-	(i) += DB_VMPAGESIZE - 1;					\
-	(i) -= (i) % DB_VMPAGESIZE;					\
-}
-
-/*
- * The interface to region attach is nasty, there is a lot of complex stuff
- * going on, which has to be retained between create/attach and detach.  The
- * REGINFO structure keeps track of it.
- */
-struct __db_reginfo;	typedef struct __db_reginfo REGINFO;
-struct __db_reginfo {
-					/* Arguments. */
-	DB_ENV	   *dbenv;		/* Region naming info. */
-	APPNAME	    appname;		/* Region naming info. */
-	char	   *path;		/* Region naming info. */
-	const char *file;		/* Region naming info. */
-	int	    mode;		/* Region mode, if a file. */
-	size_t	    size;		/* Region size. */
-	u_int32_t   dbflags;		/* Region file open flags, if a file. */
-
-					/* Results. */
-	char	   *name;		/* Region name. */
-	void	   *addr;		/* Region address. */
-	int	    fd;			/* Fcntl(2) locking file descriptor.
-					   NB: this is only valid if a regular
-					   file is backing the shared region,
-					   and mmap(2) is being used to map it
-					   into our address space. */
-	int	    segid;		/* shmget(2) ID, or Win16 segment ID. */
-
-					/* Shared flags. */
-/*				0x0001	COMMON MASK with RLAYOUT structure. */
-#define	REGION_CANGROW		0x0002	/* Can grow. */
-#define	REGION_CREATED		0x0004	/* Created. */
-#define	REGION_HOLDINGSYS	0x0008	/* Holding system resources. */
-#define	REGION_LASTDETACH	0x0010	/* Delete on last detach. */
-#define	REGION_MALLOC		0x0020	/* Created in malloc'd memory. */
-#define	REGION_PRIVATE		0x0040	/* Private to thread/process. */
-#define	REGION_REMOVED		0x0080	/* Already deleted. */
-#define	REGION_SIZEDEF		0x0100	/* Use default region size if exists. */
-	u_int32_t   flags;
-};
-
-/*******************************************************
- * Mpool.
- *******************************************************/
-/*
- * File types for DB access methods.  Negative numbers are reserved to DB.
- */
-#define	DB_FTYPE_BTREE		-1	/* Btree. */
-#define	DB_FTYPE_HASH		-2	/* Hash. */
-
-/* Structure used as the DB pgin/pgout pgcookie. */
-typedef struct __dbpginfo {
-	size_t	db_pagesize;		/* Underlying page size. */
-	int	needswap;		/* If swapping required. */
-} DB_PGINFO;
-
-/*******************************************************
- * Log.
- *******************************************************/
-/* Initialize an LSN to 'zero'. */
-#define	ZERO_LSN(LSN) {							\
-	(LSN).file = 0;							\
-	(LSN).offset = 0;						\
-}
-
-/* Return 1 if LSN is a 'zero' lsn, otherwise return 0. */
-#define	IS_ZERO_LSN(LSN)	((LSN).file == 0)
-
-/* Test if we need to log a change. */
-#define	DB_LOGGING(dbp)							\
-	(F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER))
-
-#ifdef DIAGNOSTIC
-/*
- * Debugging macro to log operations.
- *	If DEBUG_WOP is defined, log operations that modify the database.
- *	If DEBUG_ROP is defined, log operations that read the database.
- *
- * D dbp
- * T txn
- * O operation (string)
- * K key
- * A data
- * F flags
- */
-#define	LOG_OP(D, T, O, K, A, F) {					\
-	DB_LSN _lsn;							\
-	DBT _op;							\
-	if (DB_LOGGING((D))) {						\
-		memset(&_op, 0, sizeof(_op));				\
-		_op.data = O;						\
-		_op.size = strlen(O) + 1;				\
-		(void)__db_debug_log((D)->dbenv->lg_info,		\
-		    T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F);	\
-	}								\
-}
-#ifdef DEBUG_ROP
-#define	DEBUG_LREAD(D, T, O, K, A, F)	LOG_OP(D, T, O, K, A, F)
-#else
-#define	DEBUG_LREAD(D, T, O, K, A, F)
-#endif
-#ifdef DEBUG_WOP
-#define	DEBUG_LWRITE(D, T, O, K, A, F)	LOG_OP(D, T, O, K, A, F)
-#else
-#define	DEBUG_LWRITE(D, T, O, K, A, F)
-#endif
-#else
-#define	DEBUG_LREAD(D, T, O, K, A, F)
-#define	DEBUG_LWRITE(D, T, O, K, A, F)
-#endif /* DIAGNOSTIC */
-
-/*******************************************************
- * Transactions and recovery.
- *******************************************************/
-/*
- * Out of band value for a lock.  The locks are returned to callers as offsets
- * into the lock regions.  Since the RLAYOUT structure begins all regions, an
- * offset of 0 is guaranteed not to be a valid lock.
- */
-#define	LOCK_INVALID	0
-
-/* The structure allocated for every transaction. */
-struct __db_txn {
-	DB_TXNMGR	*mgrp;		/* Pointer to transaction manager. */
-	DB_TXN		*parent;	/* Pointer to transaction's parent. */
-	DB_LSN		last_lsn;	/* Lsn of last log write. */
-	u_int32_t	txnid;		/* Unique transaction id. */
-	size_t		off;		/* Detail structure within region. */
-	TAILQ_ENTRY(__db_txn) links;
-};
-
-#include "os_func.h"
-#include "os_ext.h"
-
-#endif /* !_DB_INTERNAL_H_ */
diff --git a/db2/include/db_join.h b/db2/include/db_join.h
new file mode 100644
index 0000000000..cb27e21f68
--- /dev/null
+++ b/db2/include/db_join.h
@@ -0,0 +1,23 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ *
+ *	@(#)db_join.h	10.2 (Sleepycat) 10/4/98
+ */
+
+#ifndef _DB_JOIN_H
+#define _DB_JOIN_H
+/*
+ * JOIN_CURSOR: joins use a join cursor that is similar to a regular DB
+ * cursor except that it only supports c_get and c_close functionality.
+ * Also, it does not support the full range of flags for get.
+ */
+typedef struct __join_cursor {
+	u_int32_t j_init;		/* Set when cursor is initialized. */
+	DBC 	**j_curslist;		/* Array of the cursors in the join. */
+	DB	 *j_primary;		/* Handle (dbp) of the primary DB. */
+	DBT	  j_key;		/* Key DBT used to do lookups. */
+} JOIN_CURSOR;
+#endif /* !_DB_JOIN_H */
diff --git a/db2/include/db_page.h b/db2/include/db_page.h
index e1846cbbbd..5c9ca674f1 100644
--- a/db2/include/db_page.h
+++ b/db2/include/db_page.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db_page.h	10.15 (Sleepycat) 5/1/98
+ *	@(#)db_page.h	10.18 (Sleepycat) 12/2/98
  */
 
 #ifndef _DB_PAGE_H_
@@ -43,14 +43,6 @@
 
 /*
  * Btree metadata page layout:
- *
- *	+-----------------------------------+
- *	|    lsn    |   pgno    |   magic   |
- *	+-----------------------------------+
- *	|   version |  pagesize |   free    |
- *	+-----------------------------------+
- *	|    flags  |  unused ...	    |
- *	+-----------------------------------+
  */
 typedef struct _btmeta {
 	DB_LSN	  lsn;		/* 00-07: LSN. */
@@ -72,10 +64,6 @@ typedef struct _btmeta {
 	u_int32_t re_pad;	/* 44-47: Recno: fixed-length record pad. */
 				/* 48-67: Unique file ID. */
 	u_int8_t  uid[DB_FILE_ID_LEN];
-
-	u_int32_t spare[13];	/* 68-123: Save some room for growth. */
-
-	DB_BTREE_LSTAT stat;	/* 124-163: Statistics. */
 } BTMETA;
 
 /************************************************************************
@@ -84,18 +72,6 @@ typedef struct _btmeta {
 
 /*
  * Hash metadata page layout:
- *
- *	+-----------------------------------+
- *	|    lsn    |   magic   |  version  |
- *	+-----------------------------------+
- *	|  pagesize | ovfl_point| last_freed|
- *	+-----------------------------------+
- *	| max_bucket| high_mask | low_mask  |
- *	+-----------------------------------+
- * 	| ffactor   |   nelem   | charkey   |
- *	+-----------------------------------+
- *	| spares[32]|   flags   | unused    |
- *	+-----------------------------------+
  */
 /* Hash Table Information */
 typedef struct hashhdr {	/* Disk resident portion */
@@ -359,10 +335,6 @@ typedef struct _hkeydata {
 
 /*
  * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
- *
- *	+-----------------------------------+
- *	|   type    |  pgno_t   | total len |
- *	+-----------------------------------+
  */
 typedef struct _hoffpage {
 	u_int8_t  type;		/*    00: Page type and delete flag. */
@@ -383,10 +355,6 @@ typedef struct _hoffpage {
 
 /*
  * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
- *
- *	+-----------------------+
- *	|   type    |  pgno_t   |
- *	+-----------------------+
  */
 typedef struct _hoffdup {
 	u_int8_t  type;		/*    00: Page type and delete flag. */
@@ -431,10 +399,6 @@ typedef struct _hoffdup {
 
 /*
  * The first type is B_KEYDATA, represented by the BKEYDATA structure:
- *
- *	+-----------------------------------+
- *	|   length  |    type   | key/data  |
- *	+-----------------------------------+
  */
 typedef struct _bkeydata {
 	db_indx_t len;		/* 00-01: Key/data item length. */
@@ -457,13 +421,7 @@ typedef struct _bkeydata {
 
 /*
  * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
- * by the BOVERFLOW structure:
- *
- *	+-----------------------------------+
- *	| total len |    type   |   unused  |
- *	+-----------------------------------+
- *	| nxt: page |  nxt: off | nxt: len  |
- *	+-----------------------------------+
+ * by the BOVERFLOW structure.
  */
 typedef struct _boverflow {
 	db_indx_t unused1;	/* 00-01: Padding, unused. */
@@ -501,10 +459,6 @@ typedef struct _boverflow {
 
 /*
  * Btree internal entry.
- *
- *	+-----------------------------------+
- *	| leaf pgno |   type    | data ...  |
- *	+-----------------------------------+
  */
 typedef struct _binternal {
 	db_indx_t  len;		/* 00-01: Key/data item length. */
@@ -535,12 +489,8 @@ typedef struct _binternal {
 /*
  * The recno internal entry.
  *
- *	+-----------------------+
- *	| leaf pgno | # of recs |
- *	+-----------------------+
- *
  * XXX
- * Why not fold this into the db_indx_t structure, it's fixed length.
+ * Why not fold this into the db_indx_t structure, it's fixed length?
  */
 typedef struct _rinternal {
 	db_pgno_t  pgno;	/* 00-03: Page number of referenced page. */
diff --git a/db2/include/hash.h b/db2/include/hash.h
index e55c2102cb..5d85a2a3a7 100644
--- a/db2/include/hash.h
+++ b/db2/include/hash.h
@@ -43,13 +43,22 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *	@(#)hash.h	10.8 (Sleepycat) 4/10/98
+ *	@(#)hash.h	10.14 (Sleepycat) 10/4/98
  */
 
 /* Cursor structure definitions. */
 typedef struct cursor_t {
-	DBC		*db_cursor;
+	DBC		*dbc;
+
+	/* Per-thread information */
+	DB_LOCK hlock;			/* Metadata page lock. */
+	HASHHDR *hdr;			/* Pointer to meta-data page. */
+	PAGE *split_buf;		/* Temporary buffer for splits. */
+	struct __db_h_stat stats;	/* Hash statistics. */
+
+	/* Hash cursor information */
 	db_pgno_t	bucket;		/* Bucket we are traversing. */
+	db_pgno_t	lbucket;	/* Bucket for which we are locked. */
 	DB_LOCK		lock;		/* Lock held on the current bucket. */
 	PAGE		*pagep;		/* The current page. */
 	db_pgno_t	pgno;		/* Current page number. */
@@ -62,104 +71,83 @@ typedef struct cursor_t {
 	db_indx_t	dup_tlen;	/* Total length of duplicate entry. */
 	u_int32_t	seek_size;	/* Number of bytes we need for add. */
 	db_pgno_t	seek_found_page;/* Page on which we can insert. */
-	u_int32_t	big_keylen;	/* Length of big_key buffer. */
-	void		*big_key;	/* Temporary buffer for big keys. */
-	u_int32_t	big_datalen;	/* Length of big_data buffer. */
-	void		*big_data;	/* Temporary buffer for big data. */
-#define	H_OK		0x0001
-#define	H_NOMORE	0x0002
-#define	H_DELETED	0x0004
-#define	H_ISDUP		0x0008
-#define	H_EXPAND	0x0020
-	u_int32_t	flags;		/* Is cursor inside a dup set. */
+
+#define	H_DELETED	0x0001		/* Cursor item is deleted. */
+#define	H_DUPONLY	0x0002		/* Dups only; do not change key. */
+#define	H_EXPAND	0x0004		/* Table expanded. */
+#define	H_ISDUP		0x0008		/* Cursor is within duplicate set. */
+#define	H_NOMORE	0x0010		/* No more entries in bucket. */
+#define	H_OK		0x0020		/* Request succeeded. */
+#define H_DIRTY		0x0040		/* Meta-data page needs to be written */
+#define	H_ORIGINAL	0x0080		/* Bucket lock existed on entry. */
+	u_int32_t	flags;
 } HASH_CURSOR;
 
 #define	IS_VALID(C) ((C)->bucket != BUCKET_INVALID)
 
+#define	SAVE_CURSOR(ORIG, COPY) {					\
+	F_SET((ORIG), H_ORIGINAL);					\
+	*(COPY) = *(ORIG);						\
+}
 
-typedef struct htab {		/* Memory resident data structure. */
-	DB *dbp;		/* Pointer to parent db structure. */
-	DB_LOCK hlock;		/* Metadata page lock. */
-	HASHHDR *hdr;		/* Pointer to meta-data page. */
-	u_int32_t (*hash) __P((const void *, u_int32_t)); /* Hash Function */
-	PAGE *split_buf;	/* Temporary buffer for splits. */
-	int local_errno;	/* Error Number -- for DBM compatability */
-	u_long hash_accesses;	/* Number of accesses to this table. */
-	u_long hash_collisions;	/* Number of collisions on search. */
-	u_long hash_expansions;	/* Number of times we added a bucket. */
-	u_long hash_overflows;	/* Number of overflow pages. */
-	u_long hash_bigpages;	/* Number of big key/data pages. */
-} HTAB;
-
-/*
- * Macro used for interface functions to set the txnid in the DBP.
- */
-#define	SET_LOCKER(D, T) ((D)->txn = (T))
+#define	RESTORE_CURSOR(D, ORIG, COPY, RET) {				\
+	if ((RET) == 0) {	/* RET == 0: release the copy's lock. */ \
+		if ((ORIG)->dbc->txn == NULL &&				\
+		    (COPY)->lock != 0 && (ORIG)->lock != (COPY)->lock)	\
+			(void)lock_put((D)->dbenv->lk_info, (COPY)->lock); \
+	} else {	/* RET != 0: release ORIG's lock, restore ORIG. */ \
+		if ((ORIG)->dbc->txn == NULL &&				\
+		    (ORIG)->lock != 0 && (ORIG)->lock != (COPY)->lock)	\
+			(void)lock_put((D)->dbenv->lk_info, (ORIG)->lock); \
+		*(ORIG) = *(COPY);					\
+	}								\
+}
 
 /*
  * More interface macros used to get/release the meta data page.
  */
-#define	GET_META(D, H) {						\
-	int _r;								\
-	if (F_ISSET(D, DB_AM_LOCKING) && !F_ISSET(D, DB_AM_RECOVER)) {	\
-		(D)->lock.pgno = BUCKET_INVALID;			\
-	    	if ((_r = lock_get((D)->dbenv->lk_info,			\
-	    	    (D)->txn == NULL ? (D)->locker : (D)->txn->txnid,	\
-		    0, &(D)->lock_dbt, DB_LOCK_READ,			\
-		    &(H)->hlock)) != 0)					\
-			return (_r < 0 ? EAGAIN : _r);			\
+#define	GET_META(D, I, R) {						\
+	if (F_ISSET(D, DB_AM_LOCKING) &&				\
+	    !F_ISSET((I)->dbc, DBC_RECOVER)) {				\
+		(I)->dbc->lock.pgno = BUCKET_INVALID;			\
+		(R) = lock_get((D)->dbenv->lk_info, (I)->dbc->locker, 	\
+		    0, &(I)->dbc->lock_dbt, DB_LOCK_READ, &(I)->hlock);	\
+		(R) = (R) < 0 ? EAGAIN : (R);				\
 	}								\
-	if ((_r = __ham_get_page(D, 0, (PAGE **)&((H)->hdr))) != 0) {	\
-		if ((H)->hlock) {					\
-			(void)lock_put((D)->dbenv->lk_info, (H)->hlock);\
-			(H)->hlock = 0;					\
-		}							\
-		return (_r);						\
+	if ((R) == 0 &&	/* NOTE(review): R must be 0 on entry if not locking. */ \
+	    ((R) = __ham_get_page(D, 0, (PAGE **)&((I)->hdr))) != 0 &&  \
+	    (I)->hlock != LOCK_INVALID) {				\
+		(void)lock_put((D)->dbenv->lk_info, (I)->hlock);	\
+		(I)->hlock = LOCK_INVALID;				\
 	}								\
 }
 
-#define	RELEASE_META(D, H) {						\
-	if (!F_ISSET(D, DB_AM_RECOVER) &&				\
-	    (D)->txn == NULL && (H)->hlock)				\
-		(void)lock_put((H)->dbp->dbenv->lk_info, (H)->hlock);	\
-	(H)->hlock = 0;							\
-	if ((H)->hdr)							\
-		(void)__ham_put_page(D, (PAGE *)(H)->hdr,		\
-		    F_ISSET(D, DB_HS_DIRTYMETA) ? 1 : 0);		\
-	(H)->hdr = NULL;						\
-	F_CLR(D, DB_HS_DIRTYMETA);					\
+#define	RELEASE_META(D, I) {						\
+	if ((I)->hdr)							\
+		(void)__ham_put_page(D, (PAGE *)(I)->hdr,		\
+		    F_ISSET(I, H_DIRTY) ? 1 : 0);			\
+	(I)->hdr = NULL;						\
+	if (!F_ISSET((I)->dbc, DBC_RECOVER) &&				\
+	    (I)->dbc->txn == NULL && (I)->hlock)			\
+		(void)lock_put((D)->dbenv->lk_info, (I)->hlock);	\
+	(I)->hlock = LOCK_INVALID;					\
+	F_CLR(I, H_DIRTY);						\
 }
 
-#define	DIRTY_META(H, R) {						\
-	if (F_ISSET((H)->dbp, DB_AM_LOCKING) &&				\
-	    !F_ISSET((H)->dbp, DB_AM_RECOVER)) {			\
+#define	DIRTY_META(D, I, R) {						\
+	if (F_ISSET(D, DB_AM_LOCKING) &&				\
+	    !F_ISSET((I)->dbc, DBC_RECOVER)) {				\
 		DB_LOCK _tmp;						\
-		(H)->dbp->lock.pgno = BUCKET_INVALID;			\
-	    	if (((R) = lock_get((H)->dbp->dbenv->lk_info,		\
-	    	    (H)->dbp->txn ? (H)->dbp->txn->txnid :		\
-	    	    (H)->dbp->locker, 0, &(H)->dbp->lock_dbt,		\
+		(I)->dbc->lock.pgno = BUCKET_INVALID;			\
+	    	if (((R) = lock_get((D)->dbenv->lk_info,		\
+	    	    (I)->dbc->locker, 0, &(I)->dbc->lock_dbt,		\
 	    	    DB_LOCK_WRITE, &_tmp)) == 0)			\
-			(R) = lock_put((H)->dbp->dbenv->lk_info,	\
-			    (H)->hlock);				\
+			(R) = lock_put((D)->dbenv->lk_info, (I)->hlock);\
 		else if ((R) < 0)					\
 			(R) = EAGAIN;					\
-		(H)->hlock = _tmp;					\
+		(I)->hlock = _tmp;					\
 	}								\
-	F_SET((H)->dbp, DB_HS_DIRTYMETA);				\
-}
-
-/* Allocate and discard thread structures. */
-#define	H_GETHANDLE(dbp, dbpp, ret)					\
-	if (F_ISSET(dbp, DB_AM_THREAD))					\
-		ret = __db_gethandle(dbp, __ham_hdup, dbpp);		\
-	else {								\
-		ret = 0;						\
-		*dbpp = dbp;						\
-	}
-
-#define	H_PUTHANDLE(dbp) {						\
-	if (F_ISSET(dbp, DB_AM_THREAD))					\
-		__db_puthandle(dbp);					\
+	F_SET((I), H_DIRTY);						\
 }
 
 /* Test string. */
@@ -171,16 +159,16 @@ typedef struct htab {		/* Memory resident data structure. */
  * the table, we can allocate extra pages.  We keep track of how many pages
  * we've allocated at each point to calculate bucket to page number mapping.
  */
-#define	BUCKET_TO_PAGE(H, B) \
-	((B) + 1 + ((B) ? (H)->hdr->spares[__db_log2((B)+1)-1] : 0))
+#define	BUCKET_TO_PAGE(I, B) \
+	((B) + 1 + ((B) ? (I)->hdr->spares[__db_log2((B)+1)-1] : 0))
 
-#define	PGNO_OF(H, S, O) (BUCKET_TO_PAGE((H), (1 << (S)) - 1) + (O))
+#define	PGNO_OF(I, S, O) (BUCKET_TO_PAGE((I), (1 << (S)) - 1) + (O))
 
 /* Constraints about number of pages and how much data goes on a page. */
 
 #define	MAX_PAGES(H)	UINT32_T_MAX
 #define	MINFILL		4
-#define	ISBIG(H, N)	(((N) > ((H)->hdr->pagesize / MINFILL)) ? 1 : 0)
+#define	ISBIG(I, N)	(((N) > ((I)->hdr->pagesize / MINFILL)) ? 1 : 0)
 
 /* Shorthands for accessing structure */
 #define	NDX_INVALID	0xFFFF
diff --git a/db2/include/hash_ext.h b/db2/include/hash_ext.h
index 7086adcc44..fe17dc7b39 100644
--- a/db2/include/hash_ext.h
+++ b/db2/include/hash_ext.h
@@ -3,13 +3,11 @@
 #define _hash_ext_h_
 int __ham_open __P((DB *, DB_INFO *));
 int __ham_close __P((DB *));
-int __ham_c_iclose __P((DB *, DBC *));
-int __ham_expand_table __P((HTAB *));
-u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t));
+int __ham_c_init __P((DBC *));
+u_int32_t __ham_call_hash __P((HASH_CURSOR *, u_int8_t *, int32_t));
 int __ham_init_dbt __P((DBT *, u_int32_t, void **, u_int32_t *));
 void __ham_c_update
    __P((HASH_CURSOR *, db_pgno_t, u_int32_t, int, int));
-int  __ham_hdup __P((DB *, DB *));
 int __ham_insdel_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
     u_int32_t, u_int32_t, db_pgno_t, u_int32_t,
@@ -72,48 +70,45 @@ int __ham_init_recover __P((DB_ENV *));
 int __ham_pgin __P((db_pgno_t, void *, DBT *));
 int __ham_pgout __P((db_pgno_t, void *, DBT *));
 int __ham_mswap __P((void *));
-#ifdef DEBUG
-void __ham_dump_bucket __P((HTAB *, u_int32_t));
-#endif
-int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
-void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t));
+int __ham_add_dup __P((DBC *, DBT *, u_int32_t));
+void __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t));
+void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *));
 u_int32_t __ham_func2 __P((const void *, u_int32_t));
 u_int32_t __ham_func3 __P((const void *, u_int32_t));
 u_int32_t __ham_func4 __P((const void *, u_int32_t));
 u_int32_t __ham_func5 __P((const void *, u_int32_t));
-int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
-int __ham_item_reset __P((HTAB *, HASH_CURSOR *));
+int __ham_item __P((DBC *, db_lockmode_t));
+int __ham_item_reset __P((DBC *));
 void __ham_item_init __P((HASH_CURSOR *));
-int __ham_item_done __P((HTAB *, HASH_CURSOR *, int));
-int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
-int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
-int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
-int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_item_done __P((DBC *, int));
+int __ham_item_last __P((DBC *, db_lockmode_t));
+int __ham_item_first __P((DBC *, db_lockmode_t));
+int __ham_item_prev __P((DBC *, db_lockmode_t));
+int __ham_item_next __P((DBC *, db_lockmode_t));
 void __ham_putitem __P((PAGE *p, const DBT *, int));
 void __ham_reputpair
    __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *));
-int __ham_del_pair __P((HTAB *, HASH_CURSOR *, int));
-int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
+int __ham_del_pair __P((DBC *, int));
+int __ham_replpair __P((DBC *, DBT *, u_int32_t));
 void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
     int32_t,  DBT *));
-int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t));
-int __ham_add_el
-   __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, int));
-void __ham_copy_item __P((HTAB *, PAGE *, u_int32_t, PAGE *));
-int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
-int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
-int __ham_del_page __P((DB *, PAGE *));
+int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
+int __ham_add_el __P((DBC *, const DBT *, const DBT *, int));
+void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *));
+int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **));
+int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **));
+int __ham_del_page __P((DBC *, PAGE *));
 int __ham_put_page __P((DB *, PAGE *, int32_t));
-int __ham_dirty_page __P((HTAB *, PAGE *));
+int __ham_dirty_page __P((DB *, PAGE *));
 int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
-int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
+int __ham_overflow_page
+    __P((DBC *, u_int32_t, PAGE **));
 #ifdef DEBUG
-db_pgno_t __bucket_to_page __P((HTAB *, db_pgno_t));
+db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t));
 #endif
-void __ham_init_ovflpages __P((HTAB *));
-int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
-int __ham_next_cpage
-   __P((HTAB *, HASH_CURSOR *, db_pgno_t, int, u_int32_t));
+void __ham_init_ovflpages __P((DBC *));
+int __ham_get_cpage __P((DBC *, db_lockmode_t));
+int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t));
 void __ham_dpair __P((DB *, PAGE *, u_int32_t));
 int __ham_insdel_recover
     __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
@@ -131,5 +126,5 @@ int __ham_ovfl_recover
     __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __ham_copypage_recover
   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
-int __ham_stat __P((DB *, FILE *));
+int __ham_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
 #endif /* _hash_ext_h_ */
diff --git a/db2/include/lock.h b/db2/include/lock.h
index 47a38b8783..13364ca7a5 100644
--- a/db2/include/lock.h
+++ b/db2/include/lock.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)lock.h	10.15 (Sleepycat) 5/10/98
+ *	@(#)lock.h	10.17 (Sleepycat) 1/3/99
  */
 
 typedef struct __db_lockobj	DB_LOCKOBJ;
@@ -22,6 +22,12 @@ typedef struct __db_lockobj	DB_LOCKOBJ;
  */
 #define DB_LOCK_MAXID		0x7fffffff
 
+/* Region catastrophic shutdown check: the caller returns DB_RUNRECOVERY. */
+#define	LOCK_PANIC_CHECK(lt) {						\
+	if ((lt)->region->hdr.panic)					\
+		return (DB_RUNRECOVERY);				\
+}
+
 /*
  * The lock region consists of:
  *	The DB_LOCKREGION structure (sizeof(DB_LOCKREGION)).
@@ -135,10 +141,24 @@ struct __db_lock {
 	u_int32_t	refcount;	/* Reference count the lock. */
 	db_lockmode_t	mode;		/* What sort of lock. */
 	ssize_t		obj;		/* Relative offset of object struct. */
+	size_t		txnoff;		/* Offset of holding transaction. */
 	db_status_t	status;		/* Status of this lock. */
 };
 
 /*
+ * This is a serious layering violation.  To support nested transactions, we
+ * need to be able to tell that a lock is held by a transaction (as opposed to
+ * some other locker) and to be able to traverse the parent/descendent chain.
+ * In order to do this, each lock held by a transaction maintains a reference
+ * to the shared memory transaction structure so it can be accessed during lock
+ * promotion.  As the structure is in shared memory, we cannot store a pointer
+ * to it, so we use the offset within the region.  As nothing lives at region
+ * offset 0, we use that to indicate that there is no transaction associated
+ * with the current lock.
+ */
+#define TXN_IS_HOLDING(L)	((L)->txnoff != 0 /* INVALID_REG_OFFSET */)
+
+/*
  * We cannot return pointers to the user (else we cannot easily grow regions),
  * so we return offsets in the region.  These must be converted to and from
  * regular pointers.  Always use the macros below.
diff --git a/db2/include/lock_ext.h b/db2/include/lock_ext.h
index 1e0522c6b5..ce7994774a 100644
--- a/db2/include/lock_ext.h
+++ b/db2/include/lock_ext.h
@@ -6,6 +6,9 @@ int __lock_is_locked
 void __lock_printlock __P((DB_LOCKTAB *, struct __db_lock *, int));
 int __lock_getobj  __P((DB_LOCKTAB *,
     u_int32_t, const DBT *, u_int32_t type, DB_LOCKOBJ **));
+int __lock_downgrade __P((DB_LOCKTAB *,
+    DB_LOCK, db_lockmode_t, u_int32_t));
+void __lock_panic __P((DB_ENV *));
 int __lock_validate_region __P((DB_LOCKTAB *));
 int __lock_grow_region __P((DB_LOCKTAB *, int, size_t));
 void __lock_dump_region __P((DB_LOCKTAB *, char *, FILE *));
diff --git a/db2/include/log.h b/db2/include/log.h
index 7d5161cc9d..50309085aa 100644
--- a/db2/include/log.h
+++ b/db2/include/log.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)log.h	10.25 (Sleepycat) 4/10/98
+ *	@(#)log.h	10.30 (Sleepycat) 10/11/98
  */
 
 #ifndef _LOG_H_
@@ -16,8 +16,10 @@ struct __log;		typedef struct __log LOG;
 struct __log_persist;	typedef struct __log_persist LOGP;
 
 #ifndef MAXLFNAME
-#define	MAXLFNAME	99999		/* Maximum log file name. */
-#define	LFNAME		"log.%05d"	/* Log file name template. */
+#define	LFPREFIX	"log."		/* Log file name prefix. */
+#define	LFNAME		"log.%010d"	/* Log file name template. */
+#define	LFNAME_V1	"log.%05d"	/* Log file name template, rev 1. */
+#define	MAXLFNAME	2000000000	/* Maximum log file name. */
 #endif
 					/* Default log name. */
 #define DB_DEFAULT_LOG_FILE	"__db_log.share"
@@ -38,6 +40,12 @@ struct __log_persist;	typedef struct __log_persist LOGP;
 	(void)__db_mutex_unlock(&((RLAYOUT *)(dblp)->lp)->lock,		\
 	    (dblp)->reginfo.fd)
 
+/* Region catastrophic shutdown check: the caller returns DB_RUNRECOVERY. */
+#define	LOG_PANIC_CHECK(dblp) {						\
+	if ((dblp)->lp->rlayout.panic)					\
+		return (DB_RUNRECOVERY);				\
+}
+
 /*
  * The per-process table that maps log file-id's to DB structures.
  */
@@ -84,7 +92,28 @@ struct __db_log {
 
 	char	 *dir;			/* Directory argument. */
 
-	u_int32_t flags;		/* Support the DB_AM_XXX flags. */
+/*
+ * These fields are used by XA; since XA forbids threaded execution, these
+ * do not have to be protected.
+ */
+	void 	*xa_info;		/* Committed transaction list that
+					 * has to be carried between calls
+					 * to xa_recover. */
+	DB_LSN	xa_lsn;			/* Position of an XA recovery scan. */
+	DB_LSN	xa_first;		/* LSN to which we need to roll back
+					   for this XA recovery scan. */
+
+	/*
+	 * !!!
+	 * Currently used to hold:
+	 *	DB_AM_THREAD	(a DB flag)
+	 *	DBC_RECOVER	(a DBC flag)
+	 * If they are ever the same bits, we're in serious trouble.
+	 */
+#if DB_AM_THREAD == DBC_RECOVER
+	DB_AM_THREAD, DBC_RECOVER, FLAG MISMATCH
+#endif
+	u_int32_t flags;
 };
 
 /*
diff --git a/db2/include/log_ext.h b/db2/include/log_ext.h
index bf3bcb02ce..842a3f4265 100644
--- a/db2/include/log_ext.h
+++ b/db2/include/log_ext.h
@@ -1,8 +1,9 @@
 /* DO NOT EDIT: automatically built by dist/distrib. */
 #ifndef _log_ext_h_
 #define _log_ext_h_
+void __log_panic __P((DB_ENV *));
 int __log_find __P((DB_LOG *, int, int *));
-int __log_valid __P((DB_LOG *, LOG *, int));
+int __log_valid __P((DB_LOG *, u_int32_t, int));
 int __log_register_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
     u_int32_t, const DBT *, const DBT *, u_int32_t,
@@ -15,7 +16,7 @@ int __log_init_recover __P((DB_ENV *));
 int __log_findckp __P((DB_LOG *, DB_LSN *));
 int __log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t, int));
 int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
-int __log_name __P((DB_LOG *, int, char **));
+int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t));
 int __log_register_recover
     __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __log_add_logid __P((DB_LOG *, DB *, u_int32_t));
diff --git a/db2/include/mp.h b/db2/include/mp.h
index 8635efa722..904bccfe98 100644
--- a/db2/include/mp.h
+++ b/db2/include/mp.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)mp.h	10.33 (Sleepycat) 5/4/98
+ *	@(#)mp.h	10.37 (Sleepycat) 1/1/99
  */
 
 struct __bh;		typedef struct __bh BH;
@@ -16,11 +16,11 @@ struct __mpoolfile;	typedef struct __mpoolfile MPOOLFILE;
 #define	DB_DEFAULT_MPOOL_FILE	"__db_mpool.share"
 
 /*
- * We default to 128K (16 8K pages) if the user doesn't specify, and
+ * We default to 256K (32 8K pages) if the user doesn't specify, and
  * require a minimum of 20K.
  */
 #ifndef	DB_CACHESIZE_DEF
-#define	DB_CACHESIZE_DEF	(128 * 1024)
+#define	DB_CACHESIZE_DEF	(256 * 1024)
 #endif
 #define	DB_CACHESIZE_MIN	( 20 * 1024)
 
@@ -106,6 +106,12 @@ struct __mpoolfile;	typedef struct __mpoolfile MPOOLFILE;
 	if (F_ISSET(dbmp, MP_LOCKREGION))				\
 		(void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->reginfo.fd)
 
+/* Region catastrophic shutdown check: the caller returns DB_RUNRECOVERY. */
+#define	MP_PANIC_CHECK(dbmp) {						\
+	if ((dbmp)->mp->rlayout.panic)					\
+		return (DB_RUNRECOVERY);				\
+}
+
 /*
  * DB_MPOOL --
  *	Per-process memory pool structure.
@@ -158,6 +164,18 @@ struct __db_mpoolfile {
 
 	int	   fd;			/* Underlying file descriptor. */
 
+	u_int32_t ref;			/* Reference count. */
+
+	/*
+	 * !!!
+	 * This field is a special case -- it's protected by the region lock
+	 * NOT the thread lock.  The reason for this is that we always have
+	 * the region lock immediately before or after we modify the field,
+	 * and we don't want to use the structure lock to protect it because
+	 * then I/O (which is done with the structure lock held because of
+	 * the race between the seek and write of the file descriptor) will
+	 * block any other put/get calls using this DB_MPOOLFILE structure.
+	 */
 	u_int32_t pinref;		/* Pinned block reference count. */
 
 /* These fields are not protected. */
diff --git a/db2/include/mp_ext.h b/db2/include/mp_ext.h
index 3650839475..8b46334408 100644
--- a/db2/include/mp_ext.h
+++ b/db2/include/mp_ext.h
@@ -9,10 +9,12 @@ int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
 void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
 int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *,
    u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
+void __memp_panic __P((DB_ENV *));
 char * __memp_fn __P((DB_MPOOLFILE *));
 char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
 void __memp_dump_region __P((DB_MPOOL *, char *, FILE *));
-int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *));
+int __memp_reg_alloc __P((DB_MPOOL *, size_t, size_t *, void *));
+int __memp_alloc __P((DB_MPOOL *, size_t, size_t *, void *));
 int __memp_ropen
    __P((DB_MPOOL *, const char *, size_t, int, int, u_int32_t));
 int __mp_xxx_fd __P((DB_MPOOLFILE *, int *));
diff --git a/db2/include/os.h b/db2/include/os.h
new file mode 100644
index 0000000000..f173d1f610
--- /dev/null
+++ b/db2/include/os.h
@@ -0,0 +1,24 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 1998
+ *	Sleepycat Software.  All rights reserved.
+ *
+ *	@(#)os.h	10.11 (Sleepycat) 10/12/98
+ */
+
+/*
+ * We group seek/write calls into a single function so that we can use
+ * pread(2)/pwrite(2) where they're available.
+ */
+#define	DB_IO_READ	1
+#define	DB_IO_WRITE	2
+typedef struct __io {
+	int	    fd_io;		/* I/O file descriptor. */
+	int	    fd_lock;		/* Locking file descriptor. */
+	db_mutex_t *mutexp;		/* Mutex to lock. */
+	size_t	    pagesize;		/* Page size. */
+	db_pgno_t   pgno;		/* Page number. */
+	u_int8_t   *buf;		/* Buffer. */
+	size_t	    bytes;		/* Bytes read/written. */
+} DB_IO;
diff --git a/db2/include/os_ext.h b/db2/include/os_ext.h
index 889a45a44e..346210975f 100644
--- a/db2/include/os_ext.h
+++ b/db2/include/os_ext.h
@@ -1,15 +1,17 @@
 /* DO NOT EDIT: automatically built by dist/distrib. */
 #ifndef _os_ext_h_
 #define _os_ext_h_
-int __db_abspath __P((const char *));
-char *__db_strdup __P((const char *));
-void *__db_calloc __P((size_t, size_t));
-void *__db_malloc __P((size_t));
-void *__db_realloc __P((void *, size_t));
+int __os_abspath __P((const char *));
+int __os_strdup __P((const char *, void *));
+int __os_calloc __P((size_t, size_t, void *));
+int __os_malloc __P((size_t, void *(*)(size_t), void *));
+int __os_realloc __P((void *, size_t));
+void __os_free __P((void *, size_t));
+void __os_freestr __P((void *));
 int __os_dirlist __P((const char *, char ***, int *));
 void __os_dirfree __P((char **, int));
-int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *));
-int __db_fsync __P((int));
+int __os_fileid __P((DB_ENV *, const char *, int, u_int8_t *));
+int __os_fsync __P((int));
 int __db_mapanon_ok __P((int));
 int __db_mapinit __P((void));
 int __db_mapregion __P((char *, REGINFO *));
@@ -20,15 +22,19 @@ int __db_unmapfile __P((void *, size_t));
 u_int32_t __db_oflags __P((int));
 int __db_omode __P((const char *));
 int __db_open __P((const char *, u_int32_t, u_int32_t, int, int *));
-int __db_close __P((int));
+int __os_open __P((const char *, int, int, int *));
+int __os_close __P((int));
 char *__db_rpath __P((const char *));
-int __db_read __P((int, void *, size_t, ssize_t *));
-int __db_write __P((int, void *, size_t, ssize_t *));
+int __os_io __P((DB_IO *, int, ssize_t *));
+int __os_read __P((int, void *, size_t, ssize_t *));
+int __os_write __P((int, const void *, size_t, ssize_t *));
 int __os_seek __P((int, size_t, db_pgno_t, u_int32_t, int, int));
 int __os_sleep __P((u_long, u_long));
 int __os_spin __P((void));
+void __os_yield __P((u_long));
 int __os_exists __P((const char *, int *));
 int __os_ioinfo
    __P((const char *, int, u_int32_t *, u_int32_t *, u_int32_t *));
-int __db_unlink __P((const char *));
+int __os_tmpdir __P((DB_ENV *, u_int32_t));
+int __os_unlink __P((const char *));
 #endif /* _os_ext_h_ */
diff --git a/db2/include/os_func.h b/db2/include/os_jump.h
index 12794d550d..e2d577ff10 100644
--- a/db2/include/os_func.h
+++ b/db2/include/os_jump.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)os_func.h	10.8 (Sleepycat) 4/19/98
+ *	@(#)os_jump.h	10.1 (Sleepycat) 10/17/98
  */
 
 /* Calls which can be replaced by the application. */
@@ -38,32 +38,3 @@ struct __db_jumptab {
 };
 
 extern struct __db_jumptab __db_jump;
-
-/*
- * Names used by DB to call through the jump table.
- *
- * The naming scheme goes like this: if the functionality the application can
- * replace is the same as the DB functionality, e.g., malloc, or dirlist, then
- * we use the name __db_XXX, and the application is expected to replace the
- * complete functionality, which may or may not map directly to an ANSI C or
- * POSIX 1003.1 interface.  If the functionality that the aplication replaces
- * only underlies what the DB os directory exports to other parts of DB, e.g.,
- * read, then the name __os_XXX is used, and the application can only replace
- * the underlying functionality.  Under most circumstances, the os directory
- * part of DB is the only code that should use the __os_XXX names, all other
- * parts of DB should be calling __db_XXX functions.
- */
-#define	__os_close	__db_jump.j_close	/* __db_close is a wrapper. */
-#define	__db_dirfree	__db_jump.j_dirfree
-#define	__db_dirlist	__db_jump.j_dirlist
-#define	__db_exists	__db_jump.j_exists
-#define	__db_free	__db_jump.j_free
-#define	__os_fsync	__db_jump.j_fsync	/* __db_fsync is a wrapper. */
-#define	__db_ioinfo	__db_jump.j_ioinfo
-#define	__os_open	__db_jump.j_open	/* __db_open is a wrapper. */
-#define	__os_read	__db_jump.j_read	/* __db_read is a wrapper. */
-#define	__db_seek	__db_jump.j_seek
-#define	__db_sleep	__db_jump.j_sleep
-#define	__os_unlink	__db_jump.j_unlink	/* __db_unlink is a wrapper. */
-#define	__os_write	__db_jump.j_write	/* __db_write is a wrapper. */
-#define	__db_yield	__db_jump.j_yield
diff --git a/db2/include/txn.h b/db2/include/txn.h
index a2512ed152..a6fa4db8de 100644
--- a/db2/include/txn.h
+++ b/db2/include/txn.h
@@ -4,11 +4,13 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)txn.h	10.15 (Sleepycat) 4/21/98
+ *	@(#)txn.h	10.18 (Sleepycat) 1/3/99
  */
 #ifndef	_TXN_H_
 #define	_TXN_H_
 
+#include "xa.h"
+
 /*
  * The name of the transaction shared memory region is DEFAULT_TXN_FILE and
  * the region is always created group RW of the group owning the directory.
@@ -25,6 +27,8 @@
 /*
  * Internal data maintained in shared memory for each transaction.
  */
+typedef char DB_XID[XIDDATASIZE];
+
 typedef struct __txn_detail {
 	u_int32_t txnid;		/* current transaction id
 					   used to link free list also */
@@ -32,12 +36,31 @@ typedef struct __txn_detail {
 	DB_LSN	begin_lsn;		/* lsn of begin record */
 	size_t	last_lock;		/* offset in lock region of last lock
 					   for this transaction. */
+	size_t	parent;			/* Offset of transaction's parent. */
 #define	TXN_UNALLOC	0
 #define	TXN_RUNNING	1
 #define	TXN_ABORTED	2
 #define	TXN_PREPARED	3
+#define	TXN_COMMITTED	4
 	u_int32_t status;		/* status of the transaction */
 	SH_TAILQ_ENTRY	links;		/* free/active list */
+
+#define	TXN_XA_ABORTED		1
+#define	TXN_XA_DEADLOCKED	2
+#define	TXN_XA_ENDED		3
+#define	TXN_XA_PREPARED		4
+#define	TXN_XA_STARTED		5
+#define	TXN_XA_SUSPENDED	6
+	u_int32_t xa_status;		/* XA status */
+
+	/*
+	 * XID (xid_t) structure: because these fields are logged, the
+	 * sizes have to be explicit.
+	 */
+	DB_XID xid;			/* XA global transaction id */
+	u_int32_t bqual;		/* bqual_length from XID */
+	u_int32_t gtrid;		/* gtrid_length from XID */
+	int32_t format;			/* XA format */
 } TXN_DETAIL;
 
 /*
@@ -105,6 +128,12 @@ struct __db_txnregion {
 #define	UNLOCK_TXNREGION(tmgrp)						\
 	(void)__db_mutex_unlock(&(tmgrp)->region->hdr.lock, (tmgrp)->reginfo.fd)
 
+/* Region catastrophic shutdown check: the caller returns DB_RUNRECOVERY. */
+#define	TXN_PANIC_CHECK(tmgrp) {					\
+	if ((tmgrp)->region->hdr.panic)					\
+		return (DB_RUNRECOVERY);				\
+}
+
 /*
  * Log record types.
  */
@@ -114,4 +143,6 @@ struct __db_txnregion {
 
 #include "txn_auto.h"
 #include "txn_ext.h"
+
+#include "xa_ext.h"
 #endif /* !_TXN_H_ */
diff --git a/db2/include/txn_auto.h b/db2/include/txn_auto.h
index fd5a456115..bb3de4eb17 100644
--- a/db2/include/txn_auto.h
+++ b/db2/include/txn_auto.h
@@ -22,4 +22,30 @@ typedef struct _txn_ckp_args {
 	DB_LSN 	last_ckp;
 } __txn_ckp_args;
 
+
+#define	DB_txn_xa_regop	(DB_txn_BEGIN + 3)
+
+typedef struct _txn_xa_regop_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	opcode;
+	DBT	xid;
+	int32_t	formatID;
+	u_int32_t	gtrid;
+	u_int32_t	bqual;
+	DB_LSN 	begin_lsn;
+} __txn_xa_regop_args;
+
+
+#define	DB_txn_child	(DB_txn_BEGIN + 4)
+
+typedef struct _txn_child_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	opcode;
+	u_int32_t	parent;
+} __txn_child_args;
+
 #endif
diff --git a/db2/include/txn_ext.h b/db2/include/txn_ext.h
index 7d694f070d..e0d69c360d 100644
--- a/db2/include/txn_ext.h
+++ b/db2/include/txn_ext.h
@@ -1,6 +1,9 @@
 /* DO NOT EDIT: automatically built by dist/distrib. */
 #ifndef _txn_ext_h_
 #define _txn_ext_h_
+void __txn_panic __P((DB_ENV *));
+int __txn_xa_begin __P((DB_ENV *, DB_TXN *));
+int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t));
 int __txn_regop_log
     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
     u_int32_t));
@@ -13,9 +16,26 @@ int __txn_ckp_log
 int __txn_ckp_print
    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __txn_ckp_read __P((void *, __txn_ckp_args **));
+int __txn_xa_regop_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, const DBT *, int32_t, u_int32_t,
+    u_int32_t, DB_LSN *));
+int __txn_xa_regop_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __txn_xa_regop_read __P((void *, __txn_xa_regop_args **));
+int __txn_child_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t));
+int __txn_child_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __txn_child_read __P((void *, __txn_child_args **));
 int __txn_init_print __P((DB_ENV *));
 int __txn_init_recover __P((DB_ENV *));
 int __txn_regop_recover
-    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __txn_xa_regop_recover
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 int __txn_ckp_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __txn_child_recover
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
 #endif /* _txn_ext_h_ */
diff --git a/db2/include/xa.h b/db2/include/xa.h
new file mode 100644
index 0000000000..ae822f3e75
--- /dev/null
+++ b/db2/include/xa.h
@@ -0,0 +1,179 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ *
+ *	@(#)xa.h	10.1 (Sleepycat) 6/22/98
+ */
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef	XA_H
+#define	XA_H
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#define	XIDDATASIZE	128		/* size in bytes */
+#define	MAXGTRIDSIZE	 64		/* maximum size in bytes of gtrid */
+#define	MAXBQUALSIZE	 64		/* maximum size in bytes of bqual */
+
+struct xid_t {
+	long formatID;			/* format identifier */
+	long gtrid_length;		/* value from 1 through 64 */
+	long bqual_length;		/* value from 1 through 64 */
+	char data[XIDDATASIZE];
+};
+typedef	struct xid_t XID;
+/*
+ * A value of -1 in formatID means that the XID is null.
+ */
+
+/*
+ * Declarations of routines by which RMs call TMs:
+ */
+extern int ax_reg __P((int, XID *, long));
+extern int ax_unreg __P((int, long));
+
+/*
+ * XA Switch Data Structure
+ */
+#define	RMNAMESZ	32		/* length of resource manager name, */
+					/* including the null terminator */
+#define	MAXINFOSIZE	256		/* maximum size in bytes of xa_info */
+					/* strings, including the null
+					terminator */
+struct xa_switch_t {
+	char name[RMNAMESZ];		/* name of resource manager */
+	long flags;			/* resource manager specific options */
+	long version;			/* must be 0 */
+	int (*xa_open_entry)		/* xa_open function pointer */
+	    __P((char *, int, long));
+	int (*xa_close_entry)		/* xa_close function pointer */
+	    __P((char *, int, long));
+	int (*xa_start_entry)		/* xa_start function pointer */
+	    __P((XID *, int, long));
+	int (*xa_end_entry)		/* xa_end function pointer */
+	    __P((XID *, int, long));
+	int (*xa_rollback_entry)	/* xa_rollback function pointer */
+	    __P((XID *, int, long));
+	int (*xa_prepare_entry)		/* xa_prepare function pointer */
+	    __P((XID *, int, long));
+	int (*xa_commit_entry)		/* xa_commit function pointer */
+	    __P((XID *, int, long));
+	int (*xa_recover_entry)		/* xa_recover function pointer */
+	    __P((XID *, long, int, long));
+	int (*xa_forget_entry)		/* xa_forget function pointer */
+	    __P((XID *, int, long));
+	int (*xa_complete_entry)	/* xa_complete function pointer */
+	    __P((int *, int *, int, long));
+};
+
+/*
+ * Flag definitions for the RM switch
+ */
+#define	TMNOFLAGS	0x00000000L	/* no resource manager features
+					selected */
+#define	TMREGISTER	0x00000001L	/* resource manager dynamically
+					registers */
+#define	TMNOMIGRATE	0x00000002L	/* resource manager does not support
+					association migration */
+#define	TMUSEASYNC	0x00000004L	/* resource manager supports
+					asynchronous operations */
+/*
+ * Flag definitions for xa_ and ax_ routines
+ */
+/* use TMNOFLAGS, defined above, when not specifying other flags */
+#define	TMASYNC		0x80000000L	/* perform routine asynchronously */
+#define	TMONEPHASE	0x40000000L	/* caller is using one-phase commit
+					optimisation */
+#define	TMFAIL		0x20000000L	/* dissociates caller and marks
+					transaction branch rollback-only */
+#define	TMNOWAIT	0x10000000L	/* return if blocking condition
+					exists */
+#define	TMRESUME	0x08000000L	/* caller is resuming association with
+					suspended transaction branch */
+#define	TMSUCCESS	0x04000000L	/* dissociate caller from transaction
+					branch */
+#define	TMSUSPEND	0x02000000L	/* caller is suspending, not ending,
+					association */
+#define	TMSTARTRSCAN	0x01000000L	/* start a recovery scan */
+#define	TMENDRSCAN	0x00800000L	/* end a recovery scan */
+#define	TMMULTIPLE	0x00400000L	/* wait for any asynchronous
+					operation */
+#define	TMJOIN		0x00200000L	/* caller is joining existing
+					transaction branch */
+#define	TMMIGRATE	0x00100000L	/* caller intends to perform
+					migration */
+
+/*
+ * ax_() return codes (transaction manager reports to resource manager)
+ */
+#define	TM_JOIN		2		/* caller is joining existing
+					transaction branch */
+#define	TM_RESUME	1		/* caller is resuming association with
+					suspended transaction branch */
+#define	TM_OK		0		/* normal execution */
+#define	TMER_TMERR	-1		/* an error occurred in the transaction
+					manager */
+#define	TMER_INVAL	-2		/* invalid arguments were given */
+#define	TMER_PROTO	-3		/* routine invoked in an improper
+					context */
+
+/*
+ * xa_() return codes (resource manager reports to transaction manager)
+ */
+#define	XA_RBBASE	100		/* The inclusive lower bound of the
+					rollback codes */
+#define	XA_RBROLLBACK	XA_RBBASE	/* The rollback was caused by an
+					unspecified reason */
+#define	XA_RBCOMMFAIL	XA_RBBASE+1	/* The rollback was caused by a
+					communication failure */
+#define	XA_RBDEADLOCK	XA_RBBASE+2	/* A deadlock was detected */
+#define	XA_RBINTEGRITY	XA_RBBASE+3	/* A condition that violates the
+					integrity of the resources was
+					detected */
+#define	XA_RBOTHER	XA_RBBASE+4	/* The resource manager rolled back the
+					transaction branch for a reason not
+					on this list */
+#define	XA_RBPROTO	XA_RBBASE+5	/* A protocol error occurred in the
+					resource manager */
+#define	XA_RBTIMEOUT	XA_RBBASE+6	/* A transaction branch took too long */
+#define	XA_RBTRANSIENT	XA_RBBASE+7	/* May retry the transaction branch */
+#define	XA_RBEND	XA_RBTRANSIENT	/* The inclusive upper bound of the
+					rollback codes */
+#define	XA_NOMIGRATE	9		/* resumption must occur where
+					suspension occurred */
+#define	XA_HEURHAZ	8		/* the transaction branch may have
+					been heuristically completed */
+#define	XA_HEURCOM	7		/* the transaction branch has been
+					heuristically committed */
+#define	XA_HEURRB	6		/* the transaction branch has been
+					heuristically rolled back */
+#define	XA_HEURMIX	5		/* the transaction branch has been
+					heuristically committed and rolled
+					back */
+#define	XA_RETRY	4		/* routine returned with no effect and
+					may be re-issued */
+#define	XA_RDONLY	3		/* the transaction branch was read-only
+					and has been committed */
+#define	XA_OK		0		/* normal execution */
+#define	XAER_ASYNC	-2		/* asynchronous operation already
+					outstanding */
+#define	XAER_RMERR	-3		/* a resource manager error occurred in
+					 the transaction branch */
+#define	XAER_NOTA	-4		/* the XID is not valid */
+#define	XAER_INVAL	-5		/* invalid arguments were given */
+#define	XAER_PROTO	-6		/* routine invoked in an improper
+					context */
+#define	XAER_RMFAIL	-7		/* resource manager unavailable */
+#define	XAER_DUPID	-8		/* the XID already exists */
+#define	XAER_OUTSIDE	-9		/* resource manager doing work outside
+					transaction */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/db2/include/xa_ext.h b/db2/include/xa_ext.h
new file mode 100644
index 0000000000..00369ccaae
--- /dev/null
+++ b/db2/include/xa_ext.h
@@ -0,0 +1,13 @@
+/* DO NOT EDIT: automatically built by dist/distrib. */
+#ifndef _xa_ext_h_
+#define _xa_ext_h_
+int __db_rmid_to_env __P((int rmid, DB_ENV **envp, int open_ok));
+int __db_xid_to_txn __P((DB_ENV *, XID *, size_t *));
+int __db_map_rmid __P((int, DB_ENV *));
+int __db_unmap_rmid __P((int));
+int __db_map_xid __P((DB_ENV *, XID *, size_t));
+void __db_unmap_xid __P((DB_ENV *, XID *, size_t));
+int __db_map_rmid_name __P((int, char *));
+int __db_rmid_to_name __P((int, char **));
+ void __db_unmap_rmid_name __P((int));
+#endif /* _xa_ext_h_ */
diff --git a/db2/lock/lock.c b/db2/lock/lock.c
index 3d20e0d65b..4cf1d9ecca 100644
--- a/db2/lock/lock.c
+++ b/db2/lock/lock.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)lock.c	10.52 (Sleepycat) 5/10/98";
+static const char sccsid[] = "@(#)lock.c	10.61 (Sleepycat) 1/3/99";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -23,16 +23,22 @@ static const char sccsid[] = "@(#)lock.c	10.52 (Sleepycat) 5/10/98";
 #include "db_page.h"
 #include "db_shash.h"
 #include "lock.h"
-#include "common_ext.h"
 #include "db_am.h"
+#include "txn_auto.h"
+#include "txn_ext.h"
+#include "common_ext.h"
 
 static void __lock_checklocker __P((DB_LOCKTAB *, struct __db_lock *, int));
 static void __lock_freeobj __P((DB_LOCKTAB *, DB_LOCKOBJ *));
-static int  __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, u_int32_t,
-    const DBT *, db_lockmode_t, struct __db_lock **));
+static int  __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, DB_TXN *,
+    u_int32_t, const DBT *, db_lockmode_t, struct __db_lock **));
+static int  __lock_is_parent __P((u_int32_t, DB_TXN *));
+static int  __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *));
 static int  __lock_put_internal __P((DB_LOCKTAB *, struct __db_lock *, int));
 static void __lock_remove_waiter
     __P((DB_LOCKTAB *, DB_LOCKOBJ *, struct __db_lock *, db_status_t));
+static int  __lock_vec_internal __P((DB_LOCKTAB *, u_int32_t, DB_TXN *,
+	    u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **elistp));
 
 int
 lock_id(lt, idp)
@@ -41,6 +47,8 @@ lock_id(lt, idp)
 {
 	u_int32_t id;
 
+	LOCK_PANIC_CHECK(lt);
+
 	LOCK_LOCKREGION(lt);
 	if (lt->region->id >= DB_LOCK_MAXID)
 		lt->region->id = 0;
@@ -58,10 +66,37 @@ lock_vec(lt, locker, flags, list, nlist, elistp)
 	int nlist;
 	DB_LOCKREQ *list, **elistp;
 {
+	return (__lock_vec_internal(lt,
+	    locker, NULL, flags, list, nlist, elistp));
+}
+
+int
+lock_tvec(lt, txn, flags, list, nlist, elistp)
+	DB_LOCKTAB *lt;
+	DB_TXN *txn;
+	u_int32_t flags;
+	int nlist;
+	DB_LOCKREQ *list, **elistp;
+{
+	return (__lock_vec_internal(lt,
+	    txn->txnid, txn, flags, list, nlist, elistp));
+}
+
+static int
+__lock_vec_internal(lt, locker, txn, flags, list, nlist, elistp)
+	DB_LOCKTAB *lt;
+	u_int32_t locker;
+	DB_TXN *txn;
+	u_int32_t flags;
+	int nlist;
+	DB_LOCKREQ *list, **elistp;
+{
 	struct __db_lock *lp;
-	DB_LOCKOBJ *sh_obj, *sh_locker;
+	DB_LOCKOBJ *sh_obj, *sh_locker, *sh_parent;
 	int i, ret, run_dd;
 
+	LOCK_PANIC_CHECK(lt);
+
 	/* Validate arguments. */
 	if ((ret =
 	    __db_fchk(lt->dbenv, "lock_vec", flags, DB_LOCK_NOWAIT)) != 0)
@@ -78,13 +113,43 @@ lock_vec(lt, locker, flags, list, nlist, elistp)
 	for (i = 0; i < nlist && ret == 0; i++) {
 		switch (list[i].op) {
 		case DB_LOCK_GET:
-			ret = __lock_get_internal(lt, locker, flags,
+			ret = __lock_get_internal(lt, locker, txn, flags,
 			    list[i].obj, list[i].mode, &lp);
 			if (ret == 0) {
 				list[i].lock = LOCK_TO_OFFSET(lt, lp);
 				lt->region->nrequests++;
 			}
 			break;
+		case DB_LOCK_INHERIT:
+			/* Find the locker. */
+			if ((ret = __lock_getobj(lt, locker,
+			    NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
+				break;
+			if (txn == NULL || txn->parent == NULL) {
+				ret = EINVAL;
+				break;
+			}
+
+			if ((ret = __lock_getobj(lt, txn->parent->txnid,
+			    NULL, DB_LOCK_LOCKER, &sh_parent)) != 0)
+				break;
+
+			/*
+			 * Traverse all the locks held by this locker.  Remove
+			 * the locks from the locker's list and put them on the
+			 * parent's list.
+			 */
+			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
+			    lp != NULL;
+			    lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
+				SH_LIST_REMOVE(lp, locker_links, __db_lock);
+				SH_LIST_INSERT_HEAD(&sh_parent->heldby, lp,
+				    locker_links, __db_lock);
+				lp->holder = txn->parent->txnid;
+			}
+			__lock_freeobj(lt, sh_locker);
+			lt->region->nlockers--;
+			break;
 		case DB_LOCK_PUT:
 			lp = OFFSET_TO_LOCK(lt, list[i].lock);
 			if (lp->holder != locker) {
@@ -93,8 +158,8 @@ lock_vec(lt, locker, flags, list, nlist, elistp)
 			}
 			list[i].mode = lp->mode;
 
-			/* XXX Need to copy the object. ??? */
 			ret = __lock_put_internal(lt, lp, 0);
+			__lock_checklocker(lt, lp, 0);
 			break;
 		case DB_LOCK_PUT_ALL:
 			/* Find the locker. */
@@ -204,18 +269,25 @@ lock_get(lt, locker, flags, obj, lock_mode, lock)
 	struct __db_lock *lockp;
 	int ret;
 
+	LOCK_PANIC_CHECK(lt);
+
 	/* Validate arguments. */
-	if ((ret =
-	    __db_fchk(lt->dbenv, "lock_get", flags, DB_LOCK_NOWAIT)) != 0)
+	if ((ret = __db_fchk(lt->dbenv,
+	    "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0)
 		return (ret);
 
 	LOCK_LOCKREGION(lt);
 
-	ret = __lock_validate_region(lt);
-	if (ret == 0 && (ret = __lock_get_internal(lt,
-	    locker, flags, obj, lock_mode, &lockp)) == 0) {
-		*lock = LOCK_TO_OFFSET(lt, lockp);
-		lt->region->nrequests++;
+	if ((ret = __lock_validate_region(lt)) == 0) {
+		if (LF_ISSET(DB_LOCK_UPGRADE))
+			lockp = OFFSET_TO_LOCK(lt, *lock);
+
+		if ((ret = __lock_get_internal(lt,
+		    locker, NULL, flags, obj, lock_mode, &lockp)) == 0) {
+			if (!LF_ISSET(DB_LOCK_UPGRADE))
+				*lock = LOCK_TO_OFFSET(lt, lockp);
+			lt->region->nrequests++;
+		}
 	}
 
 	UNLOCK_LOCKREGION(lt);
@@ -223,6 +295,42 @@ lock_get(lt, locker, flags, obj, lock_mode, lock)
 }
 
 int
+lock_tget(lt, txn, flags, obj, lock_mode, lock)
+	DB_LOCKTAB *lt;
+	DB_TXN *txn;
+	u_int32_t flags;
+	const DBT *obj;
+	db_lockmode_t lock_mode;
+	DB_LOCK *lock;
+{
+	struct __db_lock *lockp;
+	int ret;
+
+	LOCK_PANIC_CHECK(lt);
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(lt->dbenv,
+	    "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0)
+		return (ret);
+
+	LOCK_LOCKREGION(lt);
+
+	if ((ret = __lock_validate_region(lt)) == 0) {
+		if (LF_ISSET(DB_LOCK_UPGRADE))
+			lockp = OFFSET_TO_LOCK(lt, *lock);
+
+		if ((ret = __lock_get_internal(lt,
+		    txn->txnid, txn, flags, obj, lock_mode, &lockp)) == 0) {
+			if (!LF_ISSET(DB_LOCK_UPGRADE))
+				*lock = LOCK_TO_OFFSET(lt, lockp);
+			lt->region->nrequests++;
+		}
+	}
+
+	UNLOCK_LOCKREGION(lt);
+	return (ret);
+}
+int
 lock_put(lt, lock)
 	DB_LOCKTAB *lt;
 	DB_LOCK lock;
@@ -230,6 +338,8 @@ lock_put(lt, lock)
 	struct __db_lock *lockp;
 	int ret, run_dd;
 
+	LOCK_PANIC_CHECK(lt);
+
 	LOCK_LOCKREGION(lt);
 
 	if ((ret = __lock_validate_region(lt)) != 0)
@@ -261,7 +371,6 @@ __lock_put_internal(lt, lockp, do_all)
 	struct __db_lock *lockp;
 	int do_all;
 {
-	struct __db_lock *lp_w, *lp_h, *next_waiter;
 	DB_LOCKOBJ *sh_obj;
 	int state_changed;
 
@@ -293,39 +402,7 @@ __lock_put_internal(lt, lockp, do_all)
 	else
 		SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock);
 
-	/*
-	 * We need to do lock promotion.  We also need to determine if
-	 * we're going to need to run the deadlock detector again.  If
-	 * we release locks, and there are waiters, but no one gets promoted,
-	 * then we haven't fundamentally changed the lockmgr state, so
-	 * we may still have a deadlock and we have to run again.  However,
-	 * if there were no waiters, or we actually promoted someone, then
-	 * we are OK and we don't have to run it immediately.
-	 */
-	for (lp_w = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock),
-	    state_changed = lp_w == NULL;
-	    lp_w != NULL;
-	    lp_w = next_waiter) {
-		next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);
-		for (lp_h = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
-		    lp_h != NULL;
-		    lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) {
-			if (CONFLICTS(lt, lp_h->mode, lp_w->mode) &&
-			    lp_h->holder != lp_w->holder)
-				break;
-		}
-		if (lp_h != NULL)	/* Found a conflict. */
-			break;
-
-		/* No conflict, promote the waiting lock. */
-		SH_TAILQ_REMOVE(&sh_obj->waiters, lp_w, links, __db_lock);
-		lp_w->status = DB_LSTAT_PENDING;
-		SH_TAILQ_INSERT_TAIL(&sh_obj->holders, lp_w, links);
-
-		/* Wake up waiter. */
-		(void)__db_mutex_unlock(&lp_w->mutex, lt->reginfo.fd);
-		state_changed = 1;
-	}
+	state_changed = __lock_promote(lt, sh_obj);
 
 	/* Check if object should be reclaimed. */
 	if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL) {
@@ -354,9 +431,10 @@ __lock_put_internal(lt, lockp, do_all)
 }
 
 static int
-__lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
+__lock_get_internal(lt, locker, txn, flags, obj, lock_mode, lockp)
 	DB_LOCKTAB *lt;
 	u_int32_t locker, flags;
+	DB_TXN *txn;
 	const DBT *obj;
 	db_lockmode_t lock_mode;
 	struct __db_lock **lockp;
@@ -365,13 +443,13 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 	DB_LOCKOBJ *sh_obj, *sh_locker;
 	DB_LOCKREGION *lrp;
 	size_t newl_off;
-	int ihold, ret;
+	int ihold, no_dd, ret;
+
+	no_dd = ret = 0;
 
-	ret = 0;
 	/*
 	 * Check that lock mode is valid.
 	 */
-
 	lrp = lt->region;
 	if ((u_int32_t)lock_mode >= lrp->nmodes) {
 		__db_err(lt->dbenv,
@@ -423,20 +501,28 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 	 * lock, then we guarantee deadlock.
 	 *
 	 * In case of conflict, we put the new lock on the end of the waiters
-	 * list.
+	 * list, unless we are upgrading, in which case the locker goes on the
+	 * front of the list.
 	 */
 	ihold = 0;
 	for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
 	    lp != NULL;
 	    lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
-		if (locker == lp->holder) {
+		if (locker == lp->holder ||
+		    __lock_is_parent(lp->holder, txn)) {
 			if (lp->mode == lock_mode &&
 			    lp->status == DB_LSTAT_HELD) {
-				/* Lock is held, just inc the ref count. */
+				if (LF_ISSET(DB_LOCK_UPGRADE))
+					goto upgrade;
+
+				/*
+				 * Lock is held, so we can increment the
+				 * reference count and return this lock.
+				 */
 				lp->refcount++;
+				*lockp = lp;
 				SH_TAILQ_INSERT_HEAD(&lrp->free_locks,
 				    newl, links, __db_lock);
-				*lockp = lp;
 				return (0);
 			} else
 				ihold = 1;
@@ -444,6 +530,21 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 			break;
     	}
 
+	/*
+	 * If we are upgrading, then there are two scenarios.  Either
+	 * we had no conflicts, so we can do the upgrade.  Or, there
+	 * is a conflict and we should wait at the HEAD of the waiters
+	 * list.
+	 */
+	if (LF_ISSET(DB_LOCK_UPGRADE)) {
+		if (lp == NULL)
+			goto upgrade;
+
+		/* There was a conflict, wait. */
+		SH_TAILQ_INSERT_HEAD(&sh_obj->waiters, newl, links, __db_lock);
+		goto wait;
+	}
+
 	if (lp == NULL && !ihold)
 		for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
 		    lp != NULL;
@@ -464,31 +565,35 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 	}
 
 	/*
-	 * This is really a blocker for the process, so initialize it
-	 * set.  That way the current process will block when it tries
-	 * to get it and the waking process will release it.
-	 */
-	(void)__db_mutex_init(&newl->mutex,
-	    MUTEX_LOCK_OFFSET(lt->region, &newl->mutex));
-	(void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd);
-
-	/*
-	 * Now, insert the lock onto its locker's list.
+	 * Now, insert the lock onto its locker's list.  If the locker does
+	 * not currently hold any locks, there's no reason to run the deadlock
+	 * detector; save that information.
 	 */
 	if ((ret =
 	    __lock_getobj(lt, locker, NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
 		return (ret);
+	no_dd = SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL;
 
 	lrp = lt->region;
 	SH_LIST_INSERT_HEAD(&sh_locker->heldby, newl, locker_links, __db_lock);
 
 	if (lp != NULL) {
+		/*
+		 * This mutex really blocks the process, so initialize it and
+		 * then lock it.  That way the current process will block when
+		 * it tries to get it and the waking process will release it.
+		 */
+wait:		(void)__db_mutex_init(&newl->mutex,
+		    MUTEX_LOCK_OFFSET(lt->region, &newl->mutex));
+		(void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd);
+
 		newl->status = DB_LSTAT_WAITING;
 		lrp->nconflicts++;
+
 		/*
-		 * We are about to wait; must release the region mutex.
-		 * Then, when we wakeup, we need to reacquire the region
-		 * mutex before continuing.
+		 * We are about to wait; must release the region mutex.  Then,
+		 * when we wakeup, we need to reacquire the region mutex before
+		 * continuing.
 		 */
 		if (lrp->detect == DB_LOCK_NORUN)
 			lt->region->need_dd = 1;
@@ -498,13 +603,19 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 		 * We are about to wait; before waiting, see if the deadlock
 		 * detector should be run.
 		 */
-		if (lrp->detect != DB_LOCK_NORUN)
-			ret = lock_detect(lt, 0, lrp->detect);
+		if (lrp->detect != DB_LOCK_NORUN && !no_dd)
+			(void)lock_detect(lt, 0, lrp->detect);
 
 		(void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd);
 
 		LOCK_LOCKREGION(lt);
 		if (newl->status != DB_LSTAT_PENDING) {
+			/*
+			 * If this lock errored due to a deadlock, then
+			 * we have waiters that require promotion.
+			 */
+			if (newl->status == DB_LSTAT_ABORTED)
+				(void)__lock_promote(lt, sh_obj);
 			/* Return to free list. */
 			__lock_checklocker(lt, newl, 0);
 			SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links,
@@ -522,12 +633,31 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp)
 			}
 			newl->status = DB_LSTAT_FREE;
 			newl = NULL;
+		} else if (LF_ISSET(DB_LOCK_UPGRADE)) {
+			/*
+			 * The lock that was just granted got put on the
+			 * holders list.  Since we're upgrading some other
+			 * lock, we've got to remove it here.
+			 */
+			SH_TAILQ_REMOVE(&sh_obj->holders,
+			    newl, links, __db_lock);
+			goto upgrade;
 		} else
 			newl->status = DB_LSTAT_HELD;
 	}
 
 	*lockp = newl;
 	return (ret);
+
+upgrade:
+	/*
+	 * This was an upgrade, so return the new lock to the free list and
+	 * upgrade the mode.
+	 */
+	(*lockp)->mode = lock_mode;
+	newl->status = DB_LSTAT_FREE;
+	SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock);
+	return (0);
 }
 
 /*
@@ -788,3 +918,117 @@ __lock_freeobj(lt, obj)
 		__db_shalloc_free(lt->mem, SH_DBT_PTR(&obj->lockobj));
 	SH_TAILQ_INSERT_HEAD(&lt->region->free_objs, obj, links, __db_lockobj);
 }
+
+/*
+ * __lock_downgrade --
+ *	Used by the concurrent access product to downgrade write locks
+ * back to iwrite locks.
+ *
+ * PUBLIC: int __lock_downgrade __P((DB_LOCKTAB *,
+ * PUBLIC:     DB_LOCK, db_lockmode_t, u_int32_t));
+ */
+int
+__lock_downgrade(lt, lock, new_mode, flags)
+	DB_LOCKTAB *lt;
+	DB_LOCK lock;
+	db_lockmode_t new_mode;
+	u_int32_t flags;
+{
+	struct __db_lock *lockp;
+	DB_LOCKOBJ *obj;
+	int ret;
+
+	COMPQUIET(flags, 0);
+	LOCK_PANIC_CHECK(lt);
+	LOCK_LOCKREGION(lt);
+
+	if ((ret = __lock_validate_region(lt)) == 0) {
+		lockp = OFFSET_TO_LOCK(lt, lock);
+		lockp->mode = new_mode;
+
+		/* Get the object associated with this lock. */
+		obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);
+		(void)__lock_promote(lt, obj);
+		++lt->region->nreleases;
+	}
+
+	UNLOCK_LOCKREGION(lt);
+
+	return (ret);
+}
+
+/*
+ * __lock_promote --
+ *
+ * Look through the waiters and holders lists and decide which (if any)
+ * locks can be promoted.   Promote any that are eligible.
+ */
+static int
+__lock_promote(lt, obj)
+	DB_LOCKTAB *lt;
+	DB_LOCKOBJ *obj;
+{
+	struct __db_lock *lp_w, *lp_h, *next_waiter;
+	int state_changed, waiter_is_txn;
+
+	/*
+	 * We need to do lock promotion.  We also need to determine if
+	 * we're going to need to run the deadlock detector again.  If
+	 * we release locks, and there are waiters, but no one gets promoted,
+	 * then we haven't fundamentally changed the lockmgr state, so
+	 * we may still have a deadlock and we have to run again.  However,
+	 * if there were no waiters, or we actually promoted someone, then
+	 * we are OK and we don't have to run it immediately.
+	 *
+	 * During promotion, we look for state changes so we can return
+	 * this information to the caller.
+	 */
+	for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock),
+	    state_changed = lp_w == NULL;
+	    lp_w != NULL;
+	    lp_w = next_waiter) {
+		waiter_is_txn = TXN_IS_HOLDING(lp_w);
+		next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);
+		for (lp_h = SH_TAILQ_FIRST(&obj->holders, __db_lock);
+		    lp_h != NULL;
+		    lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) {
+			if (CONFLICTS(lt, lp_h->mode, lp_w->mode) &&
+			    lp_h->holder != lp_w->holder &&
+			    !(waiter_is_txn &&
+			    TXN_IS_HOLDING(lp_h) &&
+			    __txn_is_ancestor(lt->dbenv->tx_info,
+			        lp_h->txnoff, lp_w->txnoff)))
+				break;
+		}
+		if (lp_h != NULL)	/* Found a conflict. */
+			break;
+
+		/* No conflict, promote the waiting lock. */
+		SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock);
+		lp_w->status = DB_LSTAT_PENDING;
+		SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links);
+
+		/* Wake up waiter. */
+		(void)__db_mutex_unlock(&lp_w->mutex, lt->reginfo.fd);
+		state_changed = 1;
+	}
+
+	return (state_changed);
+}
+
+static int
+__lock_is_parent(locker, txn)
+	u_int32_t locker;
+	DB_TXN *txn;
+{
+	DB_TXN *t;
+
+	if (txn == NULL)
+		return (0);
+
+	for (t = txn->parent; t != NULL; t = t->parent)
+		if (t->txnid == locker)
+			return (1);
+
+	return (0);
+}
diff --git a/db2/lock/lock_conflict.c b/db2/lock/lock_conflict.c
index 870aa0dc17..4be858af7a 100644
--- a/db2/lock/lock_conflict.c
+++ b/db2/lock/lock_conflict.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)lock_conflict.c	10.3 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)lock_conflict.c	10.4 (Sleepycat) 11/20/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -29,11 +29,11 @@ const u_int8_t db_rw_conflicts[] = {
 };
 
 const u_int8_t db_riw_conflicts[] = {
-	/*		N   	S   	X  	IS  	IX	SIX */
+	/*		N   	S   	X  	IX  	IS	SIX */
 	/*   N */	0,	0,	0,	0,	0,	0,
-	/*   S */	0,	0,	1,	0,	1,	1,
+	/*   S */	0,	0,	1,	1,	0,	1,
 	/*   X */	1,	1,	1,	1,	1,	1,
-	/*  IS */	0,	0,	1,	0,	0,	0,
 	/*  IX */	0,	1,	1,	0,	0,	0,
+	/*  IS */	0,	0,	1,	0,	0,	0,
 	/* SIX */	0,	1,	1,	0,	0,	0
 };
diff --git a/db2/lock/lock_deadlock.c b/db2/lock/lock_deadlock.c
index 4de492944e..8b2f91bc9e 100644
--- a/db2/lock/lock_deadlock.c
+++ b/db2/lock/lock_deadlock.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)lock_deadlock.c	10.32 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)lock_deadlock.c	10.37 (Sleepycat) 10/4/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -69,6 +69,8 @@ lock_detect(lt, flags, atype)
 	u_int32_t *bitmap, *deadlock, i, killid, nentries, nlockers;
 	int do_pass, ret;
 
+	LOCK_PANIC_CHECK(lt);
+
 	/* Validate arguments. */
 	if ((ret =
 	    __db_fchk(lt->dbenv, "lock_detect", flags, DB_LOCK_CONFLICT)) != 0)
@@ -176,8 +178,8 @@ lock_detect(lt, flags, atype)
 			    "warning: unable to abort locker %lx",
 			    (u_long)idmap[killid].id);
 	}
-	__db_free(bitmap);
-	__db_free(idmap);
+	__os_free(bitmap, 0);
+	__os_free(idmap, 0);
 
 	return (ret);
 }
@@ -198,7 +200,7 @@ __dd_build(dbenv, bmp, nlockers, idmap)
 	u_int8_t *pptr;
 	locker_info *id_array;
 	u_int32_t *bitmap, count, *entryp, i, id, nentries, *tmpmap;
-	int is_first;
+	int is_first, ret;
 
 	lt = dbenv->lk_info;
 
@@ -230,25 +232,20 @@ retry:	count = lt->region->nlockers;
 	 * We can probably save the malloc's between iterations just
 	 * reallocing if necessary because count grew by too much.
 	 */
-	if ((bitmap = (u_int32_t *)__db_calloc((size_t)count,
-	    sizeof(u_int32_t) * nentries)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc((size_t)count,
+	    sizeof(u_int32_t) * nentries, &bitmap)) != 0)
+		return (ret);
 
-	if ((tmpmap =
-	    (u_int32_t *)__db_calloc(sizeof(u_int32_t), nentries)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		__db_free(bitmap);
-		return (ENOMEM);
+	if ((ret = __os_calloc(sizeof(u_int32_t), nentries, &tmpmap)) != 0) {
+		__os_free(bitmap, sizeof(u_int32_t) * nentries);
+		return (ret);
 	}
 
-	if ((id_array = (locker_info *)__db_calloc((size_t)count,
-	    sizeof(locker_info))) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		__db_free(bitmap);
-		__db_free(tmpmap);
-		return (ENOMEM);
+	if ((ret =
+	    __os_calloc((size_t)count, sizeof(locker_info), &id_array)) != 0) {
+		__os_free(bitmap, count * sizeof(u_int32_t) * nentries);
+		__os_free(tmpmap, sizeof(u_int32_t) * nentries);
+		return (ret);
 	}
 
 	/*
@@ -256,9 +253,9 @@ retry:	count = lt->region->nlockers;
 	 */
 	LOCK_LOCKREGION(lt);
 	if (lt->region->nlockers > count) {
-		__db_free(bitmap);
-		__db_free(tmpmap);
-		__db_free(id_array);
+		__os_free(bitmap, count * sizeof(u_int32_t) * nentries);
+		__os_free(tmpmap, sizeof(u_int32_t) * nentries);
+		__os_free(id_array, count * sizeof(locker_info));
 		goto retry;
 	}
 
@@ -383,7 +380,7 @@ retry:	count = lt->region->nlockers;
 	*nlockers = id;
 	*idmap = id_array;
 	*bmp = bitmap;
-	__db_free(tmpmap);
+	__os_free(tmpmap, sizeof(u_int32_t) * nentries);
 	return (0);
 }
 
@@ -434,8 +431,21 @@ __dd_abort(dbenv, info)
 		goto out;
 
 	lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock);
-	if (LOCK_TO_OFFSET(lt, lockp) != info->last_lock ||
-	    lockp == NULL || lockp->status != DB_LSTAT_WAITING)
+
+	/*
+	 * It's possible that this locker was already aborted.
+	 * If that's the case, make sure that we remove its
+	 * locker object from the hash table.
+	 */
+	if (lockp == NULL) {
+		HASHREMOVE_EL(lt->hashtab, __db_lockobj,
+		    links, lockerp, lt->region->table_size, __lock_lhash);
+		SH_TAILQ_INSERT_HEAD(&lt->region->free_objs,
+		    lockerp, links, __db_lockobj);
+		lt->region->nlockers--;
+		goto out;
+	} else if (LOCK_TO_OFFSET(lt, lockp) != info->last_lock ||
+	    lockp->status != DB_LSTAT_WAITING)
 		goto out;
 
 	/* Abort lock, take it off list, and wake up this lock. */
@@ -460,17 +470,17 @@ __dd_debug(dbenv, idmap, bitmap, nlockers)
 	u_int32_t *bitmap, nlockers;
 {
 	u_int32_t i, j, *mymap, nentries;
+	int ret;
 	char *msgbuf;
 
 	__db_err(dbenv, "Waitsfor array");
 	__db_err(dbenv, "waiter\twaiting on");
-	/*
-	 * Allocate space to print 10 bytes per item waited on.
-	 */
-	if ((msgbuf = (char *)__db_malloc((nlockers + 1) * 10 + 64)) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
+
+	/* Allocate space to print 10 bytes per item waited on. */
+#undef	MSGBUF_LEN
+#define	MSGBUF_LEN ((nlockers + 1) * 10 + 64)
+	if ((ret = __os_malloc(MSGBUF_LEN, NULL, &msgbuf)) != 0)
 		return;
-	}
 
 	nentries = ALIGN(nlockers, 32) / 32;
 	for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nentries) {
@@ -487,6 +497,6 @@ __dd_debug(dbenv, idmap, bitmap, nlockers)
 		__db_err(dbenv, msgbuf);
 	}
 
-	__db_free(msgbuf);
+	__os_free(msgbuf, MSGBUF_LEN);
 }
 #endif
diff --git a/db2/lock/lock_region.c b/db2/lock/lock_region.c
index b597560744..613a6cefb2 100644
--- a/db2/lock/lock_region.c
+++ b/db2/lock/lock_region.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)lock_region.c	10.15 (Sleepycat) 6/2/98";
+static const char sccsid[] = "@(#)lock_region.c	10.21 (Sleepycat) 10/19/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -29,7 +29,8 @@ static u_int32_t __lock_count_locks __P((DB_LOCKREGION *));
 static u_int32_t __lock_count_objs __P((DB_LOCKREGION *));
 static void	 __lock_dump_locker __P((DB_LOCKTAB *, DB_LOCKOBJ *, FILE *));
 static void	 __lock_dump_object __P((DB_LOCKTAB *, DB_LOCKOBJ *, FILE *));
-static const char *__lock_dump_status __P((db_status_t));
+static const char *
+		 __lock_dump_status __P((db_status_t));
 static void	 __lock_reset_region __P((DB_LOCKTAB *));
 static int	 __lock_tabinit __P((DB_ENV *, DB_LOCKREGION *));
 
@@ -55,10 +56,8 @@ lock_open(path, flags, mode, dbenv, ltp)
 		return (ret);
 
 	/* Create the lock table structure. */
-	if ((lt = (DB_LOCKTAB *)__db_calloc(1, sizeof(DB_LOCKTAB))) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(DB_LOCKTAB), &lt)) != 0)
+		return (ret);
 	lt->dbenv = dbenv;
 
 	/* Grab the values that we need to compute the region size. */
@@ -82,7 +81,7 @@ lock_open(path, flags, mode, dbenv, ltp)
 	if (path == NULL)
 		lt->reginfo.path = NULL;
 	else
-		if ((lt->reginfo.path = (char *)__db_strdup(path)) == NULL)
+		if ((ret = __os_strdup(path, &lt->reginfo.path)) != 0)
 			goto err;
 	lt->reginfo.file = DB_DEFAULT_LOCK_FILE;
 	lt->reginfo.mode = mode;
@@ -147,12 +146,27 @@ err:	if (lt->reginfo.addr != NULL) {
 	}
 
 	if (lt->reginfo.path != NULL)
-		FREES(lt->reginfo.path);
-	FREE(lt, sizeof(*lt));
+		__os_freestr(lt->reginfo.path);
+	__os_free(lt, sizeof(*lt));
 	return (ret);
 }
 
 /*
+ * __lock_panic --
+ *	Set the panic flag in the lock region header; no-op if lk_info is NULL.
+ *
+ * PUBLIC: void __lock_panic __P((DB_ENV *));
+ */
+void
+__lock_panic(dbenv)
+	DB_ENV *dbenv;
+{
+	if (dbenv->lk_info != NULL)
+		dbenv->lk_info->region->hdr.panic = 1;
+}
+
+
+/*
  * __lock_tabinit --
  *	Initialize the lock region.
  */
@@ -254,12 +268,14 @@ lock_close(lt)
 {
 	int ret;
 
+	LOCK_PANIC_CHECK(lt);
+
 	if ((ret = __db_rdetach(&lt->reginfo)) != 0)
 		return (ret);
 
 	if (lt->reginfo.path != NULL)
-		FREES(lt->reginfo.path);
-	FREE(lt, sizeof(*lt));
+		__os_freestr(lt->reginfo.path);
+	__os_free(lt, sizeof(*lt));
 
 	return (0);
 }
@@ -276,12 +292,12 @@ lock_unlink(path, force, dbenv)
 	memset(&reginfo, 0, sizeof(reginfo));
 	reginfo.dbenv = dbenv;
 	reginfo.appname = DB_APP_NONE;
-	if (path != NULL && (reginfo.path = (char *)__db_strdup(path)) == NULL)
-		return (ENOMEM);
+	if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0)
+		return (ret);
 	reginfo.file = DB_DEFAULT_LOCK_FILE;
 	ret = __db_runlink(&reginfo, force);
 	if (reginfo.path != NULL)
-		FREES(reginfo.path);
+		__os_freestr(reginfo.path);
 	return (ret);
 }
 
@@ -463,13 +479,14 @@ lock_stat(lt, gspp, db_malloc)
 	void *(*db_malloc) __P((size_t));
 {
 	DB_LOCKREGION *rp;
+	int ret;
 
 	*gspp = NULL;
 
-	if ((*gspp = db_malloc == NULL ?
-	    (DB_LOCK_STAT *)__db_malloc(sizeof(**gspp)) :
-	    (DB_LOCK_STAT *)db_malloc(sizeof(**gspp))) == NULL)
-		return (ENOMEM);
+	LOCK_PANIC_CHECK(lt);
+
+	if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0)
+		return (ret);
 
 	/* Copy out the global statistics. */
 	LOCK_LOCKREGION(lt);
@@ -632,15 +649,15 @@ __lock_dump_region(lt, area, fp)
 		for (lp = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock);
 		    lp != NULL;
 		    lp = SH_TAILQ_NEXT(lp, links, __db_lock))
-			fprintf(fp, "0x%x: %lu\t%lu\t%s\t0x%x\n", (u_int)lp,
+			fprintf(fp, "0x%lx: %lu\t%lu\t%s\t0x%lx\n", (u_long)lp,
 			    (u_long)lp->holder, (u_long)lp->mode,
-			    __lock_dump_status(lp->status), (u_int)lp->obj);
+			    __lock_dump_status(lp->status), (u_long)lp->obj);
 
 		fprintf(fp, "%s\nObject free list\n", DB_LINE);
 		for (op = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj);
 		    op != NULL;
 		    op = SH_TAILQ_NEXT(op, links, __db_lockobj))
-			fprintf(fp, "0x%x\n", (u_int)op);
+			fprintf(fp, "0x%lx\n", (u_long)op);
 	}
 
 	if (LF_ISSET(LOCK_DUMP_MEM))
diff --git a/db2/lock/lock_util.c b/db2/lock/lock_util.c
index 7274a50422..29da75b8a8 100644
--- a/db2/lock/lock_util.c
+++ b/db2/lock/lock_util.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)lock_util.c	10.9 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)lock_util.c	10.10 (Sleepycat) 9/20/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -75,7 +75,7 @@ __lock_locker_cmp(locker, lock_obj)
  * fast path the case where we think we are doing a hash on a DB page/fileid
  * pair.  If the size is right, then we do the fast hash.
  *
- * We know that DB uses struct __db_ilocks for its lock objects.  The first
+ * We know that DB uses DB_LOCK_ILOCK types for its lock objects.  The first
  * four bytes are the 4-byte page number and the next DB_FILE_ID_LEN bytes
  * are a unique file id, where the first 4 bytes on UNIX systems are the file
  * inode number, and the first 4 bytes on Windows systems are the FileIndexLow
@@ -107,7 +107,7 @@ u_int32_t
 __lock_ohash(dbt)
 	const DBT *dbt;
 {
-	if (dbt->size == sizeof(struct __db_ilock))
+	if (dbt->size == sizeof(DB_LOCK_ILOCK))
 		FAST_HASH(dbt->data);
 
 	return (__ham_func5(dbt->data, dbt->size));
@@ -131,7 +131,7 @@ __lock_lhash(lock_obj)
 		return (tmp);
 	}
 
-	if (lock_obj->lockobj.size == sizeof(struct __db_ilock))
+	if (lock_obj->lockobj.size == sizeof(DB_LOCK_ILOCK))
 		FAST_HASH(obj_data);
 
 	return (__ham_func5(obj_data, lock_obj->lockobj.size));
diff --git a/db2/log/log.c b/db2/log/log.c
index d642c9f9ef..ad15f16aef 100644
--- a/db2/log/log.c
+++ b/db2/log/log.c
@@ -7,13 +7,14 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log.c	10.54 (Sleepycat) 5/31/98";
+static const char sccsid[] = "@(#)log.c	10.63 (Sleepycat) 10/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <shqueue.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -23,6 +24,7 @@ static const char sccsid[] = "@(#)log.c	10.54 (Sleepycat) 5/31/98";
 #include "shqueue.h"
 #include "log.h"
 #include "db_dispatch.h"
+#include "txn.h"
 #include "txn_auto.h"
 #include "common_ext.h"
 
@@ -54,13 +56,11 @@ log_open(path, flags, mode, dbenv, lpp)
 		return (ret);
 
 	/* Create and initialize the DB_LOG structure. */
-	if ((dblp = (DB_LOG *)__db_calloc(1, sizeof(DB_LOG))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_calloc(1, sizeof(DB_LOG), &dblp)) != 0)
+		return (ret);
 
-	if (path != NULL && (dblp->dir = __db_strdup(path)) == NULL) {
-		ret = ENOMEM;
+	if (path != NULL && (ret = __os_strdup(path, &dblp->dir)) != 0)
 		goto err;
-	}
 
 	dblp->dbenv = dbenv;
 	dblp->lfd = -1;
@@ -80,7 +80,7 @@ log_open(path, flags, mode, dbenv, lpp)
 	if (path == NULL)
 		dblp->reginfo.path = NULL;
 	else
-		if ((dblp->reginfo.path = __db_strdup(path)) == NULL)
+		if ((ret = __os_strdup(path, &dblp->reginfo.path)) != 0)
 			goto err;
 	dblp->reginfo.file = DB_DEFAULT_LOG_FILE;
 	dblp->reginfo.mode = mode;
@@ -122,7 +122,7 @@ log_open(path, flags, mode, dbenv, lpp)
 		if ((ret = __db_shalloc(dblp->addr,
 		    sizeof(db_mutex_t), MUTEX_ALIGNMENT, &dblp->mutexp)) != 0)
 			goto err;
-		(void)__db_mutex_init(dblp->mutexp, -1);
+		(void)__db_mutex_init(dblp->mutexp, 0);
 	}
 
 	/*
@@ -148,14 +148,28 @@ err:	if (dblp->reginfo.addr != NULL) {
 	}
 
 	if (dblp->reginfo.path != NULL)
-		FREES(dblp->reginfo.path);
+		__os_freestr(dblp->reginfo.path);
 	if (dblp->dir != NULL)
-		FREES(dblp->dir);
-	FREE(dblp, sizeof(*dblp));
+		__os_freestr(dblp->dir);
+	__os_free(dblp, sizeof(*dblp));
 	return (ret);
 }
 
 /*
+ * __log_panic --
+ *	Set the panic flag in the log region layout; no-op if lg_info is NULL.
+ *
+ * PUBLIC: void __log_panic __P((DB_ENV *));
+ */
+void
+__log_panic(dbenv)
+	DB_ENV *dbenv;
+{
+	if (dbenv->lg_info != NULL)
+		dbenv->lg_info->lp->rlayout.panic = 1;
+}
+
+/*
  * __log_recover --
  *	Recover a log.
  */
@@ -212,12 +226,12 @@ __log_recover(dblp)
 	}
 
 	/*
-	 * We know where the end of the log is.  Since that record is on disk,
-	 * it's also the last-synced LSN.
+	 * We now know where the end of the log is.  Set the first LSN that
+	 * we want to return to an application and the LSN of the last known
+	 * record on disk.
 	 */
-	lp->lsn = lsn;
+	lp->lsn = lp->s_lsn = lsn;
 	lp->lsn.offset += dblp->c_len;
-	lp->s_lsn = lp->lsn;
 
 	/* Set up the current buffer information, too. */
 	lp->len = dblp->c_len;
@@ -250,13 +264,23 @@ __log_recover(dblp)
 			}
 		}
 	}
+	/*
+	 * Reset the cursor lsn to the beginning of the log, so that an
+	 * initial call to DB_NEXT does the right thing.
+	 */
+	ZERO_LSN(dblp->c_lsn);
 
 	/* If we never find a checkpoint, that's okay, just 0 it out. */
 	if (!found_checkpoint)
 		ZERO_LSN(lp->chkpt_lsn);
 
+	/*
+	 * !!!
+	 * The test suite explicitly looks for this string -- don't change
+	 * it here unless you also change it there.
+	 */
 	__db_err(dblp->dbenv,
-	    "Recovering the log: last valid LSN: file: %lu offset %lu",
+	    "Finding last valid log LSN: file: %lu offset %lu",
 	    (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
 
 	return (0);
@@ -275,14 +299,15 @@ __log_find(dblp, find_first, valp)
 	DB_LOG *dblp;
 	int find_first, *valp;
 {
-	int cnt, fcnt, logval, ret;
+	u_int32_t clv, logval;
+	int cnt, fcnt, ret;
 	const char *dir;
 	char **names, *p, *q;
 
 	*valp = 0;
 
 	/* Find the directory name. */
-	if ((ret = __log_name(dblp, 1, &p)) != 0)
+	if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0)
 		return (ret);
 	if ((q = __db_rpath(p)) == NULL)
 		dir = PATH_DOT;
@@ -292,8 +317,8 @@ __log_find(dblp, find_first, valp)
 	}
 
 	/* Get the list of file names. */
-	ret = __db_dirlist(dir, &names, &fcnt);
-	FREES(p);
+	ret = __os_dirlist(dir, &names, &fcnt);
+	__os_freestr(p);
 	if (ret != 0) {
 		__db_err(dblp->dbenv, "%s: %s", dir, strerror(ret));
 		return (ret);
@@ -302,29 +327,31 @@ __log_find(dblp, find_first, valp)
 	/*
 	 * Search for a valid log file name, return a value of 0 on
 	 * failure.
+	 *
+	 * XXX
+	 * Assumes that atoi(3) returns a 32-bit number.
 	 */
-	for (cnt = fcnt, logval = 0; --cnt >= 0;)
-		if (strncmp(names[cnt], "log.", sizeof("log.") - 1) == 0) {
-			logval = atoi(names[cnt] + 4);
-			if (logval != 0 &&
-			    __log_valid(dblp, dblp->lp, logval) == 0)
-				break;
-		}
+	for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
+		if (strncmp(names[cnt], LFPREFIX, sizeof(LFPREFIX) - 1) != 0)
+			continue;
+
+		clv = atoi(names[cnt] + (sizeof(LFPREFIX) - 1));
+		if (find_first) {
+			if (logval != 0 && clv > logval)
+				continue;
+		} else
+			if (logval != 0 && clv < logval)
+				continue;
+
+		if (__log_valid(dblp, clv, 1) == 0)
+			logval = clv;
+	}
 
-	/* Discard the list. */
-	__db_dirfree(names, fcnt);
-
-	/* We have a valid log file, find either the first or last one. */
-	if (find_first) {
-		for (; logval > 0; --logval)
-			if (__log_valid(dblp, dblp->lp, logval - 1) != 0)
-				break;
-	} else
-		for (; logval < MAXLFNAME; ++logval)
-			if (__log_valid(dblp, dblp->lp, logval + 1) != 0)
-				break;
 	*valp = logval;
 
+	/* Discard the list. */
+	__os_dirfree(names, fcnt);
+
 	return (0);
 }
 
@@ -332,62 +359,68 @@ __log_find(dblp, find_first, valp)
  * log_valid --
  *	Validate a log file.
  *
- * PUBLIC: int __log_valid __P((DB_LOG *, LOG *, int));
+ * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int));
  */
 int
-__log_valid(dblp, lp, cnt)
+__log_valid(dblp, number, set_persist)
 	DB_LOG *dblp;
-	LOG *lp;
-	int cnt;
+	u_int32_t number;
+	int set_persist;
 {
 	LOGP persist;
 	ssize_t nw;
+	char *fname;
 	int fd, ret;
-	char *p;
 
-	if ((ret = __log_name(dblp, cnt, &p)) != 0)
+	/* Try to open the log file. */
+	if ((ret = __log_name(dblp,
+	    number, &fname, &fd, DB_RDONLY | DB_SEQUENTIAL)) != 0) {
+		__os_freestr(fname);
 		return (ret);
+	}
 
-	fd = -1;
-	if ((ret = __db_open(p,
-	    DB_RDONLY | DB_SEQUENTIAL,
-	    DB_RDONLY | DB_SEQUENTIAL, 0, &fd)) != 0 ||
-	    (ret = __db_seek(fd, 0, 0, sizeof(HDR), 0, SEEK_SET)) != 0 ||
-	    (ret = __db_read(fd, &persist, sizeof(LOGP), &nw)) != 0 ||
+	/* Try to read the header. */
+	if ((ret = __os_seek(fd, 0, 0, sizeof(HDR), 0, SEEK_SET)) != 0 ||
+	    (ret = __os_read(fd, &persist, sizeof(LOGP), &nw)) != 0 ||
 	    nw != sizeof(LOGP)) {
 		if (ret == 0)
 			ret = EIO;
-		if (fd != -1) {
-			(void)__db_close(fd);
-			__db_err(dblp->dbenv,
-			    "Ignoring log file: %s: %s", p, strerror(ret));
-		}
+
+		(void)__os_close(fd);
+
+		__db_err(dblp->dbenv,
+		    "Ignoring log file: %s: %s", fname, strerror(ret));
 		goto err;
 	}
-	(void)__db_close(fd);
+	(void)__os_close(fd);
 
+	/* Validate the header. */
 	if (persist.magic != DB_LOGMAGIC) {
 		__db_err(dblp->dbenv,
 		    "Ignoring log file: %s: magic number %lx, not %lx",
-		    p, (u_long)persist.magic, (u_long)DB_LOGMAGIC);
+		    fname, (u_long)persist.magic, (u_long)DB_LOGMAGIC);
 		ret = EINVAL;
 		goto err;
 	}
 	if (persist.version < DB_LOGOLDVER || persist.version > DB_LOGVERSION) {
 		__db_err(dblp->dbenv,
 		    "Ignoring log file: %s: unsupported log version %lu",
-		    p, (u_long)persist.version);
+		    fname, (u_long)persist.version);
 		ret = EINVAL;
 		goto err;
 	}
 
-	if (lp != NULL) {
-		lp->persist.lg_max = persist.lg_max;
-		lp->persist.mode = persist.mode;
+	/*
+	 * If we're going to use this log file, set the region's persistent
+	 * information based on the headers.
+	 */
+	if (set_persist) {
+		dblp->lp->persist.lg_max = persist.lg_max;
+		dblp->lp->persist.mode = persist.mode;
 	}
 	ret = 0;
 
-err:	FREES(p);
+err:	__os_freestr(fname);
 	return (ret);
 }
 
@@ -401,6 +434,11 @@ log_close(dblp)
 {
 	int ret, t_ret;
 
+	LOG_PANIC_CHECK(dblp);
+
+	/* We may have opened files as part of XA; if so, close them. */
+	__log_close_files(dblp);
+
 	/* Discard the per-thread pointer. */
 	if (dblp->mutexp != NULL) {
 		LOCK_LOGREGION(dblp);
@@ -412,21 +450,22 @@ log_close(dblp)
 	ret = __db_rdetach(&dblp->reginfo);
 
 	/* Close open files, release allocated memory. */
-	if (dblp->lfd != -1 && (t_ret = __db_close(dblp->lfd)) != 0 && ret == 0)
+	if (dblp->lfd != -1 && (t_ret = __os_close(dblp->lfd)) != 0 && ret == 0)
 		ret = t_ret;
 	if (dblp->c_dbt.data != NULL)
-		FREE(dblp->c_dbt.data, dblp->c_dbt.ulen);
+		__os_free(dblp->c_dbt.data, dblp->c_dbt.ulen);
 	if (dblp->c_fd != -1 &&
-	    (t_ret = __db_close(dblp->c_fd)) != 0 && ret == 0)
+	    (t_ret = __os_close(dblp->c_fd)) != 0 && ret == 0)
 		ret = t_ret;
 	if (dblp->dbentry != NULL)
-		FREE(dblp->dbentry, (dblp->dbentry_cnt * sizeof(DB_ENTRY)));
+		__os_free(dblp->dbentry,
+		    (dblp->dbentry_cnt * sizeof(DB_ENTRY)));
 	if (dblp->dir != NULL)
-		FREES(dblp->dir);
+		__os_freestr(dblp->dir);
 
 	if (dblp->reginfo.path != NULL)
-		FREES(dblp->reginfo.path);
-	FREE(dblp, sizeof(*dblp));
+		__os_freestr(dblp->reginfo.path);
+	__os_free(dblp, sizeof(*dblp));
 
 	return (ret);
 }
@@ -447,12 +486,12 @@ log_unlink(path, force, dbenv)
 	memset(&reginfo, 0, sizeof(reginfo));
 	reginfo.dbenv = dbenv;
 	reginfo.appname = DB_APP_LOG;
-	if (path != NULL && (reginfo.path = __db_strdup(path)) == NULL)
-		return (ENOMEM);
+	if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0)
+		return (ret);
 	reginfo.file = DB_DEFAULT_LOG_FILE;
 	ret = __db_runlink(&reginfo, force);
 	if (reginfo.path != NULL)
-		FREES(reginfo.path);
+		__os_freestr(reginfo.path);
 	return (ret);
 }
 
@@ -467,14 +506,15 @@ log_stat(dblp, gspp, db_malloc)
 	void *(*db_malloc) __P((size_t));
 {
 	LOG *lp;
+	int ret;
 
 	*gspp = NULL;
 	lp = dblp->lp;
 
-	if ((*gspp = db_malloc == NULL ?
-	    (DB_LOG_STAT *)__db_malloc(sizeof(**gspp)) :
-	    (DB_LOG_STAT *)db_malloc(sizeof(**gspp))) == NULL)
-		return (ENOMEM);
+	LOG_PANIC_CHECK(dblp);
+
+	if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0)
+		return (ret);
 
 	/* Copy out the global statistics. */
 	LOCK_LOGREGION(dblp);
diff --git a/db2/log/log_archive.c b/db2/log/log_archive.c
index 7db0cc3e36..9f3b24d8e3 100644
--- a/db2/log/log_archive.c
+++ b/db2/log/log_archive.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_archive.c	10.37 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)log_archive.c	10.44 (Sleepycat) 10/9/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -49,8 +49,11 @@ log_archive(dblp, listp, flags, db_malloc)
 	int array_size, n, ret;
 	char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN];
 
+	name = NULL;
 	COMPQUIET(fnum, 0);
 
+	LOG_PANIC_CHECK(dblp);
+
 #define	OKFLAGS	(DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG)
 	if (flags != 0) {
 		if ((ret =
@@ -84,7 +87,7 @@ log_archive(dblp, listp, flags, db_malloc)
 		if ((ret = log_get(dblp, &stable_lsn, &rec, DB_LAST)) != 0)
 			return (ret);
 		if (F_ISSET(dblp, DB_AM_THREAD))
-			__db_free(rec.data);
+			__os_free(rec.data, rec.size);
 		fnum = stable_lsn.file;
 		break;
 	case 0:
@@ -106,40 +109,40 @@ log_archive(dblp, listp, flags, db_malloc)
 
 #define	LIST_INCREMENT	64
 	/* Get some initial space. */
-	if ((array =
-	    (char **)__db_malloc(sizeof(char *) * (array_size = 10))) == NULL)
-		return (ENOMEM);
+	array_size = 10;
+	if ((ret = __os_malloc(sizeof(char *) * array_size, NULL, &array)) != 0)
+		return (ret);
 	array[0] = NULL;
 
 	/* Build an array of the file names. */
 	for (n = 0; fnum > 0; --fnum) {
-		if ((ret = __log_name(dblp, fnum, &name)) != 0)
+		if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0)
 			goto err;
-		if (__db_exists(name, NULL) != 0)
+		if (__os_exists(name, NULL) != 0) {
+			__os_freestr(name);
+			name = NULL;
 			break;
+		}
 
 		if (n >= array_size - 1) {
 			array_size += LIST_INCREMENT;
-			if ((array = (char **)__db_realloc(array,
-			    sizeof(char *) * array_size)) == NULL) {
-				ret = ENOMEM;
+			if ((ret = __os_realloc(&array,
+			    sizeof(char *) * array_size)) != 0)
 				goto err;
-			}
 		}
 
 		if (LF_ISSET(DB_ARCH_ABS)) {
 			if ((ret = __absname(pref, name, &array[n])) != 0)
 				goto err;
-			FREES(name);
+			__os_freestr(name);
 		} else if ((p = __db_rpath(name)) != NULL) {
-			if ((array[n] = (char *)__db_strdup(p + 1)) == NULL) {
-				ret = ENOMEM;
+			if ((ret = __os_strdup(p + 1, &array[n])) != 0)
 				goto err;
-			}
-			FREES(name);
+			__os_freestr(name);
 		} else
 			array[n] = name;
 
+		name = NULL;
 		array[++n] = NULL;
 	}
 
@@ -162,9 +165,11 @@ log_archive(dblp, listp, flags, db_malloc)
 
 err:	if (array != NULL) {
 		for (arrayp = array; *arrayp != NULL; ++arrayp)
-			FREES(*arrayp);
-		__db_free(array);
+			__os_freestr(*arrayp);
+		__os_free(array, sizeof(char *) * array_size);
 	}
+	if (name != NULL)
+		__os_freestr(name);
 	return (ret);
 }
 
@@ -186,9 +191,9 @@ __build_data(dblp, pref, listp, db_malloc)
 	char **array, **arrayp, *p, *real_name;
 
 	/* Get some initial space. */
-	if ((array =
-	    (char **)__db_malloc(sizeof(char *) * (array_size = 10))) == NULL)
-		return (ENOMEM);
+	array_size = 10;
+	if ((ret = __os_malloc(sizeof(char *) * array_size, NULL, &array)) != 0)
+		return (ret);
 	array[0] = NULL;
 
 	memset(&rec, 0, sizeof(rec));
@@ -205,7 +210,7 @@ __build_data(dblp, pref, listp, db_malloc)
 		memcpy(&rectype, rec.data, sizeof(rectype));
 		if (rectype != DB_log_register) {
 			if (F_ISSET(dblp, DB_AM_THREAD)) {
-				__db_free(rec.data);
+				__os_free(rec.data, rec.size);
 				rec.data = NULL;
 			}
 			continue;
@@ -219,25 +224,22 @@ __build_data(dblp, pref, listp, db_malloc)
 
 		if (n >= array_size - 1) {
 			array_size += LIST_INCREMENT;
-			if ((array = (char **)__db_realloc(array,
-			    sizeof(char *) * array_size)) == NULL) {
-				ret = ENOMEM;
+			if ((ret = __os_realloc(&array,
+			    sizeof(char *) * array_size)) != 0)
 				goto lg_free;
-			}
 		}
 
-		if ((array[n] = (char *)__db_strdup(argp->name.data)) == NULL) {
-			ret = ENOMEM;
+		if ((ret = __os_strdup(argp->name.data, &array[n])) != 0) {
 lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
-				__db_free(rec.data);
+				__os_free(rec.data, rec.size);
 			goto err1;
 		}
 
 		array[++n] = NULL;
-		__db_free(argp);
+		__os_free(argp, 0);
 
 		if (F_ISSET(dblp, DB_AM_THREAD)) {
-			__db_free(rec.data);
+			__os_free(rec.data, rec.size);
 			rec.data = NULL;
 		}
 	}
@@ -268,7 +270,7 @@ lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
 		}
 		for (++nxt; nxt < n &&
 		    strcmp(array[last], array[nxt]) == 0; ++nxt) {
-			FREES(array[nxt]);
+			__os_freestr(array[nxt]);
 			array[nxt] = NULL;
 		}
 
@@ -278,25 +280,25 @@ lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
 			goto err2;
 
 		/* If the file doesn't exist, ignore it. */
-		if (__db_exists(real_name, NULL) != 0) {
-			FREES(real_name);
-			FREES(array[last]);
+		if (__os_exists(real_name, NULL) != 0) {
+			__os_freestr(real_name);
+			__os_freestr(array[last]);
 			array[last] = NULL;
 			continue;
 		}
 
 		/* Rework the name as requested by the user. */
-		FREES(array[last]);
+		__os_freestr(array[last]);
 		array[last] = NULL;
 		if (pref != NULL) {
 			ret = __absname(pref, real_name, &array[last]);
-			FREES(real_name);
+			__os_freestr(real_name);
 			if (ret != 0)
 				goto err2;
 		} else if ((p = __db_rpath(real_name)) != NULL) {
-			array[last] = (char *)__db_strdup(p + 1);
-			FREES(real_name);
-			if (array[last] == NULL)
+			ret = __os_strdup(p + 1, &array[last]);
+			__os_freestr(real_name);
+			if (ret != 0)
 				goto err2;
 		} else
 			array[last] = real_name;
@@ -320,13 +322,13 @@ err2:	/*
 	 */
 	if (array != NULL)
 		for (; nxt < n; ++nxt)
-			FREES(array[nxt]);
+			__os_freestr(array[nxt]);
 	/* FALLTHROUGH */
 
 err1:	if (array != NULL) {
 		for (arrayp = array; *arrayp != NULL; ++arrayp)
-			FREES(*arrayp);
-		__db_free(array);
+			__os_freestr(*arrayp);
+		__os_free(array, array_size * sizeof(char *));
 	}
 	return (ret);
 }
@@ -340,17 +342,17 @@ __absname(pref, name, newnamep)
 	char *pref, *name, **newnamep;
 {
 	size_t l_pref, l_name;
-	int isabspath;
+	int isabspath, ret;
 	char *newname;
 
 	l_name = strlen(name);
-	isabspath = __db_abspath(name);
+	isabspath = __os_abspath(name);
 	l_pref = isabspath ? 0 : strlen(pref);
 
 	/* Malloc space for concatenating the two. */
-	if ((*newnamep =
-	    newname = (char *)__db_malloc(l_pref + l_name + 2)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(l_pref + l_name + 2, NULL, &newname)) != 0)
+		return (ret);
+	*newnamep = newname;
 
 	/* Build the name.  If `name' is an absolute path, ignore any prefix. */
 	if (!isabspath) {
@@ -369,11 +371,12 @@ __absname(pref, name, newnamep)
  *	If the user has their own malloc routine, use it.
  */
 static int
-__usermem(listp, cmpfunc)
+__usermem(listp, db_malloc)
 	char ***listp;
-	void *(*cmpfunc) __P((size_t));
+	void *(*db_malloc) __P((size_t));
 {
 	size_t len;
+	int ret;
 	char **array, **arrayp, **orig, *strp;
 
 	/* Find out how much space we need. */
@@ -381,18 +384,10 @@ __usermem(listp, cmpfunc)
 		len += sizeof(char *) + strlen(*orig) + 1;
 	len += sizeof(char *);
 
-	/*
-	 * Allocate it and set up the pointers.
-	 *
-	 * XXX
-	 * Don't simplify this expression, SunOS compilers don't like it.
-	 */
-	if (cmpfunc == NULL)
-		array = (char **)__db_malloc(len);
-	else
-		array = (char **)cmpfunc(len);
-	if (array == NULL)
-		return (ENOMEM);
+	/* Allocate it and set up the pointers. */
+	if ((ret = __os_malloc(len, db_malloc, &array)) != 0)
+		return (ret);
+
 	strp = (char *)(array + (orig - *listp) + 1);
 
 	/* Copy the original information into the new memory. */
@@ -402,13 +397,13 @@ __usermem(listp, cmpfunc)
 		*arrayp = strp;
 		strp += len + 1;
 
-		FREES(*orig);
+		__os_freestr(*orig);
 	}
 
 	/* NULL-terminate the list. */
 	*arrayp = NULL;
 
-	__db_free(*listp);
+	__os_free(*listp, 0);
 	*listp = array;
 
 	return (0);
diff --git a/db2/log/log_auto.c b/db2/log/log_auto.c
index b17b1ffb2f..92e682661c 100644
--- a/db2/log/log_auto.c
+++ b/db2/log/log_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "log.h"
@@ -43,8 +42,7 @@ int __log_register_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_log_register;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -54,8 +52,8 @@ int __log_register_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (uid == NULL ? 0 : uid->size)
 	    + sizeof(id)
 	    + sizeof(ftype);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -97,7 +95,7 @@ int __log_register_log(logp, txnid, ret_lsnp, flags,
 	ret = __log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -155,7 +153,7 @@ __log_register_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tid: %lu\n", (u_long)argp->id);
 	printf("\tftype: 0x%lx\n", (u_long)argp->ftype);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -169,11 +167,12 @@ __log_register_read(recbuf, argpp)
 {
 	__log_register_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__log_register_args *)__db_malloc(sizeof(__log_register_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__log_register_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
diff --git a/db2/log/log_findckp.c b/db2/log/log_findckp.c
index 1f717b49e7..ab13c8380e 100644
--- a/db2/log/log_findckp.c
+++ b/db2/log/log_findckp.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_findckp.c	10.15 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)log_findckp.c	10.17 (Sleepycat) 9/17/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -28,7 +28,10 @@ static const char sccsid[] = "@(#)log_findckp.c	10.15 (Sleepycat) 4/26/98";
  * __log_findckp --
  *
  * Looks for the most recent checkpoint that occurs before the most recent
- * checkpoint LSN.  This is the point from which recovery can start and the
+ * checkpoint LSN, subject to the constraint that there must be at least two
+ * checkpoints.  The reason you need two checkpoints is that you might have
+ * crashed during the most recent one and may not have a copy of all the
+ * open files.  This is the point from which recovery can start and the
  * point up to which archival/truncation can take place.  Checkpoints in
  * the log look like:
  *
@@ -56,7 +59,7 @@ __log_findckp(lp, lsnp)
 	DB_LSN *lsnp;
 {
 	DBT data;
-	DB_LSN ckp_lsn, last_ckp, next_lsn;
+	DB_LSN ckp_lsn, final_ckp, last_ckp, next_lsn;
 	__txn_ckp_args *ckp_args;
 	int ret, verbose;
 
@@ -77,16 +80,17 @@ __log_findckp(lp, lsnp)
 			return (ret);
 	}
 
+	final_ckp = last_ckp;
 	next_lsn = last_ckp;
 	do {
 		if (F_ISSET(lp, DB_AM_THREAD))
-			__db_free(data.data);
+			__os_free(data.data, data.size);
 
 		if ((ret = log_get(lp, &next_lsn, &data, DB_SET)) != 0)
 			return (ret);
 		if ((ret = __txn_ckp_read(data.data, &ckp_args)) != 0) {
 			if (F_ISSET(lp, DB_AM_THREAD))
-				__db_free(data.data);
+				__os_free(data.data, data.size);
 			return (ret);
 		}
 		if (IS_ZERO_LSN(ckp_lsn))
@@ -103,12 +107,19 @@ __log_findckp(lp, lsnp)
 		}
 		last_ckp = next_lsn;
 		next_lsn = ckp_args->last_ckp;
-		__db_free(ckp_args);
+		__os_free(ckp_args, sizeof(*ckp_args));
+
+		/*
+		 * Keep looping until either 1) you run out of checkpoints, or
+		 * 2) you've found a checkpoint before the most recent
+		 * checkpoint's LSN and you have at least 2 checkpoints.
+		 */
 	} while (!IS_ZERO_LSN(next_lsn) &&
-	    log_compare(&last_ckp, &ckp_lsn) > 0);
+	    (log_compare(&last_ckp, &ckp_lsn) > 0 ||
+	    log_compare(&final_ckp, &last_ckp) == 0));
 
 	if (F_ISSET(lp, DB_AM_THREAD))
-		__db_free(data.data);
+		__os_free(data.data, data.size);
 
 	/*
 	 * At this point, either, next_lsn is ZERO or ckp_lsn is the
@@ -117,11 +128,12 @@ __log_findckp(lp, lsnp)
 	 * next_lsn must be 0 and we need to roll forward from the
 	 * beginning of the log.
 	 */
-	if (log_compare(&last_ckp, &ckp_lsn) > 0) {
+	if (log_compare(&last_ckp, &ckp_lsn) > 0 ||
+	    log_compare(&final_ckp, &last_ckp) == 0) {
 get_first:	if ((ret = log_get(lp, &last_ckp, &data, DB_FIRST)) != 0)
 			return (ret);
 		if (F_ISSET(lp, DB_AM_THREAD))
-			__db_free(data.data);
+			__os_free(data.data, data.size);
 	}
 	*lsnp = last_ckp;
 
diff --git a/db2/log/log_get.c b/db2/log/log_get.c
index 84ddca1c73..de81519a7c 100644
--- a/db2/log/log_get.c
+++ b/db2/log/log_get.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_get.c	10.32 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)log_get.c	10.38 (Sleepycat) 10/3/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -38,26 +38,16 @@ log_get(dblp, alsn, dbt, flags)
 {
 	int ret;
 
+	LOG_PANIC_CHECK(dblp);
+
 	/* Validate arguments. */
-#define	OKFLAGS	(DB_CHECKPOINT | \
-    DB_CURRENT | DB_FIRST | DB_LAST | DB_NEXT | DB_PREV | DB_SET)
-	if ((ret = __db_fchk(dblp->dbenv, "log_get", flags, OKFLAGS)) != 0)
-		return (ret);
-	switch (flags) {
-	case DB_CHECKPOINT:
-	case DB_CURRENT:
-	case DB_FIRST:
-	case DB_LAST:
-	case DB_NEXT:
-	case DB_PREV:
-	case DB_SET:
-		break;
-	default:
+	if (flags != DB_CHECKPOINT && flags != DB_CURRENT &&
+	    flags != DB_FIRST && flags != DB_LAST &&
+	    flags != DB_NEXT && flags != DB_PREV && flags != DB_SET)
 		return (__db_ferr(dblp->dbenv, "log_get", 1));
-	}
 
 	if (F_ISSET(dblp, DB_AM_THREAD)) {
-		if (LF_ISSET(DB_NEXT | DB_PREV | DB_CURRENT))
+		if (flags == DB_NEXT || flags == DB_PREV || flags == DB_CURRENT)
 			return (__db_ferr(dblp->dbenv, "log_get", 1));
 		if (!F_ISSET(dbt, DB_DBT_USERMEM | DB_DBT_MALLOC))
 			return (__db_ferr(dblp->dbenv, "threaded data", 1));
@@ -156,7 +146,7 @@ __log_get(dblp, alsn, dbt, flags, silent)
 			/* If at start-of-file, move to the previous file. */
 			if (nlsn.offset == 0) {
 				if (nlsn.file == 1 ||
-				    __log_valid(dblp, NULL, nlsn.file - 1) != 0)
+				    __log_valid(dblp, nlsn.file - 1, 0) != 0)
 					return (DB_NOTFOUND);
 
 				--nlsn.file;
@@ -183,7 +173,7 @@ retry:
 
 	/* If we've switched files, discard the current fd. */
 	if (dblp->c_lsn.file != nlsn.file && dblp->c_fd != -1) {
-		(void)__db_close(dblp->c_fd);
+		(void)__os_close(dblp->c_fd);
 		dblp->c_fd = -1;
 	}
 
@@ -203,24 +193,22 @@ retry:
 
 	/* Acquire a file descriptor. */
 	if (dblp->c_fd == -1) {
-		if ((ret = __log_name(dblp, nlsn.file, &np)) != 0)
-			goto err1;
-		if ((ret = __db_open(np, DB_RDONLY | DB_SEQUENTIAL,
-		    DB_RDONLY | DB_SEQUENTIAL, 0, &dblp->c_fd)) != 0) {
+		if ((ret = __log_name(dblp, nlsn.file,
+		    &np, &dblp->c_fd, DB_RDONLY | DB_SEQUENTIAL)) != 0) {
 			fail = np;
 			goto err1;
 		}
-		__db_free(np);
+		__os_freestr(np);
 		np = NULL;
 	}
 
 	/* Seek to the header offset and read the header. */
 	if ((ret =
-	    __db_seek(dblp->c_fd, 0, 0, nlsn.offset, 0, SEEK_SET)) != 0) {
+	    __os_seek(dblp->c_fd, 0, 0, nlsn.offset, 0, SEEK_SET)) != 0) {
 		fail = "seek";
 		goto err1;
 	}
-	if ((ret = __db_read(dblp->c_fd, &hdr, sizeof(HDR), &nr)) != 0) {
+	if ((ret = __os_read(dblp->c_fd, &hdr, sizeof(HDR), &nr)) != 0) {
 		fail = "read";
 		goto err1;
 	}
@@ -276,10 +264,8 @@ retry:
 	 * We're calling malloc(3) with a region locked.  This isn't
 	 * a good idea.
 	 */
-	if ((tbuf = (char *)__db_malloc(len)) == NULL) {
-		ret = ENOMEM;
+	if ((ret = __os_malloc(len, NULL, &tbuf)) != 0)
 		goto err1;
-	}
 
 	/*
 	 * Read the record into the buffer.  If read returns a short count,
@@ -287,7 +273,7 @@ retry:
 	 * buffer.  Note, the information may be garbage if we're in recovery,
 	 * so don't read past the end of the buffer's memory.
 	 */
-	if ((ret = __db_read(dblp->c_fd, tbuf, len, &nr)) != 0) {
+	if ((ret = __os_read(dblp->c_fd, tbuf, len, &nr)) != 0) {
 		fail = "read";
 		goto err1;
 	}
@@ -305,7 +291,7 @@ retry:
 	if ((ret = __db_retcopy(dbt, tbuf, len,
 	    &dblp->c_dbt.data, &dblp->c_dbt.ulen, NULL)) != 0)
 		goto err1;
-	__db_free(tbuf);
+	__os_free(tbuf, 0);
 	tbuf = NULL;
 
 cksum:	if (hdr.cksum != __ham_func4(dbt->data, dbt->size)) {
@@ -329,7 +315,7 @@ corrupt:/*
 	ret = EIO;
 	fail = "read";
 
- err1:	if (!silent) {
+err1:	if (!silent) {
 		if (fail == NULL)
 			__db_err(dblp->dbenv, "log_get: %s", strerror(ret));
 		else
@@ -337,8 +323,8 @@ corrupt:/*
 			    "log_get: %s: %s", fail, strerror(ret));
 	}
 err2:	if (np != NULL)
-		__db_free(np);
+		__os_freestr(np);
 	if (tbuf != NULL)
-		__db_free(tbuf);
+		__os_free(tbuf, 0);
 	return (ret);
 }
diff --git a/db2/log/log_put.c b/db2/log/log_put.c
index 5ef2294af5..86de6b0d1d 100644
--- a/db2/log/log_put.c
+++ b/db2/log/log_put.c
@@ -7,13 +7,14 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_put.c	10.35 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)log_put.c	10.44 (Sleepycat) 11/3/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <stdio.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
@@ -24,6 +25,7 @@ static const char sccsid[] = "@(#)log_put.c	10.35 (Sleepycat) 5/6/98";
 #include "db_page.h"
 #include "log.h"
 #include "hash.h"
+#include "clib_ext.h"
 #include "common_ext.h"
 
 static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
@@ -45,22 +47,12 @@ log_put(dblp, lsn, dbt, flags)
 {
 	int ret;
 
+	LOG_PANIC_CHECK(dblp);
+
 	/* Validate arguments. */
-#define	OKFLAGS	(DB_CHECKPOINT | DB_FLUSH | DB_CURLSN)
-	if (flags != 0) {
-		if ((ret =
-		    __db_fchk(dblp->dbenv, "log_put", flags, OKFLAGS)) != 0)
-			return (ret);
-		switch (flags) {
-		case DB_CHECKPOINT:
-		case DB_CURLSN:
-		case DB_FLUSH:
-		case 0:
-			break;
-		default:
-			return (__db_ferr(dblp->dbenv, "log_put", 1));
-		}
-	}
+	if (flags != 0 && flags != DB_CHECKPOINT &&
+	    flags != DB_CURLSN && flags != DB_FLUSH)
+		return (__db_ferr(dblp->dbenv, "log_put", 0));
 
 	LOCK_LOGREGION(dblp);
 	ret = __log_put(dblp, lsn, dbt, flags);
@@ -95,7 +87,7 @@ __log_put(dblp, lsn, dbt, flags)
 	 * the information.  Currently used by the transaction manager
 	 * to avoid writing TXN_begin records.
 	 */
-	if (LF_ISSET(DB_CURLSN)) {
+	if (flags == DB_CURLSN) {
 		lsn->file = lp->lsn.file;
 		lsn->offset = lp->lsn.offset;
 		return (0);
@@ -165,6 +157,8 @@ __log_put(dblp, lsn, dbt, flags)
 
 		for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname);
 		    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
+			if (fnp->ref == 0)	/* Entry not in use. */
+				continue;
 			memset(&t, 0, sizeof(t));
 			t.data = R_ADDR(dblp, fnp->name_off);
 			t.size = strlen(t.data) + 1;
@@ -248,6 +242,8 @@ log_flush(dblp, lsn)
 {
 	int ret;
 
+	LOG_PANIC_CHECK(dblp);
+
 	LOCK_LOGREGION(dblp);
 	ret = __log_flush(dblp, lsn);
 	UNLOCK_LOGREGION(dblp);
@@ -304,8 +300,7 @@ __log_flush(dblp, lsn)
 	 * buffer's starting LSN.
 	 */
 	current = 0;
-	if (lp->b_off != 0 &&
-	    lsn->file >= lp->f_lsn.file && lsn->offset >= lp->f_lsn.offset) {
+	if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) {
 		if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0)
 			return (ret);
 
@@ -322,8 +317,10 @@ __log_flush(dblp, lsn)
 			return (ret);
 
 	/* Sync all writes to disk. */
-	if ((ret = __db_fsync(dblp->lfd)) != 0)
+	if ((ret = __os_fsync(dblp->lfd)) != 0) {
+		__db_panic(dblp->dbenv, ret);
 		return (ret);
+	}
 	++lp->stat.st_scount;
 
 	/*
@@ -331,9 +328,16 @@ __log_flush(dblp, lsn)
 	 * the current buffer was flushed, we know the LSN of the first byte
 	 * of the buffer is on disk, otherwise, we only know that the LSN of
 	 * the record before the one beginning the current buffer is on disk.
+	 *
+	 * XXX
+	 * Check to make sure that the saved lsn isn't 0 before we go making
+	 * this change.  If DB_CHECKPOINT was called before we actually wrote
+	 * something, you can end up here without ever having written anything
+	 * to a log file, and decrementing either s_lsn.file or s_lsn.offset
+	 * will cause much sadness later on.
 	 */
 	lp->s_lsn = lp->f_lsn;
-	if (!current) {
+	if (!current && lp->s_lsn.file != 0) {
 		if (lp->s_lsn.offset == 0) {
 			--lp->s_lsn.file;
 			lp->s_lsn.offset = lp->persist.lg_max;
@@ -431,10 +435,11 @@ __log_write(dblp, addr, len)
 	 * Seek to the offset in the file (someone may have written it
 	 * since we last did).
 	 */
-	if ((ret = __db_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0)
-		return (ret);
-	if ((ret = __db_write(dblp->lfd, addr, len, &nw)) != 0)
+	if ((ret = __os_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0 ||
+	    (ret = __os_write(dblp->lfd, addr, len, &nw)) != 0) {
+		__db_panic(dblp->dbenv, ret);
 		return (ret);
+	}
 	if (nw != (int32_t)len)
 		return (EIO);
 
@@ -467,21 +472,23 @@ log_file(dblp, lsn, namep, len)
 	size_t len;
 {
 	int ret;
-	char *p;
+	char *name;
+
+	LOG_PANIC_CHECK(dblp);
 
 	LOCK_LOGREGION(dblp);
-	ret = __log_name(dblp, lsn->file, &p);
+	ret = __log_name(dblp, lsn->file, &name, NULL, 0);
 	UNLOCK_LOGREGION(dblp);
 	if (ret != 0)
 		return (ret);
 
 	/* Check to make sure there's enough room and copy the name. */
-	if (len < strlen(p) + 1) {
+	if (len < strlen(name) + 1) {
 		*namep = '\0';
 		return (ENOMEM);
 	}
-	(void)strcpy(namep, p);
-	__db_free(p);
+	(void)strcpy(namep, name);
+	__os_freestr(name);
 
 	return (0);
 }
@@ -495,43 +502,102 @@ __log_newfd(dblp)
 	DB_LOG *dblp;
 {
 	int ret;
-	char *p;
+	char *name;
 
 	/* Close any previous file descriptor. */
 	if (dblp->lfd != -1) {
-		(void)__db_close(dblp->lfd);
+		(void)__os_close(dblp->lfd);
 		dblp->lfd = -1;
 	}
 
 	/* Get the path of the new file and open it. */
 	dblp->lfname = dblp->lp->lsn.file;
-	if ((ret = __log_name(dblp, dblp->lfname, &p)) != 0)
-		return (ret);
-	if ((ret = __db_open(p,
-	    DB_CREATE | DB_SEQUENTIAL,
-	    DB_CREATE | DB_SEQUENTIAL,
-	    dblp->lp->persist.mode, &dblp->lfd)) != 0)
-		__db_err(dblp->dbenv,
-		    "log_put: %s: %s", p, strerror(ret));
-	FREES(p);
+	if ((ret = __log_name(dblp,
+	    dblp->lfname, &name, &dblp->lfd, DB_CREATE | DB_SEQUENTIAL)) != 0)
+		__db_err(dblp->dbenv, "log_put: %s: %s", name, strerror(ret));
+
+	__os_freestr(name);
 	return (ret);
 }
 
 /*
  * __log_name --
- *	Return the log name for a particular file.
+ *	Return the log name for a particular file, and optionally open it.
  *
- * PUBLIC: int __log_name __P((DB_LOG *, int, char **));
+ * PUBLIC: int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t));
  */
 int
-__log_name(dblp, filenumber, namep)
+__log_name(dblp, filenumber, namep, fdp, flags)
 	DB_LOG *dblp;
+	u_int32_t filenumber, flags;
 	char **namep;
-	int filenumber;
+	int *fdp;
 {
-	char name[sizeof(LFNAME) + 10];
+	int ret;
+	char *oname;
+	char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
+
+	/*
+	 * !!!
+	 * The semantics of this routine are bizarre.
+	 *
+	 * The reason for all of this is that we need a place where we can
+	 * intercept requests for log files, and, if appropriate, check for
+	 * both the old-style and new-style log file names.  The trick is
+	 * that all callers of this routine that are opening the log file
+	 * read-only want to use an old-style file name if they can't find
+	 * a match using a new-style name.  The only down-side is that some
+	 * callers may check for the old-style when they really don't need
+	 * to, but that shouldn't mess up anything, and we only check for
+	 * the old-style name when we've already failed to find a new-style
+	 * one.
+	 *
+	 * Create a new-style file name, and if we're not going to open the
+	 * file, return regardless.
+	 */
+	(void)snprintf(new, sizeof(new), LFNAME, filenumber);
+	if ((ret = __db_appname(dblp->dbenv,
+	    DB_APP_LOG, dblp->dir, new, 0, NULL, namep)) != 0 || fdp == NULL)
+		return (ret);
 
-	(void)snprintf(name, sizeof(name), LFNAME, filenumber);
-	return (__db_appname(dblp->dbenv,
-	    DB_APP_LOG, dblp->dir, name, 0, NULL, namep));
+	/* Open the new-style file -- if we succeed, we're done. */
+	if ((ret = __db_open(*namep,
+	    flags, flags, dblp->lp->persist.mode, fdp)) == 0)
+		return (0);
+
+	/*
+	 * The open failed... if the DB_RDONLY flag isn't set, we're done,
+	 * the caller isn't interested in old-style files.
+	 */
+	if (!LF_ISSET(DB_RDONLY))
+		return (ret);
+
+	/* Create an old-style file name. */
+	(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
+	if ((ret = __db_appname(dblp->dbenv,
+	    DB_APP_LOG, dblp->dir, old, 0, NULL, &oname)) != 0)
+		goto err;
+
+	/*
+	 * Open the old-style file -- if we succeed, we're done.  Free the
+	 * space allocated for the new-style name and return the old-style
+	 * name to the caller.
+	 */
+	if ((ret = __db_open(oname,
+	    flags, flags, dblp->lp->persist.mode, fdp)) == 0) {
+		__os_freestr(*namep);
+		*namep = oname;
+		return (0);
+	}
+
+	/*
+	 * Couldn't find either style of name -- return the new-style name
+	 * for the caller's error message.  If it's an old-style name that's
+	 * actually missing we're going to confuse the user with the error
+	 * message, but that implies that not only were we looking for an
+	 * old-style name, but we expected it to exist and we weren't just
+	 * looking for any log file.  That's not a likely error.
+	 */
+err:	__os_freestr(oname);
+	return (ret);
 }
diff --git a/db2/log/log_rec.c b/db2/log/log_rec.c
index 5deac46298..8895150be1 100644
--- a/db2/log/log_rec.c
+++ b/db2/log/log_rec.c
@@ -40,7 +40,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_rec.c	10.20 (Sleepycat) 4/28/98";
+static const char sccsid[] = "@(#)log_rec.c	10.26 (Sleepycat) 10/21/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -56,8 +56,10 @@ static const char sccsid[] = "@(#)log_rec.c	10.20 (Sleepycat) 4/28/98";
 #include "db_dispatch.h"
 #include "common_ext.h"
 
-static int __log_open_file __P((DB_LOG *,
+static int __log_do_open __P((DB_LOG *,
     u_int8_t *, char *, DBTYPE, u_int32_t));
+static int __log_lid_to_fname __P((DB_LOG *, u_int32_t, FNAME **));
+static int __log_open_file __P((DB_LOG *, __log_register_args *));
 
 /*
  * PUBLIC: int __log_register_recover
@@ -80,7 +82,7 @@ __log_register_recover(logp, dbtp, lsnp, redo, info)
 	COMPQUIET(info, NULL);
 	COMPQUIET(lsnp, NULL);
 
-	F_SET(logp, DB_AM_RECOVER);
+	F_SET(logp, DBC_RECOVER);
 
 	if ((ret = __log_register_read(dbtp->data, &argp)) != 0)
 		goto out;
@@ -95,13 +97,11 @@ __log_register_recover(logp, dbtp, lsnp, redo, info)
 		 * If we are redoing an open or undoing a close, then we need
 		 * to open a file.
 		 */
-		ret = __log_open_file(logp,
-		    argp->uid.data, argp->name.data, argp->ftype, argp->id);
+		ret = __log_open_file(logp, argp);
 		if (ret == ENOENT) {
 			if (redo == TXN_OPENFILES)
-				__db_err(logp->dbenv,
-				    "warning: file %s not found",
-				    argp->name.data);
+				__db_err(logp->dbenv, "warning: %s: %s",
+				    argp->name.data, strerror(ENOENT));
 			ret = 0;
 		}
 	} else if (argp->opcode != LOG_CHECKPOINT) {
@@ -109,26 +109,42 @@ __log_register_recover(logp, dbtp, lsnp, redo, info)
 		 * If we are redoing a close or undoing an open, then we need
 		 * to close the file.
 		 *
-		 * If the file is deleted, then we can just ignore this close.
-		 * Otherwise, we'd better have a valid dbp that we should either
-		 * close or whose reference count should be decremented.
+  		 * If the file is deleted, then we can just ignore this close.
+ 		 * Otherwise, we should usually have a valid dbp we should
+  		 * close or whose reference count should be decremented.
+ 		 * However, if we shut down without closing a file, we
+		 * may, in fact, not have the file open, and that's OK.
 		 */
 		LOCK_LOGTHREAD(logp);
-		if (logp->dbentry[argp->id].dbp == NULL) {
-			if (!logp->dbentry[argp->id].deleted)
-				ret = EINVAL;
-		} else if (--logp->dbentry[argp->id].refcount == 0) {
-			F_SET(logp->dbentry[argp->id].dbp, DB_AM_RECOVER);
+		if (logp->dbentry[argp->id].dbp != NULL &&
+		    --logp->dbentry[argp->id].refcount == 0) {
 			ret = logp->dbentry[argp->id].dbp->close(
 			    logp->dbentry[argp->id].dbp, 0);
 			logp->dbentry[argp->id].dbp = NULL;
 		}
 		UNLOCK_LOGTHREAD(logp);
+ 	} else if (redo == TXN_UNDO &&
+ 	    (argp->id >= logp->dbentry_cnt ||
+ 	    (!logp->dbentry[argp->id].deleted &&
+ 	    logp->dbentry[argp->id].dbp == NULL))) {
+ 		/*
+ 		 * It's a checkpoint and we are rolling backward.  It
+ 		 * is possible that the system was shut down and thus
+ 		 * ended with a stable checkpoint; this file was never
+ 		 * closed and has therefore not been reopened yet.  If
+ 		 * so, we need to try to open it.
+ 		 */
+ 		ret = __log_open_file(logp, argp);
+ 		if (ret == ENOENT) {
+ 			__db_err(logp->dbenv, "warning: %s: %s",
+			    argp->name.data, strerror(ENOENT));
+ 			ret = 0;
+ 		}
 	}
 
-out:	F_CLR(logp, DB_AM_RECOVER);
+out:	F_CLR(logp, DBC_RECOVER);
 	if (argp != NULL)
-		__db_free(argp);
+		__os_free(argp, 0);
 	return (ret);
 }
 
@@ -140,34 +156,49 @@ out:	F_CLR(logp, DB_AM_RECOVER);
  * Returns 0 on success, non-zero on error.
  */
 static int
-__log_open_file(lp, uid, name, ftype, ndx)
+__log_open_file(lp, argp)
 	DB_LOG *lp;
-	u_int8_t *uid;
-	char *name;
-	DBTYPE ftype;
-	u_int32_t ndx;
+	__log_register_args *argp;
 {
-	DB *dbp;
-	int ret;
-
 	LOCK_LOGTHREAD(lp);
-	if (ndx < lp->dbentry_cnt &&
-	    (lp->dbentry[ndx].deleted == 1 || lp->dbentry[ndx].dbp != NULL)) {
-		lp->dbentry[ndx].refcount++;
+	if (argp->id < lp->dbentry_cnt &&
+	    (lp->dbentry[argp->id].deleted == 1 ||
+	    lp->dbentry[argp->id].dbp != NULL)) {
+		if (argp->opcode != LOG_CHECKPOINT)
+			lp->dbentry[argp->id].refcount++;
 
 		UNLOCK_LOGTHREAD(lp);
 		return (0);
 	}
 	UNLOCK_LOGTHREAD(lp);
+	return (__log_do_open(lp,
+	    argp->uid.data, argp->name.data, argp->ftype, argp->id));
+}
+
+/*
+ * __log_do_open --
+ * 	Open files referenced in the log.  This is the part of the open that
+ * is not protected by the thread mutex.
+ */
+
+static int
+__log_do_open(lp, uid, name, ftype, ndx)
+	DB_LOG *lp;
+	u_int8_t *uid;
+	char *name;
+	DBTYPE ftype;
+	u_int32_t ndx;
+{
+	DB *dbp;
+	int ret;
 
-	/* Need to open file. */
 	dbp = NULL;
 	if ((ret = db_open(name, ftype, 0, 0, lp->dbenv, NULL, &dbp)) == 0) {
 		/*
 		 * Verify that we are opening the same file that we were
 		 * referring to when we wrote this log record.
 		 */
-		if (memcmp(uid, dbp->lock.fileid, DB_FILE_ID_LEN) != 0) {
+		if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) {
 			(void)dbp->close(dbp, 0);
 			dbp = NULL;
 			ret = ENOENT;
@@ -181,10 +212,9 @@ __log_open_file(lp, uid, name, ftype, ndx)
 }
 
 /*
- * This function returns:
- *	0 SUCCESS (the entry was not previously set and is now set or the
- *		entry was previously set and we just inced the ref count.
- *	>0 on system error (returns errno value).
+ * __log_add_logid --
+ *	Adds a DB entry to the log's DB entry table.
+ *
  * PUBLIC: int __log_add_logid __P((DB_LOG *, DB *, u_int32_t));
  */
 int
@@ -193,43 +223,30 @@ __log_add_logid(logp, dbp, ndx)
 	DB *dbp;
 	u_int32_t ndx;
 {
-	DB_ENTRY *temp_entryp;
 	u_int32_t i;
 	int ret;
 
 	ret = 0;
 
 	LOCK_LOGTHREAD(logp);
+
 	/*
-	 * Check if we need to grow the table.
+	 * Check if we need to grow the table.  Note, ndx is 0-based (the
+	 * index into the DB entry table) and dbentry_cnt is 1-based, the
+	 * number of available slots.
 	 */
 	if (logp->dbentry_cnt <= ndx) {
-		if (logp->dbentry_cnt == 0) {
-			logp->dbentry = (DB_ENTRY *)
-			    __db_malloc(DB_GROW_SIZE * sizeof(DB_ENTRY));
-			if (logp->dbentry == NULL) {
-				ret = ENOMEM;
-				goto err;
-			}
-		} else {
-			temp_entryp = (DB_ENTRY *)__db_realloc(logp->dbentry,
-			    (DB_GROW_SIZE + logp->dbentry_cnt) *
-			    sizeof(DB_ENTRY));
-			if (temp_entryp == NULL) {
-				ret = ENOMEM;
-				goto err;
-			}
-			logp->dbentry = temp_entryp;
+		if ((ret = __os_realloc(&logp->dbentry,
+		    (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY))) != 0)
+			goto err;
 
-		}
 		/* Initialize the new entries. */
-		for (i = logp->dbentry_cnt;
-		    i < logp->dbentry_cnt + DB_GROW_SIZE; i++) {
+		for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
 			logp->dbentry[i].dbp = NULL;
 			logp->dbentry[i].deleted = 0;
 		}
 
-		logp->dbentry_cnt += DB_GROW_SIZE;
+		logp->dbentry_cnt = i;
 	}
 
 	if (logp->dbentry[ndx].deleted == 0 && logp->dbentry[ndx].dbp == NULL) {
@@ -257,11 +274,47 @@ __db_fileid_to_db(logp, dbpp, ndx)
 	u_int32_t ndx;
 {
 	int ret;
+	char *name;
+	FNAME *fname;
 
 	ret = 0;
 	LOCK_LOGTHREAD(logp);
 
 	/*
+	 * Under XA, a process other than the one issuing DB
+	 * operations may abort a transaction.  In that case,
+	 * recovery routines are run by a process that does not
+	 * necessarily have the file open, so we must open the
+	 * file explicitly.
+	 */
+	if (ndx >= logp->dbentry_cnt ||
+	    (!logp->dbentry[ndx].deleted && logp->dbentry[ndx].dbp == NULL)) {
+		if (__log_lid_to_fname(logp, ndx, &fname) != 0) {
+			/* Couldn't find entry; this is a fatal error. */
+			ret = EINVAL;
+			goto err;
+		}
+		name = R_ADDR(logp, fname->name_off);
+		/*
+		 * __log_do_open is called without protection of the
+		 * log thread lock.
+		 */
+		UNLOCK_LOGTHREAD(logp);
+		/*
+		 * At this point, we are not holding the thread lock, so
+		 * exit directly instead of going through the exit code
+		 * at the bottom.  If the __log_do_open succeeded, then
+		 * we don't need to do any of the remaining error checking
+		 * at the end of this routine.
+		 */
+		if ((ret = __log_do_open(logp,
+		    fname->ufid, name, fname->s_type, ndx)) != 0)
+			return (ret);
+		*dbpp = logp->dbentry[ndx].dbp;
+		return (0);
+	}
+
+	/*
 	 * Return DB_DELETED if the file has been deleted
 	 * (it's not an error).
 	 */
@@ -294,8 +347,12 @@ __log_close_files(logp)
 
 	LOCK_LOGTHREAD(logp);
 	for (i = 0; i < logp->dbentry_cnt; i++)
-		if (logp->dbentry[i].dbp)
+		if (logp->dbentry[i].dbp) {
 			logp->dbentry[i].dbp->close(logp->dbentry[i].dbp, 0);
+			logp->dbentry[i].dbp = NULL;
+			logp->dbentry[i].deleted = 0;
+		}
+	F_CLR(logp, DBC_RECOVER);
 	UNLOCK_LOGTHREAD(logp);
 }
 
@@ -314,3 +371,28 @@ __log_rem_logid(logp, ndx)
 	}
 	UNLOCK_LOGTHREAD(logp);
 }
+
+/*
+ * __log_lid_to_fname --
+ * 	Traverse the shared-memory region looking for the entry that
+ * matches the passed log fileid.  Returns 0 on success; -1 on error.
+ */
+static int
+__log_lid_to_fname(dblp, lid, fnamep)
+	DB_LOG *dblp;
+	u_int32_t lid;
+	FNAME **fnamep;
+{
+	FNAME *fnp;
+
+	for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname);
+	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
+		if (fnp->ref == 0)	/* Entry not in use. */
+			continue;
+		if (fnp->id == lid) {
+			*fnamep = fnp;
+			return (0);
+		}
+	}
+	return (-1);
+}
diff --git a/db2/log/log_register.c b/db2/log/log_register.c
index a6fc4c1b3b..22264e3291 100644
--- a/db2/log/log_register.c
+++ b/db2/log/log_register.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)log_register.c	10.18 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)log_register.c	10.22 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -36,17 +36,18 @@ log_register(dblp, dbp, name, type, idp)
 {
 	DBT fid_dbt, r_name;
 	DB_LSN r_unused;
-	FNAME *fnp;
+	FNAME *fnp, *reuse_fnp;
 	size_t len;
-	u_int32_t fid;
+	u_int32_t maxid;
 	int inserted, ret;
 	char *fullname;
 	void *namep;
 
-	fid = 0;
 	inserted = 0;
 	fullname = NULL;
-	fnp = namep = NULL;
+	fnp = namep = reuse_fnp = NULL;
+
+	LOG_PANIC_CHECK(dblp);
 
 	/* Check the arguments. */
 	if (type != DB_BTREE && type != DB_HASH && type != DB_RECNO) {
@@ -63,26 +64,37 @@ log_register(dblp, dbp, name, type, idp)
 
 	/*
 	 * See if we've already got this file in the log, finding the
-	 * next-to-lowest file id currently in use as we do it.
+	 * (maximum+1) in-use file id and some available file id (if we
+	 * find an available fid, we'll use it, else we'll have to allocate
+	 * one after the maximum that we found).
 	 */
-	for (fid = 1, fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname);
+	for (maxid = 0, fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname);
 	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-		if (fid <= fnp->id)
-			fid = fnp->id + 1;
-		if (!memcmp(dbp->lock.fileid, fnp->ufid, DB_FILE_ID_LEN)) {
+		if (fnp->ref == 0) {		/* Entry is not in use. */
+			if (reuse_fnp == NULL)
+				reuse_fnp = fnp;
+			continue;
+		}
+		if (!memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN)) {
 			++fnp->ref;
-			fid = fnp->id;
 			goto found;
 		}
+		if (maxid <= fnp->id)
+			maxid = fnp->id + 1;
 	}
 
-	/* Allocate a new file name structure. */
-	if ((ret = __db_shalloc(dblp->addr, sizeof(FNAME), 0, &fnp)) != 0)
+	/* Fill in fnp structure. */
+
+	if (reuse_fnp != NULL)		/* Reuse existing one. */
+		fnp = reuse_fnp;
+	else if ((ret = __db_shalloc(dblp->addr, sizeof(FNAME), 0, &fnp)) != 0)
 		goto err;
+	else				/* Allocate a new one. */
+		fnp->id = maxid;
+
 	fnp->ref = 1;
-	fnp->id = fid;
 	fnp->s_type = type;
-	memcpy(fnp->ufid, dbp->lock.fileid, DB_FILE_ID_LEN);
+	memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
 
 	len = strlen(name) + 1;
 	if ((ret = __db_shalloc(dblp->addr, len, 0, &namep)) != 0)
@@ -90,20 +102,22 @@ log_register(dblp, dbp, name, type, idp)
 	fnp->name_off = R_OFFSET(dblp, namep);
 	memcpy(namep, name, len);
 
-	SH_TAILQ_INSERT_HEAD(&dblp->lp->fq, fnp, q, __fname);
+	/* Only do the insert if we allocated a new fnp. */
+	if (reuse_fnp == NULL)
+		SH_TAILQ_INSERT_HEAD(&dblp->lp->fq, fnp, q, __fname);
 	inserted = 1;
 
 found:	/* Log the registry. */
-	if (!F_ISSET(dblp, DB_AM_RECOVER)) {
+	if (!F_ISSET(dblp, DBC_RECOVER)) {
 		r_name.data = (void *)name;		/* XXX: Yuck! */
 		r_name.size = strlen(name) + 1;
 		memset(&fid_dbt, 0, sizeof(fid_dbt));
-		fid_dbt.data = dbp->lock.fileid;
+		fid_dbt.data = dbp->fileid;
 		fid_dbt.size = DB_FILE_ID_LEN;
 		if ((ret = __log_register_log(dblp, NULL, &r_unused,
-		    0, LOG_OPEN, &r_name, &fid_dbt, fid, type)) != 0)
+		    0, LOG_OPEN, &r_name, &fid_dbt, fnp->id, type)) != 0)
 			goto err;
-		if ((ret = __log_add_logid(dblp, dbp, fid)) != 0)
+		if ((ret = __log_add_logid(dblp, dbp, fnp->id)) != 0)
 			goto err;
 	}
 
@@ -120,13 +134,13 @@ err:		/*
 			__db_shalloc_free(dblp->addr, fnp);
 	}
 
+	if (idp != NULL)
+		*idp = fnp->id;
 	UNLOCK_LOGREGION(dblp);
 
 	if (fullname != NULL)
-		FREES(fullname);
+		__os_freestr(fullname);
 
-	if (idp != NULL)
-		*idp = fid;
 	return (ret);
 }
 
@@ -144,6 +158,8 @@ log_unregister(dblp, fid)
 	FNAME *fnp;
 	int ret;
 
+	LOG_PANIC_CHECK(dblp);
+
 	ret = 0;
 	LOCK_LOGREGION(dblp);
 
@@ -159,7 +175,7 @@ log_unregister(dblp, fid)
 	}
 
 	/* Unlog the registry. */
-	if (!F_ISSET(dblp, DB_AM_RECOVER)) {
+	if (!F_ISSET(dblp, DBC_RECOVER)) {
 		memset(&r_name, 0, sizeof(r_name));
 		r_name.data = R_ADDR(dblp, fnp->name_off);
 		r_name.size = strlen(r_name.data) + 1;
@@ -173,22 +189,18 @@ log_unregister(dblp, fid)
 
 	/*
 	 * If more than 1 reference, just decrement the reference and return.
-	 * Otherwise, free the unique file information, name and structure.
+	 * Otherwise, free the name.
 	 */
-	if (fnp->ref > 1)
-		--fnp->ref;
-	else {
+	--fnp->ref;
+	if (fnp->ref == 0)
 		__db_shalloc_free(dblp->addr, R_ADDR(dblp, fnp->name_off));
-		SH_TAILQ_REMOVE(&dblp->lp->fq, fnp, q, __fname);
-		__db_shalloc_free(dblp->addr, fnp);
-	}
 
 	/*
 	 * Remove from the process local table.  If this operation is taking
 	 * place during recovery, then the logid was never added to the table,
 	 * so do not remove it.
 	 */
-	if (!F_ISSET(dblp, DB_AM_RECOVER))
+	if (!F_ISSET(dblp, DBC_RECOVER))
 		__log_rem_logid(dblp, fid);
 
 ret1:	UNLOCK_LOGREGION(dblp);
diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c
index d89f9c2ded..12c53417d9 100644
--- a/db2/mp/mp_bh.c
+++ b/db2/mp/mp_bh.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_bh.c	10.38 (Sleepycat) 5/20/98";
+static const char sccsid[] = "@(#)mp_bh.c	10.45 (Sleepycat) 11/25/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -42,11 +42,13 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 {
 	DB_MPOOLFILE *dbmfp;
 	DB_MPREG *mpreg;
+	int incremented, ret;
 
 	if (restartp != NULL)
 		*restartp = 0;
 	if (wrotep != NULL)
 		*wrotep = 0;
+	incremented = 0;
 
 	/*
 	 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
@@ -63,6 +65,13 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 				UNLOCKHANDLE(dbmp, dbmp->mutexp);
 				return (0);
 			}
+
+			/*
+			 * Increment the reference count -- see the comment in
+			 * memp_fclose().
+			 */
+			++dbmfp->ref;
+			incremented = 1;
 			break;
 		}
 	UNLOCKHANDLE(dbmp, dbmp->mutexp);
@@ -117,7 +126,15 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 	    0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
 		return (0);
 
-found:	return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep));
+found:	ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep);
+
+	if (incremented) {
+		LOCKHANDLE(dbmp, dbmp->mutexp);
+		--dbmfp->ref;
+		UNLOCKHANDLE(dbmp, dbmp->mutexp);
+	}
+
+	return (ret);
 }
 
 /*
@@ -132,11 +149,12 @@ __memp_pgread(dbmfp, bhp, can_create)
 	BH *bhp;
 	int can_create;
 {
+	DB_IO db_io;
 	DB_MPOOL *dbmp;
 	MPOOLFILE *mfp;
-	size_t pagesize;
+	size_t len, pagesize;
 	ssize_t nr;
-	int ret;
+	int created, ret;
 
 	dbmp = dbmfp->dbmp;
 	mfp = dbmfp->mfp;
@@ -147,70 +165,63 @@ __memp_pgread(dbmfp, bhp, can_create)
 	UNLOCKREGION(dbmp);
 
 	/*
-	 * Temporary files may not yet have been created.
-	 *
-	 * Seek to the page location.
+	 * Temporary files may not yet have been created.  We don't create
+	 * them now, we create them when the pages have to be flushed.
 	 */
-	ret = 0;
-	LOCKHANDLE(dbmp, dbmfp->mutexp);
-	if (dbmfp->fd == -1 || (ret =
-	    __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0) {
-		if (!can_create) {
-			if (dbmfp->fd == -1)
-				ret = EINVAL;
-			UNLOCKHANDLE(dbmp, dbmfp->mutexp);
+	nr = 0;
+	if (dbmfp->fd == -1)
+		ret = 0;
+	else {
+		/*
+		 * Ignore read errors if we have permission to create the page.
+		 * Assume that the page doesn't exist, and that we'll create it
+		 * when we write it out.
+		 */
+		db_io.fd_io = dbmfp->fd;
+		db_io.fd_lock = dbmp->reginfo.fd;
+		db_io.mutexp =
+		    F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;
+		db_io.pagesize = db_io.bytes = pagesize;
+		db_io.pgno = bhp->pgno;
+		db_io.buf = bhp->buf;
+
+		ret = __os_io(&db_io, DB_IO_READ, &nr);
+	}
+
+	created = 0;
+	if (nr < (ssize_t)pagesize) {
+		if (can_create)
+			created = 1;
+		else {
+			/* If we had a short read, ret may be 0. */
+			if (ret == 0)
+				ret = EIO;
 			__db_err(dbmp->dbenv,
 			    "%s: page %lu doesn't exist, create flag not set",
 			    __memp_fn(dbmfp), (u_long)bhp->pgno);
 			goto err;
 		}
-		UNLOCKHANDLE(dbmp, dbmfp->mutexp);
-
-		/* Clear the created page. */
-		if (mfp->clear_len == 0)
-			memset(bhp->buf, 0, pagesize);
-		else {
-			memset(bhp->buf, 0, mfp->clear_len);
-#ifdef DIAGNOSTIC
-			memset(bhp->buf + mfp->clear_len,
-			    0xff, pagesize - mfp->clear_len);
-#endif
-		}
-
-		goto pgin;
 	}
 
 	/*
-	 * Read the page; short reads are treated like creates, although
-	 * any valid data is preserved.
+	 * Clear any bytes we didn't read that need to be cleared.  If we're
+	 * running in diagnostic mode, smash any bytes on the page that are
+	 * unknown quantities for the caller.
 	 */
-	ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr);
-	UNLOCKHANDLE(dbmp, dbmfp->mutexp);
-	if (ret != 0)
-		goto err;
-
-	if (nr == (ssize_t)pagesize)
-		can_create = 0;
-	else {
-		if (!can_create) {
-			ret = EINVAL;
-			goto err;
-		}
-
-		/*
-		 * If we didn't fail until we tried the read, don't clear the
-		 * whole page, it wouldn't be insane for a filesystem to just
-		 * always behave that way.  Else, clear any uninitialized data.
-		 */
-		if (nr == 0)
-			memset(bhp->buf, 0,
-			    mfp->clear_len == 0 ? pagesize : mfp->clear_len);
-		else
-			memset(bhp->buf + nr, 0, pagesize - nr);
+	if (nr != (ssize_t)pagesize) {
+		len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
+		if (nr < (ssize_t)len)
+			memset(bhp->buf + nr, 0, len - nr);
+#ifdef DIAGNOSTIC
+		if (nr > (ssize_t)len)
+			len = nr;
+		if (len < pagesize)
+			memset(bhp->buf + len, 0xdb, pagesize - len);
+#endif
 	}
 
 	/* Call any pgin function. */
-pgin:	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
+	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
 
 	/* Unlock the buffer and reacquire the region lock. */
 err:	UNLOCKBUFFER(dbmp, bhp);
@@ -225,7 +236,7 @@ err:	UNLOCKBUFFER(dbmp, bhp);
 		F_CLR(bhp, BH_TRASH);
 
 		/* Update the statistics. */
-		if (can_create) {
+		if (created) {
 			++dbmp->mp->stat.st_page_create;
 			++mfp->stat.st_page_create;
 		} else {
@@ -250,12 +261,12 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
 	int *restartp, *wrotep;
 {
 	DB_ENV *dbenv;
+	DB_IO db_io;
 	DB_LOG *lg_info;
 	DB_LSN lsn;
 	DB_MPOOL *dbmp;
 	MPOOL *mp;
 	MPOOLFILE *mfp;
-	size_t pagesize;
 	ssize_t nw;
 	int callpgin, ret, syncfail;
 	const char *fail;
@@ -270,7 +281,6 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
 	if (wrotep != NULL)
 		*wrotep = 0;
 	callpgin = 0;
-	pagesize = mfp->stat.st_pagesize;
 
 	/*
 	 * Check the dirty bit -- this buffer may have been written since we
@@ -326,34 +336,32 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
 	}
 
 	/* Temporary files may not yet have been created. */
-	LOCKHANDLE(dbmp, dbmfp->mutexp);
-	if (dbmfp->fd == -1 &&
-	    ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
-	    DB_CREATE | DB_EXCL | DB_TEMPORARY, &dbmfp->fd, NULL)) != 0 ||
-	    dbmfp->fd == -1)) {
+	if (dbmfp->fd == -1) {
+		LOCKHANDLE(dbmp, dbmfp->mutexp);
+		if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv,
+		    DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY,
+		    &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {
+			UNLOCKHANDLE(dbmp, dbmfp->mutexp);
+			__db_err(dbenv,
+			    "unable to create temporary backing file");
+			goto err;
+		}
 		UNLOCKHANDLE(dbmp, dbmfp->mutexp);
-		__db_err(dbenv, "unable to create temporary backing file");
-		goto err;
 	}
 
-	/*
-	 * Write the page out.
-	 *
-	 * XXX
-	 * Shut the compiler up; it doesn't understand the correlation between
-	 * the failing clauses to __db_lseek and __db_write and this ret != 0.
-	 */
-	COMPQUIET(fail, NULL);
-	if ((ret =
-	    __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0)
-		fail = "seek";
-	else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
+	/* Write the page. */
+	db_io.fd_io = dbmfp->fd;
+	db_io.fd_lock = dbmp->reginfo.fd;
+	db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;
+	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+	db_io.pgno = bhp->pgno;
+	db_io.buf = bhp->buf;
+	if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) {
+		__db_panic(dbenv, ret);
 		fail = "write";
-	UNLOCKHANDLE(dbmp, dbmfp->mutexp);
-	if (ret != 0)
 		goto syserr;
-
-	if (nw != (ssize_t)pagesize) {
+	}
+	if (nw != (ssize_t)mfp->stat.st_pagesize) {
 		ret = EIO;
 		fail = "write";
 		goto syserr;
@@ -394,7 +402,7 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
 	if (F_ISSET(bhp, BH_WRITE)) {
 		if (mfp->lsn_cnt == 1) {
 			UNLOCKREGION(dbmp);
-			syncfail = __db_fsync(dbmfp->fd) != 0;
+			syncfail = __os_fsync(dbmfp->fd) != 0;
 			LOCKREGION(dbmp);
 			if (syncfail)
 				F_SET(mp, MP_LSN_RETRY);
@@ -574,11 +582,11 @@ __memp_upgrade(dbmp, dbmfp, mfp)
 		ret = 1;
 	} else {
 		/* Swap the descriptors and set the upgrade flag. */
-		(void)__db_close(dbmfp->fd);
+		(void)__os_close(dbmfp->fd);
 		dbmfp->fd = fd;
 		F_SET(dbmfp, MP_UPGRADE);
 		ret = 0;
 	}
-	FREES(rpath);
+	__os_freestr(rpath);
 	return (ret);
 }
diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c
index 0777aa7dc6..f159dc2d3e 100644
--- a/db2/mp/mp_fget.c
+++ b/db2/mp/mp_fget.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_fget.c	10.48 (Sleepycat) 6/2/98";
+static const char sccsid[] = "@(#)mp_fget.c	10.53 (Sleepycat) 11/16/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -46,6 +46,8 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	mp = dbmp->mp;
 	mfp = dbmfp->mfp;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/*
 	 * Validate arguments.
 	 *
@@ -79,12 +81,11 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 #ifdef DIAGNOSTIC
 	/*
 	 * XXX
-	 * We want to switch threads as often as possible.  Sleep every time
-	 * we get a new page to make it more likely.
+	 * We want to switch threads as often as possible.  Yield every time
+	 * we get a new page to ensure contention.
 	 */
-	if (DB_GLOBAL(db_pageyield) &&
-	    (__db_yield == NULL || __db_yield() != 0))
-		__db_sleep(0, 1);
+	if (DB_GLOBAL(db_pageyield))
+		__os_yield(1);
 #endif
 
 	/* Initialize remaining local variables. */
@@ -205,8 +206,8 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 			 * up running to the end of our CPU quantum as we will
 			 * simply be swapping between the two locks.
 			 */
-			if (!first && (__db_yield == NULL || __db_yield() != 0))
-				__db_sleep(0, 1);
+			if (!first)
+				__os_yield(1);
 
 			LOCKBUFFER(dbmp, bhp);
 			/* Wait for I/O to finish... */
@@ -240,7 +241,7 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	}
 
 alloc:	/* Allocate new buffer header and data space. */
-	if ((ret = __memp_ralloc(dbmp, sizeof(BH) -
+	if ((ret = __memp_alloc(dbmp, sizeof(BH) -
 	    sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
 		goto err;
 
@@ -285,7 +286,7 @@ alloc:	/* Allocate new buffer header and data space. */
 		else {
 			memset(bhp->buf, 0, mfp->clear_len);
 #ifdef DIAGNOSTIC
-			memset(bhp->buf + mfp->clear_len, 0xff,
+			memset(bhp->buf + mfp->clear_len, 0xdb,
 			    mfp->stat.st_pagesize - mfp->clear_len);
 #endif
 		}
@@ -335,11 +336,9 @@ done:	/* Update the chain search statistics. */
 		mp->stat.st_hash_examined += st_hsearch;
 	}
 
-	UNLOCKREGION(dbmp);
-
-	LOCKHANDLE(dbmp, dbmfp->mutexp);
 	++dbmfp->pinref;
-	UNLOCKHANDLE(dbmp, dbmfp->mutexp);
+
+	UNLOCKREGION(dbmp);
 
 	return (0);
 
diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c
index a4cbac8d4e..dd02662fd8 100644
--- a/db2/mp/mp_fopen.c
+++ b/db2/mp/mp_fopen.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_fopen.c	10.47 (Sleepycat) 5/4/98";
+static const char sccsid[] = "@(#)mp_fopen.c	10.60 (Sleepycat) 1/1/99";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -43,6 +43,8 @@ memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp)
 {
 	int ret;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/* Validate arguments. */
 	if ((ret = __db_fchk(dbmp->dbenv,
 	    "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0)
@@ -53,6 +55,8 @@ memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp)
 		__db_err(dbmp->dbenv, "memp_fopen: pagesize not specified");
 		return (EINVAL);
 	}
+	if (finfop != NULL && finfop->clear_len > pagesize)
+		return (EINVAL);
 
 	return (__memp_fopen(dbmp,
 	    NULL, path, flags, mode, pagesize, 1, finfop, retp));
@@ -80,7 +84,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 	DB_MPOOLFILE *dbmfp;
 	DB_MPOOL_FINFO finfo;
 	db_pgno_t last_pgno;
-	size_t size;
+	size_t maxmap;
 	u_int32_t mbytes, bytes;
 	int ret;
 	u_int8_t idbuf[DB_FILE_ID_LEN];
@@ -115,13 +119,11 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 	}
 
 	/* Allocate and initialize the per-process structure. */
-	if ((dbmfp =
-	    (DB_MPOOLFILE *)__db_calloc(1, sizeof(DB_MPOOLFILE))) == NULL) {
-		__db_err(dbenv, "memp_fopen: %s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
+		return (ret);
 	dbmfp->dbmp = dbmp;
 	dbmfp->fd = -1;
+	dbmfp->ref = 1;
 	if (LF_ISSET(DB_RDONLY))
 		F_SET(dbmfp, MP_READONLY);
 
@@ -132,7 +134,6 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 			ret = EINVAL;
 			goto err;
 		}
-		size = 0;
 		last_pgno = 0;
 	} else {
 		/* Get the real name for this file and open it. */
@@ -146,21 +147,40 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 			goto err;
 		}
 
-		/* Don't permit files that aren't a multiple of the pagesize. */
-		if ((ret = __db_ioinfo(rpath,
+		/*
+		 * Don't permit files that aren't a multiple of the pagesize,
+		 * and find the number of the last page in the file, all the
+		 * time being careful not to overflow 32 bits.
+		 *
+		 * !!!
+		 * We can't use off_t's here, or in any code in the mainline
+		 * library for that matter.  (We have to use them in the os
+		 * stubs, of course, as there are system calls that take them
+		 * as arguments.)  The reason is that some customers build in
+		 * environments where an off_t is 32-bits, but still run where
+		 * offsets are 64-bits, and they pay us a lot of money.
+		 */
+		if ((ret = __os_ioinfo(rpath,
 		    dbmfp->fd, &mbytes, &bytes, NULL)) != 0) {
 			__db_err(dbenv, "%s: %s", rpath, strerror(ret));
 			goto err;
 		}
-		if (bytes % pagesize) {
+
+		/* Page sizes have to be a power-of-two, ignore mbytes. */
+		if (bytes % pagesize != 0) {
 			__db_err(dbenv,
 			    "%s: file size not a multiple of the pagesize",
 			    rpath);
 			ret = EINVAL;
 			goto err;
 		}
-		size = mbytes * MEGABYTE + bytes;
-		last_pgno = size == 0 ? 0 : (size - 1) / pagesize;
+
+		last_pgno = mbytes * (MEGABYTE / pagesize);
+		last_pgno += bytes / pagesize;
+
+		/* Correction: page numbers are zero-based, not 1-based. */
+		if (last_pgno != 0)
+			--last_pgno;
 
 		/*
 		 * Get the file id if we weren't given one.  Generated file id's
@@ -168,7 +188,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 		 * other process joining the party.
 		 */
 		if (finfop->fileid == NULL) {
-			if ((ret = __db_fileid(dbenv, rpath, 0, idbuf)) != 0)
+			if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
 				goto err;
 			finfop->fileid = idbuf;
 		}
@@ -191,7 +211,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 	}
 	if (ret == 0 &&
 	    F_ISSET(dbmp, MP_LOCKHANDLE) && (ret =
-	    __memp_ralloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0)
+	    __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0)
 		LOCKINIT(dbmp, dbmfp->mutexp);
 
 	if (needlock)
@@ -232,13 +252,15 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 			F_CLR(mfp, MP_CAN_MMAP);
 		if (LF_ISSET(DB_NOMMAP))
 			F_CLR(mfp, MP_CAN_MMAP);
-		if (size > (dbenv == NULL || dbenv->mp_mmapsize == 0 ?
-		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize))
+		maxmap = dbenv == NULL || dbenv->mp_mmapsize == 0 ?
+		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
+		if (mbytes > maxmap / MEGABYTE ||
+		    (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
 			F_CLR(mfp, MP_CAN_MMAP);
 	}
 	dbmfp->addr = NULL;
 	if (F_ISSET(mfp, MP_CAN_MMAP)) {
-		dbmfp->len = size;
+		dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
 		if (__db_mapfile(rpath,
 		    dbmfp->fd, dbmfp->len, 1, &dbmfp->addr) != 0) {
 			dbmfp->addr = NULL;
@@ -246,7 +268,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
 		}
 	}
 	if (rpath != NULL)
-		FREES(rpath);
+		__os_freestr(rpath);
 
 	LOCKHANDLE(dbmp, dbmp->mutexp);
 	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
@@ -260,11 +282,11 @@ err:	/*
 	 * never get to here after we have successfully allocated it.
 	 */
 	if (rpath != NULL)
-		FREES(rpath);
+		__os_freestr(rpath);
 	if (dbmfp->fd != -1)
-		(void)__db_close(dbmfp->fd);
+		(void)__os_close(dbmfp->fd);
 	if (dbmfp != NULL)
-		FREE(dbmfp, sizeof(DB_MPOOLFILE));
+		__os_free(dbmfp, sizeof(DB_MPOOLFILE));
 	return (ret);
 }
 
@@ -315,7 +337,7 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp)
 		}
 
 	/* Allocate a new MPOOLFILE. */
-	if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+	if ((ret = __memp_alloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
 		return (ret);
 	*retp = mfp;
 
@@ -334,21 +356,22 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp)
 	mfp->stat.st_pagesize = pagesize;
 	mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
 
-	F_SET(mfp, MP_CAN_MMAP);
 	if (ISTEMPORARY)
 		F_SET(mfp, MP_TEMP);
 	else {
 		/* Copy the file path into shared memory. */
-		if ((ret = __memp_ralloc(dbmp,
+		if ((ret = __memp_alloc(dbmp,
 		    strlen(path) + 1, &mfp->path_off, &p)) != 0)
 			goto err;
 		memcpy(p, path, strlen(path) + 1);
 
 		/* Copy the file identification string into shared memory. */
-		if ((ret = __memp_ralloc(dbmp,
+		if ((ret = __memp_alloc(dbmp,
 		    DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
 			goto err;
 		memcpy(p, finfop->fileid, DB_FILE_ID_LEN);
+
+		F_SET(mfp, MP_CAN_MMAP);
 	}
 
 	/* Copy the page cookie into shared memory. */
@@ -356,7 +379,7 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp)
 		mfp->pgcookie_len = 0;
 		mfp->pgcookie_off = 0;
 	} else {
-		if ((ret = __memp_ralloc(dbmp,
+		if ((ret = __memp_alloc(dbmp,
 		    finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
 			goto err;
 		memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size);
@@ -394,16 +417,48 @@ memp_fclose(dbmfp)
 	dbmp = dbmfp->dbmp;
 	ret = 0;
 
+	MP_PANIC_CHECK(dbmp);
+
+	for (;;) {
+		LOCKHANDLE(dbmp, dbmp->mutexp);
+
+		/*
+		 * We have to reference count DB_MPOOLFILE structures as other
+		 * threads may be using them.  The problem only happens if the
+		 * application makes a bad design choice.  Here's the path:
+		 *
+		 * Thread A opens a database.
+		 * Thread B uses thread A's DB_MPOOLFILE to write a buffer
+		 *    in order to free up memory in the mpool cache.
+		 * Thread A closes the database while thread B is using the
+		 *    DB_MPOOLFILE structure.
+		 *
+		 * By opening all databases before creating the threads, and
+		 * closing them after the threads have exited, applications
+		 * get better performance and avoid the problem path entirely.
+		 *
+		 * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
+		 * is a short-term lock, even in worst case, since we better be
+		 * the only thread of control using the DB_MPOOLFILE structure
+		 * to read pages *into* the cache.  Wait until we're the only
+		 * reference holder and remove the DB_MPOOLFILE structure from
+		 * the list, so nobody else can even find it.
+		 */
+		if (dbmfp->ref == 1) {
+			TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+			break;
+		}
+		UNLOCKHANDLE(dbmp, dbmp->mutexp);
+
+		(void)__os_sleep(1, 0);
+	}
+	UNLOCKHANDLE(dbmp, dbmp->mutexp);
+
 	/* Complain if pinned blocks never returned. */
 	if (dbmfp->pinref != 0)
 		__db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned",
 		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
 
-	/* Remove the DB_MPOOLFILE structure from the list. */
-	LOCKHANDLE(dbmp, dbmp->mutexp);
-	TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
-	UNLOCKHANDLE(dbmp, dbmp->mutexp);
-
 	/* Close the underlying MPOOLFILE. */
 	(void)__memp_mf_close(dbmp, dbmfp);
 
@@ -414,7 +469,7 @@ memp_fclose(dbmfp)
 		    "%s: %s", __memp_fn(dbmfp), strerror(ret));
 
 	/* Close the file; temporary files may not yet have been created. */
-	if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) {
+	if (dbmfp->fd != -1 && (t_ret = __os_close(dbmfp->fd)) != 0) {
 		__db_err(dbmp->dbenv,
 		    "%s: %s", __memp_fn(dbmfp), strerror(t_ret));
 		if (ret != 0)
@@ -429,7 +484,7 @@ memp_fclose(dbmfp)
 	}
 
 	/* Discard the DB_MPOOLFILE structure. */
-	FREE(dbmfp, sizeof(DB_MPOOLFILE));
+	__os_free(dbmfp, sizeof(DB_MPOOLFILE));
 
 	return (ret);
 }
diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c
index 48fdfc3b7f..c551f97380 100644
--- a/db2/mp/mp_fput.c
+++ b/db2/mp/mp_fput.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_fput.c	10.22 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)mp_fput.c	10.24 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -40,6 +40,8 @@ memp_fput(dbmfp, pgaddr, flags)
 	dbmp = dbmfp->dbmp;
 	mp = dbmp->mp;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/* Validate arguments. */
 	if (flags) {
 		if ((ret = __db_fchk(dbmp->dbenv, "memp_fput", flags,
@@ -57,15 +59,15 @@ memp_fput(dbmfp, pgaddr, flags)
 		}
 	}
 
+	LOCKREGION(dbmp);
+
 	/* Decrement the pinned reference count. */
-	LOCKHANDLE(dbmp, dbmfp->mutexp);
 	if (dbmfp->pinref == 0)
 		__db_err(dbmp->dbenv,
 		    "%s: put: more blocks returned than retrieved",
 		    __memp_fn(dbmfp));
 	else
 		--dbmfp->pinref;
-	UNLOCKHANDLE(dbmp, dbmfp->mutexp);
 
 	/*
 	 * If we're mapping the file, there's nothing to do.  Because we can
@@ -74,14 +76,14 @@ memp_fput(dbmfp, pgaddr, flags)
 	 * region.
 	 */
 	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
-	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
+	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) {
+		UNLOCKREGION(dbmp);
 		return (0);
+	}
 
 	/* Convert the page address to a buffer header. */
 	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
 
-	LOCKREGION(dbmp);
-
 	/* Set/clear the page bits. */
 	if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
 		++mp->stat.st_page_clean;
diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c
index 3b352aa553..1940d3b198 100644
--- a/db2/mp/mp_fset.c
+++ b/db2/mp/mp_fset.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_fset.c	10.15 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)mp_fset.c	10.16 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -40,6 +40,8 @@ memp_fset(dbmfp, pgaddr, flags)
 	dbmp = dbmfp->dbmp;
 	mp = dbmp->mp;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/* Validate arguments. */
 	if (flags == 0)
 		return (__db_ferr(dbmp->dbenv, "memp_fset", 1));
diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c
index fc985bc521..4c90fc438f 100644
--- a/db2/mp/mp_open.c
+++ b/db2/mp/mp_open.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_open.c	10.23 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)mp_open.c	10.27 (Sleepycat) 10/1/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -52,8 +52,8 @@ memp_open(path, flags, mode, dbenv, retp)
 	cachesize = dbenv == NULL ? 0 : dbenv->mp_size;
 
 	/* Create and initialize the DB_MPOOL structure. */
-	if ((dbmp = (DB_MPOOL *)__db_calloc(1, sizeof(DB_MPOOL))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_calloc(1, sizeof(DB_MPOOL), &dbmp)) != 0)
+		return (ret);
 	LIST_INIT(&dbmp->dbregq);
 	TAILQ_INIT(&dbmp->dbmfq);
 
@@ -83,7 +83,7 @@ memp_open(path, flags, mode, dbenv, retp)
 	if (LF_ISSET(DB_THREAD)) {
 		F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION);
 		LOCKREGION(dbmp);
-		ret = __memp_ralloc(dbmp,
+		ret = __memp_alloc(dbmp,
 		    sizeof(db_mutex_t), NULL, &dbmp->mutexp);
 		UNLOCKREGION(dbmp);
 		if (ret != 0) {
@@ -97,7 +97,7 @@ memp_open(path, flags, mode, dbenv, retp)
 	return (0);
 
 err:	if (dbmp != NULL)
-		FREE(dbmp, sizeof(DB_MPOOL));
+		__os_free(dbmp, sizeof(DB_MPOOL));
 	return (ret);
 }
 
@@ -115,10 +115,12 @@ memp_close(dbmp)
 
 	ret = 0;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/* Discard DB_MPREGs. */
 	while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
 		LIST_REMOVE(mpreg, q);
-		FREE(mpreg, sizeof(DB_MPREG));
+		__os_free(mpreg, sizeof(DB_MPREG));
 	}
 
 	/* Discard DB_MPOOLFILEs. */
@@ -138,13 +140,27 @@ memp_close(dbmp)
 		ret = t_ret;
 
 	if (dbmp->reginfo.path != NULL)
-		FREES(dbmp->reginfo.path);
-	FREE(dbmp, sizeof(DB_MPOOL));
+		__os_freestr(dbmp->reginfo.path);
+	__os_free(dbmp, sizeof(DB_MPOOL));
 
 	return (ret);
 }
 
 /*
+ * __memp_panic --
+ *	Flag the memory pool region as panicked, if a pool is attached.
+ *
+ * PUBLIC: void __memp_panic __P((DB_ENV *));
+ */
+void
+__memp_panic(dbenv)
+	DB_ENV *dbenv;
+{
+	if (dbenv->mp_info != NULL)
+		dbenv->mp_info->mp->rlayout.panic = 1;
+}
+
+/*
  * memp_unlink --
  *	Exit a memory pool.
  */
@@ -160,12 +176,12 @@ memp_unlink(path, force, dbenv)
 	memset(&reginfo, 0, sizeof(reginfo));
 	reginfo.dbenv = dbenv;
 	reginfo.appname = DB_APP_NONE;
-	if (path != NULL && (reginfo.path = __db_strdup(path)) == NULL)
-		return (ENOMEM);
+	if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0)
+		return (ret);
 	reginfo.file = DB_DEFAULT_MPOOL_FILE;
 	ret = __db_runlink(&reginfo, force);
 	if (reginfo.path != NULL)
-		FREES(reginfo.path);
+		__os_freestr(reginfo.path);
 	return (ret);
 }
 
@@ -181,9 +197,12 @@ memp_register(dbmp, ftype, pgin, pgout)
 	int (*pgout) __P((db_pgno_t, void *, DBT *));
 {
 	DB_MPREG *mpr;
+	int ret;
+
+	MP_PANIC_CHECK(dbmp);
 
-	if ((mpr = (DB_MPREG *)__db_malloc(sizeof(DB_MPREG))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(sizeof(DB_MPREG), NULL, &mpr)) != 0)
+		return (ret);
 
 	mpr->ftype = ftype;
 	mpr->pgin = pgin;
diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c
index e83e0f44fa..84c782e781 100644
--- a/db2/mp/mp_pr.c
+++ b/db2/mp/mp_pr.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_pr.c	10.26 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)mp_pr.c	10.30 (Sleepycat) 10/1/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -44,16 +44,17 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
 	DB_MPOOL_FSTAT **tfsp;
 	MPOOLFILE *mfp;
 	size_t len, nlen;
+	int ret;
 	char *name;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/* Allocate space for the global statistics. */
 	if (gspp != NULL) {
 		*gspp = NULL;
 
-		if ((*gspp = db_malloc == NULL ?
-		    (DB_MPOOL_STAT *)__db_malloc(sizeof(**gspp)) :
-		    (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL)
-			return (ENOMEM);
+		if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0)
+			return (ret);
 
 		LOCKREGION(dbmp);
 
@@ -89,10 +90,8 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
 
 		/* Allocate space for the pointers. */
 		len = (len + 1) * sizeof(DB_MPOOL_FSTAT *);
-		if ((*fspp = db_malloc == NULL ?
-		    (DB_MPOOL_FSTAT **)__db_malloc(len) :
-		    (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL)
-			return (ENOMEM);
+		if ((ret = __os_malloc(len, db_malloc, fspp)) != 0)
+			return (ret);
 
 		LOCKREGION(dbmp);
 
@@ -104,10 +103,8 @@ memp_stat(dbmp, gspp, fspp, db_malloc)
 			name = __memp_fns(dbmp, mfp);
 			nlen = strlen(name);
 			len = sizeof(DB_MPOOL_FSTAT) + nlen + 1;
-			if ((*tfsp = db_malloc == NULL ?
-			    (DB_MPOOL_FSTAT *)__db_malloc(len) :
-			    (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL)
-				return (ENOMEM);
+			if ((ret = __os_malloc(len, db_malloc, tfsp)) != 0)
+				return (ret);
 			**tfsp = mfp->stat;
 			(*tfsp)->file_name = (char *)
 			    (u_int8_t *)*tfsp + sizeof(DB_MPOOL_FSTAT);
@@ -212,8 +209,9 @@ __memp_dump_region(dbmp, area, fp)
 	cnt = 0;
 	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
 	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
-		(void)fprintf(fp, "file #%d: %s: %lu references: %s\n",
+		(void)fprintf(fp, "file #%d: %s: refs %lu, type %ld, %s\n",
 		    cnt + 1, __memp_fns(dbmp, mfp), (u_long)mfp->ref,
+		    (long)mfp->ftype,
 		    F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
 		    if (cnt < FMAP_ENTRIES)
 			fmap[cnt] = R_OFFSET(dbmp, mfp);
diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c
index b8a72286cd..b9c92f2e13 100644
--- a/db2/mp/mp_region.c
+++ b/db2/mp/mp_region.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_region.c	10.30 (Sleepycat) 5/31/98";
+static const char sccsid[] = "@(#)mp_region.c	10.35 (Sleepycat) 12/11/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -24,13 +24,33 @@ static const char sccsid[] = "@(#)mp_region.c	10.30 (Sleepycat) 5/31/98";
 #include "common_ext.h"
 
 /*
- * __memp_ralloc --
+ * __memp_reg_alloc --
+ *	Allocate some space in the mpool region, with locking.
+ *
+ * PUBLIC: int __memp_reg_alloc __P((DB_MPOOL *, size_t, size_t *, void *));
+ */
+int
+__memp_reg_alloc(dbmp, len, offsetp, retp)
+	DB_MPOOL *dbmp;
+	size_t len, *offsetp;
+	void *retp;
+{
+	int ret;
+
+	LOCKREGION(dbmp);
+	ret = __memp_alloc(dbmp, len, offsetp, retp);
+	UNLOCKREGION(dbmp);
+	return (ret);
+}
+
+/*
+ * __memp_alloc --
  *	Allocate some space in the mpool region.
  *
- * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *));
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *, size_t, size_t *, void *));
  */
 int
-__memp_ralloc(dbmp, len, offsetp, retp)
+__memp_alloc(dbmp, len, offsetp, retp)
 	DB_MPOOL *dbmp;
 	size_t len, *offsetp;
 	void *retp;
@@ -52,7 +72,9 @@ alloc:	if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
 		return (0);
 	}
 	if (nomore) {
-		__db_err(dbmp->dbenv, "%s", strerror(ret));
+		__db_err(dbmp->dbenv,
+	    "Unable to allocate %lu bytes from mpool shared region: %s\n",
+		    (u_long)len, strerror(ret));
 		return (ret);
 	}
 
@@ -91,7 +113,7 @@ alloc:	if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
 	}
 
 retry:	/* Find a buffer we can flush; pure LRU. */
-	total = 0;
+	restart = total = 0;
 	for (bhp =
 	    SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
 		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
@@ -222,8 +244,8 @@ __memp_ropen(dbmp, path, cachesize, mode, is_private, flags)
 	if (path == NULL)
 		dbmp->reginfo.path = NULL;
 	else
-		if ((dbmp->reginfo.path = __db_strdup(path)) == NULL)
-			return (ENOMEM);
+		if ((ret = __os_strdup(path, &dbmp->reginfo.path)) != 0)
+			return (ret);
 	dbmp->reginfo.file = DB_DEFAULT_MPOOL_FILE;
 	dbmp->reginfo.mode = mode;
 	dbmp->reginfo.size = rlen;
@@ -244,7 +266,7 @@ __memp_ropen(dbmp, path, cachesize, mode, is_private, flags)
 
 	if ((ret = __db_rattach(&dbmp->reginfo)) != 0) {
 		if (dbmp->reginfo.path != NULL)
-			FREES(dbmp->reginfo.path);
+			__os_freestr(dbmp->reginfo.path);
 		return (ret);
 	}
 
@@ -303,6 +325,6 @@ err:	UNLOCKREGION(dbmp);
 		(void)memp_unlink(path, 1, dbmp->dbenv);
 
 	if (dbmp->reginfo.path != NULL)
-		FREES(dbmp->reginfo.path);
+		__os_freestr(dbmp->reginfo.path);
 	return (ret);
 }
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
index 33218eef1a..535348517c 100644
--- a/db2/mp/mp_sync.c
+++ b/db2/mp/mp_sync.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_sync.c	10.25 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)mp_sync.c	10.31 (Sleepycat) 12/11/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -39,9 +39,12 @@ memp_sync(dbmp, lsnp)
 	DB_ENV *dbenv;
 	MPOOL *mp;
 	MPOOLFILE *mfp;
-	int ar_cnt, cnt, nalloc, next, ret, wrote;
+	int ar_cnt, nalloc, next, maxpin, ret, wrote;
+
+	MP_PANIC_CHECK(dbmp);
 
 	dbenv = dbmp->dbenv;
+	mp = dbmp->mp;
 
 	if (dbenv->lg_info == NULL) {
 		__db_err(dbenv, "memp_sync: requires logging");
@@ -49,16 +52,19 @@ memp_sync(dbmp, lsnp)
 	}
 
 	/*
-	 * We try and write the buffers in page order so that the underlying
-	 * filesystem doesn't have to seek and can write contiguous blocks,
-	 * plus, we don't want to hold the region lock while we write the
-	 * buffers.  Get memory to hold the buffer pointers.  Get a good-size
-	 * block, too, because we realloc while holding the region lock if we
-	 * run out.
+	 * We try and write the buffers in page order: it should reduce seeks
+	 * by the underlying filesystem and possibly reduce the actual number
+	 * of writes.  We don't want to hold the region lock while we write
+	 * the buffers, so only hold the lock while we create a list.  Get a
+	 * good-size block of memory to hold buffer pointers, we don't want
+	 * to run out.
 	 */
-	if ((bharray =
-	    (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
-		return (ENOMEM);
+	LOCKREGION(dbmp);
+	nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+	UNLOCKREGION(dbmp);
+
+	if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+		return (ret);
 
 	LOCKREGION(dbmp);
 
@@ -70,7 +76,6 @@ memp_sync(dbmp, lsnp)
 	 * we've already handled or are currently handling, then we return a
 	 * result based on the count for the larger LSN.
 	 */
-	mp = dbmp->mp;
 	if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
 		if (mp->lsn_cnt == 0) {
 			*lsnp = mp->lsn;
@@ -114,10 +119,15 @@ memp_sync(dbmp, lsnp)
 	 * finish.  Since the application may have restarted the sync, clear
 	 * any BH_WRITE flags that appear to be left over from previous calls.
 	 *
+	 * We don't want to pin down the entire buffer cache, otherwise we'll
+	 * starve threads needing new pages.  Don't pin down more than 80% of
+	 * the cache.
+	 *
 	 * Keep a count of the total number of buffers we need to write in
 	 * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
 	 */
 	ar_cnt = 0;
+	maxpin = ((mp->stat.st_page_dirty + mp->stat.st_page_clean) * 8) / 10;
 	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
 	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
 		if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
@@ -130,19 +140,27 @@ memp_sync(dbmp, lsnp)
 
 			/*
 			 * If the buffer isn't in use, we should be able to
-			 * write it immediately, so save a reference to it.
+			 * write it immediately, so increment the reference
+			 * count to lock it and its contents down, and then
+			 * save a reference to it.
+			 *
+			 * If we've run out of space to store buffer references,
+			 * we're screwed.  We don't want to realloc the array
+			 * while holding a region lock, so we set the flag to
+			 * force the checkpoint to be done again, from scratch,
+			 * later.
+			 *
+			 * If we've pinned down too much of the cache, stop and
+			 * set a flag to force the checkpoint to be tried again
+			 * later.
 			 */
 			if (bhp->ref == 0) {
-				if (ar_cnt == nalloc) {
-					nalloc *= 2;
-					if ((bharray =
-					    (BH **)__db_realloc(bharray,
-					    nalloc * sizeof(BH *))) == NULL) {
-						ret = ENOMEM;
-						goto err;
-					}
+				++bhp->ref;
+				bharray[ar_cnt] = bhp;
+				if (++ar_cnt >= nalloc || ar_cnt >= maxpin) {
+					F_SET(mp, MP_LSN_RETRY);
+					break;
 				}
-				bharray[ar_cnt++] = bhp;
 			}
 		} else
 			if (F_ISSET(bhp, BH_WRITE))
@@ -154,10 +172,6 @@ memp_sync(dbmp, lsnp)
 		goto done;
 	}
 
-	/* Lock down the buffers and their contents. */
-	for (cnt = 0; cnt < ar_cnt; ++cnt)
-		++bharray[cnt]->ref;
-
 	UNLOCKREGION(dbmp);
 
 	/* Sort the buffers we're going to write. */
@@ -205,7 +219,8 @@ memp_sync(dbmp, lsnp)
 			goto err;
 		}
 	}
-	ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+	ret = mp->lsn_cnt != 0 ||
+	    F_ISSET(mp, MP_LSN_RETRY) ? DB_INCOMPLETE : 0;
 
 done:
 	if (0) {
@@ -224,7 +239,7 @@ err:		/*
 			F_CLR(bhp, BH_WRITE);
 	}
 	UNLOCKREGION(dbmp);
-	__db_free(bharray);
+	__os_free(bharray, nalloc * sizeof(BH *));
 	return (ret);
 }
 
@@ -241,6 +256,8 @@ memp_fsync(dbmfp)
 
 	dbmp = dbmfp->dbmp;
 
+	MP_PANIC_CHECK(dbmp);
+
 	/*
 	 * If this handle doesn't have a file descriptor that's open for
 	 * writing, or if the file is a temporary, there's no reason to
@@ -300,25 +317,29 @@ __memp_fsync(dbmfp)
 {
 	BH *bhp, **bharray;
 	DB_MPOOL *dbmp;
+	MPOOL *mp;
 	size_t mf_offset;
-	int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;
+	int ar_cnt, incomplete, nalloc, next, ret, wrote;
 
 	ret = 0;
 	dbmp = dbmfp->dbmp;
+	mp = dbmp->mp;
 	mf_offset = R_OFFSET(dbmp, dbmfp->mfp);
 
 	/*
-	 * We try and write the buffers in page order so that the underlying
-	 * filesystem doesn't have to seek and can write contiguous blocks,
-	 * plus, we don't want to hold the region lock while we write the
-	 * buffers.  Get memory to hold the buffer pointers.  Get a good-size
-	 * block, too, because we realloc while holding the region lock if we
-	 * run out.
+	 * We try and write the buffers in page order: it should reduce seeks
+	 * by the underlying filesystem and possibly reduce the actual number
+	 * of writes.  We don't want to hold the region lock while we write
+	 * the buffers, so only hold the lock while we create a list.  Get a
+	 * good-size block of memory to hold buffer pointers, we don't want
+	 * to run out.
 	 */
-	nalloc = 1024;
-	if ((bharray =
-	    (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
-		return (ENOMEM);
+	LOCKREGION(dbmp);
+	nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+	UNLOCKREGION(dbmp);
+
+	if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+		return (ret);
 
 	LOCKREGION(dbmp);
 
@@ -326,36 +347,37 @@ __memp_fsync(dbmfp)
 	 * Walk the LRU list of buffer headers, and get a list of buffers to
 	 * write for this MPOOLFILE.
 	 */
-	ar_cnt = pincnt = 0;
-	for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+	ar_cnt = incomplete = 0;
+	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
 	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
 		if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
 			continue;
 		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
-			++pincnt;
+			incomplete = 1;
 			continue;
 		}
 
-		if (ar_cnt == nalloc) {
-			nalloc *= 2;
-			if ((bharray = (BH **)__db_realloc(bharray,
-			    nalloc * sizeof(BH *))) == NULL) {
-				ret = ENOMEM;
-				goto err;
-			}
-		}
+		++bhp->ref;
+		bharray[ar_cnt] = bhp;
 
-		bharray[ar_cnt++] = bhp;
+		/*
+		 * If we've run out of space to store buffer references, we're
+		 * screwed, as we don't want to realloc the array while holding
+		 * a region lock.  Set the incomplete flag -- the only way we
+		 * can get here is if the file is active in the buffer cache,
+		 * which is the same thing as finding pinned buffers.
+		 */
+		if (++ar_cnt >= nalloc) {
+			incomplete = 1;
+			break;
+		}
 	}
 
-	/* Lock down the buffers and their contents. */
-	for (cnt = 0; cnt < ar_cnt; ++cnt)
-		++bharray[cnt]->ref;
-
 	UNLOCKREGION(dbmp);
 
 	/* Sort the buffers we're going to write. */
-	qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+	if (ar_cnt != 0)
+		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
 
 	LOCKREGION(dbmp);
 
@@ -365,11 +387,10 @@ __memp_fsync(dbmfp)
 		 * It's possible for a thread to have gotten the buffer since
 		 * we listed it for writing.  If the reference count is still
 		 * 1, we're the only ones using the buffer, go ahead and write.
-		 * If it's >1, then skip the buffer and assume that it will be
-		 * written when it's returned to the cache.
+		 * If it's >1, then skip the buffer.
 		 */
 		if (bharray[next]->ref > 1) {
-			++pincnt;
+			incomplete = 1;
 
 			--bharray[next]->ref;
 			continue;
@@ -387,13 +408,18 @@ __memp_fsync(dbmfp)
 				--bharray[next]->ref;
 			goto err;
 		}
+
+		/*
+		 * If we didn't write the buffer for some reason, don't return
+		 * success.
+		 */
 		if (!wrote)
-			++pincnt;
+			incomplete = 1;
 	}
 
 err:	UNLOCKREGION(dbmp);
 
-	__db_free(bharray);
+	__os_free(bharray, nalloc * sizeof(BH *));
 
 	/*
 	 * Sync the underlying file as the last thing we do, so that the OS
@@ -404,7 +430,7 @@ err:	UNLOCKREGION(dbmp);
 	 * issues.
 	 */
 	if (ret == 0)
-		return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
+		return (incomplete ? DB_INCOMPLETE : __os_fsync(dbmfp->fd));
 	return (ret);
 }
 
@@ -423,6 +449,8 @@ memp_trickle(dbmp, pct, nwrotep)
 	u_long total;
 	int ret, wrote;
 
+	MP_PANIC_CHECK(dbmp);
+
 	mp = dbmp->mp;
 	if (nwrotep != NULL)
 		*nwrotep = 0;
@@ -487,7 +515,7 @@ loop:	total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
 	}
 
 	/* No more buffers to write. */
-	return (0);
+	ret = 0;
 
 err:	UNLOCKREGION(dbmp);
 	return (ret);
@@ -508,6 +536,14 @@ __bhcmp(p1, p2)
 	if (bhp1->mf_offset > bhp2->mf_offset)
 		return (1);
 
-	/* Sort by page in file. */
-	return (bhp1->pgno < bhp2->pgno ? -1 : 1);
+	/*
+	 * !!!
+	 * Defend against badly written quicksort code calling the comparison
+	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+	 */
+	if (bhp1->pgno < bhp2->pgno)
+		return (-1);
+	if (bhp1->pgno > bhp2->pgno)
+		return (1);
+	return (0);
 }
diff --git a/db2/mutex/alpha.dec b/db2/mutex/alpha.dec
deleted file mode 100644
index 83ed371136..0000000000
--- a/db2/mutex/alpha.dec
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * @(#)alpha.dec	8.3 (Sleepycat Software) 1/18/97
- *
- * The DEC C asm acts as a pseudo-call.  The first argument is the assembly
- * code, and the remaining arguments are assigned as in a procedure call, to
- * r16, r17, etc. (represented in asm as %a0, %a1, and so forth).
- *
- * From: Dave Butenhof.
- */
-
-#include <c_asm.h>
-
-#define	TSL_SET(tsl)	(asm ("mb;					\
-    10:	ldl_l	%v0,(%a0) ;						\
-	bne	%v0,30f ;						\
-	or	%v0,1,%r1 ;						\
-	stl_c	%r1,(%a0) ;						\
-	beq	%r1,20f ;						\
-	mb	;							\
-	br	%r31,30f ;						\
-    20:	br	%r31,10b ;						\
-    30:	", (tsl)))
-
-THIS WAS NOT CONVERTED TO TAKE A POINTER AS AN ARGUMENT...
-#define	TSL_UNSET(tsl)	(asm ("mb"), *(tsl) = 0)
diff --git a/db2/mutex/alpha.gcc b/db2/mutex/alpha.gcc
deleted file mode 100644
index 247d04cf31..0000000000
--- a/db2/mutex/alpha.gcc
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * @(#)alpha.gcc	10.1 (Sleepycat) 4/12/97
- *
- * The code appearing below is taken from Richard L. Sites, ed.  "Alpha
- * Architecture Reference Manual", Digital Press, 1992, page 5-7 and 5-8.
- * There are 2 modifications:
- *
- * 1. The jump from blbs __r1,30f to !__r1, which is dictated by the way the
- * TSL_SET macro is used.  The code suggested in Sites includes the main loop
- * of the spin lock, whereas in this code the rest the loop is specified in C.
- * The generated code might be suboptimal if the compiler generates a forward
- * branch for the usual case in which the mutex is uncontested.
- *
- * 2. At label 20, Sites suggests including code for testing for an excessive
- * number of _processor_ lock conflicts.  (The seq_c instruction stores its
- * first argument provided that no other processor has written to a byte range
- * including its memory-location argument.)  Absent such checking the code
- * below could conceivably stall silently on a multiprocessor alpha, depending
- * on how often processor/processor conflicts occur in a particular byte range.
- *
- * Note that the mb ("memory-barrier") instruction in TSL_UNSET is critical to
- * correct operation in a multiprocessor alpha (as is, of course, the mb in
- * the TSL_SET macro).  Without the mb, changes to shared memory that occurred
- * inside the critical section (before the TSL_UNSET) might reach shared memory
- * _after_ the change of tsl to 0, thereby permitting another processor to see
- * an inconsistent view of the data protected by the mutex.
- *
- * For gcc/alpha, 0 is clear, 1 is set.
- */
-#define TSL_SET(tsl) ({							\
-	register tsl_t *__l = (tsl);					\
-	register tsl_t __r1, __r2;					\
-	__asm__ volatile("						\n\
-	   10: ldq_l %0,(%2)						\n\
-	       blbs  %0,30f						\n\
-	       or    %0,1,%1						\n\
-	       stq_c %1,(%2)						\n\
-	       beq   %1,20f						\n\
-	       mb							\n\
-	       br    30f						\n\
-	   20: br    10b						\n\
-	   30: "							\
-	  : "=&r" (__r1), "=&r" (__r2)					\
-	  : "r" (__l));							\
-	!__r1;								\
-})
-
-#define TSL_UNSET(tsl) ({						\
-	register tsl_t *__l = (tsl);					\
-	__asm__ volatile("mb; stq $31,(%0);" : : "r" (__l));		\
-})
-#define	TSL_INIT(tsl)	TSL_UNSET(tsl)
diff --git a/db2/mutex/mutex.c b/db2/mutex/mutex.c
index de0d0e23fe..acc6aa07c9 100644
--- a/db2/mutex/mutex.c
+++ b/db2/mutex/mutex.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mutex.c	10.48 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)mutex.c	10.52 (Sleepycat) 11/8/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -37,9 +37,12 @@ static const char sccsid[] = "@(#)mutex.c	10.48 (Sleepycat) 5/23/98";
 
 #if defined(HAVE_FUNC_MSEM)
 /*
- * XXX
- * Should we not use MSEM_IF_NOWAIT and let the system block for us?
- * I've no idea if this will block all threads in the process or not.
+ * !!!
+ * Do not remove the MSEM_IF_NOWAIT flag.  The problem is that if a single
+ * process makes two msem_lock() calls in a row, the second one returns an
+ * error.  We depend on the fact that we can lock against ourselves in the
+ * locking subsystem, where we set up a mutex so that we can block ourselves.
+ * Tested on OSF1 v4.0.
  */
 #define	TSL_INIT(x)	(msem_init(x, MSEM_UNLOCKED) == NULL)
 #define	TSL_INIT_ERROR	1
@@ -74,6 +77,17 @@ static const char sccsid[] = "@(#)mutex.c	10.48 (Sleepycat) 5/23/98";
 #define	TSL_UNSET(x)	_lock_clear(x)
 #endif
 
+#ifdef HAVE_FUNC_VMS
+#include <builtins.h>
+#ifdef __ALPHA
+#define	TSL_SET(tsl)	(!__TESTBITSSI(tsl, 0))
+#else /* __VAX */
+#define	TSL_SET(tsl)	(!(int)_BBSSI(0, tsl))
+#endif
+#define	TSL_UNSET(tsl) 	(*(tsl) = 0)
+#define	TSL_INIT(tsl)	TSL_UNSET(tsl)
+#endif
+
 #ifdef HAVE_ASSEM_PARISC_GCC
 #include "parisc.gcc"
 #endif
@@ -181,7 +195,7 @@ __db_mutex_lock(mp, fd)
 #ifdef HAVE_SPINLOCKS
 	COMPQUIET(fd, 0);
 
-	for (usecs = MS(10);;) {
+	for (usecs = MS(1);;) {
 		/* Try and acquire the uncontested resource lock for N spins. */
 		for (nspins = mp->spins; nspins > 0; --nspins)
 			if (TSL_SET(&mp->tsl_resource)) {
@@ -193,19 +207,17 @@ __db_mutex_lock(mp, fd)
 				}
 				mp->pid = getpid();
 #endif
-				if (usecs == MS(10))
+				if (usecs == MS(1))
 					++mp->mutex_set_nowait;
 				else
 					++mp->mutex_set_wait;
 				return (0);
 			}
 
-		/* Yield the processor; wait 10ms initially, up to 1 second. */
-		if (__db_yield == NULL || __db_yield() != 0) {
-			(void)__db_sleep(0, usecs);
-			if ((usecs <<= 1) > SECOND)
-				usecs = SECOND;
-		}
+		/* Yield the processor; wait 1ms initially, up to 1 second. */
+		__os_yield(usecs);
+		if ((usecs <<= 1) > SECOND)
+			usecs = SECOND;
 	}
 	/* NOTREACHED */
 
@@ -218,15 +230,14 @@ __db_mutex_lock(mp, fd)
 
 	for (locked = 0, mypid = getpid();;) {
 		/*
-		 * Wait for the lock to become available; wait 10ms initially,
+		 * Wait for the lock to become available; wait 1ms initially,
 		 * up to 1 second.
 		 */
-		for (usecs = MS(10); mp->pid != 0;)
-			if (__db_yield == NULL || __db_yield() != 0) {
-				(void)__db_sleep(0, usecs);
-				if ((usecs <<= 1) > SECOND)
-					usecs = SECOND;
-			}
+		for (usecs = MS(1); mp->pid != 0;) {
+			__os_yield(usecs);
+			if ((usecs <<= 1) > SECOND)
+				usecs = SECOND;
+		}
 
 		/* Acquire an exclusive kernel lock. */
 		k_lock.l_type = F_WRLCK;
diff --git a/db2/mutex/parisc.hp b/db2/mutex/parisc.hp
deleted file mode 100644
index bd0e37fc78..0000000000
--- a/db2/mutex/parisc.hp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * @(#)parisc.hp	8.6 (Sleepycat) 6/2/98
- *
- * Copyright (c) 1996-1997, The University of Utah and the Computer Systems
- * Laboratory at the University of Utah (CSL).  All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software is hereby
- * granted provided that (1) source code retains these copyright, permission,
- * and disclaimer notices, and (2) redistributions including binaries
- * reproduce the notices in supporting documentation, and (3) all advertising
- * materials mentioning features or use of this software display the following
- * acknowledgement: ``This product includes software developed by the Computer
- * Systems Laboratory at the University of Utah.''
- *
- * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS
- * IS" CONDITION.  THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF
- * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * CSL requests users of this software to return to csl-dist@cs.utah.edu any
- * improvements that they make and grant CSL redistribution rights.
- */
-
-/*
- * The PA-RISC has a "load and clear" instead of a "test and set" instruction.
- * The 32-bit word used by that instruction must be 16-byte aligned hence we
- * allocate 16 bytes for a tsl_t and use the word that is properly aligned.
- */
-#define	TSL_SET(tsl)	tsl_set(tsl)
-#define	TSL_UNSET(tsl)	tsl_unset(tsl)
diff --git a/db2/mutex/uts4.cc.s b/db2/mutex/uts4_cc.s
index ee5f4143bd..ee5f4143bd 100644
--- a/db2/mutex/uts4.cc.s
+++ b/db2/mutex/uts4_cc.s
diff --git a/db2/os/os_abs.c b/db2/os/os_abs.c
index d9f4970467..547a6804b4 100644
--- a/db2/os/os_abs.c
+++ b/db2/os/os_abs.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_abs.c	10.8 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)os_abs.c	10.9 (Sleepycat) 7/21/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -18,13 +18,13 @@ static const char sccsid[] = "@(#)os_abs.c	10.8 (Sleepycat) 4/10/98";
 #include "db_int.h"
 
 /*
- * __db_abspath --
+ * __os_abspath --
  *	Return if a path is an absolute path.
  *
- * PUBLIC: int __db_abspath __P((const char *));
+ * PUBLIC: int __os_abspath __P((const char *));
  */
 int
-__db_abspath(path)
+__os_abspath(path)
 	const char *path;
 {
 	return (path[0] == '/');
diff --git a/db2/os/os_alloc.c b/db2/os/os_alloc.c
index 35784476c0..0090eb14a7 100644
--- a/db2/os/os_alloc.c
+++ b/db2/os/os_alloc.c
@@ -8,40 +8,22 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_alloc.c	10.6 (Sleepycat) 5/2/98";
+static const char sccsid[] = "@(#)os_alloc.c	10.10 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
+#include <errno.h>
 #include <string.h>
+#include <stdlib.h>
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
- * __db_strdup --
- *	The strdup(3) function for DB.
- *
- * PUBLIC: char *__db_strdup __P((const char *));
- */
-char *
-__db_strdup(str)
-	const char *str;
-{
-	size_t len;
-	char *copy;
-
-	len = strlen(str) + 1;
-	if ((copy = __db_malloc(len)) == NULL)
-		return (NULL);
-
-	memcpy(copy, str, len);
-	return (copy);
-}
-
-/*
- * XXX
+ * !!!
  * Correct for systems that return NULL when you allocate 0 bytes of memory.
  * There are several places in DB where we allocate the number of bytes held
  * by the key/data item, and it can be 0.  Correct here so that malloc never
@@ -49,59 +31,189 @@ __db_strdup(str)
  * could make these calls macros on non-Alpha architectures (that's where we
  * saw the problem), but it's probably not worth the autoconf complexity.
  *
+ * !!!
+ * Correct for systems that don't set errno when malloc and friends fail.
+ *
  *	Out of memory.
  *	We wish to hold the whole sky,
  *	But we never will.
  */
+
+/*
+ * __os_strdup --
+ *	The strdup(3) function for DB.
+ *
+ * PUBLIC: int __os_strdup __P((const char *, void *));
+ */
+int
+__os_strdup(str, storep)
+	const char *str;
+	void *storep;
+{
+	size_t size;
+	int ret;
+	void *p;
+
+	*(void **)storep = NULL;
+
+	size = strlen(str) + 1;
+	if ((ret = __os_malloc(size, NULL, &p)) != 0)
+		return (ret);
+
+	memcpy(p, str, size);
+
+	*(void **)storep = p;
+	return (0);
+}
+
 /*
- * __db_calloc --
+ * __os_calloc --
  *	The calloc(3) function for DB.
  *
- * PUBLIC: void *__db_calloc __P((size_t, size_t));
+ * PUBLIC: int __os_calloc __P((size_t, size_t, void *));
  */
-void *
-__db_calloc(num, size)
+int
+__os_calloc(num, size, storep)
 	size_t num, size;
+	void *storep;
 {
 	void *p;
+	int ret;
 
 	size *= num;
-	if ((p = __db_jump.j_malloc(size == 0 ? 1 : size)) != NULL)
-		memset(p, 0, size);
-	return (p);
+	if ((ret = __os_malloc(size, NULL, &p)) != 0)
+		return (ret);
+
+	memset(p, 0, size);
+	*(void **)storep = p;
+
+	return (0);
 }
 
 /*
- * __db_malloc --
+ * __os_malloc --
  *	The malloc(3) function for DB.
  *
- * PUBLIC: void *__db_malloc __P((size_t));
+ * PUBLIC: int __os_malloc __P((size_t, void *(*)(size_t), void *));
  */
-void *
-__db_malloc(size)
+int
+__os_malloc(size, db_malloc, storep)
 	size_t size;
+	void *(*db_malloc) __P((size_t)), *storep;
 {
-#ifdef DIAGNOSTIC
 	void *p;
 
-	p = __db_jump.j_malloc(size == 0 ? 1 : size);
-	memset(p, 0xff, size == 0 ? 1 : size);
-	return (p);
-#else
-	return (__db_jump.j_malloc(size == 0 ? 1 : size));
+	*(void **)storep = NULL;
+
+	/* Never allocate 0 bytes -- some C libraries don't like it. */
+	if (size == 0)
+		++size;
+
+	/* Some C libraries don't correctly set errno when malloc(3) fails. */
+	errno = 0;
+	if (db_malloc != NULL)
+		p = db_malloc(size);
+	else if (__db_jump.j_malloc != NULL)
+		p = __db_jump.j_malloc(size);
+	else
+		p = malloc(size);
+	if (p == NULL) {
+		if (errno == 0)
+			errno = ENOMEM;
+		return (errno);
+	}
+
+#ifdef DIAGNOSTIC
+	memset(p, 0xdb, size);
 #endif
+	*(void **)storep = p;
+
+	return (0);
 }
 
 /*
- * __db_realloc --
+ * __os_realloc --
  *	The realloc(3) function for DB.
  *
- * PUBLIC: void *__db_realloc __P((void *, size_t));
+ * PUBLIC: int __os_realloc __P((void *, size_t));
+ */
+int
+__os_realloc(storep, size)
+	void *storep;
+	size_t size;
+{
+	void *p, *ptr;
+
+	ptr = *(void **)storep;
+
+	/* If we haven't allocated anything yet, simply call malloc. */
+	if (ptr == NULL)
+		return (__os_malloc(size, NULL, storep));
+
+	/* Never allocate 0 bytes -- some C libraries don't like it. */
+	if (size == 0)
+		++size;
+
+	/*
+	 * Some C libraries don't correctly set errno when realloc(3) fails.
+	 *
+	 * Don't overwrite the original pointer, there are places in DB we
+	 * try to continue after realloc fails.
+	 */
+	errno = 0;
+	if (__db_jump.j_realloc != NULL)
+		p = __db_jump.j_realloc(ptr, size);
+	else
+		p = realloc(ptr, size);
+	if (p == NULL) {
+		if (errno == 0)
+			errno = ENOMEM;
+		return (errno);
+	}
+
+	*(void **)storep = p;
+
+	return (0);
+}
+
+/*
+ * __os_free --
+ *	The free(3) function for DB.
+ *
+ * PUBLIC: void __os_free __P((void *, size_t));
  */
-void *
-__db_realloc(ptr, size)
+void
+__os_free(ptr, size)
 	void *ptr;
 	size_t size;
 {
-	return (__db_jump.j_realloc(ptr, size == 0 ? 1 : size));
+#ifdef DIAGNOSTIC
+	if (size != 0)
+		memset(ptr, 0xdb, size);
+#endif
+
+	if (__db_jump.j_free != NULL)
+		__db_jump.j_free(ptr);
+	else
+		free(ptr);
+}
+
+/*
+ * __os_freestr --
+ *	The free(3) function for DB, freeing a string.
+ *
+ * PUBLIC: void __os_freestr __P((void *));
+ */
+void
+__os_freestr(ptr)
+	void *ptr;
+{
+#ifdef DIAGNOSTIC
+	memset(ptr, 0xdb, strlen(ptr) + 1);
+#endif
+
+	if (__db_jump.j_free != NULL)
+		__db_jump.j_free(ptr);
+	else
+		free(ptr);
 }
diff --git a/db2/os/os_config.c b/db2/os/os_config.c
index 4150c843e4..71d379a387 100644
--- a/db2/os/os_config.c
+++ b/db2/os/os_config.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_config.c	10.26 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)os_config.c	10.30 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -18,72 +18,18 @@ static const char sccsid[] = "@(#)os_config.c	10.26 (Sleepycat) 5/23/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
-/*
- * XXX
- * We provide our own extern declarations so that we don't collide with
- * systems that get them wrong, e.g., SunOS.
- */
-#ifdef _WIN32
-#define fsync		_commit
-#define imported	__declspec(dllimport)
-#else
-#define imported
-#endif
-
-/*
- * XXX
- * HP/UX MPE doesn't have fsync, but you can build one using FCONTROL.
- */
-#ifdef __hp3000s900
-#define	fsync	__mpe_fsync
-#endif
-
-imported extern int	 close __P((int));
-imported extern void	 free __P((void *));
-imported extern int	 fsync __P((int));
-imported extern void    *malloc __P((size_t));
-imported extern int	 open __P((const char *, int, ...));
-imported extern ssize_t	 read __P((int, void *, size_t));
-imported extern void    *realloc __P((void *, size_t));
-imported extern int	 unlink __P((const char *));
-imported extern ssize_t	 write __P((int, const void *, size_t));
-
-/*
- * __db_jump --
- *	This list of interfaces that applications can replace.  In some
- *	cases, the user is permitted to replace the standard ANSI C or
- *	POSIX 1003.1 call, e.g., malloc or read.  In others, we provide
- *	a local interface to the functionality, e.g., __os_ioinfo.
- */
-struct __db_jumptab __db_jump = {
-	close,				/* DB_FUNC_CLOSE */
-	__os_dirfree,			/* DB_FUNC_DIRFREE */
-	__os_dirlist,			/* DB_FUNC_DIRLIST */
-	__os_exists,			/* DB_FUNC_EXISTS */
-	free,				/* DB_FUNC_FREE */
-	fsync,				/* DB_FUNC_FSYNC */
-	__os_ioinfo,			/* DB_FUNC_IOINFO */
-	malloc,				/* DB_FUNC_MALLOC */
-	NULL,				/* DB_FUNC_MAP */
-	open,				/* DB_FUNC_OPEN */
-	read,				/* DB_FUNC_READ */
-	realloc,			/* DB_FUNC_REALLOC */
-	NULL,				/* DB_FUNC_RUNLINK */
-	__os_seek,			/* DB_FUNC_SEEK */
-	__os_sleep,			/* DB_FUNC_SLEEP */
-	unlink,				/* DB_FUNC_UNLINK */
-	NULL,				/* DB_FUNC_UNMAP */
-	write,				/* DB_FUNC_WRITE */
-	NULL				/* DB_FUNC_YIELD */
-};
+struct __db_jumptab __db_jump;
 
 DB_GLOBALS __db_global_values = {
 	1,				/* DB_MUTEXLOCKS */
+	0,				/* DB_PAGEYIELD */
 	0,				/* DB_REGION_ANON, DB_REGION_NAME */
 	0,				/* DB_REGION_INIT */
 	0,				/* DB_TSL_SPINS */
-	0				/* DB_PAGEYIELD */
+	{NULL, &__db_global_values.db_envq.tqh_first},	/* Environment queue */
+	{NULL, &__db_global_values.db_nameq.tqh_first}	/* Name queue */
 };
 
 /*
diff --git a/db2/os/os_dir.c b/db2/os/os_dir.c
index 14a10ad23f..f2ee128c1e 100644
--- a/db2/os/os_dir.c
+++ b/db2/os/os_dir.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_dir.c	10.15 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)os_dir.c	10.19 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -35,6 +35,7 @@ static const char sccsid[] = "@(#)os_dir.c	10.15 (Sleepycat) 4/26/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
  * __os_dirlist --
@@ -50,22 +51,23 @@ __os_dirlist(dir, namesp, cntp)
 {
 	struct dirent *dp;
 	DIR *dirp;
-	int arraysz, cnt;
+	int arraysz, cnt, ret;
 	char **names;
 
+	if (__db_jump.j_dirlist != NULL)
+		return (__db_jump.j_dirlist(dir, namesp, cntp));
+
 	if ((dirp = opendir(dir)) == NULL)
 		return (errno);
 	names = NULL;
 	for (arraysz = cnt = 0; (dp = readdir(dirp)) != NULL; ++cnt) {
 		if (cnt >= arraysz) {
 			arraysz += 100;
-			names = (char **)(names == NULL ?
-			    __db_malloc(arraysz * sizeof(names[0])) :
-			    __db_realloc(names, arraysz * sizeof(names[0])));
-			if (names == NULL)
+			if ((ret = __os_realloc(&names,
+			    arraysz * sizeof(names[0]))) != 0)
 				goto nomem;
 		}
-		if ((names[cnt] = (char *)__db_strdup(dp->d_name)) == NULL)
+		if ((ret = __os_strdup(dp->d_name, &names[cnt])) != 0)
 			goto nomem;
 	}
 	(void)closedir(dirp);
@@ -76,7 +78,7 @@ __os_dirlist(dir, namesp, cntp)
 
 nomem:	if (names != NULL)
 		__os_dirfree(names, cnt);
-	return (ENOMEM);
+	return (ret);
 }
 
 /*
@@ -90,7 +92,13 @@ __os_dirfree(names, cnt)
 	char **names;
 	int cnt;
 {
+	/* An application-supplied routine releases the list by itself. */
+	if (__db_jump.j_dirfree != NULL) {
+		__db_jump.j_dirfree(names, cnt);
+		return;
+	}
+
 	while (cnt > 0)
-		__db_free(names[--cnt]);
-	__db_free(names);
+		__os_free(names[--cnt], 0);
+	__os_free(names, 0);
 }
diff --git a/db2/os/os_fid.c b/db2/os/os_fid.c
index cf48c01bd8..62da590611 100644
--- a/db2/os/os_fid.c
+++ b/db2/os/os_fid.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_fid.c	10.11 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)os_fid.c	10.12 (Sleepycat) 7/21/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -24,13 +24,13 @@ static const char sccsid[] = "@(#)os_fid.c	10.11 (Sleepycat) 4/26/98";
 #include "common_ext.h"
 
 /*
- * __db_fileid --
+ * __os_fileid --
  *	Return a unique identifier for a file.
  *
- * PUBLIC: int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *));
+ * PUBLIC: int __os_fileid __P((DB_ENV *, const char *, int, u_int8_t *));
  */
 int
-__db_fileid(dbenv, fname, timestamp, fidp)
+__os_fileid(dbenv, fname, timestamp, fidp)
 	DB_ENV *dbenv;
 	const char *fname;
 	int timestamp;
diff --git a/db2/os/os_fsync.c b/db2/os/os_fsync.c
index e1f271a75c..61a504f84d 100644
--- a/db2/os/os_fsync.c
+++ b/db2/os/os_fsync.c
@@ -8,34 +8,21 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_fsync.c	10.5 (Sleepycat) 4/19/98";
+static const char sccsid[] = "@(#)os_fsync.c	10.7 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <fcntl.h>			/* XXX: Required by __hp3000s900 */
 #include <unistd.h>
 #endif
 
 #include "db_int.h"
-
-/*
- * __db_fsync --
- *	Flush a file descriptor.
- *
- * PUBLIC: int __db_fsync __P((int));
- */
-int
-__db_fsync(fd)
-	int fd;
-{
-	return (__os_fsync(fd) ? errno : 0);
-}
+#include "os_jump.h"
 
 #ifdef __hp3000s900
-#include <fcntl.h>
-
 int
 __mpe_fsync(fd)
 	int fd;
@@ -47,3 +34,26 @@ __mpe_fsync(fd)
 	return (0);
 }
 #endif
+
+#ifdef __hp3000s900
+#define	fsync(fd)	__mpe_fsync(fd)	/* no ';': used as an expression */
+#endif
+#ifdef _WIN32
+#define	fsync(fd)	_commit(fd)	/* no ';': used as an expression */
+#endif
+
+/*
+ * __os_fsync --
+ *	Flush a file descriptor.
+ *
+ * PUBLIC: int __os_fsync __P((int));
+ */
+int
+__os_fsync(fd)
+	int fd;
+{
+	int ret;
+
+	ret = __db_jump.j_fsync != NULL ?  __db_jump.j_fsync(fd) : fsync(fd);
+	return (ret == 0 ? 0 : errno);
+}
diff --git a/db2/os/os_map.c b/db2/os/os_map.c
index 5f0fd790e6..5664a2edec 100644
--- a/db2/os/os_map.c
+++ b/db2/os/os_map.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_map.c	10.19 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)os_map.c	10.24 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -27,13 +27,14 @@ static const char sccsid[] = "@(#)os_map.c	10.19 (Sleepycat) 5/3/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 #include "common_ext.h"
 
 #ifdef HAVE_MMAP
 static int __os_map __P((char *, int, size_t, int, int, int, void **));
 #endif
 #ifdef HAVE_SHMGET
-static int __os_shmget __P((char *, REGINFO *));
+static int __os_shmget __P((REGINFO *));
 #endif
 
 /*
@@ -165,7 +166,7 @@ __db_mapregion(path, infop)
 #ifdef HAVE_SHMGET
 		if (!called) {
 			called = 1;
-			ret = __os_shmget(path, infop);
+			ret = __os_shmget(infop);
 		}
 #endif
 #ifdef HAVE_MMAP
@@ -207,7 +208,7 @@ __db_mapregion(path, infop)
 #ifdef HAVE_SHMGET
 		if (!called) {
 			called = 1;
-			ret = __os_shmget(path, infop);
+			ret = __os_shmget(infop);
 		}
 #endif
 	}
@@ -271,10 +272,9 @@ __db_unlinkregion(name, infop)
 		called = 1;
 		ret = shmctl(infop->segid, IPC_RMID, NULL) ? errno : 0;
 	}
-#else
-	COMPQUIET(infop, NULL);
 #endif
 #ifdef HAVE_MMAP
+	COMPQUIET(infop, NULL);
 	if (!called) {
 		called = 1;
 		ret = 0;
@@ -388,6 +388,23 @@ __os_map(path, fd, len, is_region, is_anonymous, is_rdonly, addr)
 
 	prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE);
 
+/*
+ * XXX
+ * Work around a bug in the VMS V7.1 mmap() implementation.  To map a file
+ * into memory on VMS it needs to be opened in a certain way, originally.
+ * To get the file opened in that certain way, the VMS mmap() closes the
+ * file and re-opens it.  When it does this, it doesn't flush any caches
+ * out to disk before closing.  The problem this causes us is that when the
+ * memory cache doesn't get written out, the file isn't big enough to match
+ * the memory chunk and the mmap() call fails.  This call to fsync() fixes
+ * the problem.  DEC thinks this isn't a bug because of language in XPG5
+ * discussing user responsibility for on-disk and in-memory synchronization.
+ */
+#ifdef VMS
+	if (__os_fsync(fd) != 0)	/* returns 0 or an errno value, never -1 */
+		return (errno);
+#endif
+
 	/* MAP_FAILED was not defined in early mmap implementations. */
 #ifndef MAP_FAILED
 #define	MAP_FAILED	-1
@@ -407,47 +424,12 @@ __os_map(path, fd, len, is_region, is_anonymous, is_rdonly, addr)
  *	Call the shmget(2) family of functions.
  */
 static int
-__os_shmget(path, infop)
+__os_shmget(infop)
 	REGINFO *infop;
-	char *path;
 {
-	key_t key;
-	int shmflg;
-
-	if (F_ISSET(infop, REGION_CREATED)) {
-		/*
-		 * The return key from ftok(3) is not guaranteed to be unique.
-		 * The nice thing about the shmget(2) interface is that it
-		 * allows you to name anonymous pieces of memory.  The evil
-		 * thing about it is that the name space is separate from the
-		 * filesystem.
-		 */
-#ifdef __hp3000s900
-		{char mpe_path[MAXPATHLEN];
-		/*
-		 * MPE ftok() is broken as of 5.5pp4.  If the file path does
-		 * not start with '/' or '.', then ftok() tries to interpret
-		 * the file path in MPE syntax instead of POSIX HFS syntax.
-		 * The workaround is to prepend "./" to these paths.  See HP
-		 * SR 5003416081 for details.
-		 */
-		if (*path != '/' && *path != '.') {
-			if (strlen(path) + strlen("./") + 1 > sizeof(mpe_path))
-				return (ENAMETOOLONG);
-			mpe_path[0] = '.';
-			mpe_path[1] = '/';
-			(void)strcpy(mpe_path + 2, path);
-			path = mpe_path;
-		}
-		}
-#endif
-		if ((key = ftok(path, 1)) == (key_t)-1)
-			return (errno);
-
-		shmflg = IPC_CREAT | 0600;
-		if ((infop->segid = shmget(key, infop->size, shmflg)) == -1)
-			return (errno);
-	}
+	if (F_ISSET(infop, REGION_CREATED) &&
+	   (infop->segid = shmget(IPC_PRIVATE, infop->size, 0600)) == -1)
+		return (errno);
 
 	if ((infop->addr = shmat(infop->segid, NULL, 0)) == (void *)-1) {
 		/*
diff --git a/db2/os/os_oflags.c b/db2/os/os_oflags.c
index 976b84d709..a4003dd5f0 100644
--- a/db2/os/os_oflags.c
+++ b/db2/os/os_oflags.c
@@ -44,7 +44,7 @@ __db_oflags(oflags)
 	case O_RDWR:
 		break;
 	default:		/* Bogus flags value from user.  */
-	  /* XXX no way to return error from here */
+		/* XXX no way to return error from here */
 	}
 	if (oflags & O_CREAT)
 		dbflags |= DB_CREATE;
diff --git a/db2/os/os_open.c b/db2/os/os_open.c
index e960377ebb..c54fd7365d 100644
--- a/db2/os/os_open.c
+++ b/db2/os/os_open.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_open.c	10.26 (Sleepycat) 5/4/98";
+static const char sccsid[] = "@(#)os_open.c	10.33 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -16,10 +16,12 @@ static const char sccsid[] = "@(#)os_open.c	10.26 (Sleepycat) 5/4/98";
 
 #include <errno.h>
 #include <fcntl.h>
+#include <signal.h>
 #include <unistd.h>
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
  * __db_open --
@@ -33,7 +35,10 @@ __db_open(name, arg_flags, ok_flags, mode, fdp)
 	u_int32_t arg_flags, ok_flags;
 	int mode, *fdp;
 {
-	int fd, flags;
+#if !defined(_WIN32) && defined(HAVE_SIGFILLSET)
+	sigset_t set, oset;
+#endif
+	int flags, ret;
 
 	if (arg_flags & ~ok_flags)
 		return (EINVAL);
@@ -71,41 +76,83 @@ __db_open(name, arg_flags, ok_flags, mode, fdp)
 	if (arg_flags & DB_TRUNCATE)
 		flags |= O_TRUNC;
 
+#if !defined(_WIN32) && defined(HAVE_SIGFILLSET)
+	/*
+	 * We block every signal we can get our hands on so that the temporary
+	 * file isn't left around if we're interrupted at the wrong time.  Of
+	 * course, if we drop core in-between the calls we'll hang forever, but
+	 * that's probably okay.  ;-)
+	 */
+	if (arg_flags & DB_TEMPORARY) {
+		(void)sigfillset(&set);
+		(void)sigprocmask(SIG_BLOCK, &set, &oset);
+	}
+#endif
+
 	/* Open the file. */
-	if ((fd = __os_open(name, flags, mode)) == -1)
-		return (errno);
+	if ((ret = __os_open(name, flags, mode, fdp)) != 0) {
+#if !defined(_WIN32) && defined(HAVE_SIGFILLSET)
+		/* Don't leave every signal blocked when the open fails. */
+		if (arg_flags & DB_TEMPORARY)
+			(void)sigprocmask(SIG_SETMASK, &oset, NULL);
+#endif
+		return (ret);
+	}
 
-#ifndef _WIN32
+#if !defined(_WIN32)
 	/* Delete any temporary file; done for Win32 by _O_TEMPORARY. */
-	if (arg_flags & DB_TEMPORARY)
+	if (arg_flags & DB_TEMPORARY) {
 		(void)__os_unlink(name);
+#if defined(HAVE_SIGFILLSET)
+		(void)sigprocmask(SIG_SETMASK, &oset, NULL);
+#endif
+	}
 #endif
 
-#if !defined(_WIN32) && !defined(WIN16)
+#if !defined(_WIN32) && !defined(WIN16) && !defined(VMS)
 	/*
-	 * Deny access to any child process; done for Win32 by O_NOINHERIT,
-	 * MacOS has neither child processes nor fd inheritance.
+	 * Deny access to any child process.
+	 *	VMS: does not have fd inheritance.
+	 *	Win32: done by O_NOINHERIT.
 	 */
-	if (fcntl(fd, F_SETFD, 1) == -1) {
-		int ret = errno;
+	if (fcntl(*fdp, F_SETFD, 1) == -1) {
+		ret = errno;
 
-		(void)__os_close(fd);
+		(void)__os_close(*fdp);
 		return (ret);
 	}
 #endif
-	*fdp = fd;
 	return (0);
 }
 
 /*
- * __db_close --
+ * __os_open --
+ *	Open a file.
+ *
+ * PUBLIC: int __os_open __P((const char *, int, int, int *));
+ */
+int
+__os_open(name, flags, mode, fdp)
+	const char *name;
+	int flags, mode, *fdp;
+{
+	*fdp = __db_jump.j_open != NULL ?
+	    __db_jump.j_open(name, flags, mode) : open(name, flags, mode);
+	return (*fdp == -1 ? errno : 0);
+}
+
+/*
+ * __os_close --
  *	Close a file descriptor.
  *
- * PUBLIC: int __db_close __P((int));
+ * PUBLIC: int __os_close __P((int));
  */
 int
-__db_close(fd)
+__os_close(fd)
 	int fd;
 {
-	return (__os_close(fd) ? errno : 0);
+	int ret;
+
+	ret = __db_jump.j_close != NULL ? __db_jump.j_close(fd) : close(fd);
+	return (ret == 0 ? 0 : errno);
 }
diff --git a/db2/os/os_rw.c b/db2/os/os_rw.c
index 7591041981..38f5b9473a 100644
--- a/db2/os/os_rw.c
+++ b/db2/os/os_rw.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_rw.c	10.7 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)os_rw.c	10.11 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,15 +19,73 @@ static const char sccsid[] = "@(#)os_rw.c	10.7 (Sleepycat) 4/10/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
- * __db_read --
+ * __os_io --
+ *	Do an I/O.
+ *
+ * PUBLIC: int __os_io __P((DB_IO *, int, ssize_t *));
+ */
+int
+__os_io(db_iop, op, niop)
+	DB_IO *db_iop;
+	int op;
+	ssize_t *niop;
+{
+	int ret;
+
+#ifdef HAVE_PREAD
+	switch (op) {
+	case DB_IO_READ:
+		if (__db_jump.j_read != NULL)
+			goto slow;
+		*niop = pread(db_iop->fd_io, db_iop->buf,
+		    db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize);
+		break;
+	case DB_IO_WRITE:
+		if (__db_jump.j_write != NULL)
+			goto slow;
+		*niop = pwrite(db_iop->fd_io, db_iop->buf,
+		    db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize);
+		break;
+	}
+	if (*niop == db_iop->bytes)
+		return (0);
+slow:
+#endif
+	if (db_iop->mutexp != NULL)
+		(void)__db_mutex_lock(db_iop->mutexp, db_iop->fd_lock);
+
+	if ((ret = __os_seek(db_iop->fd_io,
+	    db_iop->pagesize, db_iop->pgno, 0, 0, SEEK_SET)) != 0)
+		goto err;
+	switch (op) {
+	case DB_IO_READ:
+		ret =
+		    __os_read(db_iop->fd_io, db_iop->buf, db_iop->bytes, niop);
+		break;
+	case DB_IO_WRITE:
+		ret =
+		    __os_write(db_iop->fd_io, db_iop->buf, db_iop->bytes, niop);
+		break;
+	}
+
+err:	if (db_iop->mutexp != NULL)
+		(void)__db_mutex_unlock(db_iop->mutexp, db_iop->fd_lock);
+
+	return (ret);
+
+}
+
+/*
+ * __os_read --
  *	Read from a file handle.
  *
- * PUBLIC: int __db_read __P((int, void *, size_t, ssize_t *));
+ * PUBLIC: int __os_read __P((int, void *, size_t, ssize_t *));
  */
 int
-__db_read(fd, addr, len, nrp)
+__os_read(fd, addr, len, nrp)
 	int fd;
 	void *addr;
 	size_t len;
@@ -39,7 +97,9 @@ __db_read(fd, addr, len, nrp)
 
 	for (taddr = addr,
 	    offset = 0; offset < len; taddr += nr, offset += nr) {
-		if ((nr = __os_read(fd, taddr, len - offset)) < 0)
+		if ((nr = __db_jump.j_read != NULL ?
+		    __db_jump.j_read(fd, taddr, len - offset) :
+		    read(fd, taddr, len - offset)) < 0)
 			return (errno);
 		if (nr == 0)
 			break;
@@ -49,15 +109,15 @@ __db_read(fd, addr, len, nrp)
 }
 
 /*
- * __db_write --
+ * __os_write --
  *	Write to a file handle.
  *
- * PUBLIC: int __db_write __P((int, void *, size_t, ssize_t *));
+ * PUBLIC: int __os_write __P((int, void *, size_t, ssize_t *));
  */
 int
-__db_write(fd, addr, len, nwp)
+__os_write(fd, addr, len, nwp)
 	int fd;
-	void *addr;
+	const void *addr;
 	size_t len;
 	ssize_t *nwp;
 {
@@ -67,7 +127,9 @@ __db_write(fd, addr, len, nwp)
 
 	for (taddr = addr,
 	    offset = 0; offset < len; taddr += nw, offset += nw)
-		if ((nw = __os_write(fd, taddr, len - offset)) < 0)
+		if ((nw = __db_jump.j_write != NULL ?
+		    __db_jump.j_write(fd, taddr, len - offset) :
+		    write(fd, taddr, len - offset)) < 0)
 			return (errno);
 	*nwp = len;
 	return (0);
diff --git a/db2/os/os_seek.c b/db2/os/os_seek.c
index 159425cc27..ae5272bd1c 100644
--- a/db2/os/os_seek.c
+++ b/db2/os/os_seek.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_seek.c	10.9 (Sleepycat) 4/19/98";
+static const char sccsid[] = "@(#)os_seek.c	10.11 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)os_seek.c	10.9 (Sleepycat) 4/19/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
  * __os_seek --
@@ -35,10 +36,17 @@ __os_seek(fd, pgsize, pageno, relative, isrewind, whence)
 	int isrewind, whence;
 {
 	off_t offset;
-
-	offset = (off_t)pgsize * pageno + relative;
-	if (isrewind)
-		offset = -offset;
-
-	return (lseek(fd, offset, whence) == -1 ? errno : 0);
+	int ret;
+
+	if (__db_jump.j_seek != NULL)
+		ret = __db_jump.j_seek(fd,
+		    pgsize, pageno, relative, isrewind, whence);
+	else {
+		offset = (off_t)pgsize * pageno + relative;
+		if (isrewind)
+			offset = -offset;
+		/* Test as off_t: an int would truncate large offsets. */
+		ret = lseek(fd, offset, whence) == (off_t)-1 ? -1 : 0;
+	}
+	return (ret == -1 ? errno : 0);
 }
diff --git a/db2/os/os_sleep.c b/db2/os/os_sleep.c
index 6a5b91f5c4..5aa476352e 100644
--- a/db2/os/os_sleep.c
+++ b/db2/os/os_sleep.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_sleep.c	10.10 (Sleepycat) 4/27/98";
+static const char sccsid[] = "@(#)os_sleep.c	10.12 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -28,6 +28,7 @@ static const char sccsid[] = "@(#)os_sleep.c	10.10 (Sleepycat) 4/27/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
  * __os_sleep --
@@ -45,6 +46,9 @@ __os_sleep(secs, usecs)
 	for (; usecs >= 1000000; ++secs, usecs -= 1000000)
 		;
 
+	if (__db_jump.j_sleep != NULL)
+		return (__db_jump.j_sleep(secs, usecs));
+
 	/*
 	 * It's important that we yield the processor here so that other
 	 * processes or threads are permitted to run.
diff --git a/db2/os/os_spin.c b/db2/os/os_spin.c
index 2fd21d018b..cbde58894a 100644
--- a/db2/os/os_spin.c
+++ b/db2/os/os_spin.c
@@ -8,17 +8,50 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_spin.c	10.7 (Sleepycat) 5/20/98";
+static const char sccsid[] = "@(#)os_spin.c	10.10 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+#include <sys/pstat.h>
+#endif
 
 #include <limits.h>
 #include <unistd.h>
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
+
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+/*
+ * __os_pstat_getdynamic --
+ *	HP/UX.
+ */
+static int
+__os_pstat_getdynamic()
+{
+	struct pst_dynamic psd;
+
+	return (pstat_getdynamic(&psd,
+	    sizeof(psd), (size_t)1, 0) == -1 ? 1 : psd.psd_proc_cnt);
+}
+#endif
+
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+/*
+ * __os_sysconf --
+ *	Solaris, Linux.
+ */
+static int
+__os_sysconf(void)
+{
+	int nproc;
+
+	return ((nproc = sysconf(_SC_NPROCESSORS_ONLN)) > 1 ? nproc : 1);
+}
+#endif
 
 /*
  * __os_spin --
@@ -29,33 +62,46 @@ static const char sccsid[] = "@(#)os_spin.c	10.7 (Sleepycat) 5/20/98";
 int
 __os_spin()
 {
-	static long sys_val;
-
-	/* If the application specified the spins, use its value. */
+	/*
+	 * If the application specified a value or we've already figured it
+	 * out, return it.
+	 *
+	 * XXX
+	 * We don't want to repeatedly call the underlying function because
+	 * it can be expensive (e.g., requiring multiple filesystem accesses
+	 * under Debian Linux).
+	 */
 	if (DB_GLOBAL(db_tsl_spins) != 0)
 		return (DB_GLOBAL(db_tsl_spins));
 
-	/* If we've already figured this out, return the value. */
-	if (sys_val != 0)
-		return (sys_val);
+	DB_GLOBAL(db_tsl_spins) = 1;
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+	DB_GLOBAL(db_tsl_spins) = __os_pstat_getdynamic();
+#endif
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+	DB_GLOBAL(db_tsl_spins) = __os_sysconf();
+#endif
 
 	/*
-	 * XXX
-	 * Solaris and Linux use _SC_NPROCESSORS_ONLN to return the number of
-	 * online processors.  We don't want to repeatedly call sysconf because
-	 * it's quite expensive (requiring multiple filesystem accesses) under
-	 * Debian Linux.
-	 *
-	 * Spin 50 times per processor -- we have anecdotal evidence that this
+	 * Spin 50 times per processor, we have anecdotal evidence that this
 	 * is a reasonable value.
 	 */
-#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
-	if ((sys_val = sysconf(_SC_NPROCESSORS_ONLN)) > 1)
-		sys_val *= 50;
-	else
-		sys_val = 1;
-#else
-	sys_val = 1;
-#endif
-	return (sys_val);
+	DB_GLOBAL(db_tsl_spins) *= DB_GLOBAL(db_tsl_spins) > 1 ? 50 : 1;
+
+	return (DB_GLOBAL(db_tsl_spins));
+}
+
+/*
+ * __os_yield --
+ *	Yield the processor.
+ *
+ * PUBLIC: void __os_yield __P((u_long));
+ */
+void
+__os_yield(usecs)
+	u_long usecs;
+{
+	if (__db_jump.j_yield != NULL && __db_jump.j_yield() == 0)
+		return;
+	__os_sleep(0, usecs);
 }
diff --git a/db2/os/os_stat.c b/db2/os/os_stat.c
index e7d3f24174..65cba82efa 100644
--- a/db2/os/os_stat.c
+++ b/db2/os/os_stat.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_stat.c	10.15 (Sleepycat) 4/27/98";
+static const char sccsid[] = "@(#)os_stat.c	10.18 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)os_stat.c	10.15 (Sleepycat) 4/27/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
  * __os_exists --
@@ -33,6 +34,9 @@ __os_exists(path, isdirp)
 {
 	struct stat sb;
 
+	if (__db_jump.j_exists != NULL)
+		return (__db_jump.j_exists(path, isdirp));
+
 	if (stat(path, &sb) != 0)
 		return (errno);
 
@@ -65,7 +69,8 @@ __os_ioinfo(path, fd, mbytesp, bytesp, iosizep)
 {
 	struct stat sb;
 
-	COMPQUIET(path, NULL);
+	if (__db_jump.j_ioinfo != NULL)
+		return (__db_jump.j_ioinfo(path, fd, mbytesp, bytesp, iosizep));
 
 	if (fstat(fd, &sb) == -1)
 		return (errno);
@@ -80,7 +85,7 @@ __os_ioinfo(path, fd, mbytesp, bytesp, iosizep)
 	 * Return the underlying filesystem blocksize, if available.
 	 *
 	 * XXX
-	 * Check for a 0 size -- HP's MPE architecture has st_blksize,
+	 * Check for a 0 size -- the HP MPE/iX architecture has st_blksize,
 	 * but it's always 0.
 	 */
 #ifdef HAVE_ST_BLKSIZE
diff --git a/db2/os/os_tmpdir.c b/db2/os/os_tmpdir.c
new file mode 100644
index 0000000000..0b0bbc7c61
--- /dev/null
+++ b/db2/os/os_tmpdir.c
@@ -0,0 +1,113 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)os_tmpdir.c	10.3 (Sleepycat) 10/13/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#endif
+
+#include "db_int.h"
+#include "common_ext.h"
+
+#ifdef macintosh
+#include <TFileSpec.h>
+#endif
+
+/*
+ * __os_tmpdir --
+ *	Set the temporary directory path.
+ *
+ * The order of items in the list structure and the order of checks in
+ * the environment are documented.
+ *
+ * PUBLIC: int __os_tmpdir __P((DB_ENV *, u_int32_t));
+ */
+int
+__os_tmpdir(dbenv, flags)
+	DB_ENV *dbenv;
+	u_int32_t flags;
+{
+	/*
+	 * !!!
+	 * Don't change this to:
+	 *
+	 *	static const char * const list[]
+	 *
+	 * because it creates a text relocation in position independent code.
+	 */
+	static const char * list[] = {
+		"/var/tmp",
+		"/usr/tmp",
+		"/temp",		/* Windows. */
+		"/tmp",
+		"C:/temp",		/* Windows. */
+		"C:/tmp",		/* Windows. */
+		NULL
+	};
+	const char * const *lp, *p;
+
+	/* Use the environment if it's permitted and initialized. */
+	p = NULL;
+#ifdef HAVE_GETEUID
+	if (LF_ISSET(DB_USE_ENVIRON) ||
+	    (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0))
+#else
+	if (LF_ISSET(DB_USE_ENVIRON))
+#endif
+	{
+		if ((p = getenv("TMPDIR")) != NULL && p[0] == '\0') {
+			__db_err(dbenv, "illegal TMPDIR environment variable");
+			return (EINVAL);
+		}
+		/* Windows */
+		if (p == NULL && (p = getenv("TEMP")) != NULL && p[0] == '\0') {
+			__db_err(dbenv, "illegal TEMP environment variable");
+			return (EINVAL);
+		}
+		/* Windows */
+		if (p == NULL && (p = getenv("TMP")) != NULL && p[0] == '\0') {
+			__db_err(dbenv, "illegal TMP environment variable");
+			return (EINVAL);
+		}
+		/* Macintosh */
+		if (p == NULL &&
+		    (p = getenv("TempFolder")) != NULL && p[0] == '\0') {
+			__db_err(dbenv,
+			    "illegal TempFolder environment variable");
+			return (EINVAL);
+		}
+	}
+
+#ifdef macintosh
+	/* Get the path to the temporary folder. */
+	if (p == NULL) {
+		FSSpec spec;
+
+		if (!Special2FSSpec(kTemporaryFolderType,
+		    kOnSystemDisk, 0, &spec))
+			(void)__os_strdup(FSp2FullPath(&spec), &p);
+	}
+#endif
+
+	/* Step through the list; p is left NULL if no candidate exists. */
+	if (p == NULL)
+		for (lp = list; (p = *lp) != NULL; ++lp)
+			if (__os_exists(p, NULL) == 0)
+				break;
+	if (p == NULL)
+		return (0);
+
+	return (__os_strdup(p, &dbenv->db_tmp_dir));
+}
diff --git a/db2/os/os_unlink.c b/db2/os/os_unlink.c
index 3a1fa3ff99..aa484de843 100644
--- a/db2/os/os_unlink.c
+++ b/db2/os/os_unlink.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)os_unlink.c	10.5 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)os_unlink.c	10.7 (Sleepycat) 10/12/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,16 +19,21 @@ static const char sccsid[] = "@(#)os_unlink.c	10.5 (Sleepycat) 4/10/98";
 #endif
 
 #include "db_int.h"
+#include "os_jump.h"
 
 /*
- * __db_unlink --
+ * __os_unlink --
  *	Remove a file.
  *
- * PUBLIC: int __db_unlink __P((const char *));
+ * PUBLIC: int __os_unlink __P((const char *));
  */
 int
-__db_unlink(path)
+__os_unlink(path)
 	const char *path;
 {
-	return (__os_unlink(path) == -1 ? errno : 0);
+	int ret;
+
+	ret = __db_jump.j_unlink != NULL ?
+	    __db_jump.j_unlink(path) : unlink(path);
+	return (ret == -1 ? errno : 0);
 }
diff --git a/db2/progs/db_archive/db_archive.c b/db2/progs/db_archive/db_archive.c
index 691824c2ab..ca489954f6 100644
--- a/db2/progs/db_archive/db_archive.c
+++ b/db2/progs/db_archive/db_archive.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_archive.c	10.17 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)db_archive.c	10.20 (Sleepycat) 10/3/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -33,12 +33,10 @@ static const char sccsid[] = "@(#)db_archive.c	10.17 (Sleepycat) 4/10/98";
 #include "common_ext.h"
 
 DB_ENV	*db_init __P((char *, int));
-void	 onint __P((int));
 int	 main __P((int, char *[]));
-void	 siginit __P((void));
+void	 nosig __P((void));
 void	 usage __P((void));
 
-int	 interrupted;
 const char
 	*progname = "db_archive";			/* Program name. */
 
@@ -83,13 +81,18 @@ main(argc, argv)
 	if (argc != 0)
 		usage();
 
-	/* Initialize the environment. */
+	/*
+	 * Ignore signals -- we don't want to be interrupted because we're
+	 * spending all of our time in the DB library.
+	 */
+	nosig();
 	dbenv = db_init(home, verbose);
 
 	/* Get the list of names. */
 	if ((errno = log_archive(dbenv->lg_info, &list, flags, NULL)) != 0) {
+		warn(NULL);
 		(void)db_appexit(dbenv);
-		err(1, "log_archive");
+		return (1);
 	}
 
 	/* Print the names. */
@@ -97,7 +100,12 @@ main(argc, argv)
 		for (; *list != NULL; ++list)
 			printf("%s\n", *list);
 
-	return (db_appexit(dbenv) ? 1 : 0);
+	if ((errno = db_appexit(dbenv)) != 0) {
+		warn(NULL);
+		return (1);
+	}
+
+	return (0);
 }
 
 /*
@@ -123,40 +131,21 @@ db_init(home, verbose)
 	    DB_CREATE | DB_INIT_LOG | DB_INIT_TXN | DB_USE_ENVIRON)) != 0)
 		err(1, "db_appinit");
 
-	siginit();
-
 	return (dbenv);
 }
 
 /*
- * siginit --
- *	Initialize the set of signals for which we want to clean up.
- *	Generally, we try not to leave the shared regions locked if
- *	we can.
+ * nosig --
+ *	We don't want to be interrupted.
  */
 void
-siginit()
+nosig()
 {
 #ifdef SIGHUP
-	(void)signal(SIGHUP, onint);
+	(void)signal(SIGHUP, SIG_IGN);
 #endif
-	(void)signal(SIGINT, onint);
-#ifdef SIGKILL
-	(void)signal(SIGKILL, onint);
-#endif
-	(void)signal(SIGTERM, onint);
-}
-
-/*
- * oninit --
- *	Interrupt signal handler.
- */
-void
-onint(signo)
-	int signo;
-{
-	if ((interrupted = signo) == 0)
-		interrupted = SIGINT;
+	(void)signal(SIGINT, SIG_IGN);
+	(void)signal(SIGTERM, SIG_IGN);
 }
 
 void
diff --git a/db2/progs/db_checkpoint/db_checkpoint.c b/db2/progs/db_checkpoint/db_checkpoint.c
index 74f95ccce2..f0fe48ab2e 100644
--- a/db2/progs/db_checkpoint/db_checkpoint.c
+++ b/db2/progs/db_checkpoint/db_checkpoint.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_checkpoint.c	10.17 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)db_checkpoint.c	10.21 (Sleepycat) 10/4/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -59,7 +59,7 @@ main(argc, argv)
 	time_t now;
 	long argval;
 	u_int32_t kbytes, minutes, seconds;
-	int ch, eval, once, verbose;
+	int ch, once, ret, verbose;
 	char *home, *logfile;
 
 	/*
@@ -70,7 +70,7 @@ main(argc, argv)
 #define	MAX_UINT32_T	2147483647
 
 	kbytes = minutes = 0;
-	once = verbose = 0;
+	once = ret = verbose = 0;
 	home = logfile = NULL;
 	while ((ch = getopt(argc, argv, "1h:k:L:p:v")) != EOF)
 		switch (ch) {
@@ -110,6 +110,7 @@ main(argc, argv)
 	}
 
 	/* Initialize the environment. */
+	siginit();
 	dbenv = db_init(home);
 
 	if (logfile != NULL && logpid(logfile, 1)) {
@@ -122,37 +123,40 @@ main(argc, argv)
 	 * to wake up when a checkpoint is necessary.  If we have a "kbytes"
 	 * field set, then we'll check every 30 seconds.
 	 */
-	eval = 0;
 	seconds = kbytes != 0 ? 30 : minutes * 60;
 	while (!interrupted) {
 		if (verbose) {
 			(void)time(&now);
-			printf("checkpoint: %s", ctime(&now));
+			warnx("checkpoint: %s", ctime(&now));
 		}
-		errno = txn_checkpoint(dbenv->tx_info, kbytes, minutes);
 
+		errno = txn_checkpoint(dbenv->tx_info, kbytes, minutes);
 		while (errno == DB_INCOMPLETE) {
 			if (verbose)
-				__db_err(dbenv,
-				    "checkpoint did not finish, retrying");
-			(void)__db_sleep(2, 0);
+				warnx("checkpoint did not finish, retrying");
+			(void)sleep(2);
 			errno = txn_checkpoint(dbenv->tx_info, 0, 0);
 		}
 
 		if (errno != 0) {
-			eval = 1;
-			__db_err(dbenv, "checkpoint: %s", strerror(errno));
+			ret = 1;
+			warn(NULL);
 			break;
 		}
 
 		if (once)
 			break;
 
-		(void)__db_sleep(seconds, 0);
+		(void)sleep(seconds);
 	}
 
 	if (logfile != NULL && logpid(logfile, 0))
-		eval = 1;
+		ret = 1;
+
+	if ((errno = db_appexit(dbenv)) != 0) {
+		ret = 1;
+		warn(NULL);
+	}
 
 	if (interrupted) {
 		(void)signal(interrupted, SIG_DFL);
@@ -160,7 +164,7 @@ main(argc, argv)
 		/* NOTREACHED */
 	}
 
-	return (db_appexit(dbenv) || eval ? 1 : 0);
+	return (ret);
 }
 
 /*
@@ -193,8 +197,6 @@ db_init(home)
 		    "db_appinit: failed to register access method functions");
 	}
 
-	siginit();
-
 	return (dbenv);
 }
 
@@ -237,14 +239,11 @@ siginit()
 	(void)signal(SIGHUP, onint);
 #endif
 	(void)signal(SIGINT, onint);
-#ifdef SIGKILL
-	(void)signal(SIGKILL, onint);
-#endif
 	(void)signal(SIGTERM, onint);
 }
 
 /*
- * oninit --
+ * onint --
  *	Interrupt signal handler.
  */
 void
diff --git a/db2/progs/db_deadlock/db_deadlock.c b/db2/progs/db_deadlock/db_deadlock.c
index 49a52416dd..bc5039e95f 100644
--- a/db2/progs/db_deadlock/db_deadlock.c
+++ b/db2/progs/db_deadlock/db_deadlock.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_deadlock.c	10.19 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)db_deadlock.c	10.23 (Sleepycat) 10/4/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -55,14 +55,14 @@ main(argc, argv)
 	time_t now;
 	long usecs;
 	u_int32_t flags;
-	int ch, verbose;
+	int ch, ret, verbose;
 	char *home, *logfile;
 
 	atype = DB_LOCK_DEFAULT;
 	home = logfile = NULL;
 	usecs = 0;
 	flags = 0;
-	verbose = 0;
+	ret = verbose = 0;
 	while ((ch = getopt(argc, argv, "a:h:L:t:vw")) != EOF)
 		switch (ch) {
 		case 'a':
@@ -119,6 +119,7 @@ main(argc, argv)
 		usecs = 100000;
 
 	/* Initialize the deadlock detector by opening the lock manager. */
+	siginit();
 	dbenv = db_init(home, verbose);
 
 	if (logfile != NULL && logpid(logfile, 1)) {
@@ -129,18 +130,26 @@ main(argc, argv)
 	while (!interrupted) {
 		if (dbenv->db_verbose != 0) {
 			time(&now);
-			__db_err(dbenv, "Running at %.24s", ctime(&now));
+			warnx("Running at %.24s", ctime(&now));
 		}
 
-		if ((errno = lock_detect(dbenv->lk_info, flags, atype)) != 0)
+		if ((errno = lock_detect(dbenv->lk_info, flags, atype)) != 0) {
+			ret = 1;
+			warn(NULL);
 			break;
+		}
 
 		/* Make a pass every "usecs" usecs. */
-		(void)__db_sleep(0, usecs);
+		(void)usleep(usecs);
 	}
 
-	if (logfile != NULL)
-		(void)logpid(logfile, 0);
+	if (logfile != NULL && logpid(logfile, 0))
+		ret = 1;
+
+	if ((errno = db_appexit(dbenv)) != 0) {
+		ret = 1;
+		warn(NULL);
+	}
 
 	if (interrupted) {
 		(void)signal(interrupted, SIG_DFL);
@@ -148,7 +157,7 @@ main(argc, argv)
 		/* NOTREACHED */
 	}
 
-	return (db_appexit(dbenv));
+	return (ret);
 }
 
 DB_ENV *
@@ -170,8 +179,6 @@ db_init(home, verbose)
 	    NULL, dbenv, DB_INIT_LOCK | DB_USE_ENVIRON)) != 0)
 		err(1, "db_appinit");
 
-	siginit();
-
 	return (dbenv);
 }
 
@@ -214,14 +221,11 @@ siginit()
 	(void)signal(SIGHUP, onint);
 #endif
 	(void)signal(SIGINT, onint);
-#ifdef SIGKILL
-	(void)signal(SIGKILL, onint);
-#endif
 	(void)signal(SIGTERM, onint);
 }
 
 /*
- * oninit --
+ * onint --
  *	Interrupt signal handler.
  */
 void
diff --git a/db2/progs/db_dump/db_dump.c b/db2/progs/db_dump/db_dump.c
index f532bc2779..0f34ddc789 100644
--- a/db2/progs/db_dump/db_dump.c
+++ b/db2/progs/db_dump/db_dump.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_dump.c	10.19 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)db_dump.c	10.24 (Sleepycat) 11/22/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -25,14 +25,14 @@ static const char sccsid[] = "@(#)db_dump.c	10.19 (Sleepycat) 5/23/98";
 #include <unistd.h>
 #endif
 
+#undef stat
+
 #include "db_int.h"
 #include "db_page.h"
 #include "btree.h"
 #include "hash.h"
 #include "clib_ext.h"
 
-#undef stat
-
 void	configure __P((char *));
 DB_ENV *db_init __P((char *));
 int	main __P((int, char *[]));
@@ -58,7 +58,7 @@ main(argc, argv)
 
 	home = NULL;
 	checkprint = dflag = 0;
-	while ((ch = getopt(argc, argv, "df:h:p")) != EOF)
+	while ((ch = getopt(argc, argv, "df:h:Np")) != EOF)
 		switch (ch) {
 		case 'd':
 			dflag = 1;
@@ -70,6 +70,9 @@ main(argc, argv)
 		case 'h':
 			home = optarg;
 			break;
+		case 'N':
+			(void)db_value_set(0, DB_MUTEXLOCKS);
+			break;
 		case 'p':
 			checkprint = 1;
 			break;
@@ -83,16 +86,11 @@ main(argc, argv)
 	if (argc != 1)
 		usage();
 
-	if (dflag) {
-		if (home != NULL)
-			errx(1,
-			    "the -d and -h options may not both be specified");
-		if (checkprint)
-			errx(1,
-			    "the -d and -p options may not both be specified");
-	}
+	if (dflag && checkprint)
+		errx(1, "the -d and -p options may not both be specified");
+
 	/* Initialize the environment. */
-	dbenv = dflag ? NULL : db_init(home);
+	dbenv = db_init(home);
 
 	/* Open the DB file. */
 	if ((errno =
@@ -108,7 +106,7 @@ main(argc, argv)
 	}
 
 	/* Get a cursor and step through the database. */
-	if ((errno = dbp->cursor(dbp, NULL, &dbcp)) != 0) {
+	if ((errno = dbp->cursor(dbp, NULL, &dbcp, 0)) != 0) {
 		(void)dbp->close(dbp, 0);
 		err(1, "cursor");
 	}
@@ -145,16 +143,35 @@ db_init(home)
 {
 	DB_ENV *dbenv;
 
-	if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) {
+	if ((dbenv = (DB_ENV *)calloc(1, sizeof(DB_ENV))) == NULL) {
 		errno = ENOMEM;
 		err(1, NULL);
 	}
+
+	/*
+	 * Try and use the shared mpool region so that we get pages that
+	 * haven't been flushed to disk (mostly useful for debugging).
+	 * If that fails, try again, without the DB_INIT_MPOOL flag.
+	 *
+	 * If it works, set the error output options so that future errors
+	 * are correctly reported.
+	 */
+	if ((errno = db_appinit(home,
+	    NULL, dbenv, DB_USE_ENVIRON | DB_INIT_MPOOL)) == 0) {
+		dbenv->db_errfile = stderr;
+		dbenv->db_errpfx = progname;
+		return (dbenv);
+	}
+
+	/* Set the error output options -- this time we want a message. */
+	memset(dbenv, 0, sizeof(*dbenv));
 	dbenv->db_errfile = stderr;
 	dbenv->db_errpfx = progname;
 
-	if ((errno =
-	    db_appinit(home, NULL, dbenv, DB_CREATE | DB_USE_ENVIRON)) != 0)
+	/* Try again, and it's fatal if we fail. */
+	if ((errno = db_appinit(home, NULL, dbenv, DB_USE_ENVIRON)) != 0)
 		err(1, "db_appinit");
+
 	return (dbenv);
 }
 
@@ -167,10 +184,10 @@ pheader(dbp, pflag)
 	DB *dbp;
 	int pflag;
 {
+	DBC *dbc;
 	DB_BTREE_STAT *btsp;
-	HTAB *hashp;
-	HASHHDR *hdr;
-	db_pgno_t pgno;
+	HASH_CURSOR *hcp;
+	int ret;
 
 	printf("format=%s\n", pflag ? "print" : "bytevalue");
 	switch (dbp->type) {
@@ -187,18 +204,25 @@ pheader(dbp, pflag)
 		break;
 	case DB_HASH:
 		printf("type=hash\n");
-		hashp = dbp->internal;
-		pgno = PGNO_METADATA;
-		if (memp_fget(dbp->mpf, &pgno, 0, &hdr) == 0) {
-			if (hdr->ffactor != 0)
-				printf("h_ffactor=%lu\n", (u_long)hdr->ffactor);
-			if (hdr->nelem != 0)
-				printf("h_nelem=%lu\n", (u_long)hdr->nelem);
-			(void)memp_fput(dbp->mpf, hdr, 0);
+		if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+			break;
+		hcp = (HASH_CURSOR *)dbc->internal;
+		GET_META(dbp, hcp, ret);
+		if (ret == 0) {
+			if (hcp->hdr->ffactor != 0)
+				printf("h_ffactor=%lu\n",
+				    (u_long)hcp->hdr->ffactor);
+			if (hcp->hdr->nelem != 0)
+				printf("h_nelem=%lu\n",
+				    (u_long)hcp->hdr->nelem);
+			RELEASE_META(dbp, hcp);
 		}
+		(void)dbc->c_close(dbc);
 		break;
 	case DB_RECNO:
 		printf("type=recno\n");
+		if ((errno = dbp->stat(dbp, &btsp, NULL, 0)) != 0)
+			err(1, "dbp->stat");
 		if (F_ISSET(dbp, DB_RE_RENUMBER))
 			printf("renumber=1\n");
 		if (F_ISSET(dbp, DB_RE_FIXEDLEN))
@@ -231,6 +255,6 @@ void
 usage()
 {
 	(void)fprintf(stderr,
-	    "usage: db_dump [-dp] [-f file] [-h home] db_file\n");
+	    "usage: db_dump [-dNp] [-f file] [-h home] db_file\n");
 	exit(1);
 }
diff --git a/db2/progs/db_load/db_load.c b/db2/progs/db_load/db_load.c
index 84cfb36775..ca30cef342 100644
--- a/db2/progs/db_load/db_load.c
+++ b/db2/progs/db_load/db_load.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_load.c	10.20 (Sleepycat) 6/2/98";
+static const char sccsid[] = "@(#)db_load.c	10.23 (Sleepycat) 10/4/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)db_load.c	10.20 (Sleepycat) 6/2/98";
 
 #include <errno.h>
 #include <limits.h>
+#include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -37,9 +38,12 @@ int	dbt_rdump __P((DBT *));
 int	dbt_rprint __P((DBT *));
 int	digitize __P((int));
 int	main __P((int, char *[]));
+void	onint __P((int));
 void	rheader __P((DBTYPE *, int *, DB_INFO *));
+void	siginit __P((void));
 void	usage __P((void));
 
+int	 interrupted;
 const char
 	*progname = "db_load";				/* Program name. */
 
@@ -57,16 +61,17 @@ main(argc, argv)
 	DB_INFO dbinfo;
 	db_recno_t recno;
 	u_int32_t db_nooverwrite;
-	int ch, checkprint, existed, no_header;
+	int ch, checkprint, existed, no_header, ret;
 	char **clist, **clp, *home;
 
 	/* Allocate enough room for configuration arguments. */
 	if ((clp = clist = (char **)calloc(argc + 1, sizeof(char *))) == NULL)
 		err(1, NULL);
 
+	dbp = NULL;
 	home = NULL;
 	db_nooverwrite = 0;
-	existed = checkprint = no_header = 0;
+	checkprint = existed = no_header = ret = 0;
 	argtype = dbtype = DB_UNKNOWN;
 	while ((ch = getopt(argc, argv, "c:f:h:nTt:")) != EOF)
 		switch (ch) {
@@ -111,9 +116,6 @@ main(argc, argv)
 	if (argc != 1)
 		usage();
 
-	/* Initialize the environment if the user specified one. */
-	dbenv = home == NULL ? NULL : db_init(home);
-
 	/*
 	 * Read the header.  If there isn't any header, we're expecting flat
 	 * text, set the checkprint flag appropriately.
@@ -128,21 +130,17 @@ main(argc, argv)
 			if ((dbtype == DB_RECNO && argtype != DB_RECNO) ||
 			    (argtype == DB_RECNO && dbtype != DB_RECNO))
 				errx(1,
-			    "databases of type recno may not be converted");
+				"databases of type recno may not be converted");
 			dbtype = argtype;
 		}
 	}
+
 	if (dbtype == DB_UNKNOWN)
 		errx(1, "no database type specified");
 
 	/* Apply command-line configuration changes. */
 	configure(&dbinfo, clist);
 
-	/* Open the DB file. */
-	if ((errno = db_open(argv[0], dbtype, DB_CREATE,
-	    __db_omode("rwrwrw"), dbenv, &dbinfo, &dbp)) != 0)
-		err(1, "%s", argv[0]);
-
 	/* Initialize the key/data pair. */
 	memset(&key, 0, sizeof(DBT));
 	if (dbtype == DB_RECNO) {
@@ -159,9 +157,20 @@ main(argc, argv)
 		err(1, NULL);
 	}
 
+	/* Initialize the environment if the user specified one. */
+	siginit();
+	dbenv = home == NULL ? NULL : db_init(home);
+
+	/* Open the DB file. */
+	if ((errno = db_open(argv[0], dbtype, DB_CREATE,
+	    __db_omode("rwrwrw"), dbenv, &dbinfo, &dbp)) != 0) {
+		warn("%s", argv[0]);
+		goto err;
+	}
+
 	/* Get each key/data pair and add them to the database. */
-	for (recno = 1;; ++recno) {
-		if (dbtype == DB_RECNO) {
+	for (recno = 1; !interrupted; ++recno) {
+		if (dbtype == DB_RECNO)
 			if (checkprint) {
 				if (dbt_rprint(&data))
 					break;
@@ -169,7 +178,7 @@ main(argc, argv)
 				if (dbt_rdump(&data))
 					break;
 			}
-		} else
+		else
 			if (checkprint) {
 				if (dbt_rprint(&key))
 					break;
@@ -178,8 +187,10 @@ main(argc, argv)
 			} else {
 				if (dbt_rdump(&key))
 					break;
-				if (dbt_rdump(&data))
-fmt:					err(1, "odd number of key/data pairs");
+				if (dbt_rdump(&data)) {
+fmt:					warnx("odd number of key/data pairs");
+					goto err;
+				}
 			}
 		switch (errno =
 		    dbp->put(dbp, NULL, &key, &data, db_nooverwrite)) {
@@ -190,17 +201,36 @@ fmt:					err(1, "odd number of key/data pairs");
 			warnx("%s: line %d: key already exists, not loaded:",
 			    argv[0],
 			    dbtype == DB_RECNO ? recno : recno * 2 - 1);
+
 			(void)__db_prdbt(&key, checkprint, stderr);
 			break;
 		default:
-			err(1, "%s", argv[0]);
-			/* NOTREACHED */
+			warn(NULL);
+			goto err;
 		}
 	}
 
-	if ((errno = dbp->close(dbp, 0)) != 0)
-		err(1, "%s", argv[0]);
-	return (existed ? 1 : 0);
+	if (0) {
+err:		ret = 1;
+	}
+	if (dbp != NULL && (errno = dbp->close(dbp, 0)) != 0) {
+		ret = 1;
+		warn(NULL);
+	}
+
+	if (dbenv != NULL && (errno = db_appexit(dbenv)) != 0) {
+		ret = 1;
+		warn(NULL);
+	}
+
+	if (interrupted) {
+		(void)signal(interrupted, SIG_DFL);
+		(void)raise(interrupted);
+		/* NOTREACHED */
+	}
+
+	/* Return 0 on success, 1 if keys existed already, and 2 on failure. */
+	return (ret == 0 ? (existed == 0 ? 0 : 1) : 2);
 }
 
 /*
@@ -499,6 +529,34 @@ badnum()
 }
 
 /*
+ * siginit --
+ *	Initialize the set of signals for which we want to clean up.
+ *	Generally, we try not to leave the shared regions locked if
+ *	we can.
+ */
+void
+siginit()
+{
+#ifdef SIGHUP
+	(void)signal(SIGHUP, onint);
+#endif
+	(void)signal(SIGINT, onint);
+	(void)signal(SIGTERM, onint);
+}
+
+/*
+ * onint --
+ *	Interrupt signal handler.
+ */
+void
+onint(signo)
+	int signo;
+{
+	if ((interrupted = signo) == 0)
+		interrupted = SIGINT;
+}
+
+/*
  * usage --
  *	Display the usage message.
  */
diff --git a/db2/progs/db_printlog/README b/db2/progs/db_printlog/README
new file mode 100644
index 0000000000..05051f33cd
--- /dev/null
+++ b/db2/progs/db_printlog/README
@@ -0,0 +1,22 @@
+# @(#)README	10.3 (Sleepycat) 11/1/98
+
+Berkeley DB log dump utility.  This utility dumps out a DB log in human
+readable form, a record at a time, to assist in recovery and transaction
+abort debugging.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+commit.awk	Output transaction ID of committed transactions.
+
+count.awk	Print out the number of log records for transactions
+		that we encountered.
+
+pgno.awk	Take a comma-separated list of page numbers and spit
+		out all the log records that affect those page numbers.
+
+range.awk	Print out a range of the log.
+
+status.awk	Read through db_printlog output and list the transactions
+		encountered, and whether they committed or aborted.
+
+txn.awk		Print out all the records for a comma-separated list of
+		transaction IDs.
diff --git a/db2/progs/db_printlog/commit.awk b/db2/progs/db_printlog/commit.awk
new file mode 100644
index 0000000000..711064bb00
--- /dev/null
+++ b/db2/progs/db_printlog/commit.awk
@@ -0,0 +1,7 @@
+# @(#)commit.awk	10.1 (Sleepycat) 11/1/98
+#
+# Output tid of committed transactions.
+
+/txn_regop/ {
+	print $5
+}
diff --git a/db2/progs/db_printlog/count.awk b/db2/progs/db_printlog/count.awk
new file mode 100644
index 0000000000..a0b214a6ff
--- /dev/null
+++ b/db2/progs/db_printlog/count.awk
@@ -0,0 +1,9 @@
+# @(#)count.awk	10.1 (Sleepycat) 11/1/98
+#
+# Print out the number of log records for transactions that we
+# encountered.
+
+/^\[/{
+	if ($5 != 0)
+		print $5
+}
diff --git a/db2/progs/db_printlog/db_printlog.c b/db2/progs/db_printlog/db_printlog.c
index 3b48ad9643..5a0c2ebd9f 100644
--- a/db2/progs/db_printlog/db_printlog.c
+++ b/db2/progs/db_printlog/db_printlog.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_printlog.c	10.12 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)db_printlog.c	10.17 (Sleepycat) 11/1/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)db_printlog.c	10.12 (Sleepycat) 4/10/98";
 
 #include <errno.h>
 #include <signal.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -37,6 +38,7 @@ static const char sccsid[] = "@(#)db_printlog.c	10.12 (Sleepycat) 4/10/98";
 DB_ENV *db_init __P((char *));
 int	main __P((int, char *[]));
 void	onint __P((int));
+void	siginit __P((void));
 void	usage __P((void));
 
 int	 interrupted;
@@ -53,15 +55,19 @@ main(argc, argv)
 	DB_ENV *dbenv;
 	DBT data;
 	DB_LSN key;
-	int ch, eval;
+	int ch, ret;
 	char *home;
 
+	ret = 0;
 	home = NULL;
-	while ((ch = getopt(argc, argv, "h:")) != EOF)
+	while ((ch = getopt(argc, argv, "h:N")) != EOF)
 		switch (ch) {
 		case 'h':
 			home = optarg;
 			break;
+		case 'N':
+			(void)db_value_set(0, DB_MUTEXLOCKS);
+			break;
 		case '?':
 		default:
 			usage();
@@ -69,54 +75,62 @@ main(argc, argv)
 	argc -= optind;
 	argv += optind;
 
-	if ((home != NULL && argc > 0) || argc > 1)
+	if (argc > 0)
 		usage();
 
-	/* XXX: backward compatibility, first argument is home. */
-	if (argc == 1)
-		home = argv[0];
-
+	/* Initialize the environment. */
+	siginit();
 	dbenv = db_init(home);
 
-	eval = 0;
 	if ((errno = __bam_init_print(dbenv)) != 0 ||
 	    (errno = __db_init_print(dbenv)) != 0 ||
 	    (errno = __ham_init_print(dbenv)) != 0 ||
 	    (errno = __log_init_print(dbenv)) != 0 ||
 	    (errno = __txn_init_print(dbenv)) != 0) {
 		warn("initialization");
-		eval = 1;
 		(void)db_appexit(dbenv);
+		return (1);
 	}
 
-	(void)signal(SIGINT, onint);
-
 	memset(&data, 0, sizeof(data));
 	while (!interrupted) {
 		if ((errno =
 		    log_get(dbenv->lg_info, &key, &data, DB_NEXT)) != 0) {
 			if (errno == DB_NOTFOUND)
 				break;
-			eval = 1;
 			warn("log_get");
-			break;
+			goto err;
 		}
-		if ((errno =
-		    __db_dispatch(dbenv->lg_info, &data, &key, 0, NULL)) != 0) {
-			eval = 1;
+		if (dbenv->tx_recover != NULL)
+			errno = dbenv->tx_recover(dbenv->lg_info,
+			    &data, &key, 0, NULL);
+		else
+			errno = __db_dispatch(dbenv->lg_info,
+			    &data, &key, 0, NULL);
+
+		fflush(stdout);
+		if (errno != 0) {
 			warn("dispatch");
-			break;
+			goto err;
 		}
 	}
 
-	(void)db_appexit(dbenv);
+	if (0) {
+err:		ret = 1;
+	}
+
+	if (dbenv != NULL && (errno = db_appexit(dbenv)) != 0) {
+		ret = 1;
+		warn(NULL);
+	}
 
 	if (interrupted) {
-		(void)signal(SIGINT, SIG_DFL);
-		(void)raise(SIGINT);
+		(void)signal(interrupted, SIG_DFL);
+		(void)raise(interrupted);
 		/* NOTREACHED */
 	}
-	return (eval);
+
+	return (ret);
 }
 
 /*
@@ -143,21 +157,36 @@ db_init(home)
 }
 
 /*
- * oninit --
+ * siginit --
+ *	Initialize the set of signals for which we want to clean up.
+ *	Generally, we try not to leave the shared regions locked if
+ *	we can.
+ */
+void
+siginit()
+{
+#ifdef SIGHUP
+	(void)signal(SIGHUP, onint);
+#endif
+	(void)signal(SIGINT, onint);
+	(void)signal(SIGTERM, onint);
+}
+
+/*
+ * onint --
  *	Interrupt signal handler.
  */
 void
 onint(signo)
 	int signo;
 {
-	COMPQUIET(signo, 0);
-
-	interrupted = 1;
+	if ((interrupted = signo) == 0)
+		interrupted = SIGINT;
 }
 
 void
 usage()
 {
-	fprintf(stderr, "usage: db_printlog [-h home]\n");
+	fprintf(stderr, "usage: db_printlog [-N] [-h home]\n");
 	exit (1);
 }
diff --git a/db2/progs/db_printlog/pgno.awk b/db2/progs/db_printlog/pgno.awk
new file mode 100644
index 0000000000..99aa38f2b9
--- /dev/null
+++ b/db2/progs/db_printlog/pgno.awk
@@ -0,0 +1,43 @@
+# @(#)pgno.awk	10.1 (Sleepycat) 11/1/98
+#
+# Take a comma-separated list of page numbers and spit out all the
+# log records that affect those page numbers.
+
+{
+	if (NR == 1) {
+		npages = 0
+		while ((ndx = index(PGNO, ",")) != 0) {
+			pgno[npages] = substr(PGNO, 1, ndx - 1);
+			PGNO = substr(PGNO, ndx + 1, length(PGNO) - ndx);
+			npages++
+		}
+		pgno[npages] = PGNO;
+	}
+}
+/^\[/{
+	if (printme == 1) {
+		printf("%s\n", rec);
+		printme = 0
+	}
+	rec = "";
+
+	rec = $0
+}
+/^	/{
+	rec = sprintf("%s\n%s", rec, $0);
+}
+/pgno/{
+	for (i = 0; i <= npages; i++)
+		if ($2 == pgno[i])
+			printme = 1
+}
+/right/{
+	for (i = 0; i <= npages; i++)
+		if ($2 == pgno[i])
+			printme = 1
+}
+/left/{
+	for (i = 0; i <= npages; i++)
+		if ($2 == pgno[i])
+			printme = 1
+}
diff --git a/db2/progs/db_printlog/range.awk b/db2/progs/db_printlog/range.awk
new file mode 100644
index 0000000000..89c56eae52
--- /dev/null
+++ b/db2/progs/db_printlog/range.awk
@@ -0,0 +1,27 @@
+# @(#)range.awk	10.1 (Sleepycat) 11/1/98
+#
+# Print out a range of the log
+
+/^\[/{
+	l = length($1) - 1;
+	i = index($1, "]");
+	file = substr($1, 2, i - 2);
+	file += 0;
+	start = i + 2;
+	offset = substr($1, start, l - start + 1);
+	i = index(offset, "]");
+	offset = substr($1, start, i - 1);
+	offset += 0;
+
+	if ((file == START_FILE && offset >= START_OFFSET || file > START_FILE)\
+	    && (file < END_FILE || (file == END_FILE && offset < END_OFFSET)))
+		printme = 1
+	else if (file == END_FILE && offset > END_OFFSET || file > END_FILE)
+		exit
+	else
+		printme = 0
+}
+{
+	if (printme == 1)
+		print $0
+}
diff --git a/db2/progs/db_printlog/status.awk b/db2/progs/db_printlog/status.awk
new file mode 100644
index 0000000000..d97e9357b7
--- /dev/null
+++ b/db2/progs/db_printlog/status.awk
@@ -0,0 +1,26 @@
+# @(#)status.awk	10.1 (Sleepycat) 11/1/98
+#
+# Read through db_printlog output and list all the transactions encountered
+# and whether they committed or aborted.
+#
+# 1 = started
+# 2 = committed
+BEGIN {
+	cur_txn = 0
+}
+/^\[/{
+	if (status[$5] == 0) {
+		status[$5] = 1;
+		txns[cur_txn] = $5;
+		cur_txn++;
+	}
+}
+/txn_regop/ {
+	status[$5] = 2
+}
+END {
+	for (i = 0; i < cur_txn; i++) {
+		printf("%s\t%s\n",
+		    txns[i], status[txns[i]] == 1 ? "ABORT" : "COMMIT");
+	}
+}
diff --git a/db2/progs/db_printlog/txn.awk b/db2/progs/db_printlog/txn.awk
new file mode 100644
index 0000000000..c8d3bd36c8
--- /dev/null
+++ b/db2/progs/db_printlog/txn.awk
@@ -0,0 +1,30 @@
+# @(#)txn.awk	10.1 (Sleepycat) 11/1/98
+#
+# Print out all the records for a comma-separated list of transaction ids.
+{
+	if (NR == 1) {
+		ntxns = 0
+		while ((ndx = index(TXN, ",")) != 0) {
+			txn[ntxns] = substr(TXN, 1, ndx - 1);
+			TXN = substr(TXN, ndx + 1, length(TXN) - ndx);
+			ntxns++
+		}
+		txn[ntxns] = TXN;
+	}
+}
+/^\[/{
+	if (printme == 1) {
+		printf("%s\n", rec);
+		printme = 0
+	}
+	rec = "";
+
+	for (i = 0; i <= ntxns; i++)
+		if (txn[i] == $5) {
+			rec = $0
+			printme = 1
+		}
+}
+/^	/{
+	rec = sprintf("%s\n%s", rec, $0);
+}
diff --git a/db2/progs/db_recover/db_recover.c b/db2/progs/db_recover/db_recover.c
index a2845725b8..d946ca15ee 100644
--- a/db2/progs/db_recover/db_recover.c
+++ b/db2/progs/db_recover/db_recover.c
@@ -11,13 +11,14 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_recover.c	10.19 (Sleepycat) 4/10/98";
+static const char sccsid[] = "@(#)db_recover.c	10.23 (Sleepycat) 10/5/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <signal.h>
 #include <stdlib.h>
 #include <time.h>
 #include <unistd.h>
@@ -31,6 +32,7 @@ static const char sccsid[] = "@(#)db_recover.c	10.19 (Sleepycat) 4/10/98";
 
 DB_ENV	*db_init __P((char *, u_int32_t, int));
 int	 main __P((int, char *[]));
+void	 nosig __P((void));
 void	 usage __P((void));
 
 const char
@@ -72,10 +74,15 @@ main(argc, argv)
 	if (argc != 0)
 		usage();
 
+	/*
+	 * Ignore signals -- we don't want to be interrupted because we're
+	 * spending all of our time in the DB library.
+	 */
+	nosig();
 	dbenv = db_init(home, flags, verbose);
 	if (verbose) {
 		__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
-		__db_err(dbenv, "%s %lu %s [%lu][%lu]",
+		__db_err(dbenv, "%s %lx %s [%lu][%lu]",
 		    "Maximum transaction id",
 		    (u_long)dbenv->tx_info->region->last_txnid,
 		    "Recovery checkpoint",
@@ -118,6 +125,20 @@ db_init(home, flags, verbose)
 	return (dbenv);
 }
 
+/*
+ * nosig --
+ *	We don't want to be interrupted.
+ */
+void
+nosig()
+{
+#ifdef SIGHUP
+	(void)signal(SIGHUP, SIG_IGN);
+#endif
+	(void)signal(SIGINT, SIG_IGN);
+	(void)signal(SIGTERM, SIG_IGN);
+}
+
 void
 usage()
 {
diff --git a/db2/progs/db_stat/db_stat.c b/db2/progs/db_stat/db_stat.c
index f2551805b0..cef645da00 100644
--- a/db2/progs/db_stat/db_stat.c
+++ b/db2/progs/db_stat/db_stat.c
@@ -11,7 +11,7 @@
 static const char copyright[] =
 "@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_stat.c	8.38 (Sleepycat) 5/30/98";
+static const char sccsid[] = "@(#)db_stat.c	8.41 (Sleepycat) 10/3/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -26,6 +26,8 @@ static const char sccsid[] = "@(#)db_stat.c	8.38 (Sleepycat) 5/30/98";
 #include <unistd.h>
 #endif
 
+#undef stat
+
 #include "db_int.h"
 #include "shqueue.h"
 #include "db_shash.h"
@@ -33,8 +35,6 @@ static const char sccsid[] = "@(#)db_stat.c	8.38 (Sleepycat) 5/30/98";
 #include "mp.h"
 #include "clib_ext.h"
 
-#undef stat
-
 typedef enum { T_NOTSET, T_DB, T_LOCK, T_LOG, T_MPOOL, T_TXN } test_t;
 
 int	argcheck __P((char *, const char *));
@@ -48,13 +48,12 @@ void	log_stats __P((DB_ENV *));
 int	main __P((int, char *[]));
 int	mpool_ok __P((char *));
 void	mpool_stats __P((DB_ENV *));
-void	onint __P((int));
+void	nosig __P((void));
 void	prflags __P((u_int32_t, const FN *));
 int	txn_compare __P((const void *, const void *));
 void	txn_stats __P((DB_ENV *));
 void	usage __P((void));
 
-int	 interrupted;
 char	*internal;
 const char
 	*progname = "db_stat";				/* Program name. */
@@ -118,15 +117,20 @@ main(argc, argv)
 	if (argc != 0 || ttype == T_NOTSET)
 		usage();
 
+	/*
+	 * Ignore signals -- we don't want to be interrupted because we're
+	 * spending all of our time in the DB library.
+	 */
+	nosig();
 	dbenv = db_init(home, ttype);
 
-	(void)signal(SIGINT, onint);
-
 	switch (ttype) {
 	case T_DB:
 		if ((errno = db_open(db, DB_UNKNOWN,
-		    DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0)
+		    DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0) {
+			warn("%s", db);
 			return (1);
+		}
 		switch (dbp->type) {
 		case DB_BTREE:
 		case DB_RECNO:
@@ -158,12 +162,9 @@ main(argc, argv)
 		/* NOTREACHED */
 	}
 
-	(void)db_appexit(dbenv);
-
-	if (interrupted) {
-		(void)signal(SIGINT, SIG_DFL);
-		(void)raise(SIGINT);
-		/* NOTREACHED */
+	if ((errno = db_appexit(dbenv)) != 0) {
+		warn(NULL);
+		return (1);
 	}
 	return (0);
 }
@@ -218,7 +219,6 @@ btree_stats(dbp)
 	dl("Number of tree duplicate pages.\n", (u_long)sp->bt_dup_pg);
 	dl("Number of tree overflow pages.\n", (u_long)sp->bt_over_pg);
 	dl("Number of pages on the free list.\n", (u_long)sp->bt_free);
-	dl("Number of pages freed for reuse.\n", (u_long)sp->bt_freed);
 	dl("Number of bytes free in tree internal pages",
 	    (u_long)sp->bt_int_pgfree);
 	printf(" (%.0f%% ff).\n", PCT(sp->bt_int_pgfree, sp->bt_int_pg));
@@ -231,17 +231,6 @@ btree_stats(dbp)
 	dl("Number of bytes free in tree overflow pages",
 	    (u_long)sp->bt_over_pgfree);
 	printf(" (%.0f%% ff).\n", PCT(sp->bt_over_pgfree, sp->bt_over_pg));
-	dl("Number of bytes saved by prefix compression.\n",
-	    (u_long)sp->bt_pfxsaved);
-	dl("Total number of tree page splits.\n", (u_long)sp->bt_split);
-	dl("Number of root page splits.\n", (u_long)sp->bt_rootsplit);
-	dl("Number of fast splits.\n", (u_long)sp->bt_fastsplit);
-	dl("Number of hits in tree fast-insert code.\n",
-	    (u_long)sp->bt_cache_hit);
-	dl("Number of misses in tree fast-insert code.\n",
-	    (u_long)sp->bt_cache_miss);
-	dl("Number of keys added.\n", (u_long)sp->bt_added);
-	dl("Number of keys deleted.\n", (u_long)sp->bt_deleted);
 }
 
 /*
@@ -610,16 +599,17 @@ argcheck(arg, ok_args)
 }
 
 /*
- * oninit --
- *	Interrupt signal handler.
+ * nosig --
+ *	We don't want to be interrupted.
  */
 void
-onint(signo)
-	int signo;
+nosig()
 {
-	COMPQUIET(signo, 0);
-
-	interrupted = 1;
+#ifdef SIGHUP
+	(void)signal(SIGHUP, SIG_IGN);
+#endif
+	(void)signal(SIGINT, SIG_IGN);
+	(void)signal(SIGTERM, SIG_IGN);
 }
 
 void
diff --git a/db2/txn/txn.c b/db2/txn/txn.c
index 4f3ffd8ed2..aa0b3652ce 100644
--- a/db2/txn/txn.c
+++ b/db2/txn/txn.c
@@ -43,7 +43,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)txn.c	10.58 (Sleepycat) 5/31/98";
+static const char sccsid[] = "@(#)txn.c	10.66 (Sleepycat) 1/3/99";
 #endif /* not lint */
 
 
@@ -66,12 +66,14 @@ static const char sccsid[] = "@(#)txn.c	10.58 (Sleepycat) 5/31/98";
 #include "db_am.h"
 #include "common_ext.h"
 
-static int __txn_check_running __P((const DB_TXN *));
-static int __txn_end __P((DB_TXN *, int));
-static int __txn_grow_region __P((DB_TXNMGR *));
-static int __txn_init __P((DB_TXNREGION *));
-static int __txn_undo __P((DB_TXN *));
-static int __txn_validate_region __P((DB_TXNMGR *));
+static int  __txn_begin __P((DB_TXN *));
+static int  __txn_check_running __P((const DB_TXN *, TXN_DETAIL **));
+static int  __txn_end __P((DB_TXN *, int));
+static void __txn_freekids __P((DB_TXN *));
+static int  __txn_grow_region __P((DB_TXNMGR *));
+static int  __txn_init __P((DB_TXNREGION *));
+static int  __txn_undo __P((DB_TXN *));
+static int  __txn_validate_region __P((DB_TXNMGR *));
 
 /*
  * This file contains the top level routines of the transaction library.
@@ -93,7 +95,10 @@ __txn_init(txn_region)
 	txn_region->magic = DB_TXNMAGIC;
 	txn_region->version = DB_TXNVERSION;
 	txn_region->last_txnid = TXN_MINIMUM;
-	/* XXX If we ever do more types of locking and logging, this changes. */
+	/*
+	 * XXX
+	 * If we ever do more types of locking and logging, this changes.
+	 */
 	txn_region->logtype = 0;
 	txn_region->locktype = 0;
 	txn_region->time_ckp = now;
@@ -132,10 +137,8 @@ txn_open(path, flags, mode, dbenv, mgrpp)
 	maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 20;
 
 	/* Now, create the transaction manager structure and set its fields. */
-	if ((tmgrp = (DB_TXNMGR *)__db_calloc(1, sizeof(DB_TXNMGR))) == NULL) {
-		__db_err(dbenv, "txn_open: %s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(DB_TXNMGR), &tmgrp)) != 0)
+		return (ret);
 
 	/* Initialize the transaction manager structure. */
 	tmgrp->mutexp = NULL;
@@ -151,7 +154,7 @@ txn_open(path, flags, mode, dbenv, mgrpp)
 	if (path == NULL)
 		tmgrp->reginfo.path = NULL;
 	else
-		if ((tmgrp->reginfo.path = (char *)__db_strdup(path)) == NULL)
+		if ((ret = __os_strdup(path, &tmgrp->reginfo.path)) != 0)
 			goto err;
 	tmgrp->reginfo.file = DEFAULT_TXN_FILE;
 	tmgrp->reginfo.mode = mode;
@@ -207,36 +210,96 @@ err:	if (tmgrp->reginfo.addr != NULL) {
 	}
 
 	if (tmgrp->reginfo.path != NULL)
-		FREES(tmgrp->reginfo.path);
-	FREE(tmgrp, sizeof(*tmgrp));
+		__os_freestr(tmgrp->reginfo.path);
+	__os_free(tmgrp, sizeof(*tmgrp));
 	return (ret);
 }
 
 /*
- * Internally, we use TXN_DETAIL structures, but we allocate and return
- * DB_TXN structures that provide access to the transaction ID and the
- * offset in the transaction region of the TXN_DETAIL structure.
+ * __txn_panic --
+ *	Panic a transaction region.
+ *
+ * PUBLIC: void __txn_panic __P((DB_ENV *));
+ */
+void
+__txn_panic(dbenv)
+	DB_ENV *dbenv;
+{
+	if (dbenv->tx_info != NULL)
+		dbenv->tx_info->region->hdr.panic = 1;
+}
+
+/*
+ * txn_begin --
+ *	This is a wrapper to the actual begin process.  Normal txn_begin()
+ * allocates a DB_TXN structure for the caller, while __txn_xa_begin() does
+ * not.  Other than that, both call into the common __txn_begin() code.
+ *
+ * Internally, we use TXN_DETAIL structures, but the DB_TXN structure
+ * provides access to the transaction ID and the offset in the transaction
+ * region of the TXN_DETAIL structure.
  */
 int
 txn_begin(tmgrp, parent, txnpp)
 	DB_TXNMGR *tmgrp;
-	DB_TXN *parent;
-	DB_TXN **txnpp;
+	DB_TXN *parent, **txnpp;
 {
-	DB_LSN begin_lsn;
-	DB_TXN *retp;
-	TXN_DETAIL *txnp;
-	size_t off;
-	u_int32_t id;
+	DB_TXN *txn;
 	int ret;
 
-	txnp = NULL;
-	*txnpp = NULL;
+	TXN_PANIC_CHECK(tmgrp);
 
-	if ((retp = (DB_TXN *)__db_malloc(sizeof(DB_TXN))) == NULL) {
-		__db_err(tmgrp->dbenv, "txn_begin : %s", strerror(ENOMEM));
-		return (ENOMEM);
+	if ((ret = __os_calloc(1, sizeof(DB_TXN), &txn)) != 0)
+		return (ret);
+
+	txn->parent = parent;
+	TAILQ_INIT(&txn->kids);
+	txn->mgrp = tmgrp;
+	txn->flags = TXN_MALLOC;
+	if ((ret = __txn_begin(txn)) != 0) {
+		__os_free(txn, sizeof(DB_TXN));
+		txn = NULL;
 	}
+	if (txn != NULL && parent != NULL)
+		TAILQ_INSERT_HEAD(&parent->kids, txn, klinks);
+	*txnpp = txn;
+	return (ret);
+}
+
+/*
+ * __txn_xa_begin --
+ *	XA version of txn_begin.
+ *
+ * PUBLIC: int __txn_xa_begin __P((DB_ENV *, DB_TXN *));
+ */
+int
+__txn_xa_begin(dbenv, txn)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+{
+	TXN_PANIC_CHECK(dbenv->tx_info);
+
+	memset(txn, 0, sizeof(DB_TXN));
+
+	txn->mgrp = dbenv->tx_info;
+
+	return (__txn_begin(txn));
+}
+
+/*
+ * __txn_begin --
+ *	Normal DB version of txn_begin.
+ */
+static int
+__txn_begin(txn)
+	DB_TXN *txn;
+{
+	DB_LSN begin_lsn;
+	DB_TXNMGR *mgr;
+	TXN_DETAIL *td;
+	size_t off;
+	u_int32_t id;
+	int ret;
 
 	/*
 	 * We do not have to write begin records (and if we do not, then we
@@ -244,65 +307,67 @@ txn_begin(tmgrp, parent, txnpp)
 	 * we do need to find the current LSN so that we can store it in the
 	 * transaction structure, so we can know where to take checkpoints.
 	 */
-	if (tmgrp->dbenv->lg_info != NULL && (ret =
-	    log_put(tmgrp->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0)
+	mgr = txn->mgrp;
+	if (mgr->dbenv->lg_info != NULL && (ret =
+	    log_put(mgr->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0)
 		goto err2;
 
-	LOCK_TXNREGION(tmgrp);
+	LOCK_TXNREGION(mgr);
 
 	/* Make sure that last_txnid is not going to wrap around. */
-	if (tmgrp->region->last_txnid == TXN_INVALID) {
-		__db_err(tmgrp->dbenv, "txn_begin: %s  %s",
+	if (mgr->region->last_txnid == TXN_INVALID) {
+		__db_err(mgr->dbenv, "txn_begin: %s  %s",
 		    "Transaction ID wrapping.",
 		    "Snapshot your database and start a new log.");
 		ret = EINVAL;
 		goto err1;
 	}
 
-	if ((ret = __txn_validate_region(tmgrp)) != 0)
+	if ((ret = __txn_validate_region(mgr)) != 0)
 		goto err1;
 
 	/* Allocate a new transaction detail structure. */
-	if ((ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp)) != 0
-	    && ret == ENOMEM && (ret = __txn_grow_region(tmgrp)) == 0)
-	    	ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp);
+	if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0
+	    && ret == ENOMEM && (ret = __txn_grow_region(mgr)) == 0)
+	    	ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td);
 	if (ret != 0)
 		goto err1;
 
 	/* Place transaction on active transaction list. */
-	SH_TAILQ_INSERT_HEAD(&tmgrp->region->active_txn,
-	    txnp, links, __txn_detail);
-
-	id = ++tmgrp->region->last_txnid;
-	tmgrp->region->nbegins++;
-
-	txnp->txnid = id;
-	txnp->begin_lsn = begin_lsn;
-	ZERO_LSN(txnp->last_lsn);
-	txnp->last_lock = 0;
-	txnp->status = TXN_RUNNING;
-	off = (u_int8_t *)txnp - (u_int8_t *)tmgrp->region;
-	UNLOCK_TXNREGION(tmgrp);
+	SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail);
+
+	id = ++mgr->region->last_txnid;
+	++mgr->region->nbegins;
+
+	td->txnid = id;
+	td->begin_lsn = begin_lsn;
+	ZERO_LSN(td->last_lsn);
+	td->last_lock = 0;
+	td->status = TXN_RUNNING;
+	if (txn->parent != NULL)
+		td->parent = txn->parent->off;
+	else
+		td->parent = 0;
 
-	ZERO_LSN(retp->last_lsn);
-	retp->txnid = id;
-	retp->parent = parent;
-	retp->mgrp = tmgrp;
-	retp->off = off;
+	off = (u_int8_t *)td - (u_int8_t *)mgr->region;
+	UNLOCK_TXNREGION(mgr);
+
+	ZERO_LSN(txn->last_lsn);
+	txn->txnid = id;
+	txn->off = off;
 
-	LOCK_TXNTHREAD(tmgrp);
-	TAILQ_INSERT_TAIL(&tmgrp->txn_chain, retp, links);
-	UNLOCK_TXNTHREAD(tmgrp);
+	if (F_ISSET(txn, TXN_MALLOC)) {
+		LOCK_TXNTHREAD(mgr);
+		TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
+		UNLOCK_TXNTHREAD(mgr);
+	}
 
-	*txnpp = retp;
 	return (0);
 
-err1:	UNLOCK_TXNREGION(tmgrp);
+err1:	UNLOCK_TXNREGION(mgr);
 
-err2:	__db_free(retp);
-	return (ret);
+err2:	return (ret);
 }
-
 /*
  * txn_commit --
  *	Commit a transaction.
@@ -312,21 +377,43 @@ txn_commit(txnp)
 	DB_TXN *txnp;
 {
 	DB_LOG *logp;
+	DB_TXNMGR *mgr;
 	int ret;
 
-	if ((ret = __txn_check_running(txnp)) != 0)
+	mgr = txnp->mgrp;
+
+	TXN_PANIC_CHECK(mgr);
+	if ((ret = __txn_check_running(txnp, NULL)) != 0)
 		return (ret);
 
 	/*
 	 * If there are any log records, write a log record and sync
-	 * the log, else do no log writes.
+	 * the log, else do no log writes.  If the commit is for a child
+	 * transaction, we do not need to commit the child synchronously
+	 * since if its parent aborts, it will abort too and its parent
+	 * (or ultimate ancestor) will write synchronously.
 	 */
-	if ((logp = txnp->mgrp->dbenv->lg_info) != NULL &&
-	    !IS_ZERO_LSN(txnp->last_lsn) &&
-	    (ret = __txn_regop_log(logp, txnp, &txnp->last_lsn,
-	    F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH,
-	    TXN_COMMIT)) != 0)
-		return (ret);
+	if ((logp = mgr->dbenv->lg_info) != NULL &&
+	    !IS_ZERO_LSN(txnp->last_lsn)) {
+		if (txnp->parent == NULL)
+	    		ret = __txn_regop_log(logp, txnp, &txnp->last_lsn,
+			    F_ISSET(mgr, DB_TXN_NOSYNC) ? 0 : DB_FLUSH,
+			    TXN_COMMIT);
+		else
+	    		ret = __txn_child_log(logp, txnp, &txnp->last_lsn, 0,
+			    TXN_COMMIT, txnp->parent->txnid);
+		if (ret != 0)
+			return (ret);
+	}
+
+	/*
+	 * If this is the senior ancestor (i.e., it has no parent), then we
+	 * can release all the child transactions since everyone is committing.
+	 * Then we can release this transaction.  If this is not the ultimate
+	 * ancestor, then we can neither free it nor its children.
+	 */
+	if (txnp->parent == NULL)
+		__txn_freekids(txnp);
 
 	return (__txn_end(txnp, 1));
 }
@@ -340,10 +427,17 @@ txn_abort(txnp)
 	DB_TXN *txnp;
 {
 	int ret;
+	DB_TXN *kids;
 
-	if ((ret = __txn_check_running(txnp)) != 0)
+	TXN_PANIC_CHECK(txnp->mgrp);
+	if ((ret = __txn_check_running(txnp, NULL)) != 0)
 		return (ret);
 
+	for (kids = TAILQ_FIRST(&txnp->kids);
+	    kids != NULL;
+	    kids = TAILQ_FIRST(&txnp->kids))
+		txn_abort(kids);
+
 	if ((ret = __txn_undo(txnp)) != 0) {
 		__db_err(txnp->mgrp->dbenv,
 		    "txn_abort: Log undo failed %s", strerror(ret));
@@ -353,30 +447,45 @@ txn_abort(txnp)
 }
 
 /*
- * Flush the log so a future commit is guaranteed to succeed.
+ * txn_prepare --
+ *	Flush the log so a future commit is guaranteed to succeed.
  */
 int
 txn_prepare(txnp)
 	DB_TXN *txnp;
 {
-	TXN_DETAIL *tp;
+	DBT xid;
+	DB_ENV *dbenv;
+	TXN_DETAIL *td;
 	int ret;
 
-	if ((ret = __txn_check_running(txnp)) != 0)
+	if ((ret = __txn_check_running(txnp, &td)) != 0)
 		return (ret);
 
-	if (txnp->mgrp->dbenv->lg_info != NULL) {
-		if ((ret = log_flush(txnp->mgrp->dbenv->lg_info,
-		    &txnp->last_lsn)) != 0)
-			__db_err(txnp->mgrp->dbenv,
-			    "txn_prepare: log_flush failed %s\n",
-			    strerror(ret));
+	dbenv = txnp->mgrp->dbenv;
+	memset(&xid, 0, sizeof(xid));
+	xid.data = td->xid;
+	/*
+	 * We indicate that a transaction is an XA transaction by putting
+	 * a valid size in the xid.size field.  XA requires that the transaction
+	 * be either ENDED or SUSPENDED when prepare is called, so we know
+	 * that if the xa_status isn't in one of those states, but we are
+	 * calling prepare that we are not an XA transaction.
+	 */
+	xid.size =
+	    td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED ?
+	    0 : sizeof(td->xid);
+	if (dbenv->lg_info != NULL &&
+	    (ret = __txn_xa_regop_log(dbenv->lg_info, txnp, &txnp->last_lsn,
+	    F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_PREPARE,
+	    &xid, td->format, td->gtrid, td->bqual, &td->begin_lsn)) != 0) {
+		__db_err(dbenv,
+		    "txn_prepare: log_write failed %s\n", strerror(ret));
 		return (ret);
 	}
 
 	LOCK_TXNTHREAD(txnp->mgrp);
-	tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
-	tp->status = TXN_PREPARED;
+	td->status = TXN_PREPARED;
 	UNLOCK_TXNTHREAD(txnp->mgrp);
 	return (ret);
 }
@@ -402,6 +511,8 @@ txn_close(tmgrp)
 	DB_TXN *txnp;
 	int ret, t_ret;
 
+	TXN_PANIC_CHECK(tmgrp);
+
 	ret = 0;
 
 	/*
@@ -431,8 +542,8 @@ txn_close(tmgrp)
 		ret = t_ret;
 
 	if (tmgrp->reginfo.path != NULL)
-		FREES(tmgrp->reginfo.path);
-	FREE(tmgrp, sizeof(*tmgrp));
+		__os_freestr(tmgrp->reginfo.path);
+	__os_free(tmgrp, sizeof(*tmgrp));
 
 	return (ret);
 }
@@ -453,12 +564,12 @@ txn_unlink(path, force, dbenv)
 	memset(&reginfo, 0, sizeof(reginfo));
 	reginfo.dbenv = dbenv;
 	reginfo.appname = DB_APP_NONE;
-	if (path != NULL && (reginfo.path = (char *)__db_strdup(path)) == NULL)
-		return (ENOMEM);
+	if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0)
+		return (ret);
 	reginfo.file = DEFAULT_TXN_FILE;
 	ret = __db_runlink(&reginfo, force);
 	if (reginfo.path != NULL)
-		FREES(reginfo.path);
+		__os_freestr(reginfo.path);
 	return (ret);
 }
 
@@ -468,16 +579,23 @@ txn_unlink(path, force, dbenv)
  * Return 0 if the txnp is reasonable, otherwise returns EINVAL.
  */
 static int
-__txn_check_running(txnp)
+__txn_check_running(txnp, tdp)
 	const DB_TXN *txnp;
+	TXN_DETAIL **tdp;
 {
 	TXN_DETAIL *tp;
 
 	tp = NULL;
 	if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) {
 		tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
-		if (tp->status != TXN_RUNNING)
+		/*
+		 * Child transactions could be marked committed which is OK.
+		 */
+		if (tp->status != TXN_RUNNING &&
+		    tp->status != TXN_PREPARED && tp->status != TXN_COMMITTED)
 			tp = NULL;
+		if (tdp != NULL)
+			*tdp = tp;
 	}
 
 	return (tp == NULL ? EINVAL : 0);
@@ -488,25 +606,22 @@ __txn_end(txnp, is_commit)
 	DB_TXN *txnp;
 	int is_commit;
 {
+	DB_LOCKREQ request;
 	DB_TXNMGR *mgr;
 	TXN_DETAIL *tp;
-	DB_LOCKREQ request;
-	int ret;
 	u_int32_t locker;
+	int ret;
 
 	mgr = txnp->mgrp;
 
-	LOCK_TXNTHREAD(mgr);
-	TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
-	UNLOCK_TXNTHREAD(mgr);
-
 	/* Release the locks. */
 	locker = txnp->txnid;
-	request.op = DB_LOCK_PUT_ALL;
+	request.op = txnp->parent == NULL ||
+	    is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT;
 
 	if (mgr->dbenv->lk_info) {
-		ret = lock_vec(mgr->dbenv->lk_info, locker, 0,
-		    &request, 1, NULL);
+		ret =
+		    lock_tvec(mgr->dbenv->lk_info, txnp, 0, &request, 1, NULL);
 		if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) {
 			__db_err(mgr->dbenv, "%s: release locks failed %s",
 			    is_commit ? "txn_commit" : "txn_abort",
@@ -517,16 +632,44 @@ __txn_end(txnp, is_commit)
 
 	/* End the transaction. */
 	LOCK_TXNREGION(mgr);
+
+	/*
+	 * Child transactions that are committing cannot be released until
+	 * the parent commits, since the parent may abort, causing the child
+	 * to abort as well.
+	 */
 	tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off);
-	SH_TAILQ_REMOVE(&mgr->region->active_txn, tp, links, __txn_detail);
-	__db_shalloc_free(mgr->mem, tp);
+	if (txnp->parent == NULL || !is_commit) {
+		SH_TAILQ_REMOVE(&mgr->region->active_txn,
+		    tp, links, __txn_detail);
+
+		__db_shalloc_free(mgr->mem, tp);
+	} else
+		tp->status = is_commit ? TXN_COMMITTED : TXN_ABORTED;
+
 	if (is_commit)
 		mgr->region->ncommits++;
 	else
 		mgr->region->naborts++;
+
 	UNLOCK_TXNREGION(mgr);
 
-	FREE(txnp, sizeof(*txnp));
+	/*
+	 * If the transaction aborted, we can remove it from its parent links.
+	 * If it committed, then we need to leave it on, since the parent can
+	 * still abort.
+	 */
+	if (txnp->parent != NULL && !is_commit)
+		TAILQ_REMOVE(&txnp->parent->kids, txnp, klinks);
+
+	/* Free the space. */
+	if (F_ISSET(txnp, TXN_MALLOC) && (txnp->parent == NULL || !is_commit)) {
+		LOCK_TXNTHREAD(mgr);
+		TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
+		UNLOCK_TXNTHREAD(mgr);
+
+		__os_free(txnp, sizeof(*txnp));
+	}
 
 	return (0);
 }
@@ -571,7 +714,7 @@ __txn_undo(txnp)
 			ret =
 			    mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL);
 			if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) {
-				__db_free(rdbt.data);
+				__os_free(rdbt.data, rdbt.size);
 				rdbt.data = NULL;
 			}
 		}
@@ -597,13 +740,15 @@ txn_checkpoint(mgr, kbytes, minutes)
 	const DB_TXNMGR *mgr;
 	u_int32_t kbytes, minutes;
 {
-	TXN_DETAIL *txnp;
-	DB_LSN ckp_lsn, last_ckp;
 	DB_LOG *dblp;
-	u_int32_t kbytes_written;
+	DB_LSN ckp_lsn, sync_lsn, last_ckp;
+	TXN_DETAIL *txnp;
 	time_t last_ckp_time, now;
+	u_int32_t kbytes_written;
 	int ret;
 
+	TXN_PANIC_CHECK(mgr);
+
 	/*
 	 * Check if we need to run recovery.
 	 */
@@ -672,8 +817,13 @@ do_ckp:
 	mgr->region->pending_ckp = ckp_lsn;
 	UNLOCK_TXNREGION(mgr);
 
+	/*
+	 * memp_sync may change the lsn you pass it, so don't pass it
+	 * the actual ckp_lsn, pass it a temp instead.
+	 */
+	sync_lsn = ckp_lsn;
 	if (mgr->dbenv->mp_info != NULL &&
-	    (ret = memp_sync(mgr->dbenv->mp_info, &ckp_lsn)) != 0) {
+	    (ret = memp_sync(mgr->dbenv->mp_info, &sync_lsn)) != 0) {
 		/*
 		 * ret == DB_INCOMPLETE means that there are still buffers to
 		 * flush, the checkpoint is not complete.  Wait and try again.
@@ -776,6 +926,9 @@ txn_stat(mgr, statp, db_malloc)
 	TXN_DETAIL *txnp;
 	size_t nbytes;
 	u_int32_t nactive, ndx;
+	int ret;
+
+	TXN_PANIC_CHECK(mgr);
 
 	LOCK_TXNREGION(mgr);
 	nactive = mgr->region->nbegins -
@@ -787,13 +940,8 @@ txn_stat(mgr, statp, db_malloc)
 	 * that have been created since we unlocked the region.
 	 */
 	nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200);
-	if (db_malloc == NULL)
-		stats = (DB_TXN_STAT *)__db_malloc(nbytes);
-	else
-		stats = (DB_TXN_STAT *)db_malloc(nbytes);
-
-	if (stats == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(nbytes, db_malloc, &stats)) != 0)
+		return (ret);
 
 	LOCK_TXNREGION(mgr);
 	stats->st_last_txnid = mgr->region->last_txnid;
@@ -831,3 +979,68 @@ txn_stat(mgr, statp, db_malloc)
 	*statp = stats;
 	return (0);
 }
+
+static void
+__txn_freekids(txnp)
+	DB_TXN *txnp;
+{
+	DB_TXNMGR *mgr;
+	TXN_DETAIL *tp;
+	DB_TXN *kids;
+
+	mgr = txnp->mgrp;
+
+	for (kids = TAILQ_FIRST(&txnp->kids);
+	    kids != NULL;
+	    kids = TAILQ_FIRST(&txnp->kids)) {
+		/* Free any children of this transaction. */
+		__txn_freekids(kids);
+
+		/* Free the transaction detail in the region. */
+		LOCK_TXNREGION(mgr);
+		tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + kids->off);
+		SH_TAILQ_REMOVE(&mgr->region->active_txn,
+		    tp, links, __txn_detail);
+
+		__db_shalloc_free(mgr->mem, tp);
+		UNLOCK_TXNREGION(mgr);
+
+		/* Now remove from its parent. */
+		TAILQ_REMOVE(&txnp->kids, kids, klinks);
+		if (F_ISSET(txnp, TXN_MALLOC)) {
+			LOCK_TXNTHREAD(mgr);
+			TAILQ_REMOVE(&mgr->txn_chain, kids, links);
+			UNLOCK_TXNTHREAD(mgr);
+			__os_free(kids, sizeof(*kids));
+		}
+	}
+}
+
+/*
+ * __txn_is_ancestor --
+ * 	Determine if a transaction is an ancestor of another transaction.
+ * This is used during lock promotion when we do not have the per-process
+ * data structures that link parents together.  Instead, we'll have to
+ * follow the links in the transaction region.
+ *
+ * PUBLIC: int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t));
+ */
+int
+__txn_is_ancestor(mgr, hold_off, req_off)
+	DB_TXNMGR *mgr;
+	size_t hold_off, req_off;
+{
+	TXN_DETAIL *hold_tp, *req_tp;
+
+	hold_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + hold_off);
+	req_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + req_off);
+
+	while (req_tp->parent != 0) {
+		req_tp =
+		    (TXN_DETAIL *)((u_int8_t *)mgr->region + req_tp->parent);
+		if (req_tp->txnid == hold_tp->txnid)
+			return (1);
+	}
+
+	return (0);
+}
diff --git a/db2/txn/txn.src b/db2/txn/txn.src
index 04809b69d6..c9614f6d6b 100644
--- a/db2/txn/txn.src
+++ b/db2/txn/txn.src
@@ -4,26 +4,52 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)txn.src	10.3 (Sleepycat) 4/10/98
+ *	@(#)txn.src	10.6 (Sleepycat) 1/3/99
  */
 
 PREFIX	txn
 
 /*
- * Everything except for checkpointing takes the same logging routine.
+ * This is the standard log operation for commit.
  */
 BEGIN	regop
 ARG	opcode		u_int32_t	lu
 END
 
 /*
- * This is the checkpoint record. It contains the lsn that the checkpoint
- * guarantees and a pointer to the last checkpoint so that we can walk
- * backwards by checkpoint.
+ * This is the checkpoint record.  It contains the lsn that the checkpoint
+ * guarantees and a pointer to the last checkpoint so we can walk backwards
+ * by checkpoint.
+ *
  * ckp_lsn:
+ *	The lsn in the log of the most recent point at which all begun
+ *	transactions have been aborted.  This is the point for which
+ *	the checkpoint is relevant.
  * last_ckp:
+ *	The previous checkpoint.
  */
 BEGIN	ckp
 POINTER	ckp_lsn		DB_LSN *	lu
 POINTER	last_ckp	DB_LSN *	lu
 END
+
+/*
+ * This is the standard log operation for prepare (since right now
+ * we only use prepare in an XA environment).
+ */
+BEGIN	xa_regop
+ARG	opcode		u_int32_t	lu
+DBT	xid		DBT		s
+ARG	formatID	int32_t		ld
+ARG	gtrid		u_int32_t	u
+ARG	bqual		u_int32_t	u
+POINTER	begin_lsn	DB_LSN *	lu		
+END
+
+/*
+ * This is the log operation for a child commit.
+ */
+BEGIN	child
+ARG	opcode		u_int32_t	lu
+ARG	parent		u_int32_t	lu
+END
diff --git a/db2/txn/txn_auto.c b/db2/txn/txn_auto.c
index f03a52991f..e6d431f089 100644
--- a/db2/txn/txn_auto.c
+++ b/db2/txn/txn_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "txn.h"
@@ -37,15 +36,14 @@ int __txn_regop_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_txn_regop;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
 	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
 	    + sizeof(opcode);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -63,7 +61,7 @@ int __txn_regop_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -101,7 +99,7 @@ __txn_regop_print(notused1, dbtp, lsnp, notused2, notused3)
 	    (u_long)argp->prev_lsn.offset);
 	printf("\topcode: %lu\n", (u_long)argp->opcode);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -115,11 +113,12 @@ __txn_regop_read(recbuf, argpp)
 {
 	__txn_regop_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__txn_regop_args *)__db_malloc(sizeof(__txn_regop_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__txn_regop_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -157,16 +156,15 @@ int __txn_ckp_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_txn_ckp;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
 	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
 	    + sizeof(*ckp_lsn)
 	    + sizeof(*last_ckp);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -192,7 +190,7 @@ int __txn_ckp_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -233,7 +231,7 @@ __txn_ckp_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tlast_ckp: [%lu][%lu]\n",
 	    (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -247,11 +245,12 @@ __txn_ckp_read(recbuf, argpp)
 {
 	__txn_ckp_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__txn_ckp_args *)__db_malloc(sizeof(__txn_ckp_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__txn_ckp_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -269,6 +268,310 @@ __txn_ckp_read(recbuf, argpp)
 }
 
 /*
+ * PUBLIC: int __txn_xa_regop_log
+ * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC:     u_int32_t, const DBT *, int32_t, u_int32_t,
+ * PUBLIC:     u_int32_t, DB_LSN *));
+ */
+int __txn_xa_regop_log(logp, txnid, ret_lsnp, flags,
+	opcode, xid, formatID, gtrid, bqual, begin_lsn)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	const DBT *xid;
+	int32_t formatID;
+	u_int32_t gtrid;
+	u_int32_t bqual;
+	DB_LSN * begin_lsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t zero;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_txn_xa_regop;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		ZERO_LSN(null_lsn);
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(opcode)
+	    + sizeof(u_int32_t) + (xid == NULL ? 0 : xid->size)
+	    + sizeof(formatID)
+	    + sizeof(gtrid)
+	    + sizeof(bqual)
+	    + sizeof(*begin_lsn);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &opcode, sizeof(opcode));
+	bp += sizeof(opcode);
+	if (xid == NULL) {
+		zero = 0;
+		memcpy(bp, &zero, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		memcpy(bp, &xid->size, sizeof(xid->size));
+		bp += sizeof(xid->size);
+		memcpy(bp, xid->data, xid->size);
+		bp += xid->size;
+	}
+	memcpy(bp, &formatID, sizeof(formatID));
+	bp += sizeof(formatID);
+	memcpy(bp, &gtrid, sizeof(gtrid));
+	bp += sizeof(gtrid);
+	memcpy(bp, &bqual, sizeof(bqual));
+	bp += sizeof(bqual);
+	if (begin_lsn != NULL)
+		memcpy(bp, begin_lsn, sizeof(*begin_lsn));
+	else
+		memset(bp, 0, sizeof(*begin_lsn));
+	bp += sizeof(*begin_lsn);
+#ifdef DIAGNOSTIC
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	if (txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	__os_free(logrec.data, 0);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_print
+ * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__txn_xa_regop_print(notused1, dbtp, lsnp, notused2, notused3)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused2;
+	void *notused3;
+{
+	__txn_xa_regop_args *argp;
+	u_int32_t i;
+	u_int ch;
+	int ret;
+
+	i = 0;
+	ch = 0;
+	notused1 = NULL;
+	notused2 = 0;
+	notused3 = NULL;
+
+	if ((ret = __txn_xa_regop_read(dbtp->data, &argp)) != 0)
+		return (ret);
+	printf("[%lu][%lu]txn_xa_regop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file,
+	    (u_long)lsnp->offset,
+	    (u_long)argp->type,
+	    (u_long)argp->txnid->txnid,
+	    (u_long)argp->prev_lsn.file,
+	    (u_long)argp->prev_lsn.offset);
+	printf("\topcode: %lu\n", (u_long)argp->opcode);
+	printf("\txid: ");
+	for (i = 0; i < argp->xid.size; i++) {
+		ch = ((u_int8_t *)argp->xid.data)[i];
+		if (isprint(ch) || ch == 0xa)
+			putchar(ch);
+		else
+			printf("%#x ", ch);
+	}
+	printf("\n");
+	printf("\tformatID: %ld\n", (long)argp->formatID);
+	printf("\tgtrid: %u\n", argp->gtrid);
+	printf("\tbqual: %u\n", argp->bqual);
+	printf("\tbegin_lsn: [%lu][%lu]\n",
+	    (u_long)argp->begin_lsn.file, (u_long)argp->begin_lsn.offset);
+	printf("\n");
+	__os_free(argp, 0);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_read __P((void *, __txn_xa_regop_args **));
+ */
+int
+__txn_xa_regop_read(recbuf, argpp)
+	void *recbuf;
+	__txn_xa_regop_args **argpp;
+{
+	__txn_xa_regop_args *argp;
+	u_int8_t *bp;
+	int ret;
+
+	ret = __os_malloc(sizeof(__txn_xa_regop_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
+	argp->txnid = (DB_TXN *)&argp[1];
+	bp = recbuf;
+	memcpy(&argp->type, bp, sizeof(argp->type));
+	bp += sizeof(argp->type);
+	memcpy(&argp->txnid->txnid,  bp, sizeof(argp->txnid->txnid));
+	bp += sizeof(argp->txnid->txnid);
+	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(&argp->opcode, bp, sizeof(argp->opcode));
+	bp += sizeof(argp->opcode);
+	memcpy(&argp->xid.size, bp, sizeof(u_int32_t));
+	bp += sizeof(u_int32_t);
+	argp->xid.data = bp;
+	bp += argp->xid.size;
+	memcpy(&argp->formatID, bp, sizeof(argp->formatID));
+	bp += sizeof(argp->formatID);
+	memcpy(&argp->gtrid, bp, sizeof(argp->gtrid));
+	bp += sizeof(argp->gtrid);
+	memcpy(&argp->bqual, bp, sizeof(argp->bqual));
+	bp += sizeof(argp->bqual);
+	memcpy(&argp->begin_lsn, bp,  sizeof(argp->begin_lsn));
+	bp += sizeof(argp->begin_lsn);
+	*argpp = argp;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __txn_child_log
+ * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+ * PUBLIC:     u_int32_t, u_int32_t));
+ */
+int __txn_child_log(logp, txnid, ret_lsnp, flags,
+	opcode, parent)
+	DB_LOG *logp;
+	DB_TXN *txnid;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	u_int32_t parent;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn;
+	u_int32_t rectype, txn_num;
+	int ret;
+	u_int8_t *bp;
+
+	rectype = DB_txn_child;
+	txn_num = txnid == NULL ? 0 : txnid->txnid;
+	if (txnid == NULL) {
+		ZERO_LSN(null_lsn);
+		lsnp = &null_lsn;
+	} else
+		lsnp = &txnid->last_lsn;
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(opcode)
+	    + sizeof(parent);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
+
+	bp = logrec.data;
+	memcpy(bp, &rectype, sizeof(rectype));
+	bp += sizeof(rectype);
+	memcpy(bp, &txn_num, sizeof(txn_num));
+	bp += sizeof(txn_num);
+	memcpy(bp, lsnp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(bp, &opcode, sizeof(opcode));
+	bp += sizeof(opcode);
+	memcpy(bp, &parent, sizeof(parent));
+	bp += sizeof(parent);
+#ifdef DIAGNOSTIC
+	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
+		fprintf(stderr, "Error in log record length");
+#endif
+	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
+	if (txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+	__os_free(logrec.data, 0);
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_child_print
+ * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__txn_child_print(notused1, dbtp, lsnp, notused2, notused3)
+	DB_LOG *notused1;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int notused2;
+	void *notused3;
+{
+	__txn_child_args *argp;
+	u_int32_t i;
+	u_int ch;
+	int ret;
+
+	i = 0;
+	ch = 0;
+	notused1 = NULL;
+	notused2 = 0;
+	notused3 = NULL;
+
+	if ((ret = __txn_child_read(dbtp->data, &argp)) != 0)
+		return (ret);
+	printf("[%lu][%lu]txn_child: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+	    (u_long)lsnp->file,
+	    (u_long)lsnp->offset,
+	    (u_long)argp->type,
+	    (u_long)argp->txnid->txnid,
+	    (u_long)argp->prev_lsn.file,
+	    (u_long)argp->prev_lsn.offset);
+	printf("\topcode: %lu\n", (u_long)argp->opcode);
+	printf("\tparent: %lu\n", (u_long)argp->parent);
+	printf("\n");
+	__os_free(argp, 0);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __txn_child_read __P((void *, __txn_child_args **));
+ */
+int
+__txn_child_read(recbuf, argpp)
+	void *recbuf;
+	__txn_child_args **argpp;
+{
+	__txn_child_args *argp;
+	u_int8_t *bp;
+	int ret;
+
+	ret = __os_malloc(sizeof(__txn_child_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
+	argp->txnid = (DB_TXN *)&argp[1];
+	bp = recbuf;
+	memcpy(&argp->type, bp, sizeof(argp->type));
+	bp += sizeof(argp->type);
+	memcpy(&argp->txnid->txnid,  bp, sizeof(argp->txnid->txnid));
+	bp += sizeof(argp->txnid->txnid);
+	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+	bp += sizeof(DB_LSN);
+	memcpy(&argp->opcode, bp, sizeof(argp->opcode));
+	bp += sizeof(argp->opcode);
+	memcpy(&argp->parent, bp, sizeof(argp->parent));
+	bp += sizeof(argp->parent);
+	*argpp = argp;
+	return (0);
+}
+
+/*
  * PUBLIC: int __txn_init_print __P((DB_ENV *));
  */
 int
@@ -283,6 +586,12 @@ __txn_init_print(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __txn_ckp_print, DB_txn_ckp)) != 0)
 		return (ret);
+	if ((ret = __db_add_recovery(dbenv,
+	    __txn_xa_regop_print, DB_txn_xa_regop)) != 0)
+		return (ret);
+	if ((ret = __db_add_recovery(dbenv,
+	    __txn_child_print, DB_txn_child)) != 0)
+		return (ret);
 	return (0);
 }
 
@@ -301,6 +610,12 @@ __txn_init_recover(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __txn_ckp_recover, DB_txn_ckp)) != 0)
 		return (ret);
+	if ((ret = __db_add_recovery(dbenv,
+	    __txn_xa_regop_recover, DB_txn_xa_regop)) != 0)
+		return (ret);
+	if ((ret = __db_add_recovery(dbenv,
+	    __txn_child_recover, DB_txn_child)) != 0)
+		return (ret);
 	return (0);
 }
 
diff --git a/db2/txn/txn_rec.c b/db2/txn/txn_rec.c
index e53dc5f3b7..f21a0f92c8 100644
--- a/db2/txn/txn_rec.c
+++ b/db2/txn/txn_rec.c
@@ -40,7 +40,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)txn_rec.c	10.11 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)txn_rec.c	10.15 (Sleepycat) 1/3/99";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -54,10 +54,18 @@ static const char sccsid[] = "@(#)txn_rec.c	10.11 (Sleepycat) 5/3/98";
 #include "shqueue.h"
 #include "txn.h"
 #include "db_am.h"
+#include "log.h"
+#include "common_ext.h"
 
+static int __txn_restore_txn __P((DB_ENV *, DB_LSN *, __txn_xa_regop_args *));
+
+#define	IS_XA_TXN(R) ((R)->xid.size != 0)	/* Record R carries an XID. */
+	
 /*
  * PUBLIC: int __txn_regop_recover
- * PUBLIC:     __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * These records are only ever written for commits.
  */
 int
 __txn_regop_recover(logp, dbtp, lsnp, redo, info)
@@ -79,24 +87,80 @@ __txn_regop_recover(logp, dbtp, lsnp, redo, info)
 	if ((ret = __txn_regop_read(dbtp->data, &argp)) != 0)
 		return (ret);
 
-	switch (argp->opcode) {
-	case TXN_COMMIT:
-		if (__db_txnlist_find(info,
-		    argp->txnid->txnid) == DB_NOTFOUND)
-			__db_txnlist_add(info, argp->txnid->txnid);
-		break;
-	case TXN_PREPARE:	/* Nothing to do. */
-		/* Call __db_txnlist_find so that we update the maxid. */
-		(void)__db_txnlist_find(info, argp->txnid->txnid);
-		break;
-	default:
+	if (argp->opcode != TXN_COMMIT)
+		ret = EINVAL;
+	else
+		if (__db_txnlist_find(info, argp->txnid->txnid) == DB_NOTFOUND)
+			ret = __db_txnlist_add(info, argp->txnid->txnid);
+
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	__os_free(argp, 0);
+
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_recover
+ * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ *
+ * These records are only ever written for prepares.
+ */
+int
+__txn_xa_regop_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__txn_xa_regop_args *argp;
+	int ret;
+
+#ifdef DEBUG_RECOVER
+	(void)__txn_xa_regop_print(logp, dbtp, lsnp, redo, info);
+#endif
+	COMPQUIET(redo, 0);
+
+	if ((ret = __txn_xa_regop_read(dbtp->data, &argp)) != 0)
+		return (ret);
+
+	if (argp->opcode != TXN_PREPARE)
 		ret = EINVAL;
-		break;
+	else {
+		/*
+		 * Whether we are in XA or not, we need to call
+		 * __db_txnlist_find so that we update the maxid.
+		 * If this is an XA transaction, then we treat
+		 * prepares like commits so that we roll forward to
+		 * a point where we can handle commit/abort calls
+		 * from the TMS.  If this isn't XA, then a prepare
+		 * is a no-op (we only care about the commit), so a
+		 * DB_NOTFOUND from the lookup must not be returned.
+		 */
+		ret = __db_txnlist_find(info, argp->txnid->txnid);
+		if (IS_XA_TXN(argp) && ret == DB_NOTFOUND) {
+			/*
+			 * This is an XA prepared, but not yet committed
+			 * transaction.  We need to add it to the
+			 * transaction list, so that it gets rolled
+			 * forward. We also have to add it to the region's
+			 * internal state so it can be properly aborted
+			 * or recovered.
+			 */
+			ret = __db_txnlist_add(info, argp->txnid->txnid);
+			if (ret == 0)
+				ret = __txn_restore_txn(logp->dbenv,
+				    lsnp, argp);
+		} else
+			ret = 0;
 	}
 
-	*lsnp = argp->prev_lsn;
-	__db_free(argp);
-	return (0);
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	__os_free(argp, 0);
+
+	return (ret);
 }
 
 /*
@@ -130,7 +194,103 @@ __txn_ckp_recover(logp, dbtp, lsnp, redo, info)
 	if (argp->ckp_lsn.file == lsnp->file &&
 	    argp->ckp_lsn.offset == lsnp->offset)
 		__db_txnlist_gen(info, redo ? -1 : 1);
+
 	*lsnp = argp->last_ckp;
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (DB_TXN_CKP);
 }
+
+/*
+ * __txn_child_recover
+ *	Recover a commit record for a child transaction.
+ *
+ * PUBLIC: int __txn_child_recover
+ * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+ */
+int
+__txn_child_recover(logp, dbtp, lsnp, redo, info)
+	DB_LOG *logp;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	int redo;
+	void *info;
+{
+	__txn_child_args *argp;
+	int ret;
+
+#ifdef DEBUG_RECOVER
+	(void)__txn_child_print(logp, dbtp, lsnp, redo, info);
+#endif
+	COMPQUIET(redo, 0);
+	COMPQUIET(logp, NULL);
+
+	if ((ret = __txn_child_read(dbtp->data, &argp)) != 0)
+		return (ret);
+
+	/*
+	 * We count the child as committed only if its parent committed.
+	 * So, if we are not yet in the transaction list, but our parent
+	 * is, then we should go ahead and commit.
+	 */
+	if (argp->opcode != TXN_COMMIT)
+		ret = EINVAL;
+	else
+		if (__db_txnlist_find(info, argp->parent) == 0 &&
+		    __db_txnlist_find(info, argp->txnid->txnid) == DB_NOTFOUND)
+			ret = __db_txnlist_add(info, argp->txnid->txnid);
+
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	__os_free(argp, 0);
+
+	return (ret);
+}
+
+/*
+ * __txn_restore_txn --
+ *	Used only during XA recovery.  If we find any transactions that are
+ * prepared, but not yet committed, then we need to restore the transaction's
+ * state into the shared region, because the TM is going to issue a txn_abort
+ * or txn_commit and we need to respond correctly.
+ *
+ * lsnp is the LSN handed to the recovery routine; it becomes td->last_lsn.
+ * argp is the prepare record (in an appropriate structure).
+ */
+static int
+__txn_restore_txn(dbenv, lsnp, argp)
+	DB_ENV *dbenv;
+	DB_LSN *lsnp;
+	__txn_xa_regop_args *argp;
+{
+	DB_TXNMGR *mgr;
+	TXN_DETAIL *td;
+	int ret;
+
+	if (argp->xid.size == 0)
+		return(0);
+
+	mgr = dbenv->tx_info;
+	LOCK_TXNREGION(mgr);
+	/* Allocate a new transaction detail structure; unlock on failure. */
+	if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0) {
+		UNLOCK_TXNREGION(mgr);
+		return (ret);
+	}
+
+	/* Place transaction on active transaction list. */
+	SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail);
+
+	td->txnid = argp->txnid->txnid;
+	td->begin_lsn = argp->begin_lsn;
+	td->last_lsn = *lsnp;
+	td->last_lock = 0;
+	td->parent = 0;
+	td->status = TXN_PREPARED;
+	td->xa_status = TXN_XA_PREPARED;
+	memcpy(td->xid, argp->xid.data, argp->xid.size);
+	td->bqual = argp->bqual;
+	td->gtrid = argp->gtrid;
+	td->format = argp->formatID;
+	UNLOCK_TXNREGION(mgr);
+	return (0);
+}
diff --git a/db2/xa/xa.c b/db2/xa/xa.c
new file mode 100644
index 0000000000..94a96e7e09
--- /dev/null
+++ b/db2/xa/xa.c
@@ -0,0 +1,682 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+/* XXX Remove the global transaction and hang it off the environment. */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)xa.c	10.4 (Sleepycat) 10/11/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "shqueue.h"
+#include "log.h"
+#include "txn.h"
+#include "db_auto.h"
+#include "db_ext.h"
+#include "db_dispatch.h"
+
+static int  __db_xa_close __P((char *, int, long));
+static int  __db_xa_commit __P((XID *, int, long));
+static int  __db_xa_complete __P((int *, int *, int, long));
+static int  __db_xa_end __P((XID *, int, long));
+static int  __db_xa_forget __P((XID *, int, long));
+static int  __db_xa_open __P((char *, int, long));
+static int  __db_xa_prepare __P((XID *, int, long));
+static int  __db_xa_recover __P((XID *, long, int, long));
+static int  __db_xa_rollback __P((XID *, int, long));
+static int  __db_xa_start __P((XID *, int, long));
+static void __xa_txn_end __P((DB_ENV *));
+static void __xa_txn_init __P((DB_ENV *, TXN_DETAIL *, size_t));
+
+/*
+ * Possible flag values:
+ *	Dynamic registration	0 => no dynamic registration
+ *				TMREGISTER => dynamic registration
+ *	Asynchronous operation	0 => no support for asynchrony
+ *				TMUSEASYNC => async support
+ *	Migration support	0 => migration of transactions across
+ *				     threads is possible
+ *				TMNOMIGRATE => no migration across threads
+ */
+const struct xa_switch_t db_xa_switch = {
+	 "Berkeley DB",		/* name[RMNAMESZ] */
+	 TMNOMIGRATE,		/* flags */
+	 0,			/* version */
+	 __db_xa_open,		/* xa_open_entry */
+	 __db_xa_close,		/* xa_close_entry */
+	 __db_xa_start,		/* xa_start_entry */
+	 __db_xa_end,		/* xa_end_entry */
+	 __db_xa_rollback,	/* xa_rollback_entry */
+	 __db_xa_prepare,	/* xa_prepare_entry */
+	 __db_xa_commit,	/* xa_commit_entry */
+	 __db_xa_recover,	/* xa_recover_entry */
+	 __db_xa_forget,	/* xa_forget_entry */
+	 __db_xa_complete	/* xa_complete_entry */
+};
+
+/*
+ * __db_xa_open --
+ *	The open call in the XA protocol.  The rmid field is an id number
+ * that the TM assigned us and will pass us on every xa call.  We need to
+ * map that rmid number into a dbenv structure that we create during
+ * initialization.  Since this id number is thread specific, we do not
+ * need to store it in shared memory.  The file xa_map.c implements all
+ * such xa->db mappings.
+ *	The xa_info field is instance specific information.  We require
+ * that the value of DB_HOME be passed in xa_info.  Since xa_info is the
+ * only thing that we get to pass to db_appinit, any config information
+ * will have to be done via a config file instead of via the db_appinit
+ * call.
+ */
+static int
+__db_xa_open(xa_info, rmid, flags)
+	char *xa_info;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+	if (flags != TMNOFLAGS)
+		return (XAER_INVAL);
+
+	/* Verify if we already have this environment open. */
+	if (__db_rmid_to_env(rmid, &env, 0) == 0)
+		return (XA_OK);
+
+	/*
+	 * Since we cannot tell whether the environment is OK or not,
+	 * we can't actually do the db_appinit in xa_open.  Instead,
+	 * we save the mapping between the rmid and the xa_info.  If
+	 * we next get a call to __xa_recover, we do the db_appinit
+	 * with DB_RECOVER set.  If we get any other call, then we
+	 * do the db_appinit.
+	 */
+	return (__db_map_rmid_name(rmid, xa_info));
+}
+
+/*
+ * __db_xa_close --
+ *	The close call of the XA protocol.  The only trickiness here
+ * is that if there are any active transactions, we must fail.  It is
+ * *not* an error to call close on an environment that has already been
+ * closed (I am interpreting that to mean it's OK to call close on an
+ * environment that has never been opened).
+ */
+static int
+__db_xa_close(xa_info, rmid, flags)
+	char *xa_info;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	int ret, t_ret;
+
+	COMPQUIET(xa_info, NULL);
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+	if (flags != TMNOFLAGS)
+		return (XAER_INVAL);
+
+	/* If the environment is closed, then we're done. */
+	if (__db_rmid_to_env(rmid, &env, 0) != 0)
+		return (XA_OK);
+
+	/* Check if there are any pending transactions. */
+	if (env->xa_txn != NULL && env->xa_txn->txnid != TXN_INVALID)
+		return (XAER_PROTO);
+
+	/* Now, destroy the mapping and close the environment. */
+	ret = __db_unmap_rmid(rmid);
+	if ((t_ret = db_appexit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	__os_free(env, sizeof(DB_ENV));
+
+	return (ret == 0 ? XA_OK : XAER_RMERR);
+}
+
+/*
+ * __db_xa_start --
+ *	Begin (or resume/join) transaction xid for resource manager rmid.
+ */
+static int
+__db_xa_start(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	TXN_DETAIL *td;
+	size_t off;
+	int is_known;
+
+#define	OK_FLAGS	(TMJOIN | TMRESUME | TMNOWAIT | TMASYNC | TMNOFLAGS)
+	if (LF_ISSET(~OK_FLAGS))
+		return (XAER_INVAL);
+
+	if (LF_ISSET(TMJOIN) && LF_ISSET(TMRESUME))
+		return (XAER_INVAL);
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+
+	if (__db_rmid_to_env(rmid, &env, 1) != 0)
+		return (XAER_PROTO);
+
+	is_known = __db_xid_to_txn(env, xid, &off) == 0;
+
+	if (is_known && !LF_ISSET(TMRESUME) && !LF_ISSET(TMJOIN))
+		return (XAER_DUPID);
+
+	if (!is_known && LF_ISSET(TMRESUME | TMJOIN))
+		return (XAER_NOTA);
+
+	/*
+	 * This can't block, so we can ignore TMNOWAIT.  A suspended
+	 * association may only be picked up again with TMRESUME.
+	 * Other error conditions: RMERR, RMFAIL, OUTSIDE, PROTO, RB*
+	 */
+	if (is_known) {
+		td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off);
+		if (td->xa_status == TXN_XA_SUSPENDED && !LF_ISSET(TMRESUME))
+			return (XAER_PROTO);
+		if (td->xa_status == TXN_XA_DEADLOCKED)
+			return (XA_RBDEADLOCK);
+		if (td->xa_status == TXN_XA_ABORTED)
+			return (XA_RBOTHER);
+
+		/* Now, fill in the global transaction structure. */
+		__xa_txn_init(env, td, off);
+		td->xa_status = TXN_XA_STARTED;
+	} else {
+		if (__txn_xa_begin(env, env->xa_txn) != 0)
+			return (XAER_RMERR);
+		(void)__db_map_xid(env, xid, env->xa_txn->off);
+		td = (TXN_DETAIL *)
+		    ((u_int8_t *)env->tx_info->region + env->xa_txn->off);
+		td->xa_status = TXN_XA_STARTED;
+	}
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_end --
+ *	Disassociate the current transaction from the current process.
+ */
+static int
+__db_xa_end(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	DB_TXN *txn;
+	TXN_DETAIL *td;
+	size_t off;
+
+	if (flags != TMNOFLAGS && !LF_ISSET(TMSUSPEND | TMSUCCESS | TMFAIL))
+		return (XAER_INVAL);
+
+	if (__db_rmid_to_env(rmid, &env, 0) != 0)
+		return (XAER_PROTO);
+
+	if (__db_xid_to_txn(env, xid, &off) != 0)
+		return (XAER_NOTA);
+
+	txn = env->xa_txn;
+	if (off != txn->off)
+		return (XAER_PROTO);
+
+	td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off);
+	if (td->xa_status == TXN_XA_DEADLOCKED)
+		return (XA_RBDEADLOCK);
+
+	if (td->status == TXN_ABORTED)
+		return (XA_RBOTHER);
+
+	if (td->xa_status != TXN_XA_STARTED)
+		return (XAER_PROTO);
+
+	/* Update the shared memory last_lsn field */
+	td->last_lsn = txn->last_lsn;
+
+	/*
+	 * If we ever support XA migration, we cannot keep SUSPEND/END
+	 * status in the shared region; it would have to be process local.
+	 */
+	if (LF_ISSET(TMSUSPEND))
+		td->xa_status = TXN_XA_SUSPENDED;
+	else
+		td->xa_status = TXN_XA_ENDED;
+
+	txn->txnid = TXN_INVALID;
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_prepare --
+ *	Sync the log to disk so we can guarantee recoverability.
+ */
+static int
+__db_xa_prepare(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	TXN_DETAIL *td;
+	size_t off;
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+	if (flags != TMNOFLAGS)
+		return (XAER_INVAL);
+
+	/*
+	 * We need to know if we've ever called prepare on this.
+	 * As part of the prepare, we set the xa_status field to
+	 * reflect that fact that prepare has been called, and if
+	 * it's ever called again, it's an error.
+	 */
+	if (__db_rmid_to_env(rmid, &env, 1) != 0)
+		return (XAER_PROTO);
+
+	if (__db_xid_to_txn(env, xid, &off) != 0)
+		return (XAER_NOTA);
+
+	td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off);
+
+	if (td->xa_status == TXN_XA_DEADLOCKED)
+		return (XA_RBDEADLOCK);
+
+	if (td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED)
+		return (XAER_PROTO);
+
+	/* Now, fill in the global transaction structure. */
+	__xa_txn_init(env, td, off);
+
+	if (txn_prepare(env->xa_txn) != 0)
+		return (XAER_RMERR);
+
+	td->xa_status = TXN_XA_PREPARED;
+
+	/* No fatal value that would require an XAER_RMFAIL. */
+	__xa_txn_end(env);
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_commit --
+ *	Commit the transaction (one-phase if TMONEPHASE, else after prepare).
+ */
+static int
+__db_xa_commit(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	TXN_DETAIL *td;
+	size_t off;
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+#undef	OK_FLAGS
+#define	OK_FLAGS	(TMNOFLAGS | TMNOWAIT | TMONEPHASE)
+	if (LF_ISSET(~OK_FLAGS))
+		return (XAER_INVAL);
+
+	/*
+	 * We need to know if we've ever called prepare on this.
+	 * We can verify this by examining the xa_status field.
+	 */
+	if (__db_rmid_to_env(rmid, &env, 1) != 0)
+		return (XAER_PROTO);
+
+	if (__db_xid_to_txn(env, xid, &off) != 0)
+		return (XAER_NOTA);
+
+	td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off);
+
+	if (td->xa_status == TXN_XA_DEADLOCKED)
+		return (XA_RBDEADLOCK);
+
+	if (td->xa_status == TXN_XA_ABORTED)
+		return (XA_RBOTHER);
+
+	if (LF_ISSET(TMONEPHASE) &&
+	    td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED)
+		return (XAER_PROTO);
+
+	if (!LF_ISSET(TMONEPHASE) && td->xa_status != TXN_XA_PREPARED)
+		return (XAER_PROTO);
+
+	/* Now, fill in the global transaction structure. */
+	__xa_txn_init(env, td, off);
+
+	if (txn_commit(env->xa_txn) != 0)
+		return (XAER_RMERR);
+
+	/* No fatal value that would require an XAER_RMFAIL. */
+	__xa_txn_end(env);
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_recover --
+ *	Returns a list of prepared and heuristically completed transactions.
+ *
+ * The return value is the number of xids placed into the xid array (less
+ * than or equal to the count parameter).  The flags are going to indicate
+ * whether we are starting a scan or continuing one.
+ */
+static int
+__db_xa_recover(xids, count, rmid, flags)
+	XID *xids;
+	long count, flags;
+	int rmid;
+{
+	__txn_xa_regop_args *argp;
+	DBT data;
+	DB_ENV *env;
+	DB_LOG *log;
+	XID *xidp;
+	char *dbhome;
+	int err, ret;
+	u_int32_t rectype, txnid;
+
+	ret = 0;
+	xidp = xids;
+
+
+	/*
+	 * If we are starting a scan, then we need to open the environment
+	 * and run recovery.  This recovery puts us in a state where we can
+	 * either commit or abort any transactions that were prepared but not
+	 * yet committed.  Once we've done that, we need to figure out where
+	 * to begin checking for such transactions.  If we are not starting
+	 * a scan, then the environment had better have already been recovered
+	 * and we'll start from wherever the log cursor is.  Since XA apps
+	 * cannot be threaded, we don't have to worry about someone else
+	 * having moved it.
+	 */
+	if (LF_ISSET(TMSTARTRSCAN)) {
+		/* If the environment is open, we have a problem. */
+		if (__db_rmid_to_env(rmid, &env, 0) == XA_OK)
+			return (XAER_PROTO);
+
+		if ((ret = __os_calloc(1, sizeof(DB_ENV), &env)) != 0)
+			return (XAER_RMERR);
+
+		if (__db_rmid_to_name(rmid, &dbhome) != 0)
+			goto err1;
+
+#undef XA_FLAGS
+#define	XA_FLAGS DB_RECOVER | \
+	DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN
+		if ((ret = db_appinit(dbhome, NULL, env, XA_FLAGS)) != 0)
+			goto err1;
+
+		if (__db_map_rmid(rmid, env) != 0)
+			goto err2;
+
+		/* Now figure out from where to begin scan. */
+		log = env->lg_info;
+		if ((err = __log_findckp(log, &log->xa_first)) == DB_NOTFOUND) {
+			/*
+			 * If there were no log files, then we have no
+			 * transactions to return, so we simply return 0.
+			 */
+			return (0);
+		}
+		if ((err = __db_txnlist_init(&log->xa_info)) != 0)
+			goto err3;
+	} else {
+		/* We had better already know about this rmid. */
+		if (__db_rmid_to_env(rmid, &env, 0) != 0)
+			return (XAER_PROTO);
+		/*
+		 * If we are not starting a scan, the log cursor had
+		 * better be set.
+		 */
+		log = env->lg_info;
+		if (IS_ZERO_LSN(log->xa_lsn))
+			return (XAER_PROTO);
+	}
+
+	/*
+	 * At this point log->xa_first contains the point in the log
+	 * to which we need to roll back.  If we are starting a scan,
+	 * we'll start at the last record; if we're continuing a scan,
+	 * we'll have to start at log->xa_lsn.
+	 */
+
+	memset(&data, 0, sizeof(data));
+	for (err = log_get(log, &log->xa_lsn, &data,
+	    LF_ISSET(TMSTARTRSCAN) ? DB_LAST : DB_SET);
+	    err == 0 && log_compare(&log->xa_lsn, &log->xa_first) > 0;
+	    err = log_get(log, &log->xa_lsn, &data, DB_PREV)) {
+		memcpy(&rectype, data.data, sizeof(rectype));
+
+		/*
+		 * The only record types we care about are DB_txn_regop
+		 * (commit) and DB_txn_xa_regop (prepare).  If it's a commit,
+		 * we have to add it to a txnlist.  If it's a prepare, and we
+		 * don't have a commit, then we return it.  We are redoing some
+		 * of what's in the xa_regop_recovery code, but we have to do
+		 * it here so we can get at the xid in the record.
+		 */
+		if (rectype != DB_txn_xa_regop && rectype != DB_txn_regop)
+			continue;
+
+		memcpy(&txnid, (u_int8_t *)data.data + sizeof(rectype),
+		    sizeof(txnid));
+		err = __db_txnlist_find(log->xa_info, txnid);
+		switch (rectype) {
+		case DB_txn_regop:
+			if (err == DB_NOTFOUND)
+				__db_txnlist_add(log->xa_info, txnid);
+			err = 0;
+			break;
+		case DB_txn_xa_regop:
+			/*
+			 * This transaction is committed, so we needn't read
+			 * the record and do anything.
+			 */
+			if (err == 0)
+				break;
+			if ((err =
+			    __txn_xa_regop_read(data.data, &argp)) != 0) {
+				ret = XAER_RMERR;
+				goto out;
+			}
+
+			xidp->formatID = argp->formatID;
+			xidp->gtrid_length = argp->gtrid;
+			xidp->bqual_length = argp->bqual;
+			memcpy(xidp->data, argp->xid.data, argp->xid.size);
+			ret++;
+			xidp++;
+			__os_free(argp, sizeof(*argp));
+			if (ret == count)
+				goto done;
+			break;
+		}
+	}
+
+	if (err != 0 && err != DB_NOTFOUND)
+		goto out;
+
+done:	if (LF_ISSET(TMENDRSCAN)) {
+		ZERO_LSN(log->xa_lsn);
+		ZERO_LSN(log->xa_first);
+
+out:		__db_txnlist_end(log->xa_info);
+		log->xa_info = NULL;
+	}
+	return (ret);
+
+err3:	(void)__db_unmap_rmid(rmid);
+err2:	(void)db_appexit(env);
+err1:	__os_free(env, sizeof(DB_ENV));
+	return (XAER_RMERR);
+}
+
+/*
+ * __db_xa_rollback --
+ *	Abort an XA transaction.
+ */
+static int
+__db_xa_rollback(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	TXN_DETAIL *td;
+	size_t off;
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+	if (flags != TMNOFLAGS)
+		return (XAER_INVAL);
+
+	if (__db_rmid_to_env(rmid, &env, 1) != 0)
+		return (XAER_PROTO);
+
+	if (__db_xid_to_txn(env, xid, &off) != 0)
+		return (XAER_NOTA);
+
+	td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off);
+
+	if (td->xa_status == TXN_XA_DEADLOCKED)
+		return (XA_RBDEADLOCK);
+
+	if (td->xa_status == TXN_XA_ABORTED)
+		return (XA_RBOTHER);
+
+	/*
+	 * Rollback is legal once the branch is ended, suspended or prepared.
+	 */
+	if (td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_PREPARED &&
+	    td->xa_status != TXN_XA_SUSPENDED)
+		return (XAER_PROTO);
+
+	/* Now, fill in the global transaction structure. */
+	__xa_txn_init(env, td, off);
+	if (txn_abort(env->xa_txn) != 0)
+		return (XAER_RMERR);
+
+	/* No fatal value that would require an XAER_RMFAIL. */
+	__xa_txn_end(env);
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_forget --
+ *	Forget about an XID for a transaction that was heuristically
+ * completed.  Since we do not heuristically complete anything, I
+ * don't think we have to do anything here, but we should make sure
+ * that we reclaim the slots in the txnid table.
+ */
+static int
+__db_xa_forget(xid, rmid, flags)
+	XID *xid;
+	int rmid;
+	long flags;
+{
+	DB_ENV *env;
+	size_t off;
+
+	if (LF_ISSET(TMASYNC))
+		return (XAER_ASYNC);
+	if (flags != TMNOFLAGS)
+		return (XAER_INVAL);
+
+	if (__db_rmid_to_env(rmid, &env, 1) != 0)
+		return (XAER_PROTO);
+
+	/*
+	 * If mapping is gone, then we're done.
+	 */
+	if (__db_xid_to_txn(env, xid, &off) != 0)
+		return (XA_OK);
+
+	__db_unmap_xid(env, xid, off);
+
+	/* No fatal value that would require an XAER_RMFAIL. */
+	return (XA_OK);
+}
+
+/*
+ * __db_xa_complete --
+ *	Used to wait for asynchronous operations to complete.  Since we're
+ *	not doing asynch, this is an invalid operation.
+ */
+static int
+__db_xa_complete(handle, retval, rmid, flags)
+	int *handle, *retval, rmid;
+	long flags;
+{
+	COMPQUIET(handle, NULL);
+	COMPQUIET(retval, NULL);
+	COMPQUIET(rmid, 0);
+	COMPQUIET(flags, 0);
+
+	return (XAER_INVAL);
+}
+
+/*
+ * __xa_txn_init --
+ * 	Fill in the fields of the local transaction structure given
+ *	the detail transaction structure.
+ */
+static void
+__xa_txn_init(env, td, off)
+	DB_ENV *env;
+	TXN_DETAIL *td;
+	size_t off;
+{
+	DB_TXN *txn;
+
+	txn = env->xa_txn;
+	txn->mgrp = env->tx_info;
+	txn->parent = NULL;
+	txn->last_lsn = td->last_lsn;
+	txn->txnid = td->txnid;
+	txn->off = off;
+	txn->flags = 0;
+}
+
+/*
+ * __xa_txn_end --
+ * 	Invalidate a transaction structure that was generated by xa_txn_init.
+ */
+static void
+__xa_txn_end(env)
+	DB_ENV *env;
+{
+	DB_TXN *txn;
+
+	txn = env->xa_txn;
+	if (txn != NULL)
+		txn->txnid = TXN_INVALID;
+}
+
diff --git a/db2/xa/xa_db.c b/db2/xa/xa_db.c
new file mode 100644
index 0000000000..4aaaeff108
--- /dev/null
+++ b/db2/xa/xa_db.c
@@ -0,0 +1,308 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)xa_db.c	10.6 (Sleepycat) 12/19/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#undef stat
+
+#include "db_int.h"
+#include "db_page.h"
+#include "xa.h"
+#include "xa_ext.h"
+#include "db_am.h"
+#include "db_ext.h"
+#include "common_ext.h"
+
+static int __xa_c_close __P((DBC *));
+static int __xa_c_del __P((DBC *, u_int32_t));
+static int __xa_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __xa_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __xa_close __P((DB *, u_int32_t));
+static int __xa_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t));
+static int __xa_del __P((DB *, DB_TXN *, DBT *, u_int32_t));
+static int __xa_fd __P((DB *, int *));
+static int __xa_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __xa_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __xa_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
+static int __xa_sync __P((DB *, u_int32_t));
+
+int
+db_xa_open(fname, type, flags, mode, dbinfo, dbpp)
+	const char *fname;
+	DBTYPE type;
+	u_int32_t flags;
+	int mode;
+	DB_INFO *dbinfo;
+	DB **dbpp;
+{
+	DB *wrapper, *under;
+	DB_ENV *dbenv;
+	struct __rmname *pending;
+	int ret;
+
+	/*
+	 * The underlying DB is opened in the environment at the head of
+	 * the global environment list.
+	 *
+	 * !!!
+	 * xa_start() (__db_xa_start()) moves the "current" environment to
+	 * the head of that list.  If we are called from a tpsvrinit
+	 * function, though, xa_open may be the only XA call made so far,
+	 * and it merely records the name of the environment to open.  A
+	 * non-empty name queue identifies that case: map the first recorded
+	 * rmid now, opening its environment if necessary.
+	 */
+	pending = TAILQ_FIRST(&DB_GLOBAL(db_nameq));
+	if (pending != NULL) {
+		ret = __db_rmid_to_env(pending->rmid, &dbenv, 1);
+		if (ret != 0)
+			return (ret);
+	}
+
+	dbenv = TAILQ_FIRST(&DB_GLOBAL(db_envq));
+	ret = db_open(fname, type, flags, mode, dbenv, dbinfo, &under);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Build the handle the caller sees.  Exported fields are copied
+	 * from the real handle, the methods other than join are the XA
+	 * wrappers in this file, and "internal" references the real handle.
+	 */
+	if ((ret = __os_calloc(1, sizeof(DB), &wrapper)) != 0) {
+		(void)under->close(under, 0);
+		return (ret);
+	}
+	wrapper->internal = under;
+	wrapper->dbenv = dbenv;
+	wrapper->type = under->type;
+	wrapper->byteswapped = under->byteswapped;
+	TAILQ_INIT(&wrapper->active_queue);
+	TAILQ_INIT(&wrapper->free_queue);
+
+	wrapper->close = __xa_close;
+	wrapper->cursor = __xa_cursor;
+	wrapper->del = __xa_del;
+	wrapper->fd = __xa_fd;
+	wrapper->get = __xa_get;
+	wrapper->join = under->join;
+	wrapper->put = __xa_put;
+	wrapper->stat = __xa_stat;
+	wrapper->sync = __xa_sync;
+
+	*dbpp = wrapper;
+	return (0);
+}
+
+static int
+__xa_close(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB *under;
+	DBC *cursor;
+	int ret;
+
+	/* Each wrapper cursor unlinks itself from the queue as it closes. */
+	cursor = TAILQ_FIRST(&dbp->active_queue);
+	for (; cursor != NULL; cursor = TAILQ_FIRST(&dbp->active_queue))
+		(void)cursor->c_close(cursor);
+
+	/* Close the real handle, then release the wrapper itself. */
+	under = (DB *)dbp->internal;
+	ret = under->close(under, flags);
+	__os_free(dbp, sizeof(DB));
+	return (ret);
+}
+
+static int
+__xa_cursor(dbp, txn, dbcp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBC **dbcp;
+	u_int32_t flags;
+{
+	DB *under;
+	DBC *under_c, *wrapper;
+	int ret;
+
+	/* The caller's txn is ignored in favor of the XA transaction. */
+	txn = dbp->dbenv->xa_txn;
+	under = (DB *)dbp->internal;
+	ret = under->cursor(under, txn, &under_c, flags);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Wrap the real cursor: the wrapper carries the XA cursor methods
+	 * from this file and reaches the real cursor through "internal".
+	 */
+	if ((ret = __os_calloc(1, sizeof(DBC), &wrapper)) != 0) {
+		(void)under_c->c_close(under_c);
+		return (ret);
+	}
+	wrapper->internal = under_c;
+	wrapper->dbp = dbp;
+	wrapper->c_put = __xa_c_put;
+	wrapper->c_get = __xa_c_get;
+	wrapper->c_del = __xa_c_del;
+	wrapper->c_close = __xa_c_close;
+	/* Queue it so __xa_close can close cursors that are still open. */
+	TAILQ_INSERT_TAIL(&dbp->active_queue, wrapper, links);
+	*dbcp = wrapper;
+	return (0);
+}
+
+static int
+__xa_fd(dbp, fdp)
+	DB *dbp;
+	int *fdp;
+{
+	/*
+	 * No descriptor is ever stored in *fdp; the call always returns
+	 * whatever __db_eopnotsup yields for the real handle's environment.
+	 */
+	COMPQUIET(fdp, NULL);
+	return (__db_eopnotsup(((DB *)dbp->internal)->dbenv));
+}
+
+static int
+__xa_del(dbp, txn, key, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DB *under;
+
+	/* Delete through the real handle, always in the XA transaction. */
+	txn = dbp->dbenv->xa_txn;
+	under = (DB *)dbp->internal;
+	return (under->del(under, txn, key, flags));
+}
+
+static int
+__xa_get(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DB *under;
+
+	/* Read through the real handle, always in the XA transaction. */
+	txn = dbp->dbenv->xa_txn;
+	under = (DB *)dbp->internal;
+	return (under->get(under, txn, key, data, flags));
+}
+
+static int
+__xa_put(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DB *under;
+
+	/* Store through the real handle, always in the XA transaction. */
+	txn = dbp->dbenv->xa_txn;
+	under = (DB *)dbp->internal;
+	return (under->put(under, txn, key, data, flags));
+}
+
+static int
+__xa_stat(dbp, spp, db_malloc, flags)
+	DB *dbp;
+	void *spp;
+	void *(*db_malloc) __P((size_t));
+	u_int32_t flags;
+{
+	DB *under = (DB *)dbp->internal;
+
+	/* Statistics come straight from the real handle. */
+	return (under->stat(under, spp, db_malloc, flags));
+}
+
+static int
+__xa_sync(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB *under = (DB *)dbp->internal;
+
+	/* Forward the sync to the real handle. */
+	return (under->sync(under, flags));
+}
+
+static int
+__xa_c_close(dbc)
+	DBC *dbc;
+{
+	DBC *under_c;
+	int ret;
+
+	/* Close the real cursor first; its status is what we report. */
+	under_c = (DBC *)dbc->internal;
+	ret = under_c->c_close(under_c);
+
+	/* The wrapper is unlinked and freed whether or not that worked. */
+	TAILQ_REMOVE(&dbc->dbp->active_queue, dbc, links);
+	__os_free(dbc, sizeof(DBC));
+	return (ret);
+}
+
+static int
+__xa_c_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DBC *under_c = (DBC *)dbc->internal;
+
+	/* Forward the delete to the real cursor. */
+	return (under_c->c_del(under_c, flags));
+}
+
+static int
+__xa_c_get(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DBC *under_c = (DBC *)dbc->internal;
+
+	/* Forward the get to the real cursor. */
+	return (under_c->c_get(under_c, key, data, flags));
+}
+
+static int
+__xa_c_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DBC *under_c = (DBC *)dbc->internal;
+
+	/* Forward the put to the real cursor. */
+	return (under_c->c_put(under_c, key, data, flags));
+}
diff --git a/db2/xa/xa_map.c b/db2/xa/xa_map.c
new file mode 100644
index 0000000000..d4ebbae22f
--- /dev/null
+++ b/db2/xa/xa_map.c
@@ -0,0 +1,305 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)xa_map.c	10.4 (Sleepycat) 10/20/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "txn.h"
+
+/*
+ * This file contains all the mapping information that we need to support
+ * the DB/XA interface.
+ */
+
+/*
+ * __db_rmid_to_env
+ *	Return (in *envp) the environment associated with a given XA rmid,
+ *	opening the environment first if open_ok is set and it is not yet
+ *	on the environment list.  Returns 0 on success, 1 on failure.
+ *
+ * PUBLIC: int __db_rmid_to_env __P((int rmid, DB_ENV **envp, int open_ok));
+ */
+int
+__db_rmid_to_env(rmid, envp, open_ok)
+	int rmid;
+	DB_ENV **envp;
+	int open_ok;
+{
+	DB_ENV *env;
+	char *dbhome;
+
+	/* Fast path: the environment is already at the head of the list. */
+	env = TAILQ_FIRST(&DB_GLOBAL(db_envq));
+	if (env != NULL && env->xa_rmid == rmid)
+		goto found;
+
+	/*
+	 * Otherwise search the list.  A match is moved to the head so that
+	 * the upcoming db_xa_open() call passes the correct environment
+	 * into db_open().
+	 */
+	for (; env != NULL; env = TAILQ_NEXT(env, links)) {
+		if (env->xa_rmid != rmid)
+			continue;
+		TAILQ_REMOVE(&DB_GLOBAL(db_envq), env, links);
+		TAILQ_INSERT_HEAD(&DB_GLOBAL(db_envq), env, links);
+		goto found;
+	}
+
+	/*
+	 * The rmid is not on the environment list.  If opening is allowed
+	 * and a home directory was recorded for the rmid, open it now.
+	 */
+	if (!open_ok || __db_rmid_to_name(rmid, &dbhome) != 0)
+		return (1);
+
+#undef XA_FLAGS
+#define	XA_FLAGS \
+	DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN
+
+	if (__os_calloc(1, sizeof(DB_ENV), &env) != 0)
+		return (1);
+
+	if (db_appinit(dbhome, NULL, env, XA_FLAGS) != 0)
+		goto err;
+
+	if (__db_map_rmid(rmid, env) != 0)
+		goto err1;
+
+	/* Once the mapping exists the recorded name is no longer needed. */
+	__db_unmap_rmid_name(rmid);
+
+found:	*envp = env;
+	return (0);
+
+	/* err1 also tears down the initialized environment before freeing. */
+err1:	(void)db_appexit(env);
+err:	__os_free(env, sizeof(DB_ENV));
+	return (1);
+}
+
+/*
+ * __db_xid_to_txn
+ *	Find the active transaction whose stored XID matches xid->data and
+ *	return its region offset in *offp.  Returns 0, or EINVAL if none.
+ *
+ * PUBLIC: int __db_xid_to_txn __P((DB_ENV *, XID *, size_t *));
+ */
+int
+__db_xid_to_txn(dbenv, xid, offp)
+	DB_ENV *dbenv;
+	XID *xid;
+	size_t *offp;
+{
+	DB_TXNREGION *region;
+	struct __txn_detail *cur;
+
+	region = dbenv->tx_info->region;
+
+	/*
+	 * Linear scan of the active transaction list, under the region
+	 * lock, for an entry whose stored XID matches xid->data.  A hash
+	 * table could replace this if it ever shows up as a hot spot.
+	 */
+	LOCK_TXNREGION(dbenv->tx_info);
+	cur = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
+	while (cur != NULL && memcmp(xid->data, cur->xid, XIDDATASIZE) != 0)
+		cur = SH_TAILQ_NEXT(cur, links, __txn_detail);
+	UNLOCK_TXNREGION(dbenv->tx_info);
+
+	if (cur == NULL)
+		return (EINVAL);
+
+	/* Report the match as a byte offset from the start of the region. */
+	*offp = (u_int8_t *)cur - (u_int8_t *)region;
+	return (0);
+}
+
+/*
+ * __db_map_rmid
+ *	Bind rmid to env and put env at the head of the environment list.
+ *
+ * PUBLIC: int __db_map_rmid __P((int, DB_ENV *));
+ */
+int
+__db_map_rmid(rmid, env)
+	int rmid;
+	DB_ENV *env;
+{
+	if (__os_calloc(1, sizeof(DB_TXN), &env->xa_txn) != 0)
+		return (XAER_RMERR);
+	env->xa_rmid = rmid;
+	env->xa_txn->txnid = TXN_INVALID;
+	TAILQ_INSERT_HEAD(&DB_GLOBAL(db_envq), env, links);
+	return (XA_OK);
+}
+
+/*
+ * __db_unmap_rmid
+ *	Destroy the mapping for the given rmid; EINVAL if there is none.
+ *
+ * PUBLIC: int __db_unmap_rmid __P((int));
+ */
+int
+__db_unmap_rmid(rmid)
+	int rmid;
+{
+	DB_ENV *e;
+
+	/* Stop at the end of the list: the rmid may not be mapped at all. */
+	for (e = TAILQ_FIRST(&DB_GLOBAL(db_envq));
+	    e != NULL && e->xa_rmid != rmid;
+	    e = TAILQ_NEXT(e, links));
+	if (e == NULL)
+		return (EINVAL);
+
+	TAILQ_REMOVE(&DB_GLOBAL(db_envq), e, links);
+	if (e->xa_txn != NULL)
+		__os_free(e->xa_txn, sizeof(DB_TXN));
+	return (0);
+}
+
+/*
+ * __db_map_xid
+ *	Record "xid" in the transaction detail located "off" bytes into
+ *	the shared transaction region.  Always returns 0.
+ *
+ * PUBLIC: int __db_map_xid __P((DB_ENV *, XID *, size_t));
+ */
+int
+__db_map_xid(env, xid, off)
+	DB_ENV *env;
+	XID *xid;
+	size_t off;
+{
+	DB_TXNMGR *mgr;
+	TXN_DETAIL *detail;
+
+	mgr = env->tx_info;
+	detail = (TXN_DETAIL *)((u_int8_t *)mgr->region + off);
+
+	/* Only the copy itself is done while holding the region lock. */
+	LOCK_TXNREGION(mgr);
+	memcpy(detail->xid, xid->data, XIDDATASIZE);
+	UNLOCK_TXNREGION(mgr);
+	return (0);
+}
+
+/*
+ * __db_unmap_xid
+ *	Zero the XID of the transaction detail at "off"; xid is unused.
+ *
+ * PUBLIC: void __db_unmap_xid __P((DB_ENV *, XID *, size_t));
+ */
+void
+__db_unmap_xid(env, xid, off)
+	DB_ENV *env;
+	XID *xid;
+	size_t off;
+{
+	u_int8_t *base;
+	TXN_DETAIL *detail;
+
+	COMPQUIET(xid, NULL);
+	base = (u_int8_t *)env->tx_info->region;
+	detail = (TXN_DETAIL *)(base + off);
+	memset(detail->xid, 0, sizeof(detail->xid));
+}
+
+/*
+ * __db_map_rmid_name --
+ *	Create a mapping from an rmid to a name (the xa_info argument) by
+ *	putting a new entry at the head of the name list.  The mapping is
+ *	made during create; later, when an rmid has to be mapped and an
+ *	open is allowed, the name is fetched from this list and the entry
+ *	is then freed.  Returns 0, or the error from the failed allocation.
+ *
+ * PUBLIC: int __db_map_rmid_name __P((int, char *));
+ */
+
+int
+__db_map_rmid_name(rmid, dbhome)
+	int rmid;
+	char *dbhome;
+{
+	struct __rmname *node;
+	int ret;
+
+	ret = __os_malloc(sizeof(struct __rmname), NULL, &node);
+	if (ret != 0)
+		return (ret);
+	/* The entry keeps its own copy of the home directory name. */
+	ret = __os_strdup(dbhome, &node->dbhome);
+	if (ret != 0) {
+		__os_free(node, sizeof(struct __rmname));
+		return (ret);
+	}
+	node->rmid = rmid;
+	TAILQ_INSERT_HEAD(&DB_GLOBAL(db_nameq), node, links);
+	return (0);
+}
+
+/*
+ * __db_rmid_to_name --
+ *	Look up the home directory name recorded for an rmid and store it
+ *	in *dbhomep.  Returns 0 if an entry exists, 1 otherwise.
+ *
+ * PUBLIC: int __db_rmid_to_name __P((int, char **));
+ */
+int
+__db_rmid_to_name(rmid, dbhomep)
+	int rmid;
+	char **dbhomep;
+{
+	struct __rmname *node;
+
+	node = TAILQ_FIRST(&DB_GLOBAL(db_nameq));
+	while (node != NULL && node->rmid != rmid)
+		node = TAILQ_NEXT(node, links);
+	if (node == NULL)
+		return (1);
+	/* The caller gets the list entry's own string, not a copy. */
+	*dbhomep = node->dbhome;
+	return (0);
+}
+
+/*
+ * __db_unmap_rmid_name --
+ *	Remove and free the name list entry for an rmid, if there is one.
+ *
+ * PUBLIC:  void __db_unmap_rmid_name __P((int));
+ */
+void
+__db_unmap_rmid_name(rmid)
+	int rmid;
+{
+	struct __rmname *node;
+
+	/* Only the first entry that matches the rmid is removed. */
+	node = TAILQ_FIRST(&DB_GLOBAL(db_nameq));
+	while (node != NULL && node->rmid != rmid)
+		node = TAILQ_NEXT(node, links);
+	if (node == NULL)
+		return;
+	/* Unlink it, then release the name string and the entry. */
+	TAILQ_REMOVE(&DB_GLOBAL(db_nameq), node, links);
+	__os_freestr(node->dbhome);
+	__os_free(node, sizeof(struct __rmname));
+}