about summary refs log tree commit diff
path: root/db2/db
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@redhat.com> 1999-06-13 13:36:34 +0000
committer Ulrich Drepper <drepper@redhat.com> 1999-06-13 13:36:34 +0000
commit ec239360d13518a13f572b635d036c7d10028010 (patch)
tree bdb5111363f45d2107849c2456b575d72779174c /db2/db
parent fc3703521650a9b6db910a50c4fc0f410496e134 (diff)
download glibc-ec239360d13518a13f572b635d036c7d10028010.tar.gz
glibc-ec239360d13518a13f572b635d036c7d10028010.tar.xz
glibc-ec239360d13518a13f572b635d036c7d10028010.zip
Update.
	* db2/Makefile (distribute): Remove files which do not exist
	anymore.
Diffstat (limited to 'db2/db')
-rw-r--r-- db2/db/db.c 313
-rw-r--r-- db2/db/db.src 13
-rw-r--r-- db2/db/db_am.c 430
-rw-r--r-- db2/db/db_auto.c 299
-rw-r--r-- db2/db/db_dispatch.c 41
-rw-r--r-- db2/db/db_dup.c 511
-rw-r--r-- db2/db/db_iface.c 488
-rw-r--r-- db2/db/db_join.c 271
-rw-r--r-- db2/db/db_overflow.c 129
-rw-r--r-- db2/db/db_pr.c 110
-rw-r--r-- db2/db/db_rec.c 155
-rw-r--r-- db2/db/db_ret.c 21
-rw-r--r-- db2/db/db_thread.c 121
13 files changed, 1997 insertions, 905 deletions
diff --git a/db2/db/db.c b/db2/db/db.c
index 70c6c5443b..2b4c270324 100644
--- a/db2/db/db.c
+++ b/db2/db/db.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db.c	10.57 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db.c	10.75 (Sleepycat) 12/3/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -67,9 +67,6 @@ static const char sccsid[] = "@(#)db.c	10.57 (Sleepycat) 5/7/98";
 #include "db_am.h"
 #include "common_ext.h"
 
-static int db_close __P((DB *, u_int32_t));
-static int db_fd __P((DB *, int *));
-
 /*
  * If the metadata page has the flag set, set the local flag.  If the page
  * does NOT have the flag set, return EINVAL if the user's dbinfo argument
@@ -87,11 +84,6 @@ static int db_fd __P((DB *, int *));
 		}							\
 }
 
-#ifdef _LIBC
-#define db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) \
-  __nss_db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
-#endif
-
 /*
  * db_open --
  *	Main library interface to the DB access methods.
@@ -141,9 +133,10 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 
 		/*
 		 * Specifying a cachesize to db_open(3), after creating an
-		 * environment, is a common mistake.
+		 * environment with DB_INIT_MPOOL, is a common mistake.
 		 */
-		if (dbinfo != NULL && dbinfo->db_cachesize != 0) {
+		if (dbenv->mp_info != NULL &&
+		    dbinfo != NULL && dbinfo->db_cachesize != 0) {
 			__db_err(dbenv,
 			    "cachesize will be ignored if environment exists");
 			return (EINVAL);
@@ -156,12 +149,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	real_name = NULL;
 
 	/* Allocate the DB structure, reference the DB_ENV structure. */
-	if ((dbp = (DB *)__db_calloc(1, sizeof(DB))) == NULL) {
-		__db_err(dbenv, "%s", strerror(ENOMEM));
-		return (ENOMEM);
-	}
+	if ((ret = __os_calloc(1, sizeof(DB), &dbp)) != 0)
+		return (ret);
 	dbp->dbenv = dbenv;
 
+	/* Random initialization. */
+	TAILQ_INIT(&dbp->free_queue);
+	TAILQ_INIT(&dbp->active_queue);
+	if ((ret = __db_init_wrapper(dbp)) != 0)
+		goto err;
+
 	/* Convert the db_open(3) flags. */
 	if (LF_ISSET(DB_RDONLY))
 		F_SET(dbp, DB_AM_RDONLY);
@@ -192,21 +189,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	}
 
 	/*
-	 * Always set the master and initialize the queues, so we can
-	 * use these fields without checking the thread bit.
-	 */
-	dbp->master = dbp;
-	LIST_INIT(&dbp->handleq);
-	LIST_INSERT_HEAD(&dbp->handleq, dbp, links);
-	TAILQ_INIT(&dbp->curs_queue);
-
-	/*
 	 * Set based on the dbenv fields, although no logging or transactions
 	 * are possible for temporary files.
 	 */
 	if (dbenv != NULL) {
-		if (dbenv->lk_info != NULL)
-			F_SET(dbp, DB_AM_LOCKING);
+		if (dbenv->lk_info != NULL) {
+			if (F_ISSET(dbenv, DB_ENV_CDB))
+				F_SET(dbp, DB_AM_CDB);
+			else
+				F_SET(dbp, DB_AM_LOCKING);
+		}
 		if (fname != NULL && dbenv->lg_info != NULL)
 			F_SET(dbp, DB_AM_LOGGING);
 	}
@@ -215,9 +207,29 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 	if (dbinfo == NULL) {
 		dbp->pgsize = 0;
 		dbp->db_malloc = NULL;
+		dbp->dup_compare = NULL;
 	} else {
+		/*
+		 * We don't want anything that's not a power-of-2, as we rely
+		 * on that for alignment of various types on the pages.
+		 */
+		if ((dbp->pgsize = dbinfo->db_pagesize) != 0 &&
+		    (u_int32_t)1 << __db_log2(dbp->pgsize) != dbp->pgsize) {
+			__db_err(dbenv, "page sizes must be a power-of-2");
+			goto einval;
+		}
 		dbp->pgsize = dbinfo->db_pagesize;
 		dbp->db_malloc = dbinfo->db_malloc;
+		if (F_ISSET(dbinfo, DB_DUPSORT)) {
+			if (F_ISSET(dbinfo, DB_DUP))
+				dbp->dup_compare = dbinfo->dup_compare == NULL ?
+				    __bam_defcmp : dbinfo->dup_compare;
+			else {
+				__db_err(dbenv, "DB_DUPSORT requires DB_DUP");
+				goto einval;
+			}
+			F_CLR(dbinfo, DB_DUPSORT);
+		}
 	}
 
 	/* Fill in the default file mode. */
@@ -235,6 +247,7 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp)
 		default:
 			goto err;
 		}
+	dbp->byteswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
 
 	/*
 	 * If we have a file name, try and read the first page, figure out
@@ -289,7 +302,7 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 		 * sizes, we limit the default pagesize to 16K.
 		 */
 		if (dbp->pgsize == 0) {
-			if ((ret = __db_ioinfo(real_name,
+			if ((ret = __os_ioinfo(real_name,
 			    fd, NULL, NULL, &iopsize)) != 0) {
 				__db_err(dbenv,
 				    "%s: %s", real_name, strerror(ret));
@@ -299,6 +312,14 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 				iopsize = 512;
 			if (iopsize > 16 * 1024)
 				iopsize = 16 * 1024;
+
+			/*
+			 * Sheer paranoia, but we don't want anything that's
+			 * not a power-of-2, as we rely on that for alignment
+			 * of various types on the pages.
+			 */
+			DB_ROUNDOFF(iopsize, 512);
+
 			dbp->pgsize = iopsize;
 			F_SET(dbp, DB_AM_PGDEF);
 		}
@@ -308,11 +329,11 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 		 * that the meta-data for all access methods fits in 512
 		 * bytes, and that no database will be smaller than that.
 		 */
-		if ((ret = __db_read(fd, mbuf, sizeof(mbuf), &nr)) != 0)
+		if ((ret = __os_read(fd, mbuf, sizeof(mbuf), &nr)) != 0)
 			goto err;
 
 		/* The fd is no longer needed. */
-		(void)__db_close(fd);
+		(void)__os_close(fd);
 		fd = -1;
 
 		if (nr != sizeof(mbuf)) {
@@ -337,7 +358,7 @@ open_retry:	if (LF_ISSET(DB_CREATE)) {
 			 */
 			if (retry_cnt++ < 3 &&
 			    !LF_ISSET(DB_CREATE | DB_TRUNCATE)) {
-				__db_sleep(1, 0);
+				__os_sleep(1, 0);
 				goto open_retry;
 			}
 			if (type == DB_UNKNOWN) {
@@ -396,7 +417,7 @@ retry:		switch (((BTMETA *)mbuf)->magic) {
 
 			/* Copy the file's unique id. */
 			need_fileid = 0;
-			memcpy(dbp->lock.fileid, btm->uid, DB_FILE_ID_LEN);
+			memcpy(dbp->fileid, btm->uid, DB_FILE_ID_LEN);
 			break;
 		case DB_HASHMAGIC:
 			if (type != DB_HASH && type != DB_UNKNOWN)
@@ -425,7 +446,7 @@ retry:		switch (((BTMETA *)mbuf)->magic) {
 
 			/* Copy the file's unique id. */
 			need_fileid = 0;
-			memcpy(dbp->lock.fileid, hashm->uid, DB_FILE_ID_LEN);
+			memcpy(dbp->fileid, hashm->uid, DB_FILE_ID_LEN);
 			break;
 		default:
 			if (swapped) {
@@ -489,11 +510,9 @@ empty:	/*
 		F_SET(dbp, DB_AM_MLOCAL);
 
 		if (dbenv == NULL) {
-			if ((dbp->mp_dbenv =
-			    (DB_ENV *)__db_calloc(sizeof(DB_ENV), 1)) == NULL) {
-				ret = ENOMEM;
+			if ((ret = __os_calloc(1,
+			    sizeof(DB_ENV), &dbp->mp_dbenv)) != 0)
 				goto err;
-			}
 
 			envp = dbp->mp_dbenv;
 			restore = 0;
@@ -554,20 +573,20 @@ empty:	/*
 	 */
 	if (need_fileid) {
 		if (fname == NULL) {
-			memset(dbp->lock.fileid, 0, DB_FILE_ID_LEN);
+			memset(dbp->fileid, 0, DB_FILE_ID_LEN);
 			if (F_ISSET(dbp, DB_AM_LOCKING) &&
 			    (ret = lock_id(dbenv->lk_info,
-			    (u_int32_t *)dbp->lock.fileid)) != 0)
+			    (u_int32_t *)dbp->fileid)) != 0)
 				goto err;
 		} else
-			if ((ret = __db_fileid(dbenv,
-			    real_name, 1, dbp->lock.fileid)) != 0)
+			if ((ret = __os_fileid(dbenv,
+			    real_name, 1, dbp->fileid)) != 0)
 				goto err;
 	}
 
 	/* No further use for the real name. */
 	if (real_name != NULL)
-		FREES(real_name);
+		__os_freestr(real_name);
 	real_name = NULL;
 
 	/*
@@ -595,7 +614,7 @@ empty:	/*
 	memset(&finfo, 0, sizeof(finfo));
 	finfo.ftype = ftype;
 	finfo.pgcookie = &pgcookie;
-	finfo.fileid = dbp->lock.fileid;
+	finfo.fileid = dbp->fileid;
 	finfo.lsn_offset = 0;
 	finfo.clear_len = DB_PAGE_CLEAR_LEN;
 	if ((ret = memp_fopen(dbp->mp, fname,
@@ -605,12 +624,21 @@ empty:	/*
 
 	/*
 	 * XXX
-	 * Truly spectacular layering violation.  We need a per-thread mutex
-	 * that lives in shared memory (thanks, HP-UX!) and so we acquire a
-	 * pointer to the mpool one.
+	 * We need a per-thread mutex that lives in shared memory -- HP-UX
+	 * can't allocate mutexes in malloc'd memory.  Allocate it from the
+	 * shared memory region, since it's the only one that is guaranteed
+	 * to exist.
 	 */
-	if (F_ISSET(dbp, DB_AM_THREAD))
-		dbp->mutexp = dbp->mpf->mutexp;
+	if (F_ISSET(dbp, DB_AM_THREAD)) {
+		if ((ret = __memp_reg_alloc(dbp->mp,
+		    sizeof(db_mutex_t), NULL, &dbp->mutexp)) != 0)
+			goto err;
+		/*
+		 * Since we only get here if DB_THREAD was specified, we know
+		 * we have spinlocks and no file offset argument is needed.
+		 */
+		(void)__db_mutex_init(dbp->mutexp, 0);
+	}
 
 	/* Get a log file id. */
 	if (F_ISSET(dbp, DB_AM_LOGGING) &&
@@ -618,18 +646,6 @@ empty:	/*
 	    dbp, fname, type, &dbp->log_fileid)) != 0)
 		goto err;
 
-	/*
-	 * Get a locker id for this DB, and build the lock cookie: the first
-	 * db_pgno_t bytes are the page number, the next N bytes are the file
-	 * id.
-	 */
-	if (F_ISSET(dbp, DB_AM_LOCKING)) {
-		if ((ret = lock_id(dbenv->lk_info, &dbp->locker)) != 0)
-			goto err;
-		dbp->lock_dbt.size = sizeof(dbp->lock);
-		dbp->lock_dbt.data = &dbp->lock;
-	}
-
 	/* Call the real open function. */
 	switch (type) {
 	case DB_BTREE:
@@ -639,7 +655,7 @@ empty:	/*
 		if (dbinfo != NULL && (ret = __db_fcchk(dbenv,
 		    "db_open", dbinfo->flags, DB_DUP, DB_RECNUM)) != 0)
 			goto err;
-		if ((ret = __bam_open(dbp, type, dbinfo)) != 0)
+		if ((ret = __bam_open(dbp, dbinfo)) != 0)
 			goto err;
 		break;
 	case DB_HASH:
@@ -655,24 +671,20 @@ empty:	/*
 		if (dbinfo != NULL && (ret = __db_fchk(dbenv,
 		    "db_open", dbinfo->flags, DB_INFO_FLAGS)) != 0)
 			goto err;
-		if ((ret = __ram_open(dbp, type, dbinfo)) != 0)
+		if ((ret = __ram_open(dbp, dbinfo)) != 0)
 			goto err;
 		break;
 	default:
 		abort();
 	}
 
-	/* Call a local close routine. */
-	dbp->close = db_close;
-	dbp->fd = db_fd;
-
 	*dbpp = dbp;
 	return (0);
 
 einval:	ret = EINVAL;
 err:	/* Close the file descriptor. */
 	if (fd != -1)
-		(void)__db_close(fd);
+		(void)__os_close(fd);
 
 	/* Discard the log file id. */
 	if (dbp->log_fileid != 0)
@@ -688,90 +700,60 @@ err:	/* Close the file descriptor. */
 
 	/* If we allocated a DB_ENV, discard it. */
 	if (dbp->mp_dbenv != NULL)
-		FREE(dbp->mp_dbenv, sizeof(DB_ENV));
+		__os_free(dbp->mp_dbenv, sizeof(DB_ENV));
 
 	if (real_name != NULL)
-		FREES(real_name);
+		__os_freestr(real_name);
 	if (dbp != NULL)
-		FREE(dbp, sizeof(DB));
+		__os_free(dbp, sizeof(DB));
 
 	return (ret);
 }
 
-#ifdef _LIBC
-# undef db_open
-weak_alias (__nss_db_open, db_open)
-#endif
-
 /*
- * db_close --
+ * __db_close --
  *	Close a DB tree.
+ *
+ * PUBLIC: int __db_close __P((DB *, u_int32_t));
  */
-static int
-db_close(dbp, flags)
+int
+__db_close(dbp, flags)
 	DB *dbp;
 	u_int32_t flags;
 {
 	DBC *dbc;
-	DB *tdbp;
 	int ret, t_ret;
 
+	DB_PANIC_CHECK(dbp);
+
 	/* Validate arguments. */
-	if ((ret = __db_fchk(dbp->dbenv, "db_close", flags, DB_NOSYNC)) != 0)
+	if ((ret = __db_closechk(dbp, flags)) != 0)
 		return (ret);
 
 	/* Sync the underlying file. */
-	if (!LF_ISSET(DB_NOSYNC) &&
+	if (flags != DB_NOSYNC &&
 	    (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0)
 		ret = t_ret;
 
 	/*
-	 * Call the underlying access method close routine for all the
-	 * cursors and handles.
+	 * Go through the active cursors and call the cursor recycle routine,
+	 * which resolves pending operations and moves the cursors onto the
+	 * free list.  Then, walk the free list and call the cursor destroy
+	 * routine.
 	 */
-	for (tdbp = LIST_FIRST(&dbp->handleq);
-	    tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) {
-		while ((dbc = TAILQ_FIRST(&tdbp->curs_queue)) != NULL)
-			switch (tdbp->type) {
-			case DB_BTREE:
-				if ((t_ret =
-				    __bam_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			case DB_HASH:
-				if ((t_ret =
-				    __ham_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			case DB_RECNO:
-				if ((t_ret =
-				    __ram_c_iclose(tdbp, dbc)) != 0 && ret == 0)
-					ret = t_ret;
-				break;
-			default:
-				abort();
-			}
-
-		switch (tdbp->type) {
-		case DB_BTREE:
-			if ((t_ret = __bam_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		case DB_HASH:
-			if ((t_ret = __ham_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		case DB_RECNO:
-			if ((t_ret = __ram_close(tdbp)) != 0 && ret == 0)
-				ret = t_ret;
-			break;
-		default:
-			abort();
-		}
-	}
+	while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+		if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+	/* Call the access specific close function. */
+	if ((t_ret = dbp->am_close(dbp)) != 0 && ret == 0)
+		ret = t_ret;
 
 	/* Sync the memory pool. */
-	if (!LF_ISSET(DB_NOSYNC) && (t_ret = memp_fsync(dbp->mpf)) != 0 &&
+	if (flags != DB_NOSYNC && (t_ret = memp_fsync(dbp->mpf)) != 0 &&
 	    t_ret != DB_INCOMPLETE && ret == 0)
 		ret = t_ret;
 
@@ -788,91 +770,12 @@ db_close(dbp, flags)
 	if (F_ISSET(dbp, DB_AM_LOGGING))
 		(void)log_unregister(dbp->dbenv->lg_info, dbp->log_fileid);
 
-	/* Discard the lock cookie for all handles. */
-	for (tdbp = LIST_FIRST(&dbp->handleq);
-	    tdbp != NULL; tdbp = LIST_NEXT(tdbp, links))
-		if (F_ISSET(tdbp, DB_AM_LOCKING)) {
-#ifdef DEBUG
-			DB_LOCKREQ request;
-
-			/*
-			 * If we're running tests, display any locks currently
-			 * held.  It's possible that some applications may hold
-			 * locks for long periods, e.g., conference room locks,
-			 * but the DB tests should never close holding locks.
-			 */
-			request.op = DB_LOCK_DUMP;
-			if ((t_ret = lock_vec(tdbp->dbenv->lk_info,
-			    tdbp->locker, 0, &request, 1, NULL)) != 0 &&
-			    ret == 0)
-				ret = EAGAIN;
-#endif
-		}
-
 	/* If we allocated a DB_ENV, discard it. */
 	if (dbp->mp_dbenv != NULL)
-		FREE(dbp->mp_dbenv, sizeof(DB_ENV));
+		__os_free(dbp->mp_dbenv, sizeof(DB_ENV));
 
-	/* Free all of the DB's. */
-	LIST_REMOVE(dbp, links);
-	while ((tdbp = LIST_FIRST(&dbp->handleq)) != NULL) {
-		LIST_REMOVE(tdbp, links);
-		FREE(tdbp, sizeof(*tdbp));
-	}
-	FREE(dbp, sizeof(*dbp));
+	/* Free the DB. */
+	__os_free(dbp, sizeof(*dbp));
 
 	return (ret);
 }
-
-/*
- * db_fd --
- *	Return a file descriptor for flock'ing.
- */
-static int
-db_fd(dbp, fdp)
-        DB *dbp;
-	int *fdp;
-{
-	/*
-	 * XXX
-	 * Truly spectacular layering violation.
-	 */
-	return (__mp_xxx_fd(dbp->mpf, fdp));
-}
-
-/*
- * __db_pgerr --
- *	Error when unable to retrieve a specified page.
- *
- * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t));
- */
-int
-__db_pgerr(dbp, pgno)
-	DB *dbp;
-	db_pgno_t pgno;
-{
-	/*
-	 * Three things are certain:
-	 * Death, taxes, and lost data.
-	 * Guess which has occurred.
-	 */
-	__db_err(dbp->dbenv,
-	    "unable to create/retrieve page %lu", (u_long)pgno);
-	return (__db_panic(dbp));
-}
-
-/*
- * __db_pgfmt --
- *	Error when a page has the wrong format.
- *
- * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t));
- */
-int
-__db_pgfmt(dbp, pgno)
-	DB *dbp;
-	db_pgno_t pgno;
-{
-	__db_err(dbp->dbenv,
-	    "page %lu: illegal page type or format", (u_long)pgno);
-	return (__db_panic(dbp));
-}
diff --git a/db2/db/db.src b/db2/db/db.src
index 91d8b390a1..26557e10ac 100644
--- a/db2/db/db.src
+++ b/db2/db/db.src
@@ -4,7 +4,7 @@
  * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  *
- *	@(#)db.src	10.6 (Sleepycat) 4/28/98
+ *	@(#)db.src	10.8 (Sleepycat) 9/20/98
  */
 
 PREFIX	db
@@ -98,6 +98,7 @@ END
 /*
  * relink -- Handles relinking around a page.
  *
+ * opcode:	indicates if this is an addpage or delete page
  * pgno:	the page being changed.
  * lsn		the page's original lsn.
  * prev:	the previous page.
@@ -106,6 +107,7 @@ END
  * lsn_next:	the previous page's original lsn.
  */
 BEGIN relink
+ARG	opcode		u_int32_t	lu
 ARG	fileid		u_int32_t	lu
 ARG	pgno		db_pgno_t	lu
 POINTER	lsn		DB_LSN *	lu
@@ -148,12 +150,3 @@ DBT	key		DBT		s
 DBT	data		DBT		s
 ARG	arg_flags	u_int32_t	lu
 END
-
-/*
- * noop -- do nothing, but get an LSN.
- */
-BEGIN noop
-ARG	fileid		u_int32_t	lu
-ARG	pgno		db_pgno_t	lu
-POINTER	prevlsn		DB_LSN *	lu
-END
diff --git a/db2/db/db_am.c b/db2/db/db_am.c
new file mode 100644
index 0000000000..e02ad57f53
--- /dev/null
+++ b/db2/db/db_am.c
@@ -0,0 +1,430 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_am.c	10.15 (Sleepycat) 12/30/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "btree.h"
+#include "hash.h"
+#include "db_am.h"
+#include "db_ext.h"
+
+static int __db_c_close __P((DBC *));
+static int __db_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t));
+static int __db_fd __P((DB *, int *));
+static int __db_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __db_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+
+/*
+ * __db_init_wrapper --
+ *	Wrapper layer to implement generic DB functions.
+ *
+ *	Fills in the method pointers of the DB handle with the generic
+ *	implementations (close, cursor, fd, get, join, put, sync).  The del
+ *	and stat pointers are set to NULL here and must be set by the access
+ *	method.
+ *
+ *	dbp:	the handle whose method pointers are initialized.
+ *
+ *	Always returns 0.
+ *
+ * PUBLIC: int __db_init_wrapper __P((DB *));
+ */
+int
+__db_init_wrapper(dbp)
+	DB *dbp;
+{
+	dbp->close = __db_close;
+	dbp->cursor = __db_cursor;
+	dbp->del = NULL;		/* !!! Must be set by access method. */
+	dbp->fd = __db_fd;
+	dbp->get = __db_get;
+	dbp->join = __db_join;
+	dbp->put = __db_put;
+	dbp->stat = NULL;		/* !!! Must be set by access method. */
+	dbp->sync = __db_sync;
+
+	return (0);
+}
+
+/*
+ * __db_cursor --
+ *	Allocate and return a cursor.
+ *
+ *	A cursor is taken from the handle's free queue when one is
+ *	available.  Otherwise a new DBC is allocated, given locking
+ *	information when DB_AM_LOCKING or DB_AM_CDB is set on the handle,
+ *	and initialized by __bam_c_init (btree and recno) or __ham_c_init
+ *	(hash).  The cursor is appended to the active queue before it is
+ *	returned.
+ *
+ *	dbp:	the DB handle.
+ *	txn:	enclosing transaction or NULL; with a transaction the
+ *		cursor's locker is txn->txnid, otherwise the cursor's lid.
+ *	dbcp:	on success, set to the cursor.
+ *	flags:	examined only when DB_AM_CDB is set, where DB_WRITELOCK and
+ *		DB_RMW select the mode of the lock acquired here.
+ *
+ *	Returns 0 on success.  Failures of __os_calloc, lock_id and the
+ *	cursor initialization routine return that routine's error, EINVAL
+ *	is returned for an unrecognized dbp->type, and EAGAIN is returned
+ *	when the CDB lock_get call fails.
+ */
+static int
+__db_cursor(dbp, txn, dbcp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBC **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc, *adbc;
+	int ret;
+	db_lockmode_t mode;
+	u_int32_t op;
+
+	DB_PANIC_CHECK(dbp);
+
+	/* Take one from the free list if it's available. */
+	DB_THREAD_LOCK(dbp);
+	if ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+		TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+	else {
+		DB_THREAD_UNLOCK(dbp);
+
+		if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0)
+			return (ret);
+
+		dbc->dbp = dbp;
+		dbc->c_close = __db_c_close;
+
+		/* Set up locking information. */
+		if (F_ISSET(dbp, DB_AM_LOCKING | DB_AM_CDB)) {
+ 			/*
+ 			 * If we are not threaded, then there is no need to
+ 			 * create new locker ids.  We know that no one else
+ 			 * is running concurrently using this DB, so we can
+ 			 * take a peek at any cursors on the active queue.
+ 			 */
+ 			if (!F_ISSET(dbp, DB_AM_THREAD) &&
+ 			    (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+ 				dbc->lid = adbc->lid;
+ 			else
+ 				if ((ret = lock_id(dbp->dbenv->lk_info,
+ 				    &dbc->lid)) != 0)
+ 					goto err;
+ 
+			memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN);
+			if (F_ISSET(dbp, DB_AM_CDB)) {
+				dbc->lock_dbt.size = DB_FILE_ID_LEN;
+				dbc->lock_dbt.data = dbc->lock.fileid;
+			} else {
+				dbc->lock_dbt.size = sizeof(dbc->lock);
+				dbc->lock_dbt.data = &dbc->lock;
+			}
+		}
+
+		switch (dbp->type) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bam_c_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __ham_c_init(dbc)) != 0)
+				goto err;
+			break;
+		default:
+			ret = EINVAL;
+			goto err;
+		}
+
+		DB_THREAD_LOCK(dbp);
+	}
+
+	if ((dbc->txn = txn) == NULL)
+		dbc->locker = dbc->lid;
+	else
+		dbc->locker = txn->txnid;
+
+	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/*
+	 * If this is the concurrent DB product, then we do all locking
+	 * in the interface, which is right here.
+	 */
+	if (F_ISSET(dbp, DB_AM_CDB)) {
+		op = LF_ISSET(DB_OPFLAGS_MASK);
+		mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE :
+		    (LF_ISSET(DB_RMW) ? DB_LOCK_IWRITE : DB_LOCK_READ);
+		if ((ret = lock_get(dbp->dbenv->lk_info, dbc->locker, 0,
+		    &dbc->lock_dbt, mode, &dbc->mylock)) != 0) {
+			(void)__db_c_close(dbc);
+			return (EAGAIN);
+		}
+		if (LF_ISSET(DB_RMW))
+			F_SET(dbc, DBC_RMW);
+		if (op == DB_WRITELOCK)
+			F_SET(dbc, DBC_WRITER);
+	}
+
+	*dbcp = dbc;
+	return (0);
+
+err:	__os_free(dbc, sizeof(*dbc));
+	return (ret);
+}
+
+/*
+ * __db_c_close --
+ *	Close the cursor (recycle for later use).
+ *
+ *	The cursor is moved from the DB handle's active queue to its free
+ *	queue rather than being freed, so a later cursor request can reuse
+ *	it.
+ *
+ *	dbc:	the cursor to close.
+ *
+ *	Returns 0 on success, otherwise the first error reported by the
+ *	access method's cursor close routine or by lock_put.
+ */
+static int
+__db_c_close(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+
+	DB_PANIC_CHECK(dbp);
+
+	ret = 0;
+
+	/*
+	 * We cannot release the lock until after we've called the
+	 * access method specific routine, since btrees may have pending
+	 * deletes.
+	 */
+
+	/* Remove the cursor from the active queue. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/* Call the access specific cursor close routine. */
+	if ((t_ret = dbc->c_am_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release the lock, keeping any earlier error as the return value. */
+	if (F_ISSET(dbc->dbp, DB_AM_CDB) && dbc->mylock != LOCK_INVALID) {
+		if ((t_ret = lock_put(
+		    dbc->dbp->dbenv->lk_info, dbc->mylock)) != 0 && ret == 0)
+			ret = t_ret;
+		dbc->mylock = LOCK_INVALID;
+	}
+
+	/* Clean up the cursor. */
+	dbc->flags = 0;
+
+#ifdef DEBUG
+	/*
+	 * Check for leftover locks, unless we're running with transactions.
+	 *
+	 * If we're running tests, display any locks currently held.  It's
+	 * possible that some applications may hold locks for long periods,
+	 * e.g., conference room locks, but the DB tests should never close
+	 * holding locks.
+	 */
+	if (F_ISSET(dbp, DB_AM_LOCKING) && dbc->lid == dbc->locker) {
+		DB_LOCKREQ request;
+
+		request.op = DB_LOCK_DUMP;
+		if ((t_ret = lock_vec(dbp->dbenv->lk_info,
+		    dbc->locker, 0, &request, 1, NULL)) != 0 && ret == 0)
+			ret = EAGAIN;
+	}
+#endif
+	/* Move the cursor to the free queue. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __db_cprint --
+ *	Display the current cursor list.
+ *
+ *	Writes one line to stderr for each cursor on the handle's active
+ *	queue: the cursor, DB handle and transaction addresses, the lid and
+ *	locker values, and the cursor flags as formatted by __db_prflags.
+ *	The queue is walked between DB_THREAD_LOCK and DB_THREAD_UNLOCK.
+ *
+ *	dbp:	the DB handle whose active cursors are listed.
+ *
+ *	Always returns 0.
+ *
+ * PUBLIC: int __db_cprint __P((DB *));
+ */
+int
+__db_cprint(dbp)
+	DB *dbp;
+{
+	static const FN fn[] = {
+		{ DBC_RECOVER, 	"recover" },
+		{ DBC_RMW, 	"read-modify-write" },
+		{ 0 },
+	};
+	DBC *dbc;
+
+	DB_THREAD_LOCK(dbp);
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+		/*
+		 * Print addresses with %p: casting a pointer to u_int drops
+		 * the upper bits wherever pointers are wider than u_int.
+		 */
+		fprintf(stderr,
+		    "%p: dbp: %p txn: %p lid: %lu locker: %lu",
+		    (void *)dbc, (void *)dbc->dbp, (void *)dbc->txn,
+		    (u_long)dbc->lid, (u_long)dbc->locker);
+		__db_prflags(dbc->flags, fn, stderr);
+		fprintf(stderr, "\n");
+	}
+	DB_THREAD_UNLOCK(dbp);
+
+	return (0);
+}
+#endif /* DEBUG */
+
+/*
+ * __db_c_destroy --
+ *	Destroy the cursor.
+ *
+ *	Removes the cursor from the DB handle's free queue, calls the access
+ *	method's destroy routine when one is set, and frees the rkey and
+ *	rdata buffers and the cursor itself.
+ *
+ *	dbc:	a cursor on the free queue; it must not be used afterwards.
+ *
+ *	Returns the value returned by the access method's destroy routine,
+ *	or 0 when the cursor has none.  The cursor memory is freed in either
+ *	case.
+ *
+ * PUBLIC: int __db_c_destroy __P((DBC *));
+ */
+int
+__db_c_destroy(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	int ret;
+
+	dbp = dbc->dbp;
+
+	/* Remove the cursor from the free queue. */
+	DB_THREAD_LOCK(dbp);
+	TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+	DB_THREAD_UNLOCK(dbp);
+
+	/* Call the access specific cursor destroy routine. */
+	ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc);
+
+	/* Free up allocated memory. */
+	if (dbc->rkey.data != NULL)
+		__os_free(dbc->rkey.data, dbc->rkey.ulen);
+	if (dbc->rdata.data != NULL)
+		__os_free(dbc->rdata.data, dbc->rdata.ulen);
+	__os_free(dbc, sizeof(*dbc));
+
+	/* Report the destroy routine's result; callers check it. */
+	return (ret);
+}
+
+/*
+ * __db_fd --
+ *	Return a file descriptor for flock'ing.
+ *
+ *	dbp:	the DB handle.
+ *	fdp:	passed to __mp_xxx_fd together with dbp->mpf; presumably
+ *		receives the descriptor -- confirm against __mp_xxx_fd.
+ *
+ *	Returns the value returned by __mp_xxx_fd.
+ */
+static int
+__db_fd(dbp, fdp)
+        DB *dbp;
+	int *fdp;
+{
+	DB_PANIC_CHECK(dbp);
+
+	/*
+	 * XXX
+	 * Truly spectacular layering violation.
+	 */
+	return (__mp_xxx_fd(dbp->mpf, fdp));
+}
+
+/*
+ * __db_get --
+ *	Return a key/data pair.
+ *
+ *	Implemented on top of a cursor: the arguments are validated with
+ *	__db_getchk, a cursor is opened through dbp->cursor, the lookup is
+ *	done with the cursor's c_get method, and the cursor is closed again.
+ *	When flags is 0 or DB_RMW, DB_SET is or'd in before calling c_get;
+ *	any other flags value is passed through unchanged.
+ *
+ *	Returns an error from __db_getchk or from opening the cursor
+ *	immediately.  Otherwise returns the c_get result, or, when c_get
+ *	returned 0, the result of closing the cursor.
+ */
+static int
+__db_get(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	int ret, t_ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	if ((ret = __db_getchk(dbp, key, data, flags)) != 0)
+		return (ret);
+
+	if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LREAD(dbc, txn, "__db_get", key, NULL, flags);
+
+	ret = dbc->c_get(dbc, key, data,
+	    flags == 0 || flags == DB_RMW ? flags | DB_SET : flags);
+
+	if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_put --
+ *	Store a key/data pair.
+ *
+ *	Implemented on top of a cursor opened with DB_WRITELOCK.  The
+ *	arguments are validated with __db_putchk first.  When flags is
+ *	DB_NOOVERWRITE, the key is looked up with a zero-length partial
+ *	c_get (DB_SET | DB_RMW) and DB_KEYEXIST is returned if that lookup
+ *	succeeds.  Otherwise the pair is stored with c_put using DB_KEYLAST.
+ *
+ *	Returns an error from __db_putchk or from opening the cursor
+ *	immediately.  Otherwise returns DB_KEYEXIST, the c_put result, or,
+ *	when c_put returned 0, the result of closing the cursor.
+ */
+static int
+__db_put(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT tdata;
+	int ret, t_ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	if ((ret = __db_putchk(dbp, key, data,
+	    flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0)
+		return (ret);
+
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags);
+
+	if (flags == DB_NOOVERWRITE) {
+		/*
+		 * Set DB_DBT_USERMEM, this might be a threaded application and
+		 * the flags checking will catch us.  We don't want the actual
+		 * data, so request a partial of length 0.
+		 *
+		 * NOTE(review): every non-zero c_get return is mapped to 0
+		 * below, so the put proceeds after any lookup failure, not
+		 * only after a "not found" result -- confirm this is intended.
+		 */
+		memset(&tdata, 0, sizeof(tdata));
+		F_SET(&tdata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+		if ((ret = dbc->c_get(dbc, key, &tdata, DB_SET | DB_RMW)) == 0)
+			ret = DB_KEYEXIST;
+		else
+			ret = 0;
+	}
+	if (ret == 0)
+		ret = dbc->c_put(dbc, key, data, DB_KEYLAST);
+
+	if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_sync --
+ *	Flush the database cache.
+ *
+ *	The flags are validated with __db_syncchk.  Handles marked
+ *	DB_AM_INMEM or DB_AM_RDONLY return 0 without flushing.  Otherwise
+ *	memp_fsync is called on dbp->mpf; a DB_INCOMPLETE result from
+ *	memp_fsync is reported to the caller as 0.
+ *
+ *	Returns 0 on success, otherwise the error from __db_syncchk or
+ *	memp_fsync.
+ *
+ * PUBLIC: int __db_sync __P((DB *, u_int32_t));
+ */
+int
+__db_sync(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	int ret;
+
+	DB_PANIC_CHECK(dbp);
+
+	if ((ret = __db_syncchk(dbp, flags)) != 0)
+		return (ret);
+
+	/* If it wasn't possible to modify the file, we're done. */
+	if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY))
+		return (0);
+
+	/* Flush any dirty pages from the cache to the backing file. */
+	if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE)
+		ret = 0;
+
+	return (ret);
+}
diff --git a/db2/db/db_auto.c b/db2/db/db_auto.c
index 5203e0a94c..e3dba23c8b 100644
--- a/db2/db/db_auto.c
+++ b/db2/db/db_auto.c
@@ -10,7 +10,6 @@
 #endif
 
 #include "db_int.h"
-#include "shqueue.h"
 #include "db_page.h"
 #include "db_dispatch.h"
 #include "db_am.h"
@@ -46,8 +45,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_addrem;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -60,8 +58,8 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size)
 	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
 	    + sizeof(*pagelsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -112,7 +110,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -174,7 +172,7 @@ __db_addrem_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tpagelsn: [%lu][%lu]\n",
 	    (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -188,11 +186,12 @@ __db_addrem_read(recbuf, argpp)
 {
 	__db_addrem_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_addrem_args *)__db_malloc(sizeof(__db_addrem_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_addrem_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -253,8 +252,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_split;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -264,8 +262,8 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(u_int32_t) + (pageimage == NULL ? 0 : pageimage->size)
 	    + sizeof(*pagelsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -302,7 +300,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -353,7 +351,7 @@ __db_split_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tpagelsn: [%lu][%lu]\n",
 	    (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -367,11 +365,12 @@ __db_split_read(recbuf, argpp)
 {
 	__db_split_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_split_args *)__db_malloc(sizeof(__db_split_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_split_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -430,8 +429,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_big;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -445,8 +443,8 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*pagelsn)
 	    + sizeof(*prevlsn)
 	    + sizeof(*nextlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -497,7 +495,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -554,7 +552,7 @@ __db_big_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tnextlsn: [%lu][%lu]\n",
 	    (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -568,11 +566,12 @@ __db_big_read(recbuf, argpp)
 {
 	__db_big_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_big_args *)__db_malloc(sizeof(__db_big_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_big_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -630,8 +629,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_ovref;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -640,8 +638,8 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(pgno)
 	    + sizeof(adjust)
 	    + sizeof(*lsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -668,7 +666,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -710,7 +708,7 @@ __db_ovref_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tlsn: [%lu][%lu]\n",
 	    (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -724,11 +722,12 @@ __db_ovref_read(recbuf, argpp)
 {
 	__db_ovref_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_ovref_args *)__db_malloc(sizeof(__db_ovref_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_ovref_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -752,16 +751,17 @@ __db_ovref_read(recbuf, argpp)
 /*
  * PUBLIC: int __db_relink_log
  * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
- * PUBLIC:     u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t,
- * PUBLIC:     DB_LSN *, db_pgno_t, DB_LSN *));
+ * PUBLIC:     u_int32_t, u_int32_t, db_pgno_t, DB_LSN *,
+ * PUBLIC:     db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *));
  */
 int __db_relink_log(logp, txnid, ret_lsnp, flags,
-	fileid, pgno, lsn, prev, lsn_prev, next,
-	lsn_next)
+	opcode, fileid, pgno, lsn, prev, lsn_prev,
+	next, lsn_next)
 	DB_LOG *logp;
 	DB_TXN *txnid;
 	DB_LSN *ret_lsnp;
 	u_int32_t flags;
+	u_int32_t opcode;
 	u_int32_t fileid;
 	db_pgno_t pgno;
 	DB_LSN * lsn;
@@ -779,12 +779,12 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_relink;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
 	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(opcode)
 	    + sizeof(fileid)
 	    + sizeof(pgno)
 	    + sizeof(*lsn)
@@ -792,8 +792,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*lsn_prev)
 	    + sizeof(next)
 	    + sizeof(*lsn_next);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -802,6 +802,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	bp += sizeof(txn_num);
 	memcpy(bp, lsnp, sizeof(DB_LSN));
 	bp += sizeof(DB_LSN);
+	memcpy(bp, &opcode, sizeof(opcode));
+	bp += sizeof(opcode);
 	memcpy(bp, &fileid, sizeof(fileid));
 	bp += sizeof(fileid);
 	memcpy(bp, &pgno, sizeof(pgno));
@@ -832,7 +834,7 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -868,6 +870,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3)
 	    (u_long)argp->txnid->txnid,
 	    (u_long)argp->prev_lsn.file,
 	    (u_long)argp->prev_lsn.offset);
+	printf("\topcode: %lu\n", (u_long)argp->opcode);
 	printf("\tfileid: %lu\n", (u_long)argp->fileid);
 	printf("\tpgno: %lu\n", (u_long)argp->pgno);
 	printf("\tlsn: [%lu][%lu]\n",
@@ -879,7 +882,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tlsn_next: [%lu][%lu]\n",
 	    (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -893,11 +896,12 @@ __db_relink_read(recbuf, argpp)
 {
 	__db_relink_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_relink_args *)__db_malloc(sizeof(__db_relink_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_relink_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -906,6 +910,8 @@ __db_relink_read(recbuf, argpp)
 	bp += sizeof(argp->txnid->txnid);
 	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
 	bp += sizeof(DB_LSN);
+	memcpy(&argp->opcode, bp, sizeof(argp->opcode));
+	bp += sizeof(argp->opcode);
 	memcpy(&argp->fileid, bp, sizeof(argp->fileid));
 	bp += sizeof(argp->fileid);
 	memcpy(&argp->pgno, bp, sizeof(argp->pgno));
@@ -951,8 +957,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_addpage;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -962,8 +967,8 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(*lsn)
 	    + sizeof(nextpgno)
 	    + sizeof(*nextlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -995,7 +1000,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1039,7 +1044,7 @@ __db_addpage_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\tnextlsn: [%lu][%lu]\n",
 	    (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1053,11 +1058,12 @@ __db_addpage_read(recbuf, argpp)
 {
 	__db_addpage_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_addpage_args *)__db_malloc(sizeof(__db_addpage_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_addpage_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1108,8 +1114,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	rectype = DB_db_debug;
 	txn_num = txnid == NULL ? 0 : txnid->txnid;
 	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
+		ZERO_LSN(null_lsn);
 		lsnp = &null_lsn;
 	} else
 		lsnp = &txnid->last_lsn;
@@ -1119,8 +1124,8 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	    + sizeof(u_int32_t) + (key == NULL ? 0 : key->size)
 	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size)
 	    + sizeof(arg_flags);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0)
+		return (ret);
 
 	bp = logrec.data;
 	memcpy(bp, &rectype, sizeof(rectype));
@@ -1170,7 +1175,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags,
 	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
 	if (txnid != NULL)
 		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
+	__os_free(logrec.data, 0);
 	return (ret);
 }
 
@@ -1236,7 +1241,7 @@ __db_debug_print(notused1, dbtp, lsnp, notused2, notused3)
 	printf("\n");
 	printf("\targ_flags: %lu\n", (u_long)argp->arg_flags);
 	printf("\n");
-	__db_free(argp);
+	__os_free(argp, 0);
 	return (0);
 }
 
@@ -1250,11 +1255,12 @@ __db_debug_read(recbuf, argpp)
 {
 	__db_debug_args *argp;
 	u_int8_t *bp;
+	int ret;
 
-	argp = (__db_debug_args *)__db_malloc(sizeof(__db_debug_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
+	ret = __os_malloc(sizeof(__db_debug_args) +
+	    sizeof(DB_TXN), NULL, &argp);
+	if (ret != 0)
+		return (ret);
 	argp->txnid = (DB_TXN *)&argp[1];
 	bp = recbuf;
 	memcpy(&argp->type, bp, sizeof(argp->type));
@@ -1284,143 +1290,6 @@ __db_debug_read(recbuf, argpp)
 }
 
 /*
- * PUBLIC: int __db_noop_log
- * PUBLIC:     __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
- * PUBLIC:     u_int32_t, db_pgno_t, DB_LSN *));
- */
-int __db_noop_log(logp, txnid, ret_lsnp, flags,
-	fileid, pgno, prevlsn)
-	DB_LOG *logp;
-	DB_TXN *txnid;
-	DB_LSN *ret_lsnp;
-	u_int32_t flags;
-	u_int32_t fileid;
-	db_pgno_t pgno;
-	DB_LSN * prevlsn;
-{
-	DBT logrec;
-	DB_LSN *lsnp, null_lsn;
-	u_int32_t rectype, txn_num;
-	int ret;
-	u_int8_t *bp;
-
-	rectype = DB_db_noop;
-	txn_num = txnid == NULL ? 0 : txnid->txnid;
-	if (txnid == NULL) {
-		null_lsn.file = 0;
-		null_lsn.offset = 0;
-		lsnp = &null_lsn;
-	} else
-		lsnp = &txnid->last_lsn;
-	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
-	    + sizeof(fileid)
-	    + sizeof(pgno)
-	    + sizeof(*prevlsn);
-	if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL)
-		return (ENOMEM);
-
-	bp = logrec.data;
-	memcpy(bp, &rectype, sizeof(rectype));
-	bp += sizeof(rectype);
-	memcpy(bp, &txn_num, sizeof(txn_num));
-	bp += sizeof(txn_num);
-	memcpy(bp, lsnp, sizeof(DB_LSN));
-	bp += sizeof(DB_LSN);
-	memcpy(bp, &fileid, sizeof(fileid));
-	bp += sizeof(fileid);
-	memcpy(bp, &pgno, sizeof(pgno));
-	bp += sizeof(pgno);
-	if (prevlsn != NULL)
-		memcpy(bp, prevlsn, sizeof(*prevlsn));
-	else
-		memset(bp, 0, sizeof(*prevlsn));
-	bp += sizeof(*prevlsn);
-#ifdef DIAGNOSTIC
-	if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size)
-		fprintf(stderr, "Error in log record length");
-#endif
-	ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags);
-	if (txnid != NULL)
-		txnid->last_lsn = *ret_lsnp;
-	__db_free(logrec.data);
-	return (ret);
-}
-
-/*
- * PUBLIC: int __db_noop_print
- * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
- */
-int
-__db_noop_print(notused1, dbtp, lsnp, notused2, notused3)
-	DB_LOG *notused1;
-	DBT *dbtp;
-	DB_LSN *lsnp;
-	int notused2;
-	void *notused3;
-{
-	__db_noop_args *argp;
-	u_int32_t i;
-	u_int ch;
-	int ret;
-
-	i = 0;
-	ch = 0;
-	notused1 = NULL;
-	notused2 = 0;
-	notused3 = NULL;
-
-	if ((ret = __db_noop_read(dbtp->data, &argp)) != 0)
-		return (ret);
-	printf("[%lu][%lu]db_noop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
-	    (u_long)lsnp->file,
-	    (u_long)lsnp->offset,
-	    (u_long)argp->type,
-	    (u_long)argp->txnid->txnid,
-	    (u_long)argp->prev_lsn.file,
-	    (u_long)argp->prev_lsn.offset);
-	printf("\tfileid: %lu\n", (u_long)argp->fileid);
-	printf("\tpgno: %lu\n", (u_long)argp->pgno);
-	printf("\tprevlsn: [%lu][%lu]\n",
-	    (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
-	printf("\n");
-	__db_free(argp);
-	return (0);
-}
-
-/*
- * PUBLIC: int __db_noop_read __P((void *, __db_noop_args **));
- */
-int
-__db_noop_read(recbuf, argpp)
-	void *recbuf;
-	__db_noop_args **argpp;
-{
-	__db_noop_args *argp;
-	u_int8_t *bp;
-
-	argp = (__db_noop_args *)__db_malloc(sizeof(__db_noop_args) +
-	    sizeof(DB_TXN));
-	if (argp == NULL)
-		return (ENOMEM);
-	argp->txnid = (DB_TXN *)&argp[1];
-	bp = recbuf;
-	memcpy(&argp->type, bp, sizeof(argp->type));
-	bp += sizeof(argp->type);
-	memcpy(&argp->txnid->txnid,  bp, sizeof(argp->txnid->txnid));
-	bp += sizeof(argp->txnid->txnid);
-	memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
-	bp += sizeof(DB_LSN);
-	memcpy(&argp->fileid, bp, sizeof(argp->fileid));
-	bp += sizeof(argp->fileid);
-	memcpy(&argp->pgno, bp, sizeof(argp->pgno));
-	bp += sizeof(argp->pgno);
-	memcpy(&argp->prevlsn, bp,  sizeof(argp->prevlsn));
-	bp += sizeof(argp->prevlsn);
-	*argpp = argp;
-	return (0);
-}
-
-/*
  * PUBLIC: int __db_init_print __P((DB_ENV *));
  */
 int
@@ -1450,9 +1319,6 @@ __db_init_print(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __db_debug_print, DB_db_debug)) != 0)
 		return (ret);
-	if ((ret = __db_add_recovery(dbenv,
-	    __db_noop_print, DB_db_noop)) != 0)
-		return (ret);
 	return (0);
 }
 
@@ -1486,9 +1352,6 @@ __db_init_recover(dbenv)
 	if ((ret = __db_add_recovery(dbenv,
 	    __db_debug_recover, DB_db_debug)) != 0)
 		return (ret);
-	if ((ret = __db_add_recovery(dbenv,
-	    __db_noop_recover, DB_db_noop)) != 0)
-		return (ret);
 	return (0);
 }
 
diff --git a/db2/db/db_dispatch.c b/db2/db/db_dispatch.c
index 8645948614..616d08c3ff 100644
--- a/db2/db/db_dispatch.c
+++ b/db2/db/db_dispatch.c
@@ -43,13 +43,14 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_dispatch.c	10.14 (Sleepycat) 5/3/98";
+static const char sccsid[] = "@(#)db_dispatch.c	10.20 (Sleepycat) 10/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
+#include <shqueue.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
@@ -61,6 +62,7 @@ static const char sccsid[] = "@(#)db_dispatch.c	10.14 (Sleepycat) 5/3/98";
 #include "db_am.h"
 #include "common_ext.h"
 #include "log_auto.h"
+#include "txn.h"
 #include "txn_auto.h"
 
 /*
@@ -148,27 +150,16 @@ __db_add_recovery(dbenv, func, ndx)
 	u_int32_t ndx;
 {
 	u_int32_t i;
+	int ret;
 
-	/* Check if function is already registered. */
-	if (dispatch_table && ndx < dispatch_size &&
-	    dispatch_table[ndx] != 0 && dispatch_table[ndx] != func)
-		return (DB_REGISTERED);
+	COMPQUIET(dbenv, NULL);		/* !!!: not currently used. */
 
 	/* Check if we have to grow the table. */
 	if (ndx >= dispatch_size) {
-		if (dispatch_table == NULL)
-			dispatch_table = (int (**)
-			 __P((DB_LOG *, DBT *, DB_LSN *, int, void *)))
-			 __db_malloc(DB_user_BEGIN * sizeof(dispatch_table[0]));
-		else
-			dispatch_table = (int (**)
-			    __P((DB_LOG *, DBT *, DB_LSN *, int, void *)))
-			    __db_realloc(dispatch_table, (DB_user_BEGIN +
-			    dispatch_size) * sizeof(dispatch_table[0]));
-		if (dispatch_table == NULL) {
-			__db_err(dbenv, "%s", strerror(ENOMEM));
-			return (ENOMEM);
-		}
+		if ((ret = __os_realloc(&dispatch_table,
+		    (DB_user_BEGIN + dispatch_size) *
+		    sizeof(dispatch_table[0]))) != 0)
+			return (ret);
 		for (i = dispatch_size,
 		    dispatch_size += DB_user_BEGIN; i < dispatch_size; ++i)
 			dispatch_table[i] = NULL;
@@ -189,9 +180,10 @@ __db_txnlist_init(retp)
 	void *retp;
 {
 	DB_TXNHEAD *headp;
+	int ret;
 
-	if ((headp = (DB_TXNHEAD *)__db_malloc(sizeof(DB_TXNHEAD))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(sizeof(DB_TXNHEAD), NULL, &headp)) != 0)
+		return (ret);
 
 	LIST_INIT(&headp->head);
 	headp->maxid = 0;
@@ -214,9 +206,10 @@ __db_txnlist_add(listp, txnid)
 {
 	DB_TXNHEAD *hp;
 	DB_TXNLIST *elp;
+	int ret;
 
-	if ((elp = (DB_TXNLIST *)__db_malloc(sizeof(DB_TXNLIST))) == NULL)
-		return (ENOMEM);
+	if ((ret = __os_malloc(sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+		return (ret);
 
 	elp->txnid = txnid;
 	hp = (DB_TXNHEAD *)listp;
@@ -269,9 +262,9 @@ __db_txnlist_end(listp)
 	hp = (DB_TXNHEAD *)listp;
 	while ((p = LIST_FIRST(&hp->head)) != LIST_END(&hp->head)) {
 		LIST_REMOVE(p, links);
-		__db_free(p);
+		__os_free(p, 0);
 	}
-	__db_free(listp);
+	__os_free(listp, sizeof(DB_TXNHEAD));
 }
 
 /*
diff --git a/db2/db/db_dup.c b/db2/db/db_dup.c
index 6379fc1729..2673bbcd61 100644
--- a/db2/db/db_dup.c
+++ b/db2/db/db_dup.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_dup.c	10.18 (Sleepycat) 5/31/98";
+static const char sccsid[] = "@(#)db_dup.c	10.35 (Sleepycat) 12/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -23,25 +23,25 @@ static const char sccsid[] = "@(#)db_dup.c	10.18 (Sleepycat) 5/31/98";
 #include "btree.h"
 #include "db_am.h"
 
-static int __db_addpage __P((DB *,
-    PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
-static int __db_dsplit __P((DB *,
-    PAGE **, db_indx_t *, u_int32_t, int (*)(DB *, u_int32_t, PAGE **)));
+static int __db_addpage __P((DBC *,
+    PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **)));
+static int __db_dsplit __P((DBC *,
+    PAGE **, db_indx_t *, u_int32_t, int (*)(DBC *, u_int32_t, PAGE **)));
 
 /*
  * __db_dput --
  *	Put a duplicate item onto a duplicate page at the given index.
  *
- * PUBLIC: int __db_dput __P((DB *,
- * PUBLIC:    DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
+ * PUBLIC: int __db_dput __P((DBC *, DBT *,
+ * PUBLIC:    PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **)));
  */
 int
-__db_dput(dbp, dbt, pp, indxp, newfunc)
-	DB *dbp;
+__db_dput(dbc, dbt, pp, indxp, newfunc)
+	DBC *dbc;
 	DBT *dbt;
 	PAGE **pp;
 	db_indx_t *indxp;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
 	BOVERFLOW bo;
 	DBT *data_dbtp, hdr_dbt, *hdr_dbtp;
@@ -54,10 +54,12 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	 * We need some access method independent threshold for when we put
 	 * a duplicate item onto an overflow page.
 	 */
-	if (dbt->size > 0.25 * dbp->pgsize) {
-		if ((ret = __db_poff(dbp, dbt, &pgno, newfunc)) != 0)
+	if (dbt->size > 0.25 * dbc->dbp->pgsize) {
+		if ((ret = __db_poff(dbc, dbt, &pgno, newfunc)) != 0)
 			return (ret);
+		UMRW(bo.unused1);
 		B_TSET(bo.type, B_OVERFLOW, 0);
+		UMRW(bo.unused2);
 		bo.tlen = dbt->size;
 		bo.pgno = pgno;
 		hdr_dbt.data = &bo;
@@ -75,11 +77,14 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	pagep = *pp;
 	if (size > P_FREESPACE(pagep)) {
 		if (*indxp == NUM_ENT(*pp) && NEXT_PGNO(*pp) == PGNO_INVALID)
-			ret = __db_addpage(dbp, pp, indxp, newfunc);
+			ret = __db_addpage(dbc, pp, indxp, newfunc);
 		else
-			ret = __db_dsplit(dbp, pp, indxp, isize, newfunc);
+			ret = __db_dsplit(dbc, pp, indxp, isize, newfunc);
 		if (ret != 0)
-			/* XXX: Pages not returned to free list. */
+			/*
+			 * XXX
+			 * Pages not returned to free list.
+			 */
 			return (ret);
 		pagep = *pp;
 	}
@@ -88,11 +93,11 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
 	 * Now, pagep references the page on which to insert and indx is the
 	 * the location to insert.
 	 */
-	if ((ret = __db_pitem(dbp,
+	if ((ret = __db_pitem(dbc,
 	    pagep, (u_int32_t)*indxp, isize, hdr_dbtp, data_dbtp)) != 0)
 		return (ret);
 
-	(void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY);
+	(void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY);
 	return (0);
 }
 
@@ -100,15 +105,15 @@ __db_dput(dbp, dbt, pp, indxp, newfunc)
  * __db_drem --
  *	Remove a duplicate at the given index on the given page.
  *
- * PUBLIC: int __db_drem __P((DB *,
- * PUBLIC:    PAGE **, u_int32_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_drem __P((DBC *,
+ * PUBLIC:    PAGE **, u_int32_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_drem(dbp, pp, indx, freefunc)
-	DB *dbp;
+__db_drem(dbc, pp, indx, freefunc)
+	DBC *dbc;
 	PAGE **pp;
 	u_int32_t indx;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
 	PAGE *pagep;
 	int ret;
@@ -117,12 +122,12 @@ __db_drem(dbp, pp, indx, freefunc)
 
 	/* Check if we are freeing a big item. */
 	if (B_TYPE(GET_BKEYDATA(pagep, indx)->type) == B_OVERFLOW) {
-		if ((ret = __db_doff(dbp,
+		if ((ret = __db_doff(dbc,
 		    GET_BOVERFLOW(pagep, indx)->pgno, freefunc)) != 0)
 			return (ret);
-		ret = __db_ditem(dbp, pagep, indx, BOVERFLOW_SIZE);
+		ret = __db_ditem(dbc, pagep, indx, BOVERFLOW_SIZE);
 	} else
-		ret = __db_ditem(dbp, pagep, indx,
+		ret = __db_ditem(dbc, pagep, indx,
 		    BKEYDATA_SIZE(GET_BKEYDATA(pagep, indx)->len));
 	if (ret != 0)
 		return (ret);
@@ -137,12 +142,12 @@ __db_drem(dbp, pp, indx, freefunc)
 		 * !!!
 		 * __db_relink will set the dirty bit for us.
 		 */
-		if ((ret = __db_relink(dbp, pagep, pp, 0)) != 0)
+		if ((ret = __db_relink(dbc, DB_REM_PAGE, pagep, pp, 0)) != 0)
 			return (ret);
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} else
-		(void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY);
+		(void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY);
 
 	return (0);
 }
@@ -151,32 +156,41 @@ __db_drem(dbp, pp, indx, freefunc)
  * __db_dend --
  *	Find the last page in a set of offpage duplicates.
  *
- * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **));
+ * PUBLIC: int __db_dend __P((DBC *, db_pgno_t, PAGE **));
  */
 int
-__db_dend(dbp, pgno, pagep)
-	DB *dbp;
+__db_dend(dbc, pgno, pp)
+	DBC *dbc;
 	db_pgno_t pgno;
-	PAGE **pagep;
+	PAGE **pp;
 {
+	DB *dbp;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	/*
 	 * This implements DB_KEYLAST.  The last page is returned in pp; pgno
 	 * should be the page number of the first page of the duplicate chain.
+	 *
+	 * *pp may be non-NULL -- if given a valid page use it.
 	 */
+	if (*pp != NULL)
+		goto started;
 	for (;;) {
-		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
 			return (ret);
 		}
+started:	h = *pp;
+
 		if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
 			break;
-		(void)memp_fput(dbp->mpf, h, 0);
-	}
 
-	*pagep = h;
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+	}
 	return (0);
 }
 
@@ -191,41 +205,44 @@ __db_dend(dbp, pgno, pagep)
  *	the page on which the insert should happen, not yet put.
  */
 static int
-__db_dsplit(dbp, hp, indxp, size, newfunc)
-	DB *dbp;
+__db_dsplit(dbc, hp, indxp, size, newfunc)
+	DBC *dbc;
 	PAGE **hp;
 	db_indx_t *indxp;
 	u_int32_t size;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
 	PAGE *h, *np, *tp;
 	BKEYDATA *bk;
 	DBT page_dbt;
+	DB *dbp;
+	size_t pgsize;
 	db_indx_t halfbytes, i, indx, lastsum, nindex, oindex, s, sum;
-	int did_indx, ret;
+	int did_indx, ret, t_ret;
 
 	h = *hp;
 	indx = *indxp;
+	ret = 0;
+	dbp = dbc->dbp;
+	pgsize = dbp->pgsize;
 
 	/* Create a temporary page to do compaction onto. */
-	if ((tp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL)
-		return (ENOMEM);
-#ifdef DIAGNOSTIC
-	memset(tp, 0xff, dbp->pgsize);
-#endif
+	if ((ret = __os_malloc(pgsize, NULL, &tp)) != 0)
+		return (ret);
+
 	/* Create new page for the split. */
-	if ((ret = newfunc(dbp, P_DUPLICATE, &np)) != 0) {
-		FREE(tp, dbp->pgsize);
+	if ((ret = newfunc(dbc, P_DUPLICATE, &np)) != 0) {
+		__os_free(tp, pgsize);
 		return (ret);
 	}
 
-	P_INIT(np, dbp->pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0,
+	P_INIT(np, pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0,
 	    P_DUPLICATE);
-	P_INIT(tp, dbp->pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0,
+	P_INIT(tp, pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0,
 	    P_DUPLICATE);
 
 	/* Figure out the split point */
-	halfbytes = (dbp->pgsize - HOFFSET(h)) / 2;
+	halfbytes = (pgsize - HOFFSET(h)) / 2;
 	did_indx = 0;
 	for (sum = 0, lastsum = 0, i = 0; i < NUM_ENT(h); i++) {
 		if (i == indx) {
@@ -237,7 +254,6 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 				    (db_indx_t)(sum - halfbytes)) {
 					*hp = np;
 					*indxp = 0;
-					i--;
 				} else
 					*indxp = i;
 				break;
@@ -252,29 +268,28 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 
 		if (lastsum < halfbytes && sum >= halfbytes) {
 			/* We've crossed the halfway point. */
-			if ((db_indx_t)(halfbytes - lastsum) <
-			    (db_indx_t)(sum - halfbytes))
-				i--;
+			if ((db_indx_t)(sum - halfbytes) <
+			    (db_indx_t)(halfbytes - lastsum))
+				i++;
 			break;
 		}
 	}
-
 	/*
 	 * Check if we have set the return values of the index pointer and
 	 * page pointer.
 	 */
 	if (!did_indx) {
 		*hp = np;
-		*indxp = indx - i - 1;
+		*indxp = indx - i;
 	}
 
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		page_dbt.size = dbp->pgsize;
 		page_dbt.data = h;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid,
+		    dbc->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid,
 		    PGNO(h), &page_dbt, &LSN(h))) != 0) {
-			FREE(tp, dbp->pgsize);
+			__os_free(tp, pgsize);
 			return (ret);
 		}
 		LSN(tp) = LSN(h);
@@ -283,12 +298,12 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	/*
 	 * If it's a btree, adjust the cursors.
 	 *
-	 * i is the index of the last element to stay on the page.
+	 * i is the index of the first element to move onto the new page.
 	 */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
-		__bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i + 1, 0);
+	if (dbp->type == DB_BTREE)
+		__bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i, 0);
 
-	for (nindex = 0, oindex = i + 1; oindex < NUM_ENT(h); oindex++) {
+	for (nindex = 0, oindex = i; oindex < NUM_ENT(h); oindex++) {
 		bk = GET_BKEYDATA(h, oindex);
 		if (B_TYPE(bk->type) == B_KEYDATA)
 			s = BKEYDATA_SIZE(bk->len);
@@ -304,7 +319,7 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	 * Now do data compaction by copying the remaining stuff onto the
 	 * temporary page and then copying it back to the real page.
 	 */
-	for (nindex = 0, oindex = 0; oindex <= i; oindex++) {
+	for (nindex = 0, oindex = 0; oindex < i; oindex++) {
 		bk = GET_BKEYDATA(h, oindex);
 		if (B_TYPE(bk->type) == B_KEYDATA)
 			s = BKEYDATA_SIZE(bk->len);
@@ -324,59 +339,73 @@ __db_dsplit(dbp, hp, indxp, size, newfunc)
 	 */
 	memcpy(h, tp, LOFFSET(tp));
 	memcpy((u_int8_t *)h + HOFFSET(tp),
-	    (u_int8_t *)tp + HOFFSET(tp), dbp->pgsize - HOFFSET(tp));
-	FREE(tp, dbp->pgsize);
+	    (u_int8_t *)tp + HOFFSET(tp), pgsize - HOFFSET(tp));
+	__os_free(tp, pgsize);
 
-	if (DB_LOGGING(dbp)) {
-		page_dbt.size = dbp->pgsize;
+	if (DB_LOGGING(dbc)) {
+		/*
+		 * XXX
+		 * If either of these fails, are we leaving pages pinned?
+		 * Yes, but it seems like this happens in the error case.
+		 */
+		page_dbt.size = pgsize;
 		page_dbt.data = h;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid,
+		    dbc->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid,
 		    PGNO(h), &page_dbt, &LSN(h))) != 0)
 			return (ret);
 
-		page_dbt.size = dbp->pgsize;
+		page_dbt.size = pgsize;
 		page_dbt.data = np;
 		if ((ret = __db_split_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid,
+		    dbc->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid,
 		    PGNO(np),  &page_dbt, &LSN(np))) != 0)
 			return (ret);
 	}
 
 	/*
+	 * Finally, if there was a next page after the page being
+	 * split, fix its prev pointer.
+	 */
+	if (np->next_pgno != PGNO_INVALID)
+	    ret = __db_relink(dbc, DB_ADD_PAGE, np, NULL, 1);
+
+	/*
 	 * Figure out if the location we're interested in is on the new
 	 * page, and if so, reset the callers' pointer.  Push the other
 	 * page back to the store.
 	 */
 	if (*hp == h)
-		ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
+		t_ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
 	else
-		ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
+		t_ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
 
-	return (ret);
+	return (ret != 0 ? ret : t_ret);
 }
 
 /*
  * __db_ditem --
  *	Remove an item from a page.
  *
- * PUBLIC:  int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t));
+ * PUBLIC:  int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
  */
 int
-__db_ditem(dbp, pagep, indx, nbytes)
-	DB *dbp;
+__db_ditem(dbc, pagep, indx, nbytes)
+	DBC *dbc;
 	PAGE *pagep;
 	u_int32_t indx, nbytes;
 {
+	DB *dbp;
 	DBT ldbt;
 	db_indx_t cnt, offset;
 	int ret;
 	u_int8_t *from;
 
-	if (DB_LOGGING(dbp)) {
+	dbp = dbc->dbp;
+	if (DB_LOGGING(dbc)) {
 		ldbt.data = P_ENTRY(pagep, indx);
 		ldbt.size = nbytes;
-		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn,
+		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep),
 		    (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
 			return (ret);
@@ -413,7 +442,7 @@ __db_ditem(dbp, pagep, indx, nbytes)
 		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
 
 	/* If it's a btree, adjust the cursors. */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
+	if (dbp->type == DB_BTREE)
 		__bam_ca_di(dbp, PGNO(pagep), indx, -1);
 
 	return (0);
@@ -424,16 +453,17 @@ __db_ditem(dbp, pagep, indx, nbytes)
  *	Put an item on a page.
  *
  * PUBLIC: int __db_pitem
- * PUBLIC:     __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ * PUBLIC:     __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
  */
 int
-__db_pitem(dbp, pagep, indx, nbytes, hdr, data)
-	DB *dbp;
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+	DBC *dbc;
 	PAGE *pagep;
 	u_int32_t indx;
 	u_int32_t nbytes;
 	DBT *hdr, *data;
 {
+	DB *dbp;
 	BKEYDATA bk;
 	DBT thdr;
 	int ret;
@@ -456,8 +486,9 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
 	 * the passed in header sizes must be adjusted for the structure's
 	 * placeholder for the trailing variable-length data field.
 	 */
-	if (DB_LOGGING(dbp))
-		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn,
+	dbp = dbc->dbp;
+	if (DB_LOGGING(dbc))
+		if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep),
 		    (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0)
 			return (ret);
@@ -485,7 +516,7 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
 		memcpy(p + hdr->size, data->data, data->size);
 
 	/* If it's a btree, adjust the cursors. */
-	if (dbp->type == DB_BTREE || dbp->type == DB_RECNO)
+	if (dbp->type == DB_BTREE)
 		__bam_ca_di(dbp, PGNO(pagep), indx, 1);
 
 	return (0);
@@ -495,14 +526,16 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data)
  * __db_relink --
  *	Relink around a deleted page.
  *
- * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int));
+ * PUBLIC: int __db_relink __P((DBC *, u_int32_t, PAGE *, PAGE **, int));
  */
 int
-__db_relink(dbp, pagep, new_next, needlock)
-	DB *dbp;
+__db_relink(dbc, add_rem, pagep, new_next, needlock)
+	DBC *dbc;
+	u_int32_t add_rem;
 	PAGE *pagep, **new_next;
 	int needlock;
 {
+	DB *dbp;
 	PAGE *np, *pp;
 	DB_LOCK npl, ppl;
 	DB_LSN *nlsnp, *plsnp;
@@ -512,10 +545,15 @@ __db_relink(dbp, pagep, new_next, needlock)
 	np = pp = NULL;
 	npl = ppl = LOCK_INVALID;
 	nlsnp = plsnp = NULL;
+	dbp = dbc->dbp;
 
-	/* Retrieve and lock the two pages. */
+	/*
+	 * Retrieve and lock the one/two pages.  For a remove, we may need
+	 * two pages (the before and after).  For an add, we only need one
+	 * because the split took care of the prev.
+	 */
 	if (pagep->next_pgno != PGNO_INVALID) {
-		if (needlock && (ret = __bam_lget(dbp,
+		if (needlock && (ret = __bam_lget(dbc,
 		    0, pagep->next_pgno, DB_LOCK_WRITE, &npl)) != 0)
 			goto err;
 		if ((ret = memp_fget(dbp->mpf,
@@ -525,8 +563,8 @@ __db_relink(dbp, pagep, new_next, needlock)
 		}
 		nlsnp = &np->lsn;
 	}
-	if (pagep->prev_pgno != PGNO_INVALID) {
-		if (needlock && (ret = __bam_lget(dbp,
+	if (add_rem == DB_REM_PAGE && pagep->prev_pgno != PGNO_INVALID) {
+		if (needlock && (ret = __bam_lget(dbc,
 		    0, pagep->prev_pgno, DB_LOCK_WRITE, &ppl)) != 0)
 			goto err;
 		if ((ret = memp_fget(dbp->mpf,
@@ -538,9 +576,10 @@ __db_relink(dbp, pagep, new_next, needlock)
 	}
 
 	/* Log the change. */
-	if (DB_LOGGING(dbp)) {
-		if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbp->txn,
-		    &pagep->lsn, 0, dbp->log_fileid, pagep->pgno, &pagep->lsn,
+	if (DB_LOGGING(dbc)) {
+		if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbc->txn,
+		    &pagep->lsn, 0, add_rem, dbp->log_fileid,
+		    pagep->pgno, &pagep->lsn,
 		    pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0)
 			goto err;
 		if (np != NULL)
@@ -558,7 +597,10 @@ __db_relink(dbp, pagep, new_next, needlock)
 	 * set to NULL.
 	 */
 	if (np != NULL) {
-		np->prev_pgno = pagep->prev_pgno;
+		if (add_rem == DB_ADD_PAGE)
+			np->prev_pgno = pagep->pgno;
+		else
+			np->prev_pgno = pagep->prev_pgno;
 		if (new_next == NULL)
 			ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
 		else {
@@ -568,7 +610,7 @@ __db_relink(dbp, pagep, new_next, needlock)
 		if (ret != 0)
 			goto err;
 		if (needlock)
-			(void)__bam_lput(dbp, npl);
+			(void)__bam_lput(dbc, npl);
 	} else if (new_next != NULL)
 		*new_next = NULL;
 
@@ -577,18 +619,18 @@ __db_relink(dbp, pagep, new_next, needlock)
 		if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0)
 			goto err;
 		if (needlock)
-			(void)__bam_lput(dbp, ppl);
+			(void)__bam_lput(dbc, ppl);
 	}
 	return (0);
 
 err:	if (np != NULL)
 		(void)memp_fput(dbp->mpf, np, 0);
 	if (needlock && npl != LOCK_INVALID)
-		(void)__bam_lput(dbp, npl);
+		(void)__bam_lput(dbc, npl);
 	if (pp != NULL)
 		(void)memp_fput(dbp->mpf, pp, 0);
 	if (needlock && ppl != LOCK_INVALID)
-		(void)__bam_lput(dbp, ppl);
+		(void)__bam_lput(dbc, ppl);
 	return (ret);
 }
 
@@ -596,34 +638,37 @@ err:	if (np != NULL)
  * __db_ddup --
  *	Delete an offpage chain of duplicates.
  *
- * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_ddup __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_ddup(dbp, pgno, freefunc)
-	DB *dbp;
+__db_ddup(dbc, pgno, freefunc)
+	DBC *dbc;
 	db_pgno_t pgno;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
+	DB *dbp;
 	PAGE *pagep;
 	DBT tmp_dbt;
 	int ret;
 
+	dbp = dbc->dbp;
 	do {
 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
 			return (ret);
 		}
 
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = pagep;
 			tmp_dbt.size = dbp->pgsize;
-			if ((ret = __db_split_log(dbp->dbenv->lg_info, dbp->txn,
-			    &LSN(pagep), 0, DB_SPLITOLD, dbp->log_fileid,
-			    PGNO(pagep), &tmp_dbt, &LSN(pagep))) != 0)
+			if ((ret = __db_split_log(dbp->dbenv->lg_info,
+			    dbc->txn, &LSN(pagep), 0, DB_SPLITOLD,
+			    dbp->log_fileid, PGNO(pagep), &tmp_dbt,
+			    &LSN(pagep))) != 0)
 				return (ret);
 		}
 		pgno = pagep->next_pgno;
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} while (pgno != PGNO_INVALID);
 
@@ -636,21 +681,23 @@ __db_ddup(dbp, pgno, freefunc)
  *	current page.
  */
 static int
-__db_addpage(dbp, hp, indxp, newfunc)
-	DB *dbp;
+__db_addpage(dbc, hp, indxp, newfunc)
+	DBC *dbc;
 	PAGE **hp;
 	db_indx_t *indxp;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
+	DB *dbp;
 	PAGE *newpage;
 	int ret;
 
-	if ((ret = newfunc(dbp, P_DUPLICATE, &newpage)) != 0)
+	dbp = dbc->dbp;
+	if ((ret = newfunc(dbc, P_DUPLICATE, &newpage)) != 0)
 		return (ret);
 
-	if (DB_LOGGING(dbp)) {
+	if (DB_LOGGING(dbc)) {
 		if ((ret = __db_addpage_log(dbp->dbenv->lg_info,
-		    dbp->txn, &LSN(*hp), 0, dbp->log_fileid,
+		    dbc->txn, &LSN(*hp), 0, dbp->log_fileid,
 		    PGNO(*hp), &LSN(*hp), PGNO(newpage), &LSN(newpage))) != 0) {
 			return (ret);
 		}
@@ -666,3 +713,235 @@ __db_addpage(dbp, hp, indxp, newfunc)
 	*indxp = 0;
 	return (0);
 }
+
+/*
+ * __db_dsearch --
+ *	Search a set of duplicates for the proper position for a new duplicate.
+ *
+ *	+ pgno is the page number of the page on which to begin searching.
+ * 	  Since we can continue duplicate searches, it might not be the first
+ * 	  page.
+ *
+ * 	+ If we are continuing a search, then *pp may be non-NULL in which
+ * 	  case we do not have to retrieve the page.
+ *
+ *	+ If we are continuing a search, then *indxp contains the first
+ * 	  index on pgno at which we should begin the search.
+ *
+ * 	NOTE: if there is no comparison function, then continuing is
+ * 	meaningless, and *pp should always be NULL and *indxp will be
+ *	ignored.
+ *
+ *	3 return values:
+ *
+ *	+ pp is the returned page pointer of where this element should go.
+ *	+ indxp is the returned index on that page
+ *	+ cmpp is the returned final comparison result.
+ *
+ * PUBLIC: int __db_dsearch __P((DBC *,
+ * PUBLIC:     int, DBT *, db_pgno_t, db_indx_t *, PAGE **, int *));
+ */
+int
+__db_dsearch(dbc, is_insert, dbt, pgno, indxp, pp, cmpp)
+	DBC *dbc;
+	int is_insert, *cmpp;
+	DBT *dbt;
+	db_pgno_t pgno;
+	db_indx_t *indxp;
+	PAGE **pp;
+{
+	DB *dbp;
+	PAGE *h;
+	db_indx_t base, indx, lim, save_indx;
+	db_pgno_t save_pgno;
+	int ret;
+
+	dbp = dbc->dbp;
+
+	if (dbp->dup_compare == NULL) {
+		/*
+		 * We may have been given a valid page, but we may not be
+		 * able to use it.  The problem is that the application is
+		 * doing a join and we're trying to continue the search,
+		 * but since the items aren't sorted, we can't.  Discard
+		 * the page if it's not the one we're going to start with
+		 * anyway.
+		 */
+		if (*pp != NULL && (*pp)->pgno != pgno) {
+			if ((ret = memp_fput(dbp->mpf, *pp, 0)) != 0)
+				return (ret);
+			*pp = NULL;
+		}
+
+		/*
+		 * If no duplicate function is specified, just go to the end
+		 * of the duplicate set.
+		 */
+		if (is_insert) {
+			if ((ret = __db_dend(dbc, pgno, pp)) != 0)
+				return (ret);
+			*indxp = NUM_ENT(*pp);
+			return (0);
+		}
+
+		/*
+		 * We are looking for a specific duplicate, so do a linear
+		 * search.
+		 */
+		if (*pp != NULL)
+			goto nocmp_started;
+		for (;;) {
+			if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+				goto pg_err;
+nocmp_started:		h = *pp;
+
+			for (*indxp = 0; *indxp < NUM_ENT(h); ++*indxp) {
+				if ((*cmpp = __bam_cmp(dbp,
+				    dbt, h, *indxp, __bam_defcmp)) != 0)
+					continue;
+				/*
+				 * The duplicate may have already been deleted,
+				 * if it's a btree page, in which case we skip
+				 * it.
+				 */
+				if (dbp->type == DB_BTREE &&
+				    B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+					continue;
+
+				return (0);
+			}
+
+			if ((pgno = h->next_pgno) == PGNO_INVALID)
+				break;
+
+			if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+				return (ret);
+		}
+		*cmpp = 1;			/* We didn't succeed... */
+		return (0);
+	}
+
+	/*
+	 * We have a comparison routine, i.e., the duplicates are sorted.
+	 * Walk through the chain of duplicates, checking the last entry
+	 * on each page to decide if it's the page we want to search.
+	 *
+	 * *pp may be non-NULL -- if we were given a valid page (e.g., are
+	 * in mid-search), then use the provided page.
+	 */
+	if (*pp != NULL)
+		goto cmp_started;
+	for (;;) {
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+cmp_started:	h = *pp;
+
+		if ((pgno = h->next_pgno) == PGNO_INVALID || __bam_cmp(dbp,
+		    dbt, h, h->entries - 1, dbp->dup_compare) <= 0)
+			break;
+		/*
+		 * Even when continuing a search, make sure we don't skip
+		 * entries on a new page
+		 */
+		*indxp = 0;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+	}
+
+	/* Next, do a binary search on the page. */
+	base = F_ISSET(dbc, DBC_CONTINUE) ? *indxp : 0;
+	for (lim = NUM_ENT(h) - base; lim != 0; lim >>= 1) {
+		indx = base + (lim >> 1);
+		if ((*cmpp = __bam_cmp(dbp,
+		    dbt, h, indx, dbp->dup_compare)) == 0) {
+			*indxp = indx;
+
+			if (dbp->type != DB_BTREE ||
+			    !B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+			goto check_delete;
+		}
+		if (*cmpp > 0) {
+			base = indx + 1;
+			lim--;
+		}
+	}
+
+	/*
+	 * Base references the smallest index larger than the supplied DBT's
+	 * data item, potentially both 0 and NUM_ENT.
+	 */
+	*indxp = base;
+	return (0);
+
+check_delete:
+	/*
+	 * The duplicate may have already been deleted, if it's a btree page,
+	 * in which case we wander around, hoping to find an entry that hasn't
+	 * been deleted.  First, wander in a forwardly direction.
+	 */
+	save_pgno = (*pp)->pgno;
+	save_indx = *indxp;
+	for (++*indxp;;) {
+		for (; *indxp < NUM_ENT(h); ++*indxp) {
+			if ((*cmpp = __bam_cmp(dbp,
+			    dbt, h, *indxp, dbp->dup_compare)) != 0)
+				goto check_delete_rev;
+
+			if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+		}
+		if ((pgno = h->next_pgno) == PGNO_INVALID)
+			break;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+
+		*indxp = 0;
+	}
+
+check_delete_rev:
+	/* Go back to where we started, and wander in a backwardly direction. */
+	if (h->pgno != save_pgno) {
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+		if ((ret = memp_fget(dbp->mpf, &save_pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+	}
+
+	for (;;) {
+		while (*indxp > 0) {
+			--*indxp;
+			if ((*cmpp = __bam_cmp(dbp,
+			    dbt, h, *indxp, dbp->dup_compare)) != 0)
+				goto check_delete_fail;
+
+			if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type))
+				return (0);
+		}
+		if ((pgno = h->prev_pgno) == PGNO_INVALID)
+			break;
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			return (ret);
+
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0)
+			goto pg_err;
+		h = *pp;
+
+		*indxp = NUM_ENT(h);
+	}
+
+check_delete_fail:
+	*cmpp = 1;			/* We didn't succeed... */
+	return (0);
+
+pg_err:	__db_pgerr(dbp, pgno);
+	return (ret);
+}
diff --git a/db2/db/db_iface.c b/db2/db/db_iface.c
new file mode 100644
index 0000000000..4ebf3ba019
--- /dev/null
+++ b/db2/db/db_iface.c
@@ -0,0 +1,488 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_iface.c	10.40 (Sleepycat) 12/19/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_auto.h"
+#include "db_ext.h"
+#include "common_ext.h"
+
+static int __db_keyempty __P((const DB_ENV *));
+static int __db_rdonly __P((const DB_ENV *, const char *));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+
+/*
+ * __db_cdelchk --
+ *	Common cursor delete argument checking routine.
+ *
+ * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int));
+ */
+int
+__db_cdelchk(dbp, flags, isrdonly, isvalid)
+	const DB *dbp;
+	u_int32_t flags;
+	int isrdonly, isvalid;
+{
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "c_del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DBcursor->c_del", 0));
+	}
+
+	/*
+	 * The cursor must be initialized: return EINVAL for an invalid
+	 * cursor, otherwise 0.
+	 */
+	return (isvalid ? 0 : EINVAL);
+}
+
+/*
+ * __db_cgetchk --
+ *	Common cursor get argument checking routine.
+ *
+ * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
+ */
+int
+__db_cgetchk(dbp, key, data, flags, isvalid)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+	int isvalid;
+{
+	int key_einval, key_flags, ret;
+
+	key_einval = key_flags = 0;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_RMW);
+	switch (flags) {
+	case DB_NEXT_DUP:
+		if (dbp->type == DB_RECNO)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_CURRENT:
+	case DB_FIRST:
+	case DB_LAST:
+	case DB_NEXT:
+	case DB_PREV:
+		key_flags = 1;
+		break;
+	case DB_GET_BOTH:
+	case DB_SET_RANGE:
+		key_einval = key_flags = 1;
+		break;
+	case DB_SET:
+		key_einval = 1;
+		break;
+	case DB_GET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		key_einval = key_flags = 1;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
+	/*
+	 * The cursor must be initialized for DB_CURRENT: return EINVAL for an
+	 * invalid cursor, otherwise 0.
+	 */
+	return (isvalid || flags != DB_CURRENT ? 0 : EINVAL);
+}
+
+/*
+ * __db_cputchk --
+ *	Common cursor put argument checking routine.
+ *
+ * PUBLIC: int __db_cputchk __P((const DB *,
+ * PUBLIC:    const DBT *, DBT *, u_int32_t, int, int));
+ */
+int
+__db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int32_t flags;
+	int isrdonly, isvalid;
+{
+	int key_einval, key_flags, ret;
+
+	key_einval = key_flags = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "c_put"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_AFTER:
+	case DB_BEFORE:
+		if (dbp->dup_compare != NULL)
+			goto err;
+		if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER))
+			goto err;
+		if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP))
+			goto err;
+		break;
+	case DB_CURRENT:
+		/*
+		 * If there is a comparison function, doing a DB_CURRENT
+		 * must not change the part of the data item that is used
+		 * for the comparison.
+		 */
+		break;
+	case DB_KEYFIRST:
+	case DB_KEYLAST:
+		if (dbp->type == DB_RECNO)
+			goto err;
+		key_einval = key_flags = 1;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DBcursor->c_put", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
+	/*
+	 * The cursor must be initialized for anything other than DB_KEYFIRST
+	 * and DB_KEYLAST: return EINVAL for an invalid cursor, otherwise 0.
+	 */
+	return (isvalid ||
+	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? 0 : EINVAL);
+}
+
+/*
+ * __db_closechk --
+ *	DB->close flag check.
+ *
+ * PUBLIC: int __db_closechk __P((const DB *, u_int32_t));
+ */
+int
+__db_closechk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	if (flags != 0 && flags != DB_NOSYNC)
+		return (__db_ferr(dbp->dbenv, "DB->close", 0));
+
+	return (0);
+}
+
+/*
+ * __db_delchk --
+ *	Common delete argument checking routine.
+ *
+ * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
+ */
+int
+__db_delchk(dbp, key, flags, isrdonly)
+	const DB *dbp;
+	DBT *key;
+	u_int32_t flags;
+	int isrdonly;
+{
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "delete"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DB->del", 0));
+	}
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	return (0);
+}
+
+/*
+ * __db_getchk --
+ *	Common get argument checking routine.
+ *
+ * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
+ */
+int
+__db_getchk(dbp, key, data, flags)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	int ret;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_RMW);
+	switch (flags) {
+	case 0:
+	case DB_GET_BOTH:
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_BT_RECNUM))
+			goto err;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, flags == DB_SET_RECNO)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	return (0);
+}
+
+/*
+ * __db_joinchk --
+ *	Common join argument checking routine.
+ *
+ * PUBLIC: int __db_joinchk __P((const DB *, u_int32_t));
+ */
+int
+__db_joinchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	if (flags != 0)
+		return (__db_ferr(dbp->dbenv, "DB->join", 0));
+
+	return (0);
+}
+
+/*
+ * __db_putchk --
+ *	Common put argument checking routine.
+ *
+ * PUBLIC: int __db_putchk
+ * PUBLIC:    __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
+ */
+int
+__db_putchk(dbp, key, data, flags, isrdonly, isdup)
+	const DB *dbp;
+	DBT *key;
+	const DBT *data;
+	u_int32_t flags;
+	int isrdonly, isdup;
+{
+	int ret;
+
+	/* Check for changes to a read-only tree. */
+	if (isrdonly)
+		return (__db_rdonly(dbp->dbenv, "put"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+	case DB_NOOVERWRITE:
+		break;
+	case DB_APPEND:
+		if (dbp->type != DB_RECNO)
+			goto err;
+		break;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->put", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
+	/* Check for partial puts in the presence of duplicates. */
+	if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) {
+		__db_err(dbp->dbenv,
+"a partial put in the presence of duplicates requires a cursor operation");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_statchk --
+ *	Common stat argument checking routine.
+ *
+ * PUBLIC: int __db_statchk __P((const DB *, u_int32_t));
+ */
+int
+__db_statchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	case DB_RECORDCOUNT:
+		if (dbp->type == DB_RECNO)
+			break;
+		if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_BT_RECNUM))
+			break;
+		goto err;
+	default:
+err:		return (__db_ferr(dbp->dbenv, "DB->stat", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_syncchk --
+ *	Common sync argument checking routine.
+ *
+ * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t));
+ */
+int
+__db_syncchk(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	default:
+		return (__db_ferr(dbp->dbenv, "DB->sync", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __dbt_ferr --
+ *	Check a DBT for flag errors.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+	const DB *dbp;
+	const char *name;
+	const DBT *dbt;
+	int check_thread;
+{
+	int ret;
+
+	/*
+	 * Check for invalid DBT flags.  We allow any of the flags to be
+	 * specified to any DB or DBcursor call so that applications can
+	 * set DB_DBT_MALLOC when retrieving a data item from a secondary
+	 * database and then specify that same DBT as a key to a primary
+	 * database, without having to clear flags.
+	 */
+	if ((ret = __db_fchk(dbp->dbenv, name, dbt->flags,
+	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0)
+		return (ret);
+	if ((ret = __db_fcchk(dbp->dbenv, name,
+	    dbt->flags, DB_DBT_MALLOC, DB_DBT_USERMEM)) != 0)
+		return (ret);
+
+	if (check_thread && F_ISSET(dbp, DB_AM_THREAD) &&
+	    !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_USERMEM)) {
+		__db_err(dbp->dbenv,
+		    "missing flag thread flag for %s DBT", name);
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_eopnotsup --
+ *	Common operation not supported message.
+ *
+ * PUBLIC: int __db_eopnotsup __P((const DB_ENV *));
+ */
+int
+__db_eopnotsup(dbenv)
+	const DB_ENV *dbenv;
+{
+	__db_err(dbenv, "operation not supported");
+#ifdef EOPNOTSUPP
+	return (EOPNOTSUPP);
+#else
+	return (EINVAL);
+#endif
+}
+
+/*
+ * __db_keyempty --
+ *	Common missing or empty key value message.
+ */
+static int
+__db_keyempty(dbenv)
+	const DB_ENV *dbenv;
+{
+	__db_err(dbenv, "missing or empty key value specified");
+	return (EINVAL);
+}
+
+/*
+ * __db_rdonly --
+ *	Common readonly message.
+ */
+static int
+__db_rdonly(dbenv, name)
+	const DB_ENV *dbenv;
+	const char *name;
+{
+	__db_err(dbenv, "%s: attempt to modify a read-only tree", name);
+	return (EACCES);
+}
diff --git a/db2/db/db_join.c b/db2/db/db_join.c
new file mode 100644
index 0000000000..a4051c20b0
--- /dev/null
+++ b/db2/db/db_join.c
@@ -0,0 +1,271 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_join.c	10.10 (Sleepycat) 10/9/98";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_join.h"
+#include "db_am.h"
+#include "common_ext.h"
+
+static int __db_join_close __P((DBC *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * This is the duplicate-assisted join functionality.  Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * XXX
+ * Right now we do not maintain the number of duplicates so we do
+ * not optimize the join.  If the caller does, then best performance
+ * will be achieved by putting the cursor with the smallest cardinality
+ * first.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ *	This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join.  While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ * The curslist is an array of existing, initialized cursors and primary
+ * is the DB of the primary file.  The data item that joins all the
+ * cursors in the curslist is used as the key into the primary and that
+ * key and data are returned.  When no more items are left in the join
+ * set, the c_get operation on the join cursor will return DB_NOTFOUND.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, u_int32_t, DBC **));
+ */
+int
+__db_join(primary, curslist, flags, dbcp)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	JOIN_CURSOR *jc;
+	int i, ret;
+
+	DB_PANIC_CHECK(primary);
+
+	if ((ret = __db_joinchk(primary, flags)) != 0)
+		return (ret);
+
+	if (curslist == NULL || curslist[0] == NULL)
+		return (EINVAL);
+
+	dbc = NULL;
+	jc = NULL;
+
+	if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0)
+		goto err;
+
+	if ((ret = __os_calloc(1, sizeof(JOIN_CURSOR), &jc)) != 0)
+		goto err;
+
+	if ((ret = __os_malloc(256, NULL, &jc->j_key.data)) != 0)
+		goto err;
+	jc->j_key.ulen = 256;
+	F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+	for (jc->j_curslist = curslist;
+	    *jc->j_curslist != NULL; jc->j_curslist++)
+		;
+	if ((ret = __os_calloc((jc->j_curslist - curslist + 1),
+	    sizeof(DBC *), &jc->j_curslist)) != 0)
+		goto err;
+	for (i = 0; curslist[i] != NULL; i++) {
+		if (i != 0)
+			F_SET(curslist[i], DBC_KEYSET);
+		jc->j_curslist[i] = curslist[i];
+	}
+
+	dbc->c_close = __db_join_close;
+	dbc->c_del = __db_join_del;
+	dbc->c_get = __db_join_get;
+	dbc->c_put = __db_join_put;
+	dbc->internal = jc;
+	dbc->dbp = primary;
+	jc->j_init = 1;
+	jc->j_primary = primary;
+
+	*dbcp = dbc;
+
+	return (0);
+
+err:	if (jc != NULL) {
+		if (jc->j_curslist != NULL)
+			__os_free(jc->j_curslist,
+			    (jc->j_curslist - curslist + 1) * sizeof(DBC *));
+		__os_free(jc, sizeof(JOIN_CURSOR));
+	}
+	if (dbc != NULL)
+		__os_free(dbc, sizeof(DBC));
+	return (ret);
+}
+
+static int
+__db_join_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	DB_PANIC_CHECK(dbc->dbp);
+
+	COMPQUIET(key, NULL);
+	COMPQUIET(data, NULL);
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+}
+
+static int
+__db_join_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB_PANIC_CHECK(dbc->dbp);
+
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+}
+
+static int
+__db_join_get(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC **cpp;
+	JOIN_CURSOR *jc;
+	int ret;
+	u_int32_t operation;
+
+	dbp = dbc->dbp;
+
+	DB_PANIC_CHECK(dbp);
+
+	operation = LF_ISSET(DB_OPFLAGS_MASK);
+	if (operation != 0 && operation != DB_JOIN_ITEM)
+		return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
+
+	LF_CLR(DB_OPFLAGS_MASK);
+	if ((ret =
+	    __db_fchk(dbp->dbenv, "DBcursor->c_get", flags, DB_RMW)) != 0)
+		return (ret);
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+retry:
+	ret = jc->j_curslist[0]->c_get(jc->j_curslist[0],
+	    &jc->j_key, key, jc->j_init ? DB_CURRENT : DB_NEXT_DUP);
+
+	if (ret == ENOMEM) {
+		jc->j_key.ulen <<= 1;
+		if ((ret = __os_realloc(&jc->j_key.data, jc->j_key.ulen)) != 0)
+			return (ret);
+		goto retry;
+	}
+	if (ret != 0)
+		return (ret);
+
+	jc->j_init = 0;
+	do {
+		/*
+		 * We have the first element; now look for it in the
+		 * other cursors.
+		 */
+		for (cpp = jc->j_curslist + 1; *cpp != NULL; cpp++) {
+retry2:			if ((ret = ((*cpp)->c_get)(*cpp,
+			    &jc->j_key, key, DB_GET_BOTH)) == DB_NOTFOUND)
+				break;
+			if (ret == ENOMEM) {
+				jc->j_key.ulen <<= 1;
+				if ((ret = __os_realloc(&jc->j_key.data,
+				    jc->j_key.ulen)) != 0)
+					return (ret);
+				goto retry2;
+			}
+			if (F_ISSET(*cpp, DBC_KEYSET)) {
+				F_CLR(*cpp, DBC_KEYSET);
+				F_SET(*cpp, DBC_CONTINUE);
+			}
+		}
+
+		/*
+		 * If we got out of here with ret != 0, then we failed to
+		 * find the duplicate in one of the files, so we go on to
+		 * the next item in the outermost relation. If ret was
+		 * equal to 0, then we've got something to return.
+		 */
+		if (ret == 0)
+			break;
+	} while ((ret = jc->j_curslist[0]->c_get(jc->j_curslist[0],
+	    &jc->j_key, key,  DB_NEXT_DUP)) == 0);
+
+	/*
+	 * If ret != 0 here, we've exhausted the first file.  Otherwise,
+	 * key and data are set and we need to do the lookup on the
+	 * primary.
+	 */
+	if (ret != 0)
+		return (ret);
+
+	if (operation == DB_JOIN_ITEM)
+		return (0);
+	else
+		return ((jc->j_primary->get)(jc->j_primary,
+		    jc->j_curslist[0]->txn, key, data, 0));
+}
+
+static int
+__db_join_close(dbc)
+	DBC *dbc;
+{
+	JOIN_CURSOR *jc;
+	int i;
+
+	DB_PANIC_CHECK(dbc->dbp);
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	/*
+	 * Clear the optimization flag in the cursors.
+	 */
+	for (i = 0; jc->j_curslist[i] != NULL; i++)
+		F_CLR(jc->j_curslist[i], DBC_CONTINUE | DBC_KEYSET);
+
+	__os_free(jc->j_curslist, 0);
+	__os_free(jc->j_key.data, jc->j_key.ulen);
+	__os_free(jc, sizeof(JOIN_CURSOR));
+	__os_free(dbc, sizeof(DBC));
+
+	return (0);
+}
diff --git a/db2/db/db_overflow.c b/db2/db/db_overflow.c
index d28740dcbe..0efcc9de7f 100644
--- a/db2/db/db_overflow.c
+++ b/db2/db/db_overflow.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_overflow.c	10.11 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db_overflow.c	10.21 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -60,6 +60,7 @@ static const char sccsid[] = "@(#)db_overflow.c	10.11 (Sleepycat) 5/7/98";
 #include "db_int.h"
 #include "db_page.h"
 #include "db_am.h"
+#include "common_ext.h"
 
 /*
  * Big key/data code.
@@ -106,29 +107,20 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
 		needed = tlen;
 	}
 
-	/*
-	 * Allocate any necessary memory.
-	 *
-	 * XXX: Never allocate 0 bytes;
-	 */
+	/* Allocate any necessary memory. */
 	if (F_ISSET(dbt, DB_DBT_USERMEM)) {
 		if (needed > dbt->ulen) {
 			dbt->size = needed;
 			return (ENOMEM);
 		}
 	} else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
-		dbt->data = dbp->db_malloc == NULL ?
-		    (void *)__db_malloc(needed + 1) :
-		    (void *)dbp->db_malloc(needed + 1);
-		if (dbt->data == NULL)
-			return (ENOMEM);
+		if ((ret =
+		    __os_malloc(needed, dbp->db_malloc, &dbt->data)) != 0)
+			return (ret);
 	} else if (*bpsz == 0 || *bpsz < needed) {
-		*bpp = (*bpp == NULL ?
-		    (void *)__db_malloc(needed + 1) :
-		    (void *)__db_realloc(*bpp, needed + 1));
-		if (*bpp == NULL)
-			return (ENOMEM);
-		*bpsz = needed + 1;
+		if ((ret = __os_realloc(bpp, needed)) != 0)
+			return (ret);
+		*bpsz = needed;
 		dbt->data = *bpp;
 	} else
 		dbt->data = *bpp;
@@ -168,16 +160,17 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
  * __db_poff --
  *	Put an offpage item.
  *
- * PUBLIC: int __db_poff __P((DB *, const DBT *, db_pgno_t *,
- * PUBLIC:     int (*)(DB *, u_int32_t, PAGE **)));
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *,
+ * PUBLIC:     int (*)(DBC *, u_int32_t, PAGE **)));
  */
 int
-__db_poff(dbp, dbt, pgnop, newfunc)
-	DB *dbp;
+__db_poff(dbc, dbt, pgnop, newfunc)
+	DBC *dbc;
 	const DBT *dbt;
 	db_pgno_t *pgnop;
-	int (*newfunc) __P((DB *, u_int32_t, PAGE **));
+	int (*newfunc) __P((DBC *, u_int32_t, PAGE **));
 {
+	DB *dbp;
 	PAGE *pagep, *lastp;
 	DB_LSN new_lsn, null_lsn;
 	DBT tmp_dbt;
@@ -191,6 +184,7 @@ __db_poff(dbp, dbt, pgnop, newfunc)
 	 * number of bytes we get for pages we fill completely with a single
 	 * item.
 	 */
+	dbp = dbc->dbp;
 	pagespace = P_MAXSPACE(dbp->pgsize);
 
 	lastp = NULL;
@@ -208,13 +202,13 @@ __db_poff(dbp, dbt, pgnop, newfunc)
 		 * the item onto the page.  If sz is less than pagespace, we
 		 * have a partial record.
 		 */
-		if ((ret = newfunc(dbp, P_OVERFLOW, &pagep)) != 0)
+		if ((ret = newfunc(dbc, P_OVERFLOW, &pagep)) != 0)
 			return (ret);
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = p;
 			tmp_dbt.size = pagespace;
 			ZERO_LSN(null_lsn);
-			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn,
+			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn,
 			    &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid,
 			    PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID,
 			    PGNO_INVALID, &tmp_dbt, &LSN(pagep),
@@ -256,24 +250,26 @@ __db_poff(dbp, dbt, pgnop, newfunc)
  * __db_ovref --
  *	Increment/decrement the reference count on an overflow page.
  *
- * PUBLIC: int __db_ovref __P((DB *, db_pgno_t, int32_t));
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t, int32_t));
  */
 int
-__db_ovref(dbp, pgno, adjust)
-	DB *dbp;
+__db_ovref(dbc, pgno, adjust)
+	DBC *dbc;
 	db_pgno_t pgno;
 	int32_t adjust;
 {
+	DB *dbp;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
 		(void)__db_pgerr(dbp, pgno);
 		return (ret);
 	}
 
-	if (DB_LOGGING(dbp))
-		if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbp->txn,
+	if (DB_LOGGING(dbc))
+		if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbc->txn,
 		    &LSN(h), 0, dbp->log_fileid, h->pgno, adjust,
 		    &LSN(h))) != 0)
 			return (ret);
@@ -287,19 +283,21 @@ __db_ovref(dbp, pgno, adjust)
  * __db_doff --
  *	Delete an offpage chain of overflow pages.
  *
- * PUBLIC: int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *)));
  */
 int
-__db_doff(dbp, pgno, freefunc)
-	DB *dbp;
+__db_doff(dbc, pgno, freefunc)
+	DBC *dbc;
 	db_pgno_t pgno;
-	int (*freefunc) __P((DB *, PAGE *));
+	int (*freefunc) __P((DBC *, PAGE *));
 {
+	DB *dbp;
 	PAGE *pagep;
 	DB_LSN null_lsn;
 	DBT tmp_dbt;
 	int ret;
 
+	dbp = dbc->dbp;
 	do {
 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) {
 			(void)__db_pgerr(dbp, pgno);
@@ -312,21 +310,21 @@ __db_doff(dbp, pgno, freefunc)
 		 */
 		if (TYPE(pagep) == P_OVERFLOW && OV_REF(pagep) > 1) {
 			(void)memp_fput(dbp->mpf, pagep, 0);
-			return (__db_ovref(dbp, pgno, -1));
+			return (__db_ovref(dbc, pgno, -1));
 		}
 
-		if (DB_LOGGING(dbp)) {
+		if (DB_LOGGING(dbc)) {
 			tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD;
 			tmp_dbt.size = OV_LEN(pagep);
 			ZERO_LSN(null_lsn);
-			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn,
+			if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn,
 			    &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid,
 			    PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep),
 			    &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0)
 				return (ret);
 		}
 		pgno = pagep->next_pgno;
-		if ((ret = freefunc(dbp, pagep)) != 0)
+		if ((ret = freefunc(dbc, pagep)) != 0)
 			return (ret);
 	} while (pgno != PGNO_INVALID);
 
@@ -339,44 +337,71 @@ __db_doff(dbp, pgno, freefunc)
  *
  * Given a starting page number and a key, return <0, 0, >0 to indicate if the
  * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing a chunk-at-a-time comparison unless the user has
+ * specified a comparison function.  In that case, we need to materialize
+ * the entire object and call their comparison routine.
  *
- * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t));
+ * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC:     int (*)(const DBT *, const DBT *), int *));
  */
 int
-__db_moff(dbp, dbt, pgno)
+__db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp)
 	DB *dbp;
 	const DBT *dbt;
 	db_pgno_t pgno;
+	u_int32_t tlen;
+	int (*cmpfunc) __P((const DBT *, const DBT *)), *cmpp;
 {
 	PAGE *pagep;
-	u_int32_t cmp_bytes, key_left;
+	DBT local_dbt;
+	void *buf;
+	u_int32_t bufsize, cmp_bytes, key_left;
 	u_int8_t *p1, *p2;
 	int ret;
 
+	/*
+	 * If there is a user-specified comparison function, build a
+	 * contiguous copy of the key, and call it.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_dbt, 0, sizeof(local_dbt));
+		buf = NULL;
+		bufsize = 0;
+
+		if ((ret = __db_goff(dbp,
+		    &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+			return (ret);
+		*cmpp = cmpfunc(&local_dbt, dbt);
+		__os_free(buf, bufsize);
+		return (0);
+	}
+
 	/* While there are both keys to compare. */
-	for (ret = 0, p1 = dbt->data,
+	for (*cmpp = 0, p1 = dbt->data,
 	    key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
-		if (memp_fget(dbp->mpf, &pgno, 0, &pagep) != 0) {
-			(void)__db_pgerr(dbp, pgno);
-			return (0);	/* No system error return. */
-		}
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0)
+			return (ret);
 
 		cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
 		key_left -= cmp_bytes;
 		for (p2 =
 		    (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2)
 			if (*p1 != *p2) {
-				ret = (long)*p1 - (long)*p2;
+				*cmpp = (long)*p1 - (long)*p2;
 				break;
 			}
 		pgno = NEXT_PGNO(pagep);
-		(void)memp_fput(dbp->mpf, pagep, 0);
-		if (ret != 0)
+		if ((ret = memp_fput(dbp->mpf, pagep, 0)) != 0)
 			return (ret);
+		if (*cmpp != 0)
+			return (0);
 	}
 	if (key_left > 0)		/* DBT is longer than page key. */
-		return (-1);
-	if (pgno != PGNO_INVALID)	/* DBT is shorter than page key. */
-		return (1);
+		*cmpp = -1;
+	else if (pgno != PGNO_INVALID)	/* DBT is shorter than page key. */
+		*cmpp = 1;
+	else
+		*cmpp = 0;
+
 	return (0);
 }
diff --git a/db2/db/db_pr.c b/db2/db/db_pr.c
index a294cdd135..7f4364c6e1 100644
--- a/db2/db/db_pr.c
+++ b/db2/db/db_pr.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_pr.c	10.29 (Sleepycat) 5/23/98";
+static const char sccsid[] = "@(#)db_pr.c	10.40 (Sleepycat) 11/22/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -126,11 +126,10 @@ __db_prdb(dbp)
 		{ DB_AM_MLOCAL,		"local mpool" },
 		{ DB_AM_PGDEF,		"default page size" },
 		{ DB_AM_RDONLY,		"read-only" },
-		{ DB_AM_RECOVER,	"recover" },
 		{ DB_AM_SWAP,		"needswap" },
 		{ DB_AM_THREAD,		"thread" },
-		{ DB_BT_RECNUM,		"btree:records" },
-		{ DB_HS_DIRTYMETA,	"hash:dirty-meta" },
+		{ DB_BT_RECNUM,		"btree:recnum" },
+		{ DB_DBM_ERROR,		"dbm/ndbm error" },
 		{ DB_RE_DELIMITER,	"recno:delimiter" },
 		{ DB_RE_FIXEDLEN,	"recno:fixed-length" },
 		{ DB_RE_PAD,		"recno:pad" },
@@ -178,42 +177,55 @@ __db_prbtree(dbp)
 	static const FN mfn[] = {
 		{ BTM_DUP,	"duplicates" },
 		{ BTM_RECNO,	"recno" },
-		{ BTM_RECNUM,	"btree:records" },
+		{ BTM_RECNUM,	"btree:recnum" },
 		{ BTM_FIXEDLEN,	"recno:fixed-length" },
 		{ BTM_RENUMBER,	"recno:renumber" },
 		{ 0 },
 	};
+	DBC *dbc;
 	BTMETA *mp;
 	BTREE *t;
-	EPG *epg;
 	FILE *fp;
 	PAGE *h;
 	RECNO *rp;
 	db_pgno_t i;
-	int ret;
+	int cnt, ret;
+	const char *sep;
 
 	t = dbp->internal;
 	fp = __db_prinit(NULL);
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
 
 	(void)fprintf(fp, "%s\nOn-page metadata:\n", DB_LINE);
 
 	i = PGNO_METADATA;
-	if ((ret = __bam_pget(dbp, (PAGE **)&mp, &i, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &i, 0, (PAGE **)&mp)) != 0) {
+		(void)dbc->c_close(dbc);
 		return (ret);
+	}
 
+	fprintf(fp, "lsn.file: %lu lsn.offset: %lu\n",
+	    (u_long)LSN(mp).file, (u_long)LSN(mp).offset);
 	(void)fprintf(fp, "magic %#lx\n", (u_long)mp->magic);
 	(void)fprintf(fp, "version %#lx\n", (u_long)mp->version);
 	(void)fprintf(fp, "pagesize %lu\n", (u_long)mp->pagesize);
 	(void)fprintf(fp, "maxkey: %lu minkey: %lu\n",
 	    (u_long)mp->maxkey, (u_long)mp->minkey);
 
-	(void)fprintf(fp, "free %lu", (u_long)mp->free);
-	for (i = mp->free; i != PGNO_INVALID;) {
-		if ((ret = __bam_pget(dbp, &h, &i, 0)) != 0)
+	(void)fprintf(fp, "free list: %lu", (u_long)mp->free);
+	for (i = mp->free, cnt = 0, sep = ", "; i != PGNO_INVALID;) {
+		if ((ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0)
 			return (ret);
 		i = h->next_pgno;
 		(void)memp_fput(dbp->mpf, h, 0);
-		(void)fprintf(fp, ", %lu", (u_long)i);
+		(void)fprintf(fp, "%s%lu", sep, (u_long)i);
+		if (++cnt % 10 == 0) {
+			(void)fprintf(fp, "\n");
+			cnt = 0;
+			sep = "";
+		} else
+			sep = ", ";
 	}
 	(void)fprintf(fp, "\n");
 
@@ -227,7 +239,7 @@ __db_prbtree(dbp)
 	    (u_long)t->bt_maxkey, (u_long)t->bt_minkey);
 	(void)fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n",
 	    (u_long)t->bt_compare, (u_long)t->bt_prefix);
-	if ((rp = t->bt_recno) != NULL) {
+	if ((rp = t->recno) != NULL) {
 		(void)fprintf(fp,
 		    "re_delim: %#lx re_pad: %#lx re_len: %lu re_source: %s\n",
 		    (u_long)rp->re_delim, (u_long)rp->re_pad,
@@ -238,13 +250,9 @@ __db_prbtree(dbp)
 		    (u_long)rp->re_cmap, (u_long)rp->re_smap,
 		    (u_long)rp->re_emap, (u_long)rp->re_msize);
 	}
-	(void)fprintf(fp, "stack:");
-	for (epg = t->bt_stack; epg < t->bt_sp; ++epg)
-		(void)fprintf(fp, " %lu", (u_long)epg->page->pgno);
-	(void)fprintf(fp, "\n");
 	(void)fprintf(fp, "ovflsize: %lu\n", (u_long)t->bt_ovflsize);
 	(void)fflush(fp);
-	return (0);
+	return (dbc->c_close(dbc));
 }
 
 /*
@@ -258,51 +266,50 @@ __db_prhash(dbp)
 	DB *dbp;
 {
 	FILE *fp;
-	HTAB *t;
+	DBC *dbc;
+	HASH_CURSOR *hcp;
 	int i, put_page, ret;
 	db_pgno_t pgno;
 
-	t = dbp->internal;
-
 	fp = __db_prinit(NULL);
+	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+		return (ret);
+	hcp = (HASH_CURSOR *)dbc->internal;
 
-	fprintf(fp, "\thash_accesses    %lu\n", (u_long)t->hash_accesses);
-	fprintf(fp, "\thash_collisions  %lu\n", (u_long)t->hash_collisions);
-	fprintf(fp, "\thash_expansions  %lu\n", (u_long)t->hash_expansions);
-	fprintf(fp, "\thash_overflows 	%lu\n", (u_long)t->hash_overflows);
-	fprintf(fp, "\thash_bigpages    %lu\n", (u_long)t->hash_bigpages);
-	fprintf(fp, "\n");
-
-	if (t->hdr == NULL) {
+	/*
+	 * In this case, hcp->hdr will never be NULL; if we decide to
+	 * pass DBCs to this routine instead, then it could be.
+	 */
+	if (hcp->hdr == NULL) {
 		pgno = PGNO_METADATA;
-		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &t->hdr)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &hcp->hdr)) != 0)
 			return (ret);
 		put_page = 1;
 	} else
 		put_page = 0;
 
-	fprintf(fp, "\tmagic      %#lx\n", (u_long)t->hdr->magic);
-	fprintf(fp, "\tversion    %lu\n", (u_long)t->hdr->version);
-	fprintf(fp, "\tpagesize   %lu\n", (u_long)t->hdr->pagesize);
-	fprintf(fp, "\tovfl_point %lu\n", (u_long)t->hdr->ovfl_point);
-	fprintf(fp, "\tlast_freed %lu\n", (u_long)t->hdr->last_freed);
-	fprintf(fp, "\tmax_bucket %lu\n", (u_long)t->hdr->max_bucket);
-	fprintf(fp, "\thigh_mask  %#lx\n", (u_long)t->hdr->high_mask);
-	fprintf(fp, "\tlow_mask   %#lx\n", (u_long)t->hdr->low_mask);
-	fprintf(fp, "\tffactor    %lu\n", (u_long)t->hdr->ffactor);
-	fprintf(fp, "\tnelem      %lu\n", (u_long)t->hdr->nelem);
-	fprintf(fp, "\th_charkey  %#lx\n", (u_long)t->hdr->h_charkey);
+	fprintf(fp, "\tmagic      %#lx\n", (u_long)hcp->hdr->magic);
+	fprintf(fp, "\tversion    %lu\n", (u_long)hcp->hdr->version);
+	fprintf(fp, "\tpagesize   %lu\n", (u_long)hcp->hdr->pagesize);
+	fprintf(fp, "\tovfl_point %lu\n", (u_long)hcp->hdr->ovfl_point);
+	fprintf(fp, "\tlast_freed %lu\n", (u_long)hcp->hdr->last_freed);
+	fprintf(fp, "\tmax_bucket %lu\n", (u_long)hcp->hdr->max_bucket);
+	fprintf(fp, "\thigh_mask  %#lx\n", (u_long)hcp->hdr->high_mask);
+	fprintf(fp, "\tlow_mask   %#lx\n", (u_long)hcp->hdr->low_mask);
+	fprintf(fp, "\tffactor    %lu\n", (u_long)hcp->hdr->ffactor);
+	fprintf(fp, "\tnelem      %lu\n", (u_long)hcp->hdr->nelem);
+	fprintf(fp, "\th_charkey  %#lx\n", (u_long)hcp->hdr->h_charkey);
 
 	for (i = 0; i < NCACHED; i++)
-		fprintf(fp, "%lu ", (u_long)t->hdr->spares[i]);
+		fprintf(fp, "%lu ", (u_long)hcp->hdr->spares[i]);
 	fprintf(fp, "\n");
 
 	(void)fflush(fp);
 	if (put_page) {
-		(void)memp_fput(dbp->mpf, (PAGE *)t->hdr, 0);
-		t->hdr = NULL;
+		(void)memp_fput(dbp->mpf, (PAGE *)hcp->hdr, 0);
+		hcp->hdr = NULL;
 	}
-	return (0);
+	return (dbc->c_close(dbc));
 }
 
 /*
@@ -318,22 +325,18 @@ __db_prtree(mpf, all)
 {
 	PAGE *h;
 	db_pgno_t i;
-	int ret, t_ret;
 
 	if (set_psize == PSIZE_BOUNDARY)
 		__db_psize(mpf);
 
-	ret = 0;
 	for (i = PGNO_ROOT;; ++i) {
-		if ((ret = memp_fget(mpf, &i, 0, &h)) != 0)
+		if (memp_fget(mpf, &i, 0, &h) != 0)
 			break;
-		if (TYPE(h) != P_INVALID)
-			if ((t_ret = __db_prpage(h, all)) != 0 && ret == 0)
-				ret = t_ret;
+		(void)__db_prpage(h, all);
 		(void)memp_fput(mpf, h, 0);
 	}
 	(void)fflush(__db_prinit(NULL));
-	return (ret);
+	return (0);
 }
 
 /*
@@ -425,8 +428,7 @@ __db_prpage(h, all)
 	    (TYPE(h) == P_LRECNO && h->pgno == PGNO_ROOT))
 		fprintf(fp, " total records: %4lu", (u_long)RE_NREC(h));
 	fprintf(fp, "\n");
-	if (TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO ||
-	    TYPE(h) == P_DUPLICATE || TYPE(h) == P_OVERFLOW)
+	if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO)
 		fprintf(fp, "    prev: %4lu next: %4lu",
 		    (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
 	if (TYPE(h) == P_IBTREE || TYPE(h) == P_LBTREE)
diff --git a/db2/db/db_rec.c b/db2/db/db_rec.c
index 1ef6f18e61..7f577b5855 100644
--- a/db2/db/db_rec.c
+++ b/db2/db/db_rec.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_rec.c	10.16 (Sleepycat) 4/28/98";
+static const char sccsid[] = "@(#)db_rec.c	10.19 (Sleepycat) 9/27/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -40,7 +40,8 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_addrem_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -57,9 +58,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -73,7 +72,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	    (cmp_n == 0 && !redo && argp->opcode == DB_REM_DUP)) {
 
 		/* Need to redo an add, or undo a delete. */
-		if ((ret = __db_pitem(file_dbp, pagep, argp->indx, argp->nbytes,
+		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
 		    argp->hdr.size == 0 ? NULL : &argp->hdr,
 		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
 			goto out;
@@ -83,7 +82,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 	} else if ((cmp_n == 0 && !redo && argp->opcode == DB_ADD_DUP) ||
 	    (cmp_p == 0 && redo && argp->opcode == DB_REM_DUP)) {
 		/* Need to undo an add, or redo a delete. */
-		if ((ret = __db_ditem(file_dbp,
+		if ((ret = __db_ditem(dbc,
 		    pagep, argp->indx, argp->nbytes)) != 0)
 			goto out;
 		change = DB_MPOOL_DIRTY;
@@ -96,8 +95,11 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info)
 			LSN(pagep) = argp->pagelsn;
 	}
 
-	if ((ret = memp_fput(mpf, pagep, change)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -114,7 +116,8 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_split_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int change, cmp_n, cmp_p, ret;
@@ -130,9 +133,7 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			*lsnp = argp->prev_lsn;
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -169,8 +170,11 @@ __db_split_recover(logp, dbtp, lsnp, redo, info)
 		LSN(pagep) = argp->pagelsn;
 		change = DB_MPOOL_DIRTY;
 	}
-	if ((ret = memp_fput(mpf, pagep, change)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -187,7 +191,8 @@ __db_big_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_big_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -209,7 +214,7 @@ __db_big_recover(logp, dbtp, lsnp, redo, info)
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
-			goto out;
+				goto out;
 	}
 
 	/*
@@ -299,9 +304,7 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 				 * so we would not have to undo anything.  In
 				 * this case, don't bother creating a page.
 				 */
-				*lsnp = argp->prev_lsn;
-				ret = 0;
-				goto out;
+				goto done;
 			} else
 				if ((ret = memp_fget(mpf, &argp->next_pgno,
 				    DB_MPOOL_CREATE, &pagep)) != 0)
@@ -323,7 +326,8 @@ npage:	if (argp->next_pgno != PGNO_INVALID) {
 			goto out;
 	}
 
-	*lsnp = argp->prev_lsn;
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -343,7 +347,8 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_ovref_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	int modified, ret;
@@ -370,8 +375,11 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info)
 		pagep->lsn = argp->lsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0)
-		*lsnp = argp->prev_lsn;
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
 out:	REC_CLOSE;
 }
@@ -392,17 +400,20 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_relink_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
-	int modified, ret;
+	int cmp_n, cmp_p, modified, ret;
 
 	REC_PRINT(__db_relink_print);
 	REC_INTRO(__db_relink_read);
 
 	/*
-	 * There are three pages we need to check -- the page, and the
-	 * previous and next pages, if they existed.
+	 * There are up to three pages we need to check -- the page, and the
+	 * previous and next pages, if they existed.  For a page add operation,
+	 * the current page is the result of a split and is being recovered
+	 * elsewhere, so all we need do is recover the next page.
 	 */
 	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
 		if (redo) {
@@ -411,6 +422,9 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 		}
 		goto next;
 	}
+	if (argp->opcode == DB_ADD_PAGE)
+		goto next;
+
 	modified = 0;
 	if (log_compare(&LSN(pagep), &argp->lsn) == 0 && redo) {
 		/* Redo the relink. */
@@ -424,10 +438,8 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info)
 		pagep->lsn = argp->lsn;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 next:	if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
 		if (redo) {
@@ -437,23 +449,27 @@ next:	if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
 		goto prev;
 	}
 	modified = 0;
-	if (log_compare(&LSN(pagep), &argp->lsn_next) == 0 && redo) {
-		/* Redo the relink. */
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn_next);
+	if ((argp->opcode == DB_REM_PAGE && cmp_p == 0 && redo) ||
+	    (argp->opcode == DB_ADD_PAGE && cmp_n == 0 && !redo)) {
+		/* Redo the remove or undo the add. */
 		pagep->prev_pgno = argp->prev;
 
 		pagep->lsn = *lsnp;
 		modified = 1;
-	} else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) {
-		/* Undo the relink. */
+	} else if ((argp->opcode == DB_REM_PAGE && cmp_n == 0 && !redo) ||
+	    (argp->opcode == DB_ADD_PAGE && cmp_p == 0 && redo)) {
+		/* Undo the remove or redo the add. */
 		pagep->prev_pgno = argp->pgno;
 
 		pagep->lsn = argp->lsn_next;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void)__db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
+	if (argp->opcode == DB_ADD_PAGE)
+		goto done;
 
 prev:	if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
 		if (redo) {
@@ -476,10 +492,8 @@ prev:	if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
 		pagep->lsn = argp->lsn_prev;
 		modified = 1;
 	}
-	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
-		(void) __db_panic(file_dbp);
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
 		goto out;
-	}
 
 done:	*lsnp = argp->prev_lsn;
 	ret = 0;
@@ -500,7 +514,8 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 	void *info;
 {
 	__db_addpage_args *argp;
-	DB *file_dbp, *mdbp;
+	DB *file_dbp;
+	DBC *dbc;
 	DB_MPOOLFILE *mpf;
 	PAGE *pagep;
 	u_int32_t change;
@@ -541,8 +556,7 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 			 * would not have to undo anything.  In this case,
 			 * don't bother creating a page.
 			 */
-			ret = 0;
-			goto out;
+			goto done;
 		} else
 			if ((ret = memp_fget(mpf,
 			    &argp->nextpgno, DB_MPOOL_CREATE, &pagep)) != 0)
@@ -563,11 +577,13 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info)
 		LSN(pagep) = argp->nextlsn;
 		change = DB_MPOOL_DIRTY;
 	}
-	ret = memp_fput(mpf, pagep, change);
+	if ((ret = memp_fput(mpf, pagep, change)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
 
-out:	if (ret == 0)
-		*lsnp = argp->prev_lsn;
-	REC_CLOSE;
+out:	REC_CLOSE;
 }
 
 /*
@@ -598,46 +614,3 @@ __db_debug_recover(logp, dbtp, lsnp, redo, info)
 
 	REC_NOOP_CLOSE;
 }
-
-/*
- * __db_noop_recover --
- *	Recovery function for noop.
- *
- * PUBLIC: int __db_noop_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
- */
-int
-__db_noop_recover(logp, dbtp, lsnp, redo, info)
-	DB_LOG *logp;
-	DBT *dbtp;
-	DB_LSN *lsnp;
-	int redo;
-	void *info;
-{
-	__db_noop_args *argp;
-	DB *file_dbp, *mdbp;
-	DB_MPOOLFILE *mpf;
-	PAGE *pagep;
-	u_int32_t change;
-	int cmp_n, cmp_p, ret;
-
-	REC_PRINT(__db_noop_print);
-	REC_INTRO(__db_noop_read);
-
-	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0)
-		goto out;
-
-	cmp_n = log_compare(lsnp, &LSN(pagep));
-	cmp_p = log_compare(&LSN(pagep), &argp->prevlsn);
-	change = 0;
-	if (cmp_p == 0 && redo) {
-		LSN(pagep) = *lsnp;
-		change = DB_MPOOL_DIRTY;
-	} else if (cmp_n == 0 && !redo) {
-		LSN(pagep) = argp->prevlsn;
-		change = DB_MPOOL_DIRTY;
-	}
-	*lsnp = argp->prev_lsn;
-	ret = memp_fput(mpf, pagep, change);
-
-out:	REC_CLOSE;
-}
diff --git a/db2/db/db_ret.c b/db2/db/db_ret.c
index 9d9b599ad6..9f0d0ecf8d 100644
--- a/db2/db/db_ret.c
+++ b/db2/db/db_ret.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_ret.c	10.13 (Sleepycat) 5/7/98";
+static const char sccsid[] = "@(#)db_ret.c	10.16 (Sleepycat) 10/4/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -93,6 +93,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	u_int32_t *memsize;
 	void *(*db_malloc) __P((size_t));
 {
+	int ret;
+
 	/* If returning a partial record, reset the length. */
 	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
 		data = (u_int8_t *)data + dbt->doff;
@@ -120,9 +122,6 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	 * guarantees consistency, i.e., the application can always free memory
 	 * without concern as to how many bytes of the record were requested.
 	 *
-	 * XXX
-	 * Never allocate 0 bytes, it's known to make malloc/realloc unhappy.
-	 *
 	 * Use the memory specified by the application: DB_DBT_USERMEM.
 	 *
 	 * !!!
@@ -130,11 +129,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 	 * memory pointer is allowed to be NULL.
 	 */
 	if (F_ISSET(dbt, DB_DBT_MALLOC)) {
-		dbt->data = db_malloc == NULL ?
-		    (void *)__db_malloc(len) :
-		    (void *)db_malloc(len + 1);
-		if (dbt->data == NULL)
-			return (ENOMEM);
+		if ((ret = __os_malloc(len, db_malloc, &dbt->data)) != 0)
+			return (ret);
 	} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
 		if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
 			return (ENOMEM);
@@ -142,12 +138,9 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc)
 		return (EINVAL);
 	} else {
 		if (len != 0 && (*memsize == 0 || *memsize < len)) {
-			*memp = *memp == NULL ?
-			    (void *)__db_malloc(len) :
-			    (void *)__db_realloc(*memp, len);
-			if (*memp == NULL) {
+			if ((ret = __os_realloc(memp, len)) != 0) {
 				*memsize = 0;
-				return (ENOMEM);
+				return (ret);
 			}
 			*memsize = len;
 		}
diff --git a/db2/db/db_thread.c b/db2/db/db_thread.c
deleted file mode 100644
index 73e2a51286..0000000000
--- a/db2/db/db_thread.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998
- *	Sleepycat Software.  All rights reserved.
- */
-
-#include "config.h"
-
-#ifndef lint
-static const char sccsid[] = "@(#)db_thread.c	8.15 (Sleepycat) 4/26/98";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <errno.h>
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-
-static int __db_getlockid __P((DB *, DB *));
-
-/*
- * __db_gethandle --
- *	Called by db access method routines when the DB_THREAD flag is set.
- *	This routine returns a handle, either an existing handle from the
- *	chain of handles, or creating one if necessary.
- *
- * PUBLIC: int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **));
- */
-int
-__db_gethandle(dbp, am_func, dbpp)
-	DB *dbp, **dbpp;
-	int (*am_func) __P((DB *, DB *));
-{
-	DB *ret_dbp;
-	int ret, t_ret;
-
-	if ((ret = __db_mutex_lock((db_mutex_t *)dbp->mutexp, -1)) != 0)
-		return (ret);
-
-	if ((ret_dbp = LIST_FIRST(&dbp->handleq)) != NULL)
-		/* Simply take one off the list. */
-		LIST_REMOVE(ret_dbp, links);
-	else {
-		/* Allocate a new handle. */
-		if ((ret_dbp = (DB *)__db_malloc(sizeof(*dbp))) == NULL) {
-			ret = ENOMEM;
-			goto err;
-		}
-		memcpy(ret_dbp, dbp, sizeof(*dbp));
-		ret_dbp->internal = NULL;
-		TAILQ_INIT(&ret_dbp->curs_queue);
-
-		/* Set the locker, the lock structure and the lock DBT. */
-		if ((ret = __db_getlockid(dbp, ret_dbp)) != 0)
-			goto err;
-
-		/* Finally, call the access method specific dup function. */
-		if ((ret = am_func(dbp, ret_dbp)) != 0)
-			goto err;
-	}
-
-	*dbpp = ret_dbp;
-
-	if (0) {
-err:		if (ret_dbp != NULL)
-			FREE(ret_dbp, sizeof(*ret_dbp));
-	}
-	if ((t_ret =
-	    __db_mutex_unlock((db_mutex_t *)dbp->mutexp, -1)) != 0 && ret == 0)
-		ret = t_ret;
-	return (ret);
-}
-
-/*
- * __db_puthandle --
- *	Return a DB handle to the pool for later use.
- *
- * PUBLIC: int __db_puthandle __P((DB *));
- */
-int
-__db_puthandle(dbp)
-	DB *dbp;
-{
-	DB *master;
-	int ret;
-
-	master = dbp->master;
-	if ((ret = __db_mutex_lock((db_mutex_t *)master->mutexp, -1)) != 0)
-		return (ret);
-
-	LIST_INSERT_HEAD(&master->handleq, dbp, links);
-
-	return (__db_mutex_unlock((db_mutex_t *)master->mutexp, -1));
-}
-
-/*
- * __db_getlockid --
- *	Create a new locker ID and copy the file lock information from
- *	the old DB into the new one.
- */
-static int
-__db_getlockid(dbp, new_dbp)
-	DB *dbp, *new_dbp;
-{
-	int ret;
-
-	if (F_ISSET(dbp, DB_AM_LOCKING)) {
-		if ((ret = lock_id(dbp->dbenv->lk_info, &new_dbp->locker)) != 0)
-			return (ret);
-		memcpy(new_dbp->lock.fileid, dbp->lock.fileid, DB_FILE_ID_LEN);
-		new_dbp->lock_dbt.size = sizeof(new_dbp->lock);
-		new_dbp->lock_dbt.data = &new_dbp->lock;
-	}
-	return (0);
-}