about summary refs log tree commit diff
path: root/db2/mp
diff options
context:
space:
mode:
Diffstat (limited to 'db2/mp')
-rw-r--r--db2/mp/mp_bh.c437
-rw-r--r--db2/mp/mp_fget.c359
-rw-r--r--db2/mp/mp_fopen.c437
-rw-r--r--db2/mp/mp_fput.c140
-rw-r--r--db2/mp/mp_fset.c72
-rw-r--r--db2/mp/mp_open.c176
-rw-r--r--db2/mp/mp_pr.c313
-rw-r--r--db2/mp/mp_region.c340
-rw-r--r--db2/mp/mp_sync.c205
9 files changed, 2479 insertions, 0 deletions
diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c
new file mode 100644
index 0000000000..e1b68ce450
--- /dev/null
+++ b/db2/mp/mp_bh.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_bh.c	10.12 (Sleepycat) 8/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * __memp_bhwrite --
+ *	Write the page associated with a given bucket header.
+ *
+ * PUBLIC: int __memp_bhwrite
+ * PUBLIC:     __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	BH *bhp;
+	int *restartp, *wrotep;
+{
+	DBT dbt;
+	DB_MPOOLFILE *dbmfp;
+	DB_MPREG *mpreg;
+
+	/* Both out-parameters are optional; default them to "no". */
+	if (restartp != NULL)
+		*restartp = 0;
+	if (wrotep != NULL)
+		*wrotep = 0;
+
+	/*
+	 * Walk the process' DB_MPOOLFILE list and try and find a file
+	 * descriptor for this file.
+	 */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+		if (dbmfp->mfp == mfp)
+			break;
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+	if (dbmfp != NULL)
+		goto found;
+
+	/*
+	 * It's not a page from a file we've opened.  If the file requires
+	 * input/output processing, see if this process has ever registered
+	 * information as to how to write this type of file.  If not, there's
+	 * nothing we can do.
+	 */
+	if (mfp->ftype != 0) {
+		LOCKHANDLE(dbmp, &dbmp->mutex);
+		for (mpreg = LIST_FIRST(&dbmp->dbregq);
+		    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
+			if (mpreg->ftype == mfp->ftype)
+				break;
+		UNLOCKHANDLE(dbmp, &dbmp->mutex);
+		/* Returning 0 with *wrotep still 0 means "not written". */
+		if (mpreg == NULL)
+			return (0);
+	}
+
+	/*
+	 * Try and open the file; ignore any error, assume it's a permissions
+	 * problem.
+	 *
+	 * NOTE(review): on open failure this also returns 0 with *wrotep
+	 * unset -- callers must distinguish "not written" from success by
+	 * checking *wrotep, not the return value.
+	 */
+	dbt.size = mfp->pgcookie_len;
+	dbt.data = ADDR(dbmp, mfp->pgcookie_off);
+	if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off),
+	    mfp->ftype, 0, 0, mfp->stat.st_pagesize,
+	    mfp->lsn_off, &dbt, ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0)
+		return (0);
+
+found:	return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep));
+}
+
+/*
+ * __memp_pgread --
+ *	Read a page from a file.
+ *
+ * Called with the region locked; the region lock is dropped for the
+ * duration of the I/O and reacquired before returning.
+ *
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pgread(dbmfp, bhp, can_create)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int can_create;
+{
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	size_t pagesize;
+	ssize_t nr;
+	int ret;
+
+	dbmp = dbmfp->dbmp;
+	mfp = dbmfp->mfp;
+	pagesize = mfp->stat.st_pagesize;
+
+	/*
+	 * Mark the buffer as I/O-in-progress and its contents as invalid,
+	 * then trade the region lock for the buffer lock.
+	 */
+	F_SET(bhp, BH_LOCKED | BH_TRASH);
+	LOCKBUFFER(dbmp, bhp);
+	UNLOCKREGION(dbmp);
+
+	/*
+	 * Temporary files may not yet have been created.
+	 *
+	 * Seek to the page location.
+	 */
+	ret = 0;
+	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (dbmfp->fd == -1 || (ret =
+	    __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) {
+		if (!can_create) {
+			if (dbmfp->fd == -1)
+				ret = EINVAL;
+			UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+			__db_err(dbmp->dbenv,
+			    "%s: page %lu doesn't exist, create flag not set",
+			    dbmfp->path, (u_long)bhp->pgno);
+			goto err;
+		}
+		UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+		/* Clear any uninitialized data. */
+		memset(bhp->buf, 0, pagesize);
+		goto pgin;
+	}
+
+	/*
+	 * Read the page; short reads are treated like creates, although
+	 * any valid data is preserved.
+	 */
+	ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr);
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (ret != 0)
+		goto err;
+
+	/* can_create is reused below to pick which statistic to bump. */
+	if (nr == (ssize_t)pagesize)
+		can_create = 0;
+	else {
+		if (!can_create) {
+			ret = EINVAL;
+			goto err;
+		}
+
+		/* Clear any uninitialized data. */
+		memset(bhp->buf + nr, 0, pagesize - nr);
+	}
+
+	/* Call any pgin function. */
+pgin:	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
+
+	/* Reacquire the region lock. */
+	LOCKREGION(dbmp);
+
+	/* If the pgin function succeeded, the data is now valid. */
+	if (ret == 0)
+		F_CLR(bhp, BH_TRASH);
+
+	/* Update the statistics. */
+	if (can_create) {
+		++dbmp->mp->stat.st_page_create;
+		++mfp->stat.st_page_create;
+	} else {
+		++dbmp->mp->stat.st_page_in;
+		++mfp->stat.st_page_in;
+	}
+
+	/* Error path: reacquire the region lock before releasing the buffer. */
+	if (0) {
+err:		LOCKREGION(dbmp);
+	}
+
+	/* Release the buffer. */
+	F_CLR(bhp, BH_LOCKED);
+	UNLOCKBUFFER(dbmp, bhp);
+
+	return (ret);
+}
+
+/*
+ * __memp_pgwrite --
+ *	Write a page to a file.
+ *
+ * Called with the region locked; the region lock is dropped during the
+ * I/O and reacquired before returning.  *restartp is set if the region
+ * lock was dropped, *wrotep if the page actually made it to disk.
+ *
+ * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_pgwrite(dbmfp, bhp, restartp, wrotep)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int *restartp, *wrotep;
+{
+	DB_ENV *dbenv;
+	DB_LOG *lg_info;
+	DB_LSN lsn;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t pagesize;
+	ssize_t nw;
+	int callpgin, ret;
+	const char *fail;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+	mfp = dbmfp->mfp;
+
+	if (restartp != NULL)
+		*restartp = 0;
+	if (wrotep != NULL)
+		*wrotep = 0;
+	callpgin = 0;
+	pagesize = mfp->stat.st_pagesize;
+
+	/* Lock out other I/O on the buffer, then drop the region lock. */
+	F_SET(bhp, BH_LOCKED);
+	LOCKBUFFER(dbmp, bhp);
+	UNLOCKREGION(dbmp);
+
+	/* The region lock was dropped; callers' list walks must restart. */
+	if (restartp != NULL)
+		*restartp = 1;
+
+	/* Copy the LSN off the page if we're going to need it. */
+	lg_info = dbenv->lg_info;
+	if (lg_info != NULL || F_ISSET(bhp, BH_WRITE))
+		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
+
+	/* Ensure the appropriate log records are on disk. */
+	if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0)
+		goto err;
+
+	/*
+	 * Call any pgout function.  We set the callpgin flag so that on
+	 * error we flag that the contents of the buffer may be trash.
+	 */
+	if (mfp->ftype == 0)
+		ret = 0;
+	else {
+		callpgin = 1;
+		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
+			goto err;
+	}
+
+	/* Temporary files may not yet have been created. */
+	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, DB_APP_TMP,
+	    NULL, NULL, &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {
+		UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+		__db_err(dbenv, "unable to create temporary backing file");
+		goto err;
+	}
+
+	/* Write the page out. */
+	if ((ret =
+	    __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0)
+		fail = "seek";
+	else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
+		fail = "write";
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (ret != 0) {
+		/*
+		 * XXX
+		 * Shut the compiler up; it doesn't understand the correlation
+		 * between the failing clauses to __db_lseek and __db_write and
+		 * this ret != 0.
+		 */
+		fail = NULL;
+		goto syserr;
+	}
+
+	/* A short write is an I/O error; the page is not on disk. */
+	if (nw != (ssize_t)pagesize) {
+		ret = EIO;
+		fail = "write";
+		goto syserr;
+	}
+
+	if (wrotep != NULL)
+		*wrotep = 1;
+
+	/* Reacquire the region lock. */
+	LOCKREGION(dbmp);
+
+	/* Clean up the flags based on a successful write. */
+	F_SET(bhp, BH_CALLPGIN);
+	F_CLR(bhp, BH_DIRTY | BH_LOCKED);
+	UNLOCKBUFFER(dbmp, bhp);
+
+	/*
+	 * If we wrote a buffer which a checkpoint is waiting for, update
+	 * the count of pending buffers (both in the mpool as a whole and
+	 * for this file).  If the count for this file goes to zero, flush
+	 * the writes.
+	 *
+	 * XXX:
+	 * We ignore errors from the sync -- it makes no sense to return an
+	 * error to the calling process, so set a flag causing the sync to
+	 * be retried later.
+	 *
+	 * If the buffer we wrote has a LSN larger than the current largest
+	 * we've written for this checkpoint, update the saved value.
+	 */
+	mp = dbmp->mp;
+	if (F_ISSET(bhp, BH_WRITE)) {
+		if (log_compare(&lsn, &mp->lsn) > 0)
+			mp->lsn = lsn;
+		F_CLR(bhp, BH_WRITE);
+
+		--mp->lsn_cnt;
+		if (--mfp->lsn_cnt == 0) {
+			/*
+			 * Don't lock -- there are no atomicity issues for
+			 * fsync(2).
+			 */
+			if (__db_fsync(dbmfp->fd) != 0)
+				F_SET(mp, MP_LSN_RETRY);
+		}
+	}
+
+	/* Update I/O statistics. */
+	++mp->stat.st_page_out;
+	++mfp->stat.st_page_out;
+
+	return (0);
+
+syserr:	__db_err(dbenv,
+	    "%s: %s failed for page %lu", dbmfp->path, fail, (u_long)bhp->pgno);
+
+err:	UNLOCKBUFFER(dbmp, bhp);
+	LOCKREGION(dbmp);
+	if (callpgin)
+		F_SET(bhp, BH_CALLPGIN);
+	F_CLR(bhp, BH_LOCKED);
+	return (ret);
+}
+
+/*
+ * __memp_pg --
+ *	Call the pgin/pgout routine.
+ *
+ * Searches the process' registered-filetype list for this file's ftype
+ * and invokes its pgin (is_pgin != 0) or pgout conversion function.
+ *
+ * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pg(dbmfp, bhp, is_pgin)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int is_pgin;
+{
+	DBT dbt, *dbtp;
+	DB_MPOOL *dbmp;
+	DB_MPREG *mpreg;
+	MPOOLFILE *mfp;
+	int ftype, ret;
+
+	dbmp = dbmfp->dbmp;
+	mfp = dbmfp->mfp;
+
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+
+	ftype = mfp->ftype;
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
+		if (ftype != mpreg->ftype)
+			continue;
+		/* Pass the page cookie to the conversion routine, if any. */
+		if (mfp->pgcookie_len == 0)
+			dbtp = NULL;
+		else {
+			dbt.size = mfp->pgcookie_len;
+			dbt.data = ADDR(dbmp, mfp->pgcookie_off);
+			dbtp = &dbt;
+		}
+		/* Drop the mutex across the user callback. */
+		UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+		if (is_pgin) {
+			if (mpreg->pgin != NULL && (ret =
+			    mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		} else
+			if (mpreg->pgout != NULL && (ret =
+			    mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		break;
+	}
+
+	/* If no registration matched, the mutex is still held -- release it. */
+	if (mpreg == NULL)
+		UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	return (0);
+
+	/*
+	 * NOTE(review): err is only reachable after the loop already released
+	 * dbmp->mutex above, so this UNLOCKHANDLE releases it a second time.
+	 * Verify the mutex implementation tolerates a double unlock -- this
+	 * looks like a latent bug.
+	 */
+err:	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+	    dbmfp->path, is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
+	return (ret);
+}
+
+/*
+ * __memp_bhfree --
+ *	Free a bucket header and its referenced data.
+ *
+ * Unlinks the header from both the hash bucket and the LRU chain; the
+ * shared memory itself is released only when free_mem is set (callers
+ * that recycle the header immediately pass 0).
+ *
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
+ */
+void
+__memp_bhfree(dbmp, mfp, bhp, free_mem)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	BH *bhp;
+	int free_mem;
+{
+	size_t off;
+
+	/* Delete the buffer header from the MPOOL hash list. */
+	/* (off is a hash bucket index here, not a byte offset.) */
+	off = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno);
+	SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, mq, __bh);
+
+	/* Delete the buffer header from the LRU chain. */
+	SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
+
+	/*
+	 * If we're not reusing it immediately, free the buffer header
+	 * and data for real.
+	 */
+	if (free_mem)
+		__db_shalloc_free(dbmp->addr, bhp);
+}
diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c
new file mode 100644
index 0000000000..418802a3b9
--- /dev/null
+++ b/db2/mp/mp_fget.c
@@ -0,0 +1,359 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fget.c	10.22 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+int __sleep_on_every_page_get;		/* XXX: thread debugging option. */
+
+/*
+ * memp_fget --
+ *	Get a page from the file.
+ *
+ * On success *addrp points at the page (mmap'd or cached) and the
+ * file's pinned-page count is incremented; memp_fput releases it.
+ */
+int
+memp_fget(dbmfp, pgnoaddr, flags, addrp)
+	DB_MPOOLFILE *dbmfp;
+	db_pgno_t *pgnoaddr;
+	u_long flags;
+	void *addrp;
+{
+	BH *bhp, *tbhp;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	db_pgno_t lastpgno;
+	size_t bucket, mf_offset;
+	off_t size;
+	u_long cnt;
+	int b_incr, b_inserted, readonly_alloc, ret;
+	void *addr;
+
+	dbmp = dbmfp->dbmp;
+
+	/*
+	 * Validate arguments.
+	 *
+	 * !!!
+	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
+	 * files here, and create non-existent pages in readonly files if the
+	 * flags are set, later.  The reason is that the hash access method
+	 * wants to get empty pages that don't really exist in readonly files.
+	 * The only alternative is for hash to write the last "bucket" all the
+	 * time, which we don't want to do because one of our big goals in life
+	 * is to keep database files small.  It's sleazy as hell, but we catch
+	 * any attempt to actually write the file in memp_fput().
+	 */
+#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
+	if (flags != 0) {
+		if ((ret =
+		    __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0)
+			return (ret);
+
+		/* The flags are mutually exclusive, not a mask. */
+		switch (flags) {
+		case DB_MPOOL_CREATE:
+		case DB_MPOOL_LAST:
+		case DB_MPOOL_NEW:
+		case 0:
+			break;
+		default:
+			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
+		}
+	}
+
+#ifdef DEBUG
+	/*
+	 * XXX
+	 * We want to switch threads as often as possible.  Sleep every time
+	 * we get a new page to make it more likely.
+	 */
+	if (__sleep_on_every_page_get && (dbmp->dbenv == NULL ||
+	    dbmp->dbenv->db_yield == NULL || dbmp->dbenv->db_yield() != 0))
+		__db_sleep(0, 1);
+#endif
+
+	mp = dbmp->mp;
+	mfp = dbmfp->mfp;
+	mf_offset = OFFSET(dbmp, mfp);
+	addr = NULL;
+	bhp = NULL;
+	b_incr = b_inserted = readonly_alloc = ret = 0;
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * If mmap'ing the file, just return a pointer.  However, if another
+	 * process has opened the file for writing since we mmap'd it, start
+	 * playing the game by their rules, i.e. everything goes through the
+	 * cache.  All pages previously returned should be safe, as long as
+	 * a locking protocol was observed.
+	 *
+	 * XXX
+	 * We don't discard the map because we don't know when all of the
+	 * pages will have been discarded from the process' address space.
+	 * It would be possible to do so by reference counting the open
+	 * pages from the mmap, but it's unclear to me that it's worth it.
+	 */
+	if (dbmfp->addr != NULL && dbmfp->mfp->can_mmap) {
+		lastpgno = dbmfp->len == 0 ?
+		    0 : (dbmfp->len - 1) / mfp->stat.st_pagesize;
+		if (LF_ISSET(DB_MPOOL_LAST))
+			*pgnoaddr = lastpgno;
+		else {
+			/*
+			 * !!!
+			 * Allocate a page that can never really exist.  See
+			 * the comment above about non-existent pages and the
+			 * hash access method.
+			 */
+			if (LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))
+				readonly_alloc = 1;
+			else if (*pgnoaddr > lastpgno) {
+				__db_err(dbmp->dbenv,
+				    "%s: page %lu doesn't exist",
+				    dbmfp->path, (u_long)*pgnoaddr);
+				ret = EINVAL;
+				goto err;
+			}
+		}
+		if (!readonly_alloc) {
+			addr = ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+
+			++mp->stat.st_map;
+			++mfp->stat.st_map;
+
+			goto mapret;
+		}
+	}
+
+	/*
+	 * If requesting the last page or a new page, find the last page.  The
+	 * tricky thing is that the user may have created a page already that's
+	 * after any page that exists in the file.
+	 */
+	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
+		/*
+		 * Temporary files may not yet have been created.
+		 *
+		 * Don't lock -- there are no atomicity issues for stat(2).
+		 */
+		if (dbmfp->fd == -1)
+			size = 0;
+		else if ((ret = __db_stat(dbmp->dbenv,
+		    dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+			goto err;
+
+		*pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize;
+
+		/*
+		 * Walk the list of BH's, looking for later pages.  Save the
+		 * pointer if a later page is found so that we don't have to
+		 * search the list twice.
+		 *
+		 * If requesting a new page, return the page one after the last
+		 * page -- which we'll have to create.
+		 */
+		for (tbhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+		    tbhp != NULL; tbhp = SH_TAILQ_NEXT(tbhp, q, __bh))
+			if (tbhp->pgno >= *pgnoaddr &&
+			    tbhp->mf_offset == mf_offset) {
+				bhp = tbhp;
+				*pgnoaddr = bhp->pgno;
+			}
+		if (LF_ISSET(DB_MPOOL_NEW))
+			++*pgnoaddr;
+	}
+
+	/* If we already found the right buffer, return it. */
+	if (LF_ISSET(DB_MPOOL_LAST) && bhp != NULL) {
+		addr = bhp->buf;
+		goto found;
+	}
+
+	/* If we haven't checked the BH list yet, do the search. */
+	if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
+		++mp->stat.st_hash_searches;
+		bucket = BUCKET(mp, mf_offset, *pgnoaddr);
+		for (cnt = 0,
+		    bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) {
+			++cnt;
+			if (bhp->pgno == *pgnoaddr &&
+			    bhp->mf_offset == mf_offset) {
+				addr = bhp->buf;
+				if (cnt > mp->stat.st_hash_longest)
+					mp->stat.st_hash_longest = cnt;
+				mp->stat.st_hash_examined += cnt;
+				goto found;
+			}
+		}
+		if (cnt > mp->stat.st_hash_longest)
+			mp->stat.st_hash_longest = cnt;
+		mp->stat.st_hash_examined += cnt;
+	}
+
+	/*
+	 * Allocate a new buffer header and data space, and mark the contents
+	 * as useless.
+	 */
+	if ((ret = __memp_ralloc(dbmp, sizeof(BH) -
+	    sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
+		goto err;
+	addr = bhp->buf;
+#ifdef DEBUG
+	if ((ALIGNTYPE)addr & (sizeof(size_t) - 1)) {
+		__db_err(dbmp->dbenv,
+		    "Internal error: BH data NOT size_t aligned.");
+		abort();
+	}
+#endif
+	memset(bhp, 0, sizeof(BH));
+	LOCKINIT(dbmp, &bhp->mutex);
+
+	/*
+	 * Prepend the bucket header to the head of the appropriate MPOOL
+	 * bucket hash list.  Append the bucket header to the tail of the
+	 * MPOOL LRU chain.
+	 *
+	 * We have to do this before we read in the page so we can discard
+	 * our region lock without screwing up the world.
+	 */
+	bucket = BUCKET(mp, mf_offset, *pgnoaddr);
+	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, mq, __bh);
+	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
+	b_inserted = 1;
+
+	/* Set the page number, and associated MPOOLFILE. */
+	bhp->mf_offset = mf_offset;
+	bhp->pgno = *pgnoaddr;
+
+	/*
+	 * If we know we created the page, zero it out and continue.
+	 *
+	 * !!!
+	 * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function.
+	 * If DB_MPOOL_CREATE is used, then the application's pgin function
+	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
+	 * it can detect all of its page creates, and not bother.
+	 *
+	 * Otherwise, read the page into memory, optionally creating it if
+	 * DB_MPOOL_CREATE is set.
+	 *
+	 * Increment the reference count for created buffers, but importantly,
+	 * increment the reference count for buffers we're about to read so
+	 * that the buffer can't move.
+	 */
+	++bhp->ref;
+	b_incr = 1;
+
+	if (LF_ISSET(DB_MPOOL_NEW))
+		memset(addr, 0, mfp->stat.st_pagesize);
+	else {
+		/*
+		 * It's possible for the read function to fail, which means
+		 * that we fail as well.
+		 */
+reread:		if ((ret = __memp_pgread(dbmfp,
+		    bhp, LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))) != 0)
+			goto err;
+
+		/*
+		 * !!!
+		 * The __memp_pgread call discarded and reacquired the region
+		 * lock.  Because the buffer reference count was incremented
+		 * before the region lock was discarded the buffer didn't move.
+		 */
+		++mp->stat.st_cache_miss;
+		++mfp->stat.st_cache_miss;
+	}
+
+	if (0) {
+found:		/* Increment the reference count. */
+		if (bhp->ref == UINT16_T_MAX) {
+			__db_err(dbmp->dbenv,
+			    "%s: too many references to page %lu",
+			    dbmfp->path, bhp->pgno);
+			ret = EAGAIN;
+			goto err;
+		}
+		++bhp->ref;
+		b_incr = 1;
+
+		/*
+	 	 * Any found buffer might be trouble.
+		 *
+		 * BH_LOCKED --
+		 * I/O in progress, wait for it to finish.  Because the buffer
+		 * reference count was incremented before the region lock was
+		 * discarded we know the buffer didn't move.
+		 */
+		if (F_ISSET(bhp, BH_LOCKED)) {
+			UNLOCKREGION(dbmp);
+			LOCKBUFFER(dbmp, bhp);
+			/* Waiting for I/O to finish... */
+			UNLOCKBUFFER(dbmp, bhp);
+			LOCKREGION(dbmp);
+		}
+
+		/*
+		 * BH_TRASH --
+		 * The buffer is garbage.
+		 */
+		if (F_ISSET(bhp, BH_TRASH))
+			goto reread;
+
+		/*
+		 * BH_CALLPGIN --
+		 * The buffer was written, and the contents need to be
+		 * converted again.
+		 */
+		if (F_ISSET(bhp, BH_CALLPGIN)) {
+			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+				goto err;
+			F_CLR(bhp, BH_CALLPGIN);
+		}
+
+		++mp->stat.st_cache_hit;
+		++mfp->stat.st_cache_hit;
+	}
+
+	/* Success: count the page as pinned by this handle. */
+mapret:	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	++dbmfp->pinref;
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+	if (0) {
+err:		/*
+		 * If no other process is already waiting on a created buffer,
+		 * go ahead and discard it, it's not useful.
+		 *
+		 * (bhp may be NULL here when the error occurred before any
+		 * buffer was found or allocated; b_incr and b_inserted are
+		 * both 0 in that case, so bhp is never dereferenced.)
+		 */
+		if (b_incr)
+			--bhp->ref;
+		if (b_inserted && bhp->ref == 0)
+			__memp_bhfree(dbmp, mfp, bhp, 1);
+	}
+
+	UNLOCKREGION(dbmp);
+
+	/* On error addr is NULL; it is stored unconditionally. */
+	*(void **)addrp = addr;
+	return (ret);
+}
diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c
new file mode 100644
index 0000000000..7703847b73
--- /dev/null
+++ b/db2/mp/mp_fopen.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fopen.c	10.24 (Sleepycat) 8/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *));
+static int __memp_mf_open __P((DB_MPOOL *, DB_MPOOLFILE *,
+    int, int, size_t, int, DBT *, u_int8_t *, int, MPOOLFILE **));
+
+/*
+ * memp_fopen --
+ *	Open a backing file for the memory pool.
+ *
+ * Public entry point: validates the user-visible flags and calls the
+ * internal version with needlock set (the region is not yet locked).
+ */
+int
+memp_fopen(dbmp, path, ftype,
+    flags, mode, pagesize, lsn_offset, pgcookie, fileid, retp)
+	DB_MPOOL *dbmp;
+	const char *path;
+	int ftype, flags, mode, lsn_offset;
+	size_t pagesize;
+	DBT *pgcookie;
+	u_int8_t *fileid;
+	DB_MPOOLFILE **retp;
+{
+	int ret;
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(dbmp->dbenv,
+	    "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0)
+		return (ret);
+
+	return (__memp_fopen(dbmp, path, ftype,
+	    flags, mode, pagesize, lsn_offset, pgcookie, fileid, 1, retp));
+}
+
+/*
+ * __memp_fopen --
+ *	Open a backing file for the memory pool; internal version.
+ *
+ * A NULL path means a temporary file whose backing store is created
+ * lazily on first write.  needlock says whether the region must be
+ * locked around the shared MPOOLFILE lookup/allocation.
+ *
+ * PUBLIC: int __memp_fopen __P((DB_MPOOL *, const char *, int, int,
+ * PUBLIC:    int, size_t, int, DBT *, u_int8_t *, int, DB_MPOOLFILE **));
+ */
+int
+__memp_fopen(dbmp, path,
+    ftype, flags, mode, pagesize, lsn_offset, pgcookie, fileid, needlock, retp)
+	DB_MPOOL *dbmp;
+	const char *path;
+	int ftype, flags, mode, lsn_offset, needlock;
+	size_t pagesize;
+	DBT *pgcookie;
+	u_int8_t *fileid;
+	DB_MPOOLFILE **retp;
+{
+	DB_ENV *dbenv;
+	DB_MPOOLFILE *dbmfp;
+	MPOOLFILE *mfp;
+	off_t size;
+	int ret;
+
+	dbenv = dbmp->dbenv;
+	ret = 0;
+
+	/* Require a non-zero pagesize. */
+	if (pagesize == 0) {
+		__db_err(dbenv, "memp_fopen: pagesize not specified");
+		return (EINVAL);
+	}
+
+	/* Allocate and initialize the per-process structure. */
+	if ((dbmfp =
+	    (DB_MPOOLFILE *)calloc(1, sizeof(DB_MPOOLFILE))) == NULL) {
+		__db_err(dbenv, "%s: %s",
+		    path == NULL ? TEMPORARY : path, strerror(ENOMEM));
+		return (ENOMEM);
+	}
+	LOCKINIT(dbmp, &dbmfp->mutex);
+	dbmfp->dbmp = dbmp;
+	dbmfp->fd = -1;
+	if (LF_ISSET(DB_RDONLY))
+		F_SET(dbmfp, MP_READONLY);
+
+	if (path == NULL) {
+		if (LF_ISSET(DB_RDONLY)) {
+			__db_err(dbenv,
+			    "memp_fopen: temporary files can't be readonly");
+			ret = EINVAL;
+			goto err;
+		}
+		dbmfp->path = (char *) TEMPORARY;
+		F_SET(dbmfp, MP_PATH_TEMP);
+	} else {
+		/* Calculate the real name for this file. */
+		if ((ret = __db_appname(dbenv,
+		    DB_APP_DATA, NULL, path, NULL, &dbmfp->path)) != 0)
+			goto err;
+		F_SET(dbmfp, MP_PATH_ALLOC);
+
+
+		/* Open the file. */
+		if ((ret = __db_fdopen(dbmfp->path,
+		    LF_ISSET(DB_CREATE | DB_RDONLY), DB_CREATE | DB_RDONLY,
+		    mode, &dbmfp->fd)) != 0) {
+			__db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret));
+			goto err;
+		}
+
+		/* Don't permit files that aren't a multiple of the pagesize. */
+		if ((ret = __db_stat(dbenv,
+		     dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+			goto err;
+		if (size % pagesize) {
+			__db_err(dbenv,
+			    "%s: file size not a multiple of the pagesize",
+			    dbmfp->path);
+			ret = EINVAL;
+			goto err;
+		}
+	}
+
+	/* Find/allocate the shared file object. */
+	if (needlock)
+		LOCKREGION(dbmp);
+	ret = __memp_mf_open(dbmp, dbmfp, ftype,
+	    F_ISSET(dbmfp, MP_READONLY), pagesize,
+	    lsn_offset, pgcookie, fileid, F_ISSET(dbmfp, MP_PATH_TEMP), &mfp);
+	if (needlock)
+		UNLOCKREGION(dbmp);
+	if (ret != 0)
+		goto err;
+
+	dbmfp->mfp = mfp;
+
+	/*
+	 * If a file:
+	 *
+	 *	+ is read-only
+	 *	+ doesn't require any pgin/pgout support
+	 *	+ is less than mp_mmapsize bytes in size.
+	 *	+ and the DB_NOMMAP flag wasn't set
+	 *
+	 * we can mmap it instead of reading/writing buffers.  Don't do error
+	 * checking based on the mmap call failure.  We want to do normal I/O
+	 * on the file if the reason we failed was because the file was on an
+	 * NFS mounted partition, and we can fail in buffer I/O just as easily
+	 * as here.
+	 *
+	 * XXX
+	 * We'd like to test to see if the file is too big to mmap.  Since we
+	 * don't know what size or type off_t's or size_t's are, or the largest
+	 * unsigned integral type is, or what random insanity the local C
+	 * compiler will perpetrate, doing the comparison in a portable way is
+	 * flatly impossible.  Hope that mmap fails if the file is too large.
+	 */
+#define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 Mb. */
+	dbmfp->addr = NULL;
+	/*
+	 * Note: `size` is only initialized when path != NULL; the
+	 * "path != NULL" operand below short-circuits the size test
+	 * for temporary files, so size is never read uninitialized.
+	 */
+	mfp->can_mmap = F_ISSET(dbmfp, MP_READONLY) &&
+	    ftype == 0 && !LF_ISSET(DB_NOMMAP) && path != NULL &&
+	    size <= (dbenv == NULL || dbenv->mp_mmapsize == 0 ?
+	    DB_MAXMMAPSIZE : (off_t)dbenv->mp_mmapsize);
+	if (mfp->can_mmap) {
+		dbmfp->len = size;
+		if (__db_mmap(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) {
+			mfp->can_mmap = 0;
+			dbmfp->addr = NULL;
+		}
+	}
+
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	*retp = dbmfp;
+	return (0);
+
+	/* Error path: tear down whatever was built before the failure. */
+err:	if (F_ISSET(dbmfp, MP_PATH_ALLOC))
+		FREES(dbmfp->path);
+	if (dbmfp->fd != -1)
+		(void)__db_close(dbmfp->fd);
+	/* (dbmfp is always non-NULL here -- calloc failure returned above.) */
+	if (dbmfp != NULL)
+		FREE(dbmfp, sizeof(DB_MPOOLFILE));
+	return (ret);
+}
+
+/*
+ * __memp_mf_open --
+ *	Open an MPOOLFILE.
+ *
+ * Finds an existing shared MPOOLFILE by file id, or allocates a new one
+ * in shared memory.  Called with the region locked.
+ *
+ * NOTE(review): this function returns 0 unconditionally (see ret1), so
+ * the caller's "if (ret != 0)" test never fires and failures are only
+ * visible as *retp == NULL -- yet the caller dereferences *retp.  Also,
+ * `ret` is never read on the match-found path.  Presumably it should
+ * "return (ret);" with ret initialized to 0 -- TODO confirm against a
+ * later Sleepycat release.
+ */
+static int
+__memp_mf_open(dbmp, dbmfp,
+    ftype, readonly, pagesize, lsn_offset, pgcookie, fileid, istemp, retp)
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	int ftype, readonly, lsn_offset, istemp;
+	size_t pagesize;
+	DBT *pgcookie;
+	u_int8_t *fileid;
+	MPOOLFILE **retp;
+{
+	MPOOLFILE *mfp;
+	int ret;
+	u_int8_t idbuf[DB_FILE_ID_LEN];
+	void *p;
+
+	/* Temporary files can't match previous files. */
+	if (istemp)
+		goto alloc;
+
+	/*
+	 * Get the file id if we weren't give one.  Generated file id's don't
+	 * use timestamps, otherwise there'd be no chance of anyone joining
+	 * the party.
+	 */
+	if (fileid == NULL) {
+		if ((ret =
+		    __db_fileid(dbmp->dbenv, dbmfp->path, 0, idbuf)) != 0)
+			return (ret);
+		fileid = idbuf;
+	}
+
+	/* Walk the list of MPOOLFILE's, looking for a matching file. */
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		if (!memcmp(fileid,
+		    ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
+			if (ftype != mfp->ftype ||
+			    pagesize != mfp->stat.st_pagesize) {
+				__db_err(dbmp->dbenv,
+				    "%s: ftype or pagesize changed",
+				    dbmfp->path);
+				ret = EINVAL;
+				mfp = NULL;
+				goto ret1;
+			}
+			/*
+			 * Found it: increment the reference count and update
+			 * the mmap-able status.
+			 */
+			++mfp->ref;
+			if (!readonly)
+				mfp->can_mmap = 0;
+			goto ret1;
+		}
+
+	/* Allocate a new MPOOLFILE. */
+alloc:	if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+		goto ret1;
+
+	/* Initialize the structure. */
+	memset(mfp, 0, sizeof(MPOOLFILE));
+	mfp->ref = 1;
+	mfp->ftype = ftype;
+	mfp->lsn_off = lsn_offset;
+	mfp->stat.st_pagesize = pagesize;
+
+	/* Copy the file path into shared memory. */
+	if ((ret = __memp_ralloc(dbmp,
+	    strlen(dbmfp->path) + 1, &mfp->path_off, &p)) != 0)
+		goto err;
+	memcpy(p, dbmfp->path, strlen(dbmfp->path) + 1);
+
+	/* Copy the file identification string into shared memory. */
+	if (istemp)
+		mfp->fileid_off = 0;
+	else {
+		if ((ret = __memp_ralloc(dbmp,
+		    DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+			goto err;
+		memcpy(p, fileid, DB_FILE_ID_LEN);
+	}
+
+	/* Copy the page cookie into shared memory. */
+	if (pgcookie == NULL || pgcookie->size == 0) {
+		mfp->pgcookie_len = 0;
+		mfp->pgcookie_off = 0;
+	} else {
+		if ((ret = __memp_ralloc(dbmp,
+		    pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
+			goto err;
+		memcpy(p, pgcookie->data, pgcookie->size);
+		mfp->pgcookie_len = pgcookie->size;
+	}
+
+	/* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
+	SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile);
+
+	/*
+	 * NOTE(review): if the fileid allocation itself failed (!istemp and
+	 * fileid_off still 0), the unconditional free below passes offset 0
+	 * to __db_shalloc_free -- verify that is safe, or guard it the same
+	 * way path_off is guarded.
+	 */
+	if (0) {
+err:		if (mfp->path_off != 0)
+			__db_shalloc_free(dbmp->addr,
+			    ADDR(dbmp, mfp->path_off));
+		if (!istemp)
+			__db_shalloc_free(dbmp->addr,
+			    ADDR(dbmp, mfp->fileid_off));
+		if (mfp != NULL)
+			__db_shalloc_free(dbmp->addr, mfp);
+		mfp = NULL;
+	}
+
+ret1:	*retp = mfp;
+	return (0);
+}
+
+/*
+ * memp_fclose --
+ *	Close a backing file for the memory pool.
+ *
+ * Unlinks the per-process handle, drops the shared MPOOLFILE reference,
+ * tears down any mmap, closes the fd and frees the handle.
+ */
+int
+memp_fclose(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_MPOOL *dbmp;
+	int ret, t_ret;
+
+	dbmp = dbmfp->dbmp;
+	ret = 0;
+
+	/* Complain if pinned blocks never returned. */
+	if (dbmfp->pinref != 0)
+		__db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned",
+		    dbmfp->path, (u_long)dbmfp->pinref);
+
+	/* Remove the DB_MPOOLFILE structure from the list. */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	/* Close the underlying MPOOLFILE. */
+	(void)__memp_mf_close(dbmp, dbmfp);
+
+	/* Discard any mmap information. */
+	if (dbmfp->addr != NULL &&
+	    (ret = __db_munmap(dbmfp->addr, dbmfp->len)) != 0)
+		__db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(ret));
+
+	/* Close the file; temporary files may not yet have been created. */
+	if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) {
+		__db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(t_ret));
+		/*
+		 * NOTE(review): this assignment looks reversed -- presumably
+		 * the intent is "if (ret == 0) ret = t_ret;" so the close
+		 * error propagates when no earlier error occurred.  As
+		 * written, t_ret is overwritten and then discarded, so the
+		 * close error is lost.  TODO confirm and fix.
+		 */
+		if (ret != 0)
+			t_ret = ret;
+	}
+
+	/* Potentially allocated path. */
+	if (F_ISSET(dbmfp, MP_PATH_ALLOC))
+		FREES(dbmfp->path);
+
+	/* Free the DB_MPOOLFILE structure. */
+	FREE(dbmfp, sizeof(DB_MPOOLFILE));
+
+	return (ret);
+}
+
+/*
+ * __memp_mf_close --
+ *	Close down an MPOOLFILE.
+ *
+ * Drops one reference; when the last reference goes away, moves the
+ * file's buffers to the free list and releases the shared structures.
+ */
+static int
+__memp_mf_close(dbmp, dbmfp)
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+{
+	BH *bhp, *nbhp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t mf_offset;
+
+	mp = dbmp->mp;
+	mfp = dbmfp->mfp;
+
+	LOCKREGION(dbmp);
+
+	/* If more than a single reference, simply decrement. */
+	if (mfp->ref > 1) {
+		--mfp->ref;
+		goto ret1;
+	}
+
+	/*
+	 * Move any BH's held by the file to the free list.  We don't free the
+	 * memory itself because we may be discarding the memory pool, and it's
+	 * fairly expensive to reintegrate the buffers back into the region for
+	 * no purpose.
+	 */
+	mf_offset = OFFSET(dbmp, mfp);
+	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+#ifdef DEBUG_NO_DIRTY
+		/* Complain if we find any blocks that were left dirty. */
+		if (F_ISSET(bhp, BH_DIRTY))
+			__db_err(dbmp->dbenv,
+			    "%s: close: pgno %lu left dirty; ref %lu",
+			    dbmfp->path, (u_long)bhp->pgno, (u_long)bhp->ref);
+#endif
+
+		if (bhp->mf_offset == mf_offset) {
+			__memp_bhfree(dbmp, mfp, bhp, 0);
+			SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh);
+		}
+	}
+
+	/* Delete from the list of MPOOLFILEs. */
+	SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
+
+	/*
+	 * Free the space.
+	 *
+	 * NOTE(review): mfp is freed first, then mfp->path_off, fileid_off
+	 * and pgcookie_off are read from the freed structure -- a use after
+	 * free.  The mfp free should be the LAST of these four calls.  TODO
+	 * confirm against a later Sleepycat release.
+	 */
+	__db_shalloc_free(dbmp->addr, mfp);
+	__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->path_off));
+	if (mfp->fileid_off != 0)
+		__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->fileid_off));
+	if (mfp->pgcookie_off != 0)
+		__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->pgcookie_off));
+
+ret1:	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c
new file mode 100644
index 0000000000..5fac8ae76b
--- /dev/null
+++ b/db2/mp/mp_fput.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fput.c	10.10 (Sleepycat) 7/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_fput --
+ *	Mpool file put function.
+ */
+int
+memp_fput(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_long flags;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	int wrote, ret;
+
+	dbmp = dbmfp->dbmp;
+
+	/* Validate arguments: CLEAN and DIRTY are mutually exclusive. */
+	if (flags) {
+		if ((ret = __db_fchk(dbmp->dbenv, "memp_fput", flags,
+		    DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0)
+			return (ret);
+		if ((ret = __db_fcchk(dbmp->dbenv, "memp_fput",
+		    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
+			return (ret);
+
+		/* A page from a readonly file may not be marked dirty. */
+		if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+			__db_err(dbmp->dbenv,
+			    "%s: dirty flag set for readonly file page",
+			    dbmfp->path);
+			return (EACCES);
+		}
+	}
+
+	/* Decrement the pinned reference count (per-handle, not per-page). */
+	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (dbmfp->pinref == 0)
+		__db_err(dbmp->dbenv,
+		    "%s: put: more blocks returned than retrieved",
+		    dbmfp->path);
+	else
+		--dbmfp->pinref;
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+	/*
+	 * If we're mapping the file, there's nothing to do.  Because we can
+	 * quit mapping at any time, we have to check on each buffer to see
+	 * if it's in the map region.
+	 *
+	 * NOTE(review): the upper-bound test uses <=, which also accepts the
+	 * address one byte past the mapped region -- presumably no valid page
+	 * starts there, but confirm an inclusive bound was intended.
+	 */
+	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
+		return (0);
+
+	/* Convert the page address to a buffer header. */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	LOCKREGION(dbmp);
+
+	/* Set/clear the page bits. */
+	if (LF_ISSET(DB_MPOOL_CLEAN))
+		F_CLR(bhp, BH_DIRTY);
+	if (LF_ISSET(DB_MPOOL_DIRTY))
+		F_SET(bhp, BH_DIRTY);
+	if (LF_ISSET(DB_MPOOL_DISCARD))
+		F_SET(bhp, BH_DISCARD);
+
+	/*
+	 * If more than one reference to the page, we're done.  Ignore discard
+	 * flags (for now) and leave it at its position in the LRU chain.  The
+	 * rest gets done at last reference close.
+	 */
+#ifdef DEBUG
+	if (bhp->ref == 0) {
+		__db_err(dbmp->dbenv,
+		    "Internal error: bhp->ref on page %lu went negative.",
+		    (u_long)bhp->pgno);
+		abort();
+	}
+#endif
+	if (--bhp->ref > 0) {
+		UNLOCKREGION(dbmp);
+		return (0);
+	}
+
+	/*
+	 * Move the buffer to the head/tail of the LRU chain:
+	 * DISCARD means "reuse me first", so insert at the head.
+	 */
+	SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
+	if (F_ISSET(bhp, BH_DISCARD))
+		SH_TAILQ_INSERT_HEAD(&dbmp->mp->bhq, bhp, q, __bh);
+	else
+		SH_TAILQ_INSERT_TAIL(&dbmp->mp->bhq, bhp, q);
+
+	/*
+	 * If this buffer is scheduled for writing because of a checkpoint,
+	 * write it now.  If we can't write it, set a flag so that the next
+	 * time the memp_sync function is called we try writing it there,
+	 * as the checkpoint application better be able to write all of the
+	 * files.
+	 */
+	if (F_ISSET(bhp, BH_WRITE))
+		if (F_ISSET(bhp, BH_DIRTY)) {
+			if (__memp_bhwrite(dbmp,
+			    dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
+				F_SET(dbmp->mp, MP_LSN_RETRY);
+		} else {
+			/* Already clean: retire it from the checkpoint
+			 * counts in both the file and the pool. */
+			F_CLR(bhp, BH_WRITE);
+
+			mfp = ADDR(dbmp, bhp->mf_offset);
+			--mfp->lsn_cnt;
+
+			--dbmp->mp->lsn_cnt;
+		}
+
+	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c
new file mode 100644
index 0000000000..588085a358
--- /dev/null
+++ b/db2/mp/mp_fset.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fset.c	10.8 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_fset --
+ *	Mpool page set-flag routine.
+ */
+int
+memp_fset(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_long flags;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	int ret;
+
+	dbmp = dbmfp->dbmp;
+
+	/* Validate arguments: CLEAN and DIRTY are mutually exclusive. */
+	if (flags != 0) {
+		if ((ret = __db_fchk(dbmp->dbenv, "memp_fset", flags,
+		    DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0)
+			return (ret);
+		if ((ret = __db_fcchk(dbmp->dbenv, "memp_fset",
+		    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
+			return (ret);
+
+		/* A page from a readonly file may not be marked dirty. */
+		if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+			__db_err(dbmp->dbenv,
+			    "%s: dirty flag set for readonly file page",
+			    dbmfp->path);
+			return (EACCES);
+		}
+	}
+
+	/*
+	 * Convert the page address to a buffer header -- the page data
+	 * lives immediately after its BH in the region.
+	 */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	LOCKREGION(dbmp);
+
+	/* Set/clear the page state bits under the region lock. */
+	if (LF_ISSET(DB_MPOOL_DIRTY))
+		F_SET(bhp, BH_DIRTY);
+	if (LF_ISSET(DB_MPOOL_CLEAN))
+		F_CLR(bhp, BH_DIRTY);
+	if (LF_ISSET(DB_MPOOL_DISCARD))
+		F_SET(bhp, BH_DISCARD);
+
+	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c
new file mode 100644
index 0000000000..257ce1b9e9
--- /dev/null
+++ b/db2/mp/mp_open.c
@@ -0,0 +1,176 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_open.c	10.12 (Sleepycat) 7/6/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_open --
+ *	Initialize and/or join a memory pool.
+ */
+int
+memp_open(path, flags, mode, dbenv, retp)
+	const char *path;
+	int flags, mode;
+	DB_ENV *dbenv;
+	DB_MPOOL **retp;
+{
+	DB_MPOOL *dbmp;
+	size_t cachesize;
+	int ret;
+
+	/*
+	 * Validate arguments.  DB_THREAD is only accepted when spinlocks
+	 * are available, since handle locking requires them.
+	 */
+#ifdef HAVE_SPINLOCKS
+#define	OKFLAGS	(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD)
+#else
+#define	OKFLAGS	(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP)
+#endif
+	if ((ret = __db_fchk(dbenv, "memp_open", flags, OKFLAGS)) != 0)
+		return (ret);
+
+	/* Extract fields from DB_ENV structure. */
+	cachesize = dbenv == NULL ? 0 : dbenv->mp_size;
+
+	/* Create and initialize the DB_MPOOL structure (zero-filled). */
+	if ((dbmp = (DB_MPOOL *)calloc(1, sizeof(DB_MPOOL))) == NULL)
+		return (ENOMEM);
+	LOCKINIT(dbmp, &dbmp->mutex);
+	LIST_INIT(&dbmp->dbregq);
+	TAILQ_INIT(&dbmp->dbmfq);
+
+	dbmp->dbenv = dbenv;
+
+	/* Decide if it's possible for anyone else to access the pool. */
+	if ((dbenv == NULL && path == NULL) ||
+	    (dbenv != NULL && F_ISSET(dbenv, DB_MPOOL_PRIVATE)))
+		F_SET(dbmp, MP_ISPRIVATE);
+
+	/*
+	 * Map in the region.  We do locking regardless, as portions of it are
+	 * implemented in common code (if we put the region in a file, that is).
+	 */
+	F_SET(dbmp, MP_LOCKREGION);
+	if ((ret = __memp_ropen(dbmp, path, cachesize, mode, flags)) != 0)
+		goto err;
+	F_CLR(dbmp, MP_LOCKREGION);
+
+	/*
+	 * If there's concurrent access, then we have to lock the region.
+	 * If it's threaded, then we have to lock both the handles and the
+	 * region.
+	 */
+	if (!F_ISSET(dbmp, MP_ISPRIVATE))
+		F_SET(dbmp, MP_LOCKREGION);
+	if (LF_ISSET(DB_THREAD))
+		F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION);
+
+	*retp = dbmp;
+	return (0);
+
+	/* dbmp cannot be NULL here; the test is purely defensive. */
+err:	if (dbmp != NULL)
+		FREE(dbmp, sizeof(DB_MPOOL));
+	return (ret);
+}
+
+/*
+ * memp_close --
+ *	Close a memory pool.
+ */
+int
+memp_close(dbmp)
+	DB_MPOOL *dbmp;
+{
+	DB_MPOOLFILE *dbmfp;
+	DB_MPREG *mpreg;
+	int ret, t_ret;
+
+	ret = 0;
+
+	/* Discard DB_MPREGs (per-process pgin/pgout registrations). */
+	while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
+		LIST_REMOVE(mpreg, q);
+		FREE(mpreg, sizeof(DB_MPREG));
+	}
+
+	/* Discard DB_MPOOLFILEs; remember the first failure but continue. */
+	while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
+		if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0)
+			ret = t_ret;
+
+	/* Close the region. */
+	if ((t_ret = __memp_rclose(dbmp)) && ret == 0)
+		ret = t_ret;
+
+	/* Free the structure. */
+	FREE(dbmp, sizeof(DB_MPOOL));
+
+	return (ret);
+}
+
+/*
+ * memp_unlink --
+ *	Exit a memory pool.
+ */
+int
+memp_unlink(path, force, dbenv)
+	const char *path;
+	int force;
+	DB_ENV *dbenv;
+{
+	/* Remove the backing region file; force is passed straight
+	 * through to __db_runlink -- its semantics are defined there. */
+	return (__db_runlink(dbenv,
+	    DB_APP_NONE, path, DB_DEFAULT_MPOOL_FILE, force));
+}
+
+/*
+ * memp_register --
+ *	Register a file type's pgin, pgout routines.
+ */
+int
+memp_register(dbmp, ftype, pgin, pgout)
+	DB_MPOOL *dbmp;
+	int ftype;
+	int (*pgin) __P((db_pgno_t, void *, DBT *));
+	int (*pgout) __P((db_pgno_t, void *, DBT *));
+{
+	DB_MPREG *mpr;
+
+	/* Per-process registration; freed again in memp_close(). */
+	if ((mpr = (DB_MPREG *)malloc(sizeof(DB_MPREG))) == NULL)
+		return (ENOMEM);
+
+	mpr->ftype = ftype;
+	mpr->pgin = pgin;
+	mpr->pgout = pgout;
+
+	/*
+	 * Insert at the head.  Because we do a linear walk, we'll find
+	 * the most recent registry in the case of multiple entries, so
+	 * we don't have to check for multiple registries.
+	 */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	LIST_INSERT_HEAD(&dbmp->dbregq, mpr, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	return (0);
+}
diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c
new file mode 100644
index 0000000000..94eabf5947
--- /dev/null
+++ b/db2/mp/mp_pr.c
@@ -0,0 +1,313 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_pr.c	10.12 (Sleepycat) 7/29/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+
+void __memp_debug __P((DB_MPOOL *, FILE *, int));
+
+static void __memp_pbh __P((FILE *, DB_MPOOL *, BH *, int));
+static void __memp_pdbmf __P((FILE *, DB_MPOOLFILE *, int));
+static void __memp_pmf __P((FILE *, MPOOLFILE *, int));
+static void __memp_pmp __P((FILE *, DB_MPOOL *, MPOOL *, int));
+
+/*
+ * memp_stat --
+ *	Display MPOOL statistics.
+ */
+int
+memp_stat(dbmp, gspp, fspp, db_malloc)
+	DB_MPOOL *dbmp;
+	DB_MPOOL_STAT **gspp;
+	DB_MPOOL_FSTAT ***fspp;
+	void *(*db_malloc) __P((size_t));
+{
+	DB_MPOOL_FSTAT **tfsp;
+	MPOOLFILE *mfp;
+	size_t len, nlen;
+	char *name;
+
+	/* Allocate space for the global statistics. */
+	if (gspp != NULL) {
+		*gspp = NULL;
+
+		if ((*gspp = db_malloc == NULL ?
+		    (DB_MPOOL_STAT *)malloc(sizeof(**gspp)) :
+		    (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL)
+			return (ENOMEM);
+
+		LOCKREGION(dbmp);
+
+		/* Copy out the global statistics. */
+		**gspp = dbmp->mp->stat;
+		(*gspp)->st_hash_buckets = dbmp->mp->htab_buckets;
+
+		UNLOCKREGION(dbmp);
+	}
+
+	if (fspp != NULL) {
+		*fspp = NULL;
+
+		LOCKREGION(dbmp);
+
+		/* Count the MPOOLFILE structures. */
+		for (len = 0,
+		    mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+		    mfp != NULL;
+		    ++len, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile));
+
+		UNLOCKREGION(dbmp);
+
+		if (len == 0)
+			return (0);
+
+		/* Allocate space for the NULL-terminated array of pointers. */
+		len = (len + 1) * sizeof(DB_MPOOL_FSTAT *);
+		if ((*fspp = db_malloc == NULL ?
+		    (DB_MPOOL_FSTAT **)malloc(len) :
+		    (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL)
+			return (ENOMEM);
+
+		LOCKREGION(dbmp);
+
+		/* Build each individual entry. */
+		for (tfsp = *fspp,
+		    mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+		    mfp != NULL;
+		    ++tfsp, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			name = ADDR(dbmp, mfp->path_off);
+			nlen = strlen(name);
+			len = sizeof(DB_MPOOL_FSTAT) + nlen + 1;
+			if ((*tfsp = db_malloc == NULL ?
+			    (DB_MPOOL_FSTAT *)malloc(len) :
+			    (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL) {
+				/*
+				 * Never return holding the region lock.  The
+				 * failed assignment already NULL-terminated
+				 * the array, so the caller can walk *fspp and
+				 * release the entries built so far.
+				 */
+				UNLOCKREGION(dbmp);
+				return (ENOMEM);
+			}
+			**tfsp = mfp->stat;
+			(*tfsp)->file_name = (char *)
+			    (u_int8_t *)*tfsp + sizeof(DB_MPOOL_FSTAT);
+			memcpy((*tfsp)->file_name, name, nlen + 1);
+		}
+		*tfsp = NULL;
+
+		UNLOCKREGION(dbmp);
+	}
+	return (0);
+}
+
+/*
+ * __memp_debug --
+ *	Display MPOOL structures.
+ *
+ * PUBLIC: void __memp_debug __P((DB_MPOOL *, FILE *, int));
+ */
+void
+__memp_debug(dbmp, fp, data)
+	DB_MPOOL *dbmp;
+	FILE *fp;
+	int data;
+{
+	DB_MPOOLFILE *dbmfp;
+	u_long cnt;
+
+	/* Make it easy to call from the debugger. */
+	if (fp == NULL)
+		fp = stderr;
+
+	/* Welcome message. */
+	(void)fprintf(fp, "%s\nMpool per-process (%lu) statistics\n",
+	    DB_LINE, (u_long)getpid());
+
+	if (data)
+		(void)fprintf(fp, "    fd: %d; addr %lx; maddr %lx\n",
+		    dbmp->fd, (u_long)dbmp->addr, (u_long)dbmp->maddr);
+
+	/* Display the DB_MPOOLFILE structures: first count, then print. */
+	for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; ++cnt, dbmfp = TAILQ_NEXT(dbmfp, q));
+	(void)fprintf(fp, "%lu process-local files\n", cnt);
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) {
+		(void)fprintf(fp, "%s\n", dbmfp->path);
+		__memp_pdbmf(fp, dbmfp, data);
+	}
+
+	/* Switch to global statistics. */
+	(void)fprintf(fp, "\n%s\nMpool statistics\n", DB_LINE);
+
+	/* Display the MPOOL structure. */
+	__memp_pmp(fp, dbmp, dbmp->mp, data);
+
+	/* Flush in case we're debugging. */
+	(void)fflush(fp);
+}
+
+/*
+ * __memp_pdbmf --
+ *	Display a DB_MPOOLFILE structure.
+ */
+static void
+__memp_pdbmf(fp, dbmfp, data)
+	FILE *fp;
+	DB_MPOOLFILE *dbmfp;
+	int data;
+{
+	/* Per-handle details are printed only in verbose (data) mode. */
+	if (!data)
+		return;
+
+	(void)fprintf(fp, "    fd: %d; %s\n",
+	    dbmfp->fd, F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write");
+}
+
+/*
+ * __memp_pmp --
+ *	Display the MPOOL structure.
+ */
+static void
+__memp_pmp(fp, dbmp, mp, data)
+	FILE *fp;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	int data;
+{
+	BH *bhp;
+	MPOOLFILE *mfp;
+	DB_HASHTAB *htabp;
+	size_t bucket;
+	int cnt;
+	const char *sep;
+
+	/* Pool-wide counters. */
+	(void)fprintf(fp, "references: %lu; cachesize: %lu\n",
+	    (u_long)mp->rlayout.refcnt, (u_long)mp->stat.st_cachesize);
+	(void)fprintf(fp,
+	    "    %lu pages created\n", mp->stat.st_page_create);
+	(void)fprintf(fp,
+	    "    %lu mmap pages returned\n", mp->stat.st_map);
+	(void)fprintf(fp, "    %lu I/O's (%lu read, %lu written)\n",
+	    mp->stat.st_page_in + mp->stat.st_page_out,
+	    mp->stat.st_page_in, mp->stat.st_page_out);
+	if (mp->stat.st_cache_hit + mp->stat.st_cache_miss != 0)
+		(void)fprintf(fp,
+		    "    %.0f%% cache hit rate (%lu hit, %lu miss)\n",
+		    ((double)mp->stat.st_cache_hit /
+	    (mp->stat.st_cache_hit + mp->stat.st_cache_miss)) * 100,
+		    mp->stat.st_cache_hit, mp->stat.st_cache_miss);
+
+	/* Display the MPOOLFILE structures: first count, then print. */
+	for (cnt = 0, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile));
+	(void)fprintf(fp, "%d total files\n", cnt);
+	for (cnt = 1, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+		(void)fprintf(fp, "file %d\n", cnt);
+		__memp_pmf(fp, mfp, data);
+	}
+
+	/* Buffer-level detail is printed only in verbose (data) mode. */
+	if (!data)
+		return;
+
+	/* Display the hash table list of BH's. */
+	(void)fprintf(fp, "%s\nHASH table of BH's (%lu buckets):\n",
+	    DB_LINE, (u_long)mp->htab_buckets);
+	(void)fprintf(fp,
+	    "longest chain searched %lu\n", mp->stat.st_hash_longest);
+	(void)fprintf(fp, "average chain searched %lu (total/calls: %lu/%lu)\n",
+	    mp->stat.st_hash_examined /
+	    (mp->stat.st_hash_searches ? mp->stat.st_hash_searches : 1),
+	    mp->stat.st_hash_examined, mp->stat.st_hash_searches);
+	for (htabp = dbmp->htab,
+	    bucket = 0; bucket < mp->htab_buckets; ++htabp, ++bucket) {
+		if (SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh) != NULL)
+			(void)fprintf(fp, "%lu:\n", (u_long)bucket);
+		for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh))
+			__memp_pbh(fp, dbmp, bhp, data);
+	}
+
+	/* Display the LRU list of BH's. */
+	(void)fprintf(fp, "LRU list of BH's (pgno/offset):");
+	for (sep = "\n    ", bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+	    bhp != NULL; sep = ", ", bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+		(void)fprintf(fp, "%s%lu/%lu", sep,
+		    (u_long)bhp->pgno, (u_long)OFFSET(dbmp, bhp));
+	(void)fprintf(fp, "\n");
+}
+
+/*
+ * __memp_pmf --
+ *	Display an MPOOLFILE structure.
+ */
+static void
+__memp_pmf(fp, mfp, data)
+	FILE *fp;
+	MPOOLFILE *mfp;
+	int data;
+{
+	/* Per-file counters, always printed. */
+	(void)fprintf(fp, "    %lu pages created\n", mfp->stat.st_page_create);
+	(void)fprintf(fp, "    %lu I/O's (%lu read, %lu written)\n",
+	    mfp->stat.st_page_in + mfp->stat.st_page_out,
+	    mfp->stat.st_page_in, mfp->stat.st_page_out);
+	if (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss != 0)
+		(void)fprintf(fp,
+		    "    %.0f%% cache hit rate (%lu hit, %lu miss)\n",
+		    ((double)mfp->stat.st_cache_hit /
+		    (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss)) * 100,
+		    mfp->stat.st_cache_hit, mfp->stat.st_cache_miss);
+	/* Reference count and mapping mode only in verbose (data) mode. */
+	if (!data)
+		return;
+
+	(void)fprintf(fp, "    %d references; %s; pagesize: %lu\n", mfp->ref,
+	    mfp->can_mmap ? "mmap" : "read/write",
+	    (u_long)mfp->stat.st_pagesize);
+}
+
+/*
+ * __memp_pbh --
+ *	Display a BH structure.
+ */
+static void
+__memp_pbh(fp, dbmp, bhp, data)
+	FILE *fp;
+	DB_MPOOL *dbmp;
+	BH *bhp;
+	int data;
+{
+	const char *sep;
+
+	if (!data)
+		return;
+
+	(void)fprintf(fp, "    BH @ %lu (mf: %lu): page %lu; ref %lu",
+	    (u_long)OFFSET(dbmp, bhp),
+	    (u_long)bhp->mf_offset, (u_long)bhp->pgno, (u_long)bhp->ref);
+	/* Only the DIRTY and WRITE flags are reported here. */
+	sep = "; ";
+	if (F_ISSET(bhp, BH_DIRTY)) {
+		(void)fprintf(fp, "%sdirty", sep);
+		sep = ", ";
+	}
+	if (F_ISSET(bhp, BH_WRITE)) {
+		(void)fprintf(fp, "%schk_write", sep);
+		sep = ", ";
+	}
+	(void)fprintf(fp, "\n");
+}
diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c
new file mode 100644
index 0000000000..a5c52123b9
--- /dev/null
+++ b/db2/mp/mp_region.c
@@ -0,0 +1,340 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_region.c	10.11 (Sleepycat) 8/2/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * __memp_ralloc --
+ *	Allocate some space in the mpool region.
+ *
+ * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *));
+ */
+int
+__memp_ralloc(dbmp, len, offsetp, retp)
+	DB_MPOOL *dbmp;
+	size_t len, *offsetp;
+	void *retp;
+{
+	BH *bhp, *nbhp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t fsize, total;
+	int nomore, restart, ret, wrote;
+	void *p;
+
+	mp = dbmp->mp;
+
+	nomore = 0;
+alloc:	if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
+		if (offsetp != NULL)
+			*offsetp = OFFSET(dbmp, p);
+		*(void **)retp = p;
+		return (0);
+	}
+	if (nomore) {
+		__db_err(dbmp->dbenv, "%s", strerror(ret));
+		return (ret);
+	}
+
+	/* Look for a buffer on the free list that's the right size. */
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		if (__db_shsizeof(bhp) == len) {
+			SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
+			if (offsetp != NULL)
+				*offsetp = OFFSET(dbmp, bhp);
+			*(void **)retp = bhp;
+			return (0);
+		}
+	}
+
+	/* Discard from the free list until we've freed enough memory. */
+	total = 0;
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		/*
+		 * Get the chunk's size before freeing it -- the allocator
+		 * may reuse the freed memory for free-list links, so
+		 * calling __db_shsizeof() after the free reads freed
+		 * memory.
+		 */
+		fsize = __db_shsizeof(bhp);
+
+		SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
+		__db_shalloc_free(dbmp->addr, bhp);
+
+		/*
+		 * Retry as soon as we've freed up sufficient space.  If we
+		 * have to coalesce chunks of memory to satisfy the request,
+		 * don't try until it's likely (possible?) that we'll succeed.
+		 */
+		total += fsize;
+		if (fsize >= len || total >= 3 * len)
+			goto alloc;
+	}
+
+retry:	/* Find a buffer we can flush; pure LRU. */
+	total = 0;
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		/* Ignore pinned or locked (I/O in progress) buffers. */
+		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+			continue;
+
+		/*
+		 * Only __memp_bhwrite() below sets restart; reset it per
+		 * iteration so the test at the bottom of the loop never
+		 * reads a stale or uninitialized value for clean buffers.
+		 */
+		restart = 0;
+
+		/* Find the associated MPOOLFILE. */
+		mfp = ADDR(dbmp, bhp->mf_offset);
+
+		/*
+		 * Write the page if it's dirty.
+		 *
+		 * If we wrote the page, fall through and free the buffer.  We
+		 * don't have to rewalk the list to acquire the buffer because
+		 * it was never available for any other process to modify it.
+		 * If we didn't write the page, but we discarded and reacquired
+		 * the region lock, restart the buffer list walk.  If we neither
+		 * wrote the buffer nor discarded the region lock, continue down
+		 * the buffer list.
+		 */
+		if (F_ISSET(bhp, BH_DIRTY)) {
+			if ((ret = __memp_bhwrite(dbmp,
+			    mfp, bhp, &restart, &wrote)) != 0)
+				return (ret);
+
+			/*
+			 * It's possible that another process wants this buffer
+			 * and incremented the ref count while we were writing
+			 * it.
+			 */
+			if (bhp->ref != 0)
+				goto retry;
+
+			if (wrote)
+				++mp->stat.st_rw_evict;
+			else {
+				if (restart)
+					goto retry;
+				else
+					continue;
+			}
+		} else
+			++mp->stat.st_ro_evict;
+
+		/*
+		 * Check to see if the buffer is the size we're looking for.
+		 * If it is, simply reuse it.
+		 */
+		total += fsize = __db_shsizeof(bhp);
+		if (fsize == len) {
+			__memp_bhfree(dbmp, mfp, bhp, 0);
+
+			if (offsetp != NULL)
+				*offsetp = OFFSET(dbmp, bhp);
+			*(void **)retp = bhp;
+			return (0);
+		}
+
+		/* Free the buffer. */
+		__memp_bhfree(dbmp, mfp, bhp, 1);
+
+		/*
+		 * Retry as soon as we've freed up sufficient space.  If we
+		 * have to coalesce chunks of memory to satisfy the request,
+		 * don't try until it's likely (possible?) that we'll succeed.
+		 */
+		if (fsize >= len || total >= 3 * len)
+			goto alloc;
+
+		/* Restart the walk if we discarded the region lock. */
+		if (restart)
+			goto retry;
+	}
+	nomore = 1;
+	goto alloc;
+}
+
+/*
+ * __memp_ropen --
+ *	Attach to, and optionally create, the mpool region.
+ *
+ * PUBLIC: int __memp_ropen
+ * PUBLIC:    __P((DB_MPOOL *, const char *, size_t, int, int));
+ */
+int
+__memp_ropen(dbmp, path, cachesize, mode, flags)
+	DB_MPOOL *dbmp;
+	const char *path;
+	size_t cachesize;
+	int mode, flags;
+{
+	MPOOL *mp;
+	size_t rlen;
+	int fd, newregion, ret, retry_cnt;
+
+	/*
+	 * Unlike other DB subsystems, mpool can't simply grow the region
+	 * because it returns pointers into the region to its clients.  To
+	 * "grow" the region, we'd have to allocate a new region and then
+	 * store a region number in the structures that reference regional
+	 * objects.  It's reasonable that we fail regardless, as clients
+	 * shouldn't have every page in the region pinned, so the only
+	 * "failure" mode should be a performance penalty because we don't
+	 * find a page in the cache that we'd like to have found.
+	 *
+	 * Up the user's cachesize by 25% to account for our overhead.
+	 */
+	if (cachesize < DB_CACHESIZE_MIN) {
+		if (cachesize == 0)
+			cachesize = DB_CACHESIZE_DEF;
+		else
+			cachesize = DB_CACHESIZE_MIN;
+	}
+	rlen = cachesize + cachesize / 4;
+
+	/*
+	 * Map in the region.  Initialize fd so the error path can tell
+	 * whether a region file descriptor was ever acquired -- private
+	 * (malloc'd) pools never set one.
+	 */
+	fd = -1;
+	retry_cnt = newregion = 0;
+retry:	if (LF_ISSET(DB_CREATE)) {
+		/*
+		 * If it's a private mpool, use malloc, it's a lot faster than
+		 * instantiating a region.
+		 *
+		 * XXX
+		 * If we're doing locking and don't have spinlocks for this
+		 * architecture, we'd have to instantiate the file, we need
+		 * the file descriptor for locking.  However, it should not
+		 * be possible for DB_THREAD to be set if HAVE_SPINLOCKS aren't
+		 * defined.
+		 */
+		if (F_ISSET(dbmp, MP_ISPRIVATE))
+			ret = (dbmp->maddr = malloc(rlen)) == NULL ? ENOMEM : 0;
+		else
+			ret = __db_rcreate(dbmp->dbenv, DB_APP_NONE, path,
+			    DB_DEFAULT_MPOOL_FILE, mode, rlen, &fd,
+			    &dbmp->maddr);
+		if (ret == 0) {
+			/* Put the MPOOL structure first in the region. */
+			mp = dbmp->maddr;
+
+			SH_TAILQ_INIT(&mp->bhq);
+			SH_TAILQ_INIT(&mp->bhfq);
+			SH_TAILQ_INIT(&mp->mpfq);
+
+			/* Initialize the rest of the region as free space. */
+			dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL);
+			__db_shalloc_init(dbmp->addr, rlen - sizeof(MPOOL));
+
+			/*
+			 * Pretend that the cache will be broken up into 4K
+			 * pages, and that we want to keep it under, say, 10
+			 * pages on each chain.  This means a 256MB cache will
+			 * allocate ~6500 offset pairs.
+			 */
+			mp->htab_buckets =
+			    __db_tablesize((cachesize / (4 * 1024)) / 10);
+
+			/* Allocate hash table space and initialize it. */
+			if ((ret = __db_shalloc(dbmp->addr,
+			    mp->htab_buckets * sizeof(DB_HASHTAB),
+			    0, &dbmp->htab)) != 0)
+				goto err;
+			__db_hashinit(dbmp->htab, mp->htab_buckets);
+			mp->htab = OFFSET(dbmp, dbmp->htab);
+
+			memset(&mp->stat, 0, sizeof(mp->stat));
+			mp->stat.st_cachesize = cachesize;
+
+			mp->flags = 0;
+
+			newregion = 1;
+		} else if (ret != EEXIST)
+			return (ret);
+	}
+
+	/* If we didn't or couldn't create the region, try and join it. */
+	if (!newregion &&
+	    (ret = __db_ropen(dbmp->dbenv, DB_APP_NONE,
+	    path, DB_DEFAULT_MPOOL_FILE, 0, &fd, &dbmp->maddr)) != 0) {
+		/*
+		 * If we failed because the file wasn't available, wait a
+		 * second and try again.
+		 */
+		if (ret == EAGAIN && ++retry_cnt < 3) {
+			(void)__db_sleep(1, 0);
+			goto retry;
+		}
+		return (ret);
+	}
+
+	/* Set up the common pointers. */
+	dbmp->mp = dbmp->maddr;
+	dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL);
+
+	/*
+	 * If not already locked, lock the region -- if it's a new region,
+	 * then either __db_rcreate() locked it for us or we malloc'd it
+	 * instead of creating a region, neither of which requires locking
+	 * here.
+	 */
+	if (!newregion)
+		LOCKREGION(dbmp);
+
+	/*
+	 * Get the hash table address; it's on the shared page, so we have
+	 * to lock first.
+	 */
+	dbmp->htab = ADDR(dbmp, dbmp->mp->htab);
+
+	dbmp->fd = fd;
+
+	/* If we locked the region, release it now. */
+	if (!F_ISSET(dbmp, MP_ISPRIVATE))
+		UNLOCKREGION(dbmp);
+	return (0);
+
+	/*
+	 * Error cleanup: a private pool has only malloc'd memory to
+	 * release (and no region file to unlink -- its path may be NULL);
+	 * a real region is closed and, if we created it, removed.
+	 */
+err:	if (F_ISSET(dbmp, MP_ISPRIVATE)) {
+		free(dbmp->maddr);
+		return (ret);
+	}
+	if (fd != -1) {
+		dbmp->fd = fd;
+		(void)__memp_rclose(dbmp);
+	}
+
+	if (newregion)
+		(void)memp_unlink(path, 1, dbmp->dbenv);
+	return (ret);
+}
+
+/*
+ * __memp_rclose --
+ *	Close the mpool region.
+ *
+ * PUBLIC: int __memp_rclose __P((DB_MPOOL *));
+ */
+int
+__memp_rclose(dbmp)
+	DB_MPOOL *dbmp;
+{
+	/* Private pools were malloc'd, not mapped -- just free them. */
+	if (F_ISSET(dbmp, MP_ISPRIVATE)) {
+		free(dbmp->maddr);
+		return (0);
+	}
+	return (__db_rclose(dbmp->dbenv, dbmp->fd, dbmp->maddr));
+}
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
new file mode 100644
index 0000000000..4f1205661a
--- /dev/null
+++ b/db2/mp/mp_sync.c
@@ -0,0 +1,205 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_sync.c	10.8 (Sleepycat) 7/2/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_sync --
+ *	Mpool sync function: write the dirty buffers required to bring the
+ *	pool up to the checkpoint LSN *lsnp.  Returns 0 when the checkpoint
+ *	is complete, DB_INCOMPLETE when pinned buffers still have to be
+ *	written, or a system error value.
+ */
+int
+memp_sync(dbmp, lsnp)
+	DB_MPOOL *dbmp;
+	DB_LSN *lsnp;
+{
+	BH *bhp;
+	DB_ENV *dbenv;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	int can_write, wrote, lsn_cnt, restart, ret;
+
+	dbenv = dbmp->dbenv;
+
+	/* Checkpoints are only meaningful when a log is being maintained. */
+	if (dbmp->dbenv->lg_info == NULL) {
+		__db_err(dbenv, "memp_sync requires logging");
+		return (EINVAL);
+	}
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * If the application is asking about a previous call, and we haven't
+	 * found any buffers that the application holding the pin couldn't
+	 * write, return yes or no based on the current count.  Note, if the
+	 * application is asking about a LSN *smaller* than one we've already
+	 * handled, then we return based on the count for that LSN.
+	 */
+	mp = dbmp->mp;
+	if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
+		if (mp->lsn_cnt == 0) {
+			*lsnp = mp->lsn;
+			ret = 0;
+		} else
+			ret = DB_INCOMPLETE;
+
+		UNLOCKREGION(dbmp);
+		return (ret);
+	}
+
+	/* Else, it's a new checkpoint. */
+	F_CLR(mp, MP_LSN_RETRY);
+
+	/*
+	 * Save the LSN.  We know that it's a new LSN or larger than the one
+	 * for which we were already doing a checkpoint.  (BTW, I don't expect
+	 * to see multiple LSN's from the same or multiple processes, but You
+	 * Just Never Know.  Responding as if they all called with the largest
+	 * of the LSNs specified makes everything work.)
+	 *
+	 * We don't currently use the LSN we save.  We could potentially save
+	 * the last-written LSN in each buffer header and use it to determine
+	 * what buffers need to be written.  The problem with this is that it's
+	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
+	 * dirty buffers instead.
+	 *
+	 * Walk the list of underlying files, clearing each one's count of
+	 * buffers waiting to be written.
+	 */
+	mp->lsn = *lsnp;
+	mp->lsn_cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		mfp->lsn_cnt = 0;
+
+	/*
+	 * Walk the list of buffers and mark all dirty buffers to be written
+	 * and all pinned buffers to be potentially written.  We do this in
+	 * one fell swoop while holding the region locked so that processes
+	 * can't make new buffers dirty, causing us to never finish.  Since
+	 * the application may have restarted the sync, clear any BH_WRITE
+	 * flags that appear to be left over.
+	 */
+	can_write = lsn_cnt = 0;
+	for (lsn_cnt = 0, bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+		if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
+			F_SET(bhp, BH_WRITE);
+
+			if (bhp->ref == 0)
+				can_write = 1;
+
+			/* Per-file and pool-wide counts of pending writes. */
+			mfp = ADDR(dbmp, bhp->mf_offset);
+			++mfp->lsn_cnt;
+
+			++lsn_cnt;
+		} else
+			F_CLR(bhp, BH_WRITE);
+
+	mp->lsn_cnt = lsn_cnt;
+
+	/* If there are no buffers we can write, we're done. */
+	if (!can_write) {
+		UNLOCKREGION(dbmp);
+		/*
+		 * NOTE(review): mp->lsn_cnt is read here after the region
+		 * lock has been released -- confirm this race is benign.
+		 */
+		return (mp->lsn_cnt ? DB_INCOMPLETE : 0);
+	}
+
+	/*
+	 * Write any buffers that we can.  Restart the walk after each write,
+	 * __memp_pgwrite() discards and reacquires the region lock during I/O.
+	 */
+retry:	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+		/* Ignore pinned or locked buffers. */
+		if (!F_ISSET(bhp, BH_WRITE) ||
+		    bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+			continue;
+
+		mfp = ADDR(dbmp, bhp->mf_offset);
+		if ((ret =
+		    __memp_bhwrite(dbmp, mfp, bhp, &restart, &wrote)) != 0)
+			goto err;
+		if (wrote) {
+			if (restart)
+				goto retry;
+			continue;
+		}
+		/* A buffer we marked for writing could not be flushed. */
+		__db_err(dbenv, "%s: unable to flush page: %lu",
+		    ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno);
+		ret = EPERM;
+		goto err;
+	}
+	ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+
+err:	UNLOCKREGION(dbmp);
+	return (ret);
+}
+
+/*
+ * memp_fsync --
+ *	Mpool file sync function: write to disk all dirty buffers belonging
+ *	to the file.  Returns 0 if every dirty buffer was written,
+ *	DB_INCOMPLETE if pinned or locked buffers remain, or an error value
+ *	if a write failed.
+ */
+int
+memp_fsync(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	size_t mf_offset;
+	int pincnt, restart, ret, wrote;
+
+	/* We don't sync temporary files -- what's the use? */
+	if (F_ISSET(dbmfp, MP_PATH_TEMP))
+		return (0);
+
+	dbmp = dbmfp->dbmp;
+	ret = 0;
+
+	/* Buffer headers reference their file by shared-region offset. */
+	mf_offset = OFFSET(dbmp, dbmfp->mfp);
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * Walk the list of buffer headers for the MPOOLFILE, and write out any
+	 * dirty buffers that we can.  Pinned or locked buffers are counted,
+	 * not written; if __memp_pgwrite() reports a restart, the region lock
+	 * was dropped during the I/O, so start the walk over.
+	 */
+retry:	pincnt = 0;
+	for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+		if (F_ISSET(bhp, BH_DIRTY) && bhp->mf_offset == mf_offset) {
+			if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
+				++pincnt;
+				continue;
+			}
+			if ((ret =
+			    __memp_pgwrite(dbmfp, bhp, &restart, &wrote)) != 0)
+				goto err;
+			if (!wrote)
+				++pincnt;
+			if (restart)
+				goto retry;
+		}
+
+	/*
+	 * Bug fix: release the region lock on the error path as well.  The
+	 * error label previously followed UNLOCKREGION, so a failed write
+	 * left the region permanently locked.
+	 */
+err:	UNLOCKREGION(dbmp);
+
+	return (ret == 0 ? (pincnt ? DB_INCOMPLETE : 0) : ret);
+}