about summary refs log tree commit diff
path: root/db2/common
diff options
context:
space:
mode:
Diffstat (limited to 'db2/common')
-rw-r--r--  db2/common/db_appinit.c    183
-rw-r--r--  db2/common/db_apprec.c      49
-rw-r--r--  db2/common/db_byteorder.c    4
-rw-r--r--  db2/common/db_err.c        137
-rw-r--r--  db2/common/db_log2.c         7
-rw-r--r--  db2/common/db_region.c    1131
-rw-r--r--  db2/common/db_salloc.c      41
-rw-r--r--  db2/common/db_shash.c       82
8 files changed, 998 insertions, 636 deletions
diff --git a/db2/common/db_appinit.c b/db2/common/db_appinit.c
index 4ee9e4f40c..6ec007be0a 100644
--- a/db2/common/db_appinit.c
+++ b/db2/common/db_appinit.c
@@ -1,23 +1,21 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_appinit.c	10.38 (Sleepycat) 1/7/98";
+static const char sccsid[] = "@(#)db_appinit.c	10.52 (Sleepycat) 6/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
-#include <sys/param.h>
-#include <sys/stat.h>
+#include <sys/types.h>
 
 #include <ctype.h>
 #include <errno.h>
-#include <fcntl.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
@@ -34,14 +32,14 @@ static const char sccsid[] = "@(#)db_appinit.c	10.38 (Sleepycat) 1/7/98";
 #include "clib_ext.h"
 #include "common_ext.h"
 
-static int __db_home __P((DB_ENV *, const char *, int));
+static int __db_home __P((DB_ENV *, const char *, u_int32_t));
 static int __db_parse __P((DB_ENV *, char *));
-static int __db_tmp_dir __P((DB_ENV *, int));
-static int __db_tmp_open __P((DB_ENV *, char *, int *));
+static int __db_tmp_dir __P((DB_ENV *, u_int32_t));
+static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, int *));
 
 /*
  * db_version --
- *	Return verision information.
+ *	Return version information.
  */
 char *
 db_version(majverp, minverp, patchp)
@@ -65,16 +63,18 @@ db_appinit(db_home, db_config, dbenv, flags)
 	const char *db_home;
 	char * const *db_config;
 	DB_ENV *dbenv;
-	int flags;
+	u_int32_t flags;
 {
 	FILE *fp;
-	int ret;
+	int mode, ret;
 	char * const *p;
 	char *lp, buf[MAXPATHLEN * 2];
 
 	/* Validate arguments. */
 	if (dbenv == NULL)
 		return (EINVAL);
+
+
 #ifdef HAVE_SPINLOCKS
 #define	OKFLAGS								\
    (DB_CREATE | DB_NOMMAP | DB_THREAD | DB_INIT_LOCK | DB_INIT_LOG |	\
@@ -89,10 +89,9 @@ db_appinit(db_home, db_config, dbenv, flags)
 	if ((ret = __db_fchk(dbenv, "db_appinit", flags, OKFLAGS)) != 0)
 		return (ret);
 
-#define	RECOVERY_FLAGS (DB_CREATE | DB_INIT_TXN | DB_INIT_LOG)
-	if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
-	    LF_ISSET(RECOVERY_FLAGS) != RECOVERY_FLAGS)
-		return (__db_ferr(dbenv, "db_appinit", 1));
+	/* Transactions imply logging. */
+	if (LF_ISSET(DB_INIT_TXN))
+		LF_SET(DB_INIT_LOG);
 
 	/* Convert the db_appinit(3) flags. */
 	if (LF_ISSET(DB_THREAD))
@@ -147,47 +146,48 @@ db_appinit(db_home, db_config, dbenv, flags)
 	F_SET(dbenv, DB_ENV_APPINIT);
 
 	/*
-	 * If we are doing recovery, remove all the regions.
+	 * If we are doing recovery, remove all the old shared memory
+	 * regions.
 	 */
 	if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
-		/* Remove all the old shared memory regions.  */
-		if ((ret = log_unlink(NULL, 1 /* force */, dbenv)) != 0)
+		if ((ret = log_unlink(NULL, 1, dbenv)) != 0)
 			goto err;
-		if ((ret = memp_unlink(NULL, 1 /* force */, dbenv)) != 0)
+		if ((ret = memp_unlink(NULL, 1, dbenv)) != 0)
 			goto err;
-		if ((ret = lock_unlink(NULL, 1 /* force */, dbenv)) != 0)
+		if ((ret = lock_unlink(NULL, 1, dbenv)) != 0)
 			goto err;
-		if ((ret = txn_unlink(NULL, 1 /* force */, dbenv)) != 0)
+		if ((ret = txn_unlink(NULL, 1, dbenv)) != 0)
 			goto err;
 	}
 
-	/* Transactions imply logging. */
-	if (LF_ISSET(DB_INIT_TXN))
-		LF_SET(DB_INIT_LOG);
-
-	/* Default permissions are 0660. */
-#undef	DB_DEFPERM
-#define	DB_DEFPERM	(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)
-
-	/* Initialize the subsystems. */
+	/*
+	 * Create the new shared regions.
+	 *
+	 * Default permissions are read-write for both owner and group.
+	 */
+	mode = __db_omode("rwrw--");
 	if (LF_ISSET(DB_INIT_LOCK) && (ret = lock_open(NULL,
 	    LF_ISSET(DB_CREATE | DB_THREAD),
-	    DB_DEFPERM, dbenv, &dbenv->lk_info)) != 0)
+	    mode, dbenv, &dbenv->lk_info)) != 0)
 		goto err;
 	if (LF_ISSET(DB_INIT_LOG) && (ret = log_open(NULL,
 	    LF_ISSET(DB_CREATE | DB_THREAD),
-	    DB_DEFPERM, dbenv, &dbenv->lg_info)) != 0)
+	    mode, dbenv, &dbenv->lg_info)) != 0)
 		goto err;
 	if (LF_ISSET(DB_INIT_MPOOL) && (ret = memp_open(NULL,
 	    LF_ISSET(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD),
-	    DB_DEFPERM, dbenv, &dbenv->mp_info)) != 0)
+	    mode, dbenv, &dbenv->mp_info)) != 0)
 		goto err;
 	if (LF_ISSET(DB_INIT_TXN) && (ret = txn_open(NULL,
 	    LF_ISSET(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC),
-	    DB_DEFPERM, dbenv, &dbenv->tx_info)) != 0)
+	    mode, dbenv, &dbenv->tx_info)) != 0)
 		goto err;
 
-	/* Initialize recovery. */
+	/*
+	 * If the application is running with transactions, initialize the
+	 * function tables.  Once that's done, do recovery for any previous
+	 * run.
+	 */
 	if (LF_ISSET(DB_INIT_TXN)) {
 		if ((ret = __bam_init_recover(dbenv)) != 0)
 			goto err;
@@ -199,12 +199,12 @@ db_appinit(db_home, db_config, dbenv, flags)
 			goto err;
 		if ((ret = __txn_init_recover(dbenv)) != 0)
 			goto err;
-	}
 
-	/* Run recovery if necessary. */
-	if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && (ret =
-	    __db_apprec(dbenv, LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0)
-		goto err;
+		if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+		    (ret = __db_apprec(dbenv,
+		    LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0)
+			goto err;
+	}
 
 	return (ret);
 
@@ -282,21 +282,21 @@ db_appexit(dbenv)
  *	it in allocated space.
  *
  * PUBLIC: int __db_appname __P((DB_ENV *,
- * PUBLIC:    APPNAME, const char *, const char *, int *, char **));
+ * PUBLIC:    APPNAME, const char *, const char *, u_int32_t, int *, char **));
  */
 int
-__db_appname(dbenv, appname, dir, file, fdp, namep)
+__db_appname(dbenv, appname, dir, file, tmp_oflags, fdp, namep)
 	DB_ENV *dbenv;
 	APPNAME appname;
 	const char *dir, *file;
+	u_int32_t tmp_oflags;
 	int *fdp;
 	char **namep;
 {
 	DB_ENV etmp;
 	size_t len;
-	int ret, slash, tmp_create, tmp_free;
+	int data_entry, ret, slash, tmp_create, tmp_free;
 	const char *a, *b, *c;
-	int data_entry;
 	char *p, *start;
 
 	a = b = c = NULL;
@@ -349,8 +349,8 @@ __db_appname(dbenv, appname, dir, file, fdp, namep)
 	 *
 	 * DB_ENV	   APPNAME	   RESULT
 	 * -------------------------------------------
-	 * null		   DB_APP_TMP	   <tmp>/<create>
-	 * set		   DB_APP_TMP	   DB_HOME/DB_TMP_DIR/<create>
+	 * null		   DB_APP_TMP*	   <tmp>/<create>
+	 * set		   DB_APP_TMP*	   DB_HOME/DB_TMP_DIR/<create>
 	 */
 retry:	switch (appname) {
 	case DB_APP_NONE:
@@ -431,7 +431,14 @@ done:	len =
 	    (c == NULL ? 0 : strlen(c) + 1) +
 	    (file == NULL ? 0 : strlen(file) + 1);
 
-	if ((start = (char *)__db_malloc(len)) == NULL) {
+	/*
+	 * Allocate space to hold the current path information, as well as any
+	 * temporary space that we're going to need to create a temporary file
+	 * name.
+	 */
+#define	DB_TRAIL	"XXXXXX"
+	if ((start =
+	    (char *)__db_malloc(len + sizeof(DB_TRAIL) + 10)) == NULL) {
 		__db_err(dbenv, "%s", strerror(ENOMEM));
 		if (tmp_free)
 			FREES(etmp.db_tmp_dir);
@@ -460,14 +467,15 @@ done:	len =
 		FREES(etmp.db_tmp_dir);
 
 	/* Create the file if so requested. */
-	if (tmp_create) {
-		ret = __db_tmp_open(dbenv, start, fdp);
+	if (tmp_create &&
+	    (ret = __db_tmp_open(dbenv, tmp_oflags, start, fdp)) != 0) {
 		FREES(start);
-	} else {
-		*namep = start;
-		ret = 0;
+		return (ret);
 	}
-	return (ret);
+
+	if (namep != NULL)
+		*namep = start;
+	return (0);
 }
 
 /*
@@ -478,7 +486,7 @@ static int
 __db_home(dbenv, db_home, flags)
 	DB_ENV *dbenv;
 	const char *db_home;
-	int flags;
+	u_int32_t flags;
 {
 	const char *p;
 
@@ -532,10 +540,12 @@ __db_parse(dbenv, s)
 		return (ENOMEM);
 
 	tp = local_s;
-	while ((name = strsep(&tp, " \t")) != NULL && *name == '\0');
+	while ((name = strsep(&tp, " \t")) != NULL && *name == '\0')
+		;
 	if (name == NULL)
 		goto illegal;
-	while ((value = strsep(&tp, " \t")) != NULL && *value == '\0');
+	while ((value = strsep(&tp, " \t")) != NULL && *value == '\0')
+		;
 	if (value == NULL) {
 illegal:	ret = EINVAL;
 		__db_err(dbenv, "illegal name-value pair: %s", s);
@@ -591,7 +601,7 @@ static char *sTempFolder;
 static int
 __db_tmp_dir(dbenv, flags)
 	DB_ENV *dbenv;
-	int flags;
+	u_int32_t flags;
 {
 	static const char * list[] = {	/* Ordered: see db_appinit(3). */
 		"/var/tmp",
@@ -671,49 +681,45 @@ __db_tmp_dir(dbenv, flags)
  *	Create a temporary file.
  */
 static int
-__db_tmp_open(dbenv, dir, fdp)
+__db_tmp_open(dbenv, flags, path, fdp)
 	DB_ENV *dbenv;
-	char *dir;
+	u_int32_t flags;
+	char *path;
 	int *fdp;
 {
 #ifdef HAVE_SIGFILLSET
 	sigset_t set, oset;
 #endif
 	u_long pid;
-	size_t len;
-	int isdir, ret;
-	char *trv, buf[MAXPATHLEN];
+	int mode, isdir, ret;
+	const char *p;
+	char *trv;
 
 	/*
 	 * Check the target directory; if you have six X's and it doesn't
 	 * exist, this runs for a *very* long time.
 	 */
-	if ((ret = __db_exists(dir, &isdir)) != 0) {
-		__db_err(dbenv, "%s: %s", dir, strerror(ret));
+	if ((ret = __db_exists(path, &isdir)) != 0) {
+		__db_err(dbenv, "%s: %s", path, strerror(ret));
 		return (ret);
 	}
 	if (!isdir) {
-		__db_err(dbenv, "%s: %s", dir, strerror(EINVAL));
+		__db_err(dbenv, "%s: %s", path, strerror(EINVAL));
 		return (EINVAL);
 	}
 
 	/* Build the path. */
-#define	DB_TRAIL	"/XXXXXX"
-	if ((len = strlen(dir)) + sizeof(DB_TRAIL) > sizeof(buf)) {
-		__db_err(dbenv,
-		    "tmp_open: %s: %s", buf, strerror(ENAMETOOLONG));
-		return (ENAMETOOLONG);
-	}
-	(void)strcpy(buf, dir);
-	(void)strcpy(buf + len, DB_TRAIL);
-	buf[len] = PATH_SEPARATOR[0];			/* WIN32 */
+	for (trv = path; *trv != '\0'; ++trv)
+		;
+	*trv = PATH_SEPARATOR[0];
+	for (p = DB_TRAIL; (*++trv = *p) != '\0'; ++p)
+		;
 
 	/*
 	 * Replace the X's with the process ID.  Pid should be a pid_t,
 	 * but we use unsigned long for portability.
 	 */
-	for (pid = getpid(),
-	    trv = buf + len + sizeof(DB_TRAIL) - 1; *--trv == 'X'; pid /= 10)
+	for (pid = getpid(); *--trv == 'X'; pid /= 10)
 		switch (pid % 10) {
 		case 0: *trv = '0'; break;
 		case 1: *trv = '1'; break;
@@ -728,30 +734,33 @@ __db_tmp_open(dbenv, dir, fdp)
 		}
 	++trv;
 
+	/* Set up open flags and mode. */
+	LF_SET(DB_CREATE | DB_EXCL);
+	mode = __db_omode("rw----");
+
 	/*
-	 * Try and open a file.  We block every signal we can get our hands
+	 * Try to open a file.  We block every signal we can get our hands
 	 * on so that, if we're interrupted at the wrong time, the temporary
 	 * file isn't left around -- of course, if we drop core in-between
 	 * the calls we'll hang forever, but that's probably okay.  ;-}
 	 */
 #ifdef HAVE_SIGFILLSET
-	(void)sigfillset(&set);
+	if (LF_ISSET(DB_TEMPORARY))
+		(void)sigfillset(&set);
 #endif
 	for (;;) {
 #ifdef HAVE_SIGFILLSET
-		(void)sigprocmask(SIG_BLOCK, &set, &oset);
+		if (LF_ISSET(DB_TEMPORARY))
+			(void)sigprocmask(SIG_BLOCK, &set, &oset);
 #endif
-#define	DB_TEMPOPEN	DB_CREATE | DB_EXCL | DB_TEMPORARY
-		if ((ret = __db_open(buf,
-		    DB_TEMPOPEN, DB_TEMPOPEN, S_IRUSR | S_IWUSR, fdp)) == 0) {
+		ret = __db_open(path, flags, flags, mode, fdp);
 #ifdef HAVE_SIGFILLSET
+		if (LF_ISSET(DB_TEMPORARY))
 			(void)sigprocmask(SIG_SETMASK, &oset, NULL);
 #endif
+		if (ret == 0)
 			return (0);
-		}
-#ifdef HAVE_SIGFILLSET
-		(void)sigprocmask(SIG_SETMASK, &oset, NULL);
-#endif
+
 		/*
 		 * XXX:
 		 * If we don't get an EEXIST error, then there's something
@@ -761,7 +770,7 @@ __db_tmp_open(dbenv, dir, fdp)
 		 */
 		if (ret != EEXIST) {
 			__db_err(dbenv,
-			    "tmp_open: %s: %s", buf, strerror(ret));
+			    "tmp_open: %s: %s", path, strerror(ret));
 			return (ret);
 		}
 
diff --git a/db2/common/db_apprec.c b/db2/common/db_apprec.c
index 7a42e13317..df707eafef 100644
--- a/db2/common/db_apprec.c
+++ b/db2/common/db_apprec.c
@@ -1,7 +1,7 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
@@ -9,18 +9,17 @@
 
 #ifndef lint
 static const char copyright[] =
-"@(#) Copyright (c) 1997\n\
+"@(#) Copyright (c) 1996, 1997, 1998\n\
 	Sleepycat Software Inc.  All rights reserved.\n";
-static const char sccsid[] = "@(#)db_apprec.c	10.23 (Sleepycat) 1/17/98";
+static const char sccsid[] = "@(#)db_apprec.c	10.30 (Sleepycat) 5/3/98";
 #endif
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
-#include <time.h>
 #include <string.h>
-#include <stdlib.h>
+#include <time.h>
 #endif
 
 #include "db_int.h"
@@ -36,18 +35,19 @@ static const char sccsid[] = "@(#)db_apprec.c	10.23 (Sleepycat) 1/17/98";
  * __db_apprec --
  *	Perform recovery.
  *
- * PUBLIC: int __db_apprec __P((DB_ENV *, int));
+ * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t));
  */
 int
 __db_apprec(dbenv, flags)
 	DB_ENV *dbenv;
-	int flags;
+	u_int32_t flags;
 {
 	DBT data;
 	DB_LOG *lp;
 	DB_LSN ckp_lsn, first_lsn, lsn;
 	time_t now;
-	int is_thread, ret;
+	u_int32_t is_thread;
+	int ret;
 	void *txninfo;
 
 	lp = dbenv->lg_info;
@@ -91,14 +91,14 @@ __db_apprec(dbenv, flags)
 	if ((ret = log_get(lp, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) {
 		/*
 		 * If we don't find a checkpoint, start from the beginning.
-		 * If that fails, we're done.  Note, we require that there
-		 * be log records if we're performing recovery, and fail if
-		 * there aren't.
+		 * If that fails, we're done.  Note, we do not require that
+		 * there be log records if we're performing recovery.
 		 */
 		if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) {
-			__db_err(dbenv, "First log record not found");
 			if (ret == DB_NOTFOUND)
-				ret = EINVAL;
+				ret = 0;
+			else
+				__db_err(dbenv, "First log record not found");
 			goto out;
 		}
 	}
@@ -134,14 +134,17 @@ __db_apprec(dbenv, flags)
 	} else
 		if ((ret = __log_findckp(lp, &first_lsn)) == DB_NOTFOUND) {
 			/*
-			 * If recovery was specified, there must be log files.
-			 * If we don't find one, it's an error.  (This should
-			 * have been caught above, when a log_get() of DB_FIRST
-			 * or DB_CHECKPOINT succeeded, but paranoia is good.)
+			 * We don't require that log files exist if recovery
+			 * was specified.
 			 */
-			ret = EINVAL;
+			ret = 0;
 			goto out;
 		}
+
+	if (dbenv->db_verbose)
+		__db_err(lp->dbenv, "Recovery starting from [%lu][%lu]",
+		    (u_long)first_lsn.file, (u_long)first_lsn.offset);
+
 	for (ret = log_get(lp, &lsn, &data, DB_LAST);
 	    ret == 0 && log_compare(&lsn, &first_lsn) > 0;
 	    ret = log_get(lp, &lsn, &data, DB_PREV)) {
@@ -175,21 +178,21 @@ __db_apprec(dbenv, flags)
 	__log_close_files(lp);
 
 	/*
-	 * Now set the maximum transaction id, set the last checkpoint lsn,
-	 * and the current time.  Then take a checkpoint.
+	 * Now set the last checkpoint lsn and the current time,
+	 * take a checkpoint, and reset the txnid.
 	 */
 	(void)time(&now);
-	dbenv->tx_info->region->last_txnid = ((__db_txnhead *)txninfo)->maxid;
 	dbenv->tx_info->region->last_ckp = ckp_lsn;
 	dbenv->tx_info->region->time_ckp = (u_int32_t)now;
 	if ((ret = txn_checkpoint(dbenv->tx_info, 0, 0)) != 0)
 		goto out;
+	dbenv->tx_info->region->last_txnid = TXN_MINIMUM;
 
 	if (dbenv->db_verbose) {
 		__db_err(lp->dbenv, "Recovery complete at %.24s", ctime(&now));
-		__db_err(lp->dbenv, "%s %lu %s [%lu][%lu]",
+		__db_err(lp->dbenv, "%s %lx %s [%lu][%lu]",
 		    "Maximum transaction id",
-		    (u_long)dbenv->tx_info->region->last_txnid,
+		    ((DB_TXNHEAD *)txninfo)->maxid,
 		    "Recovery checkpoint",
 		    (u_long)dbenv->tx_info->region->last_ckp.file,
 		    (u_long)dbenv->tx_info->region->last_ckp.offset);
diff --git a/db2/common/db_byteorder.c b/db2/common/db_byteorder.c
index e486132073..cadf742851 100644
--- a/db2/common/db_byteorder.c
+++ b/db2/common/db_byteorder.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_byteorder.c	10.4 (Sleepycat) 9/4/97";
+static const char sccsid[] = "@(#)db_byteorder.c	10.5 (Sleepycat) 4/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
diff --git a/db2/common/db_err.c b/db2/common/db_err.c
index fc59aadbaf..98a414279e 100644
--- a/db2/common/db_err.c
+++ b/db2/common/db_err.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_err.c	10.21 (Sleepycat) 1/13/98";
+static const char sccsid[] = "@(#)db_err.c	10.25 (Sleepycat) 5/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -26,6 +26,7 @@ static const char sccsid[] = "@(#)db_err.c	10.21 (Sleepycat) 1/13/98";
 #include "db_int.h"
 #include "common_ext.h"
 
+static int __db_keyempty __P((const DB_ENV *));
 static int __db_rdonly __P((const DB_ENV *, const char *));
 
 /*
@@ -81,11 +82,11 @@ __db_err(dbenv, fmt, va_alist)
  * appears before the assignment in the __db__panic() call.
  */
 static int __db_ecursor __P((DB *, DB_TXN *, DBC **));
-static int __db_edel __P((DB *, DB_TXN *, DBT *, int));
+static int __db_edel __P((DB *, DB_TXN *, DBT *, u_int32_t));
 static int __db_efd __P((DB *, int *));
-static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, int));
-static int __db_estat __P((DB *, void *, void *(*)(size_t), int));
-static int __db_esync __P((DB *, int));
+static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __db_estat __P((DB *, void *, void *(*)(size_t), u_int32_t));
+static int __db_esync __P((DB *, u_int32_t));
 
 /*
  * __db_ecursor --
@@ -113,7 +114,7 @@ __db_edel(a, b, c, d)
 	DB *a;
 	DB_TXN *b;
 	DBT *c;
-	int d;
+	u_int32_t d;
 {
 	COMPQUIET(a, NULL);
 	COMPQUIET(b, NULL);
@@ -147,7 +148,7 @@ __db_egp(a, b, c, d, e)
 	DB *a;
 	DB_TXN *b;
 	DBT *c, *d;
-	int e;
+	u_int32_t e;
 {
 	COMPQUIET(a, NULL);
 	COMPQUIET(b, NULL);
@@ -167,7 +168,7 @@ __db_estat(a, b, c, d)
 	DB *a;
 	void *b;
 	void *(*c) __P((size_t));
-	int d;
+	u_int32_t d;
 {
 	COMPQUIET(a, NULL);
 	COMPQUIET(b, NULL);
@@ -184,7 +185,7 @@ __db_estat(a, b, c, d)
 static int
 __db_esync(a, b)
 	DB *a;
-	int b;
+	u_int32_t b;
 {
 	COMPQUIET(a, NULL);
 	COMPQUIET(b, 0);
@@ -208,6 +209,10 @@ __db_panic(dbp)
 	 *
 	 * We should call mpool and have it shut down the file, so we get
 	 * other processes sharing this file as well.
+	 *
+	 *	Chaos reigns within.
+	 *	Reflect, repent, and reboot.
+	 *	Order shall return.
 	 */
 	dbp->cursor = __db_ecursor;
 	dbp->del = __db_edel;
@@ -235,13 +240,13 @@ __db_panic(dbp)
  * __db_fchk --
  *	General flags checking routine.
  *
- * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, int, int));
+ * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t));
  */
 int
 __db_fchk(dbenv, name, flags, ok_flags)
 	DB_ENV *dbenv;
 	const char *name;
-	int flags, ok_flags;
+	u_int32_t flags, ok_flags;
 {
 	DB_CHECK_FLAGS(dbenv, name, flags, ok_flags);
 	return (0);
@@ -251,13 +256,14 @@ __db_fchk(dbenv, name, flags, ok_flags)
  * __db_fcchk --
  *	General combination flags checking routine.
  *
- * PUBLIC: int __db_fcchk __P((DB_ENV *, const char *, int, int, int));
+ * PUBLIC: int __db_fcchk
+ * PUBLIC:    __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
  */
 int
 __db_fcchk(dbenv, name, flags, flag1, flag2)
 	DB_ENV *dbenv;
 	const char *name;
-	int flags, flag1, flag2;
+	u_int32_t flags, flag1, flag2;
 {
 	DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2);
 	return (0);
@@ -267,12 +273,13 @@ __db_fcchk(dbenv, name, flags, flag1, flag2)
  * __db_cdelchk --
  *	Common cursor delete argument checking routine.
  *
- * PUBLIC: int __db_cdelchk __P((const DB *, int, int, int));
+ * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int));
  */
 int
 __db_cdelchk(dbp, flags, isrdonly, isvalid)
 	const DB *dbp;
-	int flags, isrdonly, isvalid;
+	u_int32_t flags;
+	int isrdonly, isvalid;
 {
 	/* Check for changes to a read-only tree. */
 	if (isrdonly)
@@ -292,17 +299,18 @@ __db_cdelchk(dbp, flags, isrdonly, isvalid)
  * __db_cgetchk --
  *	Common cursor get argument checking routine.
  *
- * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, int, int));
+ * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int));
  */
 int
 __db_cgetchk(dbp, key, data, flags, isvalid)
 	const DB *dbp;
 	DBT *key, *data;
-	int flags, isvalid;
+	u_int32_t flags;
+	int isvalid;
 {
-	int check_key;
+	int key_einval, key_flags;
 
-	check_key = 0;
+	key_flags = key_einval = 0;
 
 	/* Check for invalid dbc->c_get() function flags. */
 	switch (flags) {
@@ -311,10 +319,13 @@ __db_cgetchk(dbp, key, data, flags, isvalid)
 	case DB_LAST:
 	case DB_NEXT:
 	case DB_PREV:
+		key_flags = 1;
+		break;
 	case DB_SET_RANGE:
-		check_key = 1;
+		key_einval = key_flags = 1;
 		break;
 	case DB_SET:
+		key_einval = 1;
 		break;
 	case DB_GET_RECNO:
 		if (!F_ISSET(dbp, DB_BT_RECNUM))
@@ -323,14 +334,14 @@ __db_cgetchk(dbp, key, data, flags, isvalid)
 	case DB_SET_RECNO:
 		if (!F_ISSET(dbp, DB_BT_RECNUM))
 			goto err;
-		check_key = 1;
+		key_einval = key_flags = 1;
 		break;
 	default:
 err:		return (__db_ferr(dbp->dbenv, "c_get", 0));
 	}
 
 	/* Check for invalid key/data flags. */
-	if (check_key)
+	if (key_flags)
 		DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags,
 		    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
 	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
@@ -340,11 +351,15 @@ err:		return (__db_ferr(dbp->dbenv, "c_get", 0));
 	if (F_ISSET(dbp, DB_AM_THREAD)) {
 		if (!F_ISSET(data, DB_DBT_USERMEM | DB_DBT_MALLOC))
 			return (__db_ferr(dbp->dbenv, "threaded data", 1));
-		if (check_key &&
+		if (key_flags &&
 		    !F_ISSET(key, DB_DBT_USERMEM | DB_DBT_MALLOC))
 			return (__db_ferr(dbp->dbenv, "threaded key", 1));
 	}
 
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
 	/*
 	 * The cursor must be initialized for DB_CURRENT, return -1 for an
 	 * invalid cursor, otherwise 0.
@@ -357,23 +372,24 @@ err:		return (__db_ferr(dbp->dbenv, "c_get", 0));
  *	Common cursor put argument checking routine.
  *
  * PUBLIC: int __db_cputchk __P((const DB *,
- * PUBLIC:    const DBT *, DBT *, int, int, int));
+ * PUBLIC:    const DBT *, DBT *, u_int32_t, int, int));
  */
 int
 __db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
 	const DB *dbp;
 	const DBT *key;
 	DBT *data;
-	int flags, isrdonly, isvalid;
+	u_int32_t flags;
+	int isrdonly, isvalid;
 {
-	int check_key;
+	int key_einval, key_flags;
 
 	/* Check for changes to a read-only tree. */
 	if (isrdonly)
 		return (__db_rdonly(dbp->dbenv, "c_put"));
 
 	/* Check for invalid dbc->c_put() function flags. */
-	check_key = 0;
+	key_einval = key_flags = 0;
 	switch (flags) {
 	case DB_AFTER:
 	case DB_BEFORE:
@@ -388,19 +404,23 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
 	case DB_KEYLAST:
 		if (dbp->type == DB_RECNO)
 			goto err;
-		check_key = 1;
+		key_einval = key_flags = 1;
 		break;
 	default:
 err:		return (__db_ferr(dbp->dbenv, "c_put", 0));
 	}
 
 	/* Check for invalid key/data flags. */
-	if (check_key)
+	if (key_flags)
 		DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags,
 		    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
 	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
 	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
 
+	/* Check for missing keys. */
+	if (key_einval && (key->data == NULL || key->size == 0))
+		return (__db_keyempty(dbp->dbenv));
+
 	/*
 	 * The cursor must be initialized for anything other than DB_KEYFIRST
 	 * and DB_KEYLAST, return -1 for an invalid cursor, otherwise 0.
@@ -413,12 +433,14 @@ err:		return (__db_ferr(dbp->dbenv, "c_put", 0));
  * __db_delchk --
  *	Common delete argument checking routine.
  *
- * PUBLIC: int __db_delchk __P((const DB *, int, int));
+ * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
  */
 int
-__db_delchk(dbp, flags, isrdonly)
+__db_delchk(dbp, key, flags, isrdonly)
 	const DB *dbp;
-	int flags, isrdonly;
+	DBT *key;
+	u_int32_t flags;
+	int isrdonly;
 {
 	/* Check for changes to a read-only tree. */
 	if (isrdonly)
@@ -427,6 +449,10 @@ __db_delchk(dbp, flags, isrdonly)
 	/* Check for invalid db->del() function flags. */
 	DB_CHECK_FLAGS(dbp->dbenv, "delete", flags, 0);
 
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
 	return (0);
 }
 
@@ -434,14 +460,14 @@ __db_delchk(dbp, flags, isrdonly)
  * __db_getchk --
  *	Common get argument checking routine.
  *
- * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, int));
+ * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t));
  */
 int
 __db_getchk(dbp, key, data, flags)
 	const DB *dbp;
 	const DBT *key;
 	DBT *data;
-	int flags;
+	u_int32_t flags;
 {
 	/* Check for invalid db->get() function flags. */
 	DB_CHECK_FLAGS(dbp->dbenv,
@@ -457,6 +483,10 @@ __db_getchk(dbp, key, data, flags)
 	    !F_ISSET(data, DB_DBT_MALLOC | DB_DBT_USERMEM))
 		return (__db_ferr(dbp->dbenv, "threaded data", 1));
 
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
 	return (0);
 }
 
@@ -464,14 +494,16 @@ __db_getchk(dbp, key, data, flags)
  * __db_putchk --
  *	Common put argument checking routine.
  *
- * PUBLIC: int __db_putchk __P((const DB *, DBT *, const DBT *, int, int, int));
+ * PUBLIC: int __db_putchk
+ * PUBLIC:    __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
  */
 int
 __db_putchk(dbp, key, data, flags, isrdonly, isdup)
 	const DB *dbp;
 	DBT *key;
 	const DBT *data;
-	int flags, isrdonly, isdup;
+	u_int32_t flags;
+	int isrdonly, isdup;
 {
 	/* Check for changes to a read-only tree. */
 	if (isrdonly)
@@ -488,12 +520,17 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup)
 	DB_CHECK_FCOMBO(dbp->dbenv,
 	    "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM);
 
+	/* Check for missing keys. */
+	if (key->data == NULL || key->size == 0)
+		return (__db_keyempty(dbp->dbenv));
+
 	/* Check for partial puts in the presence of duplicates. */
 	if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) {
 		__db_err(dbp->dbenv,
 "a partial put in the presence of duplicates requires a cursor operation");
 		return (EINVAL);
 	}
+
 	return (0);
 }
 
@@ -501,12 +538,12 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup)
  * __db_statchk --
  *	Common stat argument checking routine.
  *
- * PUBLIC: int __db_statchk __P((const DB *, int));
+ * PUBLIC: int __db_statchk __P((const DB *, u_int32_t));
  */
 int
 __db_statchk(dbp, flags)
 	const DB *dbp;
-	int flags;
+	u_int32_t flags;
 {
 	/* Check for invalid db->stat() function flags. */
 	DB_CHECK_FLAGS(dbp->dbenv, "stat", flags, DB_RECORDCOUNT);
@@ -522,12 +559,12 @@ __db_statchk(dbp, flags)
  * __db_syncchk --
  *	Common sync argument checking routine.
  *
- * PUBLIC: int __db_syncchk __P((const DB *, int));
+ * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t));
  */
 int
 __db_syncchk(dbp, flags)
 	const DB *dbp;
-	int flags;
+	u_int32_t flags;
 {
 	/* Check for invalid db->sync() function flags. */
 	DB_CHECK_FLAGS(dbp->dbenv, "sync", flags, 0);
@@ -542,13 +579,13 @@ __db_syncchk(dbp, flags)
  * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int));
  */
 int
-__db_ferr(dbenv, name, combo)
+__db_ferr(dbenv, name, iscombo)
 	const DB_ENV *dbenv;
 	const char *name;
-	int combo;
+	int iscombo;
 {
 	__db_err(dbenv, "illegal flag %sspecified to %s",
-	    combo ? "combination " : "", name);
+	    iscombo ? "combination " : "", name);
 	return (EINVAL);
 }
 
@@ -564,3 +601,15 @@ __db_rdonly(dbenv, name)
 	__db_err(dbenv, "%s: attempt to modify a read-only tree", name);
 	return (EACCES);
 }
+
+/*
+ * __db_keyempty --
+ *	Common missing or empty key value message.
+ */
+static int
+__db_keyempty(dbenv)
+	const DB_ENV *dbenv;
+{
+	__db_err(dbenv, "missing or empty key value specified");
+	return (EINVAL);
+}
diff --git a/db2/common/db_log2.c b/db2/common/db_log2.c
index 9af01116f6..d6b14f540b 100644
--- a/db2/common/db_log2.c
+++ b/db2/common/db_log2.c
@@ -1,7 +1,7 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 /*
@@ -43,7 +43,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_log2.c	10.3 (Sleepycat) 6/21/97";
+static const char sccsid[] = "@(#)db_log2.c	10.5 (Sleepycat) 4/26/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -63,6 +63,7 @@ __db_log2(num)
 	u_int32_t i, limit;
 
 	limit = 1;
-	for (i = 0; limit < num; limit = limit << 1, i++);
+	for (i = 0; limit < num; limit = limit << 1, i++)
+		;
 	return (i);
 }
diff --git a/db2/common/db_region.c b/db2/common/db_region.c
index 02d939e3e6..6d15f7f092 100644
--- a/db2/common/db_region.c
+++ b/db2/common/db_region.c
@@ -1,59 +1,20 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
-/*
- * Copyright (c) 1995, 1996
- *	The President and Fellows of Harvard University.  All rights reserved.
- *
- * This code is derived from software contributed to Harvard by
- * Margo Seltzer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_region.c	10.21 (Sleepycat) 1/16/98";
+static const char sccsid[] = "@(#)db_region.c	10.46 (Sleepycat) 5/26/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
-#include <sys/stat.h>
 
 #include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #endif
@@ -61,548 +22,840 @@ static const char sccsid[] = "@(#)db_region.c	10.21 (Sleepycat) 1/16/98";
 #include "db_int.h"
 #include "common_ext.h"
 
-static int __db_rmap __P((DB_ENV *, int, size_t, void *));
+static int __db_growregion __P((REGINFO *, size_t));
 
 /*
- * __db_rcreate --
- *
- * Common interface for creating a shared region.  Handles synchronization
- * across multiple processes.
- *
- * The dbenv contains the environment for this process, including naming
- * information.  The path argument represents the parameters passed to
- * the open routines and may be either a file or a directory.  If it is
- * a directory, it must exist.  If it is a file, then the file parameter
- * must be NULL, otherwise, file is the name to be created inside the
- * directory path.
- *
- * The function returns a pointer to the shared region that has been mapped
- * into memory, NULL on error.
+ * __db_rattach --
+ *	Optionally create and attach to a shared memory region.
  *
- * PUBLIC: int __db_rcreate __P((DB_ENV *, APPNAME,
- * PUBLIC:    const char *, const char *, int, size_t, int, int *, void *));
+ * PUBLIC: int __db_rattach __P((REGINFO *));
  */
 int
-__db_rcreate(dbenv, appname, path, file, mode, size, oflags, fdp, retp)
-	DB_ENV *dbenv;
-	APPNAME appname;
-	const char *path, *file;
-	int mode, oflags, *fdp;
-	size_t size;
-	void *retp;
+__db_rattach(infop)
+	REGINFO *infop;
 {
-	RLAYOUT *rp;
-	int fd, ret;
-	char *name;
+	RLAYOUT *rlp, rl;
+	size_t grow_region, size;
+	ssize_t nr, nw;
+	u_int32_t flags, mbytes, bytes;
+	u_int8_t *p;
+	int malloc_possible, ret, retry_cnt;
+
+	grow_region = 0;
+	malloc_possible = 1;
+	ret = retry_cnt = 0;
+
+	/* Round off the requested size to the next page boundary. */
+	DB_ROUNDOFF(infop->size);
+
+	/* Some architectures have hard limits on the maximum region size. */
+#ifdef DB_REGIONSIZE_MAX
+	if (infop->size > DB_REGIONSIZE_MAX) {
+		__db_err(infop->dbenv, "__db_rattach: cache size too large");
+		return (EINVAL);
+	}
+#endif
 
-	fd = -1;
-	rp = NULL;
+	/* Initialize the return information in the REGINFO structure. */
+loop:	infop->addr = NULL;
+	infop->fd = -1;
+	infop->segid = INVALID_SEGID;
+	if (infop->name != NULL) {
+		FREES(infop->name);
+		infop->name = NULL;
+	}
+	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
 
+#ifndef HAVE_SPINLOCKS
 	/*
-	 * Get the filename -- note, if it's a temporary file, it will
-	 * be created by the underlying temporary file creation code,
-	 * so we have to check the file descriptor to be sure it's an
-	 * error.
+	 * XXX
+	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
+	 * locking, which implies using mmap(2) to map in a regular file.
+	 * (Theoretically, we could probably get a file descriptor to lock
+	 * other types of shared regions, but I don't see any reason to
+	 * bother.)
 	 */
-	if ((ret = __db_appname(dbenv, appname, path, file, &fd, &name)) != 0)
-		return (ret);
+	malloc_possible = 0;
+#endif
 
+#ifdef __hppa
 	/*
-	 * Now open the file. We need to make sure that multiple processes
-	 * that attempt to create the region at the same time are properly
-	 * ordered, so we open it DB_EXCL and DB_CREATE so two simultaneous
-	 * attempts to create the region will return failure in one of the
-	 * attempts.
+	 * XXX
+	 * HP-UX won't permit mutexes to live in anything but shared memory.
+	 * Instantiate a shared region file on that architecture, regardless.
 	 */
-	oflags |= DB_CREATE | DB_EXCL;
-	if (fd == -1 &&
-	    (ret = __db_open(name, oflags, oflags, mode, &fd)) != 0) {
-		if (ret != EEXIST)
-			__db_err(dbenv,
-			    "region create: %s: %s", name, strerror(ret));
-		goto err;
+	malloc_possible = 0;
+#endif
+	/*
+	 * If a region is truly private, malloc the memory.  That's faster
+	 * than either anonymous memory or a shared file.
+	 */
+	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
+		if ((infop->addr = __db_malloc(infop->size)) == NULL)
+			return (ENOMEM);
+
+		/*
+		 * It's sometimes significantly faster to page-fault in all
+		 * of the region's pages before we run the application, as
+		 * we can see fairly nasty side-effects when we page-fault
+		 * while holding various locks, i.e., the lock takes a long
+		 * time, and other threads convoy behind the lock holder.
+		 */
+		if (DB_GLOBAL(db_region_init))
+			for (p = infop->addr;
+			    p < (u_int8_t *)infop->addr + infop->size;
+			    p += DB_VMPAGESIZE)
+				p[0] = '\0';
+
+		F_SET(infop, REGION_CREATED | REGION_MALLOC);
+		goto region_init;
 	}
-	*fdp = fd;
 
-	/* Grow the region to the correct size. */
-	if ((ret = __db_rgrow(dbenv, fd, size)) != 0)
-		goto err;
+	/*
+	 * Get the name of the region (creating the file if a temporary file
+	 * is being used).  The dbenv contains the current DB environment,
+	 * including naming information.  The path argument may be a file or
+	 * a directory.  If path is a directory, it must exist and file is the
+	 * file name to be created inside the directory.  If path is a file,
+	 * then file must be NULL.
+	 */
+	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
+	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
+		return (ret);
+	if (infop->fd != -1)
+		F_SET(infop, REGION_CREATED);
 
-	/* Map the region in. */
-	if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
-		goto err;
+	/*
+	 * Try to create the file, if we have authority.  We have to make sure
+	 * that multiple threads/processes attempting to simultaneously create
+	 * the region are properly ordered, so we open it using DB_CREATE and
+	 * DB_EXCL, so two attempts to create the region will return failure in
+	 * one.
+	 */
+	if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
+		flags = infop->dbflags;
+		LF_SET(DB_EXCL);
+		if ((ret = __db_open(infop->name,
+		    flags, flags, infop->mode, &infop->fd)) == 0)
+			F_SET(infop, REGION_CREATED);
+		else
+			if (ret != EEXIST)
+				goto errmsg;
+	}
 
-	/* Initialize the region. */
-	if ((ret = __db_rinit(dbenv, rp, fd, size, 1)) != 0)
-		goto err;
+	/* If we couldn't create the file, try and open it. */
+	if (infop->fd == -1) {
+		flags = infop->dbflags;
+		LF_CLR(DB_CREATE | DB_EXCL);
+		if ((ret = __db_open(infop->name,
+		    flags, flags, infop->mode, &infop->fd)) != 0)
+			goto errmsg;
+	}
 
-	if (name != NULL)
-		FREES(name);
+	/*
+	 * There are three cases we support:
+	 *    1. Named anonymous memory (shmget(2)).
+	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
+	 *    3. Memory backed by a regular file (mmap(2)).
+	 *
+	 * We instantiate a backing file in all cases, which contains at least
+	 * the RLAYOUT structure, and in case #3, contains the actual region.
+	 * This is necessary for a couple of reasons:
+	 *
+	 * First, the mpool region uses temporary files to name regions, and
+	 * since you may have multiple regions in the same directory, we need
+	 * a filesystem name to ensure that they don't collide.
+	 *
+	 * Second, applications are allowed to forcibly remove regions, even
+	 * if they don't know anything about them other than the name.  If a
+	 * region is backed by anonymous memory, there has to be some way for
+	 * the application to find out that information, and, in some cases,
+	 * determine ID information for the anonymous memory.
+	 */
+	if (F_ISSET(infop, REGION_CREATED)) {
+		/*
+		 * If we're using anonymous memory to back this region, set
+		 * the flag.
+		 */
+		if (DB_GLOBAL(db_region_anon))
+			F_SET(infop, REGION_ANONYMOUS);
 
-	*(void **)retp = rp;
-	return (0);
+		/*
+		 * If we're using a regular file to back a region we created,
+		 * grow it to the specified size.
+		 */
+		if (!DB_GLOBAL(db_region_anon) &&
+		    (ret = __db_growregion(infop, infop->size)) != 0)
+			goto err;
+	} else {
+		/*
+		 * If we're joining a region, figure out what it looks like.
+		 *
+		 * XXX
+		 * We have to figure out if the file is a regular file backing
+		 * a region that we want to map into our address space, or a
+		 * file with the information we need to find a shared anonymous
+		 * region that we want to map into our address space.
+		 *
+		 * All this noise is because some systems don't have a coherent
+		 * VM and buffer cache, and worse, if you mix operations on the
+		 * VM and buffer cache, half the time you hang the system.
+		 *
+		 * There are two possibilities.  If the file is the size of an
+		 * RLAYOUT structure, then we know that the real region is in
+		 * shared memory, because otherwise it would be bigger.  (As
+		 * the RLAYOUT structure size is smaller than a disk sector,
+		 * the only way it can be this size is if deliberately written
+		 * that way.)  In which case, retrieve the information we need
+		 * from the RLAYOUT structure and use it to acquire the shared
+		 * memory.
+		 *
+		 * If the file is larger than an RLAYOUT structure, then
+		 * the file is backing the shared memory region, and we use
+		 * the current size of the file without reading any information
+		 * from the file itself so that we don't confuse the VM.
+		 *
+		 * And yes, this makes me want to take somebody and kill them,
+		 * but I can't think of any other solution.
+		 */
+		if ((ret = __db_ioinfo(infop->name,
+		    infop->fd, &mbytes, &bytes, NULL)) != 0)
+			goto errmsg;
+		size = mbytes * MEGABYTE + bytes;
+
+		if (size <= sizeof(RLAYOUT)) {
+			/*
+			 * If the size is too small, the read fails or the
+			 * valid flag is incorrect, assume it's because the
+			 * RLAYOUT information hasn't been written out yet,
+			 * and retry.
+			 */
+			if (size < sizeof(RLAYOUT))
+				goto retry;
+			if ((ret =
+			    __db_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
+				goto retry;
+			if (rl.valid != DB_REGIONMAGIC)
+				goto retry;
+
+			/* Copy the size, memory id and characteristics. */
+			size = rl.size;
+			infop->segid = rl.segid;
+			if (F_ISSET(&rl, REGION_ANONYMOUS))
+				F_SET(infop, REGION_ANONYMOUS);
+		}
 
-err:	if (fd != -1) {
-		if (rp != NULL)
-			(void)__db_unmap(rp, rp->size);
-		(void)__db_unlink(name);
-		(void)__db_close(fd);
+		/*
+		 * If the region is larger than we think, that's okay, use the
+		 * current size.  If it's smaller than we think, and we were
+		 * just using the default size, that's okay, use the current
+		 * size.  If it's smaller than we think and we really care,
+		 * save the size and we'll catch that further down -- we can't
+		 * correct it here because we have to have a lock to grow the
+		 * region.
+		 */
+		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
+			grow_region = infop->size;
+		infop->size = size;
 	}
-	if (name != NULL)
-		FREES(name);
-	return (ret);
-}
-
-/*
- * __db_rinit --
- *	Initialize the region.
- *
- * PUBLIC: int __db_rinit __P((DB_ENV *, RLAYOUT *, int, size_t, int));
- */
-int
-__db_rinit(dbenv, rp, fd, size, lock_region)
-	DB_ENV *dbenv;
-	RLAYOUT *rp;
-	size_t size;
-	int fd, lock_region;
-{
-	int ret;
 
-	COMPQUIET(dbenv, NULL);
+	/*
+	 * Map the region into our address space.  If we're creating it, the
+	 * underlying routines will make it the right size.
+	 *
+	 * There are at least two cases where we can "reasonably" fail when
+	 * we attempt to map in the region.  On Windows/95, closing the last
+	 * reference to a region causes it to be zeroed out.  On UNIX, when
+	 * using the shmget(2) interfaces, the region will no longer exist
+	 * if the system was rebooted.  In these cases, the underlying map call
+	 * returns EAGAIN, and we *remove* our file and try again.  There are
+	 * obvious races in doing this, but it should eventually settle down
+	 * to a winner and then things should proceed normally.
+	 */
+	if ((ret = __db_mapregion(infop->name, infop)) != 0)
+		if (ret == EAGAIN) {
+			/*
+			 * Pretend we created the region even if we didn't so
+			 * that our error processing unlinks it.
+			 */
+			F_SET(infop, REGION_CREATED);
+			ret = 0;
+			goto retry;
+		} else
+			goto err;
 
+region_init:
 	/*
-	 * Initialize the common information.
+	 * Initialize the common region information.
 	 *
 	 * !!!
 	 * We have to order the region creates so that two processes don't try
-	 * to simultaneously create the region and so that processes that are
-	 * joining the region never see inconsistent data.  We'd like to play
-	 * file permissions games, but we can't because WNT filesystems won't
-	 * open a file mode 0.
-	 *
-	 * If the lock_region flag is set, the process creating the region
-	 * acquires the lock before the setting the version number.  Any
-	 * process joining the region checks the version number before
-	 * attempting to acquire the lock.  (The lock_region flag may not be
-	 * set -- the mpool code sometimes malloc's private regions but still
-	 * needs to initialize them, specifically, the mutex for threads.)
+	 * to simultaneously create the region.  This is handled by using the
+	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
 	 *
-	 * We have to check the version number first, because if the version
-	 * number has not been written, it's possible that the mutex has not
-	 * been initialized in which case an attempt to get it could lead to
-	 * random behavior.  If the version number isn't there (the file size
-	 * is too small) or it's 0, we know that the region is being created.
-	 *
-	 * We also make sure to check the return of __db_mutex_lock() here,
-	 * even though we don't usually check elsewhere.  This is the first
-	 * lock we attempt to acquire, and if it fails we have to know.  (It
-	 * can fail -- SunOS, using fcntl(2) for locking, with an in-memory
-	 * filesystem specified as the database home.)
+	 * We also have to order region joins so that processes joining regions
+	 * never see inconsistent data.  We'd like to play permissions games
+	 * with the backing file, but we can't because WNT filesystems won't
+	 * open a file mode 0.
 	 */
-	__db_mutex_init(&rp->lock, MUTEX_LOCK_OFFSET(rp, &rp->lock));
-	if (lock_region && (ret = __db_mutex_lock(&rp->lock, fd)) != 0)
-		return (ret);
-
-	rp->refcnt = 1;
-	rp->size = size;
-	rp->flags = 0;
-	db_version(&rp->majver, &rp->minver, &rp->patch);
+	rlp = (RLAYOUT *)infop->addr;
+	if (F_ISSET(infop, REGION_CREATED)) {
+		/*
+		 * The process creating the region acquires a lock before it
+		 * sets the valid flag.  Any processes joining the region will
+		 * check the valid flag before acquiring the lock.
+		 *
+		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
+		 * even though we don't usually check elsewhere.  This is the
+		 * first lock we initialize and acquire, and we have to know if
+		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
+		 * for locking, with an in-memory filesystem specified as the
+		 * database home.)
+		 */
+		if ((ret = __db_mutex_init(&rlp->lock,
+		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
+		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
+			goto err;
 
-	return (0);
-}
+		/* Initialize the remaining region information. */
+		rlp->refcnt = 1;
+		rlp->size = infop->size;
+		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
+		rlp->segid = infop->segid;
+		rlp->flags = 0;
+		if (F_ISSET(infop, REGION_ANONYMOUS))
+			F_SET(rlp, REGION_ANONYMOUS);
 
-/*
- * __db_ropen --
- *	Construct the name of a file, open it and map it in.
- *
- * PUBLIC: int __db_ropen __P((DB_ENV *,
- * PUBLIC:    APPNAME, const char *, const char *, int, int *, void *));
- */
-int
-__db_ropen(dbenv, appname, path, file, flags, fdp, retp)
-	DB_ENV *dbenv;
-	APPNAME appname;
-	const char *path, *file;
-	int flags, *fdp;
-	void *retp;
-{
-	RLAYOUT *rp;
-	size_t size;
-	u_int32_t mbytes, bytes;
-	int fd, ret;
-	char *name;
+		/*
+		 * Fill in the valid field last -- use a magic number, memory
+		 * may not be zero-filled, and we want to minimize the chance
+		 * for collision.
+		 */
+		rlp->valid = DB_REGIONMAGIC;
 
-	fd = -1;
-	rp = NULL;
+		/*
+		 * If the region is anonymous, write the RLAYOUT information
+		 * into the backing file so that future region join and unlink
+		 * calls can find it.
+		 *
+		 * XXX
+		 * We MUST do the seek before we do the write.  On Win95, while
+		 * closing the last reference to an anonymous shared region
+		 * doesn't discard the region, it does zero it out.  So, the
+		 * REGION_CREATED may be set, but the file may have already
+		 * been written and the file descriptor may be at the end of
+		 * the file.
+		 */
+		if (F_ISSET(infop, REGION_ANONYMOUS)) {
+			if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret =
+			    __db_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
+				goto err;
+		}
+	} else {
+		/*
+		 * Check the valid flag to ensure the region is initialized.
+		 * If the valid flag has not been set, the mutex may not have
+		 * been initialized, and an attempt to get it could lead to
+		 * random behavior.
+		 */
+		if (rlp->valid != DB_REGIONMAGIC)
+			goto retry;
 
-	/* Get the filename. */
-	if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
-		return (ret);
+		/* Get the region lock. */
+		(void)__db_mutex_lock(&rlp->lock, infop->fd);
 
-	/* Open the file. */
-	if ((ret = __db_open(name, flags, DB_MUTEXDEBUG, 0, &fd)) != 0) {
-		__db_err(dbenv, "region open: %s: %s", name, strerror(ret));
-		goto err2;
-	}
+		/*
+		 * We now own the region.  There are a couple of things that
+		 * may have gone wrong, however.
+		 *
+		 * Problem #1: while we were waiting for the lock, the region
+		 * was deleted.  Detected by re-checking the valid flag, since
+		 * it's cleared by the delete region routines.
+		 */
+		if (rlp->valid != DB_REGIONMAGIC) {
+			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
+			goto retry;
+		}
 
-	*fdp = fd;
+		/*
+		 * Problem #2: We want a bigger region than has previously been
+		 * created.  Detected by checking if the region is smaller than
+		 * our caller requested.  If it is, we grow the region, (which
+		 * does the detach and re-attach for us).
+		 */
+		if (grow_region != 0 &&
+		    (ret = __db_rgrow(infop, grow_region)) != 0) {
+			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
+			goto err;
+		}
 
-	/*
-	 * Map the file in.  We have to do things in a strange order so that
-	 * we don't get into a situation where the file was just created and
-	 * isn't yet initialized.  See the comment in __db_rcreate() above.
-	 *
-	 * XXX
-	 * We'd like to test to see if the file is too big to mmap.  Since we
-	 * don't know what size or type off_t's or size_t's are, or the largest
-	 * unsigned integral type is, or what random insanity the local C
-	 * compiler will perpetrate, doing the comparison in a portable way is
-	 * flatly impossible.  Hope that mmap fails if the file is too large.
-	 *
-	 */
-	if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) {
-		__db_err(dbenv, "%s: %s", name, strerror(ret));
-		goto err2;
-	}
-	size = mbytes * MEGABYTE + bytes;
+		/*
+		 * Problem #3: when we checked the size of the file, it was
+		 * still growing as part of creation.  Detected by the fact
+		 * that infop->size isn't the same size as the region.
+		 */
+		if (infop->size != rlp->size) {
+			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
+			goto retry;
+		}
 
-	/* Check to make sure the first block has been written. */
-	if (size < sizeof(RLAYOUT)) {
-		ret = EAGAIN;
-		goto err2;
+		/* Increment the reference count. */
+		++rlp->refcnt;
 	}
 
-	/* Map in whatever is there. */
-	if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
-		goto err2;
+	/* Return the region in a locked condition. */
 
-	/*
-	 * Check to make sure the region has been initialized.  We can't just
-	 * grab the lock because the lock may not have been initialized yet.
-	 */
-	if (rp->majver == 0) {
-		ret = EAGAIN;
-		goto err2;
-	}
-
-	/* Get the region lock. */
-	if (!LF_ISSET(DB_MUTEXDEBUG))
-		(void)__db_mutex_lock(&rp->lock, fd);
+	if (0) {
+errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
 
-	/*
-	 * The file may have been half-written if we were descheduled between
-	 * getting the size of the file and checking the major version.  Check
-	 * to make sure we got the entire file.
-	 */
-	if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) {
-		__db_err(dbenv, "%s: %s", name, strerror(ret));
-		goto err1;
-	}
-	if (size != mbytes * MEGABYTE + bytes) {
-		ret = EAGAIN;
-		goto err1;
-	}
+err:
+retry:		/* Discard the region. */
+		if (infop->addr != NULL) {
+			(void)__db_unmapregion(infop);
+			infop->addr = NULL;
+		}
 
-	/* The file may have just been deleted. */
-	if (F_ISSET(rp, DB_R_DELETED)) {
-		ret = EAGAIN;
-		goto err1;
-	}
+		/* Discard the backing file. */
+		if (infop->fd != -1) {
+			(void)__db_close(infop->fd);
+			infop->fd = -1;
 
-	/* Increment the reference count. */
-	++rp->refcnt;
+			if (F_ISSET(infop, REGION_CREATED))
+				(void)__db_unlink(infop->name);
+		}
 
-	/* Release the lock. */
-	if (!LF_ISSET(DB_MUTEXDEBUG))
-		(void)__db_mutex_unlock(&rp->lock, fd);
+		/* Discard the name. */
+		if (infop->name != NULL) {
+			FREES(infop->name);
+			infop->name = NULL;
+		}
 
-	FREES(name);
+		/*
+		 * If we had a temporary error, wait a few seconds and
+		 * try again.
+		 */
+		if (ret == 0) {
+			if (++retry_cnt <= 3) {
+				__db_sleep(retry_cnt * 2, 0);
+				goto loop;
+			}
+			ret = EAGAIN;
+		}
+	}
 
-	*(void **)retp = rp;
-	return (0);
+	/*
+	 * XXX
+	 * HP-UX won't permit mutexes to live in anything but shared memory.
+	 * Instantiate a shared region file on that architecture, regardless.
+	 *
+	 * XXX
+	 * There's a problem in cleaning this up on application exit, or on
+	 * application failure.  If an application opens a database without
+	 * an environment, we create a temporary backing mpool region for it.
+	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
+	 * mutexes to live in anything but shared memory, we instantiate a
+	 * real file plus a memory region of some form.  If the application
+	 * crashes, the necessary information to delete the backing file and
+	 * any system region (e.g., the shmget(2) segment ID) is no longer
+	 * available.  We can't completely fix the problem, but we try.
+	 *
+	 * The underlying UNIX __db_mapregion() code preferentially uses the
+	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
+	 * that are marked REGION_PRIVATE.  This means that we normally aren't
+	 * holding any system resources when we get here, in which case we can
+	 * delete the backing file.  This results in a short race, from the
+	 * __db_open() call above to here.
+	 *
+	 * If, for some reason, we are holding system resources when we get
+	 * here, we don't have any choice -- we can't delete the backing file
+	 * because we may need it to detach from the resources.  Set the
+	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
+	 * the application closes the region.
+	 */
+	if (ret == 0 &&		/* Error path already closed fd, freed name. */
+	    F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
+		if (F_ISSET(infop, REGION_HOLDINGSYS))
+			F_SET(infop, REGION_LASTDETACH);
+		else {
+			F_SET(infop, REGION_REMOVED);
+			F_CLR(infop, REGION_CANGROW);
+			(void)__db_close(infop->fd);
+			(void)__db_unlink(infop->name);
+		}
 
-err1:	if (!LF_ISSET(DB_MUTEXDEBUG))
-		(void)__db_mutex_unlock(&rp->lock, fd);
-err2:	if (rp != NULL)
-		(void)__db_unmap(rp, rp->size);
-	if (fd != -1)
-		(void)__db_close(fd);
-	FREES(name);
 	return (ret);
 }
 
 /*
- * __db_rclose --
- *	Close a shared memory region.
+ * __db_rdetach --
+ *	Detach from a shared memory region.
  *
- * PUBLIC: int __db_rclose __P((DB_ENV *, int, void *));
+ * PUBLIC: int __db_rdetach __P((REGINFO *));
  */
 int
-__db_rclose(dbenv, fd, ptr)
-	DB_ENV *dbenv;
-	int fd;
-	void *ptr;
+__db_rdetach(infop)
+	REGINFO *infop;
 {
-	RLAYOUT *rp;
-	int ret, t_ret;
-	const char *fail;
+	RLAYOUT *rlp;
+	int detach, ret, t_ret;
 
-	rp = ptr;
-	fail = NULL;
+	ret = 0;
 
-	/* Get the lock. */
-	if ((ret = __db_mutex_lock(&rp->lock, fd)) != 0) {
-		fail = "lock get";
-		goto err;
+	/*
+	 * If the region was removed when it was created, no further action
+	 * is required.
+	 */
+	if (F_ISSET(infop, REGION_REMOVED))
+		goto done;
+	/*
+	 * If the region was created in memory returned by malloc, the only
+	 * action required is freeing the memory.
+	 */
+	if (F_ISSET(infop, REGION_MALLOC)) {
+		__db_free(infop->addr);
+		goto done;
 	}
 
+	/* Otherwise, detach from the region and optionally delete it. */
+	rlp = infop->addr;
+
+	/* Get the lock. */
+	(void)__db_mutex_lock(&rlp->lock, infop->fd);
+
 	/* Decrement the reference count. */
-	--rp->refcnt;
+	if (rlp->refcnt == 0)
+		__db_err(infop->dbenv,
+		    "region rdetach: reference count went to zero!");
+	else
+		--rlp->refcnt;
+
+	/*
+	 * If we're going to remove the region, clear the valid flag so
+	 * that any region join that's blocked waiting for us will know
+	 * what happened.
+	 */
+	detach = 0;
+	if (F_ISSET(infop, REGION_LASTDETACH))
+		if (rlp->refcnt == 0) {
+			detach = 1;
+			rlp->valid = 0;
+		} else
+			ret = EBUSY;
 
 	/* Release the lock. */
-	if ((t_ret = __db_mutex_unlock(&rp->lock, fd)) != 0 && fail == NULL) {
-		ret = t_ret;
-		fail = "lock release";
-	}
+	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
 
-	/* Discard the region. */
-	if ((t_ret = __db_unmap(ptr, rp->size)) != 0 && fail == NULL) {
-		ret = t_ret;
-		fail = "munmap";
-	}
+	/* Close the backing file descriptor. */
+	(void)__db_close(infop->fd);
+	infop->fd = -1;
 
-	if ((t_ret = __db_close(fd)) != 0 && fail == NULL) {
+	/* Discard our mapping of the region. */
+	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
 		ret = t_ret;
-		fail = "close";
+
+	/* Discard the region itself, keeping the first error seen. */
+	if (detach) {
+		if ((t_ret =
+		    __db_unlinkregion(infop->name, infop)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = __db_unlink(infop->name)) != 0 && ret == 0)
+			ret = t_ret;
 	}
 
-	if (fail == NULL)
-		return (0);
+done:	/* Discard the name. */
+	if (infop->name != NULL) {
+		FREES(infop->name);
+		infop->name = NULL;
+	}
 
-err:	__db_err(dbenv, "region detach: %s: %s", fail, strerror(ret));
 	return (ret);
 }
 
 /*
  * __db_runlink --
- *	Remove a shared memory region.
+ *	Remove a region.
  *
- * PUBLIC: int __db_runlink __P((DB_ENV *,
- * PUBLIC:    APPNAME, const char *, const char *, int));
+ * PUBLIC: int __db_runlink __P((REGINFO *, int));
  */
 int
-__db_runlink(dbenv, appname, path, file, force)
-	DB_ENV *dbenv;
-	APPNAME appname;
-	const char *path, *file;
+__db_runlink(infop, force)
+	REGINFO *infop;
 	int force;
 {
-	RLAYOUT *rp;
-	int cnt, fd, ret, t_ret;
+	RLAYOUT rl, *rlp;
+	size_t size;
+	ssize_t nr;
+	u_int32_t mbytes, bytes;
+	int fd, ret, t_ret;
 	char *name;
 
-	rp = NULL;
+	/*
+	 * XXX
+	 * We assume that we've created a new REGINFO structure for this
+	 * call, not used one that was already initialized.  Regardless,
+	 * if anyone is planning to use it after we're done, they're going
+	 * to be sorely disappointed.
+	 *
+	 * If force isn't set, we attach to the region, set a flag to delete
+	 * the region on last close, and let the region delete code do the
+	 * work.
+	 */
+	if (!force) {
+		if ((ret = __db_rattach(infop)) != 0)
+			return (ret);
 
-	/* Get the filename. */
-	if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
-		return (ret);
+		rlp = (RLAYOUT *)infop->addr;
+		(void)__db_mutex_unlock(&rlp->lock, infop->fd);
 
-	/* If the file doesn't exist, we're done. */
-	if (__db_exists(name, NULL))
-		goto done;
+		F_SET(infop, REGION_LASTDETACH);
+
+		return (__db_rdetach(infop));
+	}
 
 	/*
-	 * If we're called with a force flag, try and unlink the file.  This
-	 * may not succeed if the file is currently open, but there's nothing
-	 * we can do about that.  There is a race condition between the check
-	 * for existence above and the actual unlink.  If someone else snuck
-	 * in and removed it before we do the remove, then we might get an
-	 * ENOENT error.  If we get the ENOENT, we treat it as success, just
-	 * as we do above.
+	 * Otherwise, we don't want to attach to the region.  We may have been
+	 * called to clean up if a process died leaving a region locked and/or
+	 * corrupted, which could cause the attach to hang.
 	 */
-	if (force) {
-		if ((ret = __db_unlink(name)) != 0 && ret != ENOENT)
-			goto err1;
-		goto done;
+	if ((ret = __db_appname(infop->dbenv, infop->appname,
+	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
+		return (ret);
+
+	/*
+	 * An underlying file is created for all regions other than private
+	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
+	 * back the region.  If that file doesn't exist, we're done.
+	 */
+	if (__db_exists(name, NULL) != 0) {
+		FREES(name);
+		return (0);
 	}
 
-	/* Open and lock the region. */
-	if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
-		goto err1;
-	(void)__db_mutex_lock(&rp->lock, fd);
+	/*
+	 * See the comments in __db_rattach -- figure out if this is a regular
+	 * file backing a region or if it's a regular file with information
+	 * about a region.
+	 */
+	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
+		goto errmsg;
+	if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
+		goto errmsg;
+	size = mbytes * MEGABYTE + bytes;
 
-	/* If the region is currently being deleted, fail. */
-	if (F_ISSET(rp, DB_R_DELETED)) {
-		ret = ENOENT;		/* XXX: ENOENT? */
-		goto err2;
-	}
+	if (size <= sizeof(RLAYOUT)) {
+		if ((ret = __db_read(fd, &rl, sizeof(rl), &nr)) != 0)
+			goto errmsg;
+		if (rl.valid != DB_REGIONMAGIC) {
+			__db_err(infop->dbenv,
+			    "%s: illegal region magic number", name);
+			ret = EINVAL;
+			goto err;
+		}
 
-	/* If the region is currently in use by someone else, fail. */
-	if (rp->refcnt > 1) {
-		ret = EBUSY;
-		goto err2;
+		/* Set the size, memory id and characteristics. */
+		infop->size = rl.size;
+		infop->segid = rl.segid;
+		if (F_ISSET(&rl, REGION_ANONYMOUS))
+			F_SET(infop, REGION_ANONYMOUS);
+	} else {
+		infop->size = size;
+		infop->segid = INVALID_SEGID;
 	}
 
-	/* Set the delete flag. */
-	F_SET(rp, DB_R_DELETED);
-
-	/* Release the lock and close the region. */
-	(void)__db_mutex_unlock(&rp->lock, fd);
-	if ((t_ret = __db_rclose(dbenv, fd, rp)) != 0 && ret == 0)
-		goto err1;
+	/* Remove the underlying region. */
+	ret = __db_unlinkregion(name, infop);
 
 	/*
-	 * Unlink the region.  There's a race here -- other threads or
-	 * processes might be opening the region while we're trying to
-	 * remove it.  They'll fail, because we've set the DELETED flag,
-	 * but they could still stop us from succeeding in the unlink.
+	 * Unlink the backing file.  Close the open file descriptor first,
+	 * because some architectures (e.g., Win32) won't unlink a file if
+	 * open file descriptors remain.
 	 */
-	for (cnt = 5; cnt > 0; --cnt) {
-		if ((ret = __db_unlink(name)) == 0)
-			break;
-		(void)__db_sleep(0, 250000);
-	}
-	if (ret == 0) {
-done:		FREES(name);
-		return (0);
-	}
-
-	/* Not a clue.  Try to clear the DB_R_DELETED flag. */
-	if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
-		goto err1;
-	(void)__db_mutex_lock(&rp->lock, fd);
-	F_CLR(rp, DB_R_DELETED);
-	/* FALLTHROUGH */
+	(void)__db_close(fd);
+	if ((t_ret = __db_unlink(name)) != 0 && ret == 0)
+		ret = t_ret;
 
-err2:	(void)__db_mutex_unlock(&rp->lock, fd);
-	(void)__db_rclose(dbenv, fd, rp);
-err1:	__db_err(dbenv, "region unlink: %s: %s", name, strerror(ret));
+	if (0) {
+errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
+err:		(void)__db_close(fd);
+	}
 
 	FREES(name);
 	return (ret);
 }
 
 /*
- * DB creates all regions on 4K boundaries so that we don't make the
- * underlying VM unhappy.
- */
-#define	__DB_VMPAGESIZE	(4 * 1024)
-
-/*
  * __db_rgrow --
- *	Extend a region by a specified amount.
+ *	Extend a region.
  *
- * PUBLIC: int __db_rgrow __P((DB_ENV *, int, size_t));
+ * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
  */
 int
-__db_rgrow(dbenv, fd, incr)
-	DB_ENV *dbenv;
-	int fd;
-	size_t incr;
+__db_rgrow(infop, new_size)
+	REGINFO *infop;
+	size_t new_size;
+{
+	RLAYOUT *rlp;
+	size_t increment;
+	int ret;
+
+	/*
+	 * !!!
+	 * This routine MUST be called with the region already locked.
+	 */
+
+	/* The underlying routines have flagged if this region can grow. */
+	if (!F_ISSET(infop, REGION_CANGROW))
+		return (EINVAL);
+
+	/*
+	 * Round off the requested size to the next page boundary, and
+	 * determine the additional space required.
+	 */
+	rlp = (RLAYOUT *)infop->addr;
+	DB_ROUNDOFF(new_size);
+	increment = new_size - rlp->size;
+
+	if ((ret = __db_growregion(infop, increment)) != 0)
+		return (ret);
+
+	/* Update the on-disk region size. */
+	rlp->size = new_size;
+
+	/* Detach from and reattach to the region. */
+	return (__db_rreattach(infop, new_size));
+}
+
+/*
+ * __db_growregion --
+ *	Grow a shared memory region.
+ */
+static int
+__db_growregion(infop, increment)
+	REGINFO *infop;
+	size_t increment;
 {
+	db_pgno_t pages;
 	size_t i;
-	ssize_t nw;
-	int mmap_init_needed, ret;
-	char buf[__DB_VMPAGESIZE];
+	ssize_t nr, nw;
+	u_int32_t relative;
+	int ret;
+	char buf[DB_VMPAGESIZE];
 
 	/* Seek to the end of the region. */
-	if ((ret = __db_seek(fd, 0, 0, 0, SEEK_END)) != 0)
+	if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
 		goto err;
 
 	/* Write nuls to the new bytes. */
 	memset(buf, 0, sizeof(buf));
 
 	/*
-	 * Historically, some systems required that all of the bytes of the
-	 * region be written before it could be mmapped and accessed randomly.
-	 *
-	 * Windows/95 doesn't have that problem, but it leaves file contents
-	 * uninitialized.  Win/NT apparently initializes them.
+	 * Some systems require that all of the bytes of the region be
+	 * written before it can be mapped and accessed randomly, and
+	 * other systems don't zero out the pages.
 	 */
-#ifdef MMAP_INIT_NEEDED
-	mmap_init_needed = 1;
-#else
-	mmap_init_needed = __os_oldwin();
-#endif
-	if (mmap_init_needed)
+	if (__db_mapinit())
 		/* Extend the region by writing each new page. */
-		for (i = 0; i < incr; i += __DB_VMPAGESIZE) {
-			if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
+			if ((ret =
+			    __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
 				goto err;
 			if (nw != sizeof(buf))
 				goto eio;
 		}
 	else {
 		/*
-		 * Extend the region by writing the last page.
-		 *
-		 * Round off the increment to the next page boundary.
+		 * Extend the region by writing the last page.  If the region
+		 * is >4GB, increment may be larger than the maximum possible
+		 * seek "relative" argument, as it's an unsigned 32-bit value.
+		 * Break the offset into pages of 1MB each so that we don't
+		 * overflow (2^20 * 2^32 is bigger than any memory I expect
+		 * to see for a while).
 		 */
-		incr += __DB_VMPAGESIZE - 1;
-		incr -= incr % __DB_VMPAGESIZE;
-
-		/* Write the last page, not the page after the last. */
-		if ((ret =
-		    __db_seek(fd, 0, 0, incr - __DB_VMPAGESIZE, SEEK_CUR)) != 0)
+		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
+		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
+		if ((ret = __db_seek(infop->fd,
+		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
 			goto err;
-		if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+		if ((ret = __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
 			goto err;
 		if (nw != sizeof(buf))
 			goto eio;
+
+		/*
+		 * It's sometimes significantly faster to page-fault in all
+		 * of the region's pages before we run the application, as
+		 * we can see fairly nasty side-effects when we page-fault
+		 * while holding various locks, i.e., the lock takes a long
+		 * time, and other threads convoy behind the lock holder.
+		 */
+		if (DB_GLOBAL(db_region_init)) {
+			pages = increment / MEGABYTE;
+			relative = increment % MEGABYTE;
+			if ((ret = __db_seek(infop->fd,
+			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
+				goto err;
+
+			/* Read a byte from each page. */
+			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
+				if ((ret =
+				    __db_read(infop->fd, buf, 1, &nr)) != 0)
+					goto err;
+				if (nr != 1)
+					goto eio;
+				if ((ret = __db_seek(infop->fd,
+				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
+					goto err;
+			}
+		}
 	}
 	return (0);
 
 eio:	ret = EIO;
-err:	__db_err(dbenv, "region grow: %s", strerror(ret));
+err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
 	return (ret);
 }
 
 /*
- * __db_rremap --
- *	Unmap the old region and map in a new region of a new size.  If
- *	either call fails, returns NULL, else returns the address of the
- *	new region.
+ * __db_rreattach --
+ *	Detach from and reattach to a region.
  *
- * PUBLIC: int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *));
+ * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
  */
 int
-__db_rremap(dbenv, ptr, oldsize, newsize, fd, retp)
-	DB_ENV *dbenv;
-	void *ptr, *retp;
-	size_t oldsize, newsize;
-	int fd;
+__db_rreattach(infop, new_size)
+	REGINFO *infop;
+	size_t new_size;
 {
 	int ret;
 
-	if ((ret = __db_unmap(ptr, oldsize)) != 0) {
-		__db_err(dbenv, "region remap: munmap: %s", strerror(ret));
-		return (ret);
+#ifdef DIAGNOSTIC
+	if (infop->name == NULL) {
+		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
+		return (EINVAL);
 	}
+#endif
+	/*
+	 * If we're growing an already mapped region, we have to unmap it
+	 * and get it back.  We have it locked, so nobody else can get in,
+	 * which makes it fairly straight-forward to do, as everybody else
+	 * is going to block while we do the unmap/remap.  NB: if we fail
+	 * to get it back, the pooch is genuinely screwed, because we can
+	 * never release the lock we're holding.
+	 *
+	 * Detach from the region.  We have to do this first so architectures
+	 * that don't permit a file to be mapped into different places in the
+	 * address space simultaneously, e.g., HP's PA-RISC, will work.
+	 */
+	if ((ret = __db_unmapregion(infop)) != 0)
+		return (ret);
 
-	return (__db_rmap(dbenv, fd, newsize, retp));
-}
-
-/*
- * __db_rmap --
- *	Attach to a shared memory region.
- */
-static int
-__db_rmap(dbenv, fd, size, retp)
-	DB_ENV *dbenv;
-	int fd;
-	size_t size;
-	void *retp;
-{
-	RLAYOUT *rp;
-	int ret;
+	/* Update the caller's REGINFO size to the new map size. */
+	infop->size = new_size;
 
-	if ((ret = __db_map(fd, size, 0, 0, (void **)&rp)) != 0) {
-		__db_err(dbenv, "region map: mmap %s", strerror(ret));
-		return (ret);
-	}
-	if (rp->size < size)
-		rp->size = size;
+	/* Attach to the region. */
+	ret = __db_mapregion(infop->name, infop);
 
-	*(void **)retp = rp;
-	return (0);
+	return (ret);
 }
diff --git a/db2/common/db_salloc.c b/db2/common/db_salloc.c
index f0202ddb90..0fa696bf7e 100644
--- a/db2/common/db_salloc.c
+++ b/db2/common/db_salloc.c
@@ -1,21 +1,21 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_salloc.c	10.6 (Sleepycat) 7/5/97";
+static const char sccsid[] = "@(#)db_salloc.c	10.13 (Sleepycat) 5/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
 #include <errno.h>
-#include <stdio.h>
+#include <string.h>
 #endif
 
 #include "db_int.h"
@@ -109,11 +109,13 @@ __db_shalloc(p, len, align, retp)
 
 		*(void **)retp = rp;
 
+#define	SHALLOC_FRAGMENT	32
 		/*
-		 * If there are at least 32 bytes of additional memory, divide
-		 * the chunk into two chunks.
+		 * If there are at least SHALLOC_FRAGMENT additional bytes of
+		 * memory, divide the chunk into two chunks.
 		 */
-		if ((u_int8_t *)rp >= (u_int8_t *)&elp->links + 32) {
+		if ((u_int8_t *)rp >=
+		    (u_int8_t *)&elp->links + SHALLOC_FRAGMENT) {
 			sp = rp;
 			*--sp = elp->len -
 			    ((u_int8_t *)rp - (u_int8_t *)&elp->links);
@@ -136,7 +138,7 @@ __db_shalloc(p, len, align, retp)
 		return (0);
 	}
 
-	/* Nothing found large enough; need to figure out how to grow region. */
+	/* Nothing found large enough; need to grow the region. */
 	return (ENOMEM);
 }
 
@@ -159,12 +161,18 @@ __db_shalloc_free(regionp, ptr)
 	 * Step back over flagged length fields to find the beginning of
 	 * the object and its real size.
 	 */
-	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp);
+	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp)
+		;
 	ptr = sp;
 
 	newp = (struct __data *)((u_int8_t *)ptr - sizeof(size_t));
 	free_size = newp->len;
 
+	/* Trash the returned memory. */
+#ifdef DIAGNOSTIC
+	memset(ptr, 0xff, free_size);
+#endif
+
 	/*
 	 * Walk the list, looking for where this entry goes.
 	 *
@@ -177,7 +185,8 @@ __db_shalloc_free(regionp, ptr)
 	hp = (struct __head *)regionp;
 	for (elp = SH_LIST_FIRST(hp, __data), lastp = NULL;
 	    elp != NULL && (void *)elp < (void *)ptr;
-	    lastp = elp, elp = SH_LIST_NEXT(elp, links, __data));
+	    lastp = elp, elp = SH_LIST_NEXT(elp, links, __data))
+		;
 
 	/*
 	 * Elp is either NULL (we reached the end of the list), or the slot
@@ -259,32 +268,34 @@ __db_shsizeof(ptr)
 	 * Step back over flagged length fields to find the beginning of
 	 * the object and its real size.
 	 */
-	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp);
+	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp)
+		;
 
 	elp = (struct __data *)((u_int8_t *)sp - sizeof(size_t));
 	return (elp->len);
 }
 
-#ifdef DEBUG
 /*
  * __db_shalloc_dump --
  *
- * PUBLIC: void __db_shalloc_dump __P((FILE *, void *));
+ * PUBLIC: void __db_shalloc_dump __P((void *, FILE *));
  */
 void
-__db_shalloc_dump(fp, addr)
-	FILE *fp;
+__db_shalloc_dump(addr, fp)
 	void *addr;
+	FILE *fp;
 {
 	struct __data *elp;
 
+	/* Make it easy to call from the debugger. */
 	if (fp == NULL)
 		fp = stderr;
 
+	fprintf(fp, "%s\nMemory free list\n", DB_LINE);
+
 	for (elp = SH_LIST_FIRST((struct __head *)addr, __data);
 	    elp != NULL;
 	    elp = SH_LIST_NEXT(elp, links, __data))
 		fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len);
 	fprintf(fp, "\n");
 }
-#endif
diff --git a/db2/common/db_shash.c b/db2/common/db_shash.c
index ab188f564f..3f48a55907 100644
--- a/db2/common/db_shash.c
+++ b/db2/common/db_shash.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)db_shash.c	10.4 (Sleepycat) 1/8/98";
+static const char sccsid[] = "@(#)db_shash.c	10.9 (Sleepycat) 4/10/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,39 +19,75 @@ static const char sccsid[] = "@(#)db_shash.c	10.4 (Sleepycat) 1/8/98";
 #include "shqueue.h"
 #include "common_ext.h"
 
-/* Powers-of-2 and close-by prime number pairs. */
+/*
+ * Table of good hash values.  Up to ~250,000 buckets, we use powers of 2.
+ * After that, we slow the rate of increase by half.  For each choice, we
+ * then use a nearby prime number as the hash value.
+ *
+ * If a terabyte is the maximum cache we'll see, and we assume there are
+ * 10 1K entries on each hash chain, then 107374182 is the maximum number
+ * of buckets we'll ever need.
+ */
 static const struct {
-	u_int	power;
-	u_int	prime;
+	u_int32_t power;
+	u_int32_t prime;
 } list[] = {
-	{  64,	  67},
-	{ 128,	 131},
-	{ 256,	 257},
-	{ 512,	 521},
-	{1024,	1031},
-	{2048,	2053},
-	{4096,	4099},
-	{8192,	8191},
-	{0,	   0}
+	{	 64,		67},		/* 2^6 */
+	{	128,	       131},		/* 2^7 */
+	{	256,	       257},		/* 2^8 */
+	{	512,	       521},		/* 2^9 */
+	{      1024,	      1031},		/* 2^10 */
+	{      2048,	      2053},		/* 2^11 */
+	{      4096,	      4099},		/* 2^12 */
+	{      8192,	      8191},		/* 2^13 */
+	{     16384,	     16381},		/* 2^14 */
+	{     32768,	     32771},		/* 2^15 */
+	{     65536,	     65537},		/* 2^16 */
+	{    131072,	    131071},		/* 2^17 */
+	{    262144,	    262147},		/* 2^18 */
+	{    393216,	    393209},		/* 2^18 + 2^18/2 */
+	{    524288,	    524287},		/* 2^19 */
+	{    786432,	    786431},		/* 2^19 + 2^19/2 */
+	{   1048576,	   1048573},		/* 2^20 */
+	{   1572864,	   1572869},		/* 2^20 + 2^20/2 */
+	{   2097152,	   2097169},		/* 2^21 */
+	{   3145728,	   3145721},		/* 2^21 + 2^21/2 */
+	{   4194304,	   4194301},		/* 2^22 */
+	{   6291456,	   6291449},		/* 2^22 + 2^22/2 */
+	{   8388608,	   8388617},		/* 2^23 */
+	{  12582912,	  12582917},		/* 2^23 + 2^23/2 */
+	{  16777216,	  16777213},		/* 2^24 */
+	{  25165824,	  25165813},		/* 2^24 + 2^24/2 */
+	{  33554432,	  33554393},		/* 2^25 */
+	{  50331648,	  50331653},		/* 2^25 + 2^25/2 */
+	{  67108864,	  67108859},		/* 2^26 */
+	{ 100663296,	 100663291},		/* 2^26 + 2^26/2 */
+	{ 134217728,	 134217757},		/* 2^27 */
+	{ 201326592,	 201326611},		/* 2^27 + 2^27/2 */
+	{ 268435456,	 268435459},		/* 2^28 */
+	{ 402653184,	 402653189},		/* 2^28 + 2^28/2 */
+	{ 536870912,	 536870909},		/* 2^29 */
+	{ 805306368,	 805306357},		/* 2^29 + 2^29/2 */
+	{1073741824,	1073741827},		/* 2^30 */
+	{0,		0}
 };
 
 /*
  * __db_tablesize --
  *	Choose a size for the hash table.
  *
- * PUBLIC: int __db_tablesize __P((u_int));
+ * PUBLIC: int __db_tablesize __P((u_int32_t));
  */
 int
 __db_tablesize(n_buckets)
-	u_int n_buckets;
+	u_int32_t n_buckets;
 {
 	int i;
 
 	/*
-	 * We try to be clever about how big we make the hash tables.  Pick
-	 * a prime number close to the "suggested" number of elements that
-	 * will be in the hash table.  We shoot for minimum collisions (i.e.
-	 * one element in each bucket).  We use 64 as the minimum table size.
+	 * We try to be clever about how big we make the hash tables.  Use a
+	 * prime number close to the "suggested" number of elements that will
+	 * be in the hash table.  Use 64 as the minimum hash table size.
 	 *
 	 * Ref: Sedgewick, Algorithms in C, "Hash Functions"
 	 */
@@ -73,14 +109,14 @@ __db_tablesize(n_buckets)
  * __db_hashinit --
  *	Initialize a hash table that resides in shared memory.
  *
- * PUBLIC: void __db_hashinit __P((void *, int));
+ * PUBLIC: void __db_hashinit __P((void *, u_int32_t));
  */
 void
 __db_hashinit(begin, nelements)
 	void *begin;
-	int nelements;
+	u_int32_t nelements;
 {
-	int i;
+	u_int32_t i;
 	SH_TAILQ_HEAD(hash_head) *headp;
 
 	headp = (struct hash_head *)begin;