diff options
Diffstat (limited to 'db2/common')
-rw-r--r-- | db2/common/db_appinit.c | 183 | ||||
-rw-r--r-- | db2/common/db_apprec.c | 49 | ||||
-rw-r--r-- | db2/common/db_byteorder.c | 4 | ||||
-rw-r--r-- | db2/common/db_err.c | 137 | ||||
-rw-r--r-- | db2/common/db_log2.c | 7 | ||||
-rw-r--r-- | db2/common/db_region.c | 1131 | ||||
-rw-r--r-- | db2/common/db_salloc.c | 41 | ||||
-rw-r--r-- | db2/common/db_shash.c | 82 |
8 files changed, 998 insertions, 636 deletions
diff --git a/db2/common/db_appinit.c b/db2/common/db_appinit.c index 4ee9e4f40c..6ec007be0a 100644 --- a/db2/common/db_appinit.c +++ b/db2/common/db_appinit.c @@ -1,23 +1,21 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_appinit.c 10.38 (Sleepycat) 1/7/98"; +static const char sccsid[] = "@(#)db_appinit.c 10.52 (Sleepycat) 6/2/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES -#include <sys/param.h> -#include <sys/stat.h> +#include <sys/types.h> #include <ctype.h> #include <errno.h> -#include <fcntl.h> #include <signal.h> #include <stdlib.h> #include <string.h> @@ -34,14 +32,14 @@ static const char sccsid[] = "@(#)db_appinit.c 10.38 (Sleepycat) 1/7/98"; #include "clib_ext.h" #include "common_ext.h" -static int __db_home __P((DB_ENV *, const char *, int)); +static int __db_home __P((DB_ENV *, const char *, u_int32_t)); static int __db_parse __P((DB_ENV *, char *)); -static int __db_tmp_dir __P((DB_ENV *, int)); -static int __db_tmp_open __P((DB_ENV *, char *, int *)); +static int __db_tmp_dir __P((DB_ENV *, u_int32_t)); +static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, int *)); /* * db_version -- - * Return verision information. + * Return version information. */ char * db_version(majverp, minverp, patchp) @@ -65,16 +63,18 @@ db_appinit(db_home, db_config, dbenv, flags) const char *db_home; char * const *db_config; DB_ENV *dbenv; - int flags; + u_int32_t flags; { FILE *fp; - int ret; + int mode, ret; char * const *p; char *lp, buf[MAXPATHLEN * 2]; /* Validate arguments. 
*/ if (dbenv == NULL) return (EINVAL); + + #ifdef HAVE_SPINLOCKS #define OKFLAGS \ (DB_CREATE | DB_NOMMAP | DB_THREAD | DB_INIT_LOCK | DB_INIT_LOG | \ @@ -89,10 +89,9 @@ db_appinit(db_home, db_config, dbenv, flags) if ((ret = __db_fchk(dbenv, "db_appinit", flags, OKFLAGS)) != 0) return (ret); -#define RECOVERY_FLAGS (DB_CREATE | DB_INIT_TXN | DB_INIT_LOG) - if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && - LF_ISSET(RECOVERY_FLAGS) != RECOVERY_FLAGS) - return (__db_ferr(dbenv, "db_appinit", 1)); + /* Transactions imply logging. */ + if (LF_ISSET(DB_INIT_TXN)) + LF_SET(DB_INIT_LOG); /* Convert the db_appinit(3) flags. */ if (LF_ISSET(DB_THREAD)) @@ -147,47 +146,48 @@ db_appinit(db_home, db_config, dbenv, flags) F_SET(dbenv, DB_ENV_APPINIT); /* - * If we are doing recovery, remove all the regions. + * If we are doing recovery, remove all the old shared memory + * regions. */ if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) { - /* Remove all the old shared memory regions. */ - if ((ret = log_unlink(NULL, 1 /* force */, dbenv)) != 0) + if ((ret = log_unlink(NULL, 1, dbenv)) != 0) goto err; - if ((ret = memp_unlink(NULL, 1 /* force */, dbenv)) != 0) + if ((ret = memp_unlink(NULL, 1, dbenv)) != 0) goto err; - if ((ret = lock_unlink(NULL, 1 /* force */, dbenv)) != 0) + if ((ret = lock_unlink(NULL, 1, dbenv)) != 0) goto err; - if ((ret = txn_unlink(NULL, 1 /* force */, dbenv)) != 0) + if ((ret = txn_unlink(NULL, 1, dbenv)) != 0) goto err; } - /* Transactions imply logging. */ - if (LF_ISSET(DB_INIT_TXN)) - LF_SET(DB_INIT_LOG); - - /* Default permissions are 0660. */ -#undef DB_DEFPERM -#define DB_DEFPERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) - - /* Initialize the subsystems. */ + /* + * Create the new shared regions. + * + * Default permissions are read-write for both owner and group. 
+ */ + mode = __db_omode("rwrw--"); if (LF_ISSET(DB_INIT_LOCK) && (ret = lock_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD), - DB_DEFPERM, dbenv, &dbenv->lk_info)) != 0) + mode, dbenv, &dbenv->lk_info)) != 0) goto err; if (LF_ISSET(DB_INIT_LOG) && (ret = log_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD), - DB_DEFPERM, dbenv, &dbenv->lg_info)) != 0) + mode, dbenv, &dbenv->lg_info)) != 0) goto err; if (LF_ISSET(DB_INIT_MPOOL) && (ret = memp_open(NULL, LF_ISSET(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD), - DB_DEFPERM, dbenv, &dbenv->mp_info)) != 0) + mode, dbenv, &dbenv->mp_info)) != 0) goto err; if (LF_ISSET(DB_INIT_TXN) && (ret = txn_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC), - DB_DEFPERM, dbenv, &dbenv->tx_info)) != 0) + mode, dbenv, &dbenv->tx_info)) != 0) goto err; - /* Initialize recovery. */ + /* + * If the application is running with transactions, initialize the + * function tables. Once that's done, do recovery for any previous + * run. + */ if (LF_ISSET(DB_INIT_TXN)) { if ((ret = __bam_init_recover(dbenv)) != 0) goto err; @@ -199,12 +199,12 @@ db_appinit(db_home, db_config, dbenv, flags) goto err; if ((ret = __txn_init_recover(dbenv)) != 0) goto err; - } - /* Run recovery if necessary. */ - if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && (ret = - __db_apprec(dbenv, LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0) - goto err; + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && + (ret = __db_apprec(dbenv, + LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0) + goto err; + } return (ret); @@ -282,21 +282,21 @@ db_appexit(dbenv) * it in allocated space. 
* * PUBLIC: int __db_appname __P((DB_ENV *, - * PUBLIC: APPNAME, const char *, const char *, int *, char **)); + * PUBLIC: APPNAME, const char *, const char *, u_int32_t, int *, char **)); */ int -__db_appname(dbenv, appname, dir, file, fdp, namep) +__db_appname(dbenv, appname, dir, file, tmp_oflags, fdp, namep) DB_ENV *dbenv; APPNAME appname; const char *dir, *file; + u_int32_t tmp_oflags; int *fdp; char **namep; { DB_ENV etmp; size_t len; - int ret, slash, tmp_create, tmp_free; + int data_entry, ret, slash, tmp_create, tmp_free; const char *a, *b, *c; - int data_entry; char *p, *start; a = b = c = NULL; @@ -349,8 +349,8 @@ __db_appname(dbenv, appname, dir, file, fdp, namep) * * DB_ENV APPNAME RESULT * ------------------------------------------- - * null DB_APP_TMP <tmp>/<create> - * set DB_APP_TMP DB_HOME/DB_TMP_DIR/<create> + * null DB_APP_TMP* <tmp>/<create> + * set DB_APP_TMP* DB_HOME/DB_TMP_DIR/<create> */ retry: switch (appname) { case DB_APP_NONE: @@ -431,7 +431,14 @@ done: len = (c == NULL ? 0 : strlen(c) + 1) + (file == NULL ? 0 : strlen(file) + 1); - if ((start = (char *)__db_malloc(len)) == NULL) { + /* + * Allocate space to hold the current path information, as well as any + * temporary space that we're going to need to create a temporary file + * name. + */ +#define DB_TRAIL "XXXXXX" + if ((start = + (char *)__db_malloc(len + sizeof(DB_TRAIL) + 10)) == NULL) { __db_err(dbenv, "%s", strerror(ENOMEM)); if (tmp_free) FREES(etmp.db_tmp_dir); @@ -460,14 +467,15 @@ done: len = FREES(etmp.db_tmp_dir); /* Create the file if so requested. 
*/ - if (tmp_create) { - ret = __db_tmp_open(dbenv, start, fdp); + if (tmp_create && + (ret = __db_tmp_open(dbenv, tmp_oflags, start, fdp)) != 0) { FREES(start); - } else { - *namep = start; - ret = 0; + return (ret); } - return (ret); + + if (namep != NULL) + *namep = start; + return (0); } /* @@ -478,7 +486,7 @@ static int __db_home(dbenv, db_home, flags) DB_ENV *dbenv; const char *db_home; - int flags; + u_int32_t flags; { const char *p; @@ -532,10 +540,12 @@ __db_parse(dbenv, s) return (ENOMEM); tp = local_s; - while ((name = strsep(&tp, " \t")) != NULL && *name == '\0'); + while ((name = strsep(&tp, " \t")) != NULL && *name == '\0') + ; if (name == NULL) goto illegal; - while ((value = strsep(&tp, " \t")) != NULL && *value == '\0'); + while ((value = strsep(&tp, " \t")) != NULL && *value == '\0') + ; if (value == NULL) { illegal: ret = EINVAL; __db_err(dbenv, "illegal name-value pair: %s", s); @@ -591,7 +601,7 @@ static char *sTempFolder; static int __db_tmp_dir(dbenv, flags) DB_ENV *dbenv; - int flags; + u_int32_t flags; { static const char * list[] = { /* Ordered: see db_appinit(3). */ "/var/tmp", @@ -671,49 +681,45 @@ __db_tmp_dir(dbenv, flags) * Create a temporary file. */ static int -__db_tmp_open(dbenv, dir, fdp) +__db_tmp_open(dbenv, flags, path, fdp) DB_ENV *dbenv; - char *dir; + u_int32_t flags; + char *path; int *fdp; { #ifdef HAVE_SIGFILLSET sigset_t set, oset; #endif u_long pid; - size_t len; - int isdir, ret; - char *trv, buf[MAXPATHLEN]; + int mode, isdir, ret; + const char *p; + char *trv; /* * Check the target directory; if you have six X's and it doesn't * exist, this runs for a *very* long time. 
*/ - if ((ret = __db_exists(dir, &isdir)) != 0) { - __db_err(dbenv, "%s: %s", dir, strerror(ret)); + if ((ret = __db_exists(path, &isdir)) != 0) { + __db_err(dbenv, "%s: %s", path, strerror(ret)); return (ret); } if (!isdir) { - __db_err(dbenv, "%s: %s", dir, strerror(EINVAL)); + __db_err(dbenv, "%s: %s", path, strerror(EINVAL)); return (EINVAL); } /* Build the path. */ -#define DB_TRAIL "/XXXXXX" - if ((len = strlen(dir)) + sizeof(DB_TRAIL) > sizeof(buf)) { - __db_err(dbenv, - "tmp_open: %s: %s", buf, strerror(ENAMETOOLONG)); - return (ENAMETOOLONG); - } - (void)strcpy(buf, dir); - (void)strcpy(buf + len, DB_TRAIL); - buf[len] = PATH_SEPARATOR[0]; /* WIN32 */ + for (trv = path; *trv != '\0'; ++trv) + ; + *trv = PATH_SEPARATOR[0]; + for (p = DB_TRAIL; (*++trv = *p) != '\0'; ++p) + ; /* * Replace the X's with the process ID. Pid should be a pid_t, * but we use unsigned long for portability. */ - for (pid = getpid(), - trv = buf + len + sizeof(DB_TRAIL) - 1; *--trv == 'X'; pid /= 10) + for (pid = getpid(); *--trv == 'X'; pid /= 10) switch (pid % 10) { case 0: *trv = '0'; break; case 1: *trv = '1'; break; @@ -728,30 +734,33 @@ __db_tmp_open(dbenv, dir, fdp) } ++trv; + /* Set up open flags and mode. */ + LF_SET(DB_CREATE | DB_EXCL); + mode = __db_omode("rw----"); + /* - * Try and open a file. We block every signal we can get our hands + * Try to open a file. We block every signal we can get our hands * on so that, if we're interrupted at the wrong time, the temporary * file isn't left around -- of course, if we drop core in-between * the calls we'll hang forever, but that's probably okay. 
;-} */ #ifdef HAVE_SIGFILLSET - (void)sigfillset(&set); + if (LF_ISSET(DB_TEMPORARY)) + (void)sigfillset(&set); #endif for (;;) { #ifdef HAVE_SIGFILLSET - (void)sigprocmask(SIG_BLOCK, &set, &oset); + if (LF_ISSET(DB_TEMPORARY)) + (void)sigprocmask(SIG_BLOCK, &set, &oset); #endif -#define DB_TEMPOPEN DB_CREATE | DB_EXCL | DB_TEMPORARY - if ((ret = __db_open(buf, - DB_TEMPOPEN, DB_TEMPOPEN, S_IRUSR | S_IWUSR, fdp)) == 0) { + ret = __db_open(path, flags, flags, mode, fdp); #ifdef HAVE_SIGFILLSET + if (LF_ISSET(DB_TEMPORARY)) (void)sigprocmask(SIG_SETMASK, &oset, NULL); #endif + if (ret == 0) return (0); - } -#ifdef HAVE_SIGFILLSET - (void)sigprocmask(SIG_SETMASK, &oset, NULL); -#endif + /* * XXX: * If we don't get an EEXIST error, then there's something @@ -761,7 +770,7 @@ __db_tmp_open(dbenv, dir, fdp) */ if (ret != EEXIST) { __db_err(dbenv, - "tmp_open: %s: %s", buf, strerror(ret)); + "tmp_open: %s: %s", path, strerror(ret)); return (ret); } diff --git a/db2/common/db_apprec.c b/db2/common/db_apprec.c index 7a42e13317..df707eafef 100644 --- a/db2/common/db_apprec.c +++ b/db2/common/db_apprec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ @@ -9,18 +9,17 @@ #ifndef lint static const char copyright[] = -"@(#) Copyright (c) 1997\n\ +"@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. All rights reserved.\n"; -static const char sccsid[] = "@(#)db_apprec.c 10.23 (Sleepycat) 1/17/98"; +static const char sccsid[] = "@(#)db_apprec.c 10.30 (Sleepycat) 5/3/98"; #endif #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> -#include <time.h> #include <string.h> -#include <stdlib.h> +#include <time.h> #endif #include "db_int.h" @@ -36,18 +35,19 @@ static const char sccsid[] = "@(#)db_apprec.c 10.23 (Sleepycat) 1/17/98"; * __db_apprec -- * Perform recovery. 
* - * PUBLIC: int __db_apprec __P((DB_ENV *, int)); + * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t)); */ int __db_apprec(dbenv, flags) DB_ENV *dbenv; - int flags; + u_int32_t flags; { DBT data; DB_LOG *lp; DB_LSN ckp_lsn, first_lsn, lsn; time_t now; - int is_thread, ret; + u_int32_t is_thread; + int ret; void *txninfo; lp = dbenv->lg_info; @@ -91,14 +91,14 @@ __db_apprec(dbenv, flags) if ((ret = log_get(lp, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) { /* * If we don't find a checkpoint, start from the beginning. - * If that fails, we're done. Note, we require that there - * be log records if we're performing recovery, and fail if - * there aren't. + * If that fails, we're done. Note, we do not require that + * there be log records if we're performing recovery. */ if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) { - __db_err(dbenv, "First log record not found"); if (ret == DB_NOTFOUND) - ret = EINVAL; + ret = 0; + else + __db_err(dbenv, "First log record not found"); goto out; } } @@ -134,14 +134,17 @@ __db_apprec(dbenv, flags) } else if ((ret = __log_findckp(lp, &first_lsn)) == DB_NOTFOUND) { /* - * If recovery was specified, there must be log files. - * If we don't find one, it's an error. (This should - * have been caught above, when a log_get() of DB_FIRST - * or DB_CHECKPOINT succeeded, but paranoia is good.) + * We don't require that log files exist if recovery + * was specified. */ - ret = EINVAL; + ret = 0; goto out; } + + if (dbenv->db_verbose) + __db_err(lp->dbenv, "Recovery starting from [%lu][%lu]", + (u_long)first_lsn.file, (u_long)first_lsn.offset); + for (ret = log_get(lp, &lsn, &data, DB_LAST); ret == 0 && log_compare(&lsn, &first_lsn) > 0; ret = log_get(lp, &lsn, &data, DB_PREV)) { @@ -175,21 +178,21 @@ __db_apprec(dbenv, flags) __log_close_files(lp); /* - * Now set the maximum transaction id, set the last checkpoint lsn, - * and the current time. Then take a checkpoint. 
+ * Now set the last checkpoint lsn and the current time, + * take a checkpoint, and reset the txnid. */ (void)time(&now); - dbenv->tx_info->region->last_txnid = ((__db_txnhead *)txninfo)->maxid; dbenv->tx_info->region->last_ckp = ckp_lsn; dbenv->tx_info->region->time_ckp = (u_int32_t)now; if ((ret = txn_checkpoint(dbenv->tx_info, 0, 0)) != 0) goto out; + dbenv->tx_info->region->last_txnid = TXN_MINIMUM; if (dbenv->db_verbose) { __db_err(lp->dbenv, "Recovery complete at %.24s", ctime(&now)); - __db_err(lp->dbenv, "%s %lu %s [%lu][%lu]", + __db_err(lp->dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction id", - (u_long)dbenv->tx_info->region->last_txnid, + ((DB_TXNHEAD *)txninfo)->maxid, "Recovery checkpoint", (u_long)dbenv->tx_info->region->last_ckp.file, (u_long)dbenv->tx_info->region->last_ckp.offset); diff --git a/db2/common/db_byteorder.c b/db2/common/db_byteorder.c index e486132073..cadf742851 100644 --- a/db2/common/db_byteorder.c +++ b/db2/common/db_byteorder.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_byteorder.c 10.4 (Sleepycat) 9/4/97"; +static const char sccsid[] = "@(#)db_byteorder.c 10.5 (Sleepycat) 4/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES diff --git a/db2/common/db_err.c b/db2/common/db_err.c index fc59aadbaf..98a414279e 100644 --- a/db2/common/db_err.c +++ b/db2/common/db_err.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. 
*/ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_err.c 10.21 (Sleepycat) 1/13/98"; +static const char sccsid[] = "@(#)db_err.c 10.25 (Sleepycat) 5/2/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -26,6 +26,7 @@ static const char sccsid[] = "@(#)db_err.c 10.21 (Sleepycat) 1/13/98"; #include "db_int.h" #include "common_ext.h" +static int __db_keyempty __P((const DB_ENV *)); static int __db_rdonly __P((const DB_ENV *, const char *)); /* @@ -81,11 +82,11 @@ __db_err(dbenv, fmt, va_alist) * appears before the assignment in the __db__panic() call. */ static int __db_ecursor __P((DB *, DB_TXN *, DBC **)); -static int __db_edel __P((DB *, DB_TXN *, DBT *, int)); +static int __db_edel __P((DB *, DB_TXN *, DBT *, u_int32_t)); static int __db_efd __P((DB *, int *)); -static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, int)); -static int __db_estat __P((DB *, void *, void *(*)(size_t), int)); -static int __db_esync __P((DB *, int)); +static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __db_estat __P((DB *, void *, void *(*)(size_t), u_int32_t)); +static int __db_esync __P((DB *, u_int32_t)); /* * __db_ecursor -- @@ -113,7 +114,7 @@ __db_edel(a, b, c, d) DB *a; DB_TXN *b; DBT *c; - int d; + u_int32_t d; { COMPQUIET(a, NULL); COMPQUIET(b, NULL); @@ -147,7 +148,7 @@ __db_egp(a, b, c, d, e) DB *a; DB_TXN *b; DBT *c, *d; - int e; + u_int32_t e; { COMPQUIET(a, NULL); COMPQUIET(b, NULL); @@ -167,7 +168,7 @@ __db_estat(a, b, c, d) DB *a; void *b; void *(*c) __P((size_t)); - int d; + u_int32_t d; { COMPQUIET(a, NULL); COMPQUIET(b, NULL); @@ -184,7 +185,7 @@ __db_estat(a, b, c, d) static int __db_esync(a, b) DB *a; - int b; + u_int32_t b; { COMPQUIET(a, NULL); COMPQUIET(b, 0); @@ -208,6 +209,10 @@ __db_panic(dbp) * * We should call mpool and have it shut down the file, so we get * other processes sharing this file as well. + * + * Chaos reigns within. + * Reflect, repent, and reboot. + * Order shall return. 
*/ dbp->cursor = __db_ecursor; dbp->del = __db_edel; @@ -235,13 +240,13 @@ __db_panic(dbp) * __db_fchk -- * General flags checking routine. * - * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, int, int)); + * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t)); */ int __db_fchk(dbenv, name, flags, ok_flags) DB_ENV *dbenv; const char *name; - int flags, ok_flags; + u_int32_t flags, ok_flags; { DB_CHECK_FLAGS(dbenv, name, flags, ok_flags); return (0); @@ -251,13 +256,14 @@ __db_fchk(dbenv, name, flags, ok_flags) * __db_fcchk -- * General combination flags checking routine. * - * PUBLIC: int __db_fcchk __P((DB_ENV *, const char *, int, int, int)); + * PUBLIC: int __db_fcchk + * PUBLIC: __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t)); */ int __db_fcchk(dbenv, name, flags, flag1, flag2) DB_ENV *dbenv; const char *name; - int flags, flag1, flag2; + u_int32_t flags, flag1, flag2; { DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2); return (0); @@ -267,12 +273,13 @@ __db_fcchk(dbenv, name, flags, flag1, flag2) * __db_cdelchk -- * Common cursor delete argument checking routine. * - * PUBLIC: int __db_cdelchk __P((const DB *, int, int, int)); + * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int)); */ int __db_cdelchk(dbp, flags, isrdonly, isvalid) const DB *dbp; - int flags, isrdonly, isvalid; + u_int32_t flags; + int isrdonly, isvalid; { /* Check for changes to a read-only tree. */ if (isrdonly) @@ -292,17 +299,18 @@ __db_cdelchk(dbp, flags, isrdonly, isvalid) * __db_cgetchk -- * Common cursor get argument checking routine. 
* - * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, int, int)); + * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int)); */ int __db_cgetchk(dbp, key, data, flags, isvalid) const DB *dbp; DBT *key, *data; - int flags, isvalid; + u_int32_t flags; + int isvalid; { - int check_key; + int key_einval, key_flags; - check_key = 0; + key_flags = key_einval = 0; /* Check for invalid dbc->c_get() function flags. */ switch (flags) { @@ -311,10 +319,13 @@ __db_cgetchk(dbp, key, data, flags, isvalid) case DB_LAST: case DB_NEXT: case DB_PREV: + key_flags = 1; + break; case DB_SET_RANGE: - check_key = 1; + key_einval = key_flags = 1; break; case DB_SET: + key_einval = 1; break; case DB_GET_RECNO: if (!F_ISSET(dbp, DB_BT_RECNUM)) @@ -323,14 +334,14 @@ __db_cgetchk(dbp, key, data, flags, isvalid) case DB_SET_RECNO: if (!F_ISSET(dbp, DB_BT_RECNUM)) goto err; - check_key = 1; + key_einval = key_flags = 1; break; default: err: return (__db_ferr(dbp->dbenv, "c_get", 0)); } /* Check for invalid key/data flags. */ - if (check_key) + if (key_flags) DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, @@ -340,11 +351,15 @@ err: return (__db_ferr(dbp->dbenv, "c_get", 0)); if (F_ISSET(dbp, DB_AM_THREAD)) { if (!F_ISSET(data, DB_DBT_USERMEM | DB_DBT_MALLOC)) return (__db_ferr(dbp->dbenv, "threaded data", 1)); - if (check_key && + if (key_flags && !F_ISSET(key, DB_DBT_USERMEM | DB_DBT_MALLOC)) return (__db_ferr(dbp->dbenv, "threaded key", 1)); } + /* Check for missing keys. */ + if (key_einval && (key->data == NULL || key->size == 0)) + return (__db_keyempty(dbp->dbenv)); + /* * The cursor must be initialized for DB_CURRENT, return -1 for an * invalid cursor, otherwise 0. @@ -357,23 +372,24 @@ err: return (__db_ferr(dbp->dbenv, "c_get", 0)); * Common cursor put argument checking routine. 
* * PUBLIC: int __db_cputchk __P((const DB *, - * PUBLIC: const DBT *, DBT *, int, int, int)); + * PUBLIC: const DBT *, DBT *, u_int32_t, int, int)); */ int __db_cputchk(dbp, key, data, flags, isrdonly, isvalid) const DB *dbp; const DBT *key; DBT *data; - int flags, isrdonly, isvalid; + u_int32_t flags; + int isrdonly, isvalid; { - int check_key; + int key_einval, key_flags; /* Check for changes to a read-only tree. */ if (isrdonly) return (__db_rdonly(dbp->dbenv, "c_put")); /* Check for invalid dbc->c_put() function flags. */ - check_key = 0; + key_einval = key_flags = 0; switch (flags) { case DB_AFTER: case DB_BEFORE: @@ -388,19 +404,23 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid) case DB_KEYLAST: if (dbp->type == DB_RECNO) goto err; - check_key = 1; + key_einval = key_flags = 1; break; default: err: return (__db_ferr(dbp->dbenv, "c_put", 0)); } /* Check for invalid key/data flags. */ - if (check_key) + if (key_flags) DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); + /* Check for missing keys. */ + if (key_einval && (key->data == NULL || key->size == 0)) + return (__db_keyempty(dbp->dbenv)); + /* * The cursor must be initialized for anything other than DB_KEYFIRST * and DB_KEYLAST, return -1 for an invalid cursor, otherwise 0. @@ -413,12 +433,14 @@ err: return (__db_ferr(dbp->dbenv, "c_put", 0)); * __db_delchk -- * Common delete argument checking routine. * - * PUBLIC: int __db_delchk __P((const DB *, int, int)); + * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); */ int -__db_delchk(dbp, flags, isrdonly) +__db_delchk(dbp, key, flags, isrdonly) const DB *dbp; - int flags, isrdonly; + DBT *key; + u_int32_t flags; + int isrdonly; { /* Check for changes to a read-only tree. */ if (isrdonly) @@ -427,6 +449,10 @@ __db_delchk(dbp, flags, isrdonly) /* Check for invalid db->del() function flags. 
*/ DB_CHECK_FLAGS(dbp->dbenv, "delete", flags, 0); + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + return (0); } @@ -434,14 +460,14 @@ __db_delchk(dbp, flags, isrdonly) * __db_getchk -- * Common get argument checking routine. * - * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, int)); + * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t)); */ int __db_getchk(dbp, key, data, flags) const DB *dbp; const DBT *key; DBT *data; - int flags; + u_int32_t flags; { /* Check for invalid db->get() function flags. */ DB_CHECK_FLAGS(dbp->dbenv, @@ -457,6 +483,10 @@ __db_getchk(dbp, key, data, flags) !F_ISSET(data, DB_DBT_MALLOC | DB_DBT_USERMEM)) return (__db_ferr(dbp->dbenv, "threaded data", 1)); + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + return (0); } @@ -464,14 +494,16 @@ __db_getchk(dbp, key, data, flags) * __db_putchk -- * Common put argument checking routine. * - * PUBLIC: int __db_putchk __P((const DB *, DBT *, const DBT *, int, int, int)); + * PUBLIC: int __db_putchk + * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); */ int __db_putchk(dbp, key, data, flags, isrdonly, isdup) const DB *dbp; DBT *key; const DBT *data; - int flags, isrdonly, isdup; + u_int32_t flags; + int isrdonly, isdup; { /* Check for changes to a read-only tree. */ if (isrdonly) @@ -488,12 +520,17 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup) DB_CHECK_FCOMBO(dbp->dbenv, "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM); + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + /* Check for partial puts in the presence of duplicates. 
*/ if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) { __db_err(dbp->dbenv, "a partial put in the presence of duplicates requires a cursor operation"); return (EINVAL); } + return (0); } @@ -501,12 +538,12 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup) * __db_statchk -- * Common stat argument checking routine. * - * PUBLIC: int __db_statchk __P((const DB *, int)); + * PUBLIC: int __db_statchk __P((const DB *, u_int32_t)); */ int __db_statchk(dbp, flags) const DB *dbp; - int flags; + u_int32_t flags; { /* Check for invalid db->stat() function flags. */ DB_CHECK_FLAGS(dbp->dbenv, "stat", flags, DB_RECORDCOUNT); @@ -522,12 +559,12 @@ __db_statchk(dbp, flags) * __db_syncchk -- * Common sync argument checking routine. * - * PUBLIC: int __db_syncchk __P((const DB *, int)); + * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t)); */ int __db_syncchk(dbp, flags) const DB *dbp; - int flags; + u_int32_t flags; { /* Check for invalid db->sync() function flags. */ DB_CHECK_FLAGS(dbp->dbenv, "sync", flags, 0); @@ -542,13 +579,13 @@ __db_syncchk(dbp, flags) * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int)); */ int -__db_ferr(dbenv, name, combo) +__db_ferr(dbenv, name, iscombo) const DB_ENV *dbenv; const char *name; - int combo; + int iscombo; { __db_err(dbenv, "illegal flag %sspecified to %s", - combo ? "combination " : "", name); + iscombo ? "combination " : "", name); return (EINVAL); } @@ -564,3 +601,15 @@ __db_rdonly(dbenv, name) __db_err(dbenv, "%s: attempt to modify a read-only tree", name); return (EACCES); } + +/* + * __db_keyempty -- + * Common missing or empty key value message. + */ +static int +__db_keyempty(dbenv) + const DB_ENV *dbenv; +{ + __db_err(dbenv, "missing or empty key value specified"); + return (EINVAL); +} diff --git a/db2/common/db_log2.c b/db2/common/db_log2.c index 9af01116f6..d6b14f540b 100644 --- a/db2/common/db_log2.c +++ b/db2/common/db_log2.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ /* @@ -43,7 +43,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_log2.c 10.3 (Sleepycat) 6/21/97"; +static const char sccsid[] = "@(#)db_log2.c 10.5 (Sleepycat) 4/26/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -63,6 +63,7 @@ __db_log2(num) u_int32_t i, limit; limit = 1; - for (i = 0; limit < num; limit = limit << 1, i++); + for (i = 0; limit < num; limit = limit << 1, i++) + ; return (i); } diff --git a/db2/common/db_region.c b/db2/common/db_region.c index 02d939e3e6..6d15f7f092 100644 --- a/db2/common/db_region.c +++ b/db2/common/db_region.c @@ -1,59 +1,20 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ -/* - * Copyright (c) 1995, 1996 - * The President and Fellows of Harvard University. All rights reserved. - * - * This code is derived from software contributed to Harvard by - * Margo Seltzer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_region.c 10.21 (Sleepycat) 1/16/98"; +static const char sccsid[] = "@(#)db_region.c 10.46 (Sleepycat) 5/26/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> -#include <sys/stat.h> #include <errno.h> -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> #include <string.h> #include <unistd.h> #endif @@ -61,548 +22,840 @@ static const char sccsid[] = "@(#)db_region.c 10.21 (Sleepycat) 1/16/98"; #include "db_int.h" #include "common_ext.h" -static int __db_rmap __P((DB_ENV *, int, size_t, void *)); +static int __db_growregion __P((REGINFO *, size_t)); /* - * __db_rcreate -- - * - * Common interface for creating a shared region. Handles synchronization - * across multiple processes. - * - * The dbenv contains the environment for this process, including naming - * information. The path argument represents the parameters passed to - * the open routines and may be either a file or a directory. 
If it is - * a directory, it must exist. If it is a file, then the file parameter - * must be NULL, otherwise, file is the name to be created inside the - * directory path. - * - * The function returns a pointer to the shared region that has been mapped - * into memory, NULL on error. + * __db_rattach -- + * Optionally create and attach to a shared memory region. * - * PUBLIC: int __db_rcreate __P((DB_ENV *, APPNAME, - * PUBLIC: const char *, const char *, int, size_t, int, int *, void *)); + * PUBLIC: int __db_rattach __P((REGINFO *)); */ int -__db_rcreate(dbenv, appname, path, file, mode, size, oflags, fdp, retp) - DB_ENV *dbenv; - APPNAME appname; - const char *path, *file; - int mode, oflags, *fdp; - size_t size; - void *retp; +__db_rattach(infop) + REGINFO *infop; { - RLAYOUT *rp; - int fd, ret; - char *name; + RLAYOUT *rlp, rl; + size_t grow_region, size; + ssize_t nr, nw; + u_int32_t flags, mbytes, bytes; + u_int8_t *p; + int malloc_possible, ret, retry_cnt; + + grow_region = 0; + malloc_possible = 1; + ret = retry_cnt = 0; + + /* Round off the requested size to the next page boundary. */ + DB_ROUNDOFF(infop->size); + + /* Some architectures have hard limits on the maximum region size. */ +#ifdef DB_REGIONSIZE_MAX + if (infop->size > DB_REGIONSIZE_MAX) { + __db_err(infop->dbenv, "__db_rattach: cache size too large"); + return (EINVAL); + } +#endif - fd = -1; - rp = NULL; + /* Intialize the return information in the REGINFO structure. */ +loop: infop->addr = NULL; + infop->fd = -1; + infop->segid = INVALID_SEGID; + if (infop->name != NULL) { + FREES(infop->name); + infop->name = NULL; + } + F_CLR(infop, REGION_CANGROW | REGION_CREATED); +#ifndef HAVE_SPINLOCKS /* - * Get the filename -- note, if it's a temporary file, it will - * be created by the underlying temporary file creation code, - * so we have to check the file descriptor to be sure it's an - * error. 
+ * XXX + * Lacking spinlocks, we must have a file descriptor for fcntl(2) + * locking, which implies using mmap(2) to map in a regular file. + * (Theoretically, we could probably get a file descriptor to lock + * other types of shared regions, but I don't see any reason to + * bother.) */ - if ((ret = __db_appname(dbenv, appname, path, file, &fd, &name)) != 0) - return (ret); + malloc_possible = 0; +#endif +#ifdef __hppa /* - * Now open the file. We need to make sure that multiple processes - * that attempt to create the region at the same time are properly - * ordered, so we open it DB_EXCL and DB_CREATE so two simultaneous - * attempts to create the region will return failure in one of the - * attempts. + * XXX + * HP-UX won't permit mutexes to live in anything but shared memory. + * Instantiate a shared region file on that architecture, regardless. */ - oflags |= DB_CREATE | DB_EXCL; - if (fd == -1 && - (ret = __db_open(name, oflags, oflags, mode, &fd)) != 0) { - if (ret != EEXIST) - __db_err(dbenv, - "region create: %s: %s", name, strerror(ret)); - goto err; + malloc_possible = 0; +#endif + /* + * If a region is truly private, malloc the memory. That's faster + * than either anonymous memory or a shared file. + */ + if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) { + if ((infop->addr = __db_malloc(infop->size)) == NULL) + return (ENOMEM); + + /* + * It's sometimes significantly faster to page-fault in all + * of the region's pages before we run the application, as + * we can see fairly nasty side-effects when we page-fault + * while holding various locks, i.e., the lock takes a long + * time, and other threads convoy behind the lock holder. + */ + if (DB_GLOBAL(db_region_init)) + for (p = infop->addr; + p < (u_int8_t *)infop->addr + infop->size; + p += DB_VMPAGESIZE) + p[0] = '\0'; + + F_SET(infop, REGION_CREATED | REGION_MALLOC); + goto region_init; } - *fdp = fd; - /* Grow the region to the correct size. 
*/ - if ((ret = __db_rgrow(dbenv, fd, size)) != 0) - goto err; + /* + * Get the name of the region (creating the file if a temporary file + * is being used). The dbenv contains the current DB environment, + * including naming information. The path argument may be a file or + * a directory. If path is a directory, it must exist and file is the + * file name to be created inside the directory. If path is a file, + * then file must be NULL. + */ + if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path, + infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0) + return (ret); + if (infop->fd != -1) + F_SET(infop, REGION_CREATED); - /* Map the region in. */ - if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0) - goto err; + /* + * Try to create the file, if we have authority. We have to make sure + * that multiple threads/processes attempting to simultaneously create + * the region are properly ordered, so we open it using DB_CREATE and + * DB_EXCL, so two attempts to create the region will return failure in + * one. + */ + if (infop->fd == -1 && infop->dbflags & DB_CREATE) { + flags = infop->dbflags; + LF_SET(DB_EXCL); + if ((ret = __db_open(infop->name, + flags, flags, infop->mode, &infop->fd)) == 0) + F_SET(infop, REGION_CREATED); + else + if (ret != EEXIST) + goto errmsg; + } - /* Initialize the region. */ - if ((ret = __db_rinit(dbenv, rp, fd, size, 1)) != 0) - goto err; + /* If we couldn't create the file, try and open it. */ + if (infop->fd == -1) { + flags = infop->dbflags; + LF_CLR(DB_CREATE | DB_EXCL); + if ((ret = __db_open(infop->name, + flags, flags, infop->mode, &infop->fd)) != 0) + goto errmsg; + } - if (name != NULL) - FREES(name); + /* + * There are three cases we support: + * 1. Named anonymous memory (shmget(2)). + * 2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS). + * 3. Memory backed by a regular file (mmap(2)). 
+ * + * We instantiate a backing file in all cases, which contains at least + * the RLAYOUT structure, and in case #3, contains the actual region. + * This is necessary for a couple of reasons: + * + * First, the mpool region uses temporary files to name regions, and + * since you may have multiple regions in the same directory, we need + * a filesystem name to ensure that they don't collide. + * + * Second, applications are allowed to forcibly remove regions, even + * if they don't know anything about them other than the name. If a + * region is backed by anonymous memory, there has to be some way for + * the application to find out that information, and, in some cases, + * determine ID information for the anonymous memory. + */ + if (F_ISSET(infop, REGION_CREATED)) { + /* + * If we're using anonymous memory to back this region, set + * the flag. + */ + if (DB_GLOBAL(db_region_anon)) + F_SET(infop, REGION_ANONYMOUS); - *(void **)retp = rp; - return (0); + /* + * If we're using a regular file to back a region we created, + * grow it to the specified size. + */ + if (!DB_GLOBAL(db_region_anon) && + (ret = __db_growregion(infop, infop->size)) != 0) + goto err; + } else { + /* + * If we're joining a region, figure out what it looks like. + * + * XXX + * We have to figure out if the file is a regular file backing + * a region that we want to map into our address space, or a + * file with the information we need to find a shared anonymous + * region that we want to map into our address space. + * + * All this noise is because some systems don't have a coherent + * VM and buffer cache, and worse, if you mix operations on the + * VM and buffer cache, half the time you hang the system. + * + * There are two possibilities. If the file is the size of an + * RLAYOUT structure, then we know that the real region is in + * shared memory, because otherwise it would be bigger. 
(As + * the RLAYOUT structure size is smaller than a disk sector, + * the only way it can be this size is if deliberately written + * that way.) In which case, retrieve the information we need + * from the RLAYOUT structure and use it to acquire the shared + * memory. + * + * If the file is larger than an RLAYOUT structure, then + * the file is backing the shared memory region, and we use + * the current size of the file without reading any information + * from the file itself so that we don't confuse the VM. + * + * And yes, this makes me want to take somebody and kill them, + * but I can't think of any other solution. + */ + if ((ret = __db_ioinfo(infop->name, + infop->fd, &mbytes, &bytes, NULL)) != 0) + goto errmsg; + size = mbytes * MEGABYTE + bytes; + + if (size <= sizeof(RLAYOUT)) { + /* + * If the size is too small, the read fails or the + * valid flag is incorrect, assume it's because the + * RLAYOUT information hasn't been written out yet, + * and retry. + */ + if (size < sizeof(RLAYOUT)) + goto retry; + if ((ret = + __db_read(infop->fd, &rl, sizeof(rl), &nr)) != 0) + goto retry; + if (rl.valid != DB_REGIONMAGIC) + goto retry; + + /* Copy the size, memory id and characteristics. */ + size = rl.size; + infop->segid = rl.segid; + if (F_ISSET(&rl, REGION_ANONYMOUS)) + F_SET(infop, REGION_ANONYMOUS); + } -err: if (fd != -1) { - if (rp != NULL) - (void)__db_unmap(rp, rp->size); - (void)__db_unlink(name); - (void)__db_close(fd); + /* + * If the region is larger than we think, that's okay, use the + * current size. If it's smaller than we think, and we were + * just using the default size, that's okay, use the current + * size. If it's smaller than we think and we really care, + * save the size and we'll catch that further down -- we can't + * correct it here because we have to have a lock to grow the + * region. 
+ */ + if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF)) + grow_region = infop->size; + infop->size = size; } - if (name != NULL) - FREES(name); - return (ret); -} - -/* - * __db_rinit -- - * Initialize the region. - * - * PUBLIC: int __db_rinit __P((DB_ENV *, RLAYOUT *, int, size_t, int)); - */ -int -__db_rinit(dbenv, rp, fd, size, lock_region) - DB_ENV *dbenv; - RLAYOUT *rp; - size_t size; - int fd, lock_region; -{ - int ret; - COMPQUIET(dbenv, NULL); + /* + * Map the region into our address space. If we're creating it, the + * underlying routines will make it the right size. + * + * There are at least two cases where we can "reasonably" fail when + * we attempt to map in the region. On Windows/95, closing the last + * reference to a region causes it to be zeroed out. On UNIX, when + * using the shmget(2) interfaces, the region will no longer exist + * if the system was rebooted. In these cases, the underlying map call + * returns EAGAIN, and we *remove* our file and try again. There are + * obvious races in doing this, but it should eventually settle down + * to a winner and then things should proceed normally. + */ + if ((ret = __db_mapregion(infop->name, infop)) != 0) + if (ret == EAGAIN) { + /* + * Pretend we created the region even if we didn't so + * that our error processing unlinks it. + */ + F_SET(infop, REGION_CREATED); + ret = 0; + goto retry; + } else + goto err; +region_init: /* - * Initialize the common information. + * Initialize the common region information. * * !!! * We have to order the region creates so that two processes don't try - * to simultaneously create the region and so that processes that are - * joining the region never see inconsistent data. We'd like to play - * file permissions games, but we can't because WNT filesystems won't - * open a file mode 0. - * - * If the lock_region flag is set, the process creating the region - * acquires the lock before the setting the version number. 
Any - * process joining the region checks the version number before - * attempting to acquire the lock. (The lock_region flag may not be - * set -- the mpool code sometimes malloc's private regions but still - * needs to initialize them, specifically, the mutex for threads.) + * to simultaneously create the region. This is handled by using the + * DB_CREATE and DB_EXCL flags when we create the "backing" region file. * - * We have to check the version number first, because if the version - * number has not been written, it's possible that the mutex has not - * been initialized in which case an attempt to get it could lead to - * random behavior. If the version number isn't there (the file size - * is too small) or it's 0, we know that the region is being created. - * - * We also make sure to check the return of __db_mutex_lock() here, - * even though we don't usually check elsewhere. This is the first - * lock we attempt to acquire, and if it fails we have to know. (It - * can fail -- SunOS, using fcntl(2) for locking, with an in-memory - * filesystem specified as the database home.) + * We also have to order region joins so that processes joining regions + * never see inconsistent data. We'd like to play permissions games + * with the backing file, but we can't because WNT filesystems won't + * open a file mode 0. */ - __db_mutex_init(&rp->lock, MUTEX_LOCK_OFFSET(rp, &rp->lock)); - if (lock_region && (ret = __db_mutex_lock(&rp->lock, fd)) != 0) - return (ret); - - rp->refcnt = 1; - rp->size = size; - rp->flags = 0; - db_version(&rp->majver, &rp->minver, &rp->patch); + rlp = (RLAYOUT *)infop->addr; + if (F_ISSET(infop, REGION_CREATED)) { + /* + * The process creating the region acquires a lock before it + * sets the valid flag. Any processes joining the region will + * check the valid flag before acquiring the lock. + * + * Check the return of __db_mutex_init() and __db_mutex_lock(), + * even though we don't usually check elsewhere. 
This is the + * first lock we initialize and acquire, and we have to know if + * it fails. (It CAN fail, e.g., SunOS, when using fcntl(2) + * for locking, with an in-memory filesystem specified as the + * database home.) + */ + if ((ret = __db_mutex_init(&rlp->lock, + MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 || + (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0) + goto err; - return (0); -} + /* Initialize the remaining region information. */ + rlp->refcnt = 1; + rlp->size = infop->size; + db_version(&rlp->majver, &rlp->minver, &rlp->patch); + rlp->segid = infop->segid; + rlp->flags = 0; + if (F_ISSET(infop, REGION_ANONYMOUS)) + F_SET(rlp, REGION_ANONYMOUS); -/* - * __db_ropen -- - * Construct the name of a file, open it and map it in. - * - * PUBLIC: int __db_ropen __P((DB_ENV *, - * PUBLIC: APPNAME, const char *, const char *, int, int *, void *)); - */ -int -__db_ropen(dbenv, appname, path, file, flags, fdp, retp) - DB_ENV *dbenv; - APPNAME appname; - const char *path, *file; - int flags, *fdp; - void *retp; -{ - RLAYOUT *rp; - size_t size; - u_int32_t mbytes, bytes; - int fd, ret; - char *name; + /* + * Fill in the valid field last -- use a magic number, memory + * may not be zero-filled, and we want to minimize the chance + * for collision. + */ + rlp->valid = DB_REGIONMAGIC; - fd = -1; - rp = NULL; + /* + * If the region is anonymous, write the RLAYOUT information + * into the backing file so that future region join and unlink + * calls can find it. + * + * XXX + * We MUST do the seek before we do the write. On Win95, while + * closing the last reference to an anonymous shared region + * doesn't discard the region, it does zero it out. So, the + * REGION_CREATED may be set, but the file may have already + * been written and the file descriptor may be at the end of + * the file. 
+ */ + if (F_ISSET(infop, REGION_ANONYMOUS)) { + if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, 0)) != 0) + goto err; + if ((ret = + __db_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0) + goto err; + } + } else { + /* + * Check the valid flag to ensure the region is initialized. + * If the valid flag has not been set, the mutex may not have + * been initialized, and an attempt to get it could lead to + * random behavior. + */ + if (rlp->valid != DB_REGIONMAGIC) + goto retry; - /* Get the filename. */ - if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0) - return (ret); + /* Get the region lock. */ + (void)__db_mutex_lock(&rlp->lock, infop->fd); - /* Open the file. */ - if ((ret = __db_open(name, flags, DB_MUTEXDEBUG, 0, &fd)) != 0) { - __db_err(dbenv, "region open: %s: %s", name, strerror(ret)); - goto err2; - } + /* + * We now own the region. There are a couple of things that + * may have gone wrong, however. + * + * Problem #1: while we were waiting for the lock, the region + * was deleted. Detected by re-checking the valid flag, since + * it's cleared by the delete region routines. + */ + if (rlp->valid != DB_REGIONMAGIC) { + (void)__db_mutex_unlock(&rlp->lock, infop->fd); + goto retry; + } - *fdp = fd; + /* + * Problem #2: We want a bigger region than has previously been + * created. Detected by checking if the region is smaller than + * our caller requested. If it is, we grow the region, (which + * does the detach and re-attach for us). + */ + if (grow_region != 0 && + (ret = __db_rgrow(infop, grow_region)) != 0) { + (void)__db_mutex_unlock(&rlp->lock, infop->fd); + goto err; + } - /* - * Map the file in. We have to do things in a strange order so that - * we don't get into a situation where the file was just created and - * isn't yet initialized. See the comment in __db_rcreate() above. - * - * XXX - * We'd like to test to see if the file is too big to mmap. 
Since we - * don't know what size or type off_t's or size_t's are, or the largest - * unsigned integral type is, or what random insanity the local C - * compiler will perpetrate, doing the comparison in a portable way is - * flatly impossible. Hope that mmap fails if the file is too large. - * - */ - if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) { - __db_err(dbenv, "%s: %s", name, strerror(ret)); - goto err2; - } - size = mbytes * MEGABYTE + bytes; + /* + * Problem #3: when we checked the size of the file, it was + * still growing as part of creation. Detected by the fact + * that infop->size isn't the same size as the region. + */ + if (infop->size != rlp->size) { + (void)__db_mutex_unlock(&rlp->lock, infop->fd); + goto retry; + } - /* Check to make sure the first block has been written. */ - if (size < sizeof(RLAYOUT)) { - ret = EAGAIN; - goto err2; + /* Increment the reference count. */ + ++rlp->refcnt; } - /* Map in whatever is there. */ - if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0) - goto err2; + /* Return the region in a locked condition. */ - /* - * Check to make sure the region has been initialized. We can't just - * grab the lock because the lock may not have been initialized yet. - */ - if (rp->majver == 0) { - ret = EAGAIN; - goto err2; - } - - /* Get the region lock. */ - if (!LF_ISSET(DB_MUTEXDEBUG)) - (void)__db_mutex_lock(&rp->lock, fd); + if (0) { +errmsg: __db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret)); - /* - * The file may have been half-written if we were descheduled between - * getting the size of the file and checking the major version. Check - * to make sure we got the entire file. - */ - if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) { - __db_err(dbenv, "%s: %s", name, strerror(ret)); - goto err1; - } - if (size != mbytes * MEGABYTE + bytes) { - ret = EAGAIN; - goto err1; - } +err: +retry: /* Discard the region. 
*/ + if (infop->addr != NULL) { + (void)__db_unmapregion(infop); + infop->addr = NULL; + } - /* The file may have just been deleted. */ - if (F_ISSET(rp, DB_R_DELETED)) { - ret = EAGAIN; - goto err1; - } + /* Discard the backing file. */ + if (infop->fd != -1) { + (void)__db_close(infop->fd); + infop->fd = -1; - /* Increment the reference count. */ - ++rp->refcnt; + if (F_ISSET(infop, REGION_CREATED)) + (void)__db_unlink(infop->name); + } - /* Release the lock. */ - if (!LF_ISSET(DB_MUTEXDEBUG)) - (void)__db_mutex_unlock(&rp->lock, fd); + /* Discard the name. */ + if (infop->name != NULL) { + FREES(infop->name); + infop->name = NULL; + } - FREES(name); + /* + * If we had a temporary error, wait a few seconds and + * try again. + */ + if (ret == 0) { + if (++retry_cnt <= 3) { + __db_sleep(retry_cnt * 2, 0); + goto loop; + } + ret = EAGAIN; + } + } - *(void **)retp = rp; - return (0); + /* + * XXX + * HP-UX won't permit mutexes to live in anything but shared memory. + * Instantiate a shared region file on that architecture, regardless. + * + * XXX + * There's a problem in cleaning this up on application exit, or on + * application failure. If an application opens a database without + * an environment, we create a temporary backing mpool region for it. + * That region is marked REGION_PRIVATE, but as HP-UX won't permit + * mutexes to live in anything but shared memory, we instantiate a + * real file plus a memory region of some form. If the application + * crashes, the necessary information to delete the backing file and + * any system region (e.g., the shmget(2) segment ID) is no longer + * available. We can't completely fix the problem, but we try. + * + * The underlying UNIX __db_mapregion() code preferentially uses the + * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions + * that are marked REGION_PRIVATE. This means that we normally aren't + * holding any system resources when we get here, in which case we can + * delete the backing file. 
This results in a short race, from the + * __db_open() call above to here. + * + * If, for some reason, we are holding system resources when we get + * here, we don't have any choice -- we can't delete the backing file + * because we may need it to detach from the resources. Set the + * REGION_LASTDETACH flag, so that we do all necessary cleanup when + * the application closes the region. + */ + if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC)) + if (F_ISSET(infop, REGION_HOLDINGSYS)) + F_SET(infop, REGION_LASTDETACH); + else { + F_SET(infop, REGION_REMOVED); + F_CLR(infop, REGION_CANGROW); + + (void)__db_close(infop->fd); + (void)__db_unlink(infop->name); + } -err1: if (!LF_ISSET(DB_MUTEXDEBUG)) - (void)__db_mutex_unlock(&rp->lock, fd); -err2: if (rp != NULL) - (void)__db_unmap(rp, rp->size); - if (fd != -1) - (void)__db_close(fd); - FREES(name); return (ret); } /* - * __db_rclose -- - * Close a shared memory region. + * __db_rdetach -- + * De-attach from a shared memory region. * - * PUBLIC: int __db_rclose __P((DB_ENV *, int, void *)); + * PUBLIC: int __db_rdetach __P((REGINFO *)); */ int -__db_rclose(dbenv, fd, ptr) - DB_ENV *dbenv; - int fd; - void *ptr; +__db_rdetach(infop) + REGINFO *infop; { - RLAYOUT *rp; - int ret, t_ret; - const char *fail; + RLAYOUT *rlp; + int detach, ret, t_ret; - rp = ptr; - fail = NULL; + ret = 0; - /* Get the lock. */ - if ((ret = __db_mutex_lock(&rp->lock, fd)) != 0) { - fail = "lock get"; - goto err; + /* + * If the region was removed when it was created, no further action + * is required. + */ + if (F_ISSET(infop, REGION_REMOVED)) + goto done; + /* + * If the region was created in memory returned by malloc, the only + * action required is freeing the memory. + */ + if (F_ISSET(infop, REGION_MALLOC)) { + __db_free(infop->addr); + goto done; } + /* Otherwise, attach to the region and optionally delete it. */ + rlp = infop->addr; + + /* Get the lock. 
*/ + (void)__db_mutex_lock(&rlp->lock, infop->fd); + /* Decrement the reference count. */ - --rp->refcnt; + if (rlp->refcnt == 0) + __db_err(infop->dbenv, + "region rdetach: reference count went to zero!"); + else + --rlp->refcnt; + + /* + * If we're going to remove the region, clear the valid flag so + * that any region join that's blocked waiting for us will know + * what happened. + */ + detach = 0; + if (F_ISSET(infop, REGION_LASTDETACH)) + if (rlp->refcnt == 0) { + detach = 1; + rlp->valid = 0; + } else + ret = EBUSY; /* Release the lock. */ - if ((t_ret = __db_mutex_unlock(&rp->lock, fd)) != 0 && fail == NULL) { - ret = t_ret; - fail = "lock release"; - } + (void)__db_mutex_unlock(&rlp->lock, infop->fd); - /* Discard the region. */ - if ((t_ret = __db_unmap(ptr, rp->size)) != 0 && fail == NULL) { - ret = t_ret; - fail = "munmap"; - } + /* Close the backing file descriptor. */ + (void)__db_close(infop->fd); + infop->fd = -1; - if ((t_ret = __db_close(fd)) != 0 && fail == NULL) { + /* Discard our mapping of the region. */ + if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0) ret = t_ret; - fail = "close"; + + /* Discard the region itself. */ + if (detach) { + if ((t_ret = + __db_unlinkregion(infop->name, infop) != 0) && ret == 0) + ret = t_ret; + if ((t_ret = __db_unlink(infop->name) != 0) && ret == 0) + ret = t_ret; } - if (fail == NULL) - return (0); +done: /* Discard the name. */ + if (infop->name != NULL) { + FREES(infop->name); + infop->name = NULL; + } -err: __db_err(dbenv, "region detach: %s: %s", fail, strerror(ret)); return (ret); } /* * __db_runlink -- - * Remove a shared memory region. + * Remove a region. 
* - * PUBLIC: int __db_runlink __P((DB_ENV *, - * PUBLIC: APPNAME, const char *, const char *, int)); + * PUBLIC: int __db_runlink __P((REGINFO *, int)); */ int -__db_runlink(dbenv, appname, path, file, force) - DB_ENV *dbenv; - APPNAME appname; - const char *path, *file; +__db_runlink(infop, force) + REGINFO *infop; int force; { - RLAYOUT *rp; - int cnt, fd, ret, t_ret; + RLAYOUT rl, *rlp; + size_t size; + ssize_t nr; + u_int32_t mbytes, bytes; + int fd, ret, t_ret; char *name; - rp = NULL; + /* + * XXX + * We assume that we've created a new REGINFO structure for this + * call, not used one that was already initialized. Regardless, + * if anyone is planning to use it after we're done, they're going + * to be sorely disappointed. + * + * If force isn't set, we attach to the region, set a flag to delete + * the region on last close, and let the region delete code do the + * work. + */ + if (!force) { + if ((ret = __db_rattach(infop)) != 0) + return (ret); - /* Get the filename. */ - if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0) - return (ret); + rlp = (RLAYOUT *)infop->addr; + (void)__db_mutex_unlock(&rlp->lock, infop->fd); - /* If the file doesn't exist, we're done. */ - if (__db_exists(name, NULL)) - goto done; + F_SET(infop, REGION_LASTDETACH); + + return (__db_rdetach(infop)); + } /* - * If we're called with a force flag, try and unlink the file. This - * may not succeed if the file is currently open, but there's nothing - * we can do about that. There is a race condition between the check - * for existence above and the actual unlink. If someone else snuck - * in and removed it before we do the remove, then we might get an - * ENOENT error. If we get the ENOENT, we treat it as success, just - * as we do above. + * Otherwise, we don't want to attach to the region. We may have been + * called to clean up if a process died leaving a region locked and/or + * corrupted, which could cause the attach to hang. 
*/ - if (force) { - if ((ret = __db_unlink(name)) != 0 && ret != ENOENT) - goto err1; - goto done; + if ((ret = __db_appname(infop->dbenv, infop->appname, + infop->path, infop->file, infop->dbflags, NULL, &name)) != 0) + return (ret); + + /* + * An underlying file is created for all regions other than private + * (REGION_PRIVATE) ones, regardless of whether or not it's used to + * back the region. If that file doesn't exist, we're done. + */ + if (__db_exists(name, NULL) != 0) { + FREES(name); + return (0); } - /* Open and lock the region. */ - if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0) - goto err1; - (void)__db_mutex_lock(&rp->lock, fd); + /* + * See the comments in __db_rattach -- figure out if this is a regular + * file backing a region or if it's a regular file with information + * about a region. + */ + if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0) + goto errmsg; + if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) + goto errmsg; + size = mbytes * MEGABYTE + bytes; - /* If the region is currently being deleted, fail. */ - if (F_ISSET(rp, DB_R_DELETED)) { - ret = ENOENT; /* XXX: ENOENT? */ - goto err2; - } + if (size <= sizeof(RLAYOUT)) { + if ((ret = __db_read(fd, &rl, sizeof(rl), &nr)) != 0) + goto errmsg; + if (rl.valid != DB_REGIONMAGIC) { + __db_err(infop->dbenv, + "%s: illegal region magic number", name); + ret = EINVAL; + goto err; + } - /* If the region is currently in use by someone else, fail. */ - if (rp->refcnt > 1) { - ret = EBUSY; - goto err2; + /* Set the size, memory id and characteristics. */ + infop->size = rl.size; + infop->segid = rl.segid; + if (F_ISSET(&rl, REGION_ANONYMOUS)) + F_SET(infop, REGION_ANONYMOUS); + } else { + infop->size = size; + infop->segid = INVALID_SEGID; } - /* Set the delete flag. */ - F_SET(rp, DB_R_DELETED); - - /* Release the lock and close the region. 
*/ - (void)__db_mutex_unlock(&rp->lock, fd); - if ((t_ret = __db_rclose(dbenv, fd, rp)) != 0 && ret == 0) - goto err1; + /* Remove the underlying region. */ + ret = __db_unlinkregion(name, infop); /* - * Unlink the region. There's a race here -- other threads or - * processes might be opening the region while we're trying to - * remove it. They'll fail, because we've set the DELETED flag, - * but they could still stop us from succeeding in the unlink. + * Unlink the backing file. Close the open file descriptor first, + * because some architectures (e.g., Win32) won't unlink a file if + * open file descriptors remain. */ - for (cnt = 5; cnt > 0; --cnt) { - if ((ret = __db_unlink(name)) == 0) - break; - (void)__db_sleep(0, 250000); - } - if (ret == 0) { -done: FREES(name); - return (0); - } - - /* Not a clue. Try to clear the DB_R_DELETED flag. */ - if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0) - goto err1; - (void)__db_mutex_lock(&rp->lock, fd); - F_CLR(rp, DB_R_DELETED); - /* FALLTHROUGH */ + (void)__db_close(fd); + if ((t_ret = __db_unlink(name)) != 0 && ret == 0) + ret = t_ret; -err2: (void)__db_mutex_unlock(&rp->lock, fd); - (void)__db_rclose(dbenv, fd, rp); -err1: __db_err(dbenv, "region unlink: %s: %s", name, strerror(ret)); + if (0) { +errmsg: __db_err(infop->dbenv, "%s: %s", name, strerror(ret)); +err: (void)__db_close(fd); + } FREES(name); return (ret); } /* - * DB creates all regions on 4K boundaries so that we don't make the - * underlying VM unhappy. - */ -#define __DB_VMPAGESIZE (4 * 1024) - -/* * __db_rgrow -- - * Extend a region by a specified amount. + * Extend a region. * - * PUBLIC: int __db_rgrow __P((DB_ENV *, int, size_t)); + * PUBLIC: int __db_rgrow __P((REGINFO *, size_t)); */ int -__db_rgrow(dbenv, fd, incr) - DB_ENV *dbenv; - int fd; - size_t incr; +__db_rgrow(infop, new_size) + REGINFO *infop; + size_t new_size; +{ + RLAYOUT *rlp; + size_t increment; + int ret; + + /* + * !!! 
+ * This routine MUST be called with the region already locked. + */ + + /* The underlying routines have flagged if this region can grow. */ + if (!F_ISSET(infop, REGION_CANGROW)) + return (EINVAL); + + /* + * Round off the requested size to the next page boundary, and + * determine the additional space required. + */ + rlp = (RLAYOUT *)infop->addr; + DB_ROUNDOFF(new_size); + increment = new_size - rlp->size; + + if ((ret = __db_growregion(infop, increment)) != 0) + return (ret); + + /* Update the on-disk region size. */ + rlp->size = new_size; + + /* Detach from and reattach to the region. */ + return (__db_rreattach(infop, new_size)); +} + +/* + * __db_growregion -- + * Grow a shared memory region. + */ +static int +__db_growregion(infop, increment) + REGINFO *infop; + size_t increment; { + db_pgno_t pages; size_t i; - ssize_t nw; - int mmap_init_needed, ret; - char buf[__DB_VMPAGESIZE]; + ssize_t nr, nw; + u_int32_t relative; + int ret; + char buf[DB_VMPAGESIZE]; /* Seek to the end of the region. */ - if ((ret = __db_seek(fd, 0, 0, 0, SEEK_END)) != 0) + if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0) goto err; /* Write nuls to the new bytes. */ memset(buf, 0, sizeof(buf)); /* - * Historically, some systems required that all of the bytes of the - * region be written before it could be mmapped and accessed randomly. - * - * Windows/95 doesn't have that problem, but it leaves file contents - * uninitialized. Win/NT apparently initializes them. + * Some systems require that all of the bytes of the region be + * written before it can be mapped and accessed randomly, and + * other systems don't zero out the pages. */ -#ifdef MMAP_INIT_NEEDED - mmap_init_needed = 1; -#else - mmap_init_needed = __os_oldwin(); -#endif - if (mmap_init_needed) + if (__db_mapinit()) /* Extend the region by writing each new page. 
*/ - for (i = 0; i < incr; i += __DB_VMPAGESIZE) { - if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0) + for (i = 0; i < increment; i += DB_VMPAGESIZE) { + if ((ret = + __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0) goto err; if (nw != sizeof(buf)) goto eio; } else { /* - * Extend the region by writing the last page. - * - * Round off the increment to the next page boundary. + * Extend the region by writing the last page. If the region + * is >4Gb, increment may be larger than the maximum possible + * seek "relative" argument, as it's an unsigned 32-bit value. + * Break the offset into pages of 1MB each so that we don't + * overflow (2^20 + 2^32 is bigger than any memory I expect + * to see for awhile). */ - incr += __DB_VMPAGESIZE - 1; - incr -= incr % __DB_VMPAGESIZE; - - /* Write the last page, not the page after the last. */ - if ((ret = - __db_seek(fd, 0, 0, incr - __DB_VMPAGESIZE, SEEK_CUR)) != 0) + pages = (increment - DB_VMPAGESIZE) / MEGABYTE; + relative = (increment - DB_VMPAGESIZE) % MEGABYTE; + if ((ret = __db_seek(infop->fd, + MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0) goto err; - if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0) + if ((ret = __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0) goto err; if (nw != sizeof(buf)) goto eio; + + /* + * It's sometimes significantly faster to page-fault in all + * of the region's pages before we run the application, as + * we can see fairly nasty side-effects when we page-fault + * while holding various locks, i.e., the lock takes a long + * time, and other threads convoy behind the lock holder. + */ + if (DB_GLOBAL(db_region_init)) { + pages = increment / MEGABYTE; + relative = increment % MEGABYTE; + if ((ret = __db_seek(infop->fd, + MEGABYTE, pages, relative, 1, SEEK_END)) != 0) + goto err; + + /* Read a byte from each page. 
*/ + for (i = 0; i < increment; i += DB_VMPAGESIZE) { + if ((ret = + __db_read(infop->fd, buf, 1, &nr)) != 0) + goto err; + if (nr != 1) + goto eio; + if ((ret = __db_seek(infop->fd, + 0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0) + goto err; + } + } } return (0); eio: ret = EIO; -err: __db_err(dbenv, "region grow: %s", strerror(ret)); +err: __db_err(infop->dbenv, "region grow: %s", strerror(ret)); return (ret); } /* - * __db_rremap -- - * Unmap the old region and map in a new region of a new size. If - * either call fails, returns NULL, else returns the address of the - * new region. + * __db_rreattach -- + * Detach from and reattach to a region. * - * PUBLIC: int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *)); + * PUBLIC: int __db_rreattach __P((REGINFO *, size_t)); */ int -__db_rremap(dbenv, ptr, oldsize, newsize, fd, retp) - DB_ENV *dbenv; - void *ptr, *retp; - size_t oldsize, newsize; - int fd; +__db_rreattach(infop, new_size) + REGINFO *infop; + size_t new_size; { int ret; - if ((ret = __db_unmap(ptr, oldsize)) != 0) { - __db_err(dbenv, "region remap: munmap: %s", strerror(ret)); - return (ret); +#ifdef DIAGNOSTIC + if (infop->name == NULL) { + __db_err(infop->dbenv, "__db_rreattach: name was NULL"); + return (EINVAL); } +#endif + /* + * If we're growing an already mapped region, we have to unmap it + * and get it back. We have it locked, so nobody else can get in, + * which makes it fairly straight-forward to do, as everybody else + * is going to block while we do the unmap/remap. NB: if we fail + * to get it back, the pooch is genuinely screwed, because we can + * never release the lock we're holding. + * + * Detach from the region. We have to do this first so architectures + * that don't permit a file to be mapped into different places in the + * address space simultaneously, e.g., HP's PaRisc, will work. 
+ */ + if ((ret = __db_unmapregion(infop)) != 0) + return (ret); - return (__db_rmap(dbenv, fd, newsize, retp)); -} - -/* - * __db_rmap -- - * Attach to a shared memory region. - */ -static int -__db_rmap(dbenv, fd, size, retp) - DB_ENV *dbenv; - int fd; - size_t size; - void *retp; -{ - RLAYOUT *rp; - int ret; + /* Update the caller's REGINFO size to the new map size. */ + infop->size = new_size; - if ((ret = __db_map(fd, size, 0, 0, (void **)&rp)) != 0) { - __db_err(dbenv, "region map: mmap %s", strerror(ret)); - return (ret); - } - if (rp->size < size) - rp->size = size; + /* Attach to the region. */ + ret = __db_mapregion(infop->name, infop); - *(void **)retp = rp; - return (0); + return (ret); } diff --git a/db2/common/db_salloc.c b/db2/common/db_salloc.c index f0202ddb90..0fa696bf7e 100644 --- a/db2/common/db_salloc.c +++ b/db2/common/db_salloc.c @@ -1,21 +1,21 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_salloc.c 10.6 (Sleepycat) 7/5/97"; +static const char sccsid[] = "@(#)db_salloc.c 10.13 (Sleepycat) 5/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> -#include <stdio.h> +#include <string.h> #endif #include "db_int.h" @@ -109,11 +109,13 @@ __db_shalloc(p, len, align, retp) *(void **)retp = rp; +#define SHALLOC_FRAGMENT 32 /* - * If there are at least 32 bytes of additional memory, divide - * the chunk into two chunks. + * If there are at least SHALLOC_FRAGMENT additional bytes of + * memory, divide the chunk into two chunks. 
*/ - if ((u_int8_t *)rp >= (u_int8_t *)&elp->links + 32) { + if ((u_int8_t *)rp >= + (u_int8_t *)&elp->links + SHALLOC_FRAGMENT) { sp = rp; *--sp = elp->len - ((u_int8_t *)rp - (u_int8_t *)&elp->links); @@ -136,7 +138,7 @@ __db_shalloc(p, len, align, retp) return (0); } - /* Nothing found large enough; need to figure out how to grow region. */ + /* Nothing found large enough; need to grow the region. */ return (ENOMEM); } @@ -159,12 +161,18 @@ __db_shalloc_free(regionp, ptr) * Step back over flagged length fields to find the beginning of * the object and its real size. */ - for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp); + for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp) + ; ptr = sp; newp = (struct __data *)((u_int8_t *)ptr - sizeof(size_t)); free_size = newp->len; + /* Trash the returned memory. */ +#ifdef DIAGNOSTIC + memset(ptr, 0xff, free_size); +#endif + /* * Walk the list, looking for where this entry goes. * @@ -177,7 +185,8 @@ __db_shalloc_free(regionp, ptr) hp = (struct __head *)regionp; for (elp = SH_LIST_FIRST(hp, __data), lastp = NULL; elp != NULL && (void *)elp < (void *)ptr; - lastp = elp, elp = SH_LIST_NEXT(elp, links, __data)); + lastp = elp, elp = SH_LIST_NEXT(elp, links, __data)) + ; /* * Elp is either NULL (we reached the end of the list), or the slot @@ -259,32 +268,34 @@ __db_shsizeof(ptr) * Step back over flagged length fields to find the beginning of * the object and its real size. */ - for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp); + for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp) + ; elp = (struct __data *)((u_int8_t *)sp - sizeof(size_t)); return (elp->len); } -#ifdef DEBUG /* * __db_shalloc_dump -- * - * PUBLIC: void __db_shalloc_dump __P((FILE *, void *)); + * PUBLIC: void __db_shalloc_dump __P((void *, FILE *)); */ void -__db_shalloc_dump(fp, addr) - FILE *fp; +__db_shalloc_dump(addr, fp) void *addr; + FILE *fp; { struct __data *elp; + /* Make it easy to call from the debugger. 
*/ if (fp == NULL) fp = stderr; + fprintf(fp, "%s\nMemory free list\n", DB_LINE); + for (elp = SH_LIST_FIRST((struct __head *)addr, __data); elp != NULL; elp = SH_LIST_NEXT(elp, links, __data)) fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len); fprintf(fp, "\n"); } -#endif diff --git a/db2/common/db_shash.c b/db2/common/db_shash.c index ab188f564f..3f48a55907 100644 --- a/db2/common/db_shash.c +++ b/db2/common/db_shash.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997 + * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. */ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_shash.c 10.4 (Sleepycat) 1/8/98"; +static const char sccsid[] = "@(#)db_shash.c 10.9 (Sleepycat) 4/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,39 +19,75 @@ static const char sccsid[] = "@(#)db_shash.c 10.4 (Sleepycat) 1/8/98"; #include "shqueue.h" #include "common_ext.h" -/* Powers-of-2 and close-by prime number pairs. */ +/* + * Table of good hash values. Up to ~250,000 buckets, we use powers of 2. + * After that, we slow the rate of increase by half. For each choice, we + * then use a nearby prime number as the hash value. + * + * If a terabyte is the maximum cache we'll see, and we assume there are + * 10 1K buckets on each hash chain, then 107374182 is the maximum number + * of buckets we'll ever need. 
+ */ static const struct { - u_int power; - u_int prime; + u_int32_t power; + u_int32_t prime; } list[] = { - { 64, 67}, - { 128, 131}, - { 256, 257}, - { 512, 521}, - {1024, 1031}, - {2048, 2053}, - {4096, 4099}, - {8192, 8191}, - {0, 0} + { 64, 67}, /* 2^6 */ + { 128, 131}, /* 2^7 */ + { 256, 257}, /* 2^8 */ + { 512, 521}, /* 2^9 */ + { 1024, 1031}, /* 2^10 */ + { 2048, 2053}, /* 2^11 */ + { 4096, 4099}, /* 2^12 */ + { 8192, 8191}, /* 2^13 */ + { 16384, 16381}, /* 2^14 */ + { 32768, 32771}, /* 2^15 */ + { 65536, 65537}, /* 2^16 */ + { 131072, 131071}, /* 2^17 */ + { 262144, 262147}, /* 2^18 */ + { 393216, 393209}, /* 2^18 + 2^18/2 */ + { 524288, 524287}, /* 2^19 */ + { 786432, 786431}, /* 2^19 + 2^19/2 */ + { 1048576, 1048573}, /* 2^20 */ + { 1572864, 1572869}, /* 2^20 + 2^20/2 */ + { 2097152, 2097169}, /* 2^21 */ + { 3145728, 3145721}, /* 2^21 + 2^21/2 */ + { 4194304, 4194301}, /* 2^22 */ + { 6291456, 6291449}, /* 2^22 + 2^22/2 */ + { 8388608, 8388617}, /* 2^23 */ + { 12582912, 12582917}, /* 2^23 + 2^23/2 */ + { 16777216, 16777213}, /* 2^24 */ + { 25165824, 25165813}, /* 2^24 + 2^24/2 */ + { 33554432, 33554393}, /* 2^25 */ + { 50331648, 50331653}, /* 2^25 + 2^25/2 */ + { 67108864, 67108859}, /* 2^26 */ + { 100663296, 100663291}, /* 2^26 + 2^26/2 */ + { 134217728, 134217757}, /* 2^27 */ + { 201326592, 201326611}, /* 2^27 + 2^27/2 */ + { 268435456, 268435459}, /* 2^28 */ + { 402653184, 402653189}, /* 2^28 + 2^28/2 */ + { 536870912, 536870909}, /* 2^29 */ + { 805306368, 805306357}, /* 2^29 + 2^29/2 */ + {1073741824, 1073741827}, /* 2^30 */ + {0, 0} }; /* * __db_tablesize -- * Choose a size for the hash table. * - * PUBLIC: int __db_tablesize __P((u_int)); + * PUBLIC: int __db_tablesize __P((u_int32_t)); */ int __db_tablesize(n_buckets) - u_int n_buckets; + u_int32_t n_buckets; { int i; /* - * We try to be clever about how big we make the hash tables. Pick - * a prime number close to the "suggested" number of elements that - * will be in the hash table. 
We shoot for minimum collisions (i.e. - * one element in each bucket). We use 64 as the minimum table size. + * We try to be clever about how big we make the hash tables. Use a + * prime number close to the "suggested" number of elements that will + * be in the hash table. Use 64 as the minimum hash table size. * * Ref: Sedgewick, Algorithms in C, "Hash Functions" */ @@ -73,14 +109,14 @@ __db_tablesize(n_buckets) * __db_hashinit -- * Initialize a hash table that resides in shared memory. * - * PUBLIC: void __db_hashinit __P((void *, int)); + * PUBLIC: void __db_hashinit __P((void *, u_int32_t)); */ void __db_hashinit(begin, nelements) void *begin; - int nelements; + u_int32_t nelements; { - int i; + u_int32_t i; SH_TAILQ_HEAD(hash_head) *headp; headp = (struct hash_head *)begin; |