summary refs log tree commit diff
path: root/db2/common/db_region.c
diff options
context:
space:
mode:
Diffstat (limited to 'db2/common/db_region.c')
-rw-r--r--db2/common/db_region.c565
1 files changed, 565 insertions, 0 deletions
diff --git a/db2/common/db_region.c b/db2/common/db_region.c
new file mode 100644
index 0000000000..51f8f4465c
--- /dev/null
+++ b/db2/common/db_region.c
@@ -0,0 +1,565 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ *	The President and Fellows of Harvard University.  All rights reserved.
+ *
+ * This code is derived from software contributed to Harvard by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)db_region.c	10.12 (Sleepycat) 7/26/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "common_ext.h"
+
+static int __db_rmap __P((DB_ENV *, int, size_t, void *));
+
+/*
+ * __db_rcreate --
+ *
+ * Common interface for creating a shared region.  Handles synchronization
+ * across multiple processes.
+ *
+ * The dbenv contains the environment for this process, including naming
+ * information.  The path argument represents the parameters passed to
+ * the open routines and may be either a file or a directory.  If it is
+ * a directory, it must exist.  If it is a file, then the file parameter
+ * must be NULL, otherwise, file is the name to be created inside the
+ * directory path.
+ *
+ * The function returns a pointer to the shared region that has been mapped
+ * into memory, NULL on error.
+ *
+ * PUBLIC: int __db_rcreate __P((DB_ENV *, APPNAME,
+ * PUBLIC:    const char *, const char *, int, size_t, int *, void *));
+ */
+int
+__db_rcreate(dbenv, appname, path, file, mode, size, fdp, retp)
+	DB_ENV *dbenv;
+	APPNAME appname;
+	const char *path, *file;
+	int mode, *fdp;
+	size_t size;
+	void *retp;
+{
+	RLAYOUT *rp;
+	int fd, ret;
+	char *name;
+
+	fd = -1;
+	rp = NULL;
+
+	/*
+	 * Get the filename -- note, if it's a temporary file, it will
+	 * be created by the underlying temporary file creation code,
+	 * so we have to check the file descriptor to be sure it's an
+	 * error.
+	 */
+	if ((ret = __db_appname(dbenv, appname, path, file, &fd, &name)) != 0)
+		return (ret);
+
+	/*
+	 * Now open the file. We need to make sure that multiple processes
+	 * that attempt to create the region at the same time are properly
+	 * ordered, so we open it O_EXCL and O_CREAT so two simultaneous
+	 * attempts to create the region will return failure in one of the
+	 * attempts.
+	 */
+	if (fd == -1 && (ret = __db_fdopen(name,
+	    DB_CREATE | DB_EXCL, DB_CREATE | DB_EXCL, mode, &fd)) != 0) {
+		if (ret != EEXIST)
+			__db_err(dbenv,
+			    "region create: %s: %s", name, strerror(ret));
+		goto err;
+	}
+	*fdp = fd;
+
+	/* Grow the region to the correct size. */
+	if ((ret = __db_rgrow(dbenv, fd, size)) != 0)
+		goto err;
+
+	/* Map the region in. */
+	if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
+		goto err;
+
+	/*
+	 * Initialize the common information.
+	 *
+	 * !!!
+	 * We have to order the region creates so that two processes don't try
+	 * to simultaneously create the region and so that processes that are
+	 * joining the region never see inconsistent data.  We'd like to play
+	 * file permissions games, but we can't because WNT filesystems won't
+	 * open a file mode 0.
+	 *
+	 * So, the process that's creating the region always acquires the lock
+	 * before the setting the version number.  Any process joining always
+	 * checks the version number before attempting to acquire the lock.
+	 *
+	 * We have to check the version number first, because if the version
+	 * number has not been written, it's possible that the mutex has not
+	 * been initialized in which case an attempt to get it could lead to
+	 * random behavior.  If the version number isn't there (the file size
+	 * is too small) or it's 0, we know that the region is being created.
+	 */
+	(void)__db_mutex_init(&rp->lock, MUTEX_LOCK_OFFSET(rp, &rp->lock));
+	(void)__db_mutex_lock(&rp->lock,
+	    fd, dbenv == NULL ? NULL : dbenv->db_yield);
+
+	rp->refcnt = 1;
+	rp->size = size;
+	rp->flags = 0;
+	db_version(&rp->majver, &rp->minver, &rp->patch);
+
+	if (name != NULL)
+		FREES(name);
+
+	*(void **)retp = rp;
+	return (0);
+
+err:	if (fd != -1) {
+		if (rp != NULL)
+			(void)__db_munmap(rp, rp->size);
+		(void)__db_unlink(name);
+		(void)__db_close(fd);
+	}
+	if (name != NULL)
+		FREES(name);
+	return (ret);
+}
+
+/*
+ * __db_ropen --
+ *	Construct the name of a file, open it and map it in.
+ *
+ * PUBLIC: int __db_ropen __P((DB_ENV *,
+ * PUBLIC:    APPNAME, const char *, const char *, int, int *, void *));
+ */
+int
+__db_ropen(dbenv, appname, path, file, flags, fdp, retp)
+	DB_ENV *dbenv;
+	APPNAME appname;
+	const char *path, *file;
+	int flags, *fdp;
+	void *retp;
+{
+	RLAYOUT *rp;
+	off_t size1, size2;
+	int fd, ret;
+	char *name;
+
+	fd = -1;
+	rp = NULL;
+
+	/* Get the filename. */
+	if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
+		return (ret);
+
+	/* Open the file. */
+	if ((ret = __db_fdopen(name, flags, DB_MUTEXDEBUG, 0, &fd)) != 0) {
+		__db_err(dbenv, "region open: %s: %s", name, strerror(ret));
+		goto err2;
+	}
+
+	*fdp = fd;
+
+	/*
+	 * Map the file in.  We have to do things in a strange order so that
+	 * we don't get into a situation where the file was just created and
+	 * isn't yet initialized.  See the comment in __db_rcreate() above.
+	 *
+	 * XXX
+	 * We'd like to test to see if the file is too big to mmap.  Since we
+	 * don't know what size or type off_t's or size_t's are, or the largest
+	 * unsigned integral type is, or what random insanity the local C
+	 * compiler will perpetrate, doing the comparison in a portable way is
+	 * flatly impossible.  Hope that mmap fails if the file is too large.
+	 *
+	 */
+	if ((ret = __db_stat(dbenv, name, fd, &size1, NULL)) != 0)
+		goto err2;
+
+	/* Check to make sure the first block has been written. */
+	if ((size_t) size1 < sizeof(RLAYOUT)) {
+		ret = EAGAIN;
+		goto err2;
+	}
+
+	/* Map in whatever is there. */
+	if ((ret = __db_rmap(dbenv, fd, size1, &rp)) != 0)
+		goto err2;
+
+	/*
+	 * Check to make sure the region has been initialized.  We can't just
+	 * grab the lock because the lock may not have been initialized yet.
+	 */
+	if (rp->majver == 0) {
+		ret = EAGAIN;
+		goto err2;
+	}
+
+	/* Get the region lock. */
+	if (!LF_ISSET(DB_MUTEXDEBUG))
+		(void)__db_mutex_lock(&rp->lock,
+		    fd, dbenv == NULL ? NULL : dbenv->db_yield);
+
+	/*
+	 * The file may have been half-written if we were descheduled between
+	 * getting the size of the file and checking the major version.  Check
+	 * to make sure we got the entire file.
+	 */
+	if ((ret = __db_stat(dbenv, name, fd, &size2, NULL)) != 0)
+		goto err1;
+	if (size1 != size2) {
+		ret = EAGAIN;
+		goto err1;
+	}
+
+	/* The file may have just been deleted. */
+	if (F_ISSET(rp, DB_R_DELETED)) {
+		ret = EAGAIN;
+		goto err1;
+	}
+
+	/* Increment the reference count. */
+	++rp->refcnt;
+
+	/* Release the lock. */
+	if (!LF_ISSET(DB_MUTEXDEBUG))
+		(void)__db_mutex_unlock(&rp->lock, fd);
+
+	FREES(name);
+
+	*(void **)retp = rp;
+	return (0);
+
+err1:	if (!LF_ISSET(DB_MUTEXDEBUG))
+		(void)__db_mutex_unlock(&rp->lock, fd);
+err2:	if (rp != NULL)
+		(void)__db_munmap(rp, rp->size);
+	if (fd != -1)
+		(void)__db_close(fd);
+	FREES(name);
+	return (ret);
+}
+
+/*
+ * __db_rclose --
+ *	Close a shared memory region.
+ *
+ * PUBLIC: int __db_rclose __P((DB_ENV *, int, void *));
+ */
+int
+__db_rclose(dbenv, fd, ptr)
+	DB_ENV *dbenv;
+	int fd;
+	void *ptr;
+{
+	RLAYOUT *rp;
+	int ret, t_ret;
+	const char *fail;
+
+	rp = ptr;
+	fail = NULL;
+
+	/* Get the lock. */
+	if ((ret = __db_mutex_lock(&rp->lock,
+	    fd, dbenv == NULL ? NULL : dbenv->db_yield)) != 0) {
+		fail = "lock get";
+		goto err;
+	}
+
+	/* Decrement the reference count. */
+	--rp->refcnt;
+
+	/* Release the lock. */
+	if ((t_ret = __db_mutex_unlock(&rp->lock, fd)) != 0 && fail == NULL) {
+		ret = t_ret;
+		fail = "lock release";
+	}
+
+	/* Discard the region. */
+	if ((t_ret = __db_munmap(ptr, rp->size)) != 0 && fail == NULL) {
+		ret = t_ret;
+		fail = "munmap";
+	}
+
+	if ((t_ret = __db_close(fd)) != 0 && fail == NULL) {
+		ret = t_ret;
+		fail = "close";
+	}
+
+	if (fail == NULL)
+		return (0);
+
+err:	__db_err(dbenv, "region detach: %s: %s", fail, strerror(ret));
+	return (ret);
+}
+
+/*
+ * __db_runlink --
+ *	Remove a shared memory region.
+ *
+ * PUBLIC: int __db_runlink __P((DB_ENV *,
+ * PUBLIC:    APPNAME, const char *, const char *, int));
+ */
+int
+__db_runlink(dbenv, appname, path, file, force)
+	DB_ENV *dbenv;
+	APPNAME appname;
+	const char *path, *file;
+	int force;
+{
+	RLAYOUT *rp;
+	int cnt, fd, ret, t_ret;
+	char *name;
+
+	rp = NULL;
+
+	/* Get the filename. */
+	if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
+		return (ret);
+
+	/* If the file doesn't exist, we're done. */
+	if (__db_exists(name, NULL))
+		return (0);		/* XXX: ENOENT? */
+
+	/*
+	 * If we're called with a force flag, try and unlink the file.  This
+	 * may not succeed if the file is currently open, but there's nothing
+	 * we can do about that.  There is a race condition between the check
+	 * for existence above and the actual unlink.  If someone else snuck
+	 * in and removed it before we do the remove, then we might get an
+	 * ENOENT error.  If we get the ENOENT, we treat it as success, just
+	 * as we do above.
+	 */
+	if (force) {
+		if ((ret = __db_unlink(name)) != 0 && ret != ENOENT)
+			goto err1;
+		FREES(name);
+		return (0);
+	}
+
+	/* Open and lock the region. */
+	if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
+		goto err1;
+	(void)__db_mutex_lock(&rp->lock,
+	    fd, dbenv == NULL ? NULL : dbenv->db_yield);
+
+	/* If the region is currently being deleted, fail. */
+	if (F_ISSET(rp, DB_R_DELETED)) {
+		ret = ENOENT;		/* XXX: ENOENT? */
+		goto err2;
+	}
+
+	/* If the region is currently in use by someone else, fail. */
+	if (rp->refcnt > 1) {
+		ret = EBUSY;
+		goto err2;
+	}
+
+	/* Set the delete flag. */
+	F_SET(rp, DB_R_DELETED);
+
+	/* Release the lock and close the region. */
+	(void)__db_mutex_unlock(&rp->lock, fd);
+	if ((t_ret = __db_rclose(dbenv, fd, rp)) != 0 && ret == 0)
+		goto err1;
+
+	/*
+	 * Unlink the region.  There's a race here -- other threads or
+	 * processes might be opening the region while we're trying to
+	 * remove it.  They'll fail, because we've set the DELETED flag,
+	 * but they could still stop us from succeeding in the unlink.
+	 */
+	for (cnt = 5; cnt > 0; --cnt) {
+		if ((ret = __db_unlink(name)) == 0)
+			break;
+		(void)__db_sleep(0, 250000);
+	}
+	if (ret == 0) {
+		FREES(name);
+		return (0);
+	}
+
+	/* Not a clue.  Try to clear the DB_R_DELETED flag. */
+	if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
+		goto err1;
+	(void)__db_mutex_lock(&rp->lock,
+	    fd, dbenv == NULL ? NULL : dbenv->db_yield);
+	F_CLR(rp, DB_R_DELETED);
+	/* FALLTHROUGH */
+
+err2:	(void)__db_mutex_unlock(&rp->lock, fd);
+	(void)__db_rclose(dbenv, fd, rp);
+err1:	__db_err(dbenv, "region unlink: %s: %s", name, strerror(ret));
+	FREES(name);
+	return (ret);
+}
+
+/*
+ * DB creates all regions on 4K boundaries so that we don't make the
+ * underlying VM unhappy.
+ */
+#define	__DB_VMPAGESIZE	(4 * 1024)
+
+/*
+ * __db_rgrow --
+ *	Extend a region by a specified amount.
+ *
+ * PUBLIC: int __db_rgrow __P((DB_ENV *, int, size_t));
+ */
+int
+__db_rgrow(dbenv, fd, incr)
+	DB_ENV *dbenv;
+	int fd;
+	size_t incr;
+{
+#ifdef MMAP_INIT_NEEDED
+	size_t i;
+#endif
+	ssize_t nw;
+	int ret;
+	char buf[__DB_VMPAGESIZE];
+
+	/* Seek to the end of the region. */
+	if ((ret = __db_lseek(fd, 0, 0, 0, SEEK_END)) != 0)
+		goto err;
+
+	/* Write nuls to the new bytes. */
+	memset(buf, 0, sizeof(buf));
+
+	/*
+	 * Historically, some systems required that all of the bytes of the
+	 * region be written before you could mmap it and access it randomly.
+	 */
+#ifdef MMAP_INIT_NEEDED
+	/* Extend the region by writing each new page. */
+	for (i = 0; i < incr; i += __DB_VMPAGESIZE) {
+		if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+			goto err;
+		if (nw != sizeof(buf))
+			goto eio;
+	}
+#else
+	/*
+	 * Extend the region by writing the last page.
+	 *
+	 * Round off the increment to the next page boundary.
+	 */
+	incr += __DB_VMPAGESIZE - 1;
+	incr -= incr % __DB_VMPAGESIZE;
+
+	/* Write the last page, not the page after the last. */
+	if ((ret = __db_lseek(fd, 0, 0, incr - __DB_VMPAGESIZE, SEEK_CUR)) != 0)
+		goto err;
+	if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+		goto err;
+	if (nw != sizeof(buf))
+		goto eio;
+#endif
+	return (0);
+
+eio:	ret = EIO;
+err:	__db_err(dbenv, "region grow: %s", strerror(ret));
+	return (ret);
+}
+
+/*
+ * __db_rremap --
+ *	Unmap the old region and map in a new region of a new size.  If
+ *	either call fails, returns NULL, else returns the address of the
+ *	new region.
+ *
+ * PUBLIC: int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *));
+ */
+int
+__db_rremap(dbenv, ptr, oldsize, newsize, fd, retp)
+	DB_ENV *dbenv;
+	void *ptr, *retp;
+	size_t oldsize, newsize;
+	int fd;
+{
+	int ret;
+
+	if ((ret = __db_munmap(ptr, oldsize)) != 0) {
+		__db_err(dbenv, "region remap: munmap: %s", strerror(ret));
+		return (ret);
+	}
+
+	return (__db_rmap(dbenv, fd, newsize, retp));
+}
+
+/*
+ * __db_rmap --
+ *	Attach to a shared memory region.
+ */
+static int
+__db_rmap(dbenv, fd, size, retp)
+	DB_ENV *dbenv;
+	int fd;
+	size_t size;
+	void *retp;
+{
+	RLAYOUT *rp;
+	int ret;
+
+	if ((ret = __db_mmap(fd, size, 0, 0, &rp)) != 0) {
+		__db_err(dbenv, "region map: mmap %s", strerror(ret));
+		return (ret);
+	}
+	if (rp->size < size)
+		rp->size = size;
+
+	*(void **)retp = rp;
+	return (0);
+}