about summary refs log tree commit diff
path: root/db2/hash/hash_page.c
diff options
context:
space:
mode:
Diffstat (limited to 'db2/hash/hash_page.c')
-rw-r--r--db2/hash/hash_page.c1775
1 files changed, 1775 insertions, 0 deletions
diff --git a/db2/hash/hash_page.c b/db2/hash/hash_page.c
new file mode 100644
index 0000000000..68c31b14f9
--- /dev/null
+++ b/db2/hash/hash_page.c
@@ -0,0 +1,1775 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ *	Margo Seltzer.  All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)hash_page.c	10.18 (Sleepycat) 8/21/97";
+#endif /* not lint */
+
+
+/*
+ * PACKAGE:  hashing
+ *
+ * DESCRIPTION:
+ *      Page manipulation for hashing package.
+ *
+ * ROUTINES:
+ *
+ * External
+ *      __get_page
+ *      __add_ovflpage
+ *      __overflow_page
+ * Internal
+ *      open_temp
+ */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "hash.h"
+
+static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t));
+
+#ifdef DEBUG_SLOW
+static void	 account_page(HTAB *, db_pgno_t, int);
+#endif
+
+/*
+ * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_item(hashp, cursorp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	db_lockmode_t mode;
+{
+	db_pgno_t next_pgno;
+	int ret;
+
+	if (F_ISSET(cursorp, H_DELETED))
+		return (EINVAL);
+	F_CLR(cursorp, H_OK | H_NOMORE);
+
+	/* Check if we need to get a page for this cursor. */
+	if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+		return (ret);
+
+	/* Check if we are looking for space in which to insert an item. */
+	if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID
+	    && cursorp->seek_size < P_FREESPACE(cursorp->pagep))
+		cursorp->seek_found_page = cursorp->pgno;
+
+	/* Check if we need to go on to the next page. */
+	if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID)
+		/*
+		 * ISDUP is set, and offset is at the beginning of the datum.
+		 * We need to grab the length of the datum, then set the datum
+		 * pointer to be the beginning of the datum.
+		 */
+		memcpy(&cursorp->dup_len,
+		    H_PAIRDATA(cursorp->pagep, cursorp->bndx)->data +
+		    cursorp->dup_off, sizeof(db_indx_t));
+	else if (F_ISSET(cursorp, H_ISDUP)) {
+		/* Make sure we're not about to run off the page. */
+		if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp,
+		    cursorp->dpgno, &cursorp->dpagep)) != 0)
+			return (ret);
+
+		if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) {
+			if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) {
+				if ((ret = __ham_put_page(hashp->dbp,
+				    cursorp->dpagep, 0)) != 0)
+					return (ret);
+				F_CLR(cursorp, H_ISDUP);
+				cursorp->dpagep = NULL;
+				cursorp->dpgno = PGNO_INVALID;
+				cursorp->dndx = NDX_INVALID;
+				cursorp->bndx++;
+			} else if ((ret = __ham_next_cpage(hashp, cursorp,
+			    NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
+				return (ret);
+		}
+	}
+
+	if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) {
+		/* Fetch next page. */
+		if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) {
+			F_SET(cursorp, H_NOMORE);
+			if (cursorp->dpagep != NULL &&
+			    (ret = __ham_put_page(hashp->dbp,
+			    cursorp->dpagep, 0)) != 0)
+				return (ret);
+			cursorp->dpgno = PGNO_INVALID;
+			return (DB_NOTFOUND);
+		}
+		next_pgno = NEXT_PGNO(cursorp->pagep);
+		cursorp->bndx = 0;
+		if ((ret = __ham_next_cpage(hashp,
+		    cursorp, next_pgno, 0, 0)) != 0)
+			return (ret);
+	}
+
+	F_SET(cursorp, H_OK);
+	return (0);
+}
+
+/*
+ * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *));
+ */
+int
+__ham_item_reset(hashp, cursorp)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+{
+	int ret;
+
+	if (cursorp->pagep)
+		ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0);
+	else
+		ret = 0;
+
+	__ham_item_init(cursorp);
+	return (ret);
+}
+
+/*
+ * PUBLIC: void __ham_item_init __P((HASH_CURSOR *));
+ */
+void
+__ham_item_init(cursorp)
+	HASH_CURSOR *cursorp;
+{
+	cursorp->pagep = NULL;
+	cursorp->bucket = BUCKET_INVALID;
+	cursorp->lock = 0;
+	cursorp->bndx = NDX_INVALID;
+	cursorp->pgno = PGNO_INVALID;
+	cursorp->dpgno = PGNO_INVALID;
+	cursorp->dndx = NDX_INVALID;
+	cursorp->dpagep = NULL;
+	cursorp->flags = 0;
+	cursorp->seek_size = 0;
+	cursorp->seek_found_page = PGNO_INVALID;
+}
+
+/*
+ * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int));
+ */
+int
+__ham_item_done(hashp, cursorp, dirty)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	int dirty;
+{
+	int ret, t_ret;
+
+	t_ret = ret = 0;
+
+	if (cursorp->pagep)
+		ret = __ham_put_page(hashp->dbp, cursorp->pagep,
+		    dirty && cursorp->dpagep == NULL);
+	cursorp->pagep = NULL;
+
+	if (cursorp->dpagep)
+		t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty);
+	cursorp->dpagep = NULL;
+
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+
+	/*
+	 * If we are running with transactions, then we must
+	 * not relinquish locks explicitly.
+	 */
+	if (cursorp->lock && hashp->dbp->txn == NULL)
+	    t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock);
+	cursorp->lock = 0;
+
+
+	/*
+	 * We don't throw out the page number since we might want to
+	 * continue getting on this page.
+	 */
+	return (ret != 0 ? ret : t_ret);
+}
+
+/*
+ * Returns the last item in a bucket.
+ *
+ * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_item_last(hashp, cursorp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	db_lockmode_t mode;
+{
+	int ret;
+
+	if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
+		return (ret);
+
+	cursorp->bucket = hashp->hdr->max_bucket;
+	F_SET(cursorp, H_OK);
+	return (__ham_item_prev(hashp, cursorp, mode));
+}
+/*
+ * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_item_first(hashp, cursorp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	db_lockmode_t mode;
+{
+	int ret;
+
+	if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
+		return (ret);
+	F_SET(cursorp, H_OK);
+	cursorp->bucket = 0;
+	return (__ham_item_next(hashp, cursorp, mode));
+}
+
+/*
+ * Returns a pointer to key/data pair on a page.  In the case of bigkeys,
+ * just returns the page number and index of the bigkey pointer pair.
+ *
+ * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_item_prev(hashp, cursorp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	db_lockmode_t mode;
+{
+	db_pgno_t next_pgno;
+	int ret;
+
+	/*
+	 * There are N cases for backing up in a hash file.
+	 * Case 1: In the middle of a page, no duplicates, just dec the index.
+	 * Case 2: In the middle of a duplicate set, back up one.
+	 * Case 3: At the beginning of a duplicate set, get out of set and
+	 *	back up to next key.
+	 * Case 4: At the beginning of a page; go to previous page.
+	 * Case 5: At the beginning of a bucket; go to prev bucket.
+	 */
+	F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED);
+
+	/*
+	 * First handle the duplicates.  Either you'll get the key here
+	 * or you'll exit the duplicate set and drop into the code below
+	 * to handle backing up through keys.
+	 */
+	if (F_ISSET(cursorp, H_ISDUP)) {
+		if (cursorp->dpgno == PGNO_INVALID) {
+			/* Duplicates are on-page. */
+			if (cursorp->dup_off != 0)
+				if ((ret = __ham_get_cpage(hashp,
+				    cursorp, mode)) != 0)
+					return (ret);
+				else {
+					HASH_CURSOR *h;
+					h = cursorp;
+					memcpy(&h->dup_len,
+					    H_PAIRDATA(h->pagep, h->bndx)->data
+					    + h->dup_off - sizeof(db_indx_t),
+					    sizeof(db_indx_t));
+					cursorp->dup_off -=
+					    DUP_SIZE(cursorp->dup_len);
+					cursorp->dndx--;
+					return (__ham_item(hashp,
+					    cursorp, mode));
+				}
+		} else if (cursorp->dndx > 0) {	/* Duplicates are off-page. */
+			cursorp->dndx--;
+			return (__ham_item(hashp, cursorp, mode));
+		} else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+			return (ret);
+		else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) {
+			F_CLR(cursorp, H_ISDUP); /* End of dups */
+			cursorp->dpgno = PGNO_INVALID;
+			if (cursorp->dpagep != NULL)
+				(void)__ham_put_page(hashp->dbp,
+				    cursorp->dpagep, 0);
+			cursorp->dpagep = NULL;
+		} else if ((ret = __ham_next_cpage(hashp, cursorp,
+		    PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
+			return (ret);
+		else {
+			cursorp->dndx = NUM_ENT(cursorp->pagep) - 1;
+			return (__ham_item(hashp, cursorp, mode));
+		}
+	}
+
+	/*
+	 * If we get here, we are not in a duplicate set, and just need
+	 * to back up the cursor.  There are still three cases:
+	 * midpage, beginning of page, beginning of bucket.
+	 */
+
+	if (cursorp->bndx == 0) { 		/* Beginning of page. */
+		if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
+			return (ret);
+		cursorp->pgno = PREV_PGNO(cursorp->pagep);
+		if (cursorp->pgno == PGNO_INVALID) {
+			/* Beginning of bucket. */
+			F_SET(cursorp, H_NOMORE);
+			return (DB_NOTFOUND);
+		} else if ((ret = __ham_next_cpage(hashp,
+		    cursorp, cursorp->pgno, 0, 0)) != 0)
+			return (ret);
+		else
+			cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
+	}
+
+	/*
+	 * Either we've got the cursor set up to be decremented, or we
+	 * have to find the end of a bucket.
+	 */
+	if (cursorp->bndx == NDX_INVALID) {
+		if (cursorp->pagep == NULL)
+			next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket);
+		else
+			goto got_page;
+
+		do {
+			if ((ret = __ham_next_cpage(hashp,
+			    cursorp, next_pgno, 0, 0)) != 0)
+				return (ret);
+got_page:		next_pgno = NEXT_PGNO(cursorp->pagep);
+			cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
+		} while (next_pgno != PGNO_INVALID);
+
+		if (cursorp->bndx == 0) {
+			/* Bucket was empty. */
+			F_SET(cursorp, H_NOMORE);
+			return (DB_NOTFOUND);
+		}
+	}
+
+	cursorp->bndx--;
+
+	return (__ham_item(hashp, cursorp, mode));
+}
+
+/*
+ * Sets the cursor to the next key/data pair on a page.
+ *
+ * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_item_next(hashp, cursorp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+	db_lockmode_t mode;
+{
+	/*
+	 * Deleted on-page duplicates are a weird case. If we delete the last
+	 * one, then our cursor is at the very end of a duplicate set and
+	 * we actually need to go on to the next key.
+	 */
+	if (F_ISSET(cursorp, H_DELETED)) {
+		if (cursorp->bndx != NDX_INVALID &&
+		    F_ISSET(cursorp, H_ISDUP) &&
+		    cursorp->dpgno == PGNO_INVALID &&
+		    cursorp->dup_tlen == cursorp->dup_off) {
+			F_CLR(cursorp, H_ISDUP);
+			cursorp->dpgno = PGNO_INVALID;
+			cursorp->bndx++;
+		}
+		F_CLR(cursorp, H_DELETED);
+	} else if (cursorp->bndx == NDX_INVALID) {
+		cursorp->bndx = 0;
+		cursorp->dpgno = PGNO_INVALID;
+		F_CLR(cursorp, H_ISDUP);
+	} else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID)
+		cursorp->dndx++;
+	else if (F_ISSET(cursorp, H_ISDUP)) {
+		cursorp->dndx++;
+		cursorp->dup_off += DUP_SIZE(cursorp->dup_len);
+		if (cursorp->dup_off >= cursorp->dup_tlen) {
+			F_CLR(cursorp, H_ISDUP);
+			cursorp->dpgno = PGNO_INVALID;
+			cursorp->bndx++;
+		}
+	} else
+		cursorp->bndx++;
+
+	return (__ham_item(hashp, cursorp, mode));
+}
+
+/*
+ * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int));
+ *
+ * This is a little bit sleazy in that we're overloading the meaning
+ * of the H_OFFPAGE type here.  When we recover deletes, we have the
+ * entire entry instead of having only the DBT, so we'll pass type
+ * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing
+ * an H_KEYDATA around it.
+ */
+void
+__ham_putitem(p, dbt, type)
+	PAGE *p;
+	const DBT *dbt;
+	int type;
+{
+	u_int16_t n, off;
+
+	n = NUM_ENT(p);
+
+	/* Put the item element on the page. */
+	if (type == H_OFFPAGE) {
+		off = HOFFSET(p) - dbt->size;
+		HOFFSET(p) = p->inp[n] = off;
+		memcpy(P_ENTRY(p, n), dbt->data, dbt->size);
+	} else {
+		off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size);
+		HOFFSET(p) = p->inp[n] = off;
+		PUT_HKEYDATA(GET_HKEYDATA(p, n), dbt->data, dbt->size, type);
+	}
+
+	/* Adjust page info. */
+	NUM_ENT(p) += 1;
+}
+
+
+/*
+ * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *));
+ * XXX TODO: if the item is an offdup, delete the other pages and
+ * then remove the pair. If the offpage page is 0, then you can
+ * just remove the pair.
+ */
+int
+__ham_del_pair(hashp, cursorp)
+	HTAB *hashp;
+	HASH_CURSOR *cursorp;
+{
+	DBT data_dbt, key_dbt;
+	DB_ENV *dbenv;
+	DB_LSN new_lsn, *n_lsn;
+	PAGE *p;
+	db_indx_t ndx;
+	db_pgno_t chg_pgno, pgno;
+	int ret, tret;
+
+	dbenv = hashp->dbp->dbenv;
+	ndx = cursorp->bndx;
+	if (cursorp->pagep == NULL && (ret =
+	    __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0)
+		return (ret);
+
+	p = cursorp->pagep;
+
+	/*
+	 * We optimize for the normal case which is when neither the key nor
+	 * the data are large.  In this case, we write a single log record
+	 * and do the delete.  If either is large, we'll call __big_delete
+	 * to remove the big item and then update the page to remove the
+	 * entry referring to the big item.
+	 */
+	ret = 0;
+	if (H_PAIRKEY(p, ndx)->type == H_OFFPAGE) {
+		memcpy(&pgno, (u_int8_t *)GET_HOFFPAGE(p, H_KEYINDEX(ndx)) +
+		    SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t));
+		ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
+	}
+
+	if (ret == 0)
+		switch (H_PAIRDATA(p, ndx)->type) {
+		case H_OFFPAGE:
+			memcpy(&pgno,
+			    (u_int8_t *)GET_HOFFPAGE(p, H_DATAINDEX(ndx)) +
+			    SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t));
+			ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
+			break;
+		case H_OFFDUP:
+			memcpy(&pgno,
+			    (u_int8_t *)GET_HOFFDUP(p, H_DATAINDEX(ndx)) +
+			    SSZ(HOFFDUP, pgno), sizeof(db_pgno_t));
+			ret = __db_ddup(hashp->dbp, pgno, __ham_del_page);
+			break;
+		}
+
+	if (ret)
+		return (ret);
+
+	/* Now log the delete off this page. */
+	if (DB_LOGGING(hashp->dbp)) {
+		key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx));
+		key_dbt.size =
+		    LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx));
+		data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx));
+		data_dbt.size =
+		    LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx));
+
+		if ((ret = __ham_insdel_log(dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR,
+		    hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
+		    &LSN(p), &key_dbt, &data_dbt)) != 0)
+			return (ret);
+
+		/* Move lsn onto page. */
+		LSN(p) = new_lsn;
+	}
+
+	__ham_dpair(hashp->dbp, p, ndx);
+
+	/*
+	 * If we are locking, we will not maintain this.
+	 * XXXX perhaps we can retain incremental numbers and apply them
+	 * later.
+	 */
+	if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
+		--hashp->hdr->nelem;
+
+	/*
+	 * Check if the page is empty.  There are two cases.  If it's
+	 * empty and it's not the first chain in the bucket (i.e., the
+	 * bucket page) then we can simply remove it. If it is the first
+	 * chain in the bucket, then we need to copy the second page into
+	 * it and remove the second page.
+	 */
+	if (NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID &&
+	    NEXT_PGNO(p) != PGNO_INVALID) {
+		PAGE *n_pagep, *nn_pagep;
+		db_pgno_t tmp_pgno;
+
+		/*
+		 * First page in chain is empty and we know that there
+		 * are more pages in the chain.
+		 * XXX Need to log this.
+		 */
+		if ((ret =
+		    __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0)
+			return (ret);
+
+		if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
+			if ((ret =
+			    __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep),
+			    &nn_pagep)) != 0) {
+				(void) __ham_put_page(hashp->dbp, n_pagep, 0);
+				return (ret);
+			}
+			PREV_PGNO(nn_pagep) = PGNO(p);
+			(void)__ham_put_page(hashp->dbp, nn_pagep, 1);
+		}
+
+		tmp_pgno = PGNO(p);
+		memcpy(p, n_pagep, hashp->hdr->pagesize);
+		PGNO(p) = tmp_pgno;
+		PREV_PGNO(p) = PGNO_INVALID;
+
+		/*
+		 * Cursor is advanced to the beginning of the next page.
+		 */
+		cursorp->bndx = NDX_INVALID;
+		cursorp->pgno = PGNO(p);
+		chg_pgno = PGNO(p);
+		if ((ret = __ham_dirty_page(hashp, p)) != 0 ||
+		    (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0)
+			return (ret);
+	} else if (NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) {
+		PAGE *n_pagep, *p_pagep;
+
+		if ((ret =
+		    __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0)
+			return (ret);
+
+		if (NEXT_PGNO(p) != PGNO_INVALID) {
+			if ((ret = __ham_get_page(hashp->dbp,
+			    NEXT_PGNO(p), &n_pagep)) != 0) {
+				(void)__ham_put_page(hashp->dbp, p_pagep, 0);
+				return (ret);
+			}
+			n_lsn = &LSN(n_pagep);
+		} else {
+			n_pagep = NULL;
+			n_lsn = NULL;
+		}
+
+		NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
+		if (n_pagep != NULL)
+			PREV_PGNO(n_pagep) = PGNO(p_pagep);
+
+		if (DB_LOGGING(hashp->dbp)) {
+			if ((ret = __ham_newpage_log(dbenv->lg_info,
+			    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL,
+			    hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
+			    PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
+				return (ret);
+
+			/* Move lsn onto page. */
+			LSN(p_pagep) = new_lsn;	/* Structure assignment. */
+			if (n_pagep)
+				LSN(n_pagep) = new_lsn;
+			LSN(p) = new_lsn;
+		}
+		cursorp->pgno = NEXT_PGNO(p);
+		cursorp->bndx = 0;
+		/*
+		 * Since we are about to delete the cursor page and we have
+		 * just moved the cursor, we need to make sure that the
+		 * old page pointer isn't left hanging around in the cursor.
+		 */
+		cursorp->pagep = NULL;
+		chg_pgno = PGNO(p);
+		ret = __ham_del_page(hashp->dbp, p);
+		if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 &&
+		    ret == 0)
+			ret = tret;
+		if (n_pagep != NULL &&
+		    (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 &&
+		    ret == 0)
+			ret = tret;
+		if (ret != 0)
+			return (ret);
+	} else {
+		/*
+		 * Mark item deleted so that we don't try to return it, and
+		 * so that we update the cursor correctly on the next call
+		 * to next.
+		 */
+		F_SET(cursorp, H_DELETED);
+		chg_pgno = cursorp->pgno;
+		ret = __ham_dirty_page(hashp, p);
+	}
+	__ham_c_update(hashp, cursorp, chg_pgno, 0, 0, 0);
+
+	F_CLR(cursorp, H_OK);
+	return (ret);
+}
+/*
+ * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
+ * Given the key data indicated by the cursor, replace part/all of it
+ * according to the fields in the dbt.
+ */
+int
+__ham_replpair(hashp, hcp, dbt, make_dup)
+	HTAB *hashp;
+	HASH_CURSOR *hcp;
+	DBT *dbt;
+	u_int32_t make_dup;
+{
+	DBT old_dbt, tmp;
+	DB_LSN	new_lsn;
+	HKEYDATA *hk;
+	u_int32_t len;
+	int32_t change;
+	int is_big, ret, type;
+	u_int8_t *beg, *dest, *end, *src;
+
+	/*
+	 * Big item replacements are handled in generic code.
+	 * Items that fit on the current page fall into 4 classes.
+	 * 1. On-page element, same size
+	 * 2. On-page element, new is bigger (fits)
+	 * 3. On-page element, new is bigger (does not fit)
+	 * 4. On-page element, old is bigger
+	 * Numbers 1, 2, and 4 are essentially the same (and should
+	 * be the common case).  We handle case 3 as a delete and
+	 * add.
+	 */
+
+	/*
+	 * We need to compute the number of bytes that we are adding or
+	 * removing from the entry.  Normally, we can simply substract
+	 * the number of bytes we are replacing (dbt->dlen) from the
+	 * number of bytes we are inserting (dbt->size).  However, if
+	 * we are doing a partial put off the end of a record, then this
+	 * formula doesn't work, because we are essentially adding
+	 * new bytes.
+	 */
+	change = dbt->size - dbt->dlen;
+
+	hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
+	is_big = hk->type == H_OFFPAGE;
+
+	if (is_big)
+		memcpy(&len, (u_int8_t *)hk + SSZ(HOFFPAGE, tlen),
+		    sizeof(u_int32_t));
+	else
+		len = LEN_HKEYDATA(hcp->pagep,
+		    hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx));
+
+	if (dbt->doff + dbt->dlen > len)
+		change += dbt->doff + dbt->dlen - len;
+
+
+	if (change > (int)P_FREESPACE(hcp->pagep) || is_big) {
+		/*
+		 * Case 3 -- two subcases.
+		 * A. This is not really a partial operation, but an overwrite.
+		 *    Simple del and add works.
+		 * B. This is a partial and we need to construct the data that
+		 *    we are really inserting (yuck).
+		 * In both cases, we need to grab the key off the page (in
+		 * some cases we could do this outside of this routine; for
+		 * cleanliness we do it here.  If you happen to be on a big
+		 * key, this could be a performance hit).
+		 */
+		tmp.flags = 0;
+		F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL);
+		if ((ret =
+		    __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
+		    &tmp, &hcp->big_key, &hcp->big_keylen)) != 0)
+			return (ret);
+
+		type = hk->type;
+		if (dbt->doff == 0 && dbt->dlen == len) {
+			ret = __ham_del_pair(hashp, hcp);
+			if (ret == 0)
+			    ret = __ham_add_el(hashp, hcp, &tmp, dbt, type);
+		} else {					/* Case B */
+			DBT tdata;
+			tdata.flags = 0;
+			F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL);
+
+			if ((ret = __db_ret(hashp->dbp, hcp->pagep,
+			    H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data,
+			    &hcp->big_datalen)) != 0)
+				goto err;
+
+			/* Now we can delete the item. */
+			if ((ret = __ham_del_pair(hashp, hcp)) != 0) {
+				free(tdata.data);
+				goto err;
+			}
+
+			/* Now shift old data around to make room for new. */
+			if (change > 0) {
+				tdata.data = (void *)
+				    realloc(tdata.data, tdata.size + change);
+				memset((u_int8_t *)tdata.data + tdata.size,
+				    0, change);
+			}
+			if (tdata.data == NULL)
+				return (ENOMEM);
+			end = (u_int8_t *)tdata.data + tdata.size;
+
+			src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
+			if (src < end && tdata.size > dbt->doff + dbt->dlen) {
+				len = tdata.size - dbt->doff - dbt->dlen;
+				dest = src + change;
+				memmove(dest, src, len);
+			}
+			memcpy((u_int8_t *)tdata.data + dbt->doff,
+			    dbt->data, dbt->size);
+			tdata.size += change;
+
+			/* Now add the pair. */
+			ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type);
+			free(tdata.data);
+		}
+err:		free(tmp.data);
+		return (ret);
+	}
+
+	/*
+	 * Set up pointer into existing data. Do it before the log
+	 * message so we can use it inside of the log setup.
+	 */
+	beg = H_PAIRDATA(hcp->pagep, hcp->bndx)->data;
+	beg += dbt->doff;
+
+	/*
+	 * If we are going to have to move bytes at all, figure out
+	 * all the parameters here.  Then log the call before moving
+	 * anything around.
+	 */
+	if (DB_LOGGING(hashp->dbp)) {
+		old_dbt.data = beg;
+		old_dbt.size = dbt->dlen;
+		if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
+		    hashp->dbp->log_fileid, PGNO(hcp->pagep),
+		    (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep),
+		    (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0)
+			return (ret);
+
+		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
+	}
+
+	__ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize,
+	    (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt);
+
+	return (0);
+}
+
+/*
+ * Replace data on a page with new data, possibly growing or shrinking what's
+ * there.  This is called on two different occasions. On one (from replpair)
+ * we are interested in changing only the data.  On the other (from recovery)
+ * we are replacing the entire data (header and all) with a new element.  In
+ * the latter case, the off argument is negative.
+ * pagep: the page that we're changing
+ * ndx: page index of the element that is growing/shrinking.
+ * off: Offset at which we are beginning the replacement.
+ * change: the number of bytes (+ or -) that the element is growing/shrinking.
+ * dbt: the new data that gets written at beg.
+ * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
+ * PUBLIC:     int32_t,  DBT *));
+ */
+void
+__ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt)
+	PAGE *pagep;
+	size_t pgsize;
+	u_int32_t ndx;
+	int32_t off;
+	int32_t change;
+	DBT *dbt;
+{
+	db_indx_t i;
+	int32_t len;
+	u_int8_t *src, *dest;
+	int zero_me;
+
+	if (change != 0) {
+		zero_me = 0;
+		src = (u_int8_t *)(pagep) + HOFFSET(pagep);
+		if (off < 0)
+			len = pagep->inp[ndx] - HOFFSET(pagep);
+		else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) {
+			len = GET_HKEYDATA(pagep, ndx)->data +
+			    LEN_HKEYDATA(pagep, pgsize, ndx) - src;
+			zero_me = 1;
+		} else
+			len = (GET_HKEYDATA(pagep, ndx)->data + off) - src;
+		dest = src - change;
+		memmove(dest, src, len);
+		if (zero_me)
+			memset(dest + len, 0, change);
+
+		/* Now update the indices. */
+		for (i = ndx; i < NUM_ENT(pagep); i++)
+			pagep->inp[i] -= change;
+		HOFFSET(pagep) -= change;
+	}
+	if (off >= 0)
+		memcpy(GET_HKEYDATA(pagep, ndx)->data + off,
+		    dbt->data, dbt->size);
+	else
+		memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size);
+}
+
+/*
+ * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t));
+ */
+int
+__ham_split_page(hashp, obucket, nbucket)
+	HTAB *hashp;
+	u_int32_t obucket, nbucket;
+{
+	DBT key, val, page_dbt;
+	DB_ENV *dbenv;
+	DB_LSN new_lsn;
+	PAGE **pp, *old_pagep, *temp_pagep, *new_pagep;
+	db_indx_t n;
+	db_pgno_t bucket_pgno, next_pgno;
+	u_int32_t big_len, len;
+	int ret, tret;
+	void *big_buf;
+
+	dbenv = hashp->dbp->dbenv;
+	temp_pagep = old_pagep = new_pagep = NULL;
+
+	bucket_pgno = BUCKET_TO_PAGE(hashp, obucket);
+	if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0)
+		return (ret);
+	if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH,
+	    &new_pagep)) != 0)
+		goto err;
+
+	temp_pagep = hashp->split_buf;
+	memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize);
+
+	if (DB_LOGGING(hashp->dbp)) {
+		page_dbt.size = hashp->hdr->pagesize;
+		page_dbt.data = old_pagep;
+		if ((ret = __ham_splitdata_log(dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
+		    hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep),
+		    &page_dbt, &LSN(old_pagep))) != 0)
+			goto err;
+	}
+
+	P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
+	    PGNO_INVALID, 0, P_HASH);
+
+	if (DB_LOGGING(hashp->dbp))
+		LSN(old_pagep) = new_lsn;	/* Structure assignment. */
+
+	big_len = 0;
+	big_buf = NULL;
+	val.flags = key.flags = 0;
+	while (temp_pagep != NULL) {
+		for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) {
+			if ((ret =
+			    __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n),
+			    &key, &big_buf, &big_len)) != 0)
+				goto err;
+
+			if (__ham_call_hash(hashp, key.data, key.size)
+			    == obucket)
+				pp = &old_pagep;
+			else
+				pp = &new_pagep;
+
+			/*
+			 * Figure out how many bytes we need on the new
+			 * page to store the key/data pair.
+			 */
+
+			len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
+			    H_DATAINDEX(n)) +
+			    LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
+			    H_KEYINDEX(n)) +
+			    2 * sizeof(db_indx_t);
+
+			if (P_FREESPACE(*pp) < len) {
+				if (DB_LOGGING(hashp->dbp)) {
+					page_dbt.size = hashp->hdr->pagesize;
+					page_dbt.data = *pp;
+					if ((ret = __ham_splitdata_log(
+					    dbenv->lg_info,
+					    (DB_TXN *)hashp->dbp->txn,
+					    &new_lsn, 0,
+					    hashp->dbp->log_fileid, SPLITNEW,
+					    PGNO(*pp), &page_dbt,
+					    &LSN(*pp))) != 0)
+						goto err;
+					LSN(*pp) = new_lsn;
+				}
+				if ((ret = __ham_add_ovflpage(hashp,
+				    *pp, 1, pp)) != 0)
+					goto err;
+			}
+			__ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp);
+			__ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp);
+		}
+		next_pgno = NEXT_PGNO(temp_pagep);
+
+		/* Clear temp_page; if it's a link overflow page, free it. */
+		if (PGNO(temp_pagep) != bucket_pgno && (ret =
+		    __ham_del_page(hashp->dbp, temp_pagep)) != 0)
+			goto err;
+
+		if (next_pgno == PGNO_INVALID)
+			temp_pagep = NULL;
+		else if ((ret =
+		    __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0)
+			goto err;
+
+		if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) {
+			page_dbt.size = hashp->hdr->pagesize;
+			page_dbt.data = temp_pagep;
+			if ((ret = __ham_splitdata_log(dbenv->lg_info,
+			    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
+			    hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep),
+			    &page_dbt, &LSN(temp_pagep))) != 0)
+				goto err;
+			LSN(temp_pagep) = new_lsn;
+		}
+	}
+	if (big_buf != NULL)
+		free(big_buf);
+
+	/*
+	 * If the original bucket spanned multiple pages, then we've got
+	 * a pointer to a page that used to be on the bucket chain.  It
+	 * should be deleted.
+	 */
+	if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
+	    (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0)
+		goto err;
+
+	/*
+	 * Write new buckets out.
+	 */
+	if (DB_LOGGING(hashp->dbp)) {
+		page_dbt.size = hashp->hdr->pagesize;
+		page_dbt.data = old_pagep;
+		if ((ret = __ham_splitdata_log(dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
+		    hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep),
+		    &page_dbt, &LSN(old_pagep))) != 0)
+			goto err;
+		LSN(old_pagep) = new_lsn;
+
+		page_dbt.data = new_pagep;
+		if ((ret = __ham_splitdata_log(dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
+		    hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep),
+		    &page_dbt, &LSN(new_pagep))) != 0)
+			goto err;
+		LSN(new_pagep) = new_lsn;
+	}
+	ret = __ham_put_page(hashp->dbp, old_pagep, 1);
+	if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 &&
+	    ret == 0)
+		ret = tret;
+
+err:	if (0) {
+		if (old_pagep != NULL)
+			(void)__ham_put_page(hashp->dbp, old_pagep, 1);
+		if (new_pagep != NULL)
+			(void)__ham_put_page(hashp->dbp, new_pagep, 1);
+		if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
+			(void)__ham_put_page(hashp->dbp, temp_pagep, 1);
+	}
+	return (ret);
+}
+
+/*
+ * Add the given pair to the page.  The page in question may already be
+ * held (i.e. it was already gotten).  If it is, then the page is passed
+ * in via the pagep parameter.  On return, pagep will contain the page
+ * to which we just added something.  This allows us to link overflow
+ * pages and return the new page having correctly put the last page.
+ *
+ * PUBLIC: int __ham_add_el __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *,
+ * PUBLIC:     int));
+ */
+int
+__ham_add_el(hashp, hcp, key, val, type)
+	HTAB *hashp;
+	HASH_CURSOR *hcp;
+	const DBT *key, *val;
+	int type;
+{
+	DBT *pkey, *pdata, key_dbt, data_dbt;
+	DB_LSN new_lsn;
+	HOFFPAGE doff, koff;
+	db_pgno_t next_pgno;
+	u_int32_t data_size, key_size, pairsize;
+	int do_expand, is_keybig, is_databig, rectype, ret;
+	int key_type, data_type;
+
+	do_expand = 0;
+
+	if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp,
+	    hcp->seek_found_page != PGNO_INVALID ?  hcp->seek_found_page :
+	    hcp->pgno, &hcp->pagep)) != 0)
+		return (ret);
+
+	key_size = HKEYDATA_PSIZE(key->size);
+	data_size = HKEYDATA_PSIZE(val->size);
+	is_keybig = ISBIG(hashp, key->size);
+	is_databig = ISBIG(hashp, val->size);
+	if (is_keybig)
+		key_size = HOFFPAGE_PSIZE;
+	if (is_databig)
+		data_size = HOFFPAGE_PSIZE;
+
+	pairsize = key_size + data_size;
+
+	/* Advance to first page in chain with room for item. */
+	while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) !=
+	    PGNO_INVALID) {
+		/*
+		 * This may not be the end of the chain, but the pair may fit
+		 * anyway.  Check if it's a bigpair that fits or a regular
+		 * pair that fits.
+		 */
+		if (P_FREESPACE(hcp->pagep) >= pairsize)
+			break;
+		next_pgno = NEXT_PGNO(hcp->pagep);
+		if ((ret =
+		    __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * Check if we need to allocate a new page.
+	 */
+	if (P_FREESPACE(hcp->pagep) < pairsize) {
+		do_expand = 1;
+		if ((ret = __ham_add_ovflpage(hashp,
+		    hcp->pagep, 1, &hcp->pagep)) !=  0)
+			return (ret);
+		hcp->pgno = PGNO(hcp->pagep);
+	}
+
+	/*
+	 * Update cursor.
+	 */
+	hcp->bndx = H_NUMPAIRS(hcp->pagep);
+	F_CLR(hcp, H_DELETED);
+	if (is_keybig) {
+		if ((ret = __db_poff(hashp->dbp,
+		    key, &koff.pgno, __ham_overflow_page)) != 0)
+			return (ret);
+		koff.type = H_OFFPAGE;
+		koff.tlen = key->size;
+		key_dbt.data = &koff;
+		key_dbt.size = sizeof(koff);
+		pkey = &key_dbt;
+		key_type = H_OFFPAGE;
+	} else {
+		pkey = (DBT *)key;
+		key_type = H_KEYDATA;
+	}
+
+	if (is_databig) {
+		if ((ret = __db_poff(hashp->dbp,
+		    val, &doff.pgno, __ham_overflow_page)) != 0)
+			return (ret);
+		doff.type = H_OFFPAGE;
+		doff.tlen = val->size;
+		data_dbt.data = &doff;
+		data_dbt.size = sizeof(doff);
+		pdata = &data_dbt;
+		data_type = H_OFFPAGE;
+	} else {
+		pdata = (DBT *)val;
+		data_type = type;
+	}
+
+	if (DB_LOGGING(hashp->dbp)) {
+		rectype = PUTPAIR;
+		if (is_databig)
+			rectype |= PAIR_DATAMASK;
+		if (is_keybig)
+			rectype |= PAIR_KEYMASK;
+
+		if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
+		    hashp->dbp->log_fileid, PGNO(hcp->pagep),
+		    (u_int32_t)H_NUMPAIRS(hcp->pagep),
+		    &LSN(hcp->pagep), pkey, pdata)) != 0)
+			return (ret);
+
+		/* Move lsn onto page. */
+		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
+	}
+
+	__ham_putitem(hcp->pagep, pkey, key_type);
+	__ham_putitem(hcp->pagep, pdata, data_type);
+
+	/*
+	 * For splits, we are going to update item_info's page number
+	 * field, so that we can easily return to the same page the
+	 * next time we come in here.  For other operations, this shouldn't
+	 * matter, since odds are this is the last thing that happens before
+	 * we return to the user program.
+	 */
+	hcp->pgno = PGNO(hcp->pagep);
+
+	/*
+	 * XXX Maybe keep incremental numbers here
+	 */
+	if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
+		hashp->hdr->nelem++;
+
+	if (do_expand || (hashp->hdr->ffactor != 0 &&
+	    (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor))
+		F_SET(hcp, H_EXPAND);
+	return (0);
+}
+
+
+/*
+ * Special __putitem call used in splitting -- copies one entry to
+ * another.  Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
+ * H_DUPLICATE, H_OFFDUP).  Since we log splits at a high level, we
+ * do not need to do any logging here.
+ * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, int, PAGE *));
+ */
+void
+__ham_copy_item(hashp, src_page, src_ndx, dest_page)
+	HTAB *hashp;
+	PAGE *src_page;
+	int src_ndx;
+	PAGE *dest_page;
+{
+	u_int32_t len;
+	void *src, *dest;
+
+	/*
+	 * Copy the key and data entries onto this new page.
+	 */
+	src = P_ENTRY(src_page, src_ndx);
+
+	/* Set up space on dest. */
+	len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx);
+	HOFFSET(dest_page) -= len;
+	dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
+	dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
+	NUM_ENT(dest_page)++;
+
+	memcpy(dest, src, len);
+}
+
+/*
+ *
+ * Returns:
+ *      pointer on success
+ *      NULL on error
+ *
+ * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
+ */
+int
+__ham_add_ovflpage(hashp, pagep, release, pp)
+	HTAB *hashp;
+	PAGE *pagep;
+	int release;
+	PAGE **pp;
+{
+	DB_ENV *dbenv;
+	DB_LSN new_lsn;
+	PAGE *new_pagep;
+	int ret;
+
+	dbenv = hashp->dbp->dbenv;
+
+	if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0)
+		return (ret);
+
+	if (DB_LOGGING(hashp->dbp)) {
+		if ((ret = __ham_newpage_log(dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL,
+		    hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep),
+		    PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
+			return (ret);
+
+		/* Move lsn onto page. */
+		LSN(pagep) = LSN(new_pagep) = new_lsn;
+	}
+	NEXT_PGNO(pagep) = PGNO(new_pagep);
+	PREV_PGNO(new_pagep) = PGNO(pagep);
+
+	if (release)
+		ret = __ham_put_page(hashp->dbp, pagep, 1);
+
+	hashp->hash_overflows++;
+	*pp = new_pagep;
+	return (ret);
+}
+
+
+/*
+ * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
+ */
+int
+__ham_new_page(hashp, addr, type, pp)
+	HTAB *hashp;
+	u_int32_t addr, type;
+	PAGE **pp;
+{
+	PAGE *pagep;
+	int ret;
+
+	if ((ret = memp_fget(hashp->dbp->mpf,
+	    &addr, DB_MPOOL_CREATE, &pagep)) != 0)
+		return (ret);
+
+#ifdef DEBUG_SLOW
+	account_page(hashp, addr, 1);
+#endif
+	/* This should not be necessary because page-in should do it. */
+	P_INIT(pagep,
+	    hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
+
+	*pp = pagep;
+	return (0);
+}
+
+/*
+ * PUBLIC: int __ham_del_page __P((DB *, PAGE *));
+ */
+int
+__ham_del_page(dbp, pagep)
+	DB *dbp;
+	PAGE *pagep;
+{
+	DB_LSN new_lsn;
+	HTAB *hashp;
+	int ret;
+
+	hashp = (HTAB *)dbp->internal;
+	ret = 0;
+	DIRTY_META(hashp, ret);
+	if (ret != 0) {
+		if (ret != EAGAIN)
+			__db_err(hashp->dbp->dbenv,
+			    "free_ovflpage: unable to lock meta data page %s\n",
+			    strerror(ret));
+		/*
+		 * If we are going to return an error, then we should free
+		 * the page, so it doesn't stay pinned forever.
+		 */
+		(void)__ham_put_page(hashp->dbp, pagep, 0);
+		return (ret);
+	}
+
+	if (DB_LOGGING(hashp->dbp)) {
+		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO,
+		    hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed,
+		    (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
+		    &LSN(pagep), &hashp->hdr->lsn)) != 0)
+			return (ret);
+
+		hashp->hdr->lsn = new_lsn;
+		LSN(pagep) = new_lsn;
+	}
+
+#ifdef DEBUG
+	{
+		db_pgno_t __pgno;
+		DB_LSN __lsn;
+		__pgno = pagep->pgno;
+		__lsn = pagep->lsn;
+		memset(pagep, 0xff, dbp->pgsize);
+		pagep->pgno = __pgno;
+		pagep->lsn = __lsn;
+	}
+#endif
+	TYPE(pagep) = P_INVALID;
+	NEXT_PGNO(pagep) = hashp->hdr->last_freed;
+	hashp->hdr->last_freed = PGNO(pagep);
+
+	return (__ham_put_page(hashp->dbp, pagep, 1));
+}
+
+
+/*
+ * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t));
+ */
+int
+__ham_put_page(dbp, pagep, is_dirty)
+	DB *dbp;
+	PAGE *pagep;
+	int32_t is_dirty;
+{
+#ifdef DEBUG_SLOW
+	account_page((HTAB *)dbp->cookie,
+	    ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
+#endif
+	return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
+}
+
+/*
+ * __ham_dirty_page --
+ *	Mark a page dirty.
+ *
+ * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *));
+ */
+int
+__ham_dirty_page(hashp, pagep)
+	HTAB *hashp;
+	PAGE *pagep;
+{
+	return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY));
+}
+
+/*
+ * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
+ */
+int
+__ham_get_page(dbp, addr, pagep)
+	DB *dbp;
+	db_pgno_t addr;
+	PAGE **pagep;
+{
+	int ret;
+
+	ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
+#ifdef DEBUG_SLOW
+	if (*pagep != NULL)
+		account_page((HTAB *)dbp->internal, addr, 1);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
+ */
+int
+__ham_overflow_page(dbp, type, pp)
+	DB *dbp;
+	u_int32_t type;
+	PAGE **pp;
+{
+	DB_LSN *lsnp, new_lsn;
+	HTAB *hashp;
+	PAGE *p;
+	db_pgno_t new_addr, next_free, newalloc_flag;
+	u_int32_t offset, splitnum;
+	int ret;
+
+	hashp = (HTAB *)dbp->internal;
+
+	ret = 0;
+	DIRTY_META(hashp, ret);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * This routine is split up into two parts.  First we have
+	 * to figure out the address of the new page that we are
+	 * allocating.  Then we have to log the allocation.  Only
+	 * after the log do we get to complete allocation of the
+	 * new page.
+	 */
+	new_addr = hashp->hdr->last_freed;
+	if (new_addr != PGNO_INVALID) {
+		if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0)
+			return (ret);
+		next_free = NEXT_PGNO(p);
+		lsnp = &LSN(p);
+		newalloc_flag = 0;
+	} else {
+		splitnum = hashp->hdr->ovfl_point;
+		hashp->hdr->spares[splitnum]++;
+		offset = hashp->hdr->spares[splitnum] -
+		    (splitnum ? hashp->hdr->spares[splitnum - 1] : 0);
+		new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset);
+		if (new_addr > MAX_PAGES(hashp)) {
+			__db_err(hashp->dbp->dbenv, "hash: out of file pages");
+			hashp->hdr->spares[splitnum]--;
+			return (ENOMEM);
+		}
+		next_free = PGNO_INVALID;
+		p = NULL;
+		lsnp = NULL;
+		newalloc_flag = 1;
+	}
+
+	if (DB_LOGGING(hashp->dbp)) {
+		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
+		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO,
+		    hashp->dbp->log_fileid, new_addr, next_free,
+		    0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0)
+			return (ret);
+
+		hashp->hdr->lsn = new_lsn;
+		if (lsnp != NULL)
+			*lsnp = new_lsn;
+	}
+
+	if (p != NULL) {
+		/* We just took something off the free list, initialize it. */
+		hashp->hdr->last_freed = next_free;
+		P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID,
+		    PGNO_INVALID, 0, (u_int8_t)type);
+	} else {
+		/* Get the new page. */
+		if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0)
+			return (ret);
+	}
+	if (DB_LOGGING(hashp->dbp))
+		LSN(p) = new_lsn;
+
+	*pp = p;
+	return (0);
+}
+
+#ifdef DEBUG
+/*
+ * PUBLIC: #ifdef DEBUG
+ * PUBLIC: int bucket_to_page __P((HTAB *, int));
+ * PUBLIC: #endif
+ */
+int
+bucket_to_page(hashp, n)
+	HTAB *hashp;
+	int n;
+{
+	int ret_val;
+
+	ret_val = n + 1;
+	if (n != 0)
+		ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1];
+	return (ret_val);
+}
+#endif
+
+
+/*
+ * Create a bunch of overflow pages at the current split point.
+ * PUBLIC: void __ham_init_ovflpages __P((HTAB *));
+ */
+void
+__ham_init_ovflpages(hp)
+	HTAB *hp;
+{
+	DB_LSN new_lsn;
+	PAGE *p;
+	db_pgno_t last_pgno;
+	u_int32_t i, numpages;
+
+	numpages = hp->hdr->ovfl_point + 1;
+
+	last_pgno = hp->hdr->last_freed;
+	if (DB_LOGGING(hp->dbp)) {
+		(void)__ham_ovfl_log(hp->dbp->dbenv->lg_info,
+		    (DB_TXN *)hp->dbp->txn, &new_lsn, 0,
+		    hp->dbp->log_fileid, PGNO_OF(hp, hp->hdr->ovfl_point, 1),
+		    numpages, last_pgno, &hp->hdr->lsn);
+		hp->hdr->lsn = new_lsn;
+	} else
+		ZERO_LSN(new_lsn);
+
+	hp->hdr->spares[hp->hdr->ovfl_point] += numpages;
+	for (i = numpages; i > 0; i--) {
+		if (__ham_new_page(hp,
+		    PGNO_OF(hp, hp->hdr->ovfl_point, i), P_INVALID, &p) != 0)
+			break;
+		LSN(p) = new_lsn;
+		NEXT_PGNO(p) = last_pgno;
+		last_pgno = PGNO(p);
+		(void)__ham_put_page(hp->dbp, p, 1);
+	}
+	hp->hdr->last_freed = last_pgno;
+}
+
+/*
+ * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+ */
+int
+__ham_get_cpage(hashp, hcp, mode)
+	HTAB *hashp;
+	HASH_CURSOR *hcp;
+	db_lockmode_t mode;
+{
+	int ret;
+
+	if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) &&
+	    (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0)
+		return (ret);
+
+	if (hcp->pagep == NULL) {
+		if (hcp->pgno == PGNO_INVALID) {
+			hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket);
+			hcp->bndx = 0;
+		}
+
+		if ((ret =
+		    __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0)
+			return (ret);
+	}
+
+	if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
+		if ((ret =
+		    __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
+			return (ret);
+	return (0);
+}
+
+/*
+ * Get a new page at the cursor, putting the last page if necessary.
+ * If the flag is set to H_ISDUP, then we are talking about the
+ * duplicate page, not the main page.
+ * PUBLIC: int __ham_next_cpage __P((HTAB *, HASH_CURSOR *, db_pgno_t,
+ * PUBLIC:     int, int));
+ */
+int
+__ham_next_cpage(hashp, hcp, pgno, dirty, flags)
+	HTAB *hashp;
+	HASH_CURSOR *hcp;
+	db_pgno_t pgno;
+	int dirty;
+	int flags;
+{
+	PAGE *p;
+	int ret;
+
+	if (flags & H_ISDUP && hcp->dpagep != NULL &&
+	    (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0)
+		return (ret);
+	else if (!(flags & H_ISDUP) && hcp->pagep != NULL &&
+	    (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0)
+		return (ret);
+
+	if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0)
+		return (ret);
+
+	if (flags & H_ISDUP) {
+		hcp->dpagep = p;
+		hcp->dpgno = pgno;
+		hcp->dndx = 0;
+	} else {
+		hcp->pagep = p;
+		hcp->pgno = pgno;
+		hcp->bndx = 0;
+	}
+
+	return (0);
+}
+
+/*
+ * __ham_lock_bucket --
+ *	Get the lock on a particular bucket.
+ */
+static int
+__ham_lock_bucket(dbp, hcp, mode)
+	DB *dbp;
+	HASH_CURSOR *hcp;
+	db_lockmode_t mode;
+{
+	int ret;
+
+	/*
+	 * What a way to trounce on the memory system.  It might be
+	 * worth copying the lk_info into the hashp.
+	 */
+	ret = 0;
+	dbp->lock.pgno = (db_pgno_t)(hcp->bucket);
+	ret = lock_get(dbp->dbenv->lk_info,
+	    dbp->txn == NULL ?  dbp->locker : dbp->txn->txnid, 0,
+	    &dbp->lock_dbt, mode, &hcp->lock);
+
+	return (ret < 0 ? EAGAIN : ret);
+}
+
+/*
+ * __ham_dpair --
+ *	Delete a pair on a page, paying no attention to what the pair
+ *	represents.  The caller is responsible for freeing up duplicates
+ *	or offpage entries that might be referenced by this pair.
+ *
+ * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+ */
+void
+__ham_dpair(dbp, p, pndx)
+	DB *dbp;
+	PAGE *p;
+	u_int32_t pndx;
+{
+	db_indx_t delta, n;
+	u_int8_t *dest, *src;
+
+	/*
+	 * Compute "delta", the amount we have to shift all of the
+	 * offsets.  To find the delta, we just need to calculate
+	 * the size of the pair of elements we are removing.
+	 */
+	delta = H_PAIRSIZE(p, dbp->pgsize, pndx);
+
+	/*
+	 * The hard case: we want to remove something other than
+	 * the last item on the page.  We need to shift data and
+	 * offsets down.
+	 */
+	if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) {
+		/*
+		 * Move the data: src is the first occupied byte on
+		 * the page. (Length is delta.)
+		 */
+		src = (u_int8_t *)p + HOFFSET(p);
+
+		/*
+		 * Destination is delta bytes beyond src.  This might
+		 * be an overlapping copy, so we have to use memmove.
+		 */
+		dest = src + delta;
+		memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p));
+	}
+
+	/* Adjust the offsets. */
+	for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) {
+		p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta;
+		p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta;
+	}
+
+	/* Adjust page metadata. */
+	HOFFSET(p) = HOFFSET(p) + delta;
+	NUM_ENT(p) = NUM_ENT(p) - 2;
+}
+
+#ifdef DEBUG_SLOW
+static void
+account_page(hashp, pgno, inout)
+	HTAB *hashp;
+	db_pgno_t pgno;
+	int inout;
+{
+	static struct {
+		db_pgno_t pgno;
+		int times;
+	} list[100];
+	static int last;
+	int i, j;
+
+	if (inout == -1)			/* XXX: Kluge */
+		inout = 0;
+
+	/* Find page in list. */
+	for (i = 0; i < last; i++)
+		if (list[i].pgno == pgno)
+			break;
+	/* Not found. */
+	if (i == last) {
+		list[last].times = inout;
+		list[last].pgno = pgno;
+		last++;
+	}
+	list[i].times = inout;
+	if (list[i].times == 0) {
+		for (j = i; j < last; j++)
+			list[j] = list[j + 1];
+		last--;
+	}
+	for (i = 0; i < last; i++, list[i].times++)
+		if (list[i].times > 20 && !is_bitmap_pgno(hashp, list[i].pgno))
+			(void)fprintf(stderr,
+			    "Warning: pg %lu has been out for %d times\n",
+			    (u_long)list[i].pgno, list[i].times);
+}
+#endif /* DEBUG_SLOW */