about summary refs log tree commit diff
path: root/db2/btree
diff options
context:
space:
mode:
Diffstat (limited to 'db2/btree')
-rw-r--r--db2/btree/bt_cursor.c62
-rw-r--r--db2/btree/bt_delete.c15
-rw-r--r--db2/btree/bt_put.c131
-rw-r--r--db2/btree/bt_search.c14
-rw-r--r--db2/btree/bt_split.c9
-rw-r--r--db2/btree/btree_auto.c8
6 files changed, 175 insertions, 64 deletions
diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c
index e5f3faeb70..47ecd7c66d 100644
--- a/db2/btree/bt_cursor.c
+++ b/db2/btree/bt_cursor.c
@@ -8,7 +8,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_cursor.c	10.35 (Sleepycat) 10/25/97";
+static const char sccsid[] = "@(#)bt_cursor.c	10.37 (Sleepycat) 11/22/97";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -33,7 +33,7 @@ static int __bam_c_next __P((DB *, CURSOR *, int));
 static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *));
 static int __bam_c_prev __P((DB *, CURSOR *));
 static int __bam_c_put __P((DBC *, DBT *, DBT *, int));
-static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int));
+static int __bam_c_rget __P((DB *, CURSOR *, DBT *, int));
 static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *));
 
 /* Discard the current page/lock held by a cursor. */
@@ -229,7 +229,7 @@ __bam_c_del(dbc, flags)
 		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
 	else
 		B_DSET(GET_BKEYDATA(h, indx)->type);
-	(void)__bam_ca_delete(dbp, pgno, indx, NULL);
+	(void)__bam_ca_delete(dbp, pgno, indx, NULL, 0);
 
 	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
 
@@ -313,7 +313,7 @@ __bam_c_get(dbc, key, data, flags)
 	 * been rammed into the interface.
 	 */
 	if (LF_ISSET(DB_GET_RECNO)) {
-		ret = __bam_c_rget(dbp, cp, key, data, flags);
+		ret = __bam_c_rget(dbp, cp, data, flags);
 		PUTHANDLE(dbp);
 		return (ret);
 	}
@@ -441,10 +441,10 @@ err:		if (cp->page != NULL)
  *	Return the record number for a cursor.
  */
 static int
-__bam_c_rget(dbp, cp, key, data, flags)
+__bam_c_rget(dbp, cp, data, flags)
 	DB *dbp;
 	CURSOR *cp;
-	DBT *key, *data;
+	DBT *data;
 	int flags;
 {
 	BTREE *t;
@@ -1113,18 +1113,18 @@ __bam_cprint(dbp)
 
 /*
  * __bam_ca_delete --
- * 	Check if any of the cursors refer to the item we are about to delete.
- *	We'll return the number of cursors that refer to the item in question.
- *	If a cursor does refer to the item, then we set its deleted bit.
+ * 	Check if any of the cursors refer to the item we are about to delete,
+ *	returning the number of cursors that refer to the item in question.
  *
- * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *));
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int));
  */
 int
-__bam_ca_delete(dbp, pgno, indx, curs)
+__bam_ca_delete(dbp, pgno, indx, curs, key_delete)
 	DB *dbp;
 	db_pgno_t pgno;
 	u_int32_t indx;
 	CURSOR *curs;
+	int key_delete;
 {
 	DBC *dbc;
 	CURSOR *cp;
@@ -1140,22 +1140,40 @@ __bam_ca_delete(dbp, pgno, indx, curs)
 	 * It's possible for multiple cursors within the thread to have write
 	 * locks on the same page, but, cursors within a thread must be single
 	 * threaded, so all we're locking here is the cursor linked list.
-	 *
-	 * indx refers to the first of what might be a duplicate set.  The
-	 * cursor passed in is the one initiating the delete, so we don't
-	 * want to count it.
 	 */
 	DB_THREAD_LOCK(dbp);
+
 	for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue);
 	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
 		cp = (CURSOR *)dbc->internal;
-		if ((curs != cp &&
-		    cp->pgno == pgno && cp->indx == indx) ||
-		    (cp->dpgno == pgno && cp->dindx == indx)) {
-			++count;
-			F_SET(cp, C_DELETED);
-		}
+
+		/*
+		 * Optionally, a cursor passed in is the one initiating the
+		 * delete, so we don't want to count it or set its deleted
+		 * flag.  Otherwise, if a cursor refers to the item, then we
+		 * set its deleted flag.
+		 */
+		if (curs == cp)
+			continue;
+
+		/*
+		 * If we're deleting the key itself and not just one of its
+		 * duplicates, repoint the cursor to the main-page key/data
+		 * pair, everything else is about to be discarded.
+		 */
+		if (key_delete || cp->dpgno == PGNO_INVALID) {
+			if (cp->pgno == pgno && cp->indx == indx) {
+				cp->dpgno = PGNO_INVALID;
+				++count;
+				F_SET(cp, C_DELETED);
+			}
+		} else
+			if (cp->dpgno == pgno && cp->dindx == indx) {
+				++count;
+				F_SET(cp, C_DELETED);
+			}
 	}
+
 	DB_THREAD_UNLOCK(dbp);
 	return (count);
 }
@@ -1440,7 +1458,7 @@ __bam_c_physdel(dbp, cp, h)
 	 * If the item is referenced by another cursor, leave it up to that
 	 * cursor to do the delete.
 	 */
-	if (__bam_ca_delete(dbp, pgno, indx, cp) != 0)
+	if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0)
 		return (0);
 
 	/*
diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c
index 9593d0109c..dbd1995f89 100644
--- a/db2/btree/bt_delete.c
+++ b/db2/btree/bt_delete.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_delete.c	10.22 (Sleepycat) 11/2/97";
+static const char sccsid[] = "@(#)bt_delete.c	10.23 (Sleepycat) 11/22/97";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -101,17 +101,20 @@ __bam_delete(argdbp, txn, key, flags)
 	h = t->bt_csp->page;
 	indx = t->bt_csp->indx;
 
-	/* Delete the key/data pair, including any duplicates. */
+	/* Delete the key/data pair, including any on-or-off page duplicates. */
 	for (cnt = 1, i = indx;; ++cnt)
 		if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx])
 			break;
 	for (; cnt > 0; --cnt, ++t->lstat.bt_deleted)
-		if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) {
+		if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) {
+			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+				goto err;
+			if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+				goto err;
+		} else {
 			B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
 			indx += P_INDX;
-		} else if ((ret = __bam_ditem(dbp, h, indx)) != 0 ||
-		    (ret = __bam_ditem(dbp, h, indx)) != 0)
-			goto err;
+		}
 
 	/* If we're using record numbers, update internal page record counts. */
 	if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0)
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
index b3d775bb0f..3161b02b55 100644
--- a/db2/btree/bt_put.c
+++ b/db2/btree/bt_put.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_put.c	10.31 (Sleepycat) 10/26/97";
+static const char sccsid[] = "@(#)bt_put.c	10.35 (Sleepycat) 11/22/97";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -64,6 +64,7 @@ static const char sccsid[] = "@(#)bt_put.c	10.31 (Sleepycat) 10/26/97";
 #include "btree.h"
 
 static int __bam_fixed __P((BTREE *, DBT *));
+static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *));
 static int __bam_lookup __P((DB *, DBT *, int *));
 static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
 static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *));
@@ -89,7 +90,7 @@ __bam_put(argdbp, txn, key, data, flags)
 	DB *dbp;
 	PAGE *h;
 	db_indx_t indx;
-	int exact, iflags, newkey, replace, ret, stack;
+	int exact, iflags, isdeleted, newkey, replace, ret, stack;
 
 	DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags);
 
@@ -114,21 +115,25 @@ retry:	/*
 	stack = 1;
 
 	/*
-	 * If an identical key is already in the tree, and DB_NOOVERWRITE is
-	 * set, an error is returned.  If an identical key is already in the
-	 * tree and DB_NOOVERWRITE is not set, the key is either added (when
-	 * duplicates are permitted) or an error is returned.  The exception
-	 * is when the item located is referenced by a cursor and marked for
-	 * deletion, in which case we permit the overwrite and flag the cursor.
+	 * If DB_NOOVERWRITE is set and there's an identical key in the tree,
+	 * return an error unless the data item has already been marked for
+	 * deletion, or, all the remaining data items have already been marked
+	 * for deletion in the case of duplicates.  If all the data items have
+	 * been marked for deletion, we do a replace, otherwise, it has to be
+	 * a set of duplicates, and we simply append a new one to the set.
 	 */
-	replace = 0;
-	if (exact && flags == DB_NOOVERWRITE) {
-		if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) {
-			ret = DB_KEYEXIST;
+	isdeleted = replace = 0;
+	if (exact) {
+		if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0)
 			goto err;
-		}
-		replace = 1;
-		__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
+		if (isdeleted) {
+			replace = 1;
+			__bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
+		} else
+			if (flags == DB_NOOVERWRITE) {
+				ret = DB_KEYEXIST;
+				goto err;
+			}
 	}
 
 	/*
@@ -151,7 +156,7 @@ retry:	/*
 	 */
 	newkey = dbp->type == DB_BTREE && !exact;
 	if (exact) {
-		if (F_ISSET(dbp, DB_AM_DUP)) {
+		if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) {
 			/*
 			 * Make sure that we're not looking at a page of
 			 * duplicates -- if so, move to the last entry on
@@ -234,6 +239,88 @@ err:	if (stack)
 }
 
 /*
+ * __bam_isdeleted --
+ *	Return if the only remaining data item for the element has been
+ *	deleted.
+ */
+static int
+__bam_isdeleted(dbp, h, indx, isdeletedp)
+	DB *dbp;
+	PAGE *h;
+	u_int32_t indx;
+	int *isdeletedp;
+{
+	BKEYDATA *bk;
+	db_pgno_t pgno;
+	int ret;
+
+	*isdeletedp = 1;
+	for (;;) {
+		bk = GET_BKEYDATA(h, indx + O_INDX);
+		switch (B_TYPE(bk->type)) {
+		case B_KEYDATA:
+		case B_OVERFLOW:
+			if (!B_DISSET(bk->type)) {
+				*isdeletedp = 0;
+				return (0);
+			}
+			break;
+		case B_DUPLICATE:
+			/*
+			 * If the data item referencing the off-page duplicates
+			 * is flagged as deleted, we're done.  Else, we have to
+			 * walk the chain of duplicate pages.
+			 */
+			if (B_DISSET(bk->type))
+				return (0);
+			goto dupchk;
+		default:
+			return (__db_pgfmt(dbp, h->pgno));
+		}
+
+		/*
+		 * If there are no more on-page duplicate items, then every
+		 * data item for this key must have been deleted.
+		 */
+		if (indx + P_INDX >= (u_int32_t)NUM_ENT(h))
+			return (0);
+		if (h->inp[indx] != h->inp[indx + P_INDX])
+			return (0);
+
+		/* Check the next item. */
+		indx += P_INDX;
+	}
+	/* NOTREACHED */
+
+dupchk:	/* Check a chain of duplicate pages. */
+	pgno = ((BOVERFLOW *)bk)->pgno;
+	for (;;) {
+		/* Acquire the next page in the duplicate chain. */
+		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+			return (ret);
+
+		/* Check each item for a delete flag. */
+		for (indx = 0; indx < NUM_ENT(h); ++indx)
+			if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) {
+				*isdeletedp = 0;
+				goto done;
+			}
+		/*
+		 * If we reach the end of the duplicate pages, then every
+		 * item we reviewed must have been deleted.
+		 */
+		if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
+			goto done;
+
+		(void)memp_fput(dbp->mpf, h, 0);
+	}
+	/* NOTREACHED */
+
+done:	(void)memp_fput(dbp->mpf, h, 0);
+	return (0);
+}
+
+/*
  * __bam_lookup --
  *	Find the right location in the tree for the key.
  */
@@ -425,10 +512,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 		if (op == DB_CURRENT) {
 			bk = GET_BKEYDATA(h,
 			    indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
-			if (B_TYPE(bk->type) == B_OVERFLOW)
-				have_bytes = BOVERFLOW_PSIZE;
-			else
+			if (B_TYPE(bk->type) == B_KEYDATA)
 				have_bytes = BKEYDATA_PSIZE(bk->len);
+			else
+				have_bytes = BOVERFLOW_PSIZE;
 			need_bytes = 0;
 		} else {
 			have_bytes = 0;
@@ -542,7 +629,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
 			 * If we're dealing with offpage items, we have to 
 			 * delete and then re-add the item.
 			 */
-			if (bigdata || B_TYPE(bk->type) == B_OVERFLOW) {
+			if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
 				if ((ret = __bam_ditem(dbp, h, indx)) != 0)
 					return (ret);
 				break;
@@ -704,9 +791,9 @@ __bam_ritem(dbp, h, indx, data)
 {
 	BKEYDATA *bk;
 	DBT orig, repl;
-	db_indx_t lo, ln, min, off, prefix, suffix;
+	db_indx_t cnt, lo, ln, min, off, prefix, suffix;
 	int32_t nbytes;
-	int cnt, ret;
+	int ret;
 	u_int8_t *p, *t;
 
 	/*
diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c
index a21a8208bc..c39c9af322 100644
--- a/db2/btree/bt_search.c
+++ b/db2/btree/bt_search.c
@@ -47,7 +47,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_search.c	10.8 (Sleepycat) 10/25/97";
+static const char sccsid[] = "@(#)bt_search.c	10.9 (Sleepycat) 11/18/97";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -119,12 +119,20 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
 		return (ret);
 	}
 
-	/* Decide if we need to save this page; if we do, write lock it. */
+	/*
+	 * Decide if we need to save this page; if we do, write lock it.
+	 * We deliberately don't lock-couple on this call.  If the tree
+	 * is tiny, i.e., one page, and two threads are busily updating
+	 * the root page, we're almost guaranteed deadlocks galore, as
+	 * each one gets a read lock and then blocks the other's attempt
+	 * for a write lock.
+	 */
 	if (!stack &&
 	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
 	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
 		(void)memp_fput(dbp->mpf, h, 0);
-		if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
+		(void)__BT_LPUT(dbp, lock);
+		if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
 			return (ret);
 		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
 			(void)__BT_LPUT(dbp, lock);
diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c
index bc09131b00..219d486dc5 100644
--- a/db2/btree/bt_split.c
+++ b/db2/btree/bt_split.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_split.c	10.17 (Sleepycat) 11/2/97";
+static const char sccsid[] = "@(#)bt_split.c	10.18 (Sleepycat) 11/23/97";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -396,14 +396,14 @@ __bam_broot(dbp, rootp, lp, rp)
 	 * The btree comparison code guarantees that the left-most key on any
 	 * level of the tree is never used, so it doesn't need to be filled in.
 	 */
+	memset(&bi, 0, sizeof(bi));
 	bi.len = 0;
 	B_TSET(bi.type, B_KEYDATA, 0);
 	bi.pgno = lp->pgno;
 	if (F_ISSET(dbp, DB_BT_RECNUM)) {
 		bi.nrecs = __bam_total(lp);
 		RE_NREC_SET(rootp, bi.nrecs);
-	} else
-		bi.nrecs = 0;
+	}
 	hdr.data = &bi;
 	hdr.size = SSZA(BINTERNAL, data);
 	if ((ret =
@@ -591,6 +591,7 @@ __bam_pinsert(dbp, parent, lchild, rchild)
 			return (DB_NEEDSPLIT);
 
 		/* Add a new record for the right page. */
+		memset(&bi, 0, sizeof(bi));
 		bi.len = child_bi->len;
 		B_TSET(bi.type, child_bi->type, 0);
 		bi.pgno = rchild->pgno;
@@ -640,6 +641,7 @@ noprefix:			nksize = child_bk->len;
 			if (P_FREESPACE(ppage) < nbytes)
 				return (DB_NEEDSPLIT);
 
+			memset(&bi, 0, sizeof(bi));
 			bi.len = nksize;
 			B_TSET(bi.type, child_bk->type, 0);
 			bi.pgno = rchild->pgno;
@@ -661,6 +663,7 @@ noprefix:			nksize = child_bk->len;
 			if (P_FREESPACE(ppage) < nbytes)
 				return (DB_NEEDSPLIT);
 
+			memset(&bi, 0, sizeof(bi));
 			bi.len = BOVERFLOW_SIZE;
 			B_TSET(bi.type, child_bk->type, 0);
 			bi.pgno = rchild->pgno;
diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c
index 45232bbc41..18b9b34975 100644
--- a/db2/btree/btree_auto.c
+++ b/db2/btree/btree_auto.c
@@ -100,7 +100,6 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_pg_alloc_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -265,7 +264,6 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_pg_free_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -460,7 +458,6 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_split_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_split_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -657,7 +654,6 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_rsplit_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -836,7 +832,6 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_adj_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_adj_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -995,7 +990,6 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_cadjust_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -1145,7 +1139,6 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_cdel_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;
@@ -1329,7 +1322,6 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags,
  * PUBLIC: int __bam_repl_print
  * PUBLIC:    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
  */
-
 int
 __bam_repl_print(notused1, dbtp, lsnp, notused3, notused4)
 	DB_LOG *notused1;