about summary refs log tree commit diff
path: root/db2/btree/bt_rsearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'db2/btree/bt_rsearch.c')
-rw-r--r--db2/btree/bt_rsearch.c164
1 files changed, 86 insertions, 78 deletions
diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c
index caa6b3515e..8efe4059a8 100644
--- a/db2/btree/bt_rsearch.c
+++ b/db2/btree/bt_rsearch.c
@@ -44,7 +44,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)bt_rsearch.c	10.15 (Sleepycat) 5/6/98";
+static const char sccsid[] = "@(#)bt_rsearch.c	10.21 (Sleepycat) 12/2/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -59,39 +59,37 @@ static const char sccsid[] = "@(#)bt_rsearch.c	10.15 (Sleepycat) 5/6/98";
  * __bam_rsearch --
  *	Search a btree for a record number.
  *
- * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *));
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
  */
 int
-__bam_rsearch(dbp, recnop, flags, stop, exactp)
-	DB *dbp;
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+	DBC *dbc;
 	db_recno_t *recnop;
 	u_int32_t flags;
 	int stop, *exactp;
 {
 	BINTERNAL *bi;
-	BTREE *t;
+	CURSOR *cp;
+	DB *dbp;
 	DB_LOCK lock;
 	PAGE *h;
 	RINTERNAL *ri;
 	db_indx_t indx, top;
 	db_pgno_t pg;
 	db_recno_t i, recno, total;
-	int isappend, ret, stack;
+	int ret, stack;
 
-	t = dbp->internal;
+	dbp = dbc->dbp;
+	cp = dbc->internal;
 
-	/*
-	 * We test for groups of flags, S_APPEND is the only one that can be
-	 * OR'd into the set.  Clear it now so that the tests for equality
-	 * will work.
-	 */
-	if ((isappend = LF_ISSET(S_APPEND)) != 0)
-		LF_CLR(S_APPEND);
+	BT_STK_CLR(cp);
 
 	/*
 	 * There are several ways we search a btree tree.  The flags argument
 	 * specifies if we're acquiring read or write locks and if we are
-	 * locking pairs of pages.  See btree.h for more details.
+	 * locking pairs of pages.  In addition, if we're adding or deleting
+	 * an item, we have to lock the entire tree, regardless.  See btree.h
+	 * for more details.
 	 *
 	 * If write-locking pages, we need to know whether or not to acquire a
 	 * write lock on a page before getting it.  This depends on how deep it
@@ -102,15 +100,36 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 	 * Retrieve the root page.
 	 */
 	pg = PGNO_ROOT;
-	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT,
-	    flags == S_INSERT || flags == S_DELETE ?
-	    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
+	stack = LF_ISSET(S_STACK);
+	if ((ret = __bam_lget(dbc,
+	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 		return (ret);
-	if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-		(void)__BT_LPUT(dbp, lock);
+	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+		(void)__BT_LPUT(dbc, lock);
 		return (ret);
 	}
-	total = RE_NREC(h);
+
+	/*
+	 * Decide if we need to save this page; if we do, write lock it.
+	 * We deliberately don't lock-couple on this call.  If the tree
+	 * is tiny, i.e., one page, and two threads are busily updating
+	 * the root page, we're almost guaranteed deadlocks galore, as
+	 * each one gets a read lock and then blocks the other's attempt
+	 * for a write lock.
+	 */
+	if (!stack &&
+	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+		(void)memp_fput(dbp->mpf, h, 0);
+		(void)__BT_LPUT(dbc, lock);
+		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
+			return (ret);
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+			(void)__BT_LPUT(dbc, lock);
+			return (ret);
+		}
+		stack = 1;
+	}
 
 	/*
 	 * If appending to the tree, set the record number now -- we have the
@@ -124,7 +143,8 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 	 * for the record immediately after the last record in the tree, so do
 	 * a fast check now.
 	 */
-	if (isappend) {
+	total = RE_NREC(h);
+	if (LF_ISSET(S_APPEND)) {
 		*exactp = 0;
 		*recnop = recno = total + 1;
 	} else {
@@ -133,33 +153,14 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 			*exactp = 1;
 		else {
 			*exactp = 0;
-			if (!PAST_END_OK(flags) || recno > total + 1) {
+			if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
 				(void)memp_fput(dbp->mpf, h, 0);
-				(void)__BT_LPUT(dbp, lock);
+				(void)__BT_LPUT(dbc, lock);
 				return (DB_NOTFOUND);
 			}
 		}
 	}
 
-	/* Decide if we're building a stack based on the operation. */
-	BT_STK_CLR(t);
-	stack = flags == S_DELETE || flags == S_INSERT;
-
-	/*
-	 * Decide if we need to save this page; if we do, write lock it, and
-	 * start to build a stack.
-	 */
-	if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) {
-		(void)memp_fput(dbp->mpf, h, 0);
-		if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
-			return (ret);
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
-			(void)__BT_LPUT(dbp, lock);
-			return (ret);
-		}
-		stack = 1;
-	}
-
 	/*
 	 * !!!
 	 * Record numbers in the tree are 0-based, but the recno is
@@ -177,7 +178,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 			 * not exist if there are enough deleted records in the
 			 * page.
 			 */
-			if (recno <= NUM_ENT(h))
+			if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX)
 				for (i = recno - 1;; --i) {
 					if (B_DISSET(GET_BKEYDATA(h,
 					    i * P_INDX + O_INDX)->type))
@@ -185,10 +186,10 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 					if (i == 0)
 						break;
 				}
-			if (recno > NUM_ENT(h)) {
+			if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) {
 				*exactp = 0;
-				if (!PAST_END_OK(flags) ||
-				    recno > (db_recno_t)(NUM_ENT(h) + 1)) {
+				if (!LF_ISSET(S_PAST_EOF) || recno >
+				    (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) {
 					ret = DB_NOTFOUND;
 					goto err;
 				}
@@ -197,7 +198,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 
 			/* Correct from 1-based to 0-based for a page offset. */
 			--recno;
-			BT_STK_ENTER(t, h, recno * P_INDX, lock, ret);
+			BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret);
 			return (ret);
 		case P_IBTREE:
 			for (indx = 0, top = NUM_ENT(h);;) {
@@ -213,7 +214,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 
 			/* Correct from 1-based to 0-based for a page offset. */
 			--recno;
-			BT_STK_ENTER(t, h, recno, lock, ret);
+			BT_STK_ENTER(cp, h, recno, lock, ret);
 			return (ret);
 		case P_IRECNO:
 			for (indx = 0, top = NUM_ENT(h);;) {
@@ -232,42 +233,42 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp)
 		if (stack) {
 			/* Return if this is the lowest page wanted. */
 			if (LF_ISSET(S_PARENT) && stop == h->level) {
-				BT_STK_ENTER(t, h, indx, lock, ret);
+				BT_STK_ENTER(cp, h, indx, lock, ret);
 				return (ret);
 			}
-			BT_STK_PUSH(t, h, indx, lock, ret);
-			if (ret)
+			BT_STK_PUSH(cp, h, indx, lock, ret);
+			if (ret != 0)
 				goto err;
 
-			if ((ret = __bam_lget(dbp, 0, pg,
-			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
-			    &lock)) != 0)
+			if ((ret =
+			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
 				goto err;
 		} else {
-			(void)memp_fput(dbp->mpf, h, 0);
-
 			/*
 			 * Decide if we want to return a pointer to the next
 			 * page in the stack.  If we do, write lock it and
 			 * never unlock it.
 			 */
-			if (LF_ISSET(S_PARENT) &&
-			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1))
+			if ((LF_ISSET(S_PARENT) &&
+			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+			    (h->level - 1) == LEAFLEVEL)
 				stack = 1;
 
-			if ((ret = __bam_lget(dbp, 1, pg,
-			    LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ,
-			    &lock)) != 0)
+			(void)memp_fput(dbp->mpf, h, 0);
+
+			if ((ret =
+			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
+			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
 				goto err;
 		}
 
-		if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0)
+		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
 			goto err;
 	}
 	/* NOTREACHED */
 
-err:	BT_STK_POP(t);
-	__bam_stkrel(dbp);
+err:	BT_STK_POP(cp);
+	__bam_stkrel(dbc, 0);
 	return (ret);
 }
 
@@ -275,25 +276,29 @@ err:	BT_STK_POP(t);
  * __bam_adjust --
  *	Adjust the tree after adding or deleting a record.
  *
- * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int32_t));
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
  */
 int
-__bam_adjust(dbp, t, adjust)
-	DB *dbp;
-	BTREE *t;
+__bam_adjust(dbc, adjust)
+	DBC *dbc;
 	int32_t adjust;
 {
+	CURSOR *cp;
+	DB *dbp;
 	EPG *epg;
 	PAGE *h;
 	int ret;
 
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+
 	/* Update the record counts for the tree. */
-	for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) {
+	for (epg = cp->sp; epg <= cp->csp; ++epg) {
 		h = epg->page;
 		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
-			if (DB_LOGGING(dbp) &&
+			if (DB_LOGGING(dbc) &&
 			    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
-			    dbp->txn, &LSN(h), 0, dbp->log_fileid,
+			    dbc->txn, &LSN(h), 0, dbp->log_fileid,
 			    PGNO(h), &LSN(h), (u_int32_t)epg->indx,
 			    adjust, 1)) != 0)
 				return (ret);
@@ -317,28 +322,31 @@ __bam_adjust(dbp, t, adjust)
  * __bam_nrecs --
  *	Return the number of records in the tree.
  *
- * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *));
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
  */
 int
-__bam_nrecs(dbp, rep)
-	DB *dbp;
+__bam_nrecs(dbc, rep)
+	DBC *dbc;
 	db_recno_t *rep;
 {
+	DB *dbp;
 	DB_LOCK lock;
 	PAGE *h;
 	db_pgno_t pgno;
 	int ret;
 
+	dbp = dbc->dbp;
+
 	pgno = PGNO_ROOT;
-	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0)
+	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
 		return (ret);
-	if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
 		return (ret);
 
 	*rep = RE_NREC(h);
 
 	(void)memp_fput(dbp->mpf, h, 0);
-	(void)__BT_TLPUT(dbp, lock);
+	(void)__BT_TLPUT(dbc, lock);
 
 	return (0);
 }