diff options
Diffstat (limited to 'db2/hash/hash_page.c')
-rw-r--r-- | db2/hash/hash_page.c | 1775 |
1 file changed, 1775 insertions, 0 deletions
diff --git a/db2/hash/hash_page.c b/db2/hash/hash_page.c new file mode 100644 index 0000000000..68c31b14f9 --- /dev/null +++ b/db2/hash/hash_page.c @@ -0,0 +1,1775 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_page.c 10.18 (Sleepycat) 8/21/97"; +#endif /* not lint */ + + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Page manipulation for hashing package. + * + * ROUTINES: + * + * External + * __get_page + * __add_ovflpage + * __overflow_page + * Internal + * open_temp + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" + +static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t)); + +#ifdef DEBUG_SLOW +static void account_page(HTAB *, db_pgno_t, int); +#endif + +/* + * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + db_pgno_t next_pgno; + int ret; + + if (F_ISSET(cursorp, H_DELETED)) + return (EINVAL); + F_CLR(cursorp, H_OK | H_NOMORE); + + /* Check if we need to get a page for this cursor. */ + if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + + /* Check if we are looking for space in which to insert an item. 
*/ + if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID + && cursorp->seek_size < P_FREESPACE(cursorp->pagep)) + cursorp->seek_found_page = cursorp->pgno; + + /* Check if we need to go on to the next page. */ + if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID) + /* + * ISDUP is set, and offset is at the beginning of the datum. + * We need to grab the length of the datum, then set the datum + * pointer to be the beginning of the datum. + */ + memcpy(&cursorp->dup_len, + H_PAIRDATA(cursorp->pagep, cursorp->bndx)->data + + cursorp->dup_off, sizeof(db_indx_t)); + else if (F_ISSET(cursorp, H_ISDUP)) { + /* Make sure we're not about to run off the page. */ + if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp, + cursorp->dpgno, &cursorp->dpagep)) != 0) + return (ret); + + if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) { + if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) { + if ((ret = __ham_put_page(hashp->dbp, + cursorp->dpagep, 0)) != 0) + return (ret); + F_CLR(cursorp, H_ISDUP); + cursorp->dpagep = NULL; + cursorp->dpgno = PGNO_INVALID; + cursorp->dndx = NDX_INVALID; + cursorp->bndx++; + } else if ((ret = __ham_next_cpage(hashp, cursorp, + NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + return (ret); + } + } + + if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) { + /* Fetch next page. 
*/ + if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) { + F_SET(cursorp, H_NOMORE); + if (cursorp->dpagep != NULL && + (ret = __ham_put_page(hashp->dbp, + cursorp->dpagep, 0)) != 0) + return (ret); + cursorp->dpgno = PGNO_INVALID; + return (DB_NOTFOUND); + } + next_pgno = NEXT_PGNO(cursorp->pagep); + cursorp->bndx = 0; + if ((ret = __ham_next_cpage(hashp, + cursorp, next_pgno, 0, 0)) != 0) + return (ret); + } + + F_SET(cursorp, H_OK); + return (0); +} + +/* + * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *)); + */ +int +__ham_item_reset(hashp, cursorp) + HTAB *hashp; + HASH_CURSOR *cursorp; +{ + int ret; + + if (cursorp->pagep) + ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0); + else + ret = 0; + + __ham_item_init(cursorp); + return (ret); +} + +/* + * PUBLIC: void __ham_item_init __P((HASH_CURSOR *)); + */ +void +__ham_item_init(cursorp) + HASH_CURSOR *cursorp; +{ + cursorp->pagep = NULL; + cursorp->bucket = BUCKET_INVALID; + cursorp->lock = 0; + cursorp->bndx = NDX_INVALID; + cursorp->pgno = PGNO_INVALID; + cursorp->dpgno = PGNO_INVALID; + cursorp->dndx = NDX_INVALID; + cursorp->dpagep = NULL; + cursorp->flags = 0; + cursorp->seek_size = 0; + cursorp->seek_found_page = PGNO_INVALID; +} + +/* + * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int)); + */ +int +__ham_item_done(hashp, cursorp, dirty) + HTAB *hashp; + HASH_CURSOR *cursorp; + int dirty; +{ + int ret, t_ret; + + t_ret = ret = 0; + + if (cursorp->pagep) + ret = __ham_put_page(hashp->dbp, cursorp->pagep, + dirty && cursorp->dpagep == NULL); + cursorp->pagep = NULL; + + if (cursorp->dpagep) + t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty); + cursorp->dpagep = NULL; + + if (ret == 0 && t_ret != 0) + ret = t_ret; + + /* + * If we are running with transactions, then we must + * not relinquish locks explicitly. 
+ */ + if (cursorp->lock && hashp->dbp->txn == NULL) + t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock); + cursorp->lock = 0; + + + /* + * We don't throw out the page number since we might want to + * continue getting on this page. + */ + return (ret != 0 ? ret : t_ret); +} + +/* + * Returns the last item in a bucket. + * + * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_last(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + int ret; + + if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + return (ret); + + cursorp->bucket = hashp->hdr->max_bucket; + F_SET(cursorp, H_OK); + return (__ham_item_prev(hashp, cursorp, mode)); +} +/* + * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_first(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + int ret; + + if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + return (ret); + F_SET(cursorp, H_OK); + cursorp->bucket = 0; + return (__ham_item_next(hashp, cursorp, mode)); +} + +/* + * Returns a pointer to key/data pair on a page. In the case of bigkeys, + * just returns the page number and index of the bigkey pointer pair. + * + * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_prev(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + db_pgno_t next_pgno; + int ret; + + /* + * There are N cases for backing up in a hash file. + * Case 1: In the middle of a page, no duplicates, just dec the index. + * Case 2: In the middle of a duplicate set, back up one. + * Case 3: At the beginning of a duplicate set, get out of set and + * back up to next key. + * Case 4: At the beginning of a page; go to previous page. + * Case 5: At the beginning of a bucket; go to prev bucket. + */ + F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED); + + /* + * First handle the duplicates. 
Either you'll get the key here + * or you'll exit the duplicate set and drop into the code below + * to handle backing up through keys. + */ + if (F_ISSET(cursorp, H_ISDUP)) { + if (cursorp->dpgno == PGNO_INVALID) { + /* Duplicates are on-page. */ + if (cursorp->dup_off != 0) + if ((ret = __ham_get_cpage(hashp, + cursorp, mode)) != 0) + return (ret); + else { + HASH_CURSOR *h; + h = cursorp; + memcpy(&h->dup_len, + H_PAIRDATA(h->pagep, h->bndx)->data + + h->dup_off - sizeof(db_indx_t), + sizeof(db_indx_t)); + cursorp->dup_off -= + DUP_SIZE(cursorp->dup_len); + cursorp->dndx--; + return (__ham_item(hashp, + cursorp, mode)); + } + } else if (cursorp->dndx > 0) { /* Duplicates are off-page. */ + cursorp->dndx--; + return (__ham_item(hashp, cursorp, mode)); + } else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) { + F_CLR(cursorp, H_ISDUP); /* End of dups */ + cursorp->dpgno = PGNO_INVALID; + if (cursorp->dpagep != NULL) + (void)__ham_put_page(hashp->dbp, + cursorp->dpagep, 0); + cursorp->dpagep = NULL; + } else if ((ret = __ham_next_cpage(hashp, cursorp, + PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + return (ret); + else { + cursorp->dndx = NUM_ENT(cursorp->pagep) - 1; + return (__ham_item(hashp, cursorp, mode)); + } + } + + /* + * If we get here, we are not in a duplicate set, and just need + * to back up the cursor. There are still three cases: + * midpage, beginning of page, beginning of bucket. + */ + + if (cursorp->bndx == 0) { /* Beginning of page. */ + if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + cursorp->pgno = PREV_PGNO(cursorp->pagep); + if (cursorp->pgno == PGNO_INVALID) { + /* Beginning of bucket. 
*/ + F_SET(cursorp, H_NOMORE); + return (DB_NOTFOUND); + } else if ((ret = __ham_next_cpage(hashp, + cursorp, cursorp->pgno, 0, 0)) != 0) + return (ret); + else + cursorp->bndx = H_NUMPAIRS(cursorp->pagep); + } + + /* + * Either we've got the cursor set up to be decremented, or we + * have to find the end of a bucket. + */ + if (cursorp->bndx == NDX_INVALID) { + if (cursorp->pagep == NULL) + next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket); + else + goto got_page; + + do { + if ((ret = __ham_next_cpage(hashp, + cursorp, next_pgno, 0, 0)) != 0) + return (ret); +got_page: next_pgno = NEXT_PGNO(cursorp->pagep); + cursorp->bndx = H_NUMPAIRS(cursorp->pagep); + } while (next_pgno != PGNO_INVALID); + + if (cursorp->bndx == 0) { + /* Bucket was empty. */ + F_SET(cursorp, H_NOMORE); + return (DB_NOTFOUND); + } + } + + cursorp->bndx--; + + return (__ham_item(hashp, cursorp, mode)); +} + +/* + * Sets the cursor to the next key/data pair on a page. + * + * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_next(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + /* + * Deleted on-page duplicates are a weird case. If we delete the last + * one, then our cursor is at the very end of a duplicate set and + * we actually need to go on to the next key. 
+ */ + if (F_ISSET(cursorp, H_DELETED)) { + if (cursorp->bndx != NDX_INVALID && + F_ISSET(cursorp, H_ISDUP) && + cursorp->dpgno == PGNO_INVALID && + cursorp->dup_tlen == cursorp->dup_off) { + F_CLR(cursorp, H_ISDUP); + cursorp->dpgno = PGNO_INVALID; + cursorp->bndx++; + } + F_CLR(cursorp, H_DELETED); + } else if (cursorp->bndx == NDX_INVALID) { + cursorp->bndx = 0; + cursorp->dpgno = PGNO_INVALID; + F_CLR(cursorp, H_ISDUP); + } else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID) + cursorp->dndx++; + else if (F_ISSET(cursorp, H_ISDUP)) { + cursorp->dndx++; + cursorp->dup_off += DUP_SIZE(cursorp->dup_len); + if (cursorp->dup_off >= cursorp->dup_tlen) { + F_CLR(cursorp, H_ISDUP); + cursorp->dpgno = PGNO_INVALID; + cursorp->bndx++; + } + } else + cursorp->bndx++; + + return (__ham_item(hashp, cursorp, mode)); +} + +/* + * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int)); + * + * This is a little bit sleazy in that we're overloading the meaning + * of the H_OFFPAGE type here. When we recover deletes, we have the + * entire entry instead of having only the DBT, so we'll pass type + * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing + * an H_KEYDATA around it. + */ +void +__ham_putitem(p, dbt, type) + PAGE *p; + const DBT *dbt; + int type; +{ + u_int16_t n, off; + + n = NUM_ENT(p); + + /* Put the item element on the page. */ + if (type == H_OFFPAGE) { + off = HOFFSET(p) - dbt->size; + HOFFSET(p) = p->inp[n] = off; + memcpy(P_ENTRY(p, n), dbt->data, dbt->size); + } else { + off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size); + HOFFSET(p) = p->inp[n] = off; + PUT_HKEYDATA(GET_HKEYDATA(p, n), dbt->data, dbt->size, type); + } + + /* Adjust page info. */ + NUM_ENT(p) += 1; +} + + +/* + * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *)); + * XXX TODO: if the item is an offdup, delete the other pages and + * then remove the pair. If the offpage page is 0, then you can + * just remove the pair. 
+ */ +int +__ham_del_pair(hashp, cursorp) + HTAB *hashp; + HASH_CURSOR *cursorp; +{ + DBT data_dbt, key_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn, *n_lsn; + PAGE *p; + db_indx_t ndx; + db_pgno_t chg_pgno, pgno; + int ret, tret; + + dbenv = hashp->dbp->dbenv; + ndx = cursorp->bndx; + if (cursorp->pagep == NULL && (ret = + __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0) + return (ret); + + p = cursorp->pagep; + + /* + * We optimize for the normal case which is when neither the key nor + * the data are large. In this case, we write a single log record + * and do the delete. If either is large, we'll call __big_delete + * to remove the big item and then update the page to remove the + * entry referring to the big item. + */ + ret = 0; + if (H_PAIRKEY(p, ndx)->type == H_OFFPAGE) { + memcpy(&pgno, (u_int8_t *)GET_HOFFPAGE(p, H_KEYINDEX(ndx)) + + SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t)); + ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + } + + if (ret == 0) + switch (H_PAIRDATA(p, ndx)->type) { + case H_OFFPAGE: + memcpy(&pgno, + (u_int8_t *)GET_HOFFPAGE(p, H_DATAINDEX(ndx)) + + SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t)); + ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + break; + case H_OFFDUP: + memcpy(&pgno, + (u_int8_t *)GET_HOFFDUP(p, H_DATAINDEX(ndx)) + + SSZ(HOFFDUP, pgno), sizeof(db_pgno_t)); + ret = __db_ddup(hashp->dbp, pgno, __ham_del_page); + break; + } + + if (ret) + return (ret); + + /* Now log the delete off this page. */ + if (DB_LOGGING(hashp->dbp)) { + key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx)); + key_dbt.size = + LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx)); + data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx)); + data_dbt.size = + LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx)); + + if ((ret = __ham_insdel_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR, + hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx, + &LSN(p), &key_dbt, &data_dbt)) != 0) + return (ret); + + /* Move lsn onto page. 
*/ + LSN(p) = new_lsn; + } + + __ham_dpair(hashp->dbp, p, ndx); + + /* + * If we are locking, we will not maintain this. + * XXXX perhaps we can retain incremental numbers and apply them + * later. + */ + if (!F_ISSET(hashp->dbp, DB_AM_LOCKING)) + --hashp->hdr->nelem; + + /* + * Check if the page is empty. There are two cases. If it's + * empty and it's not the first chain in the bucket (i.e., the + * bucket page) then we can simply remove it. If it is the first + * chain in the bucket, then we need to copy the second page into + * it and remove the second page. + */ + if (NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID && + NEXT_PGNO(p) != PGNO_INVALID) { + PAGE *n_pagep, *nn_pagep; + db_pgno_t tmp_pgno; + + /* + * First page in chain is empty and we know that there + * are more pages in the chain. + * XXX Need to log this. + */ + if ((ret = + __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0) + return (ret); + + if (NEXT_PGNO(n_pagep) != PGNO_INVALID) { + if ((ret = + __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep), + &nn_pagep)) != 0) { + (void) __ham_put_page(hashp->dbp, n_pagep, 0); + return (ret); + } + PREV_PGNO(nn_pagep) = PGNO(p); + (void)__ham_put_page(hashp->dbp, nn_pagep, 1); + } + + tmp_pgno = PGNO(p); + memcpy(p, n_pagep, hashp->hdr->pagesize); + PGNO(p) = tmp_pgno; + PREV_PGNO(p) = PGNO_INVALID; + + /* + * Cursor is advanced to the beginning of the next page. 
+ */ + cursorp->bndx = NDX_INVALID; + cursorp->pgno = PGNO(p); + chg_pgno = PGNO(p); + if ((ret = __ham_dirty_page(hashp, p)) != 0 || + (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0) + return (ret); + } else if (NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) { + PAGE *n_pagep, *p_pagep; + + if ((ret = + __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0) + return (ret); + + if (NEXT_PGNO(p) != PGNO_INVALID) { + if ((ret = __ham_get_page(hashp->dbp, + NEXT_PGNO(p), &n_pagep)) != 0) { + (void)__ham_put_page(hashp->dbp, p_pagep, 0); + return (ret); + } + n_lsn = &LSN(n_pagep); + } else { + n_pagep = NULL; + n_lsn = NULL; + } + + NEXT_PGNO(p_pagep) = NEXT_PGNO(p); + if (n_pagep != NULL) + PREV_PGNO(n_pagep) = PGNO(p_pagep); + + if (DB_LOGGING(hashp->dbp)) { + if ((ret = __ham_newpage_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL, + hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep), + PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(p_pagep) = new_lsn; /* Structure assignment. */ + if (n_pagep) + LSN(n_pagep) = new_lsn; + LSN(p) = new_lsn; + } + cursorp->pgno = NEXT_PGNO(p); + cursorp->bndx = 0; + /* + * Since we are about to delete the cursor page and we have + * just moved the cursor, we need to make sure that the + * old page pointer isn't left hanging around in the cursor. + */ + cursorp->pagep = NULL; + chg_pgno = PGNO(p); + ret = __ham_del_page(hashp->dbp, p); + if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 && + ret == 0) + ret = tret; + if (n_pagep != NULL && + (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 && + ret == 0) + ret = tret; + if (ret != 0) + return (ret); + } else { + /* + * Mark item deleted so that we don't try to return it, and + * so that we update the cursor correctly on the next call + * to next. 
+ */ + F_SET(cursorp, H_DELETED); + chg_pgno = cursorp->pgno; + ret = __ham_dirty_page(hashp, p); + } + __ham_c_update(hashp, cursorp, chg_pgno, 0, 0, 0); + + F_CLR(cursorp, H_OK); + return (ret); +} +/* + * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); + * Given the key data indicated by the cursor, replace part/all of it + * according to the fields in the dbt. + */ +int +__ham_replpair(hashp, hcp, dbt, make_dup) + HTAB *hashp; + HASH_CURSOR *hcp; + DBT *dbt; + u_int32_t make_dup; +{ + DBT old_dbt, tmp; + DB_LSN new_lsn; + HKEYDATA *hk; + u_int32_t len; + int32_t change; + int is_big, ret, type; + u_int8_t *beg, *dest, *end, *src; + + /* + * Big item replacements are handled in generic code. + * Items that fit on the current page fall into 4 classes. + * 1. On-page element, same size + * 2. On-page element, new is bigger (fits) + * 3. On-page element, new is bigger (does not fit) + * 4. On-page element, old is bigger + * Numbers 1, 2, and 4 are essentially the same (and should + * be the common case). We handle case 3 as a delete and + * add. + */ + + /* + * We need to compute the number of bytes that we are adding or + * removing from the entry. Normally, we can simply substract + * the number of bytes we are replacing (dbt->dlen) from the + * number of bytes we are inserting (dbt->size). However, if + * we are doing a partial put off the end of a record, then this + * formula doesn't work, because we are essentially adding + * new bytes. + */ + change = dbt->size - dbt->dlen; + + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + is_big = hk->type == H_OFFPAGE; + + if (is_big) + memcpy(&len, (u_int8_t *)hk + SSZ(HOFFPAGE, tlen), + sizeof(u_int32_t)); + else + len = LEN_HKEYDATA(hcp->pagep, + hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx)); + + if (dbt->doff + dbt->dlen > len) + change += dbt->doff + dbt->dlen - len; + + + if (change > (int)P_FREESPACE(hcp->pagep) || is_big) { + /* + * Case 3 -- two subcases. + * A. 
This is not really a partial operation, but an overwrite. + * Simple del and add works. + * B. This is a partial and we need to construct the data that + * we are really inserting (yuck). + * In both cases, we need to grab the key off the page (in + * some cases we could do this outside of this routine; for + * cleanliness we do it here. If you happen to be on a big + * key, this could be a performance hit). + */ + tmp.flags = 0; + F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL); + if ((ret = + __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx), + &tmp, &hcp->big_key, &hcp->big_keylen)) != 0) + return (ret); + + type = hk->type; + if (dbt->doff == 0 && dbt->dlen == len) { + ret = __ham_del_pair(hashp, hcp); + if (ret == 0) + ret = __ham_add_el(hashp, hcp, &tmp, dbt, type); + } else { /* Case B */ + DBT tdata; + tdata.flags = 0; + F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL); + + if ((ret = __db_ret(hashp->dbp, hcp->pagep, + H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data, + &hcp->big_datalen)) != 0) + goto err; + + /* Now we can delete the item. */ + if ((ret = __ham_del_pair(hashp, hcp)) != 0) { + free(tdata.data); + goto err; + } + + /* Now shift old data around to make room for new. */ + if (change > 0) { + tdata.data = (void *) + realloc(tdata.data, tdata.size + change); + memset((u_int8_t *)tdata.data + tdata.size, + 0, change); + } + if (tdata.data == NULL) + return (ENOMEM); + end = (u_int8_t *)tdata.data + tdata.size; + + src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen; + if (src < end && tdata.size > dbt->doff + dbt->dlen) { + len = tdata.size - dbt->doff - dbt->dlen; + dest = src + change; + memmove(dest, src, len); + } + memcpy((u_int8_t *)tdata.data + dbt->doff, + dbt->data, dbt->size); + tdata.size += change; + + /* Now add the pair. */ + ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type); + free(tdata.data); + } +err: free(tmp.data); + return (ret); + } + + /* + * Set up pointer into existing data. 
Do it before the log + * message so we can use it inside of the log setup. + */ + beg = H_PAIRDATA(hcp->pagep, hcp->bndx)->data; + beg += dbt->doff; + + /* + * If we are going to have to move bytes at all, figure out + * all the parameters here. Then log the call before moving + * anything around. + */ + if (DB_LOGGING(hashp->dbp)) { + old_dbt.data = beg; + old_dbt.size = dbt->dlen; + if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, PGNO(hcp->pagep), + (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep), + (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0) + return (ret); + + LSN(hcp->pagep) = new_lsn; /* Structure assignment. */ + } + + __ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize, + (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt); + + return (0); +} + +/* + * Replace data on a page with new data, possibly growing or shrinking what's + * there. This is called on two different occasions. On one (from replpair) + * we are interested in changing only the data. On the other (from recovery) + * we are replacing the entire data (header and all) with a new element. In + * the latter case, the off argument is negative. + * pagep: the page that we're changing + * ndx: page index of the element that is growing/shrinking. + * off: Offset at which we are beginning the replacement. + * change: the number of bytes (+ or -) that the element is growing/shrinking. + * dbt: the new data that gets written at beg. 
+ * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t, + * PUBLIC: int32_t, DBT *)); + */ +void +__ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt) + PAGE *pagep; + size_t pgsize; + u_int32_t ndx; + int32_t off; + int32_t change; + DBT *dbt; +{ + db_indx_t i; + int32_t len; + u_int8_t *src, *dest; + int zero_me; + + if (change != 0) { + zero_me = 0; + src = (u_int8_t *)(pagep) + HOFFSET(pagep); + if (off < 0) + len = pagep->inp[ndx] - HOFFSET(pagep); + else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) { + len = GET_HKEYDATA(pagep, ndx)->data + + LEN_HKEYDATA(pagep, pgsize, ndx) - src; + zero_me = 1; + } else + len = (GET_HKEYDATA(pagep, ndx)->data + off) - src; + dest = src - change; + memmove(dest, src, len); + if (zero_me) + memset(dest + len, 0, change); + + /* Now update the indices. */ + for (i = ndx; i < NUM_ENT(pagep); i++) + pagep->inp[i] -= change; + HOFFSET(pagep) -= change; + } + if (off >= 0) + memcpy(GET_HKEYDATA(pagep, ndx)->data + off, + dbt->data, dbt->size); + else + memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size); +} + +/* + * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t)); + */ +int +__ham_split_page(hashp, obucket, nbucket) + HTAB *hashp; + u_int32_t obucket, nbucket; +{ + DBT key, val, page_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn; + PAGE **pp, *old_pagep, *temp_pagep, *new_pagep; + db_indx_t n; + db_pgno_t bucket_pgno, next_pgno; + u_int32_t big_len, len; + int ret, tret; + void *big_buf; + + dbenv = hashp->dbp->dbenv; + temp_pagep = old_pagep = new_pagep = NULL; + + bucket_pgno = BUCKET_TO_PAGE(hashp, obucket); + if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0) + return (ret); + if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH, + &new_pagep)) != 0) + goto err; + + temp_pagep = hashp->split_buf; + memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize); + + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = 
old_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep), + &page_dbt, &LSN(old_pagep))) != 0) + goto err; + } + + P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID, + PGNO_INVALID, 0, P_HASH); + + if (DB_LOGGING(hashp->dbp)) + LSN(old_pagep) = new_lsn; /* Structure assignment. */ + + big_len = 0; + big_buf = NULL; + val.flags = key.flags = 0; + while (temp_pagep != NULL) { + for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) { + if ((ret = + __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n), + &key, &big_buf, &big_len)) != 0) + goto err; + + if (__ham_call_hash(hashp, key.data, key.size) + == obucket) + pp = &old_pagep; + else + pp = &new_pagep; + + /* + * Figure out how many bytes we need on the new + * page to store the key/data pair. + */ + + len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + H_DATAINDEX(n)) + + LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + H_KEYINDEX(n)) + + 2 * sizeof(db_indx_t); + + if (P_FREESPACE(*pp) < len) { + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = *pp; + if ((ret = __ham_splitdata_log( + dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, + &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, + PGNO(*pp), &page_dbt, + &LSN(*pp))) != 0) + goto err; + LSN(*pp) = new_lsn; + } + if ((ret = __ham_add_ovflpage(hashp, + *pp, 1, pp)) != 0) + goto err; + } + __ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp); + __ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp); + } + next_pgno = NEXT_PGNO(temp_pagep); + + /* Clear temp_page; if it's a link overflow page, free it. 
*/ + if (PGNO(temp_pagep) != bucket_pgno && (ret = + __ham_del_page(hashp->dbp, temp_pagep)) != 0) + goto err; + + if (next_pgno == PGNO_INVALID) + temp_pagep = NULL; + else if ((ret = + __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0) + goto err; + + if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = temp_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep), + &page_dbt, &LSN(temp_pagep))) != 0) + goto err; + LSN(temp_pagep) = new_lsn; + } + } + if (big_buf != NULL) + free(big_buf); + + /* + * If the original bucket spanned multiple pages, then we've got + * a pointer to a page that used to be on the bucket chain. It + * should be deleted. + */ + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno && + (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0) + goto err; + + /* + * Write new buckets out. + */ + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = old_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep), + &page_dbt, &LSN(old_pagep))) != 0) + goto err; + LSN(old_pagep) = new_lsn; + + page_dbt.data = new_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep), + &page_dbt, &LSN(new_pagep))) != 0) + goto err; + LSN(new_pagep) = new_lsn; + } + ret = __ham_put_page(hashp->dbp, old_pagep, 1); + if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 && + ret == 0) + ret = tret; + +err: if (0) { + if (old_pagep != NULL) + (void)__ham_put_page(hashp->dbp, old_pagep, 1); + if (new_pagep != NULL) + (void)__ham_put_page(hashp->dbp, new_pagep, 1); + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno) + (void)__ham_put_page(hashp->dbp, temp_pagep, 1); + } + return 
(ret);
}

/*
 * Add the given pair to the page.  The page in question may already be
 * held (i.e. it was already gotten).  If it is, then the page is passed
 * in via the pagep parameter.  On return, pagep will contain the page
 * to which we just added something.  This allows us to link overflow
 * pages and return the new page having correctly put the last page.
 *
 * key/val: the pair to insert; an item larger than ISBIG() is moved
 * offpage and represented on-page by an HOFFPAGE stub.
 * type: the on-page item type used for the data when it is not big.
 *
 * Returns 0 on success, an errno-style error otherwise.
 *
 * PUBLIC: int __ham_add_el __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *,
 * PUBLIC: int));
 */
int
__ham_add_el(hashp, hcp, key, val, type)
	HTAB *hashp;
	HASH_CURSOR *hcp;
	const DBT *key, *val;
	int type;
{
	DBT *pkey, *pdata, key_dbt, data_dbt;
	DB_LSN new_lsn;
	HOFFPAGE doff, koff;
	db_pgno_t next_pgno;
	u_int32_t data_size, key_size, pairsize;
	int do_expand, is_keybig, is_databig, rectype, ret;
	int key_type, data_type;

	do_expand = 0;

	/*
	 * If no page is pinned yet, fetch the page a previous lookup found
	 * (seek_found_page) if there is one, else the cursor's current page.
	 */
	if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp,
	    hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page :
	    hcp->pgno, &hcp->pagep)) != 0)
		return (ret);

	/*
	 * Compute the on-page size of each element; a big item only costs
	 * an HOFFPAGE stub on this page, its body lives offpage.
	 */
	key_size = HKEYDATA_PSIZE(key->size);
	data_size = HKEYDATA_PSIZE(val->size);
	is_keybig = ISBIG(hashp, key->size);
	is_databig = ISBIG(hashp, val->size);
	if (is_keybig)
		key_size = HOFFPAGE_PSIZE;
	if (is_databig)
		data_size = HOFFPAGE_PSIZE;

	pairsize = key_size + data_size;

	/* Advance to first page in chain with room for item. */
	while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) !=
	    PGNO_INVALID) {
		/*
		 * This may not be the end of the chain, but the pair may fit
		 * anyway.  Check if it's a bigpair that fits or a regular
		 * pair that fits.
		 */
		if (P_FREESPACE(hcp->pagep) >= pairsize)
			break;
		next_pgno = NEXT_PGNO(hcp->pagep);
		if ((ret =
		    __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0)
			return (ret);
	}

	/*
	 * Check if we need to allocate a new page; if we do, remember it so
	 * we can schedule a table expansion (H_EXPAND) before returning.
	 */
	if (P_FREESPACE(hcp->pagep) < pairsize) {
		do_expand = 1;
		if ((ret = __ham_add_ovflpage(hashp,
		    hcp->pagep, 1, &hcp->pagep)) != 0)
			return (ret);
		hcp->pgno = PGNO(hcp->pagep);
	}

	/*
	 * Update cursor: the new pair will occupy the next free slot.
	 */
	hcp->bndx = H_NUMPAIRS(hcp->pagep);
	F_CLR(hcp, H_DELETED);
	/* Big key: push it offpage and insert an HOFFPAGE reference. */
	if (is_keybig) {
		if ((ret = __db_poff(hashp->dbp,
		    key, &koff.pgno, __ham_overflow_page)) != 0)
			return (ret);
		koff.type = H_OFFPAGE;
		koff.tlen = key->size;
		key_dbt.data = &koff;
		key_dbt.size = sizeof(koff);
		pkey = &key_dbt;
		key_type = H_OFFPAGE;
	} else {
		pkey = (DBT *)key;
		key_type = H_KEYDATA;
	}

	/* Likewise for big data items. */
	if (is_databig) {
		if ((ret = __db_poff(hashp->dbp,
		    val, &doff.pgno, __ham_overflow_page)) != 0)
			return (ret);
		doff.type = H_OFFPAGE;
		doff.tlen = val->size;
		data_dbt.data = &doff;
		data_dbt.size = sizeof(doff);
		pdata = &data_dbt;
		data_type = H_OFFPAGE;
	} else {
		pdata = (DBT *)val;
		data_type = type;
	}

	/*
	 * Write-ahead logging: the PUTPAIR record must reach the log
	 * before the page is modified; the page's LSN is then advanced.
	 */
	if (DB_LOGGING(hashp->dbp)) {
		rectype = PUTPAIR;
		if (is_databig)
			rectype |= PAIR_DATAMASK;
		if (is_keybig)
			rectype |= PAIR_KEYMASK;

		if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
		    hashp->dbp->log_fileid, PGNO(hcp->pagep),
		    (u_int32_t)H_NUMPAIRS(hcp->pagep),
		    &LSN(hcp->pagep), pkey, pdata)) != 0)
			return (ret);

		/* Move lsn onto page. */
		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
	}

	/* Key must go on before data: pairs are stored key-then-data. */
	__ham_putitem(hcp->pagep, pkey, key_type);
	__ham_putitem(hcp->pagep, pdata, data_type);

	/*
	 * For splits, we are going to update item_info's page number
	 * field, so that we can easily return to the same page the
	 * next time we come in here.  For other operations, this shouldn't
	 * matter, since odds are this is the last thing that happens before
	 * we return to the user program.
	 */
	hcp->pgno = PGNO(hcp->pagep);

	/*
	 * XXX Maybe keep incremental numbers here
	 * (nelem is only maintained when not locking; presumably it is
	 * unreliable under concurrent access -- NOTE(review): confirm.)
	 */
	if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
		hashp->hdr->nelem++;

	/* Trigger a split if we overflowed or exceeded the fill factor. */
	if (do_expand || (hashp->hdr->ffactor != 0 &&
	    (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor))
		F_SET(hcp, H_EXPAND);
	return (0);
}


/*
 * Special __putitem call used in splitting -- copies one entry to
 * another.  Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
 * H_DUPLICATE, H_OFFDUP).  Since we log splits at a high level, we
 * do not need to do any logging here.
 *
 * The caller is responsible for guaranteeing there is room on dest_page.
 *
 * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, int, PAGE *));
 */
void
__ham_copy_item(hashp, src_page, src_ndx, dest_page)
	HTAB *hashp;
	PAGE *src_page;
	int src_ndx;
	PAGE *dest_page;
{
	u_int32_t len;
	void *src, *dest;

	/*
	 * Copy the key and data entries onto this new page.
	 */
	src = P_ENTRY(src_page, src_ndx);

	/* Set up space on dest: data grows down from HOFFSET. */
	len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx);
	HOFFSET(dest_page) -= len;
	dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
	dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
	NUM_ENT(dest_page)++;

	memcpy(dest, src, len);
}

/*
 * Allocate a new overflow page and link it after pagep in the bucket
 * chain; *pp is set to the (pinned) new page.  If release is set, pagep
 * is put back to the memory pool (dirty) before returning.
 *
 * Returns:
 *	0 on success, an errno-style error otherwise.
 *
 * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
 */
int
__ham_add_ovflpage(hashp, pagep, release, pp)
	HTAB *hashp;
	PAGE *pagep;
	int release;
	PAGE **pp;
{
	DB_ENV *dbenv;
	DB_LSN new_lsn;
	PAGE *new_pagep;
	int ret;

	dbenv = hashp->dbp->dbenv;

	if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0)
		return (ret);

	/* Log the link before modifying either page. */
	if (DB_LOGGING(hashp->dbp)) {
		if ((ret = __ham_newpage_log(dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL,
		    hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep),
		    PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
			return (ret);

		/* Move lsn onto page.
 */
		LSN(pagep) = LSN(new_pagep) = new_lsn;
	}
	/* Link the new page at the end of the chain. */
	NEXT_PGNO(pagep) = PGNO(new_pagep);
	PREV_PGNO(new_pagep) = PGNO(pagep);

	if (release)
		ret = __ham_put_page(hashp->dbp, pagep, 1);

	hashp->hash_overflows++;
	*pp = new_pagep;
	return (ret);
}


/*
 * Create (or fetch) the page at the given address and initialize it as
 * an empty page of the given type; *pp is set to the pinned page.
 *
 * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
 */
int
__ham_new_page(hashp, addr, type, pp)
	HTAB *hashp;
	u_int32_t addr, type;
	PAGE **pp;
{
	PAGE *pagep;
	int ret;

	if ((ret = memp_fget(hashp->dbp->mpf,
	    &addr, DB_MPOOL_CREATE, &pagep)) != 0)
		return (ret);

#ifdef DEBUG_SLOW
	account_page(hashp, addr, 1);
#endif
	/* This should not be necessary because page-in should do it. */
	P_INIT(pagep,
	    hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);

	*pp = pagep;
	return (0);
}

/*
 * Free a page: log the operation, invalidate the page and push it onto
 * the free list anchored at hdr->last_freed.  Consumes the page pin in
 * all cases, including the error paths.
 *
 * PUBLIC: int __ham_del_page __P((DB *, PAGE *));
 */
int
__ham_del_page(dbp, pagep)
	DB *dbp;
	PAGE *pagep;
{
	DB_LSN new_lsn;
	HTAB *hashp;
	int ret;

	hashp = (HTAB *)dbp->internal;
	ret = 0;
	/* The free list lives in the meta-data page; lock/dirty it first. */
	DIRTY_META(hashp, ret);
	if (ret != 0) {
		if (ret != EAGAIN)
			__db_err(hashp->dbp->dbenv,
			    "free_ovflpage: unable to lock meta data page %s\n",
			    strerror(ret));
		/*
		 * If we are going to return an error, then we should free
		 * the page, so it doesn't stay pinned forever.
		 */
		(void)__ham_put_page(hashp->dbp, pagep, 0);
		return (ret);
	}

	/* Log the free before touching the page or the meta-data. */
	if (DB_LOGGING(hashp->dbp)) {
		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO,
		    hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed,
		    (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
		    &LSN(pagep), &hashp->hdr->lsn)) != 0)
			return (ret);

		hashp->hdr->lsn = new_lsn;
		LSN(pagep) = new_lsn;
	}

#ifdef DEBUG
	/*
	 * Trash the page body (preserving pgno/lsn) so stale pointers into
	 * freed pages fail fast in debug builds.
	 */
	{
		db_pgno_t __pgno;
		DB_LSN __lsn;
		__pgno = pagep->pgno;
		__lsn = pagep->lsn;
		memset(pagep, 0xff, dbp->pgsize);
		pagep->pgno = __pgno;
		pagep->lsn = __lsn;
	}
#endif
	/* Push onto the free list: NEXT_PGNO chains free pages together. */
	TYPE(pagep) = P_INVALID;
	NEXT_PGNO(pagep) = hashp->hdr->last_freed;
	hashp->hdr->last_freed = PGNO(pagep);

	return (__ham_put_page(hashp->dbp, pagep, 1));
}


/*
 * Return a page to the memory pool, marking it dirty if requested.
 *
 * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t));
 */
int
__ham_put_page(dbp, pagep, is_dirty)
	DB *dbp;
	PAGE *pagep;
	int32_t is_dirty;
{
#ifdef DEBUG_SLOW
	account_page((HTAB *)dbp->cookie,
	    ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
#endif
	return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
}

/*
 * __ham_dirty_page --
 *	Mark a page dirty.
 *
 * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *));
 */
int
__ham_dirty_page(hashp, pagep)
	HTAB *hashp;
	PAGE *pagep;
{
	return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY));
}

/*
 * Fetch (creating if necessary) the page at the given address from the
 * memory pool; *pagep is set to the pinned page.
 *
 * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
 */
int
__ham_get_page(dbp, addr, pagep)
	DB *dbp;
	db_pgno_t addr;
	PAGE **pagep;
{
	int ret;

	ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
#ifdef DEBUG_SLOW
	if (*pagep != NULL)
		account_page((HTAB *)dbp->internal, addr, 1);
#endif
	return (ret);
}

/*
 * Allocate a new page of the given type, either by popping the head of
 * the free list or, if the free list is empty, by allocating a fresh
 * page number at the current overflow split point.
 *
 * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
 */
int
__ham_overflow_page(dbp, type, pp)
	DB *dbp;
	u_int32_t type;
	PAGE **pp;
{
	DB_LSN *lsnp, new_lsn;
	HTAB *hashp;
	PAGE *p;
	db_pgno_t new_addr, next_free, newalloc_flag;
	u_int32_t offset, splitnum;
	int ret;

	hashp = (HTAB *)dbp->internal;

	ret = 0;
	DIRTY_META(hashp, ret);
	if (ret != 0)
		return (ret);

	/*
	 * This routine is split up into two parts.  First we have
	 * to figure out the address of the new page that we are
	 * allocating.  Then we have to log the allocation.  Only
	 * after the log do we get to complete allocation of the
	 * new page.
	 */
	new_addr = hashp->hdr->last_freed;
	if (new_addr != PGNO_INVALID) {
		/* Reuse the head of the free list. */
		if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0)
			return (ret);
		next_free = NEXT_PGNO(p);
		lsnp = &LSN(p);
		newalloc_flag = 0;
	} else {
		/*
		 * Nothing free: take the next page number reserved at the
		 * current overflow point (spares[] counts pages allocated
		 * per split point).
		 */
		splitnum = hashp->hdr->ovfl_point;
		hashp->hdr->spares[splitnum]++;
		offset = hashp->hdr->spares[splitnum] -
		    (splitnum ? hashp->hdr->spares[splitnum - 1] : 0);
		new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset);
		if (new_addr > MAX_PAGES(hashp)) {
			__db_err(hashp->dbp->dbenv, "hash: out of file pages");
			/* Undo the reservation before failing. */
			hashp->hdr->spares[splitnum]--;
			return (ENOMEM);
		}
		next_free = PGNO_INVALID;
		p = NULL;
		lsnp = NULL;
		newalloc_flag = 1;
	}

	/* Log the allocation before initializing the page. */
	if (DB_LOGGING(hashp->dbp)) {
		if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO,
		    hashp->dbp->log_fileid, new_addr, next_free,
		    0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0)
			return (ret);

		hashp->hdr->lsn = new_lsn;
		if (lsnp != NULL)
			*lsnp = new_lsn;
	}

	if (p != NULL) {
		/* We just took something off the free list, initialize it. */
		hashp->hdr->last_freed = next_free;
		P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID,
		    PGNO_INVALID, 0, (u_int8_t)type);
	} else {
		/* Get the new page. */
		if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0)
			return (ret);
	}
	if (DB_LOGGING(hashp->dbp))
		LSN(p) = new_lsn;

	*pp = p;
	return (0);
}

#ifdef DEBUG
/*
 * Debug helper: map a bucket number to its page number, accounting for
 * the spare (overflow) pages interleaved at each doubling.
 *
 * PUBLIC: #ifdef DEBUG
 * PUBLIC: int bucket_to_page __P((HTAB *, int));
 * PUBLIC: #endif
 */
int
bucket_to_page(hashp, n)
	HTAB *hashp;
	int n;
{
	int ret_val;

	ret_val = n + 1;
	if (n != 0)
		ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1];
	return (ret_val);
}
#endif


/*
 * Create a bunch of overflow pages at the current split point.
 * PUBLIC: void __ham_init_ovflpages __P((HTAB *));
 */
void
__ham_init_ovflpages(hp)
	HTAB *hp;
{
	DB_LSN new_lsn;
	PAGE *p;
	db_pgno_t last_pgno;
	u_int32_t i, numpages;

	/* One page per doubling so far, plus one. */
	numpages = hp->hdr->ovfl_point + 1;

	last_pgno = hp->hdr->last_freed;
	if (DB_LOGGING(hp->dbp)) {
		(void)__ham_ovfl_log(hp->dbp->dbenv->lg_info,
		    (DB_TXN *)hp->dbp->txn, &new_lsn, 0,
		    hp->dbp->log_fileid, PGNO_OF(hp, hp->hdr->ovfl_point, 1),
		    numpages, last_pgno, &hp->hdr->lsn);
		hp->hdr->lsn = new_lsn;
	} else
		ZERO_LSN(new_lsn);

	/*
	 * Reserve the page numbers, then chain the new pages onto the
	 * front of the free list (highest offset first).
	 */
	hp->hdr->spares[hp->hdr->ovfl_point] += numpages;
	for (i = numpages; i > 0; i--) {
		if (__ham_new_page(hp,
		    PGNO_OF(hp, hp->hdr->ovfl_point, i), P_INVALID, &p) != 0)
			break;
		LSN(p) = new_lsn;
		NEXT_PGNO(p) = last_pgno;
		last_pgno = PGNO(p);
		(void)__ham_put_page(hp->dbp, p, 1);
	}
	hp->hdr->last_freed = last_pgno;
}

/*
 * Make sure the cursor has its bucket locked and its current page (and
 * duplicate page, if any) pinned in memory.
 *
 * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
 */
int
__ham_get_cpage(hashp, hcp, mode)
	HTAB *hashp;
	HASH_CURSOR *hcp;
	db_lockmode_t mode;
{
	int ret;

	if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) &&
	    (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0)
		return (ret);

	if (hcp->pagep == NULL) {
		/* No page yet: start at the head page of the bucket. */
		if (hcp->pgno == PGNO_INVALID) {
			hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket);
			hcp->bndx = 0;
		}

		if ((ret =
		    __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0)
			return (ret);
	}

	if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
		if ((ret =
		    __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
			return (ret);
	return (0);
}

/*
 * Get a new page at the cursor, putting the last page if necessary.
 * If the flag is set to H_ISDUP, then we are talking about the
 * duplicate page, not the main page.
 *
 * PUBLIC: int __ham_next_cpage __P((HTAB *, HASH_CURSOR *, db_pgno_t,
 * PUBLIC:     int, int));
 */
int
__ham_next_cpage(hashp, hcp, pgno, dirty, flags)
	HTAB *hashp;
	HASH_CURSOR *hcp;
	db_pgno_t pgno;
	int dirty;
	int flags;
{
	PAGE *p;
	int ret;

	/* Release the page we currently hold (dup or main, per flags). */
	if (flags & H_ISDUP && hcp->dpagep != NULL &&
	    (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0)
		return (ret);
	else if (!(flags & H_ISDUP) && hcp->pagep != NULL &&
	    (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0)
		return (ret);

	if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0)
		return (ret);

	/* Reposition the cursor at slot 0 of the new page. */
	if (flags & H_ISDUP) {
		hcp->dpagep = p;
		hcp->dpgno = pgno;
		hcp->dndx = 0;
	} else {
		hcp->pagep = p;
		hcp->pgno = pgno;
		hcp->bndx = 0;
	}

	return (0);
}

/*
 * __ham_lock_bucket --
 *	Get the lock on a particular bucket.
 */
static int
__ham_lock_bucket(dbp, hcp, mode)
	DB *dbp;
	HASH_CURSOR *hcp;
	db_lockmode_t mode;
{
	int ret;

	/*
	 * What a way to trounce on the memory system.  It might be
	 * worth copying the lk_info into the hashp.
	 */
	ret = 0;
	dbp->lock.pgno = (db_pgno_t)(hcp->bucket);
	ret = lock_get(dbp->dbenv->lk_info,
	    dbp->txn == NULL ? dbp->locker : dbp->txn->txnid, 0,
	    &dbp->lock_dbt, mode, &hcp->lock);

	/*
	 * NOTE(review): negative lock_get returns (deadlock/conflict codes,
	 * presumably) are collapsed into EAGAIN here -- confirm against the
	 * lock subsystem's return conventions.
	 */
	return (ret < 0 ? EAGAIN : ret);
}

/*
 * __ham_dpair --
 *	Delete a pair on a page, paying no attention to what the pair
 *	represents.  The caller is responsible for freeing up duplicates
 *	or offpage entries that might be referenced by this pair.
 *
 * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
 */
void
__ham_dpair(dbp, p, pndx)
	DB *dbp;
	PAGE *p;
	u_int32_t pndx;
{
	db_indx_t delta, n;
	u_int8_t *dest, *src;

	/*
	 * Compute "delta", the amount we have to shift all of the
	 * offsets.  To find the delta, we just need to calculate
	 * the size of the pair of elements we are removing.
	 */
	delta = H_PAIRSIZE(p, dbp->pgsize, pndx);

	/*
	 * The hard case: we want to remove something other than
	 * the last item on the page.  We need to shift data and
	 * offsets down.
	 */
	if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) {
		/*
		 * Move the data: src is the first occupied byte on
		 * the page. (Length is delta.)
		 */
		src = (u_int8_t *)p + HOFFSET(p);

		/*
		 * Destination is delta bytes beyond src.  This might
		 * be an overlapping copy, so we have to use memmove.
		 */
		dest = src + delta;
		memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p));
	}

	/* Adjust the offsets: each surviving entry moved up by delta. */
	for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) {
		p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta;
		p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta;
	}

	/* Adjust page metadata: a pair is two entries (key and data). */
	HOFFSET(p) = HOFFSET(p) + delta;
	NUM_ENT(p) = NUM_ENT(p) - 2;
}

#ifdef DEBUG_SLOW
/*
 * Debug-only page-pin accounting: tracks which pages are checked out of
 * the memory pool and warns about pages held out for a long time.
 *
 * NOTE(review): several apparent defects in this debug code --
 * (1) no bounds check before writing list[last] when last == 100;
 * (2) the compaction loop copies list[j + 1] up to and including
 *     list[last], reading one element past the live entries;
 * (3) "list[i].times = inout" overwrites rather than accumulates the
 *     pin count, which looks like it was meant to be "+= inout".
 * Left as-is since it is compiled only under DEBUG_SLOW; confirm intent
 * before changing.
 */
static void
account_page(hashp, pgno, inout)
	HTAB *hashp;
	db_pgno_t pgno;
	int inout;
{
	static struct {
		db_pgno_t pgno;
		int times;
	} list[100];
	static int last;
	int i, j;

	if (inout == -1)			/* XXX: Kluge */
		inout = 0;

	/* Find page in list. */
	for (i = 0; i < last; i++)
		if (list[i].pgno == pgno)
			break;
	/* Not found. */
	if (i == last) {
		list[last].times = inout;
		list[last].pgno = pgno;
		last++;
	}
	list[i].times = inout;
	if (list[i].times == 0) {
		for (j = i; j < last; j++)
			list[j] = list[j + 1];
		last--;
	}
	for (i = 0; i < last; i++, list[i].times++)
		if (list[i].times > 20 && !is_bitmap_pgno(hashp, list[i].pgno))
			(void)fprintf(stderr,
			    "Warning: pg %lu has been out for %d times\n",
			    (u_long)list[i].pgno, list[i].times);
}
#endif /* DEBUG_SLOW */