diff options
Diffstat (limited to 'db/btree')
-rw-r--r-- | db/btree/bt_close.c | 182 | ||||
-rw-r--r-- | db/btree/bt_conv.c | 221 | ||||
-rw-r--r-- | db/btree/bt_debug.c | 329 | ||||
-rw-r--r-- | db/btree/bt_delete.c | 657 | ||||
-rw-r--r-- | db/btree/bt_get.c | 105 | ||||
-rw-r--r-- | db/btree/bt_open.c | 444 | ||||
-rw-r--r-- | db/btree/bt_overflow.c | 228 | ||||
-rw-r--r-- | db/btree/bt_page.c | 98 | ||||
-rw-r--r-- | db/btree/bt_put.c | 320 | ||||
-rw-r--r-- | db/btree/bt_search.c | 213 | ||||
-rw-r--r-- | db/btree/bt_seq.c | 460 | ||||
-rw-r--r-- | db/btree/bt_split.c | 827 | ||||
-rw-r--r-- | db/btree/bt_utils.c | 260 | ||||
-rw-r--r-- | db/btree/btree.h | 383 | ||||
-rw-r--r-- | db/btree/extern.h | 70 |
15 files changed, 4797 insertions, 0 deletions
diff --git a/db/btree/bt_close.c b/db/btree/bt_close.c new file mode 100644 index 0000000000..27f9ab660f --- /dev/null +++ b/db/btree/bt_close.c @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_close.c 8.7 (Berkeley) 8/17/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <db.h> +#include "btree.h" + +static int bt_meta __P((BTREE *)); + +/* + * BT_CLOSE -- Close a btree. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__bt_close(dbp) + DB *dbp; +{ + BTREE *t; + int fd; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Sync the tree. */ + if (__bt_sync(dbp, 0) == RET_ERROR) + return (RET_ERROR); + + /* Close the memory pool. */ + if (mpool_close(t->bt_mp) == RET_ERROR) + return (RET_ERROR); + + /* Free random memory. */ + if (t->bt_cursor.key.data != NULL) { + free(t->bt_cursor.key.data); + t->bt_cursor.key.size = 0; + t->bt_cursor.key.data = NULL; + } + if (t->bt_rkey.data) { + free(t->bt_rkey.data); + t->bt_rkey.size = 0; + t->bt_rkey.data = NULL; + } + if (t->bt_rdata.data) { + free(t->bt_rdata.data); + t->bt_rdata.size = 0; + t->bt_rdata.data = NULL; + } + + fd = t->bt_fd; + free(t); + free(dbp); + return (close(fd) ? RET_ERROR : RET_SUCCESS); +} + +/* + * BT_SYNC -- sync the btree to disk. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_sync(dbp, flags) + const DB *dbp; + u_int flags; +{ + BTREE *t; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Sync doesn't currently take any flags. */ + if (flags != 0) { + errno = EINVAL; + return (RET_ERROR); + } + + if (F_ISSET(t, B_INMEM | B_RDONLY) || !F_ISSET(t, B_MODIFIED)) + return (RET_SUCCESS); + + if (F_ISSET(t, B_METADIRTY) && bt_meta(t) == RET_ERROR) + return (RET_ERROR); + + if ((status = mpool_sync(t->bt_mp)) == RET_SUCCESS) + F_CLR(t, B_MODIFIED); + + return (status); +} + +/* + * BT_META -- write the tree meta data to disk. + * + * Parameters: + * t: tree + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_meta(t) + BTREE *t; +{ + BTMETA m; + void *p; + + if ((p = mpool_get(t->bt_mp, P_META, 0)) == NULL) + return (RET_ERROR); + + /* Fill in metadata. */ + m.magic = BTREEMAGIC; + m.version = BTREEVERSION; + m.psize = t->bt_psize; + m.free = t->bt_free; + m.nrecs = t->bt_nrecs; + m.flags = F_ISSET(t, SAVEMETA); + + memmove(p, &m, sizeof(BTMETA)); + mpool_put(t->bt_mp, p, MPOOL_DIRTY); + return (RET_SUCCESS); +} diff --git a/db/btree/bt_conv.c b/db/btree/bt_conv.c new file mode 100644 index 0000000000..1cb208b14d --- /dev/null +++ b/db/btree/bt_conv.c @@ -0,0 +1,221 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_conv.c 8.5 (Berkeley) 8/17/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> + +#include <stdio.h> + +#include <db.h> +#include "btree.h" + +static void mswap __P((PAGE *)); + +/* + * __BT_BPGIN, __BT_BPGOUT -- + * Convert host-specific number layout to/from the host-independent + * format stored on disk. + * + * Parameters: + * t: tree + * pg: page number + * h: page to convert + */ +void +__bt_pgin(t, pg, pp) + void *t; + pgno_t pg; + void *pp; +{ + PAGE *h; + indx_t i, top; + u_char flags; + char *p; + + if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) + return; + if (pg == P_META) { + mswap(pp); + return; + } + + h = pp; + M_32_SWAP(h->pgno); + M_32_SWAP(h->prevpg); + M_32_SWAP(h->nextpg); + M_32_SWAP(h->flags); + M_16_SWAP(h->lower); + M_16_SWAP(h->upper); + + top = NEXTINDEX(h); + if ((h->flags & P_TYPE) == P_BINTERNAL) + for (i = 0; i < top; i++) { + M_16_SWAP(h->linp[i]); + p = (char *)GETBINTERNAL(h, i); + P_32_SWAP(p); + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + if (*(u_char *)p & P_BIGKEY) { + p += sizeof(u_char); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + else if ((h->flags & P_TYPE) == P_BLEAF) + for (i = 0; i < top; i++) { + M_16_SWAP(h->linp[i]); + p = (char *)GETBLEAF(h, i); + P_32_SWAP(p); + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(u_int32_t); + flags = *(u_char *)p; + if (flags & (P_BIGKEY | P_BIGDATA)) { + p += sizeof(u_char); + if (flags & P_BIGKEY) { + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + if (flags & P_BIGDATA) { + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + } +} + +void +__bt_pgout(t, pg, pp) + void *t; + pgno_t pg; + void *pp; +{ + PAGE *h; + indx_t i, top; + u_char flags; + char *p; + + if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) + return; + if (pg == P_META) { + mswap(pp); + return; + } + + h = pp; + top = NEXTINDEX(h); + if ((h->flags & P_TYPE) == P_BINTERNAL) + for (i = 0; i < top; i++) { + p = (char *)GETBINTERNAL(h, i); + P_32_SWAP(p); + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + if (*(u_char *)p & P_BIGKEY) { + p += sizeof(u_char); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + M_16_SWAP(h->linp[i]); + } + else if ((h->flags & P_TYPE) == P_BLEAF) + for (i = 0; i < top; i++) { + p = (char *)GETBLEAF(h, i); + P_32_SWAP(p); + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(u_int32_t); + flags = *(u_char *)p; + if (flags & (P_BIGKEY | P_BIGDATA)) { + p += sizeof(u_char); + if (flags & P_BIGKEY) { + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + if (flags & P_BIGDATA) { + p += sizeof(u_int32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + M_16_SWAP(h->linp[i]); + } + + M_32_SWAP(h->pgno); + M_32_SWAP(h->prevpg); + M_32_SWAP(h->nextpg); + M_32_SWAP(h->flags); + M_16_SWAP(h->lower); + M_16_SWAP(h->upper); +} + +/* + * MSWAP -- Actually swap the bytes on the meta page. + * + * Parameters: + * p: page to convert + */ +static void +mswap(pg) + PAGE *pg; +{ + char *p; + + p = (char *)pg; + P_32_SWAP(p); /* magic */ + p += sizeof(u_int32_t); + P_32_SWAP(p); /* version */ + p += sizeof(u_int32_t); + P_32_SWAP(p); /* psize */ + p += sizeof(u_int32_t); + P_32_SWAP(p); /* free */ + p += sizeof(u_int32_t); + P_32_SWAP(p); /* nrecs */ + p += sizeof(u_int32_t); + P_32_SWAP(p); /* flags */ + p += sizeof(u_int32_t); +} diff --git a/db/btree/bt_debug.c b/db/btree/bt_debug.c new file mode 100644 index 0000000000..3aefbe7622 --- /dev/null +++ b/db/btree/bt_debug.c @@ -0,0 +1,329 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_debug.c 8.5 (Berkeley) 8/17/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +#ifdef DEBUG +/* + * BT_DUMP -- Dump the tree + * + * Parameters: + * dbp: pointer to the DB + */ +void +__bt_dump(dbp) + DB *dbp; +{ + BTREE *t; + PAGE *h; + pgno_t i; + char *sep; + + t = dbp->internal; + (void)fprintf(stderr, "%s: pgsz %d", + F_ISSET(t, B_INMEM) ? "memory" : "disk", t->bt_psize); + if (F_ISSET(t, R_RECNO)) + (void)fprintf(stderr, " keys %lu", t->bt_nrecs); +#undef X +#define X(flag, name) \ + if (F_ISSET(t, flag)) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + if (t->flags != 0) { + sep = " flags ("; + X(R_FIXLEN, "FIXLEN"); + X(B_INMEM, "INMEM"); + X(B_NODUPS, "NODUPS"); + X(B_RDONLY, "RDONLY"); + X(R_RECNO, "RECNO"); + X(B_METADIRTY,"METADIRTY"); + (void)fprintf(stderr, ")\n"); + } +#undef X + + for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { + __bt_dpage(h); + (void)mpool_put(t->bt_mp, h, 0); + } +} + +/* + * BT_DMPAGE -- Dump the meta page + * + * Parameters: + * h: pointer to the PAGE + */ +void +__bt_dmpage(h) + PAGE *h; +{ + BTMETA *m; + char *sep; + + m = (BTMETA *)h; + (void)fprintf(stderr, "magic %lx\n", m->magic); + (void)fprintf(stderr, "version %lu\n", m->version); + (void)fprintf(stderr, "psize %lu\n", m->psize); + (void)fprintf(stderr, "free %lu\n", m->free); + (void)fprintf(stderr, "nrecs %lu\n", m->nrecs); + (void)fprintf(stderr, "flags %lu", m->flags); +#undef X +#define X(flag, name) \ + if (m->flags & flag) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + if (m->flags) { + sep = " ("; + X(B_NODUPS, "NODUPS"); + X(R_RECNO, "RECNO"); + (void)fprintf(stderr, ")"); + } +} + +/* + * BT_DNPAGE -- Dump the page + * + * Parameters: + * n: page number to dump. + */ +void +__bt_dnpage(dbp, pgno) + DB *dbp; + pgno_t pgno; +{ + BTREE *t; + PAGE *h; + + t = dbp->internal; + if ((h = mpool_get(t->bt_mp, pgno, 0)) != NULL) { + __bt_dpage(h); + (void)mpool_put(t->bt_mp, h, 0); + } +} + +/* + * BT_DPAGE -- Dump the page + * + * Parameters: + * h: pointer to the PAGE + */ +void +__bt_dpage(h) + PAGE *h; +{ + BINTERNAL *bi; + BLEAF *bl; + RINTERNAL *ri; + RLEAF *rl; + indx_t cur, top; + char *sep; + + (void)fprintf(stderr, " page %d: (", h->pgno); +#undef X +#define X(flag, name) \ + if (h->flags & flag) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + sep = ""; + X(P_BINTERNAL, "BINTERNAL") /* types */ + X(P_BLEAF, "BLEAF") + X(P_RINTERNAL, "RINTERNAL") /* types */ + X(P_RLEAF, "RLEAF") + X(P_OVERFLOW, "OVERFLOW") + X(P_PRESERVE, "PRESERVE"); + (void)fprintf(stderr, ")\n"); +#undef X + + (void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg); + if (h->flags & P_OVERFLOW) + return; + + top = NEXTINDEX(h); + (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n", + h->lower, h->upper, top); + for (cur = 0; cur < top; cur++) { + (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]); + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + bi = GETBINTERNAL(h, cur); + (void)fprintf(stderr, + "size %03d pgno %03d", bi->ksize, bi->pgno); + if (bi->flags & P_BIGKEY) + (void)fprintf(stderr, " (indirect)"); + else if (bi->ksize) + (void)fprintf(stderr, + " {%.*s}", (int)bi->ksize, bi->bytes); + break; + case P_RINTERNAL: + ri = GETRINTERNAL(h, cur); + (void)fprintf(stderr, "entries %03d pgno %03d", + ri->nrecs, ri->pgno); + break; + case P_BLEAF: + bl = GETBLEAF(h, cur); + if (bl->flags & P_BIGKEY) + (void)fprintf(stderr, + "big key page %lu size %u/", + *(pgno_t *)bl->bytes, + *(u_int32_t *)(bl->bytes + sizeof(pgno_t))); + else if (bl->ksize) + (void)fprintf(stderr, "%s/", bl->bytes); + if (bl->flags & P_BIGDATA) + (void)fprintf(stderr, + "big data page %lu size %u", + *(pgno_t *)(bl->bytes + bl->ksize), + *(u_int32_t *)(bl->bytes + bl->ksize + + sizeof(pgno_t))); + else if (bl->dsize) + (void)fprintf(stderr, "%.*s", + (int)bl->dsize, bl->bytes + bl->ksize); + break; + case P_RLEAF: + rl = GETRLEAF(h, cur); + if (rl->flags & P_BIGDATA) + (void)fprintf(stderr, + "big data page %lu size %u", + *(pgno_t *)rl->bytes, + *(u_int32_t *)(rl->bytes + sizeof(pgno_t))); + else if (rl->dsize) + (void)fprintf(stderr, + "%.*s", (int)rl->dsize, rl->bytes); + break; + } + (void)fprintf(stderr, "\n"); + } +} +#endif + +#ifdef STATISTICS +/* + * BT_STAT -- Gather/print the tree statistics + * + * Parameters: + * dbp: pointer to the DB + */ +void +__bt_stat(dbp) + DB *dbp; +{ + extern u_long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit; + extern u_long bt_sortsplit, bt_split; + BTREE *t; + PAGE *h; + pgno_t i, pcont, pinternal, pleaf; + u_long ifree, lfree, nkeys; + int levels; + + t = dbp->internal; + pcont = pinternal = pleaf = 0; + nkeys = ifree = lfree = 0; + for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + case P_RINTERNAL: + ++pinternal; + ifree += h->upper - h->lower; + break; + case P_BLEAF: + case P_RLEAF: + ++pleaf; + lfree += h->upper - h->lower; + nkeys += NEXTINDEX(h); + break; + case P_OVERFLOW: + ++pcont; + break; + } + (void)mpool_put(t->bt_mp, h, 0); + } + + /* Count the levels of the tree. */ + for (i = P_ROOT, levels = 0 ;; ++levels) { + h = mpool_get(t->bt_mp, i, 0); + if (h->flags & (P_BLEAF|P_RLEAF)) { + if (levels == 0) + levels = 1; + (void)mpool_put(t->bt_mp, h, 0); + break; + } + i = F_ISSET(t, R_RECNO) ? + GETRINTERNAL(h, 0)->pgno : + GETBINTERNAL(h, 0)->pgno; + (void)mpool_put(t->bt_mp, h, 0); + } + + (void)fprintf(stderr, "%d level%s with %ld keys", + levels, levels == 1 ? "" : "s", nkeys); + if (F_ISSET(t, R_RECNO)) + (void)fprintf(stderr, " (%ld header count)", t->bt_nrecs); + (void)fprintf(stderr, + "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n", + pinternal + pleaf + pcont, pleaf, pinternal, pcont); + (void)fprintf(stderr, "%ld cache hits, %ld cache misses\n", + bt_cache_hit, bt_cache_miss); + (void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n", + bt_split, bt_rootsplit, bt_sortsplit); + pleaf *= t->bt_psize - BTDATAOFF; + if (pleaf) + (void)fprintf(stderr, + "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n", + ((double)(pleaf - lfree) / pleaf) * 100, + pleaf - lfree, lfree); + pinternal *= t->bt_psize - BTDATAOFF; + if (pinternal) + (void)fprintf(stderr, + "%.0f%% internal fill (%ld bytes used, %ld bytes free\n", + ((double)(pinternal - ifree) / pinternal) * 100, + pinternal - ifree, ifree); + if (bt_pfxsaved) + (void)fprintf(stderr, "prefix checking removed %lu bytes.\n", + bt_pfxsaved); +} +#endif diff --git a/db/btree/bt_delete.c b/db/btree/bt_delete.c new file mode 100644 index 0000000000..ece1ab656e --- /dev/null +++ b/db/btree/bt_delete.c @@ -0,0 +1,657 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +static int __bt_bdelete __P((BTREE *, const DBT *)); +static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int)); +static int __bt_pdelete __P((BTREE *, PAGE *)); +static int __bt_relink __P((BTREE *, PAGE *)); +static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *)); + +/* + * __bt_delete + * Delete the item(s) referenced by a key. + * + * Return RET_SPECIAL if the key is not found. + */ +int +__bt_delete(dbp, key, flags) + const DB *dbp; + const DBT *key; + u_int flags; +{ + BTREE *t; + CURSOR *c; + PAGE *h; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Check for change to a read-only tree. */ + if (F_ISSET(t, B_RDONLY)) { + errno = EPERM; + return (RET_ERROR); + } + + switch (flags) { + case 0: + status = __bt_bdelete(t, key); + break; + case R_CURSOR: + /* + * If flags is R_CURSOR, delete the cursor. Must already + * have started a scan and not have already deleted it. + */ + c = &t->bt_cursor; + if (F_ISSET(c, CURS_INIT)) { + if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) + return (RET_ERROR); + + /* + * If the page is about to be emptied, we'll need to + * delete it, which means we have to acquire a stack. + */ + if (NEXTINDEX(h) == 1) + if (__bt_stkacq(t, &h, &t->bt_cursor)) + return (RET_ERROR); + + status = __bt_dleaf(t, NULL, h, c->pg.index); + + if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + } else + mpool_put(t->bt_mp, + h, status == RET_SUCCESS ? MPOOL_DIRTY : 0); + break; + } + /* FALLTHROUGH */ + default: + errno = EINVAL; + return (RET_ERROR); + } + if (status == RET_SUCCESS) + F_SET(t, B_MODIFIED); + return (status); +} + +/* + * __bt_stkacq -- + * Acquire a stack so we can delete a cursor entry. + * + * Parameters: + * t: tree + * hp: pointer to current, pinned PAGE pointer + * c: pointer to the cursor + * + * Returns: + * 0 on success, 1 on failure + */ +static int +__bt_stkacq(t, hp, c) + BTREE *t; + PAGE **hp; + CURSOR *c; +{ + BINTERNAL *bi; + EPG *e; + EPGNO *parent; + PAGE *h; + indx_t index; + pgno_t pgno; + recno_t nextpg, prevpg; + int exact, level; + + /* + * Find the first occurrence of the key in the tree. Toss the + * currently locked page so we don't hit an already-locked page. + */ + h = *hp; + mpool_put(t->bt_mp, h, 0); + if ((e = __bt_search(t, &c->key, &exact)) == NULL) + return (1); + h = e->page; + + /* See if we got it in one shot. */ + if (h->pgno == c->pg.pgno) + goto ret; + + /* + * Move right, looking for the page. At each move we have to move + * up the stack until we don't have to move to the next page. If + * we have to change pages at an internal level, we have to fix the + * stack back up. + */ + while (h->pgno != c->pg.pgno) { + if ((nextpg = h->nextpg) == P_INVALID) + break; + mpool_put(t->bt_mp, h, 0); + + /* Move up the stack. */ + for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (1); + + /* Move to the next index. */ + if (parent->index != NEXTINDEX(h) - 1) { + index = parent->index + 1; + BT_PUSH(t, h->pgno, index); + break; + } + mpool_put(t->bt_mp, h, 0); + } + + /* Restore the stack. */ + while (level--) { + /* Push the next level down onto the stack. */ + bi = GETBINTERNAL(h, index); + pgno = bi->pgno; + BT_PUSH(t, pgno, 0); + + /* Lose the currently pinned page. */ + mpool_put(t->bt_mp, h, 0); + + /* Get the next level down. */ + if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) + return (1); + index = 0; + } + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) + return (1); + } + + if (h->pgno == c->pg.pgno) + goto ret; + + /* Reacquire the original stack. */ + mpool_put(t->bt_mp, h, 0); + if ((e = __bt_search(t, &c->key, &exact)) == NULL) + return (1); + h = e->page; + + /* + * Move left, looking for the page. At each move we have to move + * up the stack until we don't have to change pages to move to the + * next page. If we have to change pages at an internal level, we + * have to fix the stack back up. + */ + while (h->pgno != c->pg.pgno) { + if ((prevpg = h->prevpg) == P_INVALID) + break; + mpool_put(t->bt_mp, h, 0); + + /* Move up the stack. */ + for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (1); + + /* Move to the next index. */ + if (parent->index != 0) { + index = parent->index - 1; + BT_PUSH(t, h->pgno, index); + break; + } + mpool_put(t->bt_mp, h, 0); + } + + /* Restore the stack. */ + while (level--) { + /* Push the next level down onto the stack. */ + bi = GETBINTERNAL(h, index); + pgno = bi->pgno; + + /* Lose the currently pinned page. */ + mpool_put(t->bt_mp, h, 0); + + /* Get the next level down. */ + if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) + return (1); + + index = NEXTINDEX(h) - 1; + BT_PUSH(t, pgno, index); + } + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) + return (1); + } + + +ret: mpool_put(t->bt_mp, h, 0); + return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); +} + +/* + * __bt_bdelete -- + * Delete all key/data pairs matching the specified key. + * + * Parameters: + * t: tree + * key: key to delete + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +static int +__bt_bdelete(t, key) + BTREE *t; + const DBT *key; +{ + EPG *e; + PAGE *h; + int deleted, exact, redo; + + deleted = 0; + + /* Find any matching record; __bt_search pins the page. */ +loop: if ((e = __bt_search(t, key, &exact)) == NULL) + return (deleted ? RET_SUCCESS : RET_ERROR); + if (!exact) { + mpool_put(t->bt_mp, e->page, 0); + return (deleted ? RET_SUCCESS : RET_SPECIAL); + } + + /* + * Delete forward, then delete backward, from the found key. If + * there are duplicates and we reach either side of the page, do + * the key search again, so that we get them all. + */ + redo = 0; + h = e->page; + do { + if (__bt_dleaf(t, key, h, e->index)) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + if (F_ISSET(t, B_NODUPS)) { + if (NEXTINDEX(h) == 0) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + } else + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); + } + deleted = 1; + } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); + + /* Check for right-hand edge of the page. */ + if (e->index == NEXTINDEX(h)) + redo = 1; + + /* Delete from the key to the beginning of the page. */ + while (e->index-- > 0) { + if (__bt_cmp(t, key, e) != 0) + break; + if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + if (e->index == 0) + redo = 1; + } + + /* Check for an empty page. */ + if (NEXTINDEX(h) == 0) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + goto loop; + } + + /* Put the page. */ + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + if (redo) + goto loop; + return (RET_SUCCESS); +} + +/* + * __bt_pdelete -- + * Delete a single page from the tree. + * + * Parameters: + * t: tree + * h: leaf page + * + * Returns: + * RET_SUCCESS, RET_ERROR. + * + * Side-effects: + * mpool_put's the page + */ +static int +__bt_pdelete(t, h) + BTREE *t; + PAGE *h; +{ + BINTERNAL *bi; + PAGE *pg; + EPGNO *parent; + indx_t cnt, index, *ip, offset; + u_int32_t nksize; + char *from; + + /* + * Walk the parent page stack -- a LIFO stack of the pages that were + * traversed when we searched for the page where the delete occurred. + * Each stack entry is a page number and a page index offset. The + * offset is for the page traversed on the search. We've just deleted + * a page, so we have to delete the key from the parent page. + * + * If the delete from the parent page makes it empty, this process may + * continue all the way up the tree. We stop if we reach the root page + * (which is never deleted, it's just not worth the effort) or if the + * delete does not empty the page. + */ + while ((parent = BT_POP(t)) != NULL) { + /* Get the parent page. */ + if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (RET_ERROR); + + index = parent->index; + bi = GETBINTERNAL(pg, index); + + /* Free any overflow pages. */ + if (bi->flags & P_BIGKEY && + __ovfl_delete(t, bi->bytes) == RET_ERROR) { + mpool_put(t->bt_mp, pg, 0); + return (RET_ERROR); + } + + /* + * Free the parent if it has only the one key and it's not the + * root page. If it's the rootpage, turn it back into an empty + * leaf page. + */ + if (NEXTINDEX(pg) == 1) + if (pg->pgno == P_ROOT) { + pg->lower = BTDATAOFF; + pg->upper = t->bt_psize; + pg->flags = P_BLEAF; + } else { + if (__bt_relink(t, pg) || __bt_free(t, pg)) + return (RET_ERROR); + continue; + } + else { + /* Pack remaining key items at the end of the page. */ + nksize = NBINTERNAL(bi->ksize); + from = (char *)pg + pg->upper; + memmove(from + nksize, from, (char *)bi - from); + pg->upper += nksize; + + /* Adjust indices' offsets, shift the indices down. */ + offset = pg->linp[index]; + for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip) + if (ip[0] < offset) + ip[0] += nksize; + for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip) + ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; + pg->lower -= sizeof(indx_t); + } + + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + break; + } + + /* Free the leaf page, as long as it wasn't the root. */ + if (h->pgno == P_ROOT) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); + } + return (__bt_relink(t, h) || __bt_free(t, h)); +} + +/* + * __bt_dleaf -- + * Delete a single record from a leaf page. + * + * Parameters: + * t: tree + * key: referenced key + * h: page + * index: index on page to delete + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_dleaf(t, key, h, index) + BTREE *t; + const DBT *key; + PAGE *h; + u_int index; +{ + BLEAF *bl; + indx_t cnt, *ip, offset; + u_int32_t nbytes; + void *to; + char *from; + + /* If this record is referenced by the cursor, delete the cursor. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index && + __bt_curdel(t, key, h, index)) + return (RET_ERROR); + + /* If the entry uses overflow pages, make them available for reuse. */ + to = bl = GETBLEAF(h, index); + if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) + return (RET_ERROR); + if (bl->flags & P_BIGDATA && + __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) + return (RET_ERROR); + + /* Pack the remaining key/data items at the end of the page. */ + nbytes = NBLEAF(bl); + from = (char *)h + h->upper; + memmove(from + nbytes, from, (char *)to - from); + h->upper += nbytes; + + /* Adjust the indices' offsets, shift the indices down. */ + offset = h->linp[index]; + for (cnt = index, ip = &h->linp[0]; cnt--; ++ip) + if (ip[0] < offset) + ip[0] += nbytes; + for (cnt = NEXTINDEX(h) - index; --cnt; ++ip) + ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; + h->lower -= sizeof(indx_t); + + /* If the cursor is on this page, adjust it as necessary. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index) + --t->bt_cursor.pg.index; + + return (RET_SUCCESS); +} + +/* + * __bt_curdel -- + * Delete the cursor. + * + * Parameters: + * t: tree + * key: referenced key (or NULL) + * h: page + * index: index on page to delete + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +static int +__bt_curdel(t, key, h, index) + BTREE *t; + const DBT *key; + PAGE *h; + u_int index; +{ + CURSOR *c; + EPG e; + PAGE *pg; + int curcopy, status; + + /* + * If there are duplicates, move forward or backward to one. + * Otherwise, copy the key into the cursor area. + */ + c = &t->bt_cursor; + F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); + + curcopy = 0; + if (!F_ISSET(t, B_NODUPS)) { + /* + * We're going to have to do comparisons. If we weren't + * provided a copy of the key, i.e. the user is deleting + * the current cursor position, get one. + */ + if (key == NULL) { + e.page = h; + e.index = index; + if ((status = __bt_ret(t, &e, + &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) + return (status); + curcopy = 1; + key = &c->key; + } + /* Check previous key, if not at the beginning of the page. */ + if (index > 0) { + e.page = h; + e.index = index - 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_BEFORE); + goto dup2; + } + } + /* Check next key, if not at the end of the page. */ + if (index < NEXTINDEX(h) - 1) { + e.page = h; + e.index = index + 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_AFTER); + goto dup2; + } + } + /* Check previous key if at the beginning of the page. */ + if (index == 0 && h->prevpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (RET_ERROR); + e.page = pg; + e.index = NEXTINDEX(pg) - 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_BEFORE); + goto dup1; + } + mpool_put(t->bt_mp, pg, 0); + } + /* Check next key if at the end of the page. */ + if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (RET_ERROR); + e.page = pg; + e.index = 0; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_AFTER); +dup1: mpool_put(t->bt_mp, pg, 0); +dup2: c->pg.pgno = e.page->pgno; + c->pg.index = e.index; + return (RET_SUCCESS); + } + mpool_put(t->bt_mp, pg, 0); + } + } + e.page = h; + e.index = index; + if (curcopy || (status = + __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { + F_SET(c, CURS_ACQUIRE); + return (RET_SUCCESS); + } + return (status); +} + +/* + * __bt_relink -- + * Link around a deleted page. + * + * Parameters: + * t: tree + * h: page to be deleted + */ +static int +__bt_relink(t, h) + BTREE *t; + PAGE *h; +{ + PAGE *pg; + + if (h->nextpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (RET_ERROR); + pg->prevpg = h->prevpg; + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + } + if (h->prevpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (RET_ERROR); + pg->nextpg = h->nextpg; + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + } + return (0); +} diff --git a/db/btree/bt_get.c b/db/btree/bt_get.c new file mode 100644 index 0000000000..74824c73f4 --- /dev/null +++ b/db/btree/bt_get.c @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_get.c 8.6 (Berkeley) 7/20/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <errno.h> +#include <stddef.h> +#include <stdio.h> + +#include <db.h> +#include "btree.h" + +/* + * __BT_GET -- Get a record from the btree. + * + * Parameters: + * dbp: pointer to access method + * key: key to find + * data: data to return + * flag: currently unused + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +int +__bt_get(dbp, key, data, flags) + const DB *dbp; + const DBT *key; + DBT *data; + u_int flags; +{ + BTREE *t; + EPG *e; + int exact, status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Get currently doesn't take any flags. */ + if (flags) { + errno = EINVAL; + return (RET_ERROR); + } + + if ((e = __bt_search(t, key, &exact)) == NULL) + return (RET_ERROR); + if (!exact) { + mpool_put(t->bt_mp, e->page, 0); + return (RET_SPECIAL); + } + + status = __bt_ret(t, e, NULL, NULL, data, &t->bt_rdata, 0); + + /* + * If the user is doing concurrent access, we copied the + * key/data, toss the page. + */ + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e->page, 0); + else + t->bt_pinned = e->page; + return (status); +} diff --git a/db/btree/bt_open.c b/db/btree/bt_open.c new file mode 100644 index 0000000000..f052249777 --- /dev/null +++ b/db/btree/bt_open.c @@ -0,0 +1,444 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; +#endif /* LIBC_SCCS and not lint */ + +/* + * Implementation of btree access method for 4.4BSD. + * + * The design here was originally based on that of the btree access method + * used in the Postgres database system at UC Berkeley. This implementation + * is wholly independent of the Postgres code. + */ + +#include <sys/param.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <db.h> +#include "btree.h" + +#ifdef DEBUG +#undef MINPSIZE +#define MINPSIZE 128 +#endif + +static int byteorder __P((void)); +static int nroot __P((BTREE *)); +static int tmp __P((void)); + +/* + * __BT_OPEN -- Open a btree. + * + * Creates and fills a DB struct, and calls the routine that actually + * opens the btree. + * + * Parameters: + * fname: filename (NULL for in-memory trees) + * flags: open flag bits + * mode: open permission bits + * b: BTREEINFO pointer + * + * Returns: + * NULL on failure, pointer to DB on success. + * + */ +DB * +__bt_open(fname, flags, mode, openinfo, dflags) + const char *fname; + int flags, mode, dflags; + const BTREEINFO *openinfo; +{ + struct stat sb; + BTMETA m; + BTREE *t; + BTREEINFO b; + DB *dbp; + pgno_t ncache; + ssize_t nr; + int machine_lorder; + + t = NULL; + + /* + * Intention is to make sure all of the user's selections are okay + * here and then use them without checking. Can't be complete, since + * we don't know the right page size, lorder or flags until the backing + * file is opened. Also, the file's page size can cause the cachesize + * to change. + */ + machine_lorder = byteorder(); + if (openinfo) { + b = *openinfo; + + /* Flags: R_DUP. */ + if (b.flags & ~(R_DUP)) + goto einval; + + /* + * Page size must be indx_t aligned and >= MINPSIZE. Default + * page size is set farther on, based on the underlying file + * transfer size. + */ + if (b.psize && + (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || + b.psize & sizeof(indx_t) - 1)) + goto einval; + + /* Minimum number of keys per page; absolute minimum is 2. */ + if (b.minkeypage) { + if (b.minkeypage < 2) + goto einval; + } else + b.minkeypage = DEFMINKEYPAGE; + + /* If no comparison, use default comparison and prefix. */ + if (b.compare == NULL) { + b.compare = __bt_defcmp; + if (b.prefix == NULL) + b.prefix = __bt_defpfx; + } + + if (b.lorder == 0) + b.lorder = machine_lorder; + } else { + b.compare = __bt_defcmp; + b.cachesize = 0; + b.flags = 0; + b.lorder = machine_lorder; + b.minkeypage = DEFMINKEYPAGE; + b.prefix = __bt_defpfx; + b.psize = 0; + } + + /* Check for the ubiquitous PDP-11. */ + if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) + goto einval; + + /* Allocate and initialize DB and BTREE structures. */ + if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) + goto err; + memset(t, 0, sizeof(BTREE)); + t->bt_fd = -1; /* Don't close unopened fd on error. */ + t->bt_lorder = b.lorder; + t->bt_order = NOT; + t->bt_cmp = b.compare; + t->bt_pfx = b.prefix; + t->bt_rfd = -1; + + if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) + goto err; + memset(t->bt_dbp, 0, sizeof(DB)); + if (t->bt_lorder != machine_lorder) + F_SET(t, B_NEEDSWAP); + + dbp->type = DB_BTREE; + dbp->internal = t; + dbp->close = __bt_close; + dbp->del = __bt_delete; + dbp->fd = __bt_fd; + dbp->get = __bt_get; + dbp->put = __bt_put; + dbp->seq = __bt_seq; + dbp->sync = __bt_sync; + + /* + * If no file name was supplied, this is an in-memory btree and we + * open a backing temporary file. Otherwise, it's a disk-based tree. + */ + if (fname) { + switch (flags & O_ACCMODE) { + case O_RDONLY: + F_SET(t, B_RDONLY); + break; + case O_RDWR: + break; + case O_WRONLY: + default: + goto einval; + } + + if ((t->bt_fd = open(fname, flags, mode)) < 0) + goto err; + + } else { + if ((flags & O_ACCMODE) != O_RDWR) + goto einval; + if ((t->bt_fd = tmp()) == -1) + goto err; + F_SET(t, B_INMEM); + } + + if (fcntl(t->bt_fd, F_SETFD, 1) == -1) + goto err; + + if (fstat(t->bt_fd, &sb)) + goto err; + if (sb.st_size) { + if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) + goto err; + if (nr != sizeof(BTMETA)) + goto eftype; + + /* + * Read in the meta-data. This can change the notion of what + * the lorder, page size and flags are, and, when the page size + * changes, the cachesize value can change too. If the user + * specified the wrong byte order for an existing database, we + * don't bother to return an error, we just clear the NEEDSWAP + * bit. + */ + if (m.magic == BTREEMAGIC) + F_CLR(t, B_NEEDSWAP); + else { + F_SET(t, B_NEEDSWAP); + M_32_SWAP(m.magic); + M_32_SWAP(m.version); + M_32_SWAP(m.psize); + M_32_SWAP(m.free); + M_32_SWAP(m.nrecs); + M_32_SWAP(m.flags); + } + if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) + goto eftype; + if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || + m.psize & sizeof(indx_t) - 1) + goto eftype; + if (m.flags & ~SAVEMETA) + goto eftype; + b.psize = m.psize; + F_SET(t, m.flags); + t->bt_free = m.free; + t->bt_nrecs = m.nrecs; + } else { + /* + * Set the page size to the best value for I/O to this file. + * Don't overflow the page offset type. + */ + if (b.psize == 0) { + b.psize = sb.st_blksize; + if (b.psize < MINPSIZE) + b.psize = MINPSIZE; + if (b.psize > MAX_PAGE_OFFSET + 1) + b.psize = MAX_PAGE_OFFSET + 1; + } + + /* Set flag if duplicates permitted. */ + if (!(b.flags & R_DUP)) + F_SET(t, B_NODUPS); + + t->bt_free = P_INVALID; + t->bt_nrecs = 0; + F_SET(t, B_METADIRTY); + } + + t->bt_psize = b.psize; + + /* Set the cache size; must be a multiple of the page size. */ + if (b.cachesize && b.cachesize & b.psize - 1) + b.cachesize += (~b.cachesize & b.psize - 1) + 1; + if (b.cachesize < b.psize * MINCACHE) + b.cachesize = b.psize * MINCACHE; + + /* Calculate number of pages to cache. */ + ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; + + /* + * The btree data structure requires that at least two keys can fit on + * a page, but other than that there's no fixed requirement. The user + * specified a minimum number per page, and we translated that into the + * number of bytes a key/data pair can use before being placed on an + * overflow page. This calculation includes the page header, the size + * of the index referencing the leaf item and the size of the leaf item + * structure. Also, don't let the user specify a minkeypage such that + * a key/data pair won't fit even if both key and data are on overflow + * pages. + */ + t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - + (sizeof(indx_t) + NBLEAFDBT(0, 0)); + if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) + t->bt_ovflsize = + NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); + + /* Initialize the buffer pool. */ + if ((t->bt_mp = + mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) + goto err; + if (!F_ISSET(t, B_INMEM)) + mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); + + /* Create a root page if new tree. */ + if (nroot(t) == RET_ERROR) + goto err; + + /* Global flags. */ + if (dflags & DB_LOCK) + F_SET(t, B_DB_LOCK); + if (dflags & DB_SHMEM) + F_SET(t, B_DB_SHMEM); + if (dflags & DB_TXN) + F_SET(t, B_DB_TXN); + + return (dbp); + +einval: errno = EINVAL; + goto err; + +eftype: errno = EFTYPE; + goto err; + +err: if (t) { + if (t->bt_dbp) + free(t->bt_dbp); + if (t->bt_fd != -1) + (void)close(t->bt_fd); + free(t); + } + return (NULL); +} + +/* + * NROOT -- Create the root of a new tree. + * + * Parameters: + * t: tree + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +nroot(t) + BTREE *t; +{ + PAGE *meta, *root; + pgno_t npg; + + if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { + mpool_put(t->bt_mp, meta, 0); + return (RET_SUCCESS); + } + if (errno != EINVAL) /* It's OK to not exist. */ + return (RET_ERROR); + errno = 0; + + if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) + return (RET_ERROR); + + if ((root = mpool_new(t->bt_mp, &npg)) == NULL) + return (RET_ERROR); + + if (npg != P_ROOT) + return (RET_ERROR); + root->pgno = npg; + root->prevpg = root->nextpg = P_INVALID; + root->lower = BTDATAOFF; + root->upper = t->bt_psize; + root->flags = P_BLEAF; + memset(meta, 0, t->bt_psize); + mpool_put(t->bt_mp, meta, MPOOL_DIRTY); + mpool_put(t->bt_mp, root, MPOOL_DIRTY); + return (RET_SUCCESS); +} + +static int +tmp() +{ + sigset_t set, oset; + int fd; + char *envtmp; + char path[MAXPATHLEN]; + + envtmp = getenv("TMPDIR"); + (void)snprintf(path, + sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp"); + + (void)sigfillset(&set); + (void)sigprocmask(SIG_BLOCK, &set, &oset); + if ((fd = mkstemp(path)) != -1) + (void)unlink(path); + (void)sigprocmask(SIG_SETMASK, &oset, NULL); + return(fd); +} + +static int +byteorder() +{ + u_int32_t x; + u_char *p; + + x = 0x01020304; + p = (u_char *)&x; + switch (*p) { + case 1: + return (BIG_ENDIAN); + case 4: + return (LITTLE_ENDIAN); + default: + return (0); + } +} + +int +__bt_fd(dbp) + const DB *dbp; +{ + BTREE *t; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* In-memory database can't have a file descriptor. */ + if (F_ISSET(t, B_INMEM)) { + errno = ENOENT; + return (-1); + } + return (t->bt_fd); +} diff --git a/db/btree/bt_overflow.c b/db/btree/bt_overflow.c new file mode 100644 index 0000000000..b28b8e0471 --- /dev/null +++ b/db/btree/bt_overflow.c @@ -0,0 +1,228 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_overflow.c 8.5 (Berkeley) 7/16/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +/* + * Big key/data code. + * + * Big key and data entries are stored on linked lists of pages. The initial + * reference is byte string stored with the key or data and is the page number + * and size. The actual record is stored in a chain of pages linked by the + * nextpg field of the PAGE header. + * + * The first page of the chain has a special property. If the record is used + * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set + * in the header. + * + * XXX + * A single DBT is written to each chain, so a lot of space on the last page + * is wasted. This is a fairly major bug for some data sets. + */ + +/* + * __OVFL_GET -- Get an overflow key/data item. + * + * Parameters: + * t: tree + * p: pointer to { pgno_t, u_int32_t } + * buf: storage address + * bufsz: storage size + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_get(t, p, ssz, buf, bufsz) + BTREE *t; + void *p; + size_t *ssz; + void **buf; + size_t *bufsz; +{ + PAGE *h; + pgno_t pg; + size_t nb, plen; + u_int32_t sz; + + memmove(&pg, p, sizeof(pgno_t)); + memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t)); + *ssz = sz; + +#ifdef DEBUG + if (pg == P_INVALID || sz == 0) + abort(); +#endif + /* Make the buffer bigger as necessary. */ + if (*bufsz < sz) { + *buf = (char *)(*buf == NULL ? malloc(sz) : realloc(*buf, sz)); + if (*buf == NULL) + return (RET_ERROR); + *bufsz = sz; + } + + /* + * Step through the linked list of pages, copying the data on each one + * into the buffer. Never copy more than the data's length. + */ + plen = t->bt_psize - BTDATAOFF; + for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + nb = MIN(sz, plen); + memmove(p, (char *)h + BTDATAOFF, nb); + mpool_put(t->bt_mp, h, 0); + + if ((sz -= nb) == 0) + break; + } + return (RET_SUCCESS); +} + +/* + * __OVFL_PUT -- Store an overflow key/data item. + * + * Parameters: + * t: tree + * data: DBT to store + * pgno: storage page number + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_put(t, dbt, pg) + BTREE *t; + const DBT *dbt; + pgno_t *pg; +{ + PAGE *h, *last; + void *p; + pgno_t npg; + size_t nb, plen; + u_int32_t sz; + + /* + * Allocate pages and copy the key/data record into them. Store the + * number of the first page in the chain. + */ + plen = t->bt_psize - BTDATAOFF; + for (last = NULL, p = dbt->data, sz = dbt->size;; + p = (char *)p + plen, last = h) { + if ((h = __bt_new(t, &npg)) == NULL) + return (RET_ERROR); + + h->pgno = npg; + h->nextpg = h->prevpg = P_INVALID; + h->flags = P_OVERFLOW; + h->lower = h->upper = 0; + + nb = MIN(sz, plen); + memmove((char *)h + BTDATAOFF, p, nb); + + if (last) { + last->nextpg = h->pgno; + mpool_put(t->bt_mp, last, MPOOL_DIRTY); + } else + *pg = h->pgno; + + if ((sz -= nb) == 0) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + } + } + return (RET_SUCCESS); +} + +/* + * __OVFL_DELETE -- Delete an overflow chain. + * + * Parameters: + * t: tree + * p: pointer to { pgno_t, u_int32_t } + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_delete(t, p) + BTREE *t; + void *p; +{ + PAGE *h; + pgno_t pg; + size_t plen; + u_int32_t sz; + + memmove(&pg, p, sizeof(pgno_t)); + memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t)); + +#ifdef DEBUG + if (pg == P_INVALID || sz == 0) + abort(); +#endif + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Don't delete chains used by internal pages. */ + if (h->flags & P_PRESERVE) { + mpool_put(t->bt_mp, h, 0); + return (RET_SUCCESS); + } + + /* Step through the chain, calling the free routine for each page. */ + for (plen = t->bt_psize - BTDATAOFF;; sz -= plen) { + pg = h->nextpg; + __bt_free(t, h); + if (sz <= plen) + break; + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + } + return (RET_SUCCESS); +} diff --git a/db/btree/bt_page.c b/db/btree/bt_page.c new file mode 100644 index 0000000000..0d9d138d5c --- /dev/null +++ b/db/btree/bt_page.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_page.c 8.3 (Berkeley) 7/14/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <stdio.h> + +#include <db.h> +#include "btree.h" + +/* + * __bt_free -- + * Put a page on the freelist. + * + * Parameters: + * t: tree + * h: page to free + * + * Returns: + * RET_ERROR, RET_SUCCESS + * + * Side-effect: + * mpool_put's the page. + */ +int +__bt_free(t, h) + BTREE *t; + PAGE *h; +{ + /* Insert the page at the head of the free list. */ + h->prevpg = P_INVALID; + h->nextpg = t->bt_free; + t->bt_free = h->pgno; + + /* Make sure the page gets written back. */ + return (mpool_put(t->bt_mp, h, MPOOL_DIRTY)); +} + +/* + * __bt_new -- + * Get a new page, preferably from the freelist. + * + * Parameters: + * t: tree + * npg: storage for page number. + * + * Returns: + * Pointer to a page, NULL on error. + */ +PAGE * +__bt_new(t, npg) + BTREE *t; + pgno_t *npg; +{ + PAGE *h; + + if (t->bt_free != P_INVALID && + (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) { + *npg = t->bt_free; + t->bt_free = h->nextpg; + return (h); + } + return (mpool_new(t->bt_mp, npg)); +} diff --git a/db/btree/bt_put.c b/db/btree/bt_put.c new file mode 100644 index 0000000000..952be09e55 --- /dev/null +++ b/db/btree/bt_put.c @@ -0,0 +1,320 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_put.c 8.8 (Berkeley) 7/26/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +static EPG *bt_fast __P((BTREE *, const DBT *, const DBT *, int *)); + +/* + * __BT_PUT -- Add a btree item to the tree. + * + * Parameters: + * dbp: pointer to access method + * key: key + * data: data + * flag: R_NOOVERWRITE + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the + * tree and R_NOOVERWRITE specified. + */ +int +__bt_put(dbp, key, data, flags) + const DB *dbp; + DBT *key; + const DBT *data; + u_int flags; +{ + BTREE *t; + DBT tkey, tdata; + EPG *e; + PAGE *h; + indx_t index, nxtindex; + pgno_t pg; + u_int32_t nbytes; + int dflags, exact, status; + char *dest, db[NOVFLSIZE], kb[NOVFLSIZE]; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Check for change to a read-only tree. */ + if (F_ISSET(t, B_RDONLY)) { + errno = EPERM; + return (RET_ERROR); + } + + switch (flags) { + case 0: + case R_NOOVERWRITE: + break; + case R_CURSOR: + /* + * If flags is R_CURSOR, put the cursor. Must already + * have started a scan and not have already deleted it. + */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, + CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) + break; + /* FALLTHROUGH */ + default: + errno = EINVAL; + return (RET_ERROR); + } + + /* + * If the key/data pair won't fit on a page, store it on overflow + * pages. Only put the key on the overflow page if the pair are + * still too big after moving the data to an overflow page. + * + * XXX + * If the insert fails later on, the overflow pages aren't recovered. + */ + dflags = 0; + if (key->size + data->size > t->bt_ovflsize) { + if (key->size > t->bt_ovflsize) { +storekey: if (__ovfl_put(t, key, &pg) == RET_ERROR) + return (RET_ERROR); + tkey.data = kb; + tkey.size = NOVFLSIZE; + memmove(kb, &pg, sizeof(pgno_t)); + memmove(kb + sizeof(pgno_t), + &key->size, sizeof(u_int32_t)); + dflags |= P_BIGKEY; + key = &tkey; + } + if (key->size + data->size > t->bt_ovflsize) { + if (__ovfl_put(t, data, &pg) == RET_ERROR) + return (RET_ERROR); + tdata.data = db; + tdata.size = NOVFLSIZE; + memmove(db, &pg, sizeof(pgno_t)); + memmove(db + sizeof(pgno_t), + &data->size, sizeof(u_int32_t)); + dflags |= P_BIGDATA; + data = &tdata; + } + if (key->size + data->size > t->bt_ovflsize) + goto storekey; + } + + /* Replace the cursor. */ + if (flags == R_CURSOR) { + if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL) + return (RET_ERROR); + index = t->bt_cursor.pg.index; + goto delete; + } + + /* + * Find the key to delete, or, the location at which to insert. + * Bt_fast and __bt_search both pin the returned page. + */ + if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL) + if ((e = __bt_search(t, key, &exact)) == NULL) + return (RET_ERROR); + h = e->page; + index = e->index; + + /* + * Add the key/data pair to the tree. If an identical key is already + * in the tree, and R_NOOVERWRITE is set, an error is returned. If + * R_NOOVERWRITE is not set, the key is either added (if duplicates are + * permitted) or an error is returned. + */ + switch (flags) { + case R_NOOVERWRITE: + if (!exact) + break; + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + default: + if (!exact || !F_ISSET(t, B_NODUPS)) + break; + /* + * !!! + * Note, the delete may empty the page, so we need to put a + * new entry into the page immediately. + */ +delete: if (__bt_dleaf(t, key, h, index) == RET_ERROR) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + break; + } + + /* + * If not enough room, or the user has put a ceiling on the number of + * keys permitted in the page, split the page. The split code will + * insert the key and data and unpin the current page. If inserting + * into the offset array, shift the pointers up. + */ + nbytes = NBLEAFDBT(key->size, data->size); + if (h->upper - h->lower < nbytes + sizeof(indx_t)) { + if ((status = __bt_split(t, h, key, + data, dflags, nbytes, index)) != RET_SUCCESS) + return (status); + goto success; + } + + if (index < (nxtindex = NEXTINDEX(h))) + memmove(h->linp + index + 1, h->linp + index, + (nxtindex - index) * sizeof(indx_t)); + h->lower += sizeof(indx_t); + + h->linp[index] = h->upper -= nbytes; + dest = (char *)h + h->upper; + WR_BLEAF(dest, key, data, dflags); + + /* If the cursor is on this page, adjust it as necessary. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index >= index) + ++t->bt_cursor.pg.index; + + if (t->bt_order == NOT) + if (h->nextpg == P_INVALID) { + if (index == NEXTINDEX(h) - 1) { + t->bt_order = FORWARD; + t->bt_last.index = index; + t->bt_last.pgno = h->pgno; + } + } else if (h->prevpg == P_INVALID) { + if (index == 0) { + t->bt_order = BACK; + t->bt_last.index = 0; + t->bt_last.pgno = h->pgno; + } + } + + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + +success: + if (flags == R_SETCURSOR) + __bt_setcur(t, e->page->pgno, e->index); + + F_SET(t, B_MODIFIED); + return (RET_SUCCESS); +} + +#ifdef STATISTICS +u_long bt_cache_hit, bt_cache_miss; +#endif + +/* + * BT_FAST -- Do a quick check for sorted data. + * + * Parameters: + * t: tree + * key: key to insert + * + * Returns: + * EPG for new record or NULL if not found. + */ +static EPG * +bt_fast(t, key, data, exactp) + BTREE *t; + const DBT *key, *data; + int *exactp; +{ + PAGE *h; + u_int32_t nbytes; + int cmp; + + if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) { + t->bt_order = NOT; + return (NULL); + } + t->bt_cur.page = h; + t->bt_cur.index = t->bt_last.index; + + /* + * If won't fit in this page or have too many keys in this page, + * have to search to get split stack. + */ + nbytes = NBLEAFDBT(key->size, data->size); + if (h->upper - h->lower < nbytes + sizeof(indx_t)) + goto miss; + + if (t->bt_order == FORWARD) { + if (t->bt_cur.page->nextpg != P_INVALID) + goto miss; + if (t->bt_cur.index != NEXTINDEX(h) - 1) + goto miss; + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) < 0) + goto miss; + t->bt_last.index = cmp ? ++t->bt_cur.index : t->bt_cur.index; + } else { + if (t->bt_cur.page->prevpg != P_INVALID) + goto miss; + if (t->bt_cur.index != 0) + goto miss; + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) > 0) + goto miss; + t->bt_last.index = 0; + } + *exactp = cmp == 0; +#ifdef STATISTICS + ++bt_cache_hit; +#endif + return (&t->bt_cur); + +miss: +#ifdef STATISTICS + ++bt_cache_miss; +#endif + t->bt_order = NOT; + mpool_put(t->bt_mp, h, 0); + return (NULL); +} diff --git a/db/btree/bt_search.c b/db/btree/bt_search.c new file mode 100644 index 0000000000..485afcbbf0 --- /dev/null +++ b/db/btree/bt_search.c @@ -0,0 +1,213 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_search.c 8.8 (Berkeley) 7/31/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <stdio.h> + +#include <db.h> +#include "btree.h" + +static int __bt_snext __P((BTREE *, PAGE *, const DBT *, int *)); +static int __bt_sprev __P((BTREE *, PAGE *, const DBT *, int *)); + +/* + * __bt_search -- + * Search a btree for a key. + * + * Parameters: + * t: tree to search + * key: key to find + * exactp: pointer to exact match flag + * + * Returns: + * The EPG for matching record, if any, or the EPG for the location + * of the key, if it were inserted into the tree, is entered into + * the bt_cur field of the tree. A pointer to the field is returned. + */ +EPG * +__bt_search(t, key, exactp) + BTREE *t; + const DBT *key; + int *exactp; +{ + PAGE *h; + indx_t base, index, lim; + pgno_t pg; + int cmp; + + BT_CLR(t); + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (NULL); + + /* Do a binary search on the current page. */ + t->bt_cur.page = h; + for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) { + t->bt_cur.index = index = base + (lim >> 1); + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) { + if (h->flags & P_BLEAF) { + *exactp = 1; + return (&t->bt_cur); + } + goto next; + } + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * If it's a leaf page, we're almost done. If no duplicates + * are allowed, or we have an exact match, we're done. Else, + * it's possible that there were matching keys on this page, + * which later deleted, and we're on a page with no matches + * while there are matches on other pages. If at the start or + * end of a page, check the adjacent page. + */ + if (h->flags & P_BLEAF) { + if (!F_ISSET(t, B_NODUPS)) { + if (base == 0 && + h->prevpg != P_INVALID && + __bt_sprev(t, h, key, exactp)) + return (&t->bt_cur); + if (base == NEXTINDEX(h) && + h->nextpg != P_INVALID && + __bt_snext(t, h, key, exactp)) + return (&t->bt_cur); + } + *exactp = 0; + t->bt_cur.index = base; + return (&t->bt_cur); + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + 1 index. If it's non-zero, + * decrement by one, and record the internal page which should + * be a parent page for the key. If a split later occurs, the + * inserted page will be to the right of the saved page. + */ + index = base ? base - 1 : base; + +next: BT_PUSH(t, h->pgno, index); + pg = GETBINTERNAL(h, index)->pgno; + mpool_put(t->bt_mp, h, 0); + } +} + +/* + * __bt_snext -- + * Check for an exact match after the key. + * + * Parameters: + * t: tree + * h: current page + * key: key + * exactp: pointer to exact match flag + * + * Returns: + * If an exact match found. + */ +static int +__bt_snext(t, h, key, exactp) + BTREE *t; + PAGE *h; + const DBT *key; + int *exactp; +{ + EPG e; + + /* + * Get the next page. The key is either an exact + * match, or not as good as the one we already have. + */ + if ((e.page = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (0); + e.index = 0; + if (__bt_cmp(t, key, &e) == 0) { + mpool_put(t->bt_mp, h, 0); + t->bt_cur = e; + *exactp = 1; + return (1); + } + mpool_put(t->bt_mp, e.page, 0); + return (0); +} + +/* + * __bt_sprev -- + * Check for an exact match before the key. + * + * Parameters: + * t: tree + * h: current page + * key: key + * exactp: pointer to exact match flag + * + * Returns: + * If an exact match found. + */ +static int +__bt_sprev(t, h, key, exactp) + BTREE *t; + PAGE *h; + const DBT *key; + int *exactp; +{ + EPG e; + + /* + * Get the previous page. The key is either an exact + * match, or not as good as the one we already have. + */ + if ((e.page = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (0); + e.index = NEXTINDEX(e.page) - 1; + if (__bt_cmp(t, key, &e) == 0) { + mpool_put(t->bt_mp, h, 0); + t->bt_cur = e; + *exactp = 1; + return (1); + } + mpool_put(t->bt_mp, e.page, 0); + return (0); +} diff --git a/db/btree/bt_seq.c b/db/btree/bt_seq.c new file mode 100644 index 0000000000..303b481903 --- /dev/null +++ b/db/btree/bt_seq.c @@ -0,0 +1,460 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_seq.c 8.7 (Berkeley) 7/20/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <errno.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> + +#include <db.h> +#include "btree.h" + +static int __bt_first __P((BTREE *, const DBT *, EPG *, int *)); +static int __bt_seqadv __P((BTREE *, EPG *, int)); +static int __bt_seqset __P((BTREE *, EPG *, DBT *, int)); + +/* + * Sequential scan support. + * + * The tree can be scanned sequentially, starting from either end of the + * tree or from any specific key. A scan request before any scanning is + * done is initialized as starting from the least node. + */ + +/* + * __bt_seq -- + * Btree sequential scan interface. + * + * Parameters: + * dbp: pointer to access method + * key: key for positioning and return value + * data: data return value + * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +int +__bt_seq(dbp, key, data, flags) + const DB *dbp; + DBT *key, *data; + u_int flags; +{ + BTREE *t; + EPG e; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* + * If scan unitialized as yet, or starting at a specific record, set + * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin + * the page the cursor references if they're successful. + */ + switch (flags) { + case R_NEXT: + case R_PREV: + if (F_ISSET(&t->bt_cursor, CURS_INIT)) { + status = __bt_seqadv(t, &e, flags); + break; + } + /* FALLTHROUGH */ + case R_FIRST: + case R_LAST: + case R_CURSOR: + status = __bt_seqset(t, &e, key, flags); + break; + default: + errno = EINVAL; + return (RET_ERROR); + } + + if (status == RET_SUCCESS) { + __bt_setcur(t, e.page->pgno, e.index); + + status = + __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); + + /* + * If the user is doing concurrent access, we copied the + * key/data, toss the page. + */ + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e.page, 0); + else + t->bt_pinned = e.page; + } + return (status); +} + +/* + * __bt_seqset -- + * Set the sequential scan to a specific key. + * + * Parameters: + * t: tree + * ep: storage for returned key + * key: key for initial scan position + * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV + * + * Side effects: + * Pins the page the cursor references. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +static int +__bt_seqset(t, ep, key, flags) + BTREE *t; + EPG *ep; + DBT *key; + int flags; +{ + PAGE *h; + pgno_t pg; + int exact; + + /* + * Find the first, last or specific key in the tree and point the + * cursor at it. The cursor may not be moved until a new key has + * been found. + */ + switch (flags) { + case R_CURSOR: /* Keyed scan. */ + /* + * Find the first instance of the key or the smallest key + * which is greater than or equal to the specified key. + */ + if (key->data == NULL || key->size == 0) { + errno = EINVAL; + return (RET_ERROR); + } + return (__bt_first(t, key, ep, &exact)); + case R_FIRST: /* First record. */ + case R_NEXT: + /* Walk down the left-hand side of the tree. */ + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Check for an empty tree. */ + if (NEXTINDEX(h) == 0) { + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + } + + if (h->flags & (P_BLEAF | P_RLEAF)) + break; + pg = GETBINTERNAL(h, 0)->pgno; + mpool_put(t->bt_mp, h, 0); + } + ep->page = h; + ep->index = 0; + break; + case R_LAST: /* Last record. */ + case R_PREV: + /* Walk down the right-hand side of the tree. */ + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Check for an empty tree. */ + if (NEXTINDEX(h) == 0) { + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + } + + if (h->flags & (P_BLEAF | P_RLEAF)) + break; + pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; + mpool_put(t->bt_mp, h, 0); + } + + ep->page = h; + ep->index = NEXTINDEX(h) - 1; + break; + } + return (RET_SUCCESS); +} + +/* + * __bt_seqadvance -- + * Advance the sequential scan. + * + * Parameters: + * t: tree + * flags: R_NEXT, R_PREV + * + * Side effects: + * Pins the page the new key/data record is on. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +static int +__bt_seqadv(t, ep, flags) + BTREE *t; + EPG *ep; + int flags; +{ + CURSOR *c; + PAGE *h; + indx_t index; + pgno_t pg; + int exact; + + /* + * There are a couple of states that we can be in. The cursor has + * been initialized by the time we get here, but that's all we know. + */ + c = &t->bt_cursor; + + /* + * The cursor was deleted where there weren't any duplicate records, + * so the key was saved. Find out where that key would go in the + * current tree. It doesn't matter if the returned key is an exact + * match or not -- if it's an exact match, the record was added after + * the delete so we can just return it. If not, as long as there's + * a record there, return it. + */ + if (F_ISSET(c, CURS_ACQUIRE)) + return (__bt_first(t, &c->key, ep, &exact)); + + /* Get the page referenced by the cursor. */ + if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) + return (RET_ERROR); + + /* + * Find the next/previous record in the tree and point the cursor at + * it. The cursor may not be moved until a new key has been found. + */ + switch (flags) { + case R_NEXT: /* Next record. */ + /* + * The cursor was deleted in duplicate records, and moved + * forward to a record that has yet to be returned. Clear + * that flag, and return the record. + */ + if (F_ISSET(c, CURS_AFTER)) + goto usecurrent; + index = c->pg.index; + if (++index == NEXTINDEX(h)) { + pg = h->nextpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + index = 0; + } + break; + case R_PREV: /* Previous record. */ + /* + * The cursor was deleted in duplicate records, and moved + * backward to a record that has yet to be returned. Clear + * that flag, and return the record. + */ + if (F_ISSET(c, CURS_BEFORE)) { +usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); + ep->page = h; + ep->index = c->pg.index; + return (RET_SUCCESS); + } + index = c->pg.index; + if (index == 0) { + pg = h->prevpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + index = NEXTINDEX(h) - 1; + } else + --index; + break; + } + + ep->page = h; + ep->index = index; + return (RET_SUCCESS); +} + +/* + * __bt_first -- + * Find the first entry. + * + * Parameters: + * t: the tree + * key: the key + * erval: return EPG + * exactp: pointer to exact match flag + * + * Returns: + * The first entry in the tree greater than or equal to key, + * or RET_SPECIAL if no such key exists. + */ +static int +__bt_first(t, key, erval, exactp) + BTREE *t; + const DBT *key; + EPG *erval; + int *exactp; +{ + PAGE *h; + EPG *ep, save; + pgno_t pg; + + /* + * Find any matching record; __bt_search pins the page. + * + * If it's an exact match and duplicates are possible, walk backwards + * in the tree until we find the first one. Otherwise, make sure it's + * a valid key (__bt_search may return an index just past the end of a + * page) and return it. + */ + if ((ep = __bt_search(t, key, exactp)) == NULL) + return (NULL); + if (*exactp) { + if (F_ISSET(t, B_NODUPS)) { + *erval = *ep; + return (RET_SUCCESS); + } + + /* + * Walk backwards, as long as the entry matches and there are + * keys left in the tree. Save a copy of each match in case + * we go too far. + */ + save = *ep; + h = ep->page; + do { + if (save.page->pgno != ep->page->pgno) { + mpool_put(t->bt_mp, save.page, 0); + save = *ep; + } else + save.index = ep->index; + + /* + * Don't unpin the page the last (or original) match + * was on, but make sure it's unpinned if an error + * occurs. + */ + if (ep->index == 0) { + if (h->prevpg == P_INVALID) + break; + if (h->pgno != save.page->pgno) + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, + h->prevpg, 0)) == NULL) { + if (h->pgno == save.page->pgno) + mpool_put(t->bt_mp, + save.page, 0); + return (RET_ERROR); + } + ep->page = h; + ep->index = NEXTINDEX(h); + } + --ep->index; + } while (__bt_cmp(t, key, ep) == 0); + + /* + * Reach here with the last page that was looked at pinned, + * which may or may not be the same as the last (or original) + * match page. If it's not useful, release it. + */ + if (h->pgno != save.page->pgno) + mpool_put(t->bt_mp, h, 0); + + *erval = save; + return (RET_SUCCESS); + } + + /* If at the end of a page, find the next entry. */ + if (ep->index == NEXTINDEX(ep->page)) { + h = ep->page; + pg = h->nextpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + ep->index = 0; + ep->page = h; + } + *erval = *ep; + return (RET_SUCCESS); +} + +/* + * __bt_setcur -- + * Set the cursor to an entry in the tree. + * + * Parameters: + * t: the tree + * pgno: page number + * index: page index + */ +void +__bt_setcur(t, pgno, index) + BTREE *t; + pgno_t pgno; + u_int index; +{ + /* Lose any already deleted key. */ + if (t->bt_cursor.key.data != NULL) { + free(t->bt_cursor.key.data); + t->bt_cursor.key.size = 0; + t->bt_cursor.key.data = NULL; + } + F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); + + /* Update the cursor. */ + t->bt_cursor.pg.pgno = pgno; + t->bt_cursor.pg.index = index; + F_SET(&t->bt_cursor, CURS_INIT); +} diff --git a/db/btree/bt_split.c b/db/btree/bt_split.c new file mode 100644 index 0000000000..1646d82159 --- /dev/null +++ b/db/btree/bt_split.c @@ -0,0 +1,827 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/types.h> + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *)); +static PAGE *bt_page + __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); +static int bt_preserve __P((BTREE *, pgno_t)); +static PAGE *bt_psplit + __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t)); +static PAGE *bt_root + __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); +static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *)); +static recno_t rec_total __P((PAGE *)); + +#ifdef STATISTICS +u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; +#endif + +/* + * __BT_SPLIT -- Split the tree. + * + * Parameters: + * t: tree + * sp: page to split + * key: key to insert + * data: data to insert + * flags: BIGKEY/BIGDATA flags + * ilen: insert length + * skip: index to leave open + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__bt_split(t, sp, key, data, flags, ilen, argskip) + BTREE *t; + PAGE *sp; + const DBT *key, *data; + int flags; + size_t ilen; + u_int32_t argskip; +{ + BINTERNAL *bi; + BLEAF *bl, *tbl; + DBT a, b; + EPGNO *parent; + PAGE *h, *l, *r, *lchild, *rchild; + indx_t nxtindex; + u_int16_t skip; + u_int32_t n, nbytes, nksize; + int parentsplit; + char *dest; + + /* + * Split the page into two pages, l and r. The split routines return + * a pointer to the page into which the key should be inserted and with + * skip set to the offset which should be used. Additionally, l and r + * are pinned. + */ + skip = argskip; + h = sp->pgno == P_ROOT ? + bt_root(t, sp, &l, &r, &skip, ilen) : + bt_page(t, sp, &l, &r, &skip, ilen); + if (h == NULL) + return (RET_ERROR); + + /* + * Insert the new key/data pair into the leaf page. (Key inserts + * always cause a leaf page to split first.) + */ + h->linp[skip] = h->upper -= ilen; + dest = (char *)h + h->upper; + if (F_ISSET(t, R_RECNO)) + WR_RLEAF(dest, data, flags) + else + WR_BLEAF(dest, key, data, flags) + + /* If the root page was split, make it look right. */ + if (sp->pgno == P_ROOT && + (F_ISSET(t, R_RECNO) ? + bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) + goto err2; + + /* + * Now we walk the parent page stack -- a LIFO stack of the pages that + * were traversed when we searched for the page that split. Each stack + * entry is a page number and a page index offset. The offset is for + * the page traversed on the search. We've just split a page, so we + * have to insert a new key into the parent page. + * + * If the insert into the parent page causes it to split, may have to + * continue splitting all the way up the tree. We stop if the root + * splits or the page inserted into didn't have to split to hold the + * new key. Some algorithms replace the key for the old page as well + * as the new page. We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. + * + * There are a maximum of 5 pages pinned at any time. We keep the left + * and right pages pinned while working on the parent. The 5 are the + * two children, left parent and right parent (when the parent splits) + * and the root page or the overflow key page when calling bt_preserve. + * This code must make sure that all pins are released other than the + * root page or overflow page which is unlocked elsewhere. + */ + while ((parent = BT_POP(t)) != NULL) { + lchild = l; + rchild = r; + + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + goto err2; + + /* + * The new key goes ONE AFTER the index, because the split + * was to the right. + */ + skip = parent->index + 1; + + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack when inserting into BINTERNAL + * pages. Retain only what's needed to distinguish between + * the new entry and the LAST entry on the page to its left. + * If the keys compare equal, retain the entire key. Note, + * we don't touch overflow keys, and the entire key must be + * retained for the next-to-left most key on the leftmost + * page of each level, or the search will fail. Applicable + * ONLY to internal pages that have leaf pages as children. + * Further reduction of the key between pairs of internal + * pages loses too much information. + */ + switch (rchild->flags & P_TYPE) { + case P_BINTERNAL: + bi = GETBINTERNAL(rchild, 0); + nbytes = NBINTERNAL(bi->ksize); + break; + case P_BLEAF: + bl = GETBLEAF(rchild, 0); + nbytes = NBINTERNAL(bl->ksize); + if (t->bt_pfx && !(bl->flags & P_BIGKEY) && + (h->prevpg != P_INVALID || skip > 1)) { + tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); + a.size = tbl->ksize; + a.data = tbl->bytes; + b.size = bl->ksize; + b.data = bl->bytes; + nksize = t->bt_pfx(&a, &b); + n = NBINTERNAL(nksize); + if (n < nbytes) { +#ifdef STATISTICS + bt_pfxsaved += nbytes - n; +#endif + nbytes = n; + } else + nksize = 0; + } else + nksize = 0; + break; + case P_RINTERNAL: + case P_RLEAF: + nbytes = NRINTERNAL; + break; + default: + abort(); + } + + /* Split the parent page if necessary or shift the indices. */ + if (h->upper - h->lower < nbytes + sizeof(indx_t)) { + sp = h; + h = h->pgno == P_ROOT ? + bt_root(t, h, &l, &r, &skip, nbytes) : + bt_page(t, h, &l, &r, &skip, nbytes); + if (h == NULL) + goto err1; + parentsplit = 1; + } else { + if (skip < (nxtindex = NEXTINDEX(h))) + memmove(h->linp + skip + 1, h->linp + skip, + (nxtindex - skip) * sizeof(indx_t)); + h->lower += sizeof(indx_t); + parentsplit = 0; + } + + /* Insert the key into the parent page. */ + switch (rchild->flags & P_TYPE) { + case P_BINTERNAL: + h->linp[skip] = h->upper -= nbytes; + dest = (char *)h + h->linp[skip]; + memmove(dest, bi, nbytes); + ((BINTERNAL *)dest)->pgno = rchild->pgno; + break; + case P_BLEAF: + h->linp[skip] = h->upper -= nbytes; + dest = (char *)h + h->linp[skip]; + WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, + rchild->pgno, bl->flags & P_BIGKEY); + memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); + if (bl->flags & P_BIGKEY && + bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) + goto err1; + break; + case P_RINTERNAL: + /* + * Update the left page count. If split + * added at index 0, fix the correct page. + */ + if (skip > 0) + dest = (char *)h + h->linp[skip - 1]; + else + dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; + ((RINTERNAL *)dest)->nrecs = rec_total(lchild); + ((RINTERNAL *)dest)->pgno = lchild->pgno; + + /* Update the right page count. */ + h->linp[skip] = h->upper -= nbytes; + dest = (char *)h + h->linp[skip]; + ((RINTERNAL *)dest)->nrecs = rec_total(rchild); + ((RINTERNAL *)dest)->pgno = rchild->pgno; + break; + case P_RLEAF: + /* + * Update the left page count. If split + * added at index 0, fix the correct page. + */ + if (skip > 0) + dest = (char *)h + h->linp[skip - 1]; + else + dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; + ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); + ((RINTERNAL *)dest)->pgno = lchild->pgno; + + /* Update the right page count. */ + h->linp[skip] = h->upper -= nbytes; + dest = (char *)h + h->linp[skip]; + ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); + ((RINTERNAL *)dest)->pgno = rchild->pgno; + break; + default: + abort(); + } + + /* Unpin the held pages. */ + if (!parentsplit) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + } + + /* If the root page was split, make it look right. */ + if (sp->pgno == P_ROOT && + (F_ISSET(t, R_RECNO) ? + bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) + goto err1; + + mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); + mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); + } + + /* Unpin the held pages. */ + mpool_put(t->bt_mp, l, MPOOL_DIRTY); + mpool_put(t->bt_mp, r, MPOOL_DIRTY); + + /* Clear any pages left on the stack. */ + return (RET_SUCCESS); + + /* + * If something fails in the above loop we were already walking back + * up the tree and the tree is now inconsistent. Nothing much we can + * do about it but release any memory we're holding. + */ +err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); + mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); + +err2: mpool_put(t->bt_mp, l, 0); + mpool_put(t->bt_mp, r, 0); + __dbpanic(t->bt_dbp); + return (RET_ERROR); +} + +/* + * BT_PAGE -- Split a non-root page of a btree. + * + * Parameters: + * t: tree + * h: root page + * lp: pointer to left page pointer + * rp: pointer to right page pointer + * skip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert or NULL on error. + */ +static PAGE * +bt_page(t, h, lp, rp, skip, ilen) + BTREE *t; + PAGE *h, **lp, **rp; + indx_t *skip; + size_t ilen; +{ + PAGE *l, *r, *tp; + pgno_t npg; + +#ifdef STATISTICS + ++bt_split; +#endif + /* Put the new right page for the split into place. */ + if ((r = __bt_new(t, &npg)) == NULL) + return (NULL); + r->pgno = npg; + r->lower = BTDATAOFF; + r->upper = t->bt_psize; + r->nextpg = h->nextpg; + r->prevpg = h->pgno; + r->flags = h->flags & P_TYPE; + + /* + * If we're splitting the last page on a level because we're appending + * a key to it (skip is NEXTINDEX()), it's likely that the data is + * sorted. Adding an empty page on the side of the level is less work + * and can push the fill factor much higher than normal. If we're + * wrong it's no big deal, we'll just do the split the right way next + * time. It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Don't even try. + */ + if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { +#ifdef STATISTICS + ++bt_sortsplit; +#endif + h->nextpg = r->pgno; + r->lower = BTDATAOFF + sizeof(indx_t); + *skip = 0; + *lp = h; + *rp = r; + return (r); + } + + /* Put the new left page for the split into place. */ + if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) { + mpool_put(t->bt_mp, r, 0); + return (NULL); + } +#ifdef PURIFY + memset(l, 0xff, t->bt_psize); +#endif + l->pgno = h->pgno; + l->nextpg = r->pgno; + l->prevpg = h->prevpg; + l->lower = BTDATAOFF; + l->upper = t->bt_psize; + l->flags = h->flags & P_TYPE; + + /* Fix up the previous pointer of the page after the split page. */ + if (h->nextpg != P_INVALID) { + if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { + free(l); + /* XXX mpool_free(t->bt_mp, r->pgno); */ + return (NULL); + } + tp->prevpg = r->pgno; + mpool_put(t->bt_mp, tp, MPOOL_DIRTY); + } + + /* + * Split right. The key/data pairs aren't sorted in the btree page so + * it's simpler to copy the data from the split page onto two new pages + * instead of copying half the data to the right page and compacting + * the left page in place. Since the left page can't change, we have + * to swap the original and the allocated left page after the split. + */ + tp = bt_psplit(t, h, l, r, skip, ilen); + + /* Move the new left page onto the old left page. */ + memmove(h, l, t->bt_psize); + if (tp == l) + tp = h; + free(l); + + *lp = h; + *rp = r; + return (tp); +} + +/* + * BT_ROOT -- Split the root page of a btree. + * + * Parameters: + * t: tree + * h: root page + * lp: pointer to left page pointer + * rp: pointer to right page pointer + * skip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert or NULL on error. + */ +static PAGE * +bt_root(t, h, lp, rp, skip, ilen) + BTREE *t; + PAGE *h, **lp, **rp; + indx_t *skip; + size_t ilen; +{ + PAGE *l, *r, *tp; + pgno_t lnpg, rnpg; + +#ifdef STATISTICS + ++bt_split; + ++bt_rootsplit; +#endif + /* Put the new left and right pages for the split into place. */ + if ((l = __bt_new(t, &lnpg)) == NULL || + (r = __bt_new(t, &rnpg)) == NULL) + return (NULL); + l->pgno = lnpg; + r->pgno = rnpg; + l->nextpg = r->pgno; + r->prevpg = l->pgno; + l->prevpg = r->nextpg = P_INVALID; + l->lower = r->lower = BTDATAOFF; + l->upper = r->upper = t->bt_psize; + l->flags = r->flags = h->flags & P_TYPE; + + /* Split the root page. */ + tp = bt_psplit(t, h, l, r, skip, ilen); + + *lp = l; + *rp = r; + return (tp); +} + +/* + * BT_RROOT -- Fix up the recno root page after it has been split. + * + * Parameters: + * t: tree + * h: root page + * l: left page + * r: right page + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_rroot(t, h, l, r) + BTREE *t; + PAGE *h, *l, *r; +{ + char *dest; + + /* Insert the left and right keys, set the header information. */ + h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; + dest = (char *)h + h->upper; + WR_RINTERNAL(dest, + l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); + + h->linp[1] = h->upper -= NRINTERNAL; + dest = (char *)h + h->upper; + WR_RINTERNAL(dest, + r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); + + h->lower = BTDATAOFF + 2 * sizeof(indx_t); + + /* Unpin the root page, set to recno internal page. */ + h->flags &= ~P_TYPE; + h->flags |= P_RINTERNAL; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + return (RET_SUCCESS); +} + +/* + * BT_BROOT -- Fix up the btree root page after it has been split. + * + * Parameters: + * t: tree + * h: root page + * l: left page + * r: right page + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_broot(t, h, l, r) + BTREE *t; + PAGE *h, *l, *r; +{ + BINTERNAL *bi; + BLEAF *bl; + u_int32_t nbytes; + char *dest; + + /* + * If the root page was a leaf page, change it into an internal page. + * We copy the key we split on (but not the key's data, in the case of + * a leaf page) to the new root page. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + nbytes = NBINTERNAL(0); + h->linp[0] = h->upper = t->bt_psize - nbytes; + dest = (char *)h + h->upper; + WR_BINTERNAL(dest, 0, l->pgno, 0); + + switch (h->flags & P_TYPE) { + case P_BLEAF: + bl = GETBLEAF(r, 0); + nbytes = NBINTERNAL(bl->ksize); + h->linp[1] = h->upper -= nbytes; + dest = (char *)h + h->upper; + WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); + memmove(dest, bl->bytes, bl->ksize); + + /* + * If the key is on an overflow page, mark the overflow chain + * so it isn't deleted when the leaf copy of the key is deleted. + */ + if (bl->flags & P_BIGKEY && + bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) + return (RET_ERROR); + break; + case P_BINTERNAL: + bi = GETBINTERNAL(r, 0); + nbytes = NBINTERNAL(bi->ksize); + h->linp[1] = h->upper -= nbytes; + dest = (char *)h + h->upper; + memmove(dest, bi, nbytes); + ((BINTERNAL *)dest)->pgno = r->pgno; + break; + default: + abort(); + } + + /* There are two keys on the page. */ + h->lower = BTDATAOFF + 2 * sizeof(indx_t); + + /* Unpin the root page, set to btree internal page. */ + h->flags &= ~P_TYPE; + h->flags |= P_BINTERNAL; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + return (RET_SUCCESS); +} + +/* + * BT_PSPLIT -- Do the real work of splitting the page. + * + * Parameters: + * t: tree + * h: page to be split + * l: page to put lower half of data + * r: page to put upper half of data + * pskip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert. + */ +static PAGE * +bt_psplit(t, h, l, r, pskip, ilen) + BTREE *t; + PAGE *h, *l, *r; + indx_t *pskip; + size_t ilen; +{ + BINTERNAL *bi; + BLEAF *bl; + CURSOR *c; + RLEAF *rl; + PAGE *rval; + void *src; + indx_t full, half, nxt, off, skip, top, used; + u_int32_t nbytes; + int bigkeycnt, isbigkey; + + /* + * Split the data to the left and right pages. Leave the skip index + * open. Additionally, make some effort not to split on an overflow + * key. This makes internal page processing faster and can save + * space as overflow keys used by internal pages are never deleted. + */ + bigkeycnt = 0; + skip = *pskip; + full = t->bt_psize - BTDATAOFF; + half = full / 2; + used = 0; + for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { + if (skip == off) { + nbytes = ilen; + isbigkey = 0; /* XXX: not really known. */ + } else + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + src = bi = GETBINTERNAL(h, nxt); + nbytes = NBINTERNAL(bi->ksize); + isbigkey = bi->flags & P_BIGKEY; + break; + case P_BLEAF: + src = bl = GETBLEAF(h, nxt); + nbytes = NBLEAF(bl); + isbigkey = bl->flags & P_BIGKEY; + break; + case P_RINTERNAL: + src = GETRINTERNAL(h, nxt); + nbytes = NRINTERNAL; + isbigkey = 0; + break; + case P_RLEAF: + src = rl = GETRLEAF(h, nxt); + nbytes = NRLEAF(rl); + isbigkey = 0; + break; + default: + abort(); + } + + /* + * If the key/data pairs are substantial fractions of the max + * possible size for the page, it's possible to get situations + * where we decide to try and copy too much onto the left page. + * Make sure that doesn't happen. + */ + if (skip <= off && used + nbytes >= full) { + --off; + break; + } + + /* Copy the key/data pair, if not the skipped index. */ + if (skip != off) { + ++nxt; + + l->linp[off] = l->upper -= nbytes; + memmove((char *)l + l->upper, src, nbytes); + } + + used += nbytes; + if (used >= half) { + if (!isbigkey || bigkeycnt == 3) + break; + else + ++bigkeycnt; + } + } + + /* + * Off is the last offset that's valid for the left page. + * Nxt is the first offset to be placed on the right page. + */ + l->lower += (off + 1) * sizeof(indx_t); + + /* + * If splitting the page that the cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. If the + * cursor is at or past the skipped slot, the cursor is incremented by + * one. If the cursor is on the right page, it is decremented by the + * number of records split to the left page. + */ + c = &t->bt_cursor; + if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { + if (c->pg.index >= skip) + ++c->pg.index; + if (c->pg.index < nxt) /* Left page. */ + c->pg.pgno = l->pgno; + else { /* Right page. */ + c->pg.pgno = r->pgno; + c->pg.index -= nxt; + } + } + + /* + * If the skipped index was on the left page, just return that page. + * Otherwise, adjust the skip index to reflect the new position on + * the right page. + */ + if (skip <= off) { + skip = 0; + rval = l; + } else { + rval = r; + *pskip -= nxt; + } + + for (off = 0; nxt < top; ++off) { + if (skip == nxt) { + ++off; + skip = 0; + } + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + src = bi = GETBINTERNAL(h, nxt); + nbytes = NBINTERNAL(bi->ksize); + break; + case P_BLEAF: + src = bl = GETBLEAF(h, nxt); + nbytes = NBLEAF(bl); + break; + case P_RINTERNAL: + src = GETRINTERNAL(h, nxt); + nbytes = NRINTERNAL; + break; + case P_RLEAF: + src = rl = GETRLEAF(h, nxt); + nbytes = NRLEAF(rl); + break; + default: + abort(); + } + ++nxt; + r->linp[off] = r->upper -= nbytes; + memmove((char *)r + r->upper, src, nbytes); + } + r->lower += off * sizeof(indx_t); + + /* If the key is being appended to the page, adjust the index. */ + if (skip == top) + r->lower += sizeof(indx_t); + + return (rval); +} + +/* + * BT_PRESERVE -- Mark a chain of pages as used by an internal node. + * + * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the + * record that references them gets deleted. Chains pointed to by internal + * pages never get deleted. This routine marks a chain as pointed to by an + * internal page. + * + * Parameters: + * t: tree + * pg: page number of first page in the chain. + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +static int +bt_preserve(t, pg) + BTREE *t; + pgno_t pg; +{ + PAGE *h; + + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + h->flags |= P_PRESERVE; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); +} + +/* + * REC_TOTAL -- Return the number of recno entries below a page. + * + * Parameters: + * h: page + * + * Returns: + * The number of recno entries below a page. + * + * XXX + * These values could be set by the bt_psplit routine. The problem is that the + * entry has to be popped off of the stack etc. or the values have to be passed + * all the way back to bt_split/bt_rroot and it's not very clean. + */ +static recno_t +rec_total(h) + PAGE *h; +{ + recno_t recs; + indx_t nxt, top; + + for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) + recs += GETRINTERNAL(h, nxt)->nrecs; + return (recs); +} diff --git a/db/btree/bt_utils.c b/db/btree/bt_utils.c new file mode 100644 index 0000000000..9c1438eb84 --- /dev/null +++ b/db/btree/bt_utils.c @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)bt_utils.c 8.8 (Berkeley) 7/20/94"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <db.h> +#include "btree.h" + +/* + * __bt_ret -- + * Build return key/data pair. + * + * Parameters: + * t: tree + * e: key/data pair to be returned + * key: user's key structure (NULL if not to be filled in) + * rkey: memory area to hold key + * data: user's data structure (NULL if not to be filled in) + * rdata: memory area to hold data + * copy: always copy the key/data item + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_ret(t, e, key, rkey, data, rdata, copy) + BTREE *t; + EPG *e; + DBT *key, *rkey, *data, *rdata; + int copy; +{ + BLEAF *bl; + void *p; + + bl = GETBLEAF(e->page, e->index); + + /* + * We must copy big keys/data to make them contigous. Otherwise, + * leave the page pinned and don't copy unless the user specified + * concurrent access. + */ + if (key == NULL) + goto dataonly; + + if (bl->flags & P_BIGKEY) { + if (__ovfl_get(t, bl->bytes, + &key->size, &rkey->data, &rkey->size)) + return (RET_ERROR); + key->data = rkey->data; + } else if (copy || F_ISSET(t, B_DB_LOCK)) { + if (bl->ksize > rkey->size) { + p = (void *)(rkey->data == NULL ? + malloc(bl->ksize) : realloc(rkey->data, bl->ksize)); + if (p == NULL) + return (RET_ERROR); + rkey->data = p; + rkey->size = bl->ksize; + } + memmove(rkey->data, bl->bytes, bl->ksize); + key->size = bl->ksize; + key->data = rkey->data; + } else { + key->size = bl->ksize; + key->data = bl->bytes; + } + +dataonly: + if (data == NULL) + return (RET_SUCCESS); + + if (bl->flags & P_BIGDATA) { + if (__ovfl_get(t, bl->bytes + bl->ksize, + &data->size, &rdata->data, &rdata->size)) + return (RET_ERROR); + data->data = rdata->data; + } else if (copy || F_ISSET(t, B_DB_LOCK)) { + /* Use +1 in case the first record retrieved is 0 length. */ + if (bl->dsize + 1 > rdata->size) { + p = (void *)(rdata->data == NULL ? + malloc(bl->dsize + 1) : + realloc(rdata->data, bl->dsize + 1)); + if (p == NULL) + return (RET_ERROR); + rdata->data = p; + rdata->size = bl->dsize + 1; + } + memmove(rdata->data, bl->bytes + bl->ksize, bl->dsize); + data->size = bl->dsize; + data->data = rdata->data; + } else { + data->size = bl->dsize; + data->data = bl->bytes + bl->ksize; + } + + return (RET_SUCCESS); +} + +/* + * __BT_CMP -- Compare a key to a given record. + * + * Parameters: + * t: tree + * k1: DBT pointer of first arg to comparison + * e: pointer to EPG for comparison + * + * Returns: + * < 0 if k1 is < record + * = 0 if k1 is = record + * > 0 if k1 is > record + */ +int +__bt_cmp(t, k1, e) + BTREE *t; + const DBT *k1; + EPG *e; +{ + BINTERNAL *bi; + BLEAF *bl; + DBT k2; + PAGE *h; + void *bigkey; + + /* + * The left-most key on internal pages, at any level of the tree, is + * guaranteed by the following code to be less than any user key. + * This saves us from having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything we've yet seen. + */ + h = e->page; + if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF)) + return (1); + + bigkey = NULL; + if (h->flags & P_BLEAF) { + bl = GETBLEAF(h, e->index); + if (bl->flags & P_BIGKEY) + bigkey = bl->bytes; + else { + k2.data = bl->bytes; + k2.size = bl->ksize; + } + } else { + bi = GETBINTERNAL(h, e->index); + if (bi->flags & P_BIGKEY) + bigkey = bi->bytes; + else { + k2.data = bi->bytes; + k2.size = bi->ksize; + } + } + + if (bigkey) { + if (__ovfl_get(t, bigkey, + &k2.size, &t->bt_rdata.data, &t->bt_rdata.size)) + return (RET_ERROR); + k2.data = t->bt_rdata.data; + } + return ((*t->bt_cmp)(k1, &k2)); +} + +/* + * __BT_DEFCMP -- Default comparison routine. + * + * Parameters: + * a: DBT #1 + * b: DBT #2 + * + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + */ +int +__bt_defcmp(a, b) + const DBT *a, *b; +{ + register size_t len; + register u_char *p1, *p2; + + /* + * XXX + * If a size_t doesn't fit in an int, this routine can lose. + * What we need is a integral type which is guaranteed to be + * larger than a size_t, and there is no such thing. + */ + len = MIN(a->size, b->size); + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((int)*p1 - (int)*p2); + return ((int)a->size - (int)b->size); +} + +/* + * __BT_DEFPFX -- Default prefix routine. + * + * Parameters: + * a: DBT #1 + * b: DBT #2 + * + * Returns: + * Number of bytes needed to distinguish b from a. + */ +size_t +__bt_defpfx(a, b) + const DBT *a, *b; +{ + register u_char *p1, *p2; + register size_t cnt, len; + + cnt = 1; + len = MIN(a->size, b->size); + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* a->size must be <= b->size, or they wouldn't be in this order. */ + return (a->size < b->size ? a->size + 1 : a->size); +} diff --git a/db/btree/btree.h b/db/btree/btree.h new file mode 100644 index 0000000000..36d35c998b --- /dev/null +++ b/db/btree/btree.h @@ -0,0 +1,383 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)btree.h 8.11 (Berkeley) 8/17/94 + */ + +/* Macros to set/clear/test flags. */ +#define F_SET(p, f) (p)->flags |= (f) +#define F_CLR(p, f) (p)->flags &= ~(f) +#define F_ISSET(p, f) ((p)->flags & (f)) + +#include <mpool.h> + +#define DEFMINKEYPAGE (2) /* Minimum keys per page */ +#define MINCACHE (5) /* Minimum cached pages */ +#define MINPSIZE (512) /* Minimum page size */ + +/* + * Page 0 of a btree file contains a copy of the meta-data. This page is also + * used as an out-of-band page, i.e. page pointers that point to nowhere point + * to page 0. Page 1 is the root of the btree. + */ +#define P_INVALID 0 /* Invalid tree page number. */ +#define P_META 0 /* Tree metadata page number. */ +#define P_ROOT 1 /* Tree root page number. */ + +/* + * There are five page layouts in the btree: btree internal pages (BINTERNAL), + * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages + * (RLEAF) and overflow pages. All five page types have a page header (PAGE). + * This implementation requires that values within structures NOT be padded. + * (ANSI C permits random padding.) If your compiler pads randomly you'll have + * to do some work to get this package to run. + */ +typedef struct _page { + pgno_t pgno; /* this page's page number */ + pgno_t prevpg; /* left sibling */ + pgno_t nextpg; /* right sibling */ + +#define P_BINTERNAL 0x01 /* btree internal page */ +#define P_BLEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_RINTERNAL 0x08 /* recno internal page */ +#define P_RLEAF 0x10 /* leaf page */ +#define P_TYPE 0x1f /* type mask */ +#define P_PRESERVE 0x20 /* never delete this chain of pages */ + u_int32_t flags; + + indx_t lower; /* lower bound of free space on page */ + indx_t upper; /* upper bound of free space on page */ + indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */ +} PAGE; + +/* First and next index. */ +#define BTDATAOFF \ + (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \ + sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t)) +#define NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t)) + +/* + * For pages other than overflow pages, there is an array of offsets into the + * rest of the page immediately following the page header. Each offset is to + * an item which is unique to the type of page. The h_lower offset is just + * past the last filled-in index. The h_upper offset is the first item on the + * page. Offsets are from the beginning of the page. + * + * If an item is too big to store on a single page, a flag is set and the item + * is a { page, size } pair such that the page is the first page of an overflow + * chain with size bytes of item. Overflow pages are simply bytes without any + * external structure. + * + * The page number and size fields in the items are pgno_t-aligned so they can + * be manipulated without copying. (This presumes that 32 bit items can be + * manipulated on this system.) + */ +#define LALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1)) +#define NOVFLSIZE (sizeof(pgno_t) + sizeof(u_int32_t)) + +/* + * For the btree internal pages, the item is a key. BINTERNALs are {key, pgno} + * pairs, such that the key compares less than or equal to all of the records + * on that page. For a tree without duplicate keys, an internal page with two + * consecutive keys, a and b, will have all records greater than or equal to a + * and less than b stored on the page associated with a. Duplicate keys are + * somewhat special and can cause duplicate internal and leaf page records and + * some minor modifications of the above rule. + */ +typedef struct _binternal { + u_int32_t ksize; /* key size */ + pgno_t pgno; /* page number stored on */ +#define P_BIGDATA 0x01 /* overflow data */ +#define P_BIGKEY 0x02 /* overflow key */ + u_char flags; + char bytes[1]; /* data */ +} BINTERNAL; + +/* Get the page's BINTERNAL structure at index indx. */ +#define GETBINTERNAL(pg, indx) \ + ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define NBINTERNAL(len) \ + LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len)) + +/* Copy a BINTERNAL entry to the page. */ +#define WR_BINTERNAL(p, size, pgno, flags) { \ + *(u_int32_t *)p = size; \ + p += sizeof(u_int32_t); \ + *(pgno_t *)p = pgno; \ + p += sizeof(pgno_t); \ + *(u_char *)p = flags; \ + p += sizeof(u_char); \ +} + +/* + * For the recno internal pages, the item is a page number with the number of + * keys found on that page and below. + */ +typedef struct _rinternal { + recno_t nrecs; /* number of records */ + pgno_t pgno; /* page number stored below */ +} RINTERNAL; + +/* Get the page's RINTERNAL structure at index indx. */ +#define GETRINTERNAL(pg, indx) \ + ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define NRINTERNAL \ + LALIGN(sizeof(recno_t) + sizeof(pgno_t)) + +/* Copy a RINTERAL entry to the page. */ +#define WR_RINTERNAL(p, nrecs, pgno) { \ + *(recno_t *)p = nrecs; \ + p += sizeof(recno_t); \ + *(pgno_t *)p = pgno; \ +} + +/* For the btree leaf pages, the item is a key and data pair. */ +typedef struct _bleaf { + u_int32_t ksize; /* size of key */ + u_int32_t dsize; /* size of data */ + u_char flags; /* P_BIGDATA, P_BIGKEY */ + char bytes[1]; /* data */ +} BLEAF; + +/* Get the page's BLEAF structure at index indx. */ +#define GETBLEAF(pg, indx) \ + ((BLEAF *)((char *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize) + +/* Get the number of bytes in the user's key/data pair. */ +#define NBLEAFDBT(ksize, dsize) \ + LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) + \ + (ksize) + (dsize)) + +/* Copy a BLEAF entry to the page. */ +#define WR_BLEAF(p, key, data, flags) { \ + *(u_int32_t *)p = key->size; \ + p += sizeof(u_int32_t); \ + *(u_int32_t *)p = data->size; \ + p += sizeof(u_int32_t); \ + *(u_char *)p = flags; \ + p += sizeof(u_char); \ + memmove(p, key->data, key->size); \ + p += key->size; \ + memmove(p, data->data, data->size); \ +} + +/* For the recno leaf pages, the item is a data entry. */ +typedef struct _rleaf { + u_int32_t dsize; /* size of data */ + u_char flags; /* P_BIGDATA */ + char bytes[1]; +} RLEAF; + +/* Get the page's RLEAF structure at index indx. */ +#define GETRLEAF(pg, indx) \ + ((RLEAF *)((char *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define NRLEAF(p) NRLEAFDBT((p)->dsize) + +/* Get the number of bytes from the user's data. */ +#define NRLEAFDBT(dsize) \ + LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize)) + +/* Copy a RLEAF entry to the page. */ +#define WR_RLEAF(p, data, flags) { \ + *(u_int32_t *)p = data->size; \ + p += sizeof(u_int32_t); \ + *(u_char *)p = flags; \ + p += sizeof(u_char); \ + memmove(p, data->data, data->size); \ +} + +/* + * A record in the tree is either a pointer to a page and an index in the page + * or a page number and an index. These structures are used as a cursor, stack + * entry and search returns as well as to pass records to other routines. + * + * One comment about searches. Internal page searches must find the largest + * record less than key in the tree so that descents work. Leaf page searches + * must find the smallest record greater than key so that the returned index + * is the record's correct position for insertion. + */ +typedef struct _epgno { + pgno_t pgno; /* the page number */ + indx_t index; /* the index on the page */ +} EPGNO; + +typedef struct _epg { + PAGE *page; /* the (pinned) page */ + indx_t index; /* the index on the page */ +} EPG; + +/* + * About cursors. The cursor (and the page that contained the key/data pair + * that it referenced) can be deleted, which makes things a bit tricky. If + * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set + * or there simply aren't any duplicates of the key) we copy the key that it + * referenced when it's deleted, and reacquire a new cursor key if the cursor + * is used again. If there are duplicates keys, we move to the next/previous + * key, and set a flag so that we know what happened. NOTE: if duplicate (to + * the cursor) keys are added to the tree during this process, it is undefined + * if they will be returned or not in a cursor scan. + * + * The flags determine the possible states of the cursor: + * + * CURS_INIT The cursor references *something*. + * CURS_ACQUIRE The cursor was deleted, and a key has been saved so that + * we can reacquire the right position in the tree. + * CURS_AFTER, CURS_BEFORE + * The cursor was deleted, and now references a key/data pair + * that has not yet been returned, either before or after the + * deleted key/data pair. + * XXX + * This structure is broken out so that we can eventually offer multiple + * cursors as part of the DB interface. + */ +typedef struct _cursor { + EPGNO pg; /* B: Saved tree reference. */ + DBT key; /* B: Saved key, or key.data == NULL. */ + recno_t rcursor; /* R: recno cursor (1-based) */ + +#define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */ +#define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */ +#define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */ +#define CURS_INIT 0x08 /* RB: Cursor initialized. */ + u_int8_t flags; +} CURSOR; + +/* + * The metadata of the tree. The nrecs field is used only by the RECNO code. + * This is because the btree doesn't really need it and it requires that every + * put or delete call modify the metadata. + */ +typedef struct _btmeta { + u_int32_t magic; /* magic number */ + u_int32_t version; /* version */ + u_int32_t psize; /* page size */ + u_int32_t free; /* page number of first free page */ + u_int32_t nrecs; /* R: number of records */ + +#define SAVEMETA (B_NODUPS | R_RECNO) + u_int32_t flags; /* bt_flags & SAVEMETA */ +} BTMETA; + +/* The in-memory btree/recno data structure. */ +typedef struct _btree { + MPOOL *bt_mp; /* memory pool cookie */ + + DB *bt_dbp; /* pointer to enclosing DB */ + + EPG bt_cur; /* current (pinned) page */ + PAGE *bt_pinned; /* page pinned across calls */ + + CURSOR bt_cursor; /* cursor */ + +#define BT_PUSH(t, p, i) { \ + t->bt_sp->pgno = p; \ + t->bt_sp->index = i; \ + ++t->bt_sp; \ +} +#define BT_POP(t) (t->bt_sp == t->bt_stack ? NULL : --t->bt_sp) +#define BT_CLR(t) (t->bt_sp = t->bt_stack) + EPGNO bt_stack[50]; /* stack of parent pages */ + EPGNO *bt_sp; /* current stack pointer */ + + DBT bt_rkey; /* returned key */ + DBT bt_rdata; /* returned data */ + + int bt_fd; /* tree file descriptor */ + + pgno_t bt_free; /* next free page */ + u_int32_t bt_psize; /* page size */ + indx_t bt_ovflsize; /* cut-off for key/data overflow */ + int bt_lorder; /* byte order */ + /* sorted order */ + enum { NOT, BACK, FORWARD } bt_order; + EPGNO bt_last; /* last insert */ + + /* B: key comparison function */ + int (*bt_cmp) __P((const DBT *, const DBT *)); + /* B: prefix comparison function */ + size_t (*bt_pfx) __P((const DBT *, const DBT *)); + /* R: recno input function */ + int (*bt_irec) __P((struct _btree *, recno_t)); + + FILE *bt_rfp; /* R: record FILE pointer */ + int bt_rfd; /* R: record file descriptor */ + + caddr_t bt_cmap; /* R: current point in mapped space */ + caddr_t bt_smap; /* R: start of mapped space */ + caddr_t bt_emap; /* R: end of mapped space */ + size_t bt_msize; /* R: size of mapped region. */ + + recno_t bt_nrecs; /* R: number of records */ + size_t bt_reclen; /* R: fixed record length */ + u_char bt_bval; /* R: delimiting byte/pad character */ + +/* + * NB: + * B_NODUPS and R_RECNO are stored on disk, and may not be changed. + */ +#define B_INMEM 0x00001 /* in-memory tree */ +#define B_METADIRTY 0x00002 /* need to write metadata */ +#define B_MODIFIED 0x00004 /* tree modified */ +#define B_NEEDSWAP 0x00008 /* if byte order requires swapping */ +#define B_RDONLY 0x00010 /* read-only tree */ + +#define B_NODUPS 0x00020 /* no duplicate keys permitted */ +#define R_RECNO 0x00080 /* record oriented tree */ + +#define R_CLOSEFP 0x00040 /* opened a file pointer */ +#define R_EOF 0x00100 /* end of input file reached. */ +#define R_FIXLEN 0x00200 /* fixed length records */ +#define R_MEMMAPPED 0x00400 /* memory mapped file. */ +#define R_INMEM 0x00800 /* in-memory file */ +#define R_MODIFIED 0x01000 /* modified file */ +#define R_RDONLY 0x02000 /* read-only file */ + +#define B_DB_LOCK 0x04000 /* DB_LOCK specified. */ +#define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */ +#define B_DB_TXN 0x10000 /* DB_TXN specified. */ + u_int32_t flags; +} BTREE; + +#include "extern.h" diff --git a/db/btree/extern.h b/db/btree/extern.h new file mode 100644 index 0000000000..ebd9c54923 --- /dev/null +++ b/db/btree/extern.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)extern.h 8.10 (Berkeley) 7/20/94 + */ + +int __bt_close __P((DB *)); +int __bt_cmp __P((BTREE *, const DBT *, EPG *)); +int __bt_crsrdel __P((BTREE *, EPGNO *)); +int __bt_defcmp __P((const DBT *, const DBT *)); +size_t __bt_defpfx __P((const DBT *, const DBT *)); +int __bt_delete __P((const DB *, const DBT *, u_int)); +int __bt_dleaf __P((BTREE *, const DBT *, PAGE *, u_int)); +int __bt_fd __P((const DB *)); +int __bt_free __P((BTREE *, PAGE *)); +int __bt_get __P((const DB *, const DBT *, DBT *, u_int)); +PAGE *__bt_new __P((BTREE *, pgno_t *)); +void __bt_pgin __P((void *, pgno_t, void *)); +void __bt_pgout __P((void *, pgno_t, void *)); +int __bt_push __P((BTREE *, pgno_t, int)); +int __bt_put __P((const DB *dbp, DBT *, const DBT *, u_int)); +int __bt_ret __P((BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int)); +EPG *__bt_search __P((BTREE *, const DBT *, int *)); +int __bt_seq __P((const DB *, DBT *, DBT *, u_int)); +void __bt_setcur __P((BTREE *, pgno_t, u_int)); +int __bt_split __P((BTREE *, PAGE *, + const DBT *, const DBT *, int, size_t, u_int32_t)); +int __bt_sync __P((const DB *, u_int)); + +int __ovfl_delete __P((BTREE *, void *)); +int __ovfl_get __P((BTREE *, void *, size_t *, void **, size_t *)); +int __ovfl_put __P((BTREE *, const DBT *, pgno_t *)); + +#ifdef DEBUG +void __bt_dnpage __P((DB *, pgno_t)); +void __bt_dpage __P((PAGE *)); +void __bt_dump __P((DB *)); +#endif +#ifdef STATISTICS +void __bt_stat __P((DB *)); +#endif |