diff options
author | Ulrich Drepper <drepper@redhat.com> | 1997-08-27 20:26:10 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 1997-08-27 20:26:10 +0000 |
commit | 92f1da4da04a7a86ddee91be5eaf0b10c333ac64 (patch) | |
tree | 2a10ce9e4e407e7e5b5ca092ca0947d234b5ff60 /db2 | |
parent | 22be878ecbc66606371bd33258f56e6711e6ba7a (diff) | |
download | glibc-92f1da4da04a7a86ddee91be5eaf0b10c333ac64.tar.gz glibc-92f1da4da04a7a86ddee91be5eaf0b10c333ac64.tar.xz glibc-92f1da4da04a7a86ddee91be5eaf0b10c333ac64.zip |
Update. cvs/libc-ud-970827
1997-08-10 19:17 Philip Blundell <Philip.Blundell@pobox.com> * nss/nss_db/db-XXX.c: Include <db_185.h> not <db.h>. Somebody should update this to use the new db API. * nss/nss_db/db-netgrp.c: Likewise. * nss/nss_db/db-alias.c: Likewise. * db2/Makefile: Makefile for db-2.x in glibc. 1997-08-27 21:20 Ulrich Drepper <drepper@cygnus.com> * csu/Makefile (before-compile): New goal. Make sure abi-tag.h is generated. [$(elf)=yes] (asm-CPPFLAGS): Make sure abi-tag.h file can be found. * Makeconfig [$(build-omitfp)=yes] (CFLAGS-.o): Add -D__USE_STRING_INLINES. * string/string.h: Move strnlen optimization after inclusion of <bits/string.h>. Include <bits/string.h> only if __USE_STRING_INLINES is defined. * sysdeps/generic/memcpy.c: Undef memcpy to allow macro of this name in <bits/string.h>. * sysdeps/generic/memset.c: Likewise. * sysdeps/i386/string.h: i386 optimized string functions. * sysdeps/i386/i486string.h: i486+ optimized string functions. * Makefile (subdirs): Change db to db2. * shlib-versions: Bump libdb version number to 3. * include/db.h: Include from db2 directory. * include/db_185.h: New file. * sysdeps/i386/Makefile [$(subdirs)=db2] (CPPFLAGS): Add macros to provide spinlock information for db2. * sysdeps/m68k/m68020/Makefile: New file. Likewise. * sysdeps/sparc/Makefile: New file. Likewise. * sysdeps/unix/sysv/linux/Makefile [$(subdirs)=db2] (CPPFLAGS): Add -DHAVE_LLSEEK. * db2/config.h: Hand-edited config file for db2 in glibc. * db2/compat.h: New file from db-2.3.4. * db2/db.h: Likewise. * db2/db_185.h: Likewise. * db2/db_int.h: Likewise. * db2/makedb.c: Likewise. * db2/btree/bt_close.c: Likewise. * db2/btree/bt_compare.c: Likewise. * db2/btree/bt_conv.c: Likewise. * db2/btree/bt_cursor.c: Likewise. * db2/btree/bt_delete.c: Likewise. * db2/btree/bt_open.c: Likewise. * db2/btree/bt_page.c: Likewise. * db2/btree/bt_put.c: Likewise. * db2/btree/bt_rec.c: Likewise. * db2/btree/bt_recno.c: Likewise. * db2/btree/btree_auto.c: Likewise. 
* db2/btree/bt_rsearch.c: Likewise. * db2/btree/bt_search.c: Likewise. * db2/btree/bt_split.c: Likewise. * db2/btree/bt_stat.c: Likewise. * db2/btree/btree.src: Likewise. * db2/common/db_appinit.c: Likewise. * db2/common/db_err.c: Likewise. * db2/common/db_byteorder.c: Likewise. * db2/common/db_apprec.c: Likewise. * db2/common/db_salloc.c: Likewise. * db2/common/db_log2.c: Likewise. * db2/common/db_region.c: Likewise. * db2/common/db_shash.c: Likewise. * db2/db/db.c: Likewise. * db2/db/db.src: Likewise. * db2/db/db_conv.c: Likewise. * db2/db/db_dispatch.c: Likewise. * db2/db/db_dup.c: Likewise. * db2/db/db_overflow.c: Likewise. * db2/db/db_pr.c: Likewise. * db2/db/db_rec.c: Likewise. * db2/db/db_ret.c: Likewise. * db2/db/db_thread.c: Likewise. * db2/db/db_auto.c: Likewise. * db2/db185/db185.c: Likewise. * db2/db185/db185_int.h: Likewise. * db2/dbm/dbm.c: Likewise. * db2/hash/hash.c: Likewise. * db2/hash/hash.src: Likewise. * db2/hash/hash_page.c: Likewise. * db2/hash/hash_conv.c: Likewise. * db2/hash/hash_debug.c: Likewise. * db2/hash/hash_stat.c: Likewise. * db2/hash/hash_rec.c: Likewise. * db2/hash/hash_dup.c: Likewise. * db2/hash/hash_func.c: Likewise. * db2/hash/hash_auto.c: Likewise. * db2/include/mp.h: Likewise. * db2/include/btree.h: Likewise. * db2/include/db.h.src: Likewise. * db2/include/db_int.h.src: Likewise. * db2/include/db_shash.h: Likewise. * db2/include/db_swap.h: Likewise. * db2/include/db_185.h.src: Likewise. * db2/include/txn.h: Likewise. * db2/include/db_am.h: Likewise. * db2/include/shqueue.h: Likewise. * db2/include/hash.h: Likewise. * db2/include/db_dispatch.h: Likewise. * db2/include/lock.h: Likewise. * db2/include/db_page.h: Likewise. * db2/include/log.h: Likewise. * db2/include/db_auto.h: Likewise. * db2/include/btree_auto.h: Likewise. * db2/include/hash_auto.h: Likewise. * db2/include/log_auto.h: Likewise. * db2/include/txn_auto.h: Likewise. * db2/include/db_ext.h: Likewise. * db2/include/btree_ext.h: Likewise. 
* db2/include/clib_ext.h: Likewise. * db2/include/common_ext.h: Likewise. * db2/include/hash_ext.h: Likewise. * db2/include/lock_ext.h: Likewise. * db2/include/log_ext.h: Likewise. * db2/include/mp_ext.h: Likewise. * db2/include/mutex_ext.h: Likewise. * db2/include/os_ext.h: Likewise. * db2/include/txn_ext.h: Likewise. * db2/include/cxx_int.h: Likewise. * db2/include/db_cxx.h: Likewise. * db2/include/queue.h: Likewise. * db2/lock/lock.c: Likewise. * db2/lock/lock_conflict.c: Likewise. * db2/lock/lock_util.c: Likewise. * db2/lock/lock_deadlock.c: Likewise. * db2/log/log.c: Likewise. * db2/log/log_get.c: Likewise. * db2/log/log.src: Likewise. * db2/log/log_compare.c: Likewise. * db2/log/log_put.c: Likewise. * db2/log/log_rec.c: Likewise. * db2/log/log_archive.c: Likewise. * db2/log/log_register.c: Likewise. * db2/log/log_auto.c: Likewise. * db2/log/log_findckp.c: Likewise. * db2/mp/mp_bh.c: Likewise. * db2/mp/mp_fget.c: Likewise. * db2/mp/mp_fopen.c: Likewise. * db2/mp/mp_fput.c: Likewise. * db2/mp/mp_fset.c: Likewise. * db2/mp/mp_open.c: Likewise. * db2/mp/mp_region.c: Likewise. * db2/mp/mp_pr.c: Likewise. * db2/mp/mp_sync.c: Likewise. * db2/mutex/68020.gcc: Likewise. * db2/mutex/mutex.c: Likewise. * db2/mutex/README: Likewise. * db2/mutex/x86.gcc: Likewise. * db2/mutex/sparc.gcc: Likewise. * db2/mutex/uts4.cc.s: Likewise. * db2/mutex/alpha.dec: Likewise. * db2/mutex/alpha.gcc: Likewise. * db2/mutex/parisc.gcc: Likewise. * db2/mutex/parisc.hp: Likewise. * db2/os/db_os_abs.c: Likewise. * db2/os/db_os_dir.c: Likewise. * db2/os/db_os_fid.c: Likewise. * db2/os/db_os_lseek.c: Likewise. * db2/os/db_os_mmap.c: Likewise. * db2/os/db_os_open.c: Likewise. * db2/os/db_os_rw.c: Likewise. * db2/os/db_os_sleep.c: Likewise. * db2/os/db_os_stat.c: Likewise. * db2/os/db_os_unlink.c: Likewise. * db2/txn/txn.c: Likewise. * db2/txn/txn.src: Likewise. * db2/txn/txn_rec.c: Likewise. * db2/txn/txn_auto.c: Likewise. * db2/clib/getlong.c: Likewise. 
* db2/progs/db_archive/db_archive.c: Likewise. * db2/progs/db_checkpoint/db_checkpoint.c: Likewise. * db2/progs/db_deadlock/db_deadlock.c: Likewise. * db2/progs/db_dump/db_dump.c: Likewise. * db2/progs/db_dump185/db_dump185.c: Likewise. * db2/progs/db_load/db_load.c: Likewise. * db2/progs/db_printlog/db_printlog.c: Likewise. * db2/progs/db_recover/db_recover.c: Likewise. * db2/progs/db_stat/db_stat.c: Likewise. * libio/stdio.h [__cplusplus] (__STDIO_INLINE): Define as inline. * po/de.po, po/sv.po: Update from 2.0.5 translations. * sysdeps/unix/sysv/linux/netinet/tcp.h: Pretty print. * sunrpc/rpc/xdr.h (XDR): Don't define argument of x_destroy callback as const. * sunrpc/xdr_mem.c (xdrmem_destroy): Don't define argument as const. * sunrpc/xdr_rec.c (xdrrec_destroy): Likewise. * sunrpc/xdr_stdio.c (xdrstdio_destroy): Likewise. 1997-08-27 18:47 Ulrich Drepper <drepper@cygnus.com> * sysdeps/unix/sysv/linux/if_index.c: Include <errno.h>. Reported by Benjamin Kosnik <bkoz@cygnus.com>. 1997-08-27 02:27 Roland McGrath <roland@baalperazim.frob.com> * abi-tags: New file. * csu/Makefile (distribute): Remove abi-tag.h. ($(objpfx)abi-tag.h): New target. * Makefile (distribute): Add abi-tags. * sysdeps/unix/sysv/linux/abi-tag.h: File removed. * sysdeps/mach/hurd/abi-tag.h: File removed. * sysdeps/stub/abi-tag.h: File removed. 1997-08-25 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de> * sysdeps/unix/make-syscalls.sh: Change output so that it generates compilation rules only for the currently selected object suffixes. 1997-08-25 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de> * sysdeps/m68k/dl-machine.h (RTLD_START): Switch back to previous section to avoid confusing the compiler. * sysdeps/alpha/dl-machine.h (RTLD_START): Likewise. * sysdeps/i386/dl-machine.h (RTLD_START): Likewise. * sysdeps/mips/dl-machine.h (RTLD_START): Likewise. * sysdeps/mips/mips64/dl-machine.h (RTLD_START): Likewise. * sysdeps/sparc/sparc32/dl-machine.h (RTLD_START): Likewise. 
* sysdeps/m68k/dl-machine.h (elf_machine_load_address): Use a GOT relocation instead of a constant to avoid text relocation. (ELF_MACHINE_BEFORE_RTLD_RELOC): Removed. (RTLD_START): Declare global labels as functions and add size directive. 1997-08-25 17:01 Ulrich Drepper <drepper@cygnus.com> * sysdeps/i386/bits/select.h: Correct assembler versions to work even for descriptors >= 32. * stdlib/alloca.h: Don't define alloca to __alloca since if gcc is used __alloca is not defined to __builtin_alloca and so might not be available. Reported by Uwe Ohse <uwe@ohse.de>. * sysdeps/unix/sysv/linux/sys/sysmacros.h: Define macros in a special way if gcc is not used and so dev_t is an array. Reported by Uwe Ohse <uwe@ohse.de>. 1997-08-23 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de> * manual/libc.texinfo: Reorder chapters to match logical order. 1997-08-25 12:22 Ulrich Drepper <drepper@cygnus.com> * sunrpc/rpc/xdr.h: Change name of parameters in prototypes of xdr_reference, xdrmem_create, and xdrstdio_create because of clash with g++ internal symbols. Patch by Sudish Joseph <sj@eng.mindspring.net>. * elf/dl-deps.c: Implement handling of DT_FILTER.
Diffstat (limited to 'db2')
146 files changed, 45840 insertions, 0 deletions
diff --git a/db2/Makefile b/db2/Makefile new file mode 100644 index 0000000000..24d74cc3d8 --- /dev/null +++ b/db2/Makefile @@ -0,0 +1,90 @@ +# Copyright (C) 1991, 92, 93, 94, 95, 96, 97 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. + +# You should have received a copy of the GNU Library General Public +# License along with the GNU C Library; see the file COPYING.LIB. If not, +# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# +# Sub-makefile for libdb. +# +# The code is lifted straight from the db 2.3.4 distribution +# with minimal changes. 
+# + +subdir = db2 + +subdir-dirs = btree common db db185 dbm hash lock log mp mutex os txn \ + progs/db_archive progs/db_checkpoint progs/db_deadlock \ + progs/db_dump progs/db_dump185 progs/db_load progs/db_printlog \ + progs/db_recover progs/db_stat clib + +vpath %.c $(subdir-dirs) + +extra-libs := libdb +extra-libs-others := $(extra-libs) + +libdb-routines := bt_close bt_compare bt_conv bt_cursor bt_delete \ + bt_open bt_page bt_put bt_rec bt_recno bt_rsearch bt_search \ + bt_split bt_stat btree_auto db db_appinit db_apprec \ + db_auto \ + db_byteorder db_conv db_dispatch db_dup db_err db_log2 \ + db_os_abs db_os_dir db_os_fid db_os_lseek db_os_mmap \ + db_os_open db_os_rw db_os_sleep db_os_stat db_os_unlink \ + db_overflow db_pr db_rec db_region db_ret db_salloc \ + db_shash db_thread hash hash_auto hash_conv hash_debug \ + hash_dup hash_func hash_page hash_rec hash_stat lock \ + lock_conflict lock_deadlock lock_util log log_archive \ + log_auto log_compare log_findckp log_get log_put log_rec \ + log_register mp_bh mp_fget mp_fopen mp_fput mp_fset \ + mp_open mp_pr mp_region mp_sync mutex txn txn_auto \ + txn_rec dbm db185 + +others := makedb db_dump185 db_archive db_checkpoint db_deadlock \ + db_dump db_load db_recover db_stat +install-bin := makedb db_dump185 db_archive db_checkpoint db_deadlock \ + db_dump db_load db_recover db_stat + +include ../Rules + +CPPFLAGS += -I./include -include ./compat.h + +$(objpfx)db_checkpoint: $(objpfx)getlong.o +$(objpfx)db_deadlock: $(objpfx)getlong.o +$(objpfx)db_load: $(objpfx)getlong.o + +ifeq ($(build-shared),yes) +$(objpfx)makedb: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_dump185: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_archive: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_checkpoint: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_deadlock: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_dump: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_load: $(objpfx)libdb.so$(libdb.so-version) 
+$(objpfx)db_recover: $(objpfx)libdb.so$(libdb.so-version) +$(objpfx)db_stat: $(objpfx)libdb.so$(libdb.so-version) +else +$(objpfx)makedb: $(objpfx)libdb.a +$(objpfx)db_dump185: $(objpfx)libdb.a +$(objpfx)db_archive: $(objpfx)libdb.a +$(objpfx)db_checkpoint: $(objpfx)libdb.a +$(objpfx)db_deadlock: $(objpfx)libdb.a +$(objpfx)db_dump: $(objpfx)libdb.a +$(objpfx)db_load: $(objpfx)libdb.a +$(objpfx)db_recover: $(objpfx)libdb.a +$(objpfx)db_stat: $(objpfx)libdb.a +endif + +# Depend on libc.so so a DT_NEEDED is generated in the shared objects. +$(objpfx)libdb.so: $(common-objpfx)libc.so diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c new file mode 100644 index 0000000000..4e80634e86 --- /dev/null +++ b/db2/btree/bt_close.c @@ -0,0 +1,184 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_close.c 10.22 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static void __bam_upstat __P((DB *dbp)); + +/* + * __bam_close -- + * Close a btree. + * + * PUBLIC: int __bam_close __P((DB *)); + */ +int +__bam_close(dbp) + DB *dbp; +{ + BTREE *t; + + DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0); + + t = dbp->internal; + + /* Update tree statistics. */ + __bam_upstat(dbp); + + /* Free any allocated memory. 
*/ + if (t->bt_rkey.data) + FREE(t->bt_rkey.data, t->bt_rkey.size); + if (t->bt_rdata.data) + FREE(t->bt_rdata.data, t->bt_rdata.ulen); + if (t->bt_sp != t->bt_stack) + FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG)); + + FREE(t, sizeof(BTREE)); + dbp->internal = NULL; + + return (0); +} + +/* + * __bam_sync -- + * Sync the btree to disk. + * + * PUBLIC: int __bam_sync __P((DB *, int)); + */ +int +__bam_sync(argdbp, flags) + DB *argdbp; + int flags; +{ + DB *dbp; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_syncchk(argdbp, flags)) != 0) + return (ret); + + /* If it wasn't possible to modify the file, we're done. */ + if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY)) + return (0); + + GETHANDLE(argdbp, NULL, &dbp, ret); + + /* Flush any dirty pages from the cache to the backing file. */ + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) + ret = 0; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_upstat -- + * Update tree statistics. + */ +static void +__bam_upstat(dbp) + DB *dbp; +{ + BTREE *t; + BTMETA *meta; + DB_LOCK mlock; + db_pgno_t pgno; + int flags, ret; + + /* + * We use a no-op log call to log the update of the statistics onto the + * metadata page. The dbp->close() call isn't transaction protected to + * start with, and I'm not sure what undoing a statistics update means, + * anyway. + */ + if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY)) + return; + + /* Lock the page. */ + if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock) != 0) + return; + + flags = 0; + pgno = PGNO_METADATA; + + /* Get the page. */ + if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) == 0) { + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __db_noop_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(meta), 0)) == 0) + goto err; + + /* Update the statistics. 
*/ + t = dbp->internal; + __bam_add_mstat(&t->lstat, &meta->stat); + + flags = DB_MPOOL_DIRTY; + } + +err: (void)memp_fput(dbp->mpf, (PAGE *)meta, flags); + (void)__BT_LPUT(dbp, mlock); +} diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c new file mode 100644 index 0000000000..e802fd24ab --- /dev/null +++ b/db2/btree/bt_compare.c @@ -0,0 +1,205 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_compare.c 10.3 (Sleepycat) 7/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_cmp -- + * Compare a key to a given record. + * + * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *)); + */ +int +__bam_cmp(dbp, k1, e) + DB *dbp; + const DBT *k1; + EPG *e; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE *t; + DBT k2; + PAGE *h; + + t = dbp->internal; + + /* + * Returns: + * < 0 if k1 is < record + * = 0 if k1 is = record + * > 0 if k1 is > record + * + * The left-most key on internal pages, at any level of the tree, is + * guaranteed, by the following code, to be less than any user key. + * This saves us from having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything we've yet seen. 
+ */ + h = e->page; + if (e->indx == 0 && + h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE) + return (1); + + bo = NULL; + if (TYPE(h) == P_LBTREE) { + bk = GET_BKEYDATA(h, e->indx); + if (bk->type == B_OVERFLOW) + bo = (BOVERFLOW *)bk; + else { + memset(&k2, 0, sizeof(k2)); + k2.data = bk->data; + k2.size = bk->len; + } + } else { + bi = GET_BINTERNAL(h, e->indx); + if (bi->type == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + memset(&k2, 0, sizeof(k2)); + k2.data = bi->data; + k2.size = bi->len; + } + } + + /* + * XXX + * We ignore system errors; the only recoverable one is ENOMEM, and we + * don't want to require that comparison routines handle random errors. + * We don't want to return a valid comparison, either, so we stop. + */ + if (bo != NULL) { + /* + * If using the default comparison routine, use __db_moff(), + * which compares the overflow key a page at a time. + */ + if (t->bt_compare == __bam_defcmp) + return (__db_moff(dbp, k1, bo->pgno)); + + /* + * Otherwise, we need a contiguous record so we can hand it + * to the user's routine. + */ + if (__db_goff(dbp, &k2, bo->tlen, + bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0) + abort(); + } + return ((*t->bt_compare)(k1, &k2)); +} + +/* + * __bam_defcmp -- + * Default comparison routine. + * + * PUBLIC: int __bam_defcmp __P((const DBT *, const DBT *)); + */ +int +__bam_defcmp(a, b) + const DBT *a, *b; +{ + size_t len; + u_int8_t *p1, *p2; + + /* + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + * + * XXX + * If a size_t doesn't fit into a long, or if the difference between + * any two characters doesn't fit into an int, this routine can lose. + * What we need is a signed integral type that's guaranteed to be at + * least as large as a size_t, and there is no such thing. + */ + len = a->size > b->size ? 
b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((long)*p1 - (long)*p2); + return ((long)a->size - (long)b->size); +} + +/* + * __bam_defpfx -- + * Default prefix routine. + * + * PUBLIC: size_t __bam_defpfx __P((const DBT *, const DBT *)); + */ +size_t +__bam_defpfx(a, b) + const DBT *a, *b; +{ + size_t cnt, len; + u_int8_t *p1, *p2; + + cnt = 1; + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* + * We know that a->size must be <= b->size, or they wouldn't be + * in this order. + */ + return (a->size < b->size ? a->size + 1 : a->size); +} diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c new file mode 100644 index 0000000000..537e2f98ec --- /dev/null +++ b/db2/btree/bt_conv.c @@ -0,0 +1,83 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_conv.c 10.3 (Sleepycat) 8/9/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "btree.h" + +/* + * __bam_pgin, __bam_pgout -- + * Convert host-specific page layout to/from the host-independent + * format stored on disk. + * + * PUBLIC: int __bam_pgin __P((db_pgno_t, void *, DBT *)); + * PUBLIC: int __bam_pgout __P((db_pgno_t, void *, DBT *)); + */ +int +__bam_pgin(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? 
__bam_mswap(pp) : __db_pgin(pg, pp)); +} + +int +__bam_pgout(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? __bam_mswap(pp) : __db_pgout(pg, pp)); +} + +/* + * __bam_mswap -- + * Swap the bytes on the btree metadata page. + * + * PUBLIC: int __bam_mswap __P((PAGE *)); + */ +int +__bam_mswap(pg) + PAGE *pg; +{ + u_int8_t *p; + + p = (u_int8_t *)pg; + SWAP32(p); /* lsn.file */ + SWAP32(p); /* lsn.offset */ + SWAP32(p); /* pgno */ + SWAP32(p); /* magic */ + SWAP32(p); /* version */ + SWAP32(p); /* pagesize */ + SWAP32(p); /* maxkey */ + SWAP32(p); /* minkey */ + SWAP32(p); /* free */ + SWAP32(p); /* flags */ + return (0); +} diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c new file mode 100644 index 0000000000..592ec9b3ff --- /dev/null +++ b/db2/btree/bt_cursor.c @@ -0,0 +1,1577 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_cursor.c 10.26 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_c_close __P((DBC *)); +static int __bam_c_del __P((DBC *, int)); +static int __bam_c_first __P((DB *, CURSOR *)); +static int __bam_c_get __P((DBC *, DBT *, DBT *, int)); +static int __bam_c_last __P((DB *, CURSOR *)); +static int __bam_c_next __P((DB *, CURSOR *, int)); +static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *)); +static int __bam_c_prev __P((DB *, CURSOR *)); +static int __bam_c_put __P((DBC *, DBT *, DBT *, int)); +static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int)); +static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *)); + +/* Discard the current page/lock held by a cursor. */ +#undef DISCARD +#define DISCARD(dbp, cp) { \ + (void)memp_fput(dbp->mpf, (cp)->page, 0); \ + (cp)->page = NULL; \ + (void)__BT_TLPUT((dbp), (cp)->lock); \ + (cp)->lock = LOCK_INVALID; \ +} + +/* + * __bam_cursor -- + * Interface to the cursor functions. + * + * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **)); + */ +int +__bam_cursor(dbp, txn, dbcp) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; +{ + CURSOR *cp; + DBC *dbc; + + DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0); + + if ((dbc = (DBC *)calloc(1, sizeof(DBC))) == NULL) + return (ENOMEM); + if ((cp = (CURSOR *)calloc(1, sizeof(CURSOR))) == NULL) { + free(dbc); + return (ENOMEM); + } + + cp->dbc = dbc; + cp->pgno = cp->dpgno = PGNO_INVALID; + cp->lock = LOCK_INVALID; + + dbc->dbp = dbp; + dbc->txn = txn; + dbc->internal = cp; + dbc->c_close = __bam_c_close; + dbc->c_del = __bam_c_del; + dbc->c_get = __bam_c_get; + dbc->c_put = __bam_c_put; + + /* All cursor structures hang off the main DB structure. 
*/ + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + *dbcp = dbc; + return (0); +} + +/* + * __bam_c_close -- + * Close a single cursor. + */ +static int +__bam_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + CURSOR *cp; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + cp = dbc->internal; + + /* If a cursor key was deleted do the actual deletion. */ + ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0; + + /* Discard any lock if we're not inside a transaction. */ + if (dbp->txn == NULL && cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + + /* Remove the cursor from the queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Discard the structures. */ + FREE(cp, sizeof(CURSOR)); + FREE(dbc, sizeof(DBC)); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_del -- + * Delete using a cursor. + */ +static int +__bam_c_del(dbc, flags) + DBC *dbc; + int flags; +{ + CURSOR *cp; + DB *dbp; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + db_indx_t indx; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cdelchk(dbc->dbp, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + /* If already deleted, return failure. */ + if (F_ISSET(cp, C_DELETED | C_REPLACE)) + return (DB_KEYEMPTY); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + + /* + * We don't physically delete the record until the cursor moves, + * so we have to have a long-lived write lock on the page instead + * of a long-lived read lock. Note, we have to have a read lock + * to even get here, so we simply discard it. 
+ */ + if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) { + if ((ret = __bam_lget(dbp, + 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + goto err; + (void)__BT_TLPUT(dbp, cp->lock); + cp->lock = lock; + cp->mode = DB_LOCK_WRITE; + } + + /* + * Acquire the underlying page (which may be different from the above + * page because it may be a duplicate page), and set the on-page and + * in-cursor delete flags. We don't need to lock it as we've already + * write-locked the page leading to it. + */ + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) { + (void)memp_fput(dbp->mpf, h, 0); + goto err; + } + + /* Set the intent-to-delete flag on the page and in all cursors. */ + if (cp->dpgno == PGNO_INVALID) + GET_BKEYDATA(h, indx + O_INDX)->deleted = 1; + else + GET_BKEYDATA(h, indx)->deleted = 1; + (void)__bam_ca_delete(dbp, pgno, indx, NULL); + + ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + +err: PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_get -- + * Retrieve a key/data pair from the tree. + * + * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); + */ +int +__bam_get(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + DBC dbc; + CURSOR cp; + int ret; + + DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) + return (ret); + + /* Build a cursor. */ + memset(&cp, 0, sizeof(cp)); + cp.dbc = &dbc; + cp.pgno = cp.dpgno = PGNO_INVALID; + cp.lock = LOCK_INVALID; + + memset(&dbc, 0, sizeof(dbc)); + dbc.dbp = argdbp; + dbc.txn = txn; + dbc.internal = &cp; + + /* Get the key. 
*/ + if ((ret = __bam_c_get(&dbc, + key, data, LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET)) != 0) + return (ret); + + /* Discard any lock, the cursor didn't really exist. */ + if (cp.lock != LOCK_INVALID) + (void)__BT_TLPUT(argdbp, cp.lock); + + return (0); +} + +/* + * __bam_c_get -- + * Get using a cursor (btree). + */ +static int +__bam_c_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR *cp, copy; + DB *dbp; + PAGE *h; + int exact, ret; + + DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, + NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cgetchk(dbc->dbp, + key, data, flags, cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* + * Break out the code to return a cursor's record number. It + * has nothing to do with the cursor get code except that it's + * been rammed into the interface. + */ + if (LF_ISSET(DB_GET_RECNO)) { + ret = __bam_c_rget(dbp, cp, key, data, flags); + PUTHANDLE(dbp); + return (ret); + } + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + cp->page = NULL; + cp->lock = LOCK_INVALID; + + switch (flags) { + case DB_CURRENT: + /* It's not possible to return a deleted record. */ + if (F_ISSET(cp, C_DELETED | C_REPLACE)) { + PUTHANDLE(dbp); + return (DB_KEYEMPTY); + } + + /* Get the page with the current item on it. 
*/ + if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + goto err; + break; + case DB_NEXT: + if (cp->pgno != PGNO_INVALID) { + if ((ret = __bam_c_next(dbp, cp, 1)) != 0) + goto err; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + if ((ret = __bam_c_first(dbp, cp)) != 0) + goto err; + break; + case DB_PREV: + if (cp->pgno != PGNO_INVALID) { + if ((ret = __bam_c_prev(dbp, cp)) != 0) + goto err; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + if ((ret = __bam_c_last(dbp, cp)) != 0) + goto err; + break; + case DB_SET_RECNO: + exact = 1; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0) + goto err; + break; + case DB_SET: + exact = 1; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + goto err; + break; + case DB_SET_RANGE: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + goto err; + break; + } + + /* + * Return the key if the user didn't give us one. If we've moved to + * a duplicate page, we may no longer have a pointer to the main page, + * so we have to go get it. We know that it's already read-locked, + * however, so we don't have to acquire a new lock. + */ + if (flags != DB_SET) { + if (cp->dpgno != PGNO_INVALID) { + if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) + goto err; + } else + h = cp->page; + ret = __db_ret(dbp, + h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen); + if (cp->dpgno != PGNO_INVALID) + (void)memp_fput(dbp->mpf, h, 0); + if (ret) + goto err; + } + + /* Return the data. */ + if ((ret = __db_ret(dbp, cp->page, + cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx, + data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + goto err; + + /* + * If the previous cursor record has been deleted, delete it. The + * returned key isn't a deleted key, so clear the flag. + */ + if (F_ISSET(©, C_DELETED) && __bam_c_physdel(dbp, ©, cp->page)) + goto err; + F_CLR(cp, C_DELETED | C_REPLACE); + + /* Release the previous lock, if any. 
*/ + if (copy.lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, copy.lock); + + /* Release the pinned page. */ + ret = memp_fput(dbp->mpf, cp->page, 0); + + ++t->lstat.bt_get; + + if (0) { +err: if (cp->page != NULL) + (void)memp_fput(dbp->mpf, cp->page, 0); + if (cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + *cp = copy; + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_rget -- + * Return the record number for a cursor. + */ +static int +__bam_c_rget(dbp, cp, key, data, flags) + DB *dbp; + CURSOR *cp; + DBT *key, *data; + int flags; +{ + BTREE *t; + DBT dbt; + db_recno_t recno; + int exact, ret; + + /* Get the page with the current item on it. */ + if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + return (ret); + + /* Get a copy of the key. */ + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; + if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0) + goto err; + + exact = 1; + if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0) + goto err; + + t = dbp->internal; + ret = __db_retcopy(data, &recno, sizeof(recno), + &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc); + + /* Release the stack. */ + __bam_stkrel(dbp); + +err: (void)memp_fput(dbp->mpf, cp->page, 0); + free(dbt.data); + return (ret); +} + +/* + * __bam_c_put -- + * Put using a cursor. + */ +static int +__bam_c_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR *cp, copy; + DB *dbp; + DBT dbt; + db_indx_t indx; + db_pgno_t pgno; + int exact, needkey, ret; + void *arg; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST ? 
key : NULL, + data, flags); + + cp = dbc->internal; + + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + cp->page = NULL; + cp->lock = LOCK_INVALID; + + /* + * To split, we need a valid key for the page. Since it's a cursor, + * we have to build one. + */ + if (0) { +split: if (needkey) { + memset(&dbt, 0, sizeof(DBT)); + ret = __db_ret(dbp, cp->page, indx, + &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen); + + DISCARD(dbp, cp); + + if (ret) + goto err; + arg = &dbt; + } else { + (void)__bam_stkrel(dbp); + arg = key; + } + if ((ret = __bam_split(dbp, arg)) != 0) + goto err; + } + + /* If there's no key supplied, use the cursor. */ + if (flags == DB_KEYFIRST || flags == DB_KEYLAST) + needkey = 0; + else { + needkey = 1; + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + /* Acquire the current page. */ + if ((ret = __bam_lget(dbp, + 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + goto err; + } + + ret = 0; + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + case DB_CURRENT: + if ((ret = __bam_iitem(dbp, &cp->page, + &indx, key, data, flags, 0)) == DB_NEEDSPLIT) + goto split; + break; + case DB_KEYFIRST: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0) + goto err; + + indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; + if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, + data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) + goto split; + if (ret) + goto err; + break; + case DB_KEYLAST: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0) + goto err; + + indx = cp->dpgno == PGNO_INVALID ? 
cp->indx : cp->dindx; + if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, + data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) + goto split; + break; + } + if (ret) + goto err; + + /* + * Update the cursor to point to the new entry. The new entry was + * stored on the current page, because we split pages until it was + * possible. + */ + if (cp->dpgno == PGNO_INVALID) + cp->indx = indx; + else + cp->dindx = indx; + + /* + * If the previous cursor record has been deleted, delete it. The + * returned key isn't a deleted key, so clear the flag. + */ + if (F_ISSET(©, C_DELETED) && + (ret = __bam_c_physdel(dbp, ©, cp->page)) != 0) + goto err; + F_CLR(cp, C_DELETED | C_REPLACE); + + /* Release the previous lock, if any. */ + if (copy.lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, copy.lock); + + /* Discard the pinned page. */ + ret = memp_fput(dbp->mpf, cp->page, 0); + if (0) { +err: if (cp->page != NULL) + (void)memp_fput(dbp->mpf, cp->page, 0); + if (cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + *cp = copy; + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_first -- + * Return the first record. + */ +static int +__bam_c_first(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_pgno_t pgno; + int ret; + + /* Walk down the left-hand side of the tree. */ + for (pgno = PGNO_ROOT;;) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + /* If we find a leaf page, we're done. */ + if (ISLEAF(cp->page)) + break; + + pgno = GET_BINTERNAL(cp->page, 0)->pgno; + DISCARD(dbp, cp); + } + + cp->pgno = cp->page->pgno; + cp->indx = 0; + cp->dpgno = PGNO_INVALID; + + /* If it's an empty page or a deleted record, go to the next one. */ + if (NUM_ENT(cp->page) == 0 || + GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted) + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + + /* If it's a duplicate reference, go to the first entry. 
*/ + if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0) + return (ret); + + /* If it's a deleted record, go to the next one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + return (0); +} + +/* + * __bam_c_last -- + * Return the last record. + */ +static int +__bam_c_last(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_pgno_t pgno; + int ret; + + /* Walk down the right-hand side of the tree. */ + for (pgno = PGNO_ROOT;;) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + /* If we find a leaf page, we're done. */ + if (ISLEAF(cp->page)) + break; + + pgno = + GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; + DISCARD(dbp, cp); + } + + cp->pgno = cp->page->pgno; + cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; + cp->dpgno = PGNO_INVALID; + + /* If it's an empty page or a deleted record, go to the previous one. */ + if (NUM_ENT(cp->page) == 0 || + GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted) + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + + /* If it's a duplicate reference, go to the last entry. */ + if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0) + return (ret); + + /* If it's a deleted record, go to the previous one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + return (0); +} + +/* + * __bam_c_next -- + * Move to the next record. + */ +static int +__bam_c_next(dbp, cp, initial_move) + DB *dbp; + CURSOR *cp; + int initial_move; +{ + db_indx_t adjust, indx; + db_pgno_t pgno; + int ret; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + */ + if (cp->dpgno == PGNO_INVALID) { + adjust = dbp->type == DB_BTREE ? 
P_INDX : O_INDX; + pgno = cp->pgno; + indx = cp->indx; + } else { + adjust = O_INDX; + pgno = cp->dpgno; + indx = cp->dindx; + } + if (cp->page == NULL) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + } + + /* + * If at the end of the page, move to a subsequent page. + * + * !!! + * Check for >= NUM_ENT. If we're here as the result of a search that + * landed us on NUM_ENT, we'll increment indx before we test. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (initial_move) + indx += adjust; + for (;;) { + if (indx >= NUM_ENT(cp->page)) { + pgno = cp->page->next_pgno; + DISCARD(dbp, cp); + + /* + * If we're in a btree leaf page, we've reached the end + * of the tree. If we've reached the end of a page of + * duplicates, continue from the btree leaf page where + * we found this page of duplicates. + */ + if (pgno == PGNO_INVALID) { + /* If in a btree leaf page, it's EOF. */ + if (cp->dpgno == PGNO_INVALID) + return (DB_NOTFOUND); + + /* Continue from the last btree leaf page. */ + cp->dpgno = PGNO_INVALID; + + adjust = P_INDX; + pgno = cp->pgno; + indx = cp->indx + P_INDX; + } else + indx = 0; + + if ((ret = __bam_lget(dbp, + 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + continue; + } + + /* Ignore deleted records. */ + if (dbp->type == DB_BTREE && + ((cp->dpgno == PGNO_INVALID && + GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) || + (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, indx)->deleted))) { + indx += adjust; + continue; + } + + /* + * If we're not in a duplicates page, check to see if we've + * found a page of duplicates, in which case we move to the + * first entry. 
+ */ + if (cp->dpgno == PGNO_INVALID) { + cp->pgno = cp->page->pgno; + cp->indx = indx; + + if ((ret = + __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0) + return (ret); + if (cp->dpgno != PGNO_INVALID) { + indx = cp->dindx; + adjust = O_INDX; + continue; + } + } else { + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + } + break; + } + return (0); +} + +/* + * __bam_c_prev -- + * Move to the previous record. + */ +static int +__bam_c_prev(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_indx_t indx, adjust; + db_pgno_t pgno; + int ret, set_indx; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + */ + if (cp->dpgno == PGNO_INVALID) { + adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX; + pgno = cp->pgno; + indx = cp->indx; + } else { + adjust = O_INDX; + pgno = cp->dpgno; + indx = cp->dindx; + } + if (cp->page == NULL) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + } + + /* + * If at the beginning of the page, move to any previous one. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + for (;;) { + if (indx == 0) { + pgno = cp->page->prev_pgno; + DISCARD(dbp, cp); + + /* + * If we're in a btree leaf page, we've reached the + * beginning of the tree. If we've reached the first + * of a page of duplicates, continue from the btree + * leaf page where we found this page of duplicates. + */ + if (pgno == PGNO_INVALID) { + /* If in a btree leaf page, it's SOF. */ + if (cp->dpgno == PGNO_INVALID) + return (DB_NOTFOUND); + + /* Continue from the last btree leaf page. 
*/ + cp->dpgno = PGNO_INVALID; + + adjust = P_INDX; + pgno = cp->pgno; + indx = cp->indx; + set_indx = 0; + } else + set_indx = 1; + + if ((ret = __bam_lget(dbp, + 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + if (set_indx) + indx = NUM_ENT(cp->page); + if (indx == 0) + continue; + } + + /* Ignore deleted records. */ + indx -= adjust; + if (dbp->type == DB_BTREE && + ((cp->dpgno == PGNO_INVALID && + GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) || + (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, indx)->deleted))) + continue; + + /* + * If we're not in a duplicates page, check to see if we've + * found a page of duplicates, in which case we move to the + * last entry. + */ + if (cp->dpgno == PGNO_INVALID) { + cp->pgno = cp->page->pgno; + cp->indx = indx; + + if ((ret = + __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0) + return (ret); + if (cp->dpgno != PGNO_INVALID) { + indx = cp->dindx + O_INDX; + adjust = O_INDX; + continue; + } + } else { + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + } + break; + } + return (0); +} + +/* + * __bam_c_search -- + * Move to a specified record. + */ +static int +__bam_c_search(dbp, cp, key, flags, isrecno, exactp) + DB *dbp; + CURSOR *cp; + const DBT *key; + u_int flags; + int isrecno, *exactp; +{ + BTREE *t; + db_recno_t recno; + int needexact, ret; + + t = dbp->internal; + needexact = *exactp; + + /* + * Find any matching record; the search function pins the page. Make + * sure it's a valid key (__bam_search may return an index just past + * the end of a page) and return it. 
+ */ + if (isrecno) { + if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + return (ret); + ret = __bam_rsearch(dbp, &recno, flags, 1, exactp); + } else + ret = __bam_search(dbp, key, flags, 1, NULL, exactp); + if (ret != 0) + return (ret); + + cp->page = t->bt_csp->page; + cp->pgno = cp->page->pgno; + cp->indx = t->bt_csp->indx; + cp->lock = t->bt_csp->lock; + cp->dpgno = PGNO_INVALID; + + /* + * If we have an exact match, make sure that we're not looking at a + * chain of duplicates -- if so, move to an entry in that chain. + */ + if (*exactp) { + if ((ret = __bam_ovfl_chk(dbp, + cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST))) != 0) + return (ret); + } else + if (needexact) + return (DB_NOTFOUND); + + /* If past the end of a page, find the next entry. */ + if (cp->indx == NUM_ENT(cp->page) && + (ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + + /* If it's a deleted record, go to the next or previous one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if (flags == S_KEYLAST) { + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + } else + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + return (0); +} + +/* + * __bam_ovfl_chk -- + * Check for an overflow record, and if found, move to the correct + * record. + * + * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); + */ +int +__bam_ovfl_chk(dbp, cp, indx, to_end) + DB *dbp; + CURSOR *cp; + u_int32_t indx; + int to_end; +{ + BOVERFLOW *bo; + db_pgno_t pgno; + int ret; + + /* Check for an overflow entry. */ + bo = GET_BOVERFLOW(cp->page, indx); + if (bo->type != B_DUPLICATE) + return (0); + + /* + * If we find one, go to the duplicates page, and optionally move + * to the last record on that page. + * + * XXX + * We don't lock duplicates pages, we've already got the correct + * lock on the main page. 
+ */ + pgno = bo->pgno; + if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + return (ret); + cp->page = NULL; + if (to_end) { + if ((ret = __db_dend(dbp, pgno, &cp->page)) != 0) + return (ret); + indx = NUM_ENT(cp->page) - O_INDX; + } else { + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + indx = 0; + } + + /* Update the duplicate entry in the cursor. */ + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + + return (0); +} + +#ifdef DEBUG +/* + * __bam_cprint -- + * Display the current btree cursor list. + */ +int +__bam_cprint(dbp) + DB *dbp; +{ + CURSOR *cp; + DBC *dbc; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + fprintf(stderr, + "%#0x: page: %lu index: %lu dpage %lu dindex: %lu", + (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, + (u_long)cp->dpgno, (u_long)cp->dindx); + if (F_ISSET(cp, C_DELETED)) + fprintf(stderr, "(deleted)"); + fprintf(stderr, "\n"); + } + DB_THREAD_UNLOCK(dbp); + return (0); +} +#endif /* DEBUG */ + +/* + * __bam_ca_delete -- + * Check if any of the cursors refer to the item we are about to delete. + * We'll return the number of cursors that refer to the item in question. + * If a cursor does refer to the item, then we set its deleted bit. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *)); + */ +int +__bam_ca_delete(dbp, pgno, indx, curs) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + CURSOR *curs; +{ + DBC *dbc; + CURSOR *cp; + int count; + + /* + * Adjust the cursors. We don't have to review the cursors for any + * process other than the current one, because we have the page write + * locked at this point, and any other process had better be using a + * different locker ID, meaning that only cursors in our process can + * be on the page. 
+ * + * It's possible for multiple cursors within the thread to have write + * locks on the same page, but, cursors within a thread must be single + * threaded, so all we're locking here is the cursor linked list. + * + * indx refers to the first of what might be a duplicate set. The + * cursor passed in is the one initiating the delete, so we don't + * want to count it. + */ + DB_THREAD_LOCK(dbp); + for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if ((curs != cp && + cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) { + ++count; + F_SET(cp, C_DELETED); + } + } + DB_THREAD_UNLOCK(dbp); + return (count); +} + +/* + * __bam_ca_di -- + * Adjust the cursors during a delete or insert. + * + * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_di(dbp, pgno, indx, value) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int value; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx >= indx) + cp->indx += value; + if (cp->dpgno == pgno && cp->dindx >= indx) + cp->dindx += value; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving data items to a duplicates page. + * + * PUBLIC: void __bam_ca_dup __P((DB *, + * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +void +__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) + DB *dbp; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + CURSOR *cp; + DBC *dbc; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). 
+ * + * No need to test duplicates, this only gets called when moving + * leaf page data items onto a duplicates page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + /* + * Ignore matching entries that have already been moved, + * we move from the same location on the leaf page more + * than once. + */ + if (cp->dpgno == PGNO_INVALID && + cp->pgno == fpgno && cp->indx == fi) { + cp->indx = first; + cp->dpgno = tpgno; + cp->dindx = ti; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_move -- + * Adjust the cursors when moving data items to another page. + * + * PUBLIC: void __bam_ca_move __P((DB *, BTREE *, db_pgno_t, db_pgno_t)); + */ +void +__bam_ca_move(dbp, t, fpgno, tpgno) + DB *dbp; + BTREE *t; + db_pgno_t fpgno, tpgno; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * No need to test duplicates, this only gets called when copying + * over the root page with a leaf or internal page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == fpgno) + cp->pgno = tpgno; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_replace -- + * Check if any of the cursors refer to the item we are about to replace. + * If so, their flags should be changed from deleted to replaced. + * + * PUBLIC: void __bam_ca_replace + * PUBLIC: __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); + */ +void +__bam_ca_replace(dbp, pgno, indx, pass) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + ca_replace_arg pass; +{ + CURSOR *cp; + DBC *dbc; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * Find any cursors that have logically deleted a record we're about + * to overwrite. 
+ * + * Pass == REPLACE_SETUP: + * Set C_REPLACE_SETUP so we can find the cursors again. + * + * Pass == REPLACE_SUCCESS: + * Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the + * overwrite was successful. + * + * Pass == REPLACE_FAILED: + * Clear C_REPLACE_SETUP, the overwrite failed. + * + * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value + * for the cursor as it may have been changed by other cursor update + * routines as the item was deleted/inserted. + */ + DB_THREAD_LOCK(dbp); + switch (pass) { + case REPLACE_SETUP: /* Setup. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if ((cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) + F_SET(cp, C_REPLACE_SETUP); + } + break; + case REPLACE_SUCCESS: /* Overwrite succeeded. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (F_ISSET(cp, C_REPLACE_SETUP)) { + if (cp->dpgno == pgno) + cp->dindx = indx; + if (cp->pgno == pgno) + cp->indx = indx; + F_SET(cp, C_REPLACE); + F_CLR(cp, C_DELETED | C_REPLACE_SETUP); + } + } + break; + case REPLACE_FAILED: /* Overwrite failed. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (F_ISSET(cp, C_REPLACE_SETUP)) { + if (cp->dpgno == pgno) + cp->dindx = indx; + if (cp->pgno == pgno) + cp->indx = indx; + F_CLR(cp, C_REPLACE_SETUP); + } + } + break; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. + * + * PUBLIC: void __bam_ca_split __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) + DB *dbp; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DBC *dbc; + CURSOR *cp; + + /* Recno is responsible for its own adjustments. 
*/ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == ppgno) + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + if (cp->dpgno == ppgno) + if (cp->dindx < split_indx) { + if (cleft) + cp->dpgno = lpgno; + } else { + cp->dpgno = rpgno; + cp->dindx -= split_indx; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_c_physdel -- + * Actually do the cursor deletion. + */ +static int +__bam_c_physdel(dbp, cp, h) + DB *dbp; + CURSOR *cp; + PAGE *h; +{ + BOVERFLOW bo; + BTREE *t; + DBT dbt; + DB_LOCK lock; + db_indx_t indx; + db_pgno_t pgno, next_pgno, prev_pgno; + int local, ret; + + t = dbp->internal; + ret = 0; + + /* Figure out what we're deleting. */ + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + + /* + * If the item is referenced by another cursor, leave it up to that + * cursor to do the delete. + */ + if (__bam_ca_delete(dbp, pgno, indx, cp) != 0) + return (0); + + /* + * If we don't already have the page locked, get it and delete the + * items. 
+ */ + if ((h == NULL || h->pgno != pgno)) { + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + return (ret); + local = 1; + } else + local = 0; + + /* + * If we're deleting a duplicate entry, call the common code to do + * the work. + */ + if (TYPE(h) == P_DUPLICATE) { + pgno = PGNO(h); + prev_pgno = PREV_PGNO(h); + next_pgno = NEXT_PGNO(h); + if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0) + goto err; + + /* + * There are 4 cases: + * + * 1. We removed an item on a page, but there are more items + * on the page. + * 2. We removed the last item on a page, removing the last + * duplicate. + * 3. We removed the last item on a page, but there is a + * following page of duplicates. + * 4. We removed the last item on a page, but there is a + * previous page of duplicates. + * + * In case 1, h != NULL, h->pgno == pgno + * In case 2, h == NULL, + * prev_pgno == PGNO_INVALID, next_pgno == PGNO_INVALID + * In case 3, h != NULL, next_pgno != PGNO_INVALID + * In case 4, h == NULL, prev_pgno != PGNO_INVALID + * + * In case 1, there's nothing else to do. + * In case 2, remove the entry from the parent page. + * In case 3 or 4, if the deleted page was the first in a chain + * of duplicate pages, update the parent page's entry. + * + * Test: + * If there were previous pages of duplicates or we didn't + * empty the current page of duplicates, we don't need to + * touch the parent page. + */ + if (PREV_PGNO(h) != PGNO_INVALID || + (h != NULL && pgno == h->pgno)) + goto done; + + /* + * Release any page we're holding and the lock on the deleted + * page. + */ + if (local) { + if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + } + + /* Acquire the parent page. 
*/ + if ((ret = + __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) { + (void)__BT_TLPUT(dbp, lock); + goto err; + } + local = 1; + + /* + * If we deleted the last duplicate, we can fall out and do a + * normal btree delete in the context of the parent page. If + * not, we have to update the parent's page. + */ + indx = cp->indx; + if (next_pgno != PGNO_INVALID) { + /* + * Copy, delete, update and re-insert the parent page's + * entry. + */ + bo = *GET_BOVERFLOW(h, indx); + (void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE); + bo.pgno = next_pgno; + memset(&dbt, 0, sizeof(dbt)); + dbt.data = &bo; + dbt.size = BOVERFLOW_SIZE; + (void)__db_pitem(dbp, + h, indx, BOVERFLOW_SIZE, &dbt, NULL); + + /* Discard the parent page. */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + + goto done; + } + } + + /* Otherwise, do a normal btree delete. */ + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + + /* + * If the page is empty, delete it. To delete a leaf page we need a + * copy of a key from the page. We use the first one that was there, + * since it's the last key that the page held. We malloc the page + * information instead of using the return key/data memory because + * we've already set them -- the reason that we've already set them + * is because we're (potentially) about to do a reverse split, which + * would make our saved page information useless. + * + * XXX + * The following operations to delete a page might deadlock. I think + * that's OK. The problem is if we're deleting an item because we're + * closing cursors because we've already deadlocked and want to call + * txn_abort(). If we fail due to deadlock, we'll leave an locked + * empty page in the tree, which won't be empty long because we're + * going to undo the delete. 
+ */ + if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; + if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0) + goto err; + + if (local) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + } + + ret = __bam_dpage(dbp, &dbt); + free(dbt.data); + } + +err: +done: if (local) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + } + + if (ret == 0) + ++t->lstat.bt_deleted; + return (ret); +} diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c new file mode 100644 index 0000000000..e7ec4dfe3e --- /dev/null +++ b/db2/btree/bt_delete.c @@ -0,0 +1,607 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_delete.c 10.18 (Sleepycat) 8/24/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_dpages __P((DB *, BTREE *));
+
+/*
+ * __bam_delete --
+ *	Delete the items referenced by a key.
+ *
+ *	Checks the flags, searches the tree for the key with S_DELETE and
+ *	then removes the key/data pair together with every duplicate of the
+ *	key that sits on the same page.  If the leaf page ends up empty (and
+ *	is not the root) it is handed to __bam_dpage after the search stack
+ *	has been released.
+ *
+ *	Returns 0 on success, otherwise the first non-zero value returned by
+ *	one of the helpers.
+ *
+ * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, int));
+ */
+int
+__bam_delete(argdbp, txn, key, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key;
+	int flags;
+{
+	BTREE *t;
+	DB *dbp;
+	PAGE *h;
+	db_indx_t cnt, i, indx;
+	int dpage, exact, ret, stack;
+
+	DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags);
+
+	/* "stack" records whether the search stack must be released. */
+	stack = 0;
+
+	/* Check for invalid flags. */
+	if ((ret =
+	    __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Search the tree for the key; delete only deletes exact matches. */
+	if ((ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact)) != 0)
+		goto err;
+	stack = 1;
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+
+	/*
+	 * Delete the key/data pair, including any duplicates.
+	 *
+	 * First count the pairs: duplicates are the following key slots
+	 * whose page offset (inp[] value) equals that of the key we found.
+	 */
+	for (cnt = 1, i = indx;; ++cnt)
+		if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx])
+			break;
+
+	/*
+	 * Then remove them one pair at a time.  When __bam_ca_delete reports
+	 * non-zero the pair is only flagged as deleted and skipped over
+	 * (presumably because a cursor still references it -- TODO confirm);
+	 * otherwise __bam_ditem is called twice at the same index, once for
+	 * the key slot and once for the data slot.
+	 */
+	for (; cnt > 0; --cnt, ++t->lstat.bt_deleted)
+		if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) {
+			GET_BKEYDATA(h, indx + O_INDX)->deleted = 1;
+			indx += P_INDX;
+		} else if ((ret = __bam_ditem(dbp, h, indx)) != 0 ||
+		    (ret = __bam_ditem(dbp, h, indx)) != 0)
+			goto err;
+
+	/* If we're using record numbers, update internal page record counts. */
+	if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0)
+		goto err;
+
+	/* If the page is now empty, delete it. */
+	dpage = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT;
+
+	/* The decision is taken before the pages are given back. */
+	__bam_stkrel(dbp);
+	stack = 0;
+
+	ret = dpage ? __bam_dpage(dbp, key) : 0;
+
+err:	if (stack)
+		__bam_stkrel(dbp);
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __ram_delete --
+ *	Delete the items referenced by a key.
+ *
+ *	Recno flavour of delete.  The key is turned into a record number,
+ *	the record is located with __bam_rsearch and then either replaced by
+ *	an empty "deleted" marker (when DB_RE_RENUMBER is not set) or removed
+ *	outright, in which case record counts and cursors are adjusted and an
+ *	emptied non-root page is deleted via __bam_dpages.
+ *
+ *	Returns 0 on success, DB_NOTFOUND if there is no exact match,
+ *	DB_KEYEMPTY if the record is already marked deleted, otherwise the
+ *	first non-zero value returned by one of the helpers.
+ *
+ * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, int));
+ */
+int
+__ram_delete(argdbp, txn, key, flags)
+	DB *argdbp;
+	DB_TXN *txn;
+	DBT *key;
+	int flags;
+{
+	BKEYDATA bk;
+	BTREE *t;
+	DB *dbp;
+	DBT hdr, data;
+	PAGE *h;
+	db_indx_t indx;
+	db_recno_t recno;
+	int exact, ret, stack;
+
+	stack = 0;
+
+	/* Check for invalid flags. */
+	if ((ret =
+	    __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0)
+		return (ret);
+
+	GETHANDLE(argdbp, txn, &dbp, ret);
+	t = dbp->internal;
+
+	/* Check the user's record number and fill in as necessary. */
+	if ((ret = __ram_getno(argdbp, key, &recno, 0)) != 0)
+		goto err;
+
+	/* Search the tree for the key; delete only deletes exact matches. */
+	if ((ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact)) != 0)
+		goto err;
+	if (!exact) {
+		/*
+		 * NOTE(review): the stack is not released on this path; this
+		 * is only correct if __bam_rsearch never returns success with
+		 * pages held and exact == 0 for S_DELETE -- confirm.
+		 */
+		ret = DB_NOTFOUND;
+		goto err;
+	}
+
+	h = t->bt_csp->page;
+	indx = t->bt_csp->indx;
+	stack = 1;
+
+	/* If the record has already been deleted, we couldn't have found it. */
+	if (GET_BKEYDATA(h, indx)->deleted) {
+		ret = DB_KEYEMPTY;
+		goto done;
+	}
+
+	/*
+	 * If we're not renumbering records, replace the record with a marker
+	 * and return.
+	 */
+	if (!F_ISSET(dbp, DB_RE_RENUMBER)) {
+		if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+			goto err;
+
+		/* Build a zero-length B_KEYDATA item flagged as deleted. */
+		bk.deleted = 1;
+		bk.type = B_KEYDATA;
+		bk.len = 0;
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.data = &bk;
+		hdr.size = SSZA(BKEYDATA, data);
+		memset(&data, 0, sizeof(data));
+		data.data = (char *) "";
+		data.size = 0;
+		if ((ret = __db_pitem(dbp,
+		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+			goto err;
+
+		++t->lstat.bt_deleted;
+		goto done;
+	}
+
+	/* Delete the item. */
+	if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+		goto err;
+
+	++t->lstat.bt_deleted;
+	if (t->bt_recno != NULL)
+		F_SET(t->bt_recno, RECNO_MODIFIED);
+
+	/*
+	 * Adjust the counts.  The result used to be thrown away; keep it so
+	 * that a failure is reported to the caller, as __bam_delete does.
+	 */
+	ret = __bam_adjust(dbp, t, -1);
+
+	/*
+	 * Adjust the cursors.  This is done even if the count adjustment
+	 * failed, because the record itself is already gone from the page.
+	 */
+	__ram_ca(dbp, recno, CA_DELETE);
+
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If the page is now empty, delete it -- we have the whole tree
+	 * locked, so there are no preparations to make.  Else, release
+	 * the pages.
+	 */
+	if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
+		/* __bam_dpages disposes of the stacked pages itself. */
+		stack = 0;
+		ret = __bam_dpages(dbp, t);
+	}
+
+done:
+err:	if (stack)
+		__bam_stkrel(dbp);
+
+	PUTHANDLE(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_ditem --
+ *	Delete one or more entries from a page.
+ *
+ *	The number of bytes to remove is derived from the page type and the
+ *	type of the item at "indx".  Items that reference an off-page
+ *	duplicate or overflow chain have that chain freed first.  On a btree
+ *	leaf, a key slot that shares its key with a neighbouring duplicate
+ *	only loses its index entry.  The page is marked dirty on success.
+ *
+ *	Returns 0 on success, the value of __db_pgfmt for an unexpected page
+ *	or item type, or the first non-zero value returned by a helper.
+ *
+ * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__bam_ditem(dbp, h, indx)
+	DB *dbp;
+	PAGE *h;
+	u_int32_t indx;
+{
+	BINTERNAL *bi;
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	u_int32_t nbytes;
+	int ret;
+
+	switch (TYPE(h)) {
+	case P_IBTREE:
+		bi = GET_BINTERNAL(h, indx);
+		switch (bi->type) {
+		case B_DUPLICATE:
+		case B_OVERFLOW:
+			nbytes = BINTERNAL_SIZE(bi->len);
+			/* Shares the chain-freeing code of the leaf case. */
+			goto offpage;
+		case B_KEYDATA:
+			nbytes = BKEYDATA_SIZE(bi->len);
+			break;
+		default:
+			return (__db_pgfmt(dbp, h->pgno));
+		}
+		break;
+	case P_IRECNO:
+		nbytes = RINTERNAL_SIZE;
+		break;
+	case P_LBTREE:
+		/*
+		 * If it's a duplicate key, discard the index and don't touch
+		 * the actual page item.  This works because no data item can
+		 * have an index that matches any other index so even if the
+		 * data item is in an index "slot", it won't match any other
+		 * index.
+		 */
+		if (!(indx % 2)) {
+			if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+				return (__bam_adjindx(dbp,
+				    h, indx, indx - P_INDX, 0));
+			if (indx < (u_int32_t)(NUM_ENT(h) - P_INDX) &&
+			    h->inp[indx] == h->inp[indx + P_INDX])
+				return (__bam_adjindx(dbp,
+				    h, indx, indx + O_INDX, 0));
+		}
+		/* FALLTHROUGH */
+	case P_LRECNO:
+		bk = GET_BKEYDATA(h, indx);
+		switch (bk->type) {
+		case B_DUPLICATE:
+		case B_OVERFLOW:
+			nbytes = BOVERFLOW_SIZE;
+
+offpage:		/* Delete duplicate/offpage chains. */
+			bo = GET_BOVERFLOW(h, indx);
+			if (bo->type == B_DUPLICATE) {
+				if ((ret =
+				    __db_ddup(dbp, bo->pgno, __bam_free)) != 0)
+					return (ret);
+			} else
+				if ((ret =
+				    __db_doff(dbp, bo->pgno, __bam_free)) != 0)
+					return (ret);
+			break;
+		case B_KEYDATA:
+			nbytes = BKEYDATA_SIZE(bk->len);
+			break;
+		default:
+			return (__db_pgfmt(dbp, h->pgno));
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp, h->pgno));
+	}
+
+	/* Delete the item. */
+	if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0)
+		return (ret);
+
+	/* Mark the page dirty. */
+	return (memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY));
+}
+
+/*
+ * __bam_adjindx --
+ *	Adjust an index on the page.
+ *
+ *	With "is_insert" set, a copy of the index entry at "indx_copy" is
+ *	inserted at position "indx"; otherwise the index entry at "indx" is
+ *	removed.  Only the inp[] array and the entry count change, the page
+ *	items themselves are left alone.  The change is logged first when
+ *	logging is enabled, and cursors on the page are adjusted afterwards.
+ *
+ *	Returns 0 on success, or the non-zero value returned by the log
+ *	write or by memp_fset.
+ *
+ * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbp, h, indx, indx_copy, is_insert)
+	DB *dbp;
+	PAGE *h;
+	u_int32_t indx, indx_copy;
+	int is_insert;
+{
+	db_indx_t copy;
+	int ret;
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp) &&
+	    (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h),
+	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy,
+	    (u_int32_t)is_insert)) != 0)
+		return (ret);
+
+	if (is_insert) {
+		/* Fetch the value before the array is shifted. */
+		copy = h->inp[indx_copy];
+		if (indx != NUM_ENT(h))
+			memmove(&h->inp[indx + O_INDX], &h->inp[indx],
+			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+		h->inp[indx] = copy;
+		++NUM_ENT(h);
+	} else {
+		--NUM_ENT(h);
+		if (indx != NUM_ENT(h))
+			memmove(&h->inp[indx], &h->inp[indx + O_INDX],
+			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+	}
+
+	/* Mark the page dirty. */
+	ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
+
+	/*
+	 * Adjust the cursors.  The page has been changed whether or not it
+	 * could be flagged dirty, so this is done unconditionally.
+	 */
+	__bam_ca_di(dbp, h->pgno, indx, is_insert ? 1 : -1);
+
+	/* Report a memp_fset failure instead of silently dropping it. */
+	return (ret);
+}
+
+/*
+ * __bam_dpage --
+ *	Delete a page from the tree.
+ *
+ *	"key" is a key that lived on the (now empty) leaf page; it is used
+ *	to find the page again.  The routine determines the topmost page of
+ *	the chain of pages that would become empty, write locks the chain
+ *	down to the leaf and, if every page is still in the expected state,
+ *	passes the stack to __bam_dpages.  If another thread of control
+ *	added entries in the meantime the delete is quietly abandoned.
+ *
+ *	Returns 0 on success or when the delete is abandoned, otherwise the
+ *	first non-zero value returned by a helper.
+ *
+ * PUBLIC: int __bam_dpage __P((DB *, const DBT *));
+ */
+int
+__bam_dpage(dbp, key)
+	DB *dbp;
+	const DBT *key;
+{
+	BTREE *t;
+	DB_LOCK lock;
+	PAGE *h;
+	db_pgno_t pgno;
+	int exact, level, ret;
+
+	ret = 0;
+	t = dbp->internal;
+
+	/*
+	 * The locking protocol is that we acquire locks by walking down the
+	 * tree, to avoid the obvious deadlocks.
+	 *
+	 * Call __bam_search to reacquire the empty leaf page, but this time
+	 * get both the leaf page and its parent, locked.  Walk back up the
+	 * tree, until we have the top pair of pages that we want to delete.
+	 * Once we have the top page that we want to delete locked, lock the
+	 * underlying pages and check to make sure they're still empty.  If
+	 * they are, delete them.
+	 */
+	for (level = LEAFLEVEL;; ++level) {
+		/* Acquire a page and its parent, locked. */
+		if ((ret =
+		    __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0)
+			return (ret);
+
+		/*
+		 * If we reach the root or the page isn't going to be empty
+		 * when we delete one record, quit.
+		 */
+		h = t->bt_csp[-1].page;
+		if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1)
+			break;
+
+		/* Release the two locked pages. */
+		(void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0);
+		(void)__BT_TLPUT(dbp, t->bt_csp[-1].lock);
+		(void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0);
+		(void)__BT_TLPUT(dbp, t->bt_csp[0].lock);
+	}
+
+	/*
+	 * Leave the stack pointer one after the last entry, we may be about
+	 * to push more items on the stack.
+	 */
+	++t->bt_csp;
+
+	/*
+	 * t->bt_csp[-2].page is the top page, which we're not going to delete,
+	 * and t->bt_csp[-1].page is the first page we are going to delete.
+	 *
+	 * Walk down the chain, acquiring the rest of the pages until we've
+	 * retrieved the leaf page.  If we find any pages that aren't going
+	 * to be emptied by the delete, someone else added something while we
+	 * were walking the tree, and we discontinue the delete.
+	 */
+	for (h = t->bt_csp[-1].page;;) {
+		if (ISLEAF(h)) {
+			if (NUM_ENT(h) != 0)
+				goto release;
+			break;
+		} else
+			if (NUM_ENT(h) != 1)
+				goto release;
+
+		/*
+		 * Get the next page, write lock it and push it onto the stack.
+		 * We know it's index 0, because it can only have one element.
+		 */
+		pgno = TYPE(h) == P_IBTREE ?
+		    GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno;
+
+		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+			goto release;
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+			/*
+			 * The lock is not on the stack yet, so the release
+			 * code below would never see it; drop it here.
+			 */
+			(void)__BT_TLPUT(dbp, lock);
+			goto release;
+		}
+		BT_STK_PUSH(t, h, 0, lock, ret);
+		/*
+		 * NOTE(review): if BT_STK_PUSH can fail without recording the
+		 * entry, the page and lock acquired above are lost on this
+		 * path -- confirm against the macro definition.
+		 */
+		if (ret != 0)
+			goto release;
+	}
+
+	/* Point the stack back at the last entry before handing it over. */
+	BT_STK_POP(t);
+	return (__bam_dpages(dbp, t));
+
+release:
+	/* Discard any locked pages and return. */
+	BT_STK_POP(t);
+	__bam_stkrel(dbp);
+	return (ret);
+}
+
+/*
+ * __bam_dpages --
+ *	Delete a set of locked pages.
+ */
+static int
+__bam_dpages(dbp, t)
+	DB *dbp;
+	BTREE *t;
+{
+	DBT a, b;
+	DB_LOCK lock;
+	EPG *epg;
+	PAGE *h;
+	db_pgno_t pgno;
+	db_recno_t rcnt;
+	int ret;
+
+	/*
+	 * On entry the stack (t->bt_sp .. t->bt_csp) holds the locked pages:
+	 * the first entry is the parent that keeps living and loses one item,
+	 * every following entry is a page to be freed, the last one being the
+	 * leaf.  All pages and locks on the stack are released before return.
+	 */
+	rcnt = 0;				/* XXX: Shut the compiler up. */
+	epg = t->bt_sp;
+
+	/*
+	 * !!!
+	 * There is an interesting deadlock situation here.  We have to relink
+	 * the leaf page chain around the leaf page being deleted.  Consider
+	 * a cursor walking through the leaf pages, that has the previous page
+	 * read-locked and is waiting on a lock for the page we're deleting.
+	 * It will deadlock here.  This is a problem, because if our process is
+	 * selected to resolve the deadlock, we'll leave an empty leaf page
+	 * that we can never again access by walking down the tree.  So, before
+	 * we unlink the subtree, we relink the leaf page chain.
+	 */
+	if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0)
+		goto release;
+
+	/*
+	 * We have the entire stack of deletable pages locked.  Start from the
+	 * top of the tree and move to the bottom, as it's better to release
+	 * the inner pages as soon as possible.
+	 */
+	if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0)
+		goto release;
+
+	/*
+	 * If we deleted the next-to-last item from the root page, the tree
+	 * has collapsed a level.  Try and write lock the remaining root + 1
+	 * page and copy it onto the root page.  If we can't get the lock,
+	 * that's okay, the tree just stays a level deeper than we'd like.
+	 *
+	 * NOTE(review): despite the sentence above, a failed lock or page get
+	 * is returned to the caller as an error; behavior is left unchanged.
+	 */
+	h = epg->page;
+	if (h->pgno == PGNO_ROOT && NUM_ENT(h) == 1) {
+		pgno = TYPE(epg->page) == P_IBTREE ?
+		    GET_BINTERNAL(epg->page, 0)->pgno :
+		    GET_RINTERNAL(epg->page, 0)->pgno;
+		if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
+			goto release;
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) {
+			/* This lock is not on the stack; drop it here. */
+			(void)__BT_TLPUT(dbp, lock);
+			goto release;
+		}
+
+		/* Log the change. */
+		if (DB_LOGGING(dbp)) {
+			memset(&a, 0, sizeof(a));
+			a.data = h;
+			a.size = dbp->pgsize;
+			memset(&b, 0, sizeof(b));
+			b.data = P_ENTRY(epg->page, 0);
+			b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+			__bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn,
+			    &h->lsn, 0, dbp->log_fileid, h->pgno, &a, &b,
+			    &epg->page->lsn);
+		}
+
+		/*
+		 * Make the switch.
+		 *
+		 * One fixup -- if the tree has record numbers and we're not
+		 * converting to a leaf page, we have to preserve the total
+		 * record count.
+		 */
+		if (TYPE(h) == P_IRECNO ||
+		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			rcnt = RE_NREC(epg->page);
+		memcpy(epg->page, h, dbp->pgsize);
+		epg->page->pgno = PGNO_ROOT;
+		if (TYPE(h) == P_IRECNO ||
+		    (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM)))
+			RE_NREC_SET(epg->page, rcnt);
+
+		/*
+		 * Remember the page number now: __bam_free gives our
+		 * reference to the page back to the pool, so "h" must not be
+		 * dereferenced once it has been called.
+		 */
+		pgno = h->pgno;
+
+		/* Free the last page in that level of the btree. */
+		++t->lstat.bt_freed;
+		(void)__bam_free(dbp, h);
+
+		/* Adjust the cursors. */
+		__bam_ca_move(dbp, t, pgno, PGNO_ROOT);
+
+		(void)__BT_TLPUT(dbp, lock);
+	}
+
+	/* Release the top page in the subtree. */
+	(void)memp_fput(dbp->mpf, epg->page, 0);
+	(void)__BT_TLPUT(dbp, epg->lock);
+
+	/*
+	 * Free the rest of the pages.
+	 *
+	 * XXX
+	 * Don't bother checking for errors.  We've unlinked the subtree from
+	 * the tree, and there's no possibility of recovery.
+	 */
+	for (; ++epg <= t->bt_csp; ++t->lstat.bt_freed) {
+		if (NUM_ENT(epg->page) != 0)
+			(void)__bam_ditem(dbp, epg->page, epg->indx);
+
+		(void)__bam_free(dbp, epg->page);
+		(void)__BT_TLPUT(dbp, epg->lock);
+	}
+	return (0);
+
+release:
+	/* Discard any remaining pages and return. */
+	for (; epg <= t->bt_csp; ++epg) {
+		(void)memp_fput(dbp->mpf, epg->page, 0);
+		(void)__BT_TLPUT(dbp, epg->lock);
+	}
+	return (ret);
+}
diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c
new file mode 100644
index 0000000000..354888c6c2
--- /dev/null
+++ b/db2/btree/bt_open.c
@@ -0,0 +1,355 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)bt_open.c 10.20 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+/*
+ * Implementation of btree access method for 4.4BSD.
+ *
+ * The design here was originally based on that of the btree access method
+ * used in the Postgres database system at UC Berkeley.  This implementation
+ * is wholly independent of the Postgres code.
+ */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "common_ext.h"
+
+static int __bam_keyalloc __P((BTREE *));
+static int __bam_setmeta __P((DB *, BTREE *));
+
+/*
+ * __bam_open --
+ *	Open a btree.
+ *
+ *	Allocates the per-handle BTREE structure, validates and copies the
+ *	caller's DB_INFO settings (or installs defaults when "dbinfo" is
+ *	NULL), fills in the access method entry points of "dbp" and makes
+ *	sure the metadata and root pages exist.
+ *
+ *	Returns 0 on success, ENOMEM if memory cannot be allocated, EINVAL
+ *	for an unusable bt_minkey/bt_maxkey, or the error from
+ *	__bam_setmeta.  On failure the BTREE structure is freed again and
+ *	dbp->internal no longer refers to it.
+ *
+ * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *));
+ */
+int
+__bam_open(dbp, type, dbinfo)
+	DB *dbp;
+	DBTYPE type;
+	DB_INFO *dbinfo;
+{
+	BTREE *t;
+	int ret;
+
+	/* Allocate the btree internal structure. */
+	if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL)
+		return (ENOMEM);
+
+	/* The page stack starts out empty, using the built-in array. */
+	t->bt_sp = t->bt_csp = t->bt_stack;
+	t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]);
+
+	if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) &&
+	    (ret = __bam_keyalloc(t)) != 0)
+		goto err;
+
+	/*
+	 * Intention is to make sure all of the user's selections are okay
+	 * here and then use them without checking.
+	 */
+	if (dbinfo != NULL) {
+		/* Minimum number of keys per page. */
+		if (dbinfo->bt_minkey == 0)
+			t->bt_minkey = DEFMINKEYPAGE;
+		else {
+			if (dbinfo->bt_minkey < 2)
+				goto einval;
+			t->bt_minkey = dbinfo->bt_minkey;
+		}
+
+		/* Maximum number of keys per page. */
+		if (dbinfo->bt_maxkey == 0)
+			t->bt_maxkey = 0;
+		else {
+			if (dbinfo->bt_maxkey < 1)
+				goto einval;
+			t->bt_maxkey = dbinfo->bt_maxkey;
+		}
+
+		/*
+		 * If no comparison, use default comparison.  If no comparison
+		 * and no prefix, use default prefix.  (We can't default the
+		 * prefix if the user supplies a comparison routine; shortening
+		 * the keys may break their comparison algorithm.)
+		 */
+		t->bt_compare = dbinfo->bt_compare == NULL ?
+		    __bam_defcmp : dbinfo->bt_compare;
+		t->bt_prefix = dbinfo->bt_prefix == NULL ?
+		    (dbinfo->bt_compare == NULL ?
+		    __bam_defpfx : NULL) : dbinfo->bt_prefix;
+	} else {
+		t->bt_minkey = DEFMINKEYPAGE;
+		t->bt_compare = __bam_defcmp;
+		t->bt_prefix = __bam_defpfx;
+	}
+
+	/* Initialize the remaining fields of the DB. */
+	dbp->type = type;
+	dbp->internal = t;
+	dbp->cursor = __bam_cursor;
+	dbp->del = __bam_delete;
+	dbp->get = __bam_get;
+	dbp->put = __bam_put;
+	dbp->stat = __bam_stat;
+	dbp->sync = __bam_sync;
+
+	/*
+	 * The btree data structure requires that at least two key/data pairs
+	 * can fit on a page, but other than that there's no fixed requirement.
+	 * Translate the minimum number of items into the bytes a key/data pair
+	 * can use before being placed on an overflow page.  We calculate for
+	 * the worst possible alignment by assuming every item requires the
+	 * maximum alignment for padding.
+	 *
+	 * Recno uses the btree bt_ovflsize value -- it's close enough.
+	 */
+	t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX)
+	    - (BKEYDATA_PSIZE(0) + ALIGN(1, 4));
+
+	/* Create a root page if new tree. */
+	if ((ret = __bam_setmeta(dbp, t)) != 0)
+		goto err;
+
+	return (0);
+
+einval:	ret = EINVAL;
+
+err:	if (t != NULL) {
+		/* If we allocated room for key/data return, discard it. */
+		if (t->bt_rkey.data != NULL)
+			free(t->bt_rkey.data);
+
+		/* Don't leave the handle pointing at memory we free. */
+		if (dbp->internal == t)
+			dbp->internal = NULL;
+
+		FREE(t, sizeof(BTREE));
+	}
+	return (ret);
+}
+
+/*
+ * __bam_bdup --
+ *	Create a BTREE handle for a threaded DB handle.
+ *
+ *	Allocates a fresh BTREE for "new" with its own empty page stack (and
+ *	its own recno key buffer when needed) and copies the tunables and
+ *	comparison functions from the BTREE of "orig".  The RECNO structure
+ *	is shared, not copied.
+ *
+ *	Returns 0 on success, ENOMEM if memory cannot be allocated.
+ *
+ * PUBLIC: int __bam_bdup __P((DB *, DB *));
+ */
+int
+__bam_bdup(orig, new)
+	DB *orig, *new;
+{
+	BTREE *t, *ot;
+	int ret;
+
+	ot = orig->internal;
+
+	if ((t = (BTREE *)calloc(1, sizeof(*t))) == NULL)
+		return (ENOMEM);
+
+	/*
+	 * !!!
+	 * Ignore the cursor queue, only the first DB has attached cursors.
+	 */
+
+	t->bt_sp = t->bt_csp = t->bt_stack;
+	t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]);
+
+	if ((orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) &&
+	    (ret = __bam_keyalloc(t)) != 0) {
+		FREE(t, sizeof(*t));
+		return (ret);
+	}
+
+	t->bt_maxkey = ot->bt_maxkey;
+	t->bt_minkey = ot->bt_minkey;
+	t->bt_compare = ot->bt_compare;
+	t->bt_prefix = ot->bt_prefix;
+	t->bt_ovflsize = ot->bt_ovflsize;
+
+	/*
+	 * !!!
+	 * The entire RECNO structure is shared.  If it breaks, the application
+	 * was misusing it to start with.
+	 */
+	t->bt_recno = ot->bt_recno;
+
+	new->internal = t;
+
+	return (0);
+}
+
+/*
+ * __bam_keyalloc --
+ *	Allocate return memory for recno keys.
+ *
+ *	Returns 0 on success, ENOMEM if the buffer cannot be allocated.
+ */
+static int
+__bam_keyalloc(t)
+	BTREE *t;
+{
+	/*
+	 * Recno keys are always the same size, and we don't want to have
+	 * to check for space on each return.  Allocate it now.
+	 */
+	if ((t->bt_rkey.data = (void *)malloc(sizeof(db_recno_t))) == NULL)
+		return (ENOMEM);
+	t->bt_rkey.ulen = sizeof(db_recno_t);
+	return (0);
+}
+
+/*
+ * __bam_setmeta --
+ *	Check (and optionally create) a tree.
+ *
+ *	If the metadata page already carries a magic number, the key limits
+ *	stored there are copied into "t" and nothing is written.  Otherwise
+ *	the metadata page and an empty root leaf page are initialized and
+ *	flushed to disk.
+ *
+ *	Every page and lock acquired here is released again before return,
+ *	on the error paths as well.
+ *
+ *	Returns 0 on success, EINVAL if the flush reports DB_INCOMPLETE,
+ *	otherwise the first non-zero value returned by a helper.
+ */
+static int
+__bam_setmeta(dbp, t)
+	DB *dbp;
+	BTREE *t;
+{
+	BTMETA *meta;
+	PAGE *root;
+	DB_LOCK mlock, rlock;
+	db_pgno_t pgno;
+	int have_rlock, ret;
+
+	/* Track what is held so the common exit can release it. */
+	root = NULL;
+	have_rlock = 0;
+
+	/* Get, and optionally create the metadata page. */
+	pgno = PGNO_METADATA;
+	if ((ret =
+	    __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &mlock)) != 0)
+		return (ret);
+	if ((ret =
+	    __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) {
+		(void)__BT_LPUT(dbp, mlock);
+		return (ret);
+	}
+
+	/*
+	 * If the magic number is correct, we're not creating the tree.
+	 * Correct any fields that may not be right.  Note, all of the
+	 * local flags were set by db_open(3).
+	 */
+	if (meta->magic != 0) {
+		t->bt_maxkey = meta->maxkey;
+		t->bt_minkey = meta->minkey;
+
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+		(void)__BT_LPUT(dbp, mlock);
+		return (0);
+	}
+
+	/* Initialize the tree structure metadata information. */
+	ZERO_LSN(meta->lsn);
+	meta->pgno = PGNO_METADATA;
+	meta->magic = DB_BTREEMAGIC;
+	meta->version = DB_BTREEVERSION;
+	meta->pagesize = dbp->pgsize;
+	meta->maxkey = t->bt_maxkey;
+	meta->minkey = t->bt_minkey;
+	meta->free = PGNO_INVALID;
+	meta->flags = 0;
+	if (dbp->type == DB_RECNO)
+		F_SET(meta, BTM_RECNO);
+	if (F_ISSET(dbp, DB_AM_DUP))
+		F_SET(meta, BTM_DUP);
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+		F_SET(meta, BTM_FIXEDLEN);
+	if (F_ISSET(dbp, DB_BT_RECNUM))
+		F_SET(meta, BTM_RECNUM);
+	if (F_ISSET(dbp, DB_RE_RENUMBER))
+		F_SET(meta, BTM_RENUMBER);
+	meta->re_len = 0;
+	meta->re_pad = 0;
+	memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN);
+
+	/* Create and initialize a root page. */
+	pgno = PGNO_ROOT;
+	if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rlock)) != 0)
+		goto err;
+	have_rlock = 1;
+	if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) {
+		/* Nothing was pinned; make sure the exit code agrees. */
+		root = NULL;
+		goto err;
+	}
+	P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID,
+	    PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+	ZERO_LSN(root->lsn);
+
+	/*
+	 * Release the metadata and root pages.  A page is considered handed
+	 * back once memp_fput has been called on it, whatever the result,
+	 * so it is never put a second time by the exit code.
+	 */
+	ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	meta = NULL;
+	if (ret != 0)
+		goto err;
+	ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY);
+	root = NULL;
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * Flush the metadata and root pages to disk -- since the user can't
+	 * transaction protect open, the pages have to exist during recovery.
+	 *
+	 * XXX
+	 * It's not useful to return not-yet-flushed here -- convert it to
+	 * an error.
+	 */
+	if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE)
+		ret = EINVAL;
+
+err:	/* Give back any page still pinned (error paths only). */
+	if (root != NULL)
+		(void)memp_fput(dbp->mpf, root, 0);
+	if (meta != NULL)
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+
+	/* Release the locks. */
+	(void)__BT_LPUT(dbp, mlock);
+	if (have_rlock)
+		(void)__BT_LPUT(dbp, rlock);
+
+	return (ret);
+}
diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c
new file mode 100644
index 0000000000..7ee74ffcf8
--- /dev/null
+++ b/db2/btree/bt_page.c
@@ -0,0 +1,312 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_page.c 10.5 (Sleepycat) 8/18/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_new -- + * Get a new page, preferably from the freelist. 
+ *
+ *	The metadata page is write locked and consulted: if its free list is
+ *	empty a brand new page is requested from the pool, otherwise the
+ *	head of the free list is taken.  The page is initialized to "type"
+ *	and returned, pinned, through "pagepp".
+ *
+ *	Returns 0 on success, otherwise the first non-zero value returned by
+ *	a helper; nothing stays pinned or locked in that case.
+ *
+ * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **));
+ */
+int
+__bam_new(dbp, type, pagepp)
+	DB *dbp;
+	u_int32_t type;
+	PAGE **pagepp;
+{
+	BTMETA *meta;
+	DB_LOCK mlock;
+	PAGE *h;
+	db_pgno_t pgno;
+	int ret;
+
+	/* Sentinels tell the error exit what has been acquired so far. */
+	meta = NULL;
+	h = NULL;
+	mlock = LOCK_INVALID;
+
+	pgno = PGNO_METADATA;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0)
+		goto err;
+
+	if (meta->free == PGNO_INVALID) {
+		/* Free list is empty: have the pool create a page. */
+		if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0)
+			goto err;
+		ZERO_LSN(h->lsn);
+		h->pgno = pgno;
+	} else {
+		/* Unlink the first page of the free list. */
+		pgno = meta->free;
+		if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0)
+			goto err;
+		meta->free = h->next_pgno;
+	}
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp)) {
+		if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn,
+		    &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn,
+		    h->pgno, (u_int32_t)type, meta->free)) != 0)
+			goto err;
+		LSN(h) = LSN(meta);
+	}
+
+	(void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	(void)__BT_TLPUT(dbp, mlock);
+
+	P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+	*pagepp = h;
+	return (0);
+
+err:	if (h != NULL)
+		(void)memp_fput(dbp->mpf, h, 0);
+	if (meta != NULL)
+		(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+	if (mlock != LOCK_INVALID)
+		(void)__BT_TLPUT(dbp, mlock);
+	return (ret);
+}
+
+/*
+ * __bam_free --
+ *	Add a page to the head of the freelist.
+ *
+ *	The caller's reference to "h" is always given back to the pool by
+ *	this routine, on success and on every failure path, so the caller
+ *	must not touch or put the page afterwards.  Any lock the caller
+ *	holds on the page stays with the caller.
+ *
+ *	Returns 0 on success, otherwise the first error encountered.
+ *
+ * PUBLIC: int __bam_free __P((DB *, PAGE *));
+ */
+int
+__bam_free(dbp, h)
+	DB *dbp;
+	PAGE *h;
+{
+	BTMETA *meta;
+	DBT ldbt;
+	DB_LOCK mlock;
+	db_pgno_t pgno;
+	int is_dirty, ret, t_ret;
+
+	/*
+	 * Retrieve the metadata page and insert the page at the head of
+	 * the free list.  If either the lock get or page get routines
+	 * fail, then we need to put the page with which we were called
+	 * back because our caller assumes we take care of it.
+	 */
+	is_dirty = 0;
+	pgno = PGNO_METADATA;
+	if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0)
+		goto err;
+	if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) {
+		(void)__BT_TLPUT(dbp, mlock);
+		goto err;
+	}
+
+	/* Log the change. */
+	if (DB_LOGGING(dbp)) {
+		memset(&ldbt, 0, sizeof(ldbt));
+		ldbt.data = h;
+		ldbt.size = P_OVERHEAD;
+		if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info,
+		    dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno,
+		    &meta->lsn, &ldbt, meta->free)) != 0) {
+			(void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+			(void)__BT_TLPUT(dbp, mlock);
+			/*
+			 * The caller's page still has to be given back, the
+			 * same as for the failures above.
+			 */
+			goto err;
+		}
+		LSN(h) = LSN(meta);
+	}
+
+	/*
+	 * The page should have nothing interesting on it, re-initialize it,
+	 * leaving only the page number and the LSN.
+	 */
+#ifdef DEBUG
+	{ db_pgno_t __pgno; DB_LSN __lsn;
+		__pgno = h->pgno;
+		__lsn = h->lsn;
+		memset(h, 0xff, dbp->pgsize);
+		h->pgno = __pgno;
+		h->lsn = __lsn;
+	}
+#endif
+	P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, meta->free, 0, P_INVALID);
+
+	/* Link the page on the metadata free list. */
+	meta->free = h->pgno;
+
+	/* Discard the metadata page; keep the first error that occurs. */
+	ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+	if ((t_ret = __BT_TLPUT(dbp, mlock)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the caller's page reference. */
+	is_dirty = DB_MPOOL_DIRTY;
+err:	if ((t_ret = memp_fput(dbp->mpf, h, is_dirty)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * XXX
+	 * We have to unlock the caller's page in the caller!
+	 */
+	return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __bam_lt --
+ *	Print out the list of currently held locks.
+ *
+ *	Does nothing unless locking is enabled for the handle.  Always
+ *	returns 0.
+ */
+int
+__bam_lt(dbp)
+	DB *dbp;
+{
+	DB_LOCKREQ req;
+
+	if (F_ISSET(dbp, DB_AM_LOCKING)) {
+		req.op = DB_LOCK_DUMP;
+		lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &req, 1, NULL);
+	}
+	return (0);
+}
+#endif
+
+/*
+ * __bam_lget --
+ *	The standard lock get call.
+ *
+ *	Acquires a lock of the given "mode" on page "pgno".  When
+ *	"do_couple" is non-zero the lock passed in through "lockp" is
+ *	released in the same request that acquires the new one, and "lockp"
+ *	is updated to the new lock.  When locking is not configured for the
+ *	handle the call succeeds without doing anything.
+ *
+ *	Returns 0 on success.  A negative return from the lock subsystem is
+ *	reported as EAGAIN, a positive one is passed through unchanged.
+ *
+ * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *));
+ */
+int
+__bam_lget(dbp, do_couple, pgno, mode, lockp)
+	DB *dbp;
+	int do_couple;
+	db_pgno_t pgno;
+	db_lockmode_t mode;
+	DB_LOCK *lockp;
+{
+	DB_LOCKREQ reqs[2];
+	u_int32_t id;
+	int ret;
+
+	/* Nothing to do for handles that don't lock. */
+	if (!F_ISSET(dbp, DB_AM_LOCKING))
+		return (0);
+
+	/* Lock on behalf of the transaction if there is one. */
+	if (dbp->txn == NULL)
+		id = dbp->locker;
+	else
+		id = dbp->txn->txnid;
+	dbp->lock.pgno = pgno;
+
+	/*
+	 * If the object is not currently locked, acquire the lock and
+	 * return, otherwise, lock couple.  If we fail and it's not a system
+	 * error, convert to EAGAIN.
+	 */
+	if (!do_couple) {
+		ret = lock_get(dbp->dbenv->lk_info,
+		    id, 0, &dbp->lock_dbt, mode, lockp);
+		if (ret < 0)
+			return (EAGAIN);
+		return (ret);
+	}
+
+	/* One request to get the new lock, one to put the old one. */
+	reqs[0].op = DB_LOCK_GET;
+	reqs[0].obj = &dbp->lock_dbt;
+	reqs[0].mode = mode;
+	reqs[1].op = DB_LOCK_PUT;
+	reqs[1].lock = *lockp;
+
+	ret = lock_vec(dbp->dbenv->lk_info, id, 0, reqs, 2, NULL);
+	if (ret == 0) {
+		*lockp = reqs[0].lock;
+		return (0);
+	}
+
+	/* If we fail, discard the lock we held. */
+	__bam_lput(dbp, *lockp);
+
+	if (ret < 0)
+		return (EAGAIN);
+	return (ret);
+}
+
+/*
+ * __bam_lput --
+ *	The standard lock put call.
+ *
+ *	Function form of the __BT_LPUT macro; returns whatever it yields.
+ *
+ * PUBLIC: int __bam_lput __P((DB *, DB_LOCK));
+ */
+int
+__bam_lput(dbp, lock)
+	DB *dbp;
+	DB_LOCK lock;
+{
+	return (__BT_LPUT(dbp, lock));
+}
+
+/*
+ * __bam_pget --
+ *	The standard page get call.
+ *
+ *	Fetches page "*pgnop" from the memory pool into "*hp" using the
+ *	given pool flags.  Returns 0 on success, otherwise the value of
+ *	__db_pgerr for that page number.
+ *
+ * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, int));
+ */
+int
+__bam_pget(dbp, hp, pgnop, mflags)
+	DB *dbp;
+	PAGE **hp;
+	db_pgno_t *pgnop;
+	int mflags;
+{
+	if (memp_fget((dbp)->mpf, pgnop, mflags, hp) == 0)
+		return (0);
+	return (__db_pgerr(dbp, *pgnop));
+}
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
new file mode 100644
index 0000000000..632c3d185b
--- /dev/null
+++ b/db2/btree/bt_put.c
@@ -0,0 +1,919 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_put.c 10.23 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_fixed __P((BTREE *, DBT *)); +static int __bam_lookup __P((DB *, DBT *, int *)); +static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); +static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t)); + +/* + * __bam_put -- + * Add a new key/data pair or replace an existing pair (btree). + * + * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); + */ +int +__bam_put(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR c; + DB *dbp; + PAGE *h; + db_indx_t indx; + int exact, iflags, newkey, replace, ret, stack; + + DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags); + + /* Check flags. */ + if ((ret = __db_putchk(argdbp, key, data, flags, + F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + +retry: /* + * Find the location at which to insert. The call to bt_lookup() + * leaves the returned page pinned. 
+ */ + if ((ret = __bam_lookup(dbp, key, &exact)) != 0) { + PUTHANDLE(dbp); + return (ret); + } + h = t->bt_csp->page; + indx = t->bt_csp->indx; + stack = 1; + + /* + * If an identical key is already in the tree, and DB_NOOVERWRITE is + * set, an error is returned. If an identical key is already in the + * tree and DB_NOOVERWRITE is not set, the key is either added (when + * duplicates are permitted) or an error is returned. The exception + * is when the item located is referenced by a cursor and marked for + * deletion, in which case we permit the overwrite and flag the cursor. + */ + replace = 0; + if (exact && flags == DB_NOOVERWRITE) { + if (!GET_BKEYDATA(h, indx + O_INDX)->deleted) { + ret = DB_KEYEXIST; + goto err; + } + replace = 1; + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); + } + + /* + * If we're inserting into the first or last page of the tree, + * remember where we did it so we can do fast lookup next time. + * + * XXX + * Does reverse order still work (did it ever!?!?) + */ + t->bt_lpgno = + h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ? + h->pgno : PGNO_INVALID; + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, we're either adding a new duplicate at the + * end of the duplicate set, or we're replacing the data item with a + * new data item. If the key isn't an exact match, we're inserting + * a new key/data pair, before the search location. + */ + newkey = dbp->type == DB_BTREE && !exact; + if (exact) { + if (F_ISSET(dbp, DB_AM_DUP)) { + /* + * Make sure that we're not looking at a page of + * duplicates -- if so, move to the last entry on + * that page. 
+ */ + c.page = h; + c.pgno = h->pgno; + c.indx = indx; + c.dpgno = PGNO_INVALID; + c.dindx = 0; + if ((ret = + __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0) + goto err; + if (c.dpgno != PGNO_INVALID) { + /* + * XXX + * The __bam_ovfl_chk() routine memp_fput() the + * current page and acquired a new one, but did + * not do anything about the lock we're holding. + */ + t->bt_csp->page = h = c.page; + indx = c.dindx; + } + iflags = DB_AFTER; + } else + iflags = DB_CURRENT; + } else + iflags = DB_BEFORE; + + /* + * The pages we're using may be modified by __bam_iitem(), so make + * sure we reset the stack. + */ + ret = __bam_iitem(dbp, + &h, &indx, key, data, iflags, newkey ? BI_NEWKEY : 0); + t->bt_csp->page = h; + t->bt_csp->indx = indx; + + switch (ret) { + case 0: + /* + * Done. Clean up the cursor, and, if we're doing record + * numbers, adjust the internal page counts. + */ + if (replace) + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); + + if (!replace && F_ISSET(dbp, DB_BT_RECNUM)) + ret = __bam_adjust(dbp, t, 1); + break; + case DB_NEEDSPLIT: + /* + * We have to split the page. Back out the cursor setup, + * discard the stack of pages, and do the split. + */ + if (replace) { + replace = 0; + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); + } + + (void)__bam_stkrel(dbp); + stack = 0; + + if ((ret = __bam_split(dbp, key)) != 0) + break; + + goto retry; + /* NOTREACHED */ + default: + if (replace) + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); + break; + } + +err: if (stack) + (void)__bam_stkrel(dbp); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_lookup -- + * Find the right location in the tree for the key. + */ +static int +__bam_lookup(dbp, key, exactp) + DB *dbp; + DBT *key; + int *exactp; +{ + BTREE *t; + DB_LOCK lock; + EPG e; + PAGE *h; + db_indx_t indx; + int cmp, ret; + + t = dbp->internal; + h = NULL; + + /* + * Record numbers can't be fast-tracked, we have to lock the entire + * tree. 
+ */ + if (F_ISSET(dbp, DB_BT_RECNUM)) + goto slow; + + /* Check to see if we've been seeing sorted input. */ + if (t->bt_lpgno == PGNO_INVALID) + goto slow; + + /* + * Retrieve the page on which we did the last insert. It's okay if + * it doesn't exist, or if it's not the page type we expect, it just + * means that the world changed. + */ + if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) + goto miss; + if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) { + (void)__BT_LPUT(dbp, lock); + goto miss; + } + if (TYPE(h) != P_LBTREE) + goto miss; + if (NUM_ENT(h) == 0) + goto miss; + + /* + * We have to be at the end or beginning of the tree to know that + * we're inserting in a sort order. If that's the case and we're + * in the right order in comparison to the first/last key/data pair, + * we have the right position. + */ + if (h->next_pgno == PGNO_INVALID) { + e.page = h; + e.indx = NUM_ENT(h) - P_INDX; + if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) { + if (cmp > 0) + e.indx += P_INDX; + goto fast; + } + } + if (h->prev_pgno == PGNO_INVALID) { + e.page = h; + e.indx = 0; + if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) { + /* + * We're doing a put, so we want to insert as the last + * of any set of duplicates. + */ + if (cmp == 0) { + for (indx = 0; + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]; + indx += P_INDX); + e.indx = indx; + } + goto fast; + } + } + goto miss; + + /* Set the exact match flag in case we've already inserted this key. */ +fast: *exactp = cmp == 0; + + /* Enter the entry in the stack. */ + BT_STK_CLR(t); + BT_STK_ENTER(t, e.page, e.indx, lock, ret); + if (ret != 0) + return (ret); + + ++t->lstat.bt_cache_hit; + return (0); + +miss: ++t->lstat.bt_cache_miss; + if (h != NULL) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + } + +slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp)); +} + +/* + * OVPUT -- + * Copy an overflow item onto a page. 
+ */ +/* + * NOTE: OVPUT is not self-contained. It assigns to `ret', reads `dbp', and + * on failure jumps to an `err:' label, all of which must exist in the + * function that expands it. Going through `err' (rather than returning + * directly) lets the caller run its cleanup for anything it acquired + * before the insert failed. + */ +#undef OVPUT +#define OVPUT(h, indx, bo) do { \ + DBT __hdr; \ + memset(&__hdr, 0, sizeof(__hdr)); \ + __hdr.data = &bo; \ + __hdr.size = BOVERFLOW_SIZE; \ + if ((ret = __db_pitem(dbp, \ + h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ + goto err; \ +} while (0) + +/* + * __bam_iitem -- + * Insert an item into the tree. + * + * hp and indxp are in/out: the page and index are read from them on entry, + * and the duplicate-page path may update both. + * + * PUBLIC: int __bam_iitem __P((DB *, + * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, int, int)); + */ +int +__bam_iitem(dbp, hp, indxp, key, data, op, flags) + DB *dbp; + PAGE **hp; + db_indx_t *indxp; + DBT *key, *data; + int op, flags; +{ + BTREE *t; + BKEYDATA *bk; + BOVERFLOW kbo, dbo; + DBT tdbt; + PAGE *h; + db_indx_t indx; + u_int32_t have_bytes, need_bytes, needed; + int bigkey, bigdata, dcopy, dupadjust, ret; + + t = dbp->internal; + h = *hp; + indx = *indxp; + + dupadjust = 0; + bk = NULL; /* XXX: Shut the compiler up. */ + + /* + * If it's a page of duplicates, call the common code to do the work. + * + * !!! + * Here's where the hp and indxp are important. The duplicate code + * may decide to rework/rearrange the pages and indices we're using, + * so the caller must understand that the stack has to change. + */ + if (TYPE(h) == P_DUPLICATE) { + /* Adjust the index for the new item if it's a DB_AFTER op. */ + if (op == DB_AFTER) + ++*indxp; + + /* Remove the current item if it's a DB_CURRENT op. */ + if (op == DB_CURRENT && (ret = __db_ditem(dbp, *hp, *indxp, + BKEYDATA_SIZE(GET_BKEYDATA(*hp, *indxp)->len))) != 0) + return (ret); + + /* Put the new/replacement item onto the page. */ + return (__db_dput(dbp, data, hp, indxp, __bam_new)); + } + + /* + * XXX + * Handle partial puts. + * + * This is truly awful from a performance standpoint. We don't optimize + * for partial puts at all, we delete the record and add it back in, + * regardless of size or if we're simply overwriting current data. + * The hash access method does this a lot better than we do, and we're + * eventually going to have to fix it.
+ */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + tdbt = *data; + if ((ret = __bam_partial(dbp, &tdbt, h, indx)) != 0) + return (ret); + data = &tdbt; + } + + /* If it's a short fixed-length record, fix it up. */ + if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) { + tdbt = *data; + if ((ret = __bam_fixed(t, &tdbt)) != 0) + return (ret); + data = &tdbt; + } + + /* + * If the key or data item won't fit on a page, store it in the + * overflow pages. + * + * !!! + * From this point on, we have to recover the allocated overflow + * pages on error. + */ + bigkey = bigdata = 0; + if (LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize) { + kbo.deleted = 0; + kbo.type = B_OVERFLOW; + kbo.tlen = key->size; + if ((ret = __db_poff(dbp, key, &kbo.pgno, __bam_new)) != 0) + goto err; + bigkey = 1; + } + if (data->size > t->bt_ovflsize) { + dbo.deleted = 0; + dbo.type = B_OVERFLOW; + dbo.tlen = data->size; + if ((ret = __db_poff(dbp, data, &dbo.pgno, __bam_new)) != 0) + goto err; + bigdata = 1; + } + + dcopy = 0; + needed = 0; + if (LF_ISSET(BI_NEWKEY)) { + /* If BI_NEWKEY is set we're adding a new key and data pair. */ + if (bigkey) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(key->size); + if (bigdata) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(data->size); + } else { + /* + * We're either overwriting the data item of a key/data pair + * or we're adding the data item only, i.e. a new duplicate. + */ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(h, + indx + (TYPE(h) == P_LBTREE ? 
O_INDX : 0)); + if (bk->type == B_OVERFLOW) + have_bytes = BOVERFLOW_PSIZE; + else + have_bytes = BKEYDATA_PSIZE(bk->len); + need_bytes = 0; + } else { + have_bytes = 0; + need_bytes = sizeof(db_indx_t); + } + if (bigdata) + need_bytes += BOVERFLOW_PSIZE; + else + need_bytes += BKEYDATA_PSIZE(data->size); + + /* + * If we're overwriting a data item, we copy it if it's not a + * special record type and it's the same size (including any + * alignment) and do a delete/insert otherwise. + */ + if (op == DB_CURRENT && !bigdata && + bk->type == B_KEYDATA && have_bytes == need_bytes) + dcopy = 1; + if (have_bytes < need_bytes) + needed += need_bytes - have_bytes; + } + + /* + * If there's not enough room, or the user has put a ceiling on the + * number of keys permitted in the page, split the page. + * + * XXX + * The t->bt_maxkey test here may be insufficient -- do we have to + * check in the btree split code, so we don't undo it there!?!? + */ + if (P_FREESPACE(h) < needed || + (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) { + ret = DB_NEEDSPLIT; + goto err; + } + + /* + * The code breaks it up into six cases: + * + * 1. Append a new key/data pair. + * 2. Insert a new key/data pair. + * 3. Copy the data item. + * 4. Delete/insert the data item. + * 5. Append a new data item. + * 6. Insert a new data item. + */ + if (LF_ISSET(BI_NEWKEY)) { + switch (op) { + case DB_AFTER: /* 1. Append a new key/data pair. */ + indx += 2; + *indxp += 2; + break; + case DB_BEFORE: /* 2. Insert a new key/data pair. */ + break; + default: + abort(); + } + + /* Add the key. */ + if (bigkey) + OVPUT(h, indx, kbo); + else { + DBT __data; + memset(&__data, 0, sizeof(__data)); + __data.data = key->data; + __data.size = key->size; + if ((ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(key->size), NULL, &__data)) != 0) + goto err; + } + ++indx; + } else { + switch (op) { + case DB_CURRENT: /* 3. Copy the data item. 
*/ + /* + * If we're not logging and it's possible, overwrite + * the current item. + * + * XXX + * We should add a separate logging message so that + * we can do this anytime it's possible, including + * for partial record puts. + */ + if (dcopy && !DB_LOGGING(dbp)) { + bk->len = data->size; + memcpy(bk->data, data->data, data->size); + goto done; + } + /* 4. Delete/insert the data item. */ + if (TYPE(h) == P_LBTREE) + ++indx; + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + break; + case DB_AFTER: /* 5. Append a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* + * Adjust the cursor and copy in the key for + * the duplicate. + */ + if ((ret = __bam_adjindx(dbp, + h, indx + P_INDX, indx, 1)) != 0) + goto err; + + indx += 3; + dupadjust = 1; + + *indxp += 2; + } else { + ++indx; + __bam_ca_di(dbp, h->pgno, indx, 1); + + *indxp += 1; + } + break; + case DB_BEFORE: /* 6. Insert a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* + * Adjust the cursor and copy in the key for + * the duplicate. + */ + if ((ret = + __bam_adjindx(dbp, h, indx, indx, 1)) != 0) + goto err; + + ++indx; + dupadjust = 1; + } else + __bam_ca_di(dbp, h->pgno, indx, 1); + break; + default: + abort(); + } + } + + /* Add the data. */ + if (bigdata) + OVPUT(h, indx, dbo); + else { + BKEYDATA __bk; + DBT __hdr, __data; + memset(&__data, 0, sizeof(__data)); + __data.data = data->data; + __data.size = data->size; + + if (LF_ISSET(BI_DELETED)) { + __bk.len = __data.size; + __bk.deleted = 1; + __bk.type = B_KEYDATA; + __hdr.data = &__bk; + __hdr.size = SSZA(BKEYDATA, data); + ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(__data.size), &__hdr, &__data); + } else + ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(data->size), NULL, &__data); + if (ret != 0) + goto err; + } + +done: ++t->lstat.bt_added; + + ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); + + /* + * If the page is at least 50% full, and we added a duplicate, see if + * that set of duplicates takes up at least 25% of the space. 
If it + * does, move it off onto its own page. + */ + if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { + --indx; + if ((ret = __bam_ndup(dbp, h, indx)) != 0) + goto err; + } + + if (t->bt_recno != NULL) + F_SET(t->bt_recno, RECNO_MODIFIED); + + if (0) { +err: if (bigkey) + (void)__db_doff(dbp, kbo.pgno, __bam_free); + if (bigdata) + (void)__db_doff(dbp, dbo.pgno, __bam_free); + } + return (ret); +} + +/* + * __bam_ndup -- + * Check to see if the duplicate set at indx should have its own page. + * If it should, create it. + */ +static int +__bam_ndup(dbp, h, indx) + DB *dbp; + PAGE *h; + u_int32_t indx; +{ + BKEYDATA *bk; + BOVERFLOW bo; + DBT hdr; + PAGE *cp; + db_indx_t cnt, cpindx, first, sz; + int ret; + + while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) { + if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx]) + break; + bk = GET_BKEYDATA(h, indx); + sz += bk->type == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + bk = GET_BKEYDATA(h, indx + O_INDX); + sz += bk->type == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + } + + /* + * If this set of duplicates is using more than 25% of the page, move + * them off. The choice of 25% is a WAG, but it has to be small enough + * that we can always split regardless of the presence of duplicates. + */ + if (sz < dbp->pgsize / 4) + return (0); + + /* Get a new page. */ + if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0) + return (ret); + + /* + * Move this set of duplicates off the page. First points to the first + * key of the first duplicate key/data pair, cnt is the number of pairs + * we're dealing with. + */ + memset(&hdr, 0, sizeof(hdr)); + for (indx = first + O_INDX, cpindx = 0;; ++cpindx) { + /* Copy the entry to the new page. */ + bk = GET_BKEYDATA(h, indx); + hdr.data = bk; + hdr.size = bk->type == B_KEYDATA ? 
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; + if ((ret = + __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0) + goto err; + + /* + * Move cursors referencing the old entry to the new entry. + * Done after the page put because __db_pitem() adjusts + * cursors on the new page, and before the delete because + * __db_ditem adjusts cursors on the old page. + */ + __bam_ca_dup(dbp, + PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx); + + /* Delete the data item. */ + if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0) + goto err; + + /* Delete all but the first reference to the key. */ + if (--cnt == 0) + break; + if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0) + goto err; + } + + /* Put in a new data item that points to the duplicates page. */ + bo.deleted = 0; + bo.type = B_DUPLICATE; + bo.pgno = cp->pgno; + bo.tlen = 0; + + OVPUT(h, indx, bo); + + return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY)); + +err: (void)__bam_free(dbp, cp); + return (ret); +} + +/* + * __bam_fixed -- + * Build the real record for a fixed length put. + * + * Returns 0 on success, EINVAL when the supplied record is longer than + * the fixed record length, and ENOMEM when the scratch buffer cannot be + * grown. + */ +static int +__bam_fixed(t, dbt) + BTREE *t; + DBT *dbt; +{ + RECNO *rp; + void *newbuf; + int aliased; + + rp = t->bt_recno; + + /* + * If using fixed-length records, and the record is long, return + * EINVAL. If it's short, pad it out. Use the record data return + * memory, it's only short-term. + */ + if (dbt->size > rp->re_len) + return (EINVAL); + + /* + * The caller may pass in a DBT that already points at our scratch + * buffer (__bam_partial() hands one back). Remember that before the + * buffer can move, so we never copy from a stale pointer or copy a + * region onto itself. + */ + aliased = dbt->data == t->bt_rdata.data; + + if (t->bt_rdata.ulen < rp->re_len) { + /* + * Grow through a temporary: if realloc() fails the old buffer + * is still owned by bt_rdata and is not leaked. + */ + newbuf = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (newbuf == NULL) + return (ENOMEM); + t->bt_rdata.data = newbuf; + t->bt_rdata.ulen = rp->re_len; + } + if (!aliased) + memcpy(t->bt_rdata.data, dbt->data, dbt->size); + memset((u_int8_t *)t->bt_rdata.data + dbt->size, + rp->re_pad, rp->re_len - dbt->size); + + /* Set the DBT to reference our new record.
*/ + t->bt_rdata.size = rp->re_len; + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + *dbt = t->bt_rdata; + return (0); +} + +/* + * __bam_partial -- + * Build the real record for a partial put. + */ +static int +__bam_partial(dbp, dbt, h, indx) + DB *dbp; + DBT *dbt; + PAGE *h; + u_int32_t indx; +{ + BTREE *t; + BKEYDATA *bk, tbk; + BOVERFLOW *bo; + DBT copy; + u_int32_t len, nbytes, tlen; + int ret; + u_int8_t *p; + + bo = NULL; /* XXX: Shut the compiler up. */ + t = dbp->internal; + + /* + * Figure out how much total space we'll need. Worst case is where + * the record is 0 bytes long, in which case doff causes the record + * to extend, and the put data is appended to it. + */ + if (indx < NUM_ENT(h)) { + bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (bk->type == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + nbytes = bo->tlen; + } else + nbytes = bk->len; + } else { + bk = &tbk; + bk->type = B_KEYDATA; + nbytes = bk->len = 0; + } + nbytes += dbt->doff + dbt->size + dbt->dlen; + + /* Allocate the space. */ + if (t->bt_rdata.ulen < nbytes) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(nbytes) : + (void *)realloc(t->bt_rdata.data, nbytes); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = nbytes; + } + + /* We use nul bytes for extending the record, get it over with. */ + memset(t->bt_rdata.data, 0, nbytes); + + tlen = 0; + if (bk->type == B_OVERFLOW) { + /* Take up to doff bytes from the record. */ + memset(©, 0, sizeof(copy)); + if ((ret = __db_goff(dbp, ©, bo->tlen, + bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + return (ret); + tlen += dbt->doff; + + /* + * If the original record was larger than the offset: + * If dlen > size, shift the remaining data down. + * If dlen < size, shift the remaining data up. + * Use memmove(), the regions may overlap. 
+ */ + p = t->bt_rdata.data; + if (bo->tlen > dbt->doff) + if (dbt->dlen > dbt->size) { + tlen += len = bo->tlen - + dbt->doff - (dbt->dlen - dbt->size); + memmove(p + dbt->doff + dbt->size, + p + dbt->doff + dbt->dlen, len); + } else if (dbt->dlen < dbt->size) { + tlen += len = bo->tlen - + dbt->doff - (dbt->size - dbt->dlen); + memmove(p + dbt->doff + dbt->dlen, + p + dbt->doff + dbt->size, len); + } else + tlen += bo->tlen - dbt->doff; + + /* Copy in the user's data. */ + memcpy((u_int8_t *)t->bt_rdata.data + dbt->doff, + dbt->data, dbt->size); + tlen += dbt->size; + } else { + /* Take up to doff bytes from the record. */ + memcpy(t->bt_rdata.data, + bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); + tlen += dbt->doff; + + /* Copy in the user's data. */ + memcpy((u_int8_t *)t->bt_rdata.data + + dbt->doff, dbt->data, dbt->size); + tlen += dbt->size; + + /* Copy in any remaining data. */ + len = dbt->doff + dbt->dlen; + if (bk->len > len) { + memcpy((u_int8_t *)t->bt_rdata.data + dbt->doff + + dbt->size, bk->data + len, bk->len - len); + tlen += bk->len - len; + } + } + + /* Set the DBT to reference our new record. */ + t->bt_rdata.size = tlen; + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + *dbt = t->bt_rdata; + return (0); +} diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c new file mode 100644 index 0000000000..d4bc7f6824 --- /dev/null +++ b/db2/btree/bt_rec.c @@ -0,0 +1,767 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_rec.c 10.11 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "shqueue.h" +#include "hash.h" +#include "btree.h" +#include "log.h" +#include "db_dispatch.h" +#include "common_ext.h" + +/* + * __bam_pg_alloc_recover -- + * Recovery function for pg_alloc. + * + * PUBLIC: int __bam_pg_alloc_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_pg_alloc_args *argp; + BTMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + DB *file_dbp, *mdbp; + db_pgno_t pgno; + int cmp_n, cmp_p, created, modified, ret; + + REC_PRINT(__bam_pg_alloc_print); + REC_INTRO(__bam_pg_alloc_read); + + /* + * Fix up the allocated page. If we're redoing the operation, we have + * to get the page (creating it if it doesn't exist), and update its + * LSN. If we're undoing the operation, we have to reset the page's + * LSN and put it on the free list. + * + * Fix up the metadata page. If we're redoing the operation, we have + * to get the metadata page and update its LSN and its free pointer. + * If we're undoing the operation and the page was ever created, we put + * it on the freelist. + */ + pgno = PGNO_METADATA; + if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + goto out; + } + if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + (void)memp_fput(mpf, meta, 0); + goto out; + } + + /* Fix up the allocated page. 
*/ + created = IS_ZERO_LSN(LSN(pagep)); + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->page_lsn); + if ((created || cmp_p == 0) && redo) { + /* Need to redo update described. */ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, PGNO_INVALID, 0, argp->ptype); + + pagep->lsn = *lsnp; + modified = 1; + } else if ((created || cmp_n == 0) && !redo) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, meta->free, 0, P_INVALID); + + pagep->lsn = argp->page_lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + (void)memp_fput(mpf, meta, 0); + goto out; + } + + /* Fix up the metadata page. */ + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + meta->lsn = *lsnp; + meta->free = argp->next; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + meta->lsn = argp->meta_lsn; + meta->free = argp->pgno; + modified = 1; + } + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_pg_free_recover -- + * Recovery function for pg_free. + * + * PUBLIC: int __bam_pg_free_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_pg_free_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_pg_free_args *argp; + BTMETA *meta; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_pg_free_print); + REC_INTRO(__bam_pg_free_read); + + /* + * Fix up the freed page. 
If we're redoing the operation we get the + * page and explicitly discard its contents, then update its LSN. If + * we're undoing the operation, we get the page and restore its header. + */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &LSN(argp->header.data)); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + P_INIT(pagep, file_dbp->pgsize, + pagep->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + pagep->lsn = *lsnp; + + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + memcpy(pagep, argp->header.data, argp->header.size); + + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + /* + * Fix up the metadata page. If we're redoing or undoing the operation + * we get the page and update its LSN and free pointer. + */ + pgno = PGNO_METADATA; + if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + meta->free = argp->pgno; + + meta->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + meta->free = argp->next; + + meta->lsn = argp->meta_lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_split_recover -- + * Recovery function for split. 
+ * + * PUBLIC: int __bam_split_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_split_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_split_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; + db_pgno_t pgno; + int l_update, p_update, r_update, ret, rootsplit, t_ret; + + REC_PRINT(__bam_split_print); + + mpf = NULL; + _lp = lp = np = pp = _rp = rp = NULL; + + REC_INTRO(__bam_split_read); + + /* + * There are two kinds of splits that we have to recover from. The + * first is a root-page split, where the root page is split from a + * leaf page into an internal page and two new leaf pages are created. + * The second is where a page is split into two pages, and a new key + * is inserted into the parent page. + */ + sp = argp->pg.data; + pgno = PGNO(sp); + rootsplit = pgno == PGNO_ROOT; + if (memp_fget(mpf, &argp->left, 0, &lp) != 0) + lp = NULL; + if (memp_fget(mpf, &argp->right, 0, &rp) != 0) + rp = NULL; + + if (redo) { + l_update = r_update = p_update = 0; + /* + * Decide if we need to resplit the page. + * + * If this is a root split, then the root has to exist, it's + * the page we're splitting and it gets modified. If this is + * not a root split, then the left page has to exist, for the + * same reason. + */ + if (rootsplit) { + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + pp = NULL; + goto out; + } + p_update = + log_compare(&LSN(pp), &LSN(argp->pg.data)) == 0; + } else + if (lp == NULL) { + (void)__db_pgerr(file_dbp, argp->left); + goto out; + } + if (lp == NULL || log_compare(&LSN(lp), &argp->llsn) == 0) + l_update = 1; + if (rp == NULL || log_compare(&LSN(rp), &argp->rlsn) == 0) + r_update = 1; + if (!p_update && !l_update && !r_update) + goto done; + + /* Allocate and initialize new left/right child pages. 
*/ + if ((_lp = (PAGE *)malloc(file_dbp->pgsize)) == NULL) + goto nomem; + if ((_rp = (PAGE *)malloc(file_dbp->pgsize)) == NULL) { +nomem: errno = ENOMEM; + __db_err(file_dbp->dbenv, "%s", strerror(errno)); + goto out; + } + if (rootsplit) { + P_INIT(_lp, file_dbp->pgsize, argp->left, + PGNO_INVALID, + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : argp->left, + PGNO_INVALID, LEVEL(sp), TYPE(sp)); + } else { + P_INIT(_lp, file_dbp->pgsize, PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno, + ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp), + LEVEL(sp), TYPE(sp)); + } + + /* Split the page. */ + if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 || + (ret = __bam_copy(file_dbp, sp, _rp, argp->indx, + NUM_ENT(sp))) != 0) + goto out; + + /* If the left child is wrong, update it. */ + if (lp == NULL && (ret = + memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) { + (void)__db_pgerr(file_dbp, argp->left); + lp = NULL; + goto out; + } + if (l_update) { + memcpy(lp, _lp, file_dbp->pgsize); + lp->lsn = *lsnp; + if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + lp = NULL; + } + + /* If the right child is wrong, update it. */ + if (rp == NULL && (ret = memp_fget(mpf, + &argp->right, DB_MPOOL_CREATE, &rp)) != 0) { + (void)__db_pgerr(file_dbp, argp->right); + rp = NULL; + goto out; + } + if (r_update) { + memcpy(rp, _rp, file_dbp->pgsize); + rp->lsn = *lsnp; + if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + rp = NULL; + } + + /* + * If the parent page is wrong, update it. This is of interest + * only if it was a root split, since root splits create parent + * pages. All other splits modify a parent page, but those are + * separately logged and recovered. 
+ */ + if (rootsplit && p_update) { + if (file_dbp->type == DB_BTREE) + P_INIT(pp, file_dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, + _lp->level + 1, P_IBTREE); + else + P_INIT(pp, file_dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, + _lp->level + 1, P_IRECNO); + RE_NREC_SET(pp, + file_dbp->type == DB_RECNO || + F_ISSET(file_dbp, DB_BT_RECNUM) ? + __bam_total(_lp) + __bam_total(_rp) : 0); + pp->lsn = *lsnp; + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + pp = NULL; + } + + /* + * Finally, redo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page had better exist. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + (void)__db_pgerr(file_dbp, argp->npgno); + np = NULL; + goto out; + } + if (log_compare(&LSN(np), &argp->nlsn) == 0) { + PREV_PGNO(np) = argp->right; + np->lsn = *lsnp; + if ((ret = memp_fput(mpf, + np, DB_MPOOL_DIRTY)) != 0) + goto fatal; + np = NULL; + } + } + } else { + /* + * If the split page is wrong, replace its contents with the + * logged page contents. The split page had better exist. + */ + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + pp = NULL; + goto out; + } + if (log_compare(lsnp, &LSN(pp)) == 0) { + memcpy(pp, argp->pg.data, argp->pg.size); + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + pp = NULL; + } + + /* + * If it's a root split and the left child ever existed, put + * it on the free list. (If it's not a root split, we just + * updated the left page -- it's the same as the split page.) + * If the right child ever existed, root split or not, put it + * on the free list. 
+ */ + if ((rootsplit && lp != NULL) || rp != NULL) { + if (rootsplit && lp != NULL && + log_compare(lsnp, &LSN(lp)) == 0) { + lp->lsn = argp->llsn; + if ((ret = + memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + lp = NULL; + } + if (rp != NULL && + log_compare(lsnp, &LSN(rp)) == 0) { + rp->lsn = argp->rlsn; + if ((ret = + memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + rp = NULL; + } + } + + /* + * Finally, undo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page had better exist. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + (void)__db_pgerr(file_dbp, argp->npgno); + np = NULL; + goto out; + } + if (log_compare(lsnp, &LSN(np)) == 0) { + PREV_PGNO(np) = argp->left; + np->lsn = argp->nlsn; + if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) + goto fatal; + np = NULL; + } + } + } + +done: ret = 0; + *lsnp = argp->prev_lsn; + + if (0) { +fatal: (void)__db_panic(file_dbp); + } +out: /* Free any pages that weren't dirtied. */ + if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) + ret = t_ret; + if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0) + ret = t_ret; + if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0) + ret = t_ret; + if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0) + ret = t_ret; + + /* Free any allocated space. */ + if (_lp != NULL) + free(_lp); + if (_rp != NULL) + free(_rp); + + REC_CLOSE; +} + +/* + * __bam_rsplit_recover -- + * Recovery function for a reverse split. 
+ * + * PUBLIC: int __bam_rsplit_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_rsplit_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_rsplit_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_rsplit_print); + REC_INTRO(__bam_rsplit_read); + + /* Fix the root page. */ + pgno = PGNO_ROOT; + if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) { + __db_pgerr(file_dbp, pgno); + pagep = NULL; + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->rootlsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + pagep->pgno = PGNO_ROOT; + pagep->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT, + PGNO_INVALID, PGNO_INVALID, pagep->level + 1, TYPE(pagep)); + if ((ret = __db_pitem(file_dbp, pagep, 0, + argp->rootent.size, &argp->rootent, NULL)) != 0) + goto out; + pagep->lsn = argp->rootlsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + /* Fix the page copied over the root page. */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &LSN(argp->pgdbt.data)); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + pagep->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + ret = 0; + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __bam_adj_recover -- + * Recovery function for adj. + * + * PUBLIC: int __bam_adj_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_adj_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_adj_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_adj_print); + REC_INTRO(__bam_adj_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + if ((ret = __bam_adjindx(file_dbp, + pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) + goto err; + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + if ((ret = __bam_adjindx(file_dbp, + pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) + goto err; + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + + if (0) { +err: (void)memp_fput(mpf, pagep, 0); + } +out: REC_CLOSE; +} + +/* + * __bam_cadjust_recover -- + * Recovery function for the adjust of a count change in an internal + * page. 
+ * + * PUBLIC: int __bam_cadjust_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_cadjust_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_cadjust_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_cadjust_print); + REC_INTRO(__bam_cadjust_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + errno = __db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + if (file_dbp->type == DB_BTREE && + F_ISSET(file_dbp, DB_BT_RECNUM)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + if (file_dbp->type == DB_RECNO) { + GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + if (file_dbp->type == DB_BTREE && + F_ISSET(file_dbp, DB_BT_RECNUM)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + if (file_dbp->type == DB_RECNO) { + GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __bam_cdel_recover -- + * Recovery function for the intent-to-delete of a cursor record. 
+ * + * PUBLIC: int __bam_cdel_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_cdel_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_cdel_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_cdel_print); + REC_INTRO(__bam_cdel_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 1; + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 0; + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c new file mode 100644 index 0000000000..cd8872a064 --- /dev/null +++ b/db2/btree/bt_recno.c @@ -0,0 +1,1195 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_recno.c 10.12 (Sleepycat) 8/25/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __ram_add __P((DB *, db_recno_t *, DBT *, int, int)); +static int __ram_c_close __P((DBC *)); +static int __ram_c_del __P((DBC *, int)); +static int __ram_c_get __P((DBC *, DBT *, DBT *, int)); +static int __ram_c_put __P((DBC *, DBT *, DBT *, int)); +static int __ram_fmap __P((DB *, db_recno_t)); +static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __ram_source __P((DB *, RECNO *, const char *)); +static int __ram_sync __P((DB *, int)); +static int __ram_update __P((DB *, db_recno_t, int)); +static int __ram_vmap __P((DB *, db_recno_t)); +static int __ram_writeback __P((DB *)); + +/* + * If we're renumbering records, then we have to detect in the cursor that a + * record was deleted, and adjust the cursor as necessary. If not renumbering + * records, then we can detect this by looking at the actual record, so we + * ignore the cursor delete flag. + */ +#define CD_SET(dbp, cp) { \ + if (F_ISSET(dbp, DB_RE_RENUMBER)) \ + F_SET(cp, CR_DELETED); \ +} +#define CD_CLR(dbp, cp) { \ + if (F_ISSET(dbp, DB_RE_RENUMBER)) \ + F_CLR(cp, CR_DELETED); \ +} +#define CD_ISSET(dbp, cp) \ + (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED)) + +/* + * __ram_open -- + * Recno open function. + * + * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *)); + */ +int +__ram_open(dbp, type, dbinfo) + DB *dbp; + DBTYPE type; + DB_INFO *dbinfo; +{ + BTREE *t; + RECNO *rp; + int ret; + + ret = 0; + + /* Allocate and initialize the private RECNO structure. 
*/ + if ((rp = (RECNO *)calloc(1, sizeof(*rp))) == NULL) + return (errno); + + if (dbinfo != NULL) { + /* + * If the user specified a source tree, open it and map it in. + * + * !!! + * We don't complain if the user specified transactions or + * threads. It's possible to make it work, but you'd better + * know what you're doing! + */ + if (dbinfo->re_source == NULL) { + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } else { + if ((ret = + __ram_source(dbp, rp, dbinfo->re_source)) != 0) + goto err; + } + + /* Copy delimiter, length and padding values. */ + rp->re_delim = + F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n'; + rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' '; + + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((rp->re_len = dbinfo->re_len) == 0) { + __db_err(dbp->dbenv, + "record length must be greater than 0"); + ret = EINVAL; + goto err; + } + } else + rp->re_len = 0; + } else { + rp->re_delim = '\n'; + rp->re_pad = ' '; + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } + + /* Open the underlying btree. */ + if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0) + goto err; + + /* Set the routines necessary to make it look like a recno tree. */ + dbp->cursor = __ram_cursor; + dbp->del = __ram_delete; + dbp->get = __ram_get; + dbp->put = __ram_put; + dbp->sync = __ram_sync; + + /* Link in the private recno structure. */ + ((BTREE *)dbp->internal)->bt_recno = rp; + + /* If we're snapshotting an underlying source file, do it now. */ + if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) + if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + goto err; + + return (0); + +err: /* If we mmap'd a source file, discard it. */ + if (rp->re_smap != NULL) + (void)__db_munmap(rp->re_smap, rp->re_msize); + + /* If we opened a source file, discard it. */ + if (rp->re_fd != -1) + (void)__db_close(rp->re_fd); + if (rp->re_source != NULL) + FREES(rp->re_source); + + /* If we allocated room for key/data return, discard it. 
*/ + t = dbp->internal; + if (t->bt_rkey.data != NULL) + free(t->bt_rkey.data); + + FREE(rp, sizeof(*rp)); + + return (ret); +} + +/* + * __ram_cursor -- + * Recno db->cursor function. + * + * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **)); + */ +int +__ram_cursor(dbp, txn, dbcp) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; +{ + RCURSOR *cp; + DBC *dbc; + + DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0); + + if ((dbc = (DBC *)calloc(1, sizeof(DBC))) == NULL) + return (ENOMEM); + if ((cp = (RCURSOR *)calloc(1, sizeof(RCURSOR))) == NULL) { + free(dbc); + return (ENOMEM); + } + + cp->dbc = dbc; + cp->recno = RECNO_OOB; + + dbc->dbp = dbp; + dbc->txn = txn; + dbc->internal = cp; + dbc->c_close = __ram_c_close; + dbc->c_del = __ram_c_del; + dbc->c_get = __ram_c_get; + dbc->c_put = __ram_c_put; + + /* All cursor structures hang off the main DB structure. */ + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + *dbcp = dbc; + return (0); +} + +/* + * __ram_get -- + * Recno db->get function. + */ +static int +__ram_get(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + PAGE *h; + db_indx_t indx; + db_recno_t recno; + int exact, ret, stack; + + stack = 0; + + DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + goto done; + + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0) + goto done; + if (!exact) + return (DB_NOTFOUND); + stack = 1; + + h = t->bt_csp->page; + indx = t->bt_csp->indx; + + /* If the record has already been deleted, we couldn't have found it. 
*/ + if (GET_BKEYDATA(h, indx)->deleted) { + ret = DB_KEYEMPTY; + goto done; + } + + /* Return the data item. */ + ret = __db_ret(dbp, + h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen); + ++t->lstat.bt_get; + +done: /* Discard the stack. */ + if (stack) + __bam_stkrel(dbp); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_put -- + * Recno db->put function. + */ +static int +__ram_put(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + db_recno_t recno; + int ret; + + DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags); + + /* Check for invalid flags. */ + if ((ret = __db_putchk(argdbp, + key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0)) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + + /* + * If we're appending to the tree, make sure we've read in all of + * the backing source file. Otherwise, check the user's record + * number and fill in as necessary. + */ + ret = LF_ISSET(DB_APPEND) ? + __ram_snapshot(dbp) : __ram_getno(dbp, key, &recno, 1); + + /* Add the record. */ + if (ret == 0) + ret = __ram_add(dbp, &recno, data, flags, 0); + + /* If we're appending to the tree, we have to return the record. */ + if (ret == 0 && LF_ISSET(DB_APPEND)) { + t = dbp->internal; + ret = __db_retcopy(key, &recno, sizeof(recno), + &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc); + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_sync -- + * Recno db->sync function. + */ +static int +__ram_sync(argdbp, flags) + DB *argdbp; + int flags; +{ + DB *dbp; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags); + + /* Sync the underlying btree. */ + if ((ret = __bam_sync(argdbp, flags)) != 0) + return (ret); + + /* Copy back the backing source file. */ + GETHANDLE(argdbp, NULL, &dbp, ret); + ret = __ram_writeback(dbp); + PUTHANDLE(dbp); + + return (ret); +} + +/* + * __ram_close -- + * Recno db->close function. 
+ * + * PUBLIC: int __ram_close __P((DB *)); + */ +int +__ram_close(argdbp) + DB *argdbp; +{ + RECNO *rp; + + DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0); + + rp = ((BTREE *)argdbp->internal)->bt_recno; + + /* Close any underlying mmap region. */ + if (rp->re_smap != NULL) + (void)__db_munmap(rp->re_smap, rp->re_msize); + + /* Close any backing source file descriptor. */ + if (rp->re_fd != -1) + (void)__db_close(rp->re_fd); + + /* Free any backing source file name. */ + if (rp->re_source != NULL) + FREES(rp->re_source); + + /* Free allocated memory. */ + FREE(rp, sizeof(RECNO)); + ((BTREE *)argdbp->internal)->bt_recno = NULL; + + /* Close the underlying btree. */ + return (__bam_close(argdbp)); +} + +/* + * __ram_c_close -- + * Recno cursor->close function. + */ +static int +__ram_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_close", NULL, NULL, 0); + + dbp = dbc->dbp; + + /* Remove the cursor from the queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Discard the structures. */ + FREE(dbc->internal, sizeof(RCURSOR)); + FREE(dbc, sizeof(DBC)); + + return (0); +} + +/* + * __ram_c_del -- + * Recno cursor->c_del function. + */ +static int +__ram_c_del(dbc, flags) + DBC *dbc; + int flags; +{ + DBT key; + RCURSOR *cp; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_del", NULL, NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cdelchk(dbc->dbp, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + return (ret); + + /* If already deleted, return failure. */ + if (CD_ISSET(dbc->dbp, cp)) + return (DB_KEYEMPTY); + + /* Build a normal delete request. 
*/ + memset(&key, 0, sizeof(key)); + key.data = &cp->recno; + key.size = sizeof(db_recno_t); + if ((ret = __ram_delete(dbc->dbp, dbc->txn, &key, 0)) == 0) + CD_SET(dbc->dbp, cp); + + return (ret); +} + +/* + * __ram_c_get -- + * Recno cursor->c_get function. + */ +static int +__ram_c_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + RCURSOR *cp, copy; + int ret; + + DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, + NULL, flags); + + cp = dbc->internal; + dbp = dbc->dbp; + + /* Check for invalid flags. */ + if ((ret = __db_cgetchk(dbc->dbp, + key, data, flags, cp->recno != RECNO_OOB)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + +retry: /* Update the record number. */ + switch (flags) { + case DB_CURRENT: + if (CD_ISSET(dbp, cp)) { + PUTHANDLE(dbp); + return (DB_KEYEMPTY); + } + break; + case DB_NEXT: + if (CD_ISSET(dbp, cp)) + break; + if (cp->recno != RECNO_OOB) { + ++cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + flags = DB_NEXT; + cp->recno = 1; + break; + case DB_PREV: + if (cp->recno != RECNO_OOB) { + if (cp->recno == 1) + return (DB_NOTFOUND); + --cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + flags = DB_PREV; + if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND) + goto err; + if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0) + goto err; + if (cp->recno == 0) + return (DB_NOTFOUND); + break; + case DB_SET: + case DB_SET_RANGE: + if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0) + goto err; + break; + } + + /* + * Return the key if the user didn't give us one, and then pass it + * into __ram_get(). 
+ */ + if (flags != DB_SET && flags != DB_SET_RANGE && + (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno), + &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0) + return (ret); + + /* + * The cursor was reset, so the delete adjustment is no + * longer necessary. + */ + CD_CLR(dbp, cp); + + /* + * Retrieve the record. + * + * Skip any keys that don't really exist. + */ + if ((ret = __ram_get(dbp, dbc->txn, key, data, 0)) != 0) + if (ret == DB_KEYEMPTY && + (flags == DB_NEXT || flags == DB_PREV)) + goto retry; + +err: if (ret != 0) + *cp = copy; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_c_put -- + * Recno cursor->c_put function. + */ +static int +__ram_c_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + RCURSOR *cp, copy; + DB *dbp; + int exact, ret; + void *arg; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags); + + cp = dbc->internal; + + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + + /* + * To split, we need a valid key for the page. Since it's a cursor, + * we have to build one. + * + * The split code discards all short-term locks and stack pages. + */ + if (0) { +split: arg = &cp->recno; + if ((ret = __bam_split(dbp, arg)) != 0) + goto err; + } + + if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + if ((ret = __bam_iitem(dbp, &t->bt_csp->page, + &t->bt_csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { + if ((ret = __bam_stkrel(dbp)) != 0) + goto err; + goto split; + } + if ((ret = __bam_stkrel(dbp)) != 0) + goto err; + + if (flags != DB_CURRENT) { + /* Adjust the counts. 
*/ + if ((ret = __bam_adjust(dbp, t, 1)) != 0) + goto err; + + switch (flags) { + case DB_AFTER: + /* Adjust the cursors. */ + __ram_ca(dbp, cp->recno, CA_IAFTER); + + /* Set this cursor to reference the new record. */ + cp->recno = copy.recno + 1; + break; + case DB_BEFORE: + /* Adjust the cursors. */ + __ram_ca(dbp, cp->recno, CA_IBEFORE); + + /* Set this cursor to reference the new record. */ + cp->recno = copy.recno; + break; + } + + } + + /* + * The cursor was reset, so the delete adjustment is no + * longer necessary. + */ + CD_CLR(dbp, cp); + +err: if (ret != 0) + *cp = copy; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_ca -- + * Adjust cursors. + * + * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg)); + */ +void +__ram_ca(dbp, recno, op) + DB *dbp; + db_recno_t recno; + ca_recno_arg op; +{ + DBC *dbc; + RCURSOR *cp; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (RCURSOR *)dbc->internal; + switch (op) { + case CA_DELETE: + if (recno > cp->recno) + --cp->recno; + break; + case CA_IAFTER: + if (recno > cp->recno) + ++cp->recno; + break; + case CA_IBEFORE: + if (recno >= cp->recno) + ++cp->recno; + break; + } + } + DB_THREAD_UNLOCK(dbp); +} + +#ifdef DEBUG +/* + * __ram_cprint -- + * Display the current recno cursor list. + */ +int +__ram_cprint(dbp) + DB *dbp; +{ + DBC *dbc; + RCURSOR *cp; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (RCURSOR *)dbc->internal; + fprintf(stderr, + "%#0x: recno: %lu\n", (u_int)cp, (u_long)cp->recno); + } + DB_THREAD_UNLOCK(dbp); + return (0); +} +#endif /* DEBUG */ + +/* + * __ram_getno -- + * Check the user's record number, and make sure we've seen it. 
+ * + * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); + */ +int +__ram_getno(dbp, key, rep, can_create) + DB *dbp; + const DBT *key; + db_recno_t *rep; + int can_create; +{ + db_recno_t recno; + + /* Check the user's record number. */ + if ((recno = *(db_recno_t *)key->data) == 0) { + __db_err(dbp->dbenv, "illegal record number of 0"); + return (EINVAL); + } + if (rep != NULL) + *rep = recno; + + /* + * Btree can neither create records or read them in. Recno can + * do both, see if we can find the record. + */ + return (dbp->type == DB_RECNO ? + __ram_update(dbp, recno, can_create) : 0); +} + +/* + * __ram_snapshot -- + * Read in any remaining records from the backing input file. + * + * PUBLIC: int __ram_snapshot __P((DB *)); + */ +int +__ram_snapshot(dbp) + DB *dbp; +{ + return (__ram_update(dbp, DB_MAX_RECORDS, 0)); +} + +/* + * __ram_update -- + * Ensure the tree has records up to and including the specified one. + */ +static int +__ram_update(dbp, recno, can_create) + DB *dbp; + db_recno_t recno; + int can_create; +{ + BTREE *t; + RECNO *rp; + db_recno_t nrecs; + int ret; + + t = dbp->internal; + rp = t->bt_recno; + + /* + * If we can't create records and we've read the entire backing input + * file, we're done. + */ + if (!can_create && F_ISSET(rp, RECNO_EOF)) + return (0); + + /* + * If we haven't seen this record yet, try to get it from the original + * file. + */ + if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + return (ret); + if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) { + if ((ret = rp->re_irec(dbp, recno)) != 0) + return (ret); + if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + return (ret); + } + + /* + * If we can create records, create empty ones up to the requested + * record. 
+ */ + if (!can_create || recno <= nrecs + 1) + return (0); + + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if (t->bt_rdata.ulen < rp->re_len) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = rp->re_len; + } + t->bt_rdata.size = rp->re_len; + memset(t->bt_rdata.data, rp->re_pad, rp->re_len); + } else + t->bt_rdata.size = 0; + + while (recno > ++nrecs) + if ((ret = __ram_add(dbp, + &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0) + return (ret); + return (0); +} + +/* + * __ram_source -- + * Load information about the backing file. + */ +static int +__ram_source(dbp, rp, fname) + DB *dbp; + RECNO *rp; + const char *fname; +{ + off_t size; + int oflags, ret; + + if ((ret = __db_appname(dbp->dbenv, + DB_APP_DATA, NULL, fname, NULL, &rp->re_source)) != 0) + return (ret); + + oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0; + if ((ret = + __db_fdopen(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) { + __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); + goto err; + } + + /* + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ + if ((ret = + __db_stat(dbp->dbenv, rp->re_source, rp->re_fd, &size, NULL)) != 0) + goto err; + if (size == 0) { + F_SET(rp, RECNO_EOF); + return (0); + } + + if ((ret = __db_mmap(rp->re_fd, (size_t)size, 1, 1, &rp->re_smap)) != 0) + goto err; + rp->re_cmap = rp->re_smap; + rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size); + rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ? 
__ram_fmap : __ram_vmap; + return (0); + +err: FREES(rp->re_source) + return (ret); +} + +/* + * __ram_writeback -- + * Rewrite the backing file. + */ +static int +__ram_writeback(dbp) + DB *dbp; +{ + RECNO *rp; + DBT key, data; + db_recno_t keyno; + ssize_t nw; + int fd, ret, t_ret; + u_int8_t delim, *pad; + + rp = ((BTREE *)dbp->internal)->bt_recno; + + /* If the file wasn't modified, we're done. */ + if (!F_ISSET(rp, RECNO_MODIFIED)) + return (0); + + /* If there's no backing source file, we're done. */ + if (rp->re_source == NULL) { + F_CLR(rp, RECNO_MODIFIED); + return (0); + } + + /* + * Read any remaining records into the tree. + * + * XXX + * This is why we can't support transactions when applications specify + * backing (re_source) files. At this point we have to read in the + * rest of the records from the file so that we can write all of the + * records back out again, which could modify a page for which we'd + * have to log changes and which we don't have locked. This could be + * partially fixed by taking a snapshot of the entire file during the + * db_open(), or, since db_open() isn't transaction protected, as part + * of the first DB operation. But, if a checkpoint occurs then, the + * part of the log holding the copy of the file could be discarded, and + * that would make it impossible to recover in the face of disaster. + * This could all probably be fixed, but it would require transaction + * protecting the backing source file, i.e. mpool would have to know + * about it, and we don't want to go there. + */ + if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + return (ret); + + /* + * !!! + * Close any underlying mmap region. This is required for Windows NT + * (4.0, Service Pack 2) -- if the file is still mapped, the following + * open will fail. + */ + if (rp->re_smap != NULL) { + (void)__db_munmap(rp->re_smap, rp->re_msize); + rp->re_smap = NULL; + } + + /* Get rid of any backing file descriptor, just on GP's. 
*/ + if (rp->re_fd != -1) { + (void)__db_close(rp->re_fd); + rp->re_fd = -1; + } + + /* Open the file, truncating it. */ + if ((ret = __db_fdopen(rp->re_source, + DB_SEQUENTIAL | DB_TRUNCATE, + DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) { + __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); + return (ret); + } + + /* + * We step through the records, writing each one out. Use the record + * number and the dbp->get() function, instead of a cursor, so we find + * and write out "deleted" or non-existent records. + */ + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + key.size = sizeof(db_recno_t); + key.data = &keyno; + + /* + * We'll need the delimiter if we're doing variable-length records, + * and the pad character if we're doing fixed-length records. + */ + delim = rp->re_delim; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((pad = malloc(rp->re_len)) == NULL) { + ret = ENOMEM; + goto err; + } + memset(pad, rp->re_pad, rp->re_len); + } else + pad = NULL; /* XXX: Shut the compiler up. */ + for (keyno = 1;; ++keyno) { + switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { + case 0: + if ((ret = + __db_write(fd, data.data, data.size, &nw)) != 0) + goto err; + if (nw != (ssize_t)data.size) { + ret = EIO; + goto err; + } + break; + case DB_KEYEMPTY: + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((ret = + __db_write(fd, pad, rp->re_len, &nw)) != 0) + goto err; + if (nw != (ssize_t) rp->re_len) { + ret = EIO; + goto err; + } + } + break; + case DB_NOTFOUND: + ret = 0; + goto done; + } + if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((ret = __db_write(fd, &delim, 1, &nw)) != 0) + goto err; + if (nw != 1) { + ret = EIO; + goto err; + } + } + } + +err: +done: /* Close the file descriptor. */ + if ((t_ret = __db_close(fd)) != 0 || ret == 0) + ret = t_ret; + + if (ret == 0) + F_CLR(rp, RECNO_MODIFIED); + return (ret); +} + +/* + * __ram_fmap -- + * Get fixed length records from a file. 
+ */ +static int +__ram_fmap(dbp, top) + DB *dbp; + db_recno_t top; +{ + BTREE *t; + DBT data; + RECNO *rp; + db_recno_t recno; + u_int32_t len; + u_int8_t *sp, *ep, *p; + int ret; + + if ((ret = __bam_nrecs(dbp, &recno)) != 0) + return (ret); + + t = dbp->internal; + rp = t->bt_recno; + if (t->bt_rdata.ulen < rp->re_len) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = rp->re_len; + } + + memset(&data, 0, sizeof(data)); + data.data = t->bt_rdata.data; + data.size = rp->re_len; + + sp = (u_int8_t *)rp->re_cmap; + ep = (u_int8_t *)rp->re_emap; + while (recno <= top) { + if (sp >= ep) { + F_SET(rp, RECNO_EOF); + return (DB_NOTFOUND); + } + len = rp->re_len; + for (p = t->bt_rdata.data; + sp < ep && len > 0; *p++ = *sp++, --len); + + /* + * Another process may have read some portion of the input + * file already, in which case we just want to discard the + * new record. + * + * XXX + * We should just do a seek, since the records are fixed + * length. + */ + if (rp->re_last >= recno) { + if (len != 0) + memset(p, rp->re_pad, len); + + ++recno; + if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + return (ret); + } + ++rp->re_last; + } + rp->re_cmap = sp; + return (0); +} + +/* + * __ram_vmap -- + * Get variable length records from a file. 
+ */ +static int +__ram_vmap(dbp, top) + DB *dbp; + db_recno_t top; +{ + BTREE *t; + DBT data; + RECNO *rp; + db_recno_t recno; + u_int8_t *sp, *ep; + int delim, ret; + + t = dbp->internal; + rp = t->bt_recno; + + if ((ret = __bam_nrecs(dbp, &recno)) != 0) + return (ret); + + memset(&data, 0, sizeof(data)); + + delim = rp->re_delim; + + sp = (u_int8_t *)rp->re_cmap; + ep = (u_int8_t *)rp->re_emap; + while (recno <= top) { + if (sp >= ep) { + F_SET(rp, RECNO_EOF); + return (DB_NOTFOUND); + } + for (data.data = sp; sp < ep && *sp != delim; ++sp); + + /* + * Another process may have read some portion of the input + * file already, in which case we just want to discard the + * new record. + */ + if (rp->re_last >= recno) { + data.size = sp - (u_int8_t *)data.data; + ++recno; + if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + return (ret); + } + ++rp->re_last; + ++sp; + } + rp->re_cmap = sp; + return (0); +} + +/* + * __ram_add -- + * Add records into the tree. + */ +static int +__ram_add(dbp, recnop, data, flags, bi_flags) + DB *dbp; + db_recno_t *recnop; + DBT *data; + int flags, bi_flags; +{ + BTREE *t; + PAGE *h; + db_indx_t indx; + int exact, ret, stack; + + t = dbp->internal; + +retry: /* Find the slot for insertion. */ + if ((ret = __bam_rsearch(dbp, recnop, + S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0) + return (ret); + h = t->bt_csp->page; + indx = t->bt_csp->indx; + stack = 1; + + /* + * The recno access method doesn't currently support duplicates, so + * if an identical key is already in the tree we're either overwriting + * it or an error is returned. + */ + if (exact && LF_ISSET(DB_NOOVERWRITE)) { + ret = DB_KEYEXIST; + goto err; + } + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, or we're replacing the data item with a + * new data item. If the key isn't an exact match, we're inserting + * a new key/data pair, before the search location. 
+ */ + if ((ret = __bam_iitem(dbp, &h, &indx, NULL, + data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) == DB_NEEDSPLIT) { + (void)__bam_stkrel(dbp); + stack = 0; + if ((ret = __bam_split(dbp, recnop)) != 0) + goto err; + goto retry; + } + + if (!exact && ret == 0) + __bam_adjust(dbp, t, 1); + +err: if (stack) + __bam_stkrel(dbp); + return (ret); +} diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c new file mode 100644 index 0000000000..ee26221e25 --- /dev/null +++ b/db2/btree/bt_rsearch.c @@ -0,0 +1,347 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_rsearch.c 10.8 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_rsearch -- + * Search a btree for a record number. + * + * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int, int, int *)); + */ +int +__bam_rsearch(dbp, recnop, flags, stop, exactp) + DB *dbp; + db_recno_t *recnop; + u_int flags; + int stop, *exactp; +{ + BINTERNAL *bi; + BTREE *t; + DB_LOCK lock; + PAGE *h; + RINTERNAL *ri; + db_indx_t indx, top; + db_pgno_t pg; + db_recno_t recno, total; + int isappend, ret, stack; + + t = dbp->internal; + + /* + * We test for groups of flags, S_APPEND is the only one that can be + * OR'd into the set. Clear it now so that the tests for equality + * will work. + */ + if ((isappend = LF_ISSET(S_APPEND)) != 0) + LF_CLR(S_APPEND); + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write locks and if we are + * locking pairs of pages. See btree.h for more details. 
+ * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, + flags == S_INSERT || flags == S_DELETE ? + DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + total = RE_NREC(h); + + /* + * If appending to the tree, set the record number now -- we have the + * root page locked. + * + * Delete only deletes exact matches, read only returns exact matches. + * Note, this is different from __bam_search(), which returns non-exact + * matches for read. + * + * The record may not exist. We can only return the correct location + * for the record immediately after the last record in the tree, so do + * a fast check now. + */ + if (isappend) { + *exactp = 0; + *recnop = recno = total + 1; + } else { + recno = *recnop; + if (recno <= total) + *exactp = 1; + else { + *exactp = 0; + if (flags == S_DELETE || + flags == S_FIND || recno > total + 1) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + return (DB_NOTFOUND); + } + } + } + + /* Decide if we're building a stack based on the operation. */ + BT_STK_CLR(t); + stack = flags == S_DELETE || flags == S_INSERT; + + /* + * Decide if we need to save this page; if we do, write lock it, and + * start to build a stack. 
+ */ + if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) { + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + stack = 1; + } + + /* Records in the tree are 0-based, and record numbers are 1-based. */ + --recno; + + for (total = 0;;) { + switch (TYPE(h)) { + case P_LBTREE: + BT_STK_ENTER(t, h, (recno - total) * P_INDX, lock, ret); + return (ret); + case P_IBTREE: + for (indx = 0, top = NUM_ENT(h);;) { + bi = GET_BINTERNAL(h, indx); + if (++indx == top || total + bi->nrecs > recno) + break; + total += bi->nrecs; + } + pg = bi->pgno; + break; + case P_LRECNO: + BT_STK_ENTER(t, h, recno - total, lock, ret); + return (ret); + case P_IRECNO: + for (indx = 0, top = NUM_ENT(h);;) { + ri = GET_RINTERNAL(h, indx); + if (++indx == top || total + ri->nrecs > recno) + break; + total += ri->nrecs; + } + pg = ri->pgno; + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + --indx; + + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + } + BT_STK_PUSH(t, h, indx, lock, ret); + if (ret) + goto err; + + if ((ret = __bam_lget(dbp, 0, pg, + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, + &lock)) != 0) + goto err; + } else { + (void)memp_fput(dbp->mpf, h, 0); + + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if (LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) + stack = 1; + + if ((ret = __bam_lget(dbp, 1, pg, + LF_ISSET(S_WRITE) ? 
DB_LOCK_WRITE : DB_LOCK_READ, + &lock)) != 0) + goto err; + } + + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + goto err; + } + /* NOTREACHED */ + +err: BT_STK_POP(t); + __bam_stkrel(dbp); + return (ret); +} + +/* + * __bam_adjust -- + * Adjust the tree after adding or deleting a record. + * + * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int)); + */ +int +__bam_adjust(dbp, t, adjust) + DB *dbp; + BTREE *t; + int adjust; +{ + EPG *epg; + PAGE *h; + int ret; + + /* Update the record counts for the tree. */ + for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + h = epg->page; + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { + if (DB_LOGGING(dbp) && + (ret = __bam_cadjust_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(h), 0, dbp->log_fileid, + PGNO(h), &LSN(h), (u_int32_t)epg->indx, + (int32_t)adjust, 1)) != 0) + return (ret); + + if (TYPE(h) == P_IBTREE) + GET_BINTERNAL(h, epg->indx)->nrecs += adjust; + else + GET_RINTERNAL(h, epg->indx)->nrecs += adjust; + + if (PGNO(h) == PGNO_ROOT) + RE_NREC_ADJ(h, adjust); + + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + } + } + return (0); +} + +/* + * __bam_nrecs -- + * Return the number of records in the tree. + * + * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *)); + */ +int +__bam_nrecs(dbp, rep) + DB *dbp; + db_recno_t *rep; +{ + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int ret; + + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + return (ret); + + *rep = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + + return (0); +} + +/* + * __bam_total -- + * Return the number of records below a page. 
+ * + * PUBLIC: db_recno_t __bam_total __P((PAGE *)); + */ +db_recno_t +__bam_total(h) + PAGE *h; +{ + db_recno_t recs; + db_indx_t nxt, top; + + switch (TYPE(h)) { + case P_LBTREE: + recs = NUM_ENT(h) / 2; + break; + case P_IBTREE: + for (recs = 0, nxt = 0, top = NUM_ENT(h); nxt < top; ++nxt) + recs += GET_BINTERNAL(h, nxt)->nrecs; + break; + case P_LRECNO: + recs = NUM_ENT(h); + break; + case P_IRECNO: + for (recs = 0, nxt = 0, top = NUM_ENT(h); nxt < top; ++nxt) + recs += GET_RINTERNAL(h, nxt)->nrecs; + break; + default: + abort(); + } + return (recs); +} diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c new file mode 100644 index 0000000000..d5f20d4c61 --- /dev/null +++ b/db2/btree/bt_search.c @@ -0,0 +1,335 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_search.c 10.6 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_search -- + * Search a btree for a key. + * + * PUBLIC: int __bam_search __P((DB *, + * PUBLIC: const DBT *, u_int, int, db_recno_t *, int *)); + */ +int +__bam_search(dbp, key, flags, stop, recnop, exactp) + DB *dbp; + const DBT *key; + u_int flags; + int stop, *exactp; + db_recno_t *recnop; +{ + BTREE *t; + DB_LOCK lock; + EPG cur; + PAGE *h; + db_indx_t base, i, indx, lim; + db_pgno_t pg; + db_recno_t recno; + int cmp, jump, ret, stack; + + t = dbp->internal; + recno = 0; + + BT_STK_CLR(t); + + /* + * There are several ways we search a btree tree. 
The flags argument + * specifies if we're acquiring read or write locks, if we position + * to the first or last item in a set of duplicates, if we return + * deleted items, and if we are locking pairs of pages. See btree.h + * for more details. In addition, if we're doing record numbers, we + * have to lock the entire tree regardless. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = PGNO_ROOT; + stack = F_ISSET(dbp, DB_BT_RECNUM) && + (flags == S_INSERT || flags == S_DELETE); + if ((ret = __bam_lget(dbp, + 0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + + /* Decide if we need to save this page; if we do, write lock it. */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + + stack = 1; + } + + for (;;) { + /* + * Do a binary search on the current page. If we're searching + * a leaf page, we have to manipulate the indices in groups of + * two. If we're searching an internal page, they're an index + * per page item. If we find an exact match on a leaf page, + * we're done. + */ + cur.page = h; + jump = TYPE(h) == P_LBTREE ? 
P_INDX : O_INDX; + for (base = 0, + lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) { + cur.indx = indx = base + ((lim >> 1) * jump); + if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) { + if (TYPE(h) == P_LBTREE) + goto match; + goto next; + } + if (cmp > 0) { + base = indx + jump; + --lim; + } + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + O_INDX index. + * + * If it's a leaf page, return base as the "found" value. + * Delete only deletes exact matches. + */ + if (TYPE(h) == P_LBTREE) { + *exactp = 0; + + if (LF_ISSET(S_EXACT)) + goto notfound; + + BT_STK_ENTER(t, h, base, lock, ret); + return (ret); + } + + /* + * If it's not a leaf page, record the internal page (which is + * a parent page for the key). Decrement the base by 1 if it's + * non-zero so that if a split later occurs, the inserted page + * will be to the right of the saved page. + */ + indx = base > 0 ? base - O_INDX : base; + + /* + * If we're trying to calculate the record number, sum up + * all the record numbers on this page up to the indx point. + */ + if (recnop != NULL) + for (i = 0; i < indx; ++i) + recno += GET_BINTERNAL(h, i)->nrecs; + +next: pg = GET_BINTERNAL(h, indx)->pgno; + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + } + BT_STK_PUSH(t, h, indx, lock, ret); + if (ret != 0) + goto err; + + if ((ret = + __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + goto err; + } else { + (void)memp_fput(dbp->mpf, h, 0); + + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) + stack = 1; + + if ((ret = + __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ? 
+ DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + goto err; + } + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + goto err; + } + + /* NOTREACHED */ +match: *exactp = 1; + + /* + * If we're trying to calculate the record number, add in the + * offset on this page and correct for the fact that records + * in the tree are 0-based. + */ + if (recnop != NULL) + *recnop = recno + (indx / P_INDX) + 1; + + /* + * If we got here, we know that we have a btree leaf page. + * + * If there are duplicates, go to the first/last one. + */ + if (LF_ISSET(S_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + indx += P_INDX; + else + while (indx > 0 && + h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + + /* + * Now check if we are allowed to return deleted item; if not + * find/last the first non-deleted item. + */ + if (LF_ISSET(S_DELNO)) { + if (LF_ISSET(S_DUPLAST)) + while (GET_BKEYDATA(h, indx + O_INDX)->deleted && + indx > 0 && + h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + else + while (GET_BKEYDATA(h, indx + O_INDX)->deleted && + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + indx += P_INDX; + + if (GET_BKEYDATA(h, indx + O_INDX)->deleted) + goto notfound; + } + + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + +notfound: + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + ret = DB_NOTFOUND; + +err: if (t->bt_csp > t->bt_sp) { + BT_STK_POP(t); + __bam_stkrel(dbp); + } + return (ret); +} + +/* + * __bam_stkrel -- + * Release all pages currently held in the stack. + * + * PUBLIC: int __bam_stkrel __P((DB *)); + */ +int +__bam_stkrel(dbp) + DB *dbp; +{ + BTREE *t; + EPG *epg; + + t = dbp->internal; + for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbp, epg->lock); + } + return (0); +} + +/* + * __bam_stkgrow -- + * Grow the stack. 
+ * + * PUBLIC: int __bam_stkgrow __P((BTREE *)); + */ +int +__bam_stkgrow(t) + BTREE *t; +{ + EPG *p; + size_t entries; + + entries = t->bt_esp - t->bt_sp; + + if ((p = (EPG *)calloc(entries * 2, sizeof(EPG))) == NULL) + return (ENOMEM); + memcpy(p, t->bt_sp, entries * sizeof(EPG)); + if (t->bt_sp != t->bt_stack) + FREE(t->bt_sp, entries * sizeof(EPG)); + t->bt_sp = p; + t->bt_csp = p + entries; + t->bt_esp = p + entries * 2; + return (0); +} diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c new file mode 100644 index 0000000000..89cfcb5a2e --- /dev/null +++ b/db2/btree/bt_split.c @@ -0,0 +1,952 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_split.c 10.12 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_page __P((DB *, EPG *, EPG *)); +static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *)); +static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int)); +static int __bam_root __P((DB *, EPG *)); + +/* + * __bam_split -- + * Split a page. + * + * PUBLIC: int __bam_split __P((DB *, void *)); + */ +int +__bam_split(dbp, arg) + DB *dbp; + void *arg; +{ + BTREE *t; + enum { UP, DOWN } dir; + int exact, level, ret; + + t = dbp->internal; + + /* + * The locking protocol we use to avoid deadlock to acquire locks by + * walking down the tree, but we do it as lazily as possible, locking + * the root only as a last resort. We expect all stack pages to have + * been discarded before we're called; we discard all short-term locks. 
+ * + * When __bam_split is first called, we know that a leaf page was too + * full for an insert. We don't know what leaf page it was, but we + * have the key/recno that caused the problem. We call XX_search to + * reacquire the leaf page, but this time get both the leaf page and + * its parent, locked. We then split the leaf page and see if the new + * internal key will fit into the parent page. If it will, we're done. + * + * If it won't, we discard our current locks and repeat the process, + * only this time acquiring the parent page and its parent, locked. + * This process repeats until we succeed in the split, splitting the + * root page as the final resort. The entire process then repeats, + * as necessary, until we split a leaf page. + * + * XXX + * A traditional method of speeding this up is to maintain a stack of + * the pages traversed in the original search. You can detect if the + * stack is correct by storing the page's LSN when it was searched and + * comparing that LSN with the current one when it's locked during the + * split. This would be an easy change for this code, but I have no + * numbers that indicate it's worthwhile. + */ + for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { + /* + * Acquire a page and its parent, locked. + */ + if ((ret = (dbp->type == DB_BTREE ? + __bam_search(dbp, arg, S_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbp, + (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) + return (ret); + + /* Split the page. */ + ret = t->bt_csp[0].page->pgno == PGNO_ROOT ? + __bam_root(dbp, &t->bt_csp[0]) : + __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]); + + switch (ret) { + case 0: + /* Once we've split the leaf page, we're done. */ + if (level == LEAFLEVEL) + return (0); + + /* Switch directions. 
*/ + if (dir == UP) + dir = DOWN; + break; + case DB_NEEDSPLIT: + /* + * It's possible to fail to split repeatedly, as other + * threads may be modifying the tree, or the page usage + * is sufficiently bad that we don't get enough space + * the first time. + */ + if (dir == DOWN) + dir = UP; + break; + default: + return (ret); + } + } + /* NOTREACHED */ +} + +/* + * __bam_root -- + * Split the root page of a btree. + */ +static int +__bam_root(dbp, cp) + DB *dbp; + EPG *cp; +{ + BTREE *t; + PAGE *lp, *rp; + int ret; + + t = dbp->internal; + + /* Yeah, right. */ + if (cp->page->level >= MAXBTREELEVEL) + return (ENOSPC); + + /* Create new left and right pages for the split. */ + lp = rp = NULL; + if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 || + (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + goto err; + P_INIT(lp, dbp->pgsize, lp->pgno, + PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, + cp->page->level, TYPE(cp->page)); + + /* Split the page. */ + if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + DBT __a; + DB_LSN __lsn; + memset(&__a, 0, sizeof(__a)); + __a.data = cp->page; + __a.size = dbp->pgsize; + ZERO_LSN(__lsn); + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), + PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn, + &__a)) != 0) + goto err; + LSN(lp) = LSN(rp) = LSN(cp->page); + } + + /* Clean up the new root page. */ + if ((ret = (dbp->type == DB_RECNO ? + __ram_root(dbp, cp->page, lp, rp) : + __bam_broot(dbp, cp->page, lp, rp))) != 0) + goto err; + + /* Success -- write the real pages back to the store. 
*/ + (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, cp->lock); + (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); + (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + + ++t->lstat.bt_split; + ++t->lstat.bt_rootsplit; + return (0); + +err: if (lp != NULL) + (void)__bam_free(dbp, lp); + if (rp != NULL) + (void)__bam_free(dbp, rp); + (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbp, cp->lock); + return (ret); +} + +/* + * __bam_page -- + * Split the non-root page of a btree. + */ +static int +__bam_page(dbp, pp, cp) + DB *dbp; + EPG *pp, *cp; +{ + BTREE *t; + DB_LOCK tplock; + PAGE *lp, *rp, *tp; + int ret; + + t = dbp->internal; + lp = rp = tp = NULL; + ret = -1; + + /* Create new right page for the split. */ + if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + return (ret); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno, + cp->page->level, TYPE(cp->page)); + + /* Create new left page for the split. */ + if ((lp = (PAGE *)malloc(dbp->pgsize)) == NULL) { + ret = ENOMEM; + goto err; + } +#ifdef DEBUG + memset(lp, 0xff, dbp->pgsize); +#endif + P_INIT(lp, dbp->pgsize, cp->page->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + ZERO_LSN(lp->lsn); + + /* + * Split right. + * + * Only the indices are sorted on the page, i.e., the key/data pairs + * aren't, so it's simpler to copy the data from the split page onto + * two new pages instead of copying half the data to the right page + * and compacting the left page in place. Since the left page can't + * change, we swap the original and the allocated left page after the + * split. + */ + if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0) + goto err; + + /* + * Fix up the previous pointer of any leaf page following the split + * page. + * + * !!! 
+ * There are interesting deadlock situations here as we write-lock a + * page that's not in our direct ancestry. Consider a cursor walking + * through the leaf pages, that has the previous page read-locked and + * is waiting on a lock for the page we just split. It will deadlock + * here. If this is a problem, we can fail in the split; it's not a + * problem as the split will succeed after the cursor passes through + * the page we're splitting. + */ + if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) { + if ((ret = __bam_lget(dbp, + 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0) + goto err; + } + + /* Insert the new pages into the parent page. */ + if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + DBT __a; + DB_LSN __lsn; + memset(&__a, 0, sizeof(__a)); + __a.data = cp->page; + __a.size = dbp->pgsize; + if (tp == NULL) + ZERO_LSN(__lsn); + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page), + &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), + tp == NULL ? 0 : PGNO(tp), + tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0) + goto err; + + LSN(lp) = LSN(rp) = LSN(cp->page); + if (tp != NULL) + LSN(tp) = LSN(cp->page); + } + + /* Copy the allocated page into place. */ + memcpy(cp->page, lp, LOFFSET(lp)); + memcpy((u_int8_t *)cp->page + HOFFSET(lp), + (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); + FREE(lp, dbp->pgsize); + lp = NULL; + + /* Finish the next-page link. */ + if (tp != NULL) + tp->prev_pgno = rp->pgno; + + /* Success -- write the real pages back to the store. 
*/ + (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, pp->lock); + (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, cp->lock); + (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + if (tp != NULL) { + (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, tplock); + } + return (0); + +err: if (lp != NULL) + FREE(lp, dbp->pgsize); + if (rp != NULL) + (void)__bam_free(dbp, rp); + if (tp != NULL) { + (void)memp_fput(dbp->mpf, tp, 0); + (void)__BT_TLPUT(dbp, tplock); + } + (void)memp_fput(dbp->mpf, pp->page, 0); + (void)__BT_TLPUT(dbp, pp->lock); + (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbp, cp->lock); + return (ret); +} + +/* + * __bam_broot -- + * Fix up the btree root page after it has been split. + * + * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); + */ +int +__bam_broot(dbp, rootp, lp, rp) + DB *dbp; + PAGE *rootp, *lp, *rp; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk; + DBT hdr, data; + int ret; + + /* + * If the root page was a leaf page, change it into an internal page. + * We copy the key we split on (but not the key's data, in the case of + * a leaf page) to the new root page. + */ + P_INIT(rootp, dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE); + + /* + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. 
+ */ + bi.len = 0; + bi.deleted = 0; + bi.type = B_KEYDATA; + bi.pgno = lp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(lp); + RE_NREC_SET(rootp, bi.nrecs); + } + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = (char *) ""; + data.size = 0; + if ((ret = + __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, &data)) != 0) + return (ret); + + switch (TYPE(rp)) { + case P_IBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bi = GET_BINTERNAL(rp, 0); + + bi.len = child_bi->len; + bi.deleted = 0; + bi.type = child_bi->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bi->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0) + return (ret); + break; + case P_LBTREE: + /* Copy the first key of the child page onto the root page. 
*/ + child_bk = GET_BKEYDATA(rp, 0); + switch (child_bk->type) { + case B_KEYDATA: + bi.len = child_bk->len; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk->data; + data.size = child_bk->len; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bi.len = BOVERFLOW_SIZE; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bk->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + return (0); +} + +/* + * __ram_root -- + * Fix up the recno root page after it has been split. + * + * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); + */ +int +__ram_root(dbp, rootp, lp, rp) + DB *dbp; + PAGE *rootp, *lp, *rp; +{ + DBT hdr; + RINTERNAL ri; + int ret; + + /* Initialize the page. */ + P_INIT(rootp, dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); + + /* Initialize the header. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + + /* Insert the left and right keys, set the header information. 
*/ + ri.pgno = lp->pgno; + ri.nrecs = __bam_total(lp); + if ((ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_SET(rootp, ri.nrecs); + ri.pgno = rp->pgno; + ri.nrecs = __bam_total(rp); + if ((ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_ADJ(rootp, ri.nrecs); + return (0); +} + +/* + * __bam_pinsert -- + * Insert a new key into a parent page, completing the split. + */ +static int +__bam_pinsert(dbp, parent, lchild, rchild) + DB *dbp; + EPG *parent; + PAGE *lchild, *rchild; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk, *tmp_bk; + BTREE *t; + DBT a, b, hdr, data; + PAGE *ppage; + RINTERNAL ri; + db_indx_t off; + db_recno_t nrecs; + u_int32_t n, nbytes, nksize; + int ret; + + t = dbp->internal; + ppage = parent->page; + + /* If handling record numbers, count records split to the right page. */ + nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ? + __bam_total(rchild) : 0; + + /* + * Now we insert the new page's first key into the parent page, which + * completes the split. The parent points to a PAGE and a page index + * offset, where the new key goes ONE AFTER the index, because we split + * to the right. + * + * XXX + * Some btree algorithms replace the key for the old page as well as + * the new page. We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. + */ + off = parent->indx + O_INDX; + + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack used when inserting into BINTERNAL pages. + * Retain only what's needed to distinguish between the new entry and + * the LAST entry on the page to its left. If the keys compare equal, + * retain the entire key. 
We ignore overflow keys, and the entire key + * must be retained for the next-to-leftmost key on the leftmost page + * of each level, or the search will fail. Applicable ONLY to internal + * pages that have leaf pages as children. Further reduction of the + * key between pairs of internal pages loses too much information. + */ + switch (TYPE(rchild)) { + case P_IBTREE: + child_bi = GET_BINTERNAL(rchild, 0); + nbytes = BINTERNAL_PSIZE(child_bi->len); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + /* Add a new record for the right page. */ + bi.len = child_bi->len; + bi.deleted = 0; + bi.type = child_bi->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. 
*/ + if (child_bi->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0) + return (ret); + break; + case P_LBTREE: + child_bk = GET_BKEYDATA(rchild, 0); + switch (child_bk->type) { + case B_KEYDATA: + nbytes = BINTERNAL_PSIZE(child_bk->len); + nksize = child_bk->len; + if (t->bt_prefix == NULL) + goto noprefix; + if (ppage->prev_pgno == PGNO_INVALID && off <= 1) + goto noprefix; + tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX); + if (tmp_bk->type != B_KEYDATA) + goto noprefix; + memset(&a, 0, sizeof(a)); + a.size = tmp_bk->len; + a.data = tmp_bk->data; + memset(&b, 0, sizeof(b)); + b.size = child_bk->len; + b.data = child_bk->data; + nksize = t->bt_prefix(&a, &b); + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) { + t->lstat.bt_pfxsaved += nbytes - n; + nbytes = n; + } else +noprefix: nksize = child_bk->len; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + bi.len = nksize; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk->data; + data.size = nksize; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + bi.len = BOVERFLOW_SIZE; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. 
*/ + if (child_bk->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + break; + case P_IRECNO: + case P_LRECNO: + nbytes = RINTERNAL_PSIZE; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + /* Add a new record for the right page. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + ri.pgno = rchild->pgno; + ri.nrecs = nrecs; + if ((ret = __db_pitem(dbp, + ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + + /* Adjust the parent page's left page record count. */ + if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __bam_cadjust_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(ppage), 0, dbp->log_fileid, + PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx, + -(int32_t)nrecs, (int32_t)0)) != 0) + return (ret); + + /* Update the left page count. */ + if (dbp->type == DB_RECNO) + GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + else + GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + } + + return (0); +} + +/* + * __bam_psplit -- + * Do the real work of splitting the page. + */ +static int +__bam_psplit(dbp, cp, lp, rp, cleft) + DB *dbp; + EPG *cp; + PAGE *lp, *rp; + int cleft; +{ + BTREE *t; + PAGE *pp; + db_indx_t half, nbytes, off, splitp, top; + int adjust, cnt, isbigkey, ret; + + t = dbp->internal; + pp = cp->page; + adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; + + /* + * If we're splitting the first (last) page on a level because we're + * inserting (appending) a key to it, it's likely that the data is + * sorted. Moving a single item to the new page is less work and can + * push the fill factor higher than normal. If we're wrong it's not + * a big deal, we'll just do the split the right way next time. 
+ */ + off = 0; + if (NEXT_PGNO(pp) == PGNO_INVALID && + ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) || + (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page)))) + off = NUM_ENT(cp->page) - adjust; + else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) + off = adjust; + + ++t->lstat.bt_split; + if (off != 0) { + ++t->lstat.bt_fastsplit; + goto sort; + } + + /* + * Split the data to the left and right pages. Try not to split on + * an overflow key. (Overflow keys on internal pages will slow down + * searches.) Refuse to split in the middle of a set of duplicates. + * + * First, find the optimum place to split. + * + * It's possible to try and split past the last record on the page if + * there's a very large record at the end of the page. Make sure this + * doesn't happen by bounding the check at the next-to-last entry on + * the page. + * + * Note, we try and split half the data present on the page. This is + * because another process may have already split the page and left + * it half empty. We don't try and skip the split -- we don't know + * how much space we're going to need on the page, and we may need up + * to half the page for a big item, so there's no easy test to decide + * if we need to split or not. Besides, if two threads are inserting + * data into the same place in the database, we're probably going to + * need more space soon anyway. 
+ */ + top = NUM_ENT(pp) - adjust; + half = (dbp->pgsize - HOFFSET(pp)) / 2; + for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) + switch (TYPE(pp)) { + case P_IBTREE: + if (GET_BINTERNAL(pp, off)->type == B_KEYDATA) + nbytes += + BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len); + else + nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + if (GET_BKEYDATA(pp, off)->type == B_KEYDATA) + nbytes += + BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; + + ++off; + if (GET_BKEYDATA(pp, off)->type == B_KEYDATA) + nbytes += + BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes += RINTERNAL_SIZE; + break; + case P_LRECNO: + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + break; + default: + return (__db_pgfmt(dbp, pp->pgno)); + } +sort: splitp = off; + + /* + * Splitp is either at or just past the optimum split point. If + * it's a big key, try and find something close by that's not. + */ + if (TYPE(pp) == P_IBTREE) + isbigkey = GET_BINTERNAL(pp, off)->type != B_KEYDATA; + else if (TYPE(pp) == P_LBTREE) + isbigkey = GET_BKEYDATA(pp, off)->type != B_KEYDATA; + else + isbigkey = 0; + if (isbigkey) + for (cnt = 1; cnt <= 3; ++cnt) { + off = splitp + cnt * adjust; + if (off < (db_indx_t)NUM_ENT(pp) && + ((TYPE(pp) == P_IBTREE && + GET_BINTERNAL(pp, off)->type == B_KEYDATA) || + GET_BKEYDATA(pp, off)->type == B_KEYDATA)) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (TYPE(pp) == P_IBTREE ? + GET_BINTERNAL(pp, off)->type == B_KEYDATA : + GET_BKEYDATA(pp, off)->type == B_KEYDATA) { + splitp = off; + break; + } + } + + /* + * We can't split in the middle a set of duplicates. We know that + * no duplicate set can take up more than about 25% of the page, + * because that's the point where we push it off onto a duplicate + * page set. So, this loop can't be unbounded. 
+ */ + if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE && + pp->inp[splitp] == pp->inp[splitp - adjust]) + for (cnt = 1;; ++cnt) { + off = splitp + cnt * adjust; + if (off < NUM_ENT(pp) && + pp->inp[splitp] != pp->inp[off]) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (pp->inp[splitp] != pp->inp[off]) { + splitp = off + adjust; + break; + } + } + + + /* We're going to split at splitp. */ + if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) + return (ret); + if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) + return (ret); + + /* Adjust the cursors. */ + __bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft); + return (0); +} + +/* + * __bam_copy -- + * Copy a set of records from one page to another. + * + * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); + */ +int +__bam_copy(dbp, pp, cp, nxt, stop) + DB *dbp; + PAGE *pp, *cp; + u_int32_t nxt, stop; +{ + db_indx_t dup, nbytes, off; + + /* + * Copy the rest of the data to the right page. Nxt is the next + * offset placed on the target page. + */ + for (dup = off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { + switch (TYPE(pp)) { + case P_IBTREE: + if (GET_BINTERNAL(pp, nxt)->type == B_KEYDATA) + nbytes = + BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len); + else + nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + /* + * If we're on a key and it's a duplicate, just copy + * the offset. 
+ */ + if (off != 0 && (nxt % P_INDX) == 0 && + pp->inp[nxt] == pp->inp[nxt - P_INDX]) { + cp->inp[off] = cp->inp[off - P_INDX]; + continue; + } + /* FALLTHROUGH */ + case P_LRECNO: + if (GET_BKEYDATA(pp, nxt)->type == B_KEYDATA) + nbytes = + BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len); + else + nbytes = BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + default: + return (__db_pgfmt(dbp, pp->pgno)); + } + cp->inp[off] = HOFFSET(cp) -= nbytes; + memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes); + } + return (0); +} diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c new file mode 100644 index 0000000000..ba71ea616d --- /dev/null +++ b/db2/btree/bt_stat.c @@ -0,0 +1,257 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_stat.c 10.11 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); + +/* + * __bam_stat -- + * Gather/print the btree statistics + * + * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), int)); + */ +int +__bam_stat(argdbp, spp, db_malloc, flags) + DB *argdbp; + void *spp; + void *(*db_malloc) __P((size_t)); + int flags; +{ + BTMETA *meta; + BTREE *t; + DB *dbp; + DB_BTREE_STAT *sp; + DB_LOCK lock; + PAGE *h; + db_pgno_t lastpgno, pgno; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_statchk(argdbp, flags)) != 0) + return (ret); + + if (spp == NULL) + return (0); + + GETHANDLE(argdbp, NULL, &dbp, ret); + t = dbp->internal; + + /* Allocate and clear the structure. */ + if ((sp = db_malloc == NULL ? 
+ (DB_BTREE_STAT *)malloc(sizeof(*sp)) : + (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) { + ret = ENOMEM; + goto err; + } + memset(sp, 0, sizeof(*sp)); + + /* If the app just wants the record count, make it fast. */ + if (LF_ISSET(DB_RECORDCOUNT)) { + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0) { + /* Don't leak the root page lock on failure. */ + (void)__BT_LPUT(dbp, lock); + goto err; + } + + sp->bt_nrecs = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + goto done; + } + + /* Get the meta-data page. */ + pgno = PGNO_METADATA; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) { + /* Don't leak the meta-data page lock on failure. */ + (void)__BT_TLPUT(dbp, lock); + goto err; + } + + /* Translate the metadata flags. */ + if (F_ISSET(meta, BTM_DUP)) + sp->bt_flags |= DB_DUP; + if (F_ISSET(meta, BTM_FIXEDLEN)) + sp->bt_flags |= DB_FIXEDLEN; + if (F_ISSET(meta, BTM_RECNUM)) + sp->bt_flags |= DB_RECNUM; + if (F_ISSET(meta, BTM_RENUMBER)) + sp->bt_flags |= DB_RENUMBER; + + /* + * Get the maxkey, minkey, re_len and re_pad fields from the + * metadata. + */ + sp->bt_minkey = meta->minkey; + sp->bt_maxkey = meta->maxkey; + sp->bt_re_len = meta->re_len; + sp->bt_re_pad = meta->re_pad; + + /* Get the page size from the DB. */ + sp->bt_pagesize = dbp->pgsize; + + /* Initialize counters with the meta-data page information. */ + __bam_add_rstat(&meta->stat, sp); + + /* + * Add in the local information from this handle. + * + * !!! + * This is a bit odd, but it gets us closer to the truth. + */ + __bam_add_rstat(&t->lstat, sp); + + /* Walk the free list, counting pages. */ + for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) { + ++sp->bt_free; + + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + (void)memp_fput(dbp->mpf, meta, 0); + (void)__BT_TLPUT(dbp, lock); + goto err; + } + pgno = h->next_pgno; + (void)memp_fput(dbp->mpf, h, 0); + } + + /* Discard the meta-data page. 
*/ + (void)memp_fput(dbp->mpf, meta, 0); + (void)__BT_TLPUT(dbp, lock); + + /* + * Determine the last page of the database. This has to happen before + * the root page is pinned: memp_fget() returns the last page through + * h, so doing it afterwards would drop the only reference to the root + * page and leave the walk below examining, and then releasing a + * second time, a page that has already been released. + */ + if ((ret = memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &h)) != 0) + goto err; + (void)memp_fput(dbp->mpf, h, 0); + + /* Get the root page. */ + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + goto err; + } + + /* Get the levels from the root page. */ + sp->bt_levels = h->level; + + /* + * Walk the database from the root page to the last page, counting + * things. At the top of each iteration page pgno is pinned in h and + * read-locked through lock. + */ + for (;;) { + switch (TYPE(h)) { + case P_INVALID: + break; + case P_IBTREE: + case P_IRECNO: + ++sp->bt_int_pg; + sp->bt_int_pgfree += HOFFSET(h) - LOFFSET(h); + break; + case P_LBTREE: + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h); + sp->bt_nrecs += NUM_ENT(h) / P_INDX; + break; + case P_LRECNO: + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h); + sp->bt_nrecs += NUM_ENT(h); + break; + case P_DUPLICATE: + ++sp->bt_dup_pg; + /* XXX MARGO: sp->bt_dup_pgfree; */ + break; + case P_OVERFLOW: + ++sp->bt_over_pg; + /* XXX MARGO: sp->bt_over_pgfree; */ + break; + default: + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + /* Leave through err so the handle is released. */ + ret = __db_pgfmt(dbp, pgno); + goto err; + } + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + + if (++pgno > lastpgno) + break; + if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) + break; + if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) { + (void)__BT_LPUT(dbp, lock); + break; + } + } + +done: *(DB_BTREE_STAT **)spp = sp; + sp = NULL; /* Ownership passes to the caller. */ + ret = 0; + +err: /* + * Release a statistics structure that was never handed back. Memory + * obtained from an application-supplied db_malloc cannot be released + * here because the matching deallocator is unknown. + */ + if (sp != NULL && db_malloc == NULL) + free(sp); + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_add_mstat -- + * Add the local statistics to the meta-data page statistics. 
+ * + * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); + */ +void +__bam_add_mstat(from, to) + DB_BTREE_LSTAT *from; + DB_BTREE_LSTAT *to; +{ + to->bt_freed += from->bt_freed; + to->bt_pfxsaved += from->bt_pfxsaved; + to->bt_split += from->bt_split; + to->bt_rootsplit += from->bt_rootsplit; + to->bt_fastsplit += from->bt_fastsplit; + to->bt_added += from->bt_added; + to->bt_deleted += from->bt_deleted; + to->bt_get += from->bt_get; + to->bt_cache_hit += from->bt_cache_hit; + to->bt_cache_miss += from->bt_cache_miss; +} + +/* + * __bam_add_rstat -- + * Add the local statistics to the returned statistics. + */ +static void +__bam_add_rstat(from, to) + DB_BTREE_LSTAT *from; + DB_BTREE_STAT *to; +{ + to->bt_freed += from->bt_freed; + to->bt_pfxsaved += from->bt_pfxsaved; + to->bt_split += from->bt_split; + to->bt_rootsplit += from->bt_rootsplit; + to->bt_fastsplit += from->bt_fastsplit; + to->bt_added += from->bt_added; + to->bt_deleted += from->bt_deleted; + to->bt_get += from->bt_get; + to->bt_cache_hit += from->bt_cache_hit; + to->bt_cache_miss += from->bt_cache_miss; +} diff --git a/db2/btree/btree.src b/db2/btree/btree.src new file mode 100644 index 0000000000..50cc0dd0ff --- /dev/null +++ b/db2/btree/btree.src @@ -0,0 +1,137 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)btree.src 10.3 (Sleepycat) 8/17/97"; +#endif /* not lint */ + +PREFIX bam + +/* + * BTREE-pg_alloc: used to record allocating a new page. + * + * meta_lsn: the meta-data page's original lsn. + * page_lsn: the allocated page's original lsn. + * pgno: the page allocated. + * next: the next page on the free list. 
+ */ +BEGIN pg_alloc +ARG fileid u_int32_t lu +POINTER meta_lsn DB_LSN * lu +POINTER page_lsn DB_LSN * lu +ARG pgno db_pgno_t lu +ARG ptype u_int32_t lu +ARG next db_pgno_t lu +END + +/* + * BTREE-pg_free: used to record freeing a page. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * header: the header from the free'd page. + * next: the previous next pointer on the metadata page. + */ +BEGIN pg_free +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +DBT header DBT s +ARG next db_pgno_t lu +END + +/* + * BTREE-split: used to log a page split. + * + * left: the page number for the low-order contents. + * llsn: the left page's original LSN. + * right: the page number for the high-order contents. + * rlsn: the right page's original LSN. + * indx: the number of entries that went to the left page. + * npgno: the next page number + * nlsn: the next page's original LSN (or 0 if no next page). + * pg: the split page's contents before the split. + */ +BEGIN split +ARG fileid u_int32_t lu +ARG left db_pgno_t lu +POINTER llsn DB_LSN * lu +ARG right db_pgno_t lu +POINTER rlsn DB_LSN * lu +ARG indx u_int32_t lu +ARG npgno db_pgno_t lu +POINTER nlsn DB_LSN * lu +DBT pg DBT s +END + +/* + * BTREE-rsplit: used to log a reverse-split + * + * pgno: the page number of the page copied over the root. + * pgdbt: the page being copied on the root page. + * rootent: last entry on the root page. + * rootlsn: the root page's original lsn. + */ +BEGIN rsplit +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +DBT pgdbt DBT s +DBT rootent DBT s +POINTER rootlsn DB_LSN * lu +END + +/* + * BTREE-adj: used to log the adjustment of an index. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index adjusted. + * indx_copy: the index to copy if inserting. + * is_insert: 0 if a delete, 1 if an insert. 
+ */ +BEGIN adj +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG indx_copy u_int32_t lu +ARG is_insert u_int32_t lu +END + +/* + * BTREE-cadjust: used to adjust the count change in an internal page. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be adjusted. + * adjust: the signed adjustment. + * total: if the total tree entries count should be adjusted + */ +BEGIN cadjust +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG adjust int32_t ld +ARG total int32_t ld +END + +/* + * BTREE-cdel: used to log the intent-to-delete of a cursor record. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be deleted. + */ +BEGIN cdel +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +END diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c new file mode 100644 index 0000000000..e6b72252e5 --- /dev/null +++ b/db2/btree/btree_auto.c @@ -0,0 +1,1279 @@ +/* Do not edit: automatically built by dist/db_gen.sh. 
*/ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "btree.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __bam_pg_alloc_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t, + * PUBLIC: u_int32_t, db_pgno_t)); + */ +int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, + fileid, meta_lsn, page_lsn, pgno, ptype, next) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + DB_LSN * meta_lsn; + DB_LSN * page_lsn; + db_pgno_t pgno; + u_int32_t ptype; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_alloc; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(*meta_lsn) + + sizeof(*page_lsn) + + sizeof(pgno) + + sizeof(ptype) + + sizeof(next); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (page_lsn != NULL) + memcpy(bp, page_lsn, sizeof(*page_lsn)); + else + memset(bp, 0, sizeof(*page_lsn)); + bp += sizeof(*page_lsn); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ptype, 
sizeof(ptype)); + bp += sizeof(ptype); + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_pg_alloc_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_pg_alloc_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_pg_alloc_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tptype: %lu\n", (u_long)argp->ptype); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_pg_alloc_read __P((void *, __bam_pg_alloc_args **)); + */ +int +__bam_pg_alloc_read(recbuf, argpp) + void *recbuf; + __bam_pg_alloc_args **argpp; +{ + __bam_pg_alloc_args *argp; + u_int8_t *bp; + + argp = (__bam_pg_alloc_args *)malloc(sizeof(__bam_pg_alloc_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += 
sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); + bp += sizeof(argp->page_lsn); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ptype, bp, sizeof(argp->ptype)); + bp += sizeof(argp->ptype); + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_pg_free_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, DBT *, + * PUBLIC: db_pgno_t)); + */ +int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, meta_lsn, header, next) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * meta_lsn; + DBT *header; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_free; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + (header == NULL ? 
0 : header->size) + + sizeof(next); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (header == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &header->size, sizeof(header->size)); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + bp += header->size; + } + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_pg_free_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_pg_free_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_pg_free_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + 
printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + c = ((char *)argp->header.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_pg_free_read __P((void *, __bam_pg_free_args **)); + */ +int +__bam_pg_free_read(recbuf, argpp) + void *recbuf; + __bam_pg_free_args **argpp; +{ + __bam_pg_free_args *argp; + u_int8_t *bp; + + argp = (__bam_pg_free_args *)malloc(sizeof(__bam_pg_free_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->header.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_split_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: DB_LSN *, u_int32_t, db_pgno_t, DB_LSN *, + * PUBLIC: DBT *)); + */ +int __bam_split_log(logp, txnid, ret_lsnp, flags, + fileid, left, llsn, right, rlsn, indx, + npgno, nlsn, pg) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t left; + DB_LSN * llsn; + db_pgno_t 
right; + DB_LSN * rlsn; + u_int32_t indx; + db_pgno_t npgno; + DB_LSN * nlsn; + DBT *pg; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_split; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(left) + + sizeof(*llsn) + + sizeof(right) + + sizeof(*rlsn) + + sizeof(indx) + + sizeof(npgno) + + sizeof(*nlsn) + + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &left, sizeof(left)); + bp += sizeof(left); + if (llsn != NULL) + memcpy(bp, llsn, sizeof(*llsn)); + else + memset(bp, 0, sizeof(*llsn)); + bp += sizeof(*llsn); + memcpy(bp, &right, sizeof(right)); + bp += sizeof(right); + if (rlsn != NULL) + memcpy(bp, rlsn, sizeof(*rlsn)); + else + memset(bp, 0, sizeof(*rlsn)); + bp += sizeof(*rlsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &npgno, sizeof(npgno)); + bp += sizeof(npgno); + if (nlsn != NULL) + memcpy(bp, nlsn, sizeof(*nlsn)); + else + memset(bp, 0, sizeof(*nlsn)); + bp += sizeof(*nlsn); + if (pg == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pg->size, sizeof(pg->size)); + bp += sizeof(pg->size); + memcpy(bp, pg->data, pg->size); + bp += pg->size; + } +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, 
ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_split_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_split_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_split_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_split_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tleft: %lu\n", (u_long)argp->left); + printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + printf("\tright: %lu\n", (u_long)argp->right); + printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tnpgno: %lu\n", (u_long)argp->npgno); + printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + c = ((char *)argp->pg.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_split_read __P((void *, __bam_split_args **)); + */ +int +__bam_split_read(recbuf, argpp) + void *recbuf; + __bam_split_args **argpp; +{ + __bam_split_args *argp; + u_int8_t *bp; + + argp = (__bam_split_args *)malloc(sizeof(__bam_split_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += 
sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->left, bp, sizeof(argp->left)); + bp += sizeof(argp->left); + memcpy(&argp->llsn, bp, sizeof(argp->llsn)); + bp += sizeof(argp->llsn); + memcpy(&argp->right, bp, sizeof(argp->right)); + bp += sizeof(argp->right); + memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); + bp += sizeof(argp->rlsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->npgno, bp, sizeof(argp->npgno)); + bp += sizeof(argp->npgno); + memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); + bp += sizeof(argp->nlsn); + memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_rsplit_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DBT *, DBT *, + * PUBLIC: DB_LSN *)); + */ +int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, pgdbt, rootent, rootlsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DBT *pgdbt; + DBT *rootent; + DB_LSN * rootlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_rsplit; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size) + + sizeof(u_int32_t) + (rootent == NULL ? 
0 : rootent->size) + + sizeof(*rootlsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pgdbt == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pgdbt->size, sizeof(pgdbt->size)); + bp += sizeof(pgdbt->size); + memcpy(bp, pgdbt->data, pgdbt->size); + bp += pgdbt->size; + } + if (rootent == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &rootent->size, sizeof(rootent->size)); + bp += sizeof(rootent->size); + memcpy(bp, rootent->data, rootent->size); + bp += rootent->size; + } + if (rootlsn != NULL) + memcpy(bp, rootlsn, sizeof(*rootlsn)); + else + memset(bp, 0, sizeof(*rootlsn)); + bp += sizeof(*rootlsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_rsplit_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_rsplit_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_rsplit_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, 
+ (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpgdbt: "); + for (i = 0; i < argp->pgdbt.size; i++) { + c = ((char *)argp->pgdbt.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\trootent: "); + for (i = 0; i < argp->rootent.size; i++) { + c = ((char *)argp->rootent.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\trootlsn: [%lu][%lu]\n", + (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_rsplit_read __P((void *, __bam_rsplit_args **)); + */ +int +__bam_rsplit_read(recbuf, argpp) + void *recbuf; + __bam_rsplit_args **argpp; +{ + __bam_rsplit_args *argp; + u_int8_t *bp; + + argp = (__bam_rsplit_args *)malloc(sizeof(__bam_rsplit_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pgdbt.data = bp; + bp += argp->pgdbt.size; + memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->rootent.data = bp; + bp += argp->rootent.size; + memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); + bp += sizeof(argp->rootlsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_adj_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, 
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t)); + */ +int __bam_adj_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, indx_copy, is_insert) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t indx_copy; + u_int32_t is_insert; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_adj; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(indx_copy) + + sizeof(is_insert); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &indx_copy, sizeof(indx_copy)); + bp += sizeof(indx_copy); + memcpy(bp, &is_insert, sizeof(is_insert)); + bp += sizeof(is_insert); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_adj_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int 
+__bam_adj_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_adj_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_adj_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); + printf("\tis_insert: %lu\n", (u_long)argp->is_insert); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_adj_read __P((void *, __bam_adj_args **)); + */ +int +__bam_adj_read(recbuf, argpp) + void *recbuf; + __bam_adj_args **argpp; +{ + __bam_adj_args *argp; + u_int8_t *bp; + + argp = (__bam_adj_args *)malloc(sizeof(__bam_adj_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->indx_copy, bp, sizeof(argp->indx_copy)); + bp += sizeof(argp->indx_copy); + memcpy(&argp->is_insert, 
bp, sizeof(argp->is_insert)); + bp += sizeof(argp->is_insert); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_cadjust_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: int32_t, int32_t)); + */ +int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, adjust, total) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + int32_t adjust; + int32_t total; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cadjust; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(adjust) + + sizeof(total); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &adjust, sizeof(adjust)); + bp += sizeof(adjust); + memcpy(bp, &total, sizeof(total)); + bp += sizeof(total); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + 
free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_cadjust_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_cadjust_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_cadjust_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tadjust: %ld\n", (long)argp->adjust); + printf("\ttotal: %ld\n", (long)argp->total); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_cadjust_read __P((void *, __bam_cadjust_args **)); + */ +int +__bam_cadjust_read(recbuf, argpp) + void *recbuf; + __bam_cadjust_args **argpp; +{ + __bam_cadjust_args *argp; + u_int8_t *bp; + + argp = (__bam_cadjust_args *)malloc(sizeof(__bam_cadjust_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + 
memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->adjust, bp, sizeof(argp->adjust)); + bp += sizeof(argp->adjust); + memcpy(&argp->total, bp, sizeof(argp->total)); + bp += sizeof(argp->total); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_cdel_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t)); + */ +int __bam_cdel_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cdel; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_cdel_print + * PUBLIC: 
__P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_cdel_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_cdel_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_cdel_read __P((void *, __bam_cdel_args **)); + */ +int +__bam_cdel_read(recbuf, argpp) + void *recbuf; + __bam_cdel_args **argpp; +{ + __bam_cdel_args *argp; + u_int8_t *bp; + + argp = (__bam_cdel_args *)malloc(sizeof(__bam_cdel_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_init_print __P((DB_ENV *)); + */ +int +__bam_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + 
if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_print, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_print, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_print, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_print, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_print, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_print, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_print, DB_bam_cdel)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __bam_init_recover __P((DB_ENV *)); + */ +int +__bam_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_recover, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_recover, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_recover, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_recover, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_recover, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_recover, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_recover, DB_bam_cdel)) != 0) + return (ret); + return (0); +} + diff --git a/db2/clib/getlong.c b/db2/clib/getlong.c new file mode 100644 index 0000000000..d79c6846df --- /dev/null +++ b/db2/clib/getlong.c @@ -0,0 +1,48 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
/*
 * get_long --
 *	Parse p as a base-10 long and store it through storep, provided it
 *	lies within [min, max].
 *
 *	Any failure is fatal: the process exits with status 1 via err() if
 *	strtol() reports an out-of-range value, or via errx() if p is empty,
 *	has trailing non-numeric characters, or is outside [min, max].
 *
 * PUBLIC: void get_long __P((char *, long, long, long *));
 */
void
get_long(p, min, max, storep)
	char *p;
	long min, max, *storep;
{
	char *stop;
	long parsed;

	/* strtol() only sets errno on failure, so clear it first. */
	errno = 0;
	parsed = strtol(p, &stop, 10);

	if (errno == ERANGE && (parsed == LONG_MAX || parsed == LONG_MIN))
		err(1, "%s", p);

	/* Reject the empty string and anything with trailing garbage. */
	if (*p == '\0' || *stop != '\0')
		errx(1, "%s: Invalid numeric argument", p);

	if (parsed < min)
		errx(1, "%s: Less than minimum value (%ld)", p, min);
	if (parsed > max)
		errx(1, "%s: Greater than maximum value (%ld)", p, max);

	*storep = parsed;
}
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_appinit.c 10.27 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/param.h> +#include <sys/stat.h> + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "btree.h" +#include "hash.h" +#include "log.h" +#include "txn.h" +#include "clib_ext.h" +#include "common_ext.h" + +static int __db_home __P((DB_ENV *, const char *, int)); +static int __db_parse __P((DB_ENV *, char *)); +static int __db_tmp_dir __P((DB_ENV *, int)); +static int __db_tmp_open __P((DB_ENV *, char *, int *)); + +/* + * db_version -- + * Return verision information. + */ +const char * +db_version(majverp, minverp, patchp) + int *majverp, *minverp, *patchp; +{ + if (majverp != NULL) + *majverp = DB_VERSION_MAJOR; + if (minverp != NULL) + *minverp = DB_VERSION_MINOR; + if (patchp != NULL) + *patchp = DB_VERSION_PATCH; + return (DB_VERSION_STRING); +} + +/* + * db_appinit -- + * Initialize the application environment. + */ +int +db_appinit(db_home, db_config, dbenv, flags) + const char *db_home; + char * const *db_config; + DB_ENV *dbenv; + int flags; +{ + FILE *fp; + int i_lock, i_log, i_mpool, i_txn, ret; + char *lp, **p, buf[MAXPATHLEN * 2]; + + /* Validate arguments. 
*/ + if (dbenv == NULL) + return (EINVAL); +#ifdef HAVE_SPINLOCKS +#define OKFLAGS \ + (DB_CREATE | DB_NOMMAP | DB_THREAD | DB_INIT_LOCK | DB_INIT_LOG | \ + DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER | \ + DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) +#else +#define OKFLAGS \ + (DB_CREATE | DB_NOMMAP | DB_INIT_LOCK | DB_INIT_LOG | \ + DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER | \ + DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) +#endif + if ((ret = __db_fchk(dbenv, "db_appinit", flags, OKFLAGS)) != 0) + return (ret); + +#define RECOVERY_FLAGS (DB_CREATE | DB_INIT_TXN | DB_INIT_LOG) + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && + LF_ISSET(RECOVERY_FLAGS) != RECOVERY_FLAGS) + return (__db_ferr(dbenv, "db_appinit", 1)); + + fp = NULL; + i_lock = i_log = i_mpool = i_txn = 0; + + /* Set the database home. */ + if ((ret = __db_home(dbenv, db_home, flags)) != 0) + goto err; + + /* Parse the config array. */ + for (p = (char **)db_config; p != NULL && *p != NULL; ++p) + if ((ret = __db_parse(dbenv, *p)) != 0) + goto err; + + /* Parse the config file. */ + if (dbenv->db_home != NULL) { + (void)snprintf(buf, + sizeof(buf), "%s/DB_CONFIG", dbenv->db_home); + if ((fp = fopen(buf, "r")) != NULL) { + while (fgets(buf, sizeof(buf), fp) != NULL) { + if ((lp = strchr(buf, '\n')) != NULL) + *lp = '\0'; + if ((ret = __db_parse(dbenv, buf)) != 0) + goto err; + } + (void)fclose(fp); + } + } + + /* Set up the tmp directory path. */ + if (dbenv->db_tmp_dir == NULL && + (ret = __db_tmp_dir(dbenv, flags)) != 0) + goto err; + + /* Indicate that the path names have been set. */ + F_SET(dbenv, DB_APP_INIT); + + /* + * If we are doing recovery, remove all the regions. + */ + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) { + /* Remove all the old shared memory regions. 
*/ + if ((ret = log_unlink(NULL, 1 /* force */, dbenv)) != 0) + goto err; + if ((ret = memp_unlink(NULL, 1 /* force */, dbenv)) != 0) + goto err; + if ((ret = lock_unlink(NULL, 1 /* force */, dbenv)) != 0) + goto err; + if ((ret = txn_unlink(NULL, 1 /* force */, dbenv)) != 0) + goto err; + } + + /* Transactions imply logging. */ + if (LF_ISSET(DB_INIT_TXN)) + LF_SET(DB_INIT_LOG); + + /* Default permissions are 0660. */ +#undef DB_DEFPERM +#define DB_DEFPERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) + + /* Initialize the subsystems. */ + if (LF_ISSET(DB_INIT_LOCK)) { + if ((ret = lock_open(NULL, + LF_ISSET(DB_CREATE | DB_THREAD), + DB_DEFPERM, dbenv, &dbenv->lk_info)) != 0) + goto err; + i_lock = 1; + } + if (LF_ISSET(DB_INIT_LOG)) { + if ((ret = log_open(NULL, + LF_ISSET(DB_CREATE | DB_THREAD), + DB_DEFPERM, dbenv, &dbenv->lg_info)) != 0) + goto err; + i_log = 1; + } + if (LF_ISSET(DB_INIT_MPOOL)) { + if ((ret = memp_open(NULL, + LF_ISSET(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD), + DB_DEFPERM, dbenv, &dbenv->mp_info)) != 0) + goto err; + i_mpool = 1; + } + if (LF_ISSET(DB_INIT_TXN)) { + if ((ret = txn_open(NULL, + LF_ISSET(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC), + DB_DEFPERM, dbenv, &dbenv->tx_info)) != 0) + goto err; + i_txn = 1; + } + + /* Initialize recovery. */ + if (LF_ISSET(DB_INIT_TXN)) { + if ((ret = __bam_init_recover(dbenv)) != 0) + goto err; + if ((ret = __db_init_recover(dbenv)) != 0) + goto err; + if ((ret = __ham_init_recover(dbenv)) != 0) + goto err; + if ((ret = __log_init_recover(dbenv)) != 0) + goto err; + if ((ret = __txn_init_recover(dbenv)) != 0) + goto err; + } + + /* Now run recovery if necessary. 
*/ + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && (ret = + __db_apprec(dbenv, LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0) + goto err; + + return (ret); + +err: if (fp != NULL) + (void)fclose(fp); + if (i_lock) + (void)lock_close(dbenv->lk_info); + if (i_log) + (void)log_close(dbenv->lg_info); + if (i_mpool) + (void)memp_close(dbenv->mp_info); + if (i_txn) + (void)txn_close(dbenv->tx_info); + + (void)db_appexit(dbenv); + return (ret); +} + +/* + * db_appexit -- + * Close down the default application environment. + */ +int +db_appexit(dbenv) + DB_ENV *dbenv; +{ + int ret, t_ret; + char **p; + + ret = 0; + + /* Close subsystems. */ + if (dbenv->tx_info && (t_ret = txn_close(dbenv->tx_info)) != 0) + if (ret == 0) + ret = t_ret; + if (dbenv->mp_info && (t_ret = memp_close(dbenv->mp_info)) != 0) + if (ret == 0) + ret = t_ret; + if (dbenv->lg_info && (t_ret = log_close(dbenv->lg_info)) != 0) + if (ret == 0) + ret = t_ret; + if (dbenv->lk_info && (t_ret = lock_close(dbenv->lk_info)) != 0) + if (ret == 0) + ret = t_ret; + + /* Free allocated memory. */ + if (dbenv->db_home != NULL) + FREES(dbenv->db_home); + if ((p = dbenv->db_data_dir) != NULL) { + for (; *p != NULL; ++p) + FREES(*p); + FREE(dbenv->db_data_dir, dbenv->data_cnt * sizeof(char **)); + } + if (dbenv->db_log_dir != NULL) + FREES(dbenv->db_log_dir); + if (dbenv->db_tmp_dir != NULL) + FREES(dbenv->db_tmp_dir); + + return (ret); +} + +#define DB_ADDSTR(str) { \ + if ((str) != NULL) { \ + /* If leading slash, start over. */ \ + if (__db_abspath(str)) { \ + p = start; \ + slash = 0; \ + } \ + /* Append to the current string. */ \ + len = strlen(str); \ + if (slash) \ + *p++ = PATH_SEPARATOR[0]; \ + memcpy(p, str, len); \ + p += len; \ + slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \ + } \ +} + +/* + * __db_appname -- + * Given an optional DB environment, directory and file name and type + * of call, build a path based on the db_appinit(3) rules, and return + * it in allocated space. 
+ * + * PUBLIC: int __db_appname __P((DB_ENV *, + * PUBLIC: APPNAME, const char *, const char *, int *, char **)); + */ +int +__db_appname(dbenv, appname, dir, file, fdp, namep) + DB_ENV *dbenv; + APPNAME appname; + const char *dir, *file; + int *fdp; + char **namep; +{ + DB_ENV etmp; + size_t len; + int ret, slash, tmp_create, tmp_free; + const char *a, *b, *c; + int data_entry; + char *p, *start; + + a = b = c = NULL; + data_entry = -1; + tmp_create = tmp_free = 0; + + /* + * We don't return a name when creating temporary files, just an fd. + * Default to error now. + */ + if (fdp != NULL) + *fdp = -1; + if (namep != NULL) + *namep = NULL; + + /* + * Absolute path names are never modified. If the file is an absolute + * path, we're done. If the directory is, simply append the file and + * return. + */ + if (file != NULL && __db_abspath(file)) + return ((*namep = (char *)strdup(file)) == NULL ? ENOMEM : 0); + if (dir != NULL && __db_abspath(dir)) { + a = dir; + goto done; + } + + /* + * DB_ENV DIR APPNAME RESULT + * ------------------------------------------- + * null null none <tmp>/file + * null set none DIR/file + * set null none DB_HOME/file + * set set none DB_HOME/DIR/file + * + * DB_ENV FILE APPNAME RESULT + * ------------------------------------------- + * null null DB_APP_DATA <tmp>/<create> + * null set DB_APP_DATA ./file + * set null DB_APP_DATA <tmp>/<create> + * set set DB_APP_DATA DB_HOME/DB_DATA_DIR/file + * + * DB_ENV DIR APPNAME RESULT + * ------------------------------------------- + * null null DB_APP_LOG <tmp>/file + * null set DB_APP_LOG DIR/file + * set null DB_APP_LOG DB_HOME/DB_LOG_DIR/file + * set set DB_APP_LOG DB_HOME/DB_LOG_DIR/DIR/file + * + * DB_ENV APPNAME RESULT + * ------------------------------------------- + * null DB_APP_TMP <tmp>/<create> + * set DB_APP_TMP DB_HOME/DB_TMP_DIR/<create> + */ +retry: switch (appname) { + case DB_APP_NONE: + if (dbenv == NULL || !F_ISSET(dbenv, DB_APP_INIT)) { + if (dir == NULL) + goto tmp; + a = 
dir; + } else { + a = dbenv->db_home; + b = dir; + } + break; + case DB_APP_DATA: + if (dir != NULL) { + __db_err(dbenv, + "DB_APP_DATA: illegal directory specification"); + return (EINVAL); + } + + if (file == NULL) { + tmp_create = 1; + goto tmp; + } + if (dbenv == NULL || !F_ISSET(dbenv, DB_APP_INIT)) + a = PATH_DOT; + else { + a = dbenv->db_home; + if (dbenv->db_data_dir != NULL && + (b = dbenv->db_data_dir[++data_entry]) == NULL) { + data_entry = -1; + b = dbenv->db_data_dir[0]; + } + } + break; + case DB_APP_LOG: + if (dbenv == NULL || !F_ISSET(dbenv, DB_APP_INIT)) { + if (dir == NULL) + goto tmp; + a = dir; + } else { + a = dbenv->db_home; + b = dbenv->db_log_dir; + c = dir; + } + break; + case DB_APP_TMP: + if (dir != NULL || file != NULL) { + __db_err(dbenv, + "DB_APP_TMP: illegal directory or file specification"); + return (EINVAL); + } + + tmp_create = 1; + if (dbenv == NULL || !F_ISSET(dbenv, DB_APP_INIT)) + goto tmp; + else { + a = dbenv->db_home; + b = dbenv->db_tmp_dir; + } + break; + } + + /* Reference a file from the appropriate temporary directory. */ + if (0) { +tmp: if (dbenv == NULL || !F_ISSET(dbenv, DB_APP_INIT)) { + memset(&etmp, 0, sizeof(etmp)); + if ((ret = __db_tmp_dir(&etmp, DB_USE_ENVIRON)) != 0) + return (ret); + tmp_free = 1; + a = etmp.db_tmp_dir; + } else + a = dbenv->db_tmp_dir; + } + + /* Each component is sized with one extra byte: separator or final NUL. */ +done: len = + (a == NULL ? 0 : strlen(a) + 1) + + (b == NULL ? 0 : strlen(b) + 1) + + (c == NULL ? 0 : strlen(c) + 1) + + (file == NULL ? 0 : strlen(file) + 1); + + if ((start = (char *)malloc(len)) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + if (tmp_free) + FREES(etmp.db_tmp_dir); + return (ENOMEM); + } + + /* + * Assemble a/b/c/file. The third component is only set for + * DB_APP_LOG with an initialized environment, where the result + * is DB_HOME/DB_LOG_DIR/DIR/file; it is already counted in len. + */ + slash = 0; + p = start; + DB_ADDSTR(a); + DB_ADDSTR(b); + DB_ADDSTR(c); + DB_ADDSTR(file); + *p = '\0'; + + /* + * If we're opening a data file, see if it exists. If it does, + * return it, otherwise, try and find another one to open. 
+ */ + if (data_entry != -1 && __db_exists(start, NULL) != 0) { + FREES(start); + a = b = c = NULL; + goto retry; + } + + /* Discard any space allocated to find the temp directory. */ + if (tmp_free) + FREES(etmp.db_tmp_dir); + + /* Create the file if so requested. */ + if (tmp_create) { + ret = __db_tmp_open(dbenv, start, fdp); + FREES(start); + } else { + *namep = start; + ret = 0; + } + return (ret); +} + +/* + * __db_home -- + * Find the database home. + */ +static int +__db_home(dbenv, db_home, flags) + DB_ENV *dbenv; + const char *db_home; + int flags; +{ + const char *p; + + p = db_home; + + /* Use the environment if it's permitted and initialized. */ +#ifdef HAVE_GETUID + if (LF_ISSET(DB_USE_ENVIRON) || + (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0)) { +#else + if (LF_ISSET(DB_USE_ENVIRON)) { +#endif + if ((p = getenv("DB_HOME")) == NULL) + p = db_home; + else if (p[0] == '\0') { + __db_err(dbenv, + "illegal DB_HOME environment variable"); + return (EINVAL); + } + } + + if (p == NULL) + return (0); + + if ((dbenv->db_home = (char *)strdup(p)) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); + } + return (0); +} + +/* + * __db_parse -- + * Parse a single NAME VALUE pair. + */ +static int +__db_parse(dbenv, s) + DB_ENV *dbenv; + char *s; +{ + int ret; + char *local_s, *name, *value, **p, *tp; + + ret = 0; + + /* + * We need to strdup the argument in case the caller passed us + * static data. + */ + if ((local_s = (char *)strdup(s)) == NULL) + return (ENOMEM); + + tp = local_s; + while ((name = strsep(&tp, " \t")) != NULL && *name == '\0'); + if (name == NULL) + goto illegal; + while ((value = strsep(&tp, " \t")) != NULL && *value == '\0'); + if (value == NULL) { +illegal: ret = EINVAL; + __db_err(dbenv, "illegal name-value pair: %s", s); + goto err; + } + +#define DATA_INIT_CNT 20 /* Start with 20 data slots. 
*/ + if (!strcmp(name, "DB_DATA_DIR")) { + if (dbenv->db_data_dir == NULL) { + if ((dbenv->db_data_dir = (char **)calloc(DATA_INIT_CNT, + sizeof(char **))) == NULL) + goto nomem; + dbenv->data_cnt = DATA_INIT_CNT; + } else if (dbenv->data_next == dbenv->data_cnt - 1) { + /* + * Double the array. Grow through a temporary so the old + * array and its count stay valid if realloc fails, and + * clear the new slots: the array is walked up to its NULL + * terminator, and realloc does not zero what it adds. + */ + if ((p = (char **)realloc(dbenv->db_data_dir, + dbenv->data_cnt * 2 * sizeof(char **))) == NULL) + goto nomem; + memset(p + dbenv->data_cnt, + 0, dbenv->data_cnt * sizeof(char **)); + dbenv->db_data_dir = p; + dbenv->data_cnt *= 2; + } + p = &dbenv->db_data_dir[dbenv->data_next++]; + } else if (!strcmp(name, "DB_LOG_DIR")) { + if (dbenv->db_log_dir != NULL) + FREES(dbenv->db_log_dir); + p = &dbenv->db_log_dir; + } else if (!strcmp(name, "DB_TMP_DIR")) { + if (dbenv->db_tmp_dir != NULL) + FREES(dbenv->db_tmp_dir); + p = &dbenv->db_tmp_dir; + } else + goto err; + + if ((*p = (char *)strdup(value)) == NULL) { +nomem: ret = ENOMEM; + __db_err(dbenv, "%s", strerror(ENOMEM)); + } + +err: FREES(local_s); + return (ret); +} + +#ifdef macintosh +#include <TFileSpec.h> + +static char *sTempFolder; +#endif + +/* + * tmp -- + * Set the temporary directory path. + */ +static int +__db_tmp_dir(dbenv, flags) + DB_ENV *dbenv; + int flags; +{ + static const char * list[] = { /* Ordered: see db_appinit(3). */ + "/var/tmp", + "/usr/tmp", + "/temp", /* WIN32. */ + "/tmp", + "C:/temp", /* WIN32. */ + "C:/tmp", /* WIN32. */ + NULL + }; + const char **lp, *p; + + /* Use the environment if it's permitted and initialized. 
*/ + p = NULL; +#ifdef HAVE_GETEUID + if (LF_ISSET(DB_USE_ENVIRON) || + (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0)) { +#else + if (LF_ISSET(DB_USE_ENVIRON)) { +#endif + if ((p = getenv("TMPDIR")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TMPDIR environment variable"); + return (EINVAL); + } + /* WIN32 */ + if (p == NULL && (p = getenv("TEMP")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TEMP environment variable"); + return (EINVAL); + } + /* WIN32 */ + if (p == NULL && (p = getenv("TMP")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TMP environment variable"); + return (EINVAL); + } + /* Macintosh */ + if (p == NULL && + (p = getenv("TempFolder")) != NULL && p[0] == '\0') { + __db_err(dbenv, + "illegal TempFolder environment variable"); + return (EINVAL); + } + } + +#ifdef macintosh + /* Get the path to the temporary folder. */ + if (p == NULL) { + FSSpec spec; + + if (!Special2FSSpec(kTemporaryFolderType, + kOnSystemDisk, 0, &spec)) { + p = FSp2FullPath(&spec); + sTempFolder = malloc(strlen(p) + 1); + strcpy(sTempFolder, p); + p = sTempFolder; + } + } +#endif + + /* Step through the list looking for a possibility. */ + if (p == NULL) + for (lp = list; *lp != NULL; ++lp) + if (__db_exists(p = *lp, NULL) == 0) + break; + + if (p == NULL) + return (0); + + if ((dbenv->db_tmp_dir = (char *)strdup(p)) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); + } + return (0); +} + +/* + * __db_tmp_open -- + * Create a temporary file. + */ +static int +__db_tmp_open(dbenv, dir, fdp) + DB_ENV *dbenv; + char *dir; + int *fdp; +{ +#ifdef HAVE_SIGFILLSET + sigset_t set, oset; +#endif + u_long pid; + size_t len; + int isdir, ret; + char *trv, buf[MAXPATHLEN]; + + /* + * Check the target directory; if you have six X's and it doesn't + * exist, this runs for a *very* long time. 
+ */ + if ((ret = __db_exists(dir, &isdir)) != 0) { + __db_err(dbenv, "%s: %s", dir, strerror(ret)); + return (ret); + } + if (!isdir) { + __db_err(dbenv, "%s: %s", dir, strerror(EINVAL)); + return (EINVAL); + } + + /* Build the path. */ +#define DB_TRAIL "/XXXXXX" + if ((len = strlen(dir)) + sizeof(DB_TRAIL) > sizeof(buf)) { + __db_err(dbenv, + "tmp_open: %s: %s", buf, strerror(ENAMETOOLONG)); + return (ENAMETOOLONG); + } + (void)strcpy(buf, dir); + (void)strcpy(buf + len, DB_TRAIL); + buf[len] = PATH_SEPARATOR[0]; /* WIN32 */ + + /* + * Replace the X's with the process ID. Pid should be a pid_t, + * but we use unsigned long for portability. + */ + for (pid = getpid(), + trv = buf + len + sizeof(DB_TRAIL) - 1; *--trv == 'X'; pid /= 10) + switch (pid % 10) { + case 0: *trv = '0'; break; + case 1: *trv = '1'; break; + case 2: *trv = '2'; break; + case 3: *trv = '3'; break; + case 4: *trv = '4'; break; + case 5: *trv = '5'; break; + case 6: *trv = '6'; break; + case 7: *trv = '7'; break; + case 8: *trv = '8'; break; + case 9: *trv = '9'; break; + } + ++trv; + + /* + * Try and open a file. We block every signal we can get our hands + * on so that, if we're interrupted at the wrong time, the temporary + * file isn't left around -- of course, if we drop core in-between + * the calls we'll hang forever, but that's probably okay. ;-} + */ +#ifdef HAVE_SIGFILLSET + (void)sigfillset(&set); +#endif + for (;;) { +#ifdef HAVE_SIGFILLSET + (void)sigprocmask(SIG_BLOCK, &set, &oset); +#endif +#define DB_TEMPOPEN DB_CREATE | DB_EXCL | DB_TEMPORARY + if ((ret = __db_fdopen(buf, + DB_TEMPOPEN, DB_TEMPOPEN, S_IRUSR | S_IWUSR, fdp)) == 0) { +#ifdef HAVE_SIGFILLSET + (void)sigprocmask(SIG_SETMASK, &oset, NULL); +#endif + return (0); + } +#ifdef HAVE_SIGFILLSET + (void)sigprocmask(SIG_SETMASK, &oset, NULL); +#endif + /* + * XXX: + * If we don't get an EEXIST error, then there's something + * seriously wrong. 
Unfortunately, if the implementation + * doesn't return EEXIST for O_CREAT and O_EXCL regardless + * of other possible errors, we've lost. + */ + if (ret != EEXIST) { + __db_err(dbenv, + "tmp_open: %s: %s", buf, strerror(ret)); + return (ret); + } + + /* + * Tricky little algorithm for backward compatibility. + * Assumes the ASCII ordering of lower-case characters. + */ + for (;;) { + if (*trv == '\0') + return (EINVAL); + if (*trv == 'z') + *trv++ = 'a'; + else { + if (isdigit(*trv)) + *trv = 'a'; + else + ++*trv; + break; + } + } + } + /* NOTREACHED */ +} diff --git a/db2/common/db_apprec.c b/db2/common/db_apprec.c new file mode 100644 index 0000000000..b22b0c5e9a --- /dev/null +++ b/db2/common/db_apprec.c @@ -0,0 +1,143 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_apprec.c 10.15 (Sleepycat) 7/27/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <time.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "log.h" +#include "txn.h" +#include "common_ext.h" + +/* + * __db_apprec -- + * Perform recovery. + * + * PUBLIC: int __db_apprec __P((DB_ENV *, int)); + */ +int +__db_apprec(dbenv, flags) + DB_ENV *dbenv; + int flags; +{ + DBT data; + DB_LOG *lp; + DB_LSN ckp_lsn, first_lsn, lsn, tmp_lsn; + time_t now; + int first_flag, ret, tret; + void *txninfo; + + ZERO_LSN(ckp_lsn); + + /* Initialize the transaction list. */ + if ((ret = __db_txnlist_init(&txninfo)) != 0) + return (ret); + + /* + * Read forward through the log opening the appropriate files + * so that we can call recovery routines. 
In general, we start + * at the last checkpoint prior to the last checkpointed LSN. + * For catastrophic recovery, we begin at the first LSN that + * appears in any log file (log figures this out for us when + * we pass it the DB_FIRST flag). + */ + lp = dbenv->lg_info; + if (LF_ISSET(DB_RECOVER_FATAL)) + first_flag = DB_FIRST; + else + first_flag = __log_findckp(lp, &lsn) != 0 ? DB_FIRST : DB_SET; + + memset(&data, 0, sizeof(data)); + if ((ret = log_get(lp, &lsn, &data, first_flag)) != 0) { + __db_err(dbenv, "Failure: unable to get log record"); + if (first_flag == DB_SET) + __db_err(dbenv, "Retrieving LSN %lu %lu", + (u_long)lsn.file, (u_long)lsn.offset); + else + __db_err(dbenv, "Retrieving first LSN"); + goto err; + } + + first_lsn = lsn; + for (; ret == 0; + ret = log_get(dbenv->lg_info, &lsn, &data, DB_NEXT)) + if ((tret = __db_dispatch(lp, + &data, &lsn, TXN_OPENFILES, txninfo)) < 0) { + ret = tret; + goto msgerr; + } + + for (ret = log_get(lp, &lsn, &data, DB_LAST); + ret == 0 && log_compare(&lsn, &first_lsn) > 0; + ret = log_get(lp,&lsn, &data, DB_PREV)) { + tmp_lsn = lsn; + tret = + __db_dispatch(lp, &data, &lsn, TXN_BACKWARD_ROLL, txninfo); + if (IS_ZERO_LSN(ckp_lsn) && tret > 0) + ckp_lsn = tmp_lsn; + if (tret < 0) { + ret = tret; + goto msgerr; + } + } + + for (ret = log_get(lp, &lsn, &data, DB_NEXT); + ret == 0; ret = log_get(lp, &lsn, &data, DB_NEXT)) + if ((tret = __db_dispatch(lp, + &data, &lsn, TXN_FORWARD_ROLL, txninfo)) < 0) { + ret = tret; + goto msgerr; + } + + /* Now close all the db files that are open. */ + __log_close_files(lp); + + /* + * Now set the maximum transaction id, set the last checkpoint lsn, + * and the current time. Then take a checkpoint. 
+ */ + (void)time(&now); + + dbenv->tx_info->region->last_txnid = ((__db_txnhead *)txninfo)->maxid; + dbenv->tx_info->region->last_ckp = ckp_lsn; + dbenv->tx_info->region->time_ckp = (u_int32_t) now; + txn_checkpoint(dbenv->tx_info, 0, 0); + + if (dbenv->db_verbose) { + __db_err(lp->dbenv, "Recovery complete at %s", ctime(&now)); + __db_err(lp->dbenv, "%s %lu %s [%lu][%lu]", + "Maximum transaction id", + (u_long)dbenv->tx_info->region->last_txnid, + "Recovery checkpoint", + (u_long)dbenv->tx_info->region->last_ckp.file, + (u_long)dbenv->tx_info->region->last_ckp.offset); + } + + return (0); + +msgerr: __db_err(dbenv, "Recovery function for LSN %lu %lu failed", + (u_long)lsn.file, (u_long)lsn.offset); + +err: return (ret); +} diff --git a/db2/common/db_byteorder.c b/db2/common/db_byteorder.c new file mode 100644 index 0000000000..d49883e093 --- /dev/null +++ b/db2/common/db_byteorder.c @@ -0,0 +1,56 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_byteorder.c 10.3 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "common_ext.h" + +/* + * __db_byteorder -- + * Return if we need to do byte swapping, checking for illegal + * values. 
+ * + * PUBLIC: int __db_byteorder __P((DB_ENV *, int)); + */ +int +__db_byteorder(dbenv, lorder) + DB_ENV *dbenv; + int lorder; +{ + switch (lorder) { + case 0: + break; + case 1234: +#if defined(WORDS_BIGENDIAN) + return (DB_SWAPBYTES); +#else + break; +#endif + case 4321: +#if defined(WORDS_BIGENDIAN) + break; +#else + return (DB_SWAPBYTES); +#endif + default: + __db_err(dbenv, + "illegal byte order, only big and little-endian supported"); + return (EINVAL); + } + return (0); +} diff --git a/db2/common/db_err.c b/db2/common/db_err.c new file mode 100644 index 0000000000..3dc4ca007d --- /dev/null +++ b/db2/common/db_err.c @@ -0,0 +1,548 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_err.c 10.16 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> + +#ifdef __STDC__ +#include <stdarg.h> +#else +#include <varargs.h> +#endif +#endif + +#include "db_int.h" +#include "common_ext.h" + +static int __db_rdonly __P((const DB_ENV *, const char *)); + +/* + * __db_err -- + * Standard DB error routine. + * + * PUBLIC: #ifdef __STDC__ + * PUBLIC: void __db_err __P((const DB_ENV *dbenv, const char *fmt, ...)); + * PUBLIC: #else + * PUBLIC: void __db_err(); + * PUBLIC: #endif + */ +void +#ifdef __STDC__ +__db_err(const DB_ENV *dbenv, const char *fmt, ...) +#else +__db_err(dbenv, fmt, va_alist) + const DB_ENV *dbenv; + const char *fmt; + va_dcl +#endif +{ + va_list ap; + char errbuf[2048]; /* XXX: END OF THE STACK DON'T TRUST SPRINTF. 
*/ + + if (dbenv == NULL) + return; + +#ifdef __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif + if (dbenv->db_errcall != NULL) { + (void)vsnprintf(errbuf, sizeof(errbuf), fmt, ap); + dbenv->db_errcall(dbenv->db_errpfx, errbuf); + } + if (dbenv->db_errfile != NULL) { + if (dbenv->db_errpfx != NULL) + (void)fprintf(dbenv->db_errfile, "%s: ", + dbenv->db_errpfx); + (void)vfprintf(dbenv->db_errfile, fmt, ap); + (void)fprintf(dbenv->db_errfile, "\n"); + (void)fflush(dbenv->db_errfile); + } + va_end(ap); +} + +/* + * XXX + * Provide ANSI C prototypes for the panic functions. Some compilers, (e.g., + * MS VC 4.2) get upset if they aren't here, even though the K&R declaration + * appears before the assignment in the __db__panic() call. + */ +static int __db_ecursor __P((DB *, DB_TXN *, DBC **)); +static int __db_edel __P((DB *, DB_TXN *, DBT *, int)); +static int __db_efd __P((DB *, int *)); +static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __db_estat __P((DB *, void *, void *(*)(size_t), int)); +static int __db_esync __P((DB *, int)); + +/* + * __db_ecursor -- + * After-panic cursor routine. + */ +static int +__db_ecursor(a, b, c) + DB *a; + DB_TXN *b; + DBC **c; +{ + a = a; b = b; c = c; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_edel -- + * After-panic delete routine. + */ +static int +__db_edel(a, b, c, d) + DB *a; + DB_TXN *b; + DBT *c; + int d; +{ + a = a; b = b; c = c; d = d; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_efd -- + * After-panic fd routine. + */ +static int +__db_efd(a, b) + DB *a; + int *b; +{ + a = a; b = b; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_egp -- + * After-panic get/put routine. + */ +static int +__db_egp(a, b, c, d, e) + DB *a; + DB_TXN *b; + DBT *c, *d; + int e; +{ + a = a; b = b; c = c; d = d; e = e; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_estat -- + * After-panic stat routine. 
+ */ +static int +__db_estat(a, b, c, d) + DB *a; + void *b; + void *(*c) __P((size_t)); + int d; +{ + a = a; b = b; c = c; d = d; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_esync -- + * After-panic sync routine. + */ +static int +__db_esync(a, b) + DB *a; + int b; +{ + a = a; b = b; /* XXX: Shut the compiler up. */ + + return (EPERM); +} + +/* + * __db_panic -- + * Lock out the tree due to unrecoverable error. + * + * PUBLIC: int __db_panic __P((DB *)); + */ +int +__db_panic(dbp) + DB *dbp; +{ + /* + * XXX + * We should shut down all of the process's cursors, too. + * + * We should call mpool and have it shut down the file, so we get + * other processes sharing this file as well. + */ + dbp->cursor = __db_ecursor; + dbp->del = __db_edel; + dbp->fd = __db_efd; + dbp->get = __db_egp; + dbp->put = __db_egp; + dbp->stat = __db_estat; + dbp->sync = __db_esync; + + return (EPERM); +} + +/* Check for invalid flags. */ +#undef DB_CHECK_FLAGS +#define DB_CHECK_FLAGS(dbenv, name, flags, ok_flags) \ + if ((flags) & ~(ok_flags)) \ + return (__db_ferr(dbenv, name, 0)); +/* Check for invalid flag combinations. */ +#undef DB_CHECK_FCOMBO +#define DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2) \ + if ((flags) & (flag1) && (flags) & (flag2)) \ + return (__db_ferr(dbenv, name, 1)); + +/* + * __db_fchk -- + * General flags checking routine. + * + * PUBLIC: int __db_fchk __P((DB_ENV *, char *, int, int)); + */ +int +__db_fchk(dbenv, name, flags, ok_flags) + DB_ENV *dbenv; + const char *name; + int flags, ok_flags; +{ + DB_CHECK_FLAGS(dbenv, name, flags, ok_flags); + return (0); +} + +/* + * __db_fcchk -- + * General combination flags checking routine. 
+ * + * PUBLIC: int __db_fcchk __P((DB_ENV *, char *, int, int, int)); + */ +int +__db_fcchk(dbenv, name, flags, flag1, flag2) + DB_ENV *dbenv; + const char *name; + int flags, flag1, flag2; +{ + DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2); + return (0); +} + +/* + * __db_cdelchk -- + * Common cursor delete argument checking routine. + * + * PUBLIC: int __db_cdelchk __P((const DB *, int, int, int)); + */ +int +__db_cdelchk(dbp, flags, isrdonly, isvalid) + const DB *dbp; + int flags, isrdonly, isvalid; +{ + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "c_del")); + + /* Check for invalid dbc->c_del() function flags. */ + DB_CHECK_FLAGS(dbp->dbenv, "c_del", flags, 0); + + /* + * The cursor must be initialized, return -1 for an invalid cursor, + * otherwise 0. + */ + return (isvalid ? 0 : EINVAL); +} + +/* + * __db_cgetchk -- + * Common cursor get argument checking routine. + * + * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, int, int)); + */ +int +__db_cgetchk(dbp, key, data, flags, isvalid) + const DB *dbp; + DBT *key, *data; + int flags, isvalid; +{ + int check_key; + + check_key = 0; + + /* Check for invalid dbc->c_get() function flags. */ + switch (flags) { + case DB_CURRENT: + case DB_FIRST: + case DB_LAST: + case DB_NEXT: + case DB_PREV: + case DB_SET_RANGE: + check_key = 1; + break; + case DB_SET: + break; + case DB_SET_RECNO: + case DB_GET_RECNO: + if (!F_ISSET(dbp, DB_BT_RECNUM)) + goto err; + check_key = 1; + break; + default: +err: return (__db_ferr(dbp->dbenv, "c_get", 0)); + } + + /* Check for invalid key/data flags. */ + DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, + DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); + DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, + DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* Check dbt's for valid flags when multi-threaded. 
*/ + if (F_ISSET(dbp, DB_AM_THREAD)) { + if (!F_ISSET(data, DB_DBT_USERMEM | DB_DBT_MALLOC)) + return (__db_ferr(dbp->dbenv, "threaded data", 1)); + if (check_key && + !F_ISSET(key, DB_DBT_USERMEM | DB_DBT_MALLOC)) + return (__db_ferr(dbp->dbenv, "threaded key", 1)); + } + + /* + * The cursor must be initialized for DB_CURRENT, return EINVAL for + * an invalid cursor, otherwise 0. + */ + return (isvalid || flags != DB_CURRENT ? 0 : EINVAL); +} + +/* + * __db_cputchk -- + * Common cursor put argument checking routine. + * + * PUBLIC: int __db_cputchk __P((const DB *, + * PUBLIC: const DBT *, DBT *, int, int, int)); + */ +int +__db_cputchk(dbp, key, data, flags, isrdonly, isvalid) + const DB *dbp; + const DBT *key; + DBT *data; + int flags, isrdonly, isvalid; +{ + int check_key; + + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "c_put")); + + /* Check for invalid dbc->c_put() function flags. */ + check_key = 0; + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER)) + goto err; + if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP)) + goto err; + break; + case DB_CURRENT: + break; + case DB_KEYFIRST: + case DB_KEYLAST: + if (dbp->type == DB_RECNO) + goto err; + check_key = 1; + break; + default: +err: return (__db_ferr(dbp->dbenv, "c_put", 0)); + } + + /* Check for invalid key/data flags. */ + if (check_key) + DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, + DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); + DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, + DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* + * The cursor must be initialized for anything other than DB_KEYFIRST + * and DB_KEYLAST (which position by key rather than relative to the + * cursor), return EINVAL for an invalid cursor, otherwise 0. + */ + return (isvalid || + flags == DB_KEYFIRST || flags == DB_KEYLAST ? 0 : EINVAL); +} + +/* + * __db_delchk -- + * Common delete argument checking routine. 
 *
 * PUBLIC: int __db_delchk __P((const DB *, int, int));
 */
int
__db_delchk(dbp, flags, isrdonly)
	const DB *dbp;
	int flags, isrdonly;
{
	/* Check for changes to a read-only tree. */
	if (isrdonly)
		return (__db_rdonly(dbp->dbenv, "delete"));

	/*
	 * Check for invalid db->del() function flags.  The allowed mask
	 * passed here is 0, i.e. no flag is accepted.
	 *
	 * NOTE(review): DB_CHECK_FLAGS is defined outside this file; it
	 * presumably returns an error from this function when a disallowed
	 * bit is set -- confirm against its definition.
	 */
	DB_CHECK_FLAGS(dbp->dbenv, "delete", flags, 0);

	return (0);
}

/*
 * __db_getchk --
 *	Common get argument checking routine.
 *
 *	dbp	handle whose dbenv is used for error reporting
 *	key	key DBT; no DBT flags are accepted on it
 *	data	data DBT; may carry DB_DBT_MALLOC, DB_DBT_USERMEM and
 *		DB_DBT_PARTIAL, but not MALLOC and USERMEM together
 *	flags	db->get() flags; DB_SET_RECNO is accepted only when
 *		DB_BT_RECNUM is set on dbp
 *
 *	Returns 0 when every check passes.
 *
 * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, int));
 */
int
__db_getchk(dbp, key, data, flags)
	const DB *dbp;
	const DBT *key;
	DBT *data;
	int flags;
{
	/* Check for invalid db->get() function flags. */
	DB_CHECK_FLAGS(dbp->dbenv,
	    "get", flags, F_ISSET(dbp, DB_BT_RECNUM) ? DB_SET_RECNO : 0);

	/* Check for invalid key/data flags. */
	DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0);
	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
	DB_CHECK_FCOMBO(dbp->dbenv,
	    "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM);

	/*
	 * A DB_AM_THREAD handle requires the caller to pick one of
	 * DB_DBT_MALLOC or DB_DBT_USERMEM for the returned data.
	 */
	if (F_ISSET(dbp, DB_AM_THREAD) &&
	    !F_ISSET(data, DB_DBT_MALLOC | DB_DBT_USERMEM))
		return (__db_ferr(dbp->dbenv, "threaded data", 1));

	return (0);
}

/*
 * __db_putchk --
 *	Common put argument checking routine.
 *
 *	flags		db->put() flags; DB_NOOVERWRITE is always accepted,
 *			DB_APPEND only when dbp->type is DB_RECNO
 *	isrdonly	non-zero if the tree is read-only; the put is
 *			rejected before any flag checking
 *	isdup		non-zero if duplicates are present; a DB_DBT_PARTIAL
 *			put is then rejected with EINVAL
 *
 *	Returns 0 when every check passes.
 *
 * PUBLIC: int __db_putchk __P((const DB *, DBT *, const DBT *, int, int, int));
 */
int
__db_putchk(dbp, key, data, flags, isrdonly, isdup)
	const DB *dbp;
	DBT *key;
	const DBT *data;
	int flags, isrdonly, isdup;
{
	/* Check for changes to a read-only tree. */
	if (isrdonly)
		return (__db_rdonly(dbp->dbenv, "put"));

	/* Check for invalid db->put() function flags. */
	DB_CHECK_FLAGS(dbp->dbenv, "put", flags,
	    DB_NOOVERWRITE | (dbp->type == DB_RECNO ? DB_APPEND : 0));

	/* Check for invalid key/data flags. */
	DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0);
	DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags,
	    DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL);
	DB_CHECK_FCOMBO(dbp->dbenv,
	    "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM);

	/* Check for partial puts in the presence of duplicates. */
	if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) {
		__db_err(dbp->dbenv,
"a partial put in the presence of duplicates requires a cursor operation");
		return (EINVAL);
	}
	return (0);
}

/*
 * __db_statchk --
 *	Common stat argument checking routine.
 *
 *	Only DB_RECORDCOUNT is accepted, and for a DB_BTREE handle it is
 *	accepted only when DB_BT_RECNUM is set on the handle.
 *
 * PUBLIC: int __db_statchk __P((const DB *, int));
 */
int
__db_statchk(dbp, flags)
	const DB *dbp;
	int flags;
{
	/* Check for invalid db->stat() function flags. */
	DB_CHECK_FLAGS(dbp->dbenv, "stat", flags, DB_RECORDCOUNT);

	/*
	 * NOTE(review): LF_ISSET presumably tests the local "flags"
	 * variable -- confirm against its definition.
	 */
	if (LF_ISSET(DB_RECORDCOUNT) &&
	    dbp->type == DB_BTREE && !F_ISSET(dbp, DB_BT_RECNUM))
		return (__db_ferr(dbp->dbenv, "stat", 0));

	return (0);
}

/*
 * __db_syncchk --
 *	Common sync argument checking routine.
 *
 * PUBLIC: int __db_syncchk __P((const DB *, int));
 */
int
__db_syncchk(dbp, flags)
	const DB *dbp;
	int flags;
{
	/* Check for invalid db->sync() function flags; none are accepted. */
	DB_CHECK_FLAGS(dbp->dbenv, "sync", flags, 0);

	return (0);
}

/*
 * __db_ferr --
 *	Common flag errors.
 *
 *	Reports an illegal flag (combo == 0) or an illegal flag combination
 *	(combo != 0) given to the operation called "name", and returns
 *	EINVAL.
 *
 * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int));
 */
int
__db_ferr(dbenv, name, combo)
	const DB_ENV *dbenv;
	const char *name;
	int combo;
{
	__db_err(dbenv, "illegal flag %sspecified to %s",
	    combo ? "combination " : "", name);
	return (EINVAL);
}

/*
 * __db_rdonly --
 *	Common readonly message.
 *
 *	Reports an attempt by the operation called "name" to modify a
 *	read-only tree, and returns EACCES.
 */
static int
__db_rdonly(dbenv, name)
	const DB_ENV *dbenv;
	const char *name;
{
	__db_err(dbenv, "%s: attempt to modify a read-only tree", name);
	return (EACCES);
}
diff --git a/db2/common/db_log2.c b/db2/common/db_log2.c
new file mode 100644
index 0000000000..9af01116f6
--- /dev/null
+++ b/db2/common/db_log2.c
@@ -0,0 +1,68 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997
 *	Sleepycat Software.  All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_log2.c 10.3 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "common_ext.h" + +/* + * PUBLIC: u_int32_t __db_log2 __P((u_int32_t)); + */ +u_int32_t +__db_log2(num) + u_int32_t num; +{ + u_int32_t i, limit; + + limit = 1; + for (i = 0; limit < num; limit = limit << 1, i++); + return (i); +} diff --git a/db2/common/db_region.c b/db2/common/db_region.c new file mode 100644 index 0000000000..51f8f4465c --- /dev/null +++ b/db2/common/db_region.c @@ -0,0 +1,565 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Harvard by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_region.c 10.12 (Sleepycat) 7/26/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "common_ext.h" + +static int __db_rmap __P((DB_ENV *, int, size_t, void *)); + +/* + * __db_rcreate -- + * + * Common interface for creating a shared region. 
 * Handles synchronization
 * across multiple processes.
 *
 * The dbenv contains the environment for this process, including naming
 * information.  The path argument represents the parameters passed to
 * the open routines and may be either a file or a directory.  If it is
 * a directory, it must exist.  If it is a file, then the file parameter
 * must be NULL, otherwise, file is the name to be created inside the
 * directory path.
 *
 * The function returns 0 on success, storing the address of the mapped
 * region through retp and the open file descriptor through fdp; on
 * failure it returns a non-zero error value.  On success the region lock
 * is still held: it is acquired below and not released in this function.
 *
 * PUBLIC: int __db_rcreate __P((DB_ENV *, APPNAME,
 * PUBLIC:    const char *, const char *, int, size_t, int *, void *));
 */
int
__db_rcreate(dbenv, appname, path, file, mode, size, fdp, retp)
	DB_ENV *dbenv;
	APPNAME appname;
	const char *path, *file;
	int mode, *fdp;
	size_t size;
	void *retp;
{
	RLAYOUT *rp;
	int fd, ret;
	char *name;

	fd = -1;
	rp = NULL;

	/*
	 * Get the filename -- note, if it's a temporary file, it will
	 * be created by the underlying temporary file creation code,
	 * so we have to check the file descriptor to be sure it's an
	 * error.
	 */
	if ((ret = __db_appname(dbenv, appname, path, file, &fd, &name)) != 0)
		return (ret);

	/*
	 * Now open the file.  We need to make sure that multiple processes
	 * that attempt to create the region at the same time are properly
	 * ordered, so we open it O_EXCL and O_CREAT so two simultaneous
	 * attempts to create the region will return failure in one of the
	 * attempts.
	 *
	 * EEXIST is not reported through __db_err; it is returned to the
	 * caller silently.
	 */
	if (fd == -1 && (ret = __db_fdopen(name,
	    DB_CREATE | DB_EXCL, DB_CREATE | DB_EXCL, mode, &fd)) != 0) {
		if (ret != EEXIST)
			__db_err(dbenv,
			    "region create: %s: %s", name, strerror(ret));
		goto err;
	}
	*fdp = fd;

	/* Grow the region to the correct size. */
	if ((ret = __db_rgrow(dbenv, fd, size)) != 0)
		goto err;

	/* Map the region in. */
	if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
		goto err;

	/*
	 * Initialize the common information.
	 *
	 * !!!
	 * We have to order the region creates so that two processes don't try
	 * to simultaneously create the region and so that processes that are
	 * joining the region never see inconsistent data.  We'd like to play
	 * file permissions games, but we can't because WNT filesystems won't
	 * open a file mode 0.
	 *
	 * So, the process that's creating the region always acquires the lock
	 * before the setting the version number.  Any process joining always
	 * checks the version number before attempting to acquire the lock.
	 *
	 * We have to check the version number first, because if the version
	 * number has not been written, it's possible that the mutex has not
	 * been initialized in which case an attempt to get it could lead to
	 * random behavior.  If the version number isn't there (the file size
	 * is too small) or it's 0, we know that the region is being created.
	 */
	(void)__db_mutex_init(&rp->lock, MUTEX_LOCK_OFFSET(rp, &rp->lock));
	(void)__db_mutex_lock(&rp->lock,
	    fd, dbenv == NULL ? NULL : dbenv->db_yield);

	rp->refcnt = 1;
	rp->size = size;
	rp->flags = 0;
	db_version(&rp->majver, &rp->minver, &rp->patch);

	if (name != NULL)
		FREES(name);

	*(void **)retp = rp;
	return (0);

	/*
	 * Error cleanup: if a descriptor was obtained, unmap whatever was
	 * mapped, remove the file and close the descriptor; then release
	 * the name.  Note *fdp may already have been set to the descriptor
	 * that is closed here.
	 */
err:	if (fd != -1) {
		if (rp != NULL)
			(void)__db_munmap(rp, rp->size);
		(void)__db_unlink(name);
		(void)__db_close(fd);
	}
	if (name != NULL)
		FREES(name);
	return (ret);
}

/*
 * __db_ropen --
 *	Construct the name of a file, open it and map it in.
 *
 *	Returns 0 on success, storing the mapped region through retp and
 *	the descriptor through fdp, with the region's reference count
 *	incremented.  Returns EAGAIN when the region looks half-created,
 *	changed size while it was being opened, or is marked DB_R_DELETED.
 *
 * PUBLIC: int __db_ropen __P((DB_ENV *,
 * PUBLIC:    APPNAME, const char *, const char *, int, int *, void *));
 */
int
__db_ropen(dbenv, appname, path, file, flags, fdp, retp)
	DB_ENV *dbenv;
	APPNAME appname;
	const char *path, *file;
	int flags, *fdp;
	void *retp;
{
	RLAYOUT *rp;
	off_t size1, size2;
	int fd, ret;
	char *name;

	fd = -1;
	rp = NULL;

	/* Get the filename. */
	if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
		return (ret);

	/* Open the file. */
	if ((ret = __db_fdopen(name, flags, DB_MUTEXDEBUG, 0, &fd)) != 0) {
		__db_err(dbenv, "region open: %s: %s", name, strerror(ret));
		goto err2;
	}

	*fdp = fd;

	/*
	 * Map the file in.  We have to do things in a strange order so that
	 * we don't get into a situation where the file was just created and
	 * isn't yet initialized.  See the comment in __db_rcreate() above.
	 *
	 * XXX
	 * We'd like to test to see if the file is too big to mmap.  Since we
	 * don't know what size or type off_t's or size_t's are, or the largest
	 * unsigned integral type is, or what random insanity the local C
	 * compiler will perpetrate, doing the comparison in a portable way is
	 * flatly impossible.  Hope that mmap fails if the file is too large.
	 *
	 */
	if ((ret = __db_stat(dbenv, name, fd, &size1, NULL)) != 0)
		goto err2;

	/* Check to make sure the first block has been written. */
	if ((size_t) size1 < sizeof(RLAYOUT)) {
		ret = EAGAIN;
		goto err2;
	}

	/* Map in whatever is there. */
	if ((ret = __db_rmap(dbenv, fd, size1, &rp)) != 0)
		goto err2;

	/*
	 * Check to make sure the region has been initialized.  We can't just
	 * grab the lock because the lock may not have been initialized yet.
	 */
	if (rp->majver == 0) {
		ret = EAGAIN;
		goto err2;
	}

	/* Get the region lock. */
	if (!LF_ISSET(DB_MUTEXDEBUG))
		(void)__db_mutex_lock(&rp->lock,
		    fd, dbenv == NULL ? NULL : dbenv->db_yield);

	/*
	 * The file may have been half-written if we were descheduled between
	 * getting the size of the file and checking the major version.  Check
	 * to make sure we got the entire file.
	 */
	if ((ret = __db_stat(dbenv, name, fd, &size2, NULL)) != 0)
		goto err1;
	if (size1 != size2) {
		ret = EAGAIN;
		goto err1;
	}

	/* The file may have just been deleted. */
	if (F_ISSET(rp, DB_R_DELETED)) {
		ret = EAGAIN;
		goto err1;
	}

	/* Increment the reference count. */
	++rp->refcnt;

	/* Release the lock. */
	if (!LF_ISSET(DB_MUTEXDEBUG))
		(void)__db_mutex_unlock(&rp->lock, fd);

	FREES(name);

	*(void **)retp = rp;
	return (0);

	/*
	 * err1 is used once the region lock may be held, err2 before that;
	 * err1 drops the lock and falls into err2's unmap/close/free.
	 */
err1:	if (!LF_ISSET(DB_MUTEXDEBUG))
		(void)__db_mutex_unlock(&rp->lock, fd);
err2:	if (rp != NULL)
		(void)__db_munmap(rp, rp->size);
	if (fd != -1)
		(void)__db_close(fd);
	FREES(name);
	return (ret);
}

/*
 * __db_rclose --
 *	Close a shared memory region.
 *
 *	Decrements the region's reference count under the region lock,
 *	then unmaps the region and closes fd.  If taking the lock fails,
 *	nothing else is attempted.  Otherwise every remaining step is
 *	attempted and the first failure is the one reported and returned.
 *
 * PUBLIC: int __db_rclose __P((DB_ENV *, int, void *));
 */
int
__db_rclose(dbenv, fd, ptr)
	DB_ENV *dbenv;
	int fd;
	void *ptr;
{
	RLAYOUT *rp;
	int ret, t_ret;
	const char *fail;

	rp = ptr;
	fail = NULL;

	/* Get the lock. */
	if ((ret = __db_mutex_lock(&rp->lock,
	    fd, dbenv == NULL ? NULL : dbenv->db_yield)) != 0) {
		fail = "lock get";
		goto err;
	}

	/* Decrement the reference count. */
	--rp->refcnt;

	/* Release the lock. */
	if ((t_ret = __db_mutex_unlock(&rp->lock, fd)) != 0 && fail == NULL) {
		ret = t_ret;
		fail = "lock release";
	}

	/* Discard the region. */
	if ((t_ret = __db_munmap(ptr, rp->size)) != 0 && fail == NULL) {
		ret = t_ret;
		fail = "munmap";
	}

	if ((t_ret = __db_close(fd)) != 0 && fail == NULL) {
		ret = t_ret;
		fail = "close";
	}

	if (fail == NULL)
		return (0);

err:	__db_err(dbenv, "region detach: %s: %s", fail, strerror(ret));
	return (ret);
}

/*
 * __db_runlink --
 *	Remove a shared memory region.
+ * + * PUBLIC: int __db_runlink __P((DB_ENV *, + * PUBLIC: APPNAME, const char *, const char *, int)); + */ +int +__db_runlink(dbenv, appname, path, file, force) + DB_ENV *dbenv; + APPNAME appname; + const char *path, *file; + int force; +{ + RLAYOUT *rp; + int cnt, fd, ret, t_ret; + char *name; + + rp = NULL; + + /* Get the filename. */ + if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0) + return (ret); + + /* If the file doesn't exist, we're done. */ + if (__db_exists(name, NULL)) + return (0); /* XXX: ENOENT? */ + + /* + * If we're called with a force flag, try and unlink the file. This + * may not succeed if the file is currently open, but there's nothing + * we can do about that. There is a race condition between the check + * for existence above and the actual unlink. If someone else snuck + * in and removed it before we do the remove, then we might get an + * ENOENT error. If we get the ENOENT, we treat it as success, just + * as we do above. + */ + if (force) { + if ((ret = __db_unlink(name)) != 0 && ret != ENOENT) + goto err1; + FREES(name); + return (0); + } + + /* Open and lock the region. */ + if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0) + goto err1; + (void)__db_mutex_lock(&rp->lock, + fd, dbenv == NULL ? NULL : dbenv->db_yield); + + /* If the region is currently being deleted, fail. */ + if (F_ISSET(rp, DB_R_DELETED)) { + ret = ENOENT; /* XXX: ENOENT? */ + goto err2; + } + + /* If the region is currently in use by someone else, fail. */ + if (rp->refcnt > 1) { + ret = EBUSY; + goto err2; + } + + /* Set the delete flag. */ + F_SET(rp, DB_R_DELETED); + + /* Release the lock and close the region. */ + (void)__db_mutex_unlock(&rp->lock, fd); + if ((t_ret = __db_rclose(dbenv, fd, rp)) != 0 && ret == 0) + goto err1; + + /* + * Unlink the region. There's a race here -- other threads or + * processes might be opening the region while we're trying to + * remove it. 
They'll fail, because we've set the DELETED flag, + * but they could still stop us from succeeding in the unlink. + */ + for (cnt = 5; cnt > 0; --cnt) { + if ((ret = __db_unlink(name)) == 0) + break; + (void)__db_sleep(0, 250000); + } + if (ret == 0) { + FREES(name); + return (0); + } + + /* Not a clue. Try to clear the DB_R_DELETED flag. */ + if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0) + goto err1; + (void)__db_mutex_lock(&rp->lock, + fd, dbenv == NULL ? NULL : dbenv->db_yield); + F_CLR(rp, DB_R_DELETED); + /* FALLTHROUGH */ + +err2: (void)__db_mutex_unlock(&rp->lock, fd); + (void)__db_rclose(dbenv, fd, rp); +err1: __db_err(dbenv, "region unlink: %s: %s", name, strerror(ret)); + FREES(name); + return (ret); +} + +/* + * DB creates all regions on 4K boundaries so that we don't make the + * underlying VM unhappy. + */ +#define __DB_VMPAGESIZE (4 * 1024) + +/* + * __db_rgrow -- + * Extend a region by a specified amount. + * + * PUBLIC: int __db_rgrow __P((DB_ENV *, int, size_t)); + */ +int +__db_rgrow(dbenv, fd, incr) + DB_ENV *dbenv; + int fd; + size_t incr; +{ +#ifdef MMAP_INIT_NEEDED + size_t i; +#endif + ssize_t nw; + int ret; + char buf[__DB_VMPAGESIZE]; + + /* Seek to the end of the region. */ + if ((ret = __db_lseek(fd, 0, 0, 0, SEEK_END)) != 0) + goto err; + + /* Write nuls to the new bytes. */ + memset(buf, 0, sizeof(buf)); + + /* + * Historically, some systems required that all of the bytes of the + * region be written before you could mmap it and access it randomly. + */ +#ifdef MMAP_INIT_NEEDED + /* Extend the region by writing each new page. */ + for (i = 0; i < incr; i += __DB_VMPAGESIZE) { + if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0) + goto err; + if (nw != sizeof(buf)) + goto eio; + } +#else + /* + * Extend the region by writing the last page. + * + * Round off the increment to the next page boundary. 
+ */ + incr += __DB_VMPAGESIZE - 1; + incr -= incr % __DB_VMPAGESIZE; + + /* Write the last page, not the page after the last. */ + if ((ret = __db_lseek(fd, 0, 0, incr - __DB_VMPAGESIZE, SEEK_CUR)) != 0) + goto err; + if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0) + goto err; + if (nw != sizeof(buf)) + goto eio; +#endif + return (0); + +eio: ret = EIO; +err: __db_err(dbenv, "region grow: %s", strerror(ret)); + return (ret); +} + +/* + * __db_rremap -- + * Unmap the old region and map in a new region of a new size. If + * either call fails, returns NULL, else returns the address of the + * new region. + * + * PUBLIC: int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *)); + */ +int +__db_rremap(dbenv, ptr, oldsize, newsize, fd, retp) + DB_ENV *dbenv; + void *ptr, *retp; + size_t oldsize, newsize; + int fd; +{ + int ret; + + if ((ret = __db_munmap(ptr, oldsize)) != 0) { + __db_err(dbenv, "region remap: munmap: %s", strerror(ret)); + return (ret); + } + + return (__db_rmap(dbenv, fd, newsize, retp)); +} + +/* + * __db_rmap -- + * Attach to a shared memory region. + */ +static int +__db_rmap(dbenv, fd, size, retp) + DB_ENV *dbenv; + int fd; + size_t size; + void *retp; +{ + RLAYOUT *rp; + int ret; + + if ((ret = __db_mmap(fd, size, 0, 0, &rp)) != 0) { + __db_err(dbenv, "region map: mmap %s", strerror(ret)); + return (ret); + } + if (rp->size < size) + rp->size = size; + + *(void **)retp = rp; + return (0); +} diff --git a/db2/common/db_salloc.c b/db2/common/db_salloc.c new file mode 100644 index 0000000000..f0202ddb90 --- /dev/null +++ b/db2/common/db_salloc.c @@ -0,0 +1,290 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
 */

#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)db_salloc.c 10.6 (Sleepycat) 7/5/97";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdio.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "common_ext.h"

/*
 * Implement shared memory region allocation, using simple first-fit algorithm.
 * The model is that we take a "chunk" of shared memory store and begin carving
 * it up into areas, similarly to how malloc works.  We do coalescing on free.
 *
 * The "len" field in the __data struct contains the length of the free region
 * (less the size_t bytes that holds the length).  We use the address provided
 * by the caller to find this length, which allows us to free a chunk without
 * requiring that the caller pass in the length of the chunk they're freeing.
 */
SH_LIST_HEAD(__head);
struct __data {
	size_t len;			/* Usable bytes following this field. */
	SH_LIST_ENTRY links;		/* Free-list linkage. */
};

/*
 * __db_shalloc_init --
 *	Initialize the area as one large chunk.
 *
 *	The list head is placed at the start of area; the rest of the
 *	area, less one size_t for the length field, becomes a single free
 *	chunk on the list.
 *
 * PUBLIC: void __db_shalloc_init __P((void *, size_t));
 */
void
__db_shalloc_init(area, size)
	void *area;
	size_t size;
{
	struct __data *elp;
	struct __head *hp;

	hp = area;
	SH_LIST_INIT(hp);

	/* The first (and only) chunk starts right after the list head. */
	elp = (struct __data *)(hp + 1);
	elp->len = size - sizeof(struct __head) - sizeof(elp->len);
	SH_LIST_INSERT_HEAD(hp, elp, links, __data);
}

/*
 * __db_shalloc --
 *	Allocate some space from the shared region.
 *
 *	p is the area handed to __db_shalloc_init.  On success 0 is
 *	returned and the allocation's address is stored through retp;
 *	ENOMEM is returned when no free chunk is large enough.
 *
 * PUBLIC: int __db_shalloc __P((void *, size_t, size_t, void *));
 */
int
__db_shalloc(p, len, align, retp)
	void *p, *retp;
	size_t len, align;
{
	struct __data *elp;
	size_t *sp;
	void *rp;

	/*
	 * We never allocate less than the size of a struct __data, align
	 * to less than a size_t boundary, or align to something that's not
	 * a multiple of a size_t.
	 */
	if (len < sizeof(struct __data))
		len = sizeof(struct __data);
	align = align <= sizeof(size_t) ?
	    sizeof(size_t) : ALIGN(align, sizeof(size_t));

	/* Walk the list, looking for a slot. */
	for (elp = SH_LIST_FIRST((struct __head *)p, __data);
	    elp != NULL;
	    elp = SH_LIST_NEXT(elp, links, __data)) {
		/*
		 * Calculate the value of the returned pointer if we were to
		 * use this chunk.
		 *	+ Find the end of the chunk.
		 *	+ Subtract the memory the user wants.
		 *	+ Find the closest previous correctly-aligned address.
		 *
		 * NOTE(review): the mask ~(align - 1) only rounds down to a
		 * multiple of align when align is a power of two -- confirm
		 * that callers never pass anything else.
		 */
		rp = (u_int8_t *)elp + sizeof(size_t) + elp->len;
		rp = (u_int8_t *)rp - len;
		rp = (u_int8_t *)((ALIGNTYPE)rp & ~(align - 1));

		/*
		 * Rp may now point before elp->links, in which case the chunk
		 * was too small, and we have to try again.
		 */
		if ((u_int8_t *)rp < (u_int8_t *)&elp->links)
			continue;

		*(void **)retp = rp;

		/*
		 * If there are at least 32 bytes of additional memory, divide
		 * the chunk into two chunks.  The new chunk's length is
		 * written in the size_t just below rp, and the free chunk
		 * that stays on the list shrinks by that length plus the
		 * length field itself.
		 */
		if ((u_int8_t *)rp >= (u_int8_t *)&elp->links + 32) {
			sp = rp;
			*--sp = elp->len -
			    ((u_int8_t *)rp - (u_int8_t *)&elp->links);
			elp->len -= *sp + sizeof(size_t);
			return (0);
		}

		/*
		 * Otherwise, we return the entire chunk, wasting some amount
		 * of space to keep the list compact.  However, because the
		 * address we're returning to the user may not be the address
		 * of the start of the region for alignment reasons, set the
		 * size_t length fields back to the "real" length field to a
		 * flag value, so that we can find the real length during free.
		 */
#define	ILLEGAL_SIZE	1
		SH_LIST_REMOVE(elp, links, __data);
		for (sp = rp; (u_int8_t *)--sp >= (u_int8_t *)&elp->links;)
			*sp = ILLEGAL_SIZE;
		return (0);
	}

	/* Nothing found large enough; need to figure out how to grow region. */
	return (ENOMEM);
}

/*
 * __db_shalloc_free --
 *	Free a shared memory allocation.
 *
 *	regionp is the area handed to __db_shalloc_init and ptr an address
 *	previously stored by __db_shalloc.  The chunk goes back on the
 *	address-ordered free list and is merged with an adjacent free
 *	chunk on either side.
 *
 * PUBLIC: void __db_shalloc_free __P((void *, void *));
 */
void
__db_shalloc_free(regionp, ptr)
	void *regionp, *ptr;
{
	struct __data *elp, *lastp, *newp;
	struct __head *hp;
	size_t free_size, *sp;
	int merged;

	/*
	 * Step back over flagged length fields to find the beginning of
	 * the object and its real size.
	 */
	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp);
	ptr = sp;

	/* The chunk header (its length field) sits one size_t below ptr. */
	newp = (struct __data *)((u_int8_t *)ptr - sizeof(size_t));
	free_size = newp->len;

	/*
	 * Walk the list, looking for where this entry goes.
	 *
	 * We keep the free list sorted by address so that coalescing is
	 * trivial.
	 *
	 * XXX
	 * Probably worth profiling this to see how expensive it is.
	 */
	hp = (struct __head *)regionp;
	for (elp = SH_LIST_FIRST(hp, __data), lastp = NULL;
	    elp != NULL && (void *)elp < (void *)ptr;
	    lastp = elp, elp = SH_LIST_NEXT(elp, links, __data));

	/*
	 * Elp is either NULL (we reached the end of the list), or the slot
	 * after the one that's being returned.  Lastp is either NULL (we're
	 * returning the first element of the list) or the element before the
	 * one being returned.
	 *
	 * Check for coalescing with the next element.
	 */
	merged = 0;
	if ((u_int8_t *)ptr + free_size == (u_int8_t *)elp) {
		newp->len += elp->len + sizeof(size_t);
		SH_LIST_REMOVE(elp, links, __data);
		if (lastp != NULL)
			SH_LIST_INSERT_AFTER(lastp, newp, links, __data);
		else
			SH_LIST_INSERT_HEAD(hp, newp, links, __data);
		merged = 1;
	}

	/* Check for coalescing with the previous element. */
	if (lastp != NULL && (u_int8_t *)lastp +
	    lastp->len + sizeof(size_t) == (u_int8_t *)newp) {
		lastp->len += newp->len + sizeof(size_t);

		/*
		 * If we have already put the new element into the list take
		 * it back off again because it's just been merged with the
		 * previous element.
		 */
		if (merged)
			SH_LIST_REMOVE(newp, links, __data);
		merged = 1;
	}

	/*
	 * No neighbor to merge with: link the chunk in at its sorted
	 * position.  The else below pairs with the inner if.
	 */
	if (!merged)
		if (lastp == NULL)
			SH_LIST_INSERT_HEAD(hp, newp, links, __data);
		else
			SH_LIST_INSERT_AFTER(lastp, newp, links, __data);
}

/*
 * __db_shalloc_count --
 *	Return the amount of memory on the free list.
 *
 *	This is the sum of the len fields of the free chunks; the size_t
 *	length field of each chunk is not included.
 *
 * PUBLIC: size_t __db_shalloc_count __P((void *));
 */
size_t
__db_shalloc_count(addr)
	void *addr;
{
	struct __data *elp;
	size_t count;

	count = 0;
	for (elp = SH_LIST_FIRST((struct __head *)addr, __data);
	    elp != NULL;
	    elp = SH_LIST_NEXT(elp, links, __data))
		count += elp->len;

	return (count);
}

/*
 * __db_shsizeof --
 *	Return the size of a shalloc'd piece of memory.
 *
 *	The value is the len field stored in the chunk's header, which may
 *	be larger than the length originally requested.
 *
 * PUBLIC: size_t __db_shsizeof __P((void *));
 */
size_t
__db_shsizeof(ptr)
	void *ptr;
{
	struct __data *elp;
	size_t *sp;

	/*
	 * Step back over flagged length fields to find the beginning of
	 * the object and its real size.
	 */
	for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp);

	elp = (struct __data *)((u_int8_t *)sp - sizeof(size_t));
	return (elp->len);
}

#ifdef DEBUG
/*
 * __db_shalloc_dump --
 *	Print the address and length of every chunk on the free list to
 *	fp (stderr if fp is NULL), followed by a newline.
 *
 * PUBLIC: void __db_shalloc_dump __P((FILE *, void *));
 */
void
__db_shalloc_dump(fp, addr)
	FILE *fp;
	void *addr;
{
	struct __data *elp;

	if (fp == NULL)
		fp = stderr;

	for (elp = SH_LIST_FIRST((struct __head *)addr, __data);
	    elp != NULL;
	    elp = SH_LIST_NEXT(elp, links, __data))
		fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len);
	fprintf(fp, "\n");
}
#endif
diff --git a/db2/common/db_shash.c b/db2/common/db_shash.c
new file mode 100644
index 0000000000..988de8a994
--- /dev/null
+++ b/db2/common/db_shash.c
@@ -0,0 +1,90 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997
 *	Sleepycat Software.  All rights reserved.
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_shash.c 10.3 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "common_ext.h" + +/* Powers-of-2 and close-by prime number pairs. */ +static const struct { + int power; + int prime; +} list[] = { + { 64, 67}, + { 128, 131}, + { 256, 257}, + { 512, 521}, + {1024, 1031}, + {2048, 2053}, + {4096, 4099}, + {8192, 8191}, + {0, 0} +}; + +/* + * __db_tablesize -- + * Choose a size for the hash table. + * + * PUBLIC: int __db_tablesize __P((int)); + */ +int +__db_tablesize(n_buckets) + int n_buckets; +{ + int i; + + /* + * We try to be clever about how big we make the hash tables. Pick + * a prime number close to the "suggested" number of elements that + * will be in the hash table. We shoot for minimum collisions (i.e. + * one element in each bucket). We use 64 as the minimum table size. + * + * Ref: Sedgewick, Algorithms in C, "Hash Functions" + */ + if (n_buckets < 64) + n_buckets = 64; + + for (i = 0;; ++i) { + if (list[i].power == 0) { + --i; + break; + } + if (list[i].power >= n_buckets) + break; + } + return (list[i].prime); +} + +/* + * __db_hashinit -- + * Initialize a hash table that resides in shared memory. + * + * PUBLIC: void __db_hashinit __P((void *, int)); + */ +void +__db_hashinit(begin, nelements) + void *begin; + int nelements; +{ + int i; + SH_TAILQ_HEAD(hash_head) *headp; + + headp = (struct hash_head *)begin; + + for (i = 0; i < nelements; i++, headp++) + SH_TAILQ_INIT(headp); +} diff --git a/db2/compat.h b/db2/compat.h new file mode 100644 index 0000000000..5183befd60 --- /dev/null +++ b/db2/compat.h @@ -0,0 +1,10 @@ +/* Compatibility gunk for the db library. */ + +#include <sys/types.h> + +#define EFTYPE EINVAL + +/* Emulate Solaris llseek(). 
*/ +typedef loff_t offset_t; + +extern int llseek (int fd, loff_t offset, int whence); diff --git a/db2/config.h b/db2/config.h new file mode 100644 index 0000000000..ed1246d0f4 --- /dev/null +++ b/db2/config.h @@ -0,0 +1,142 @@ +/* config.h. Generated automatically by configure. */ +/* config.h.in. Generated automatically from configure.in by autoheader. */ + +/* ...but edited by hand to be used in GNU libc. */ +#include <endian.h> +#include <sys/stat.h> /* To get _STATBUF_ST_BLKSIZE. */ + +/* Define to empty if the keyword does not work. */ +/* #undef const */ + +/* Define if your struct stat has st_blksize. */ +#ifdef _STATBUF_ST_BLKSIZE +# define HAVE_ST_BLKSIZE 1 +#endif + +/* Define to `int' if <sys/types.h> doesn't define. */ +/* #undef mode_t */ + +/* Define to `long' if <sys/types.h> doesn't define. */ +/* #undef off_t */ + +/* Define to `int' if <sys/types.h> doesn't define. */ +/* #undef pid_t */ + +/* Define to `unsigned' if <sys/types.h> doesn't define. */ +/* #undef size_t */ + +/* Define if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define if your processor stores words with the most significant + byte first (like Motorola and SPARC, unlike Intel and VAX). */ +#if __BYTE_ORDER == BIG_ENDIAN +# define WORDS_BIGENDIAN 1 +#endif + +/* Define to `int' if <sys/types.h> doesn't define. */ +/* #undef ssize_t */ + +/* Define if you want a debugging version. */ +/* #undef DEBUG */ + +/* Define if you have sigfillset (and sigprocmask). */ +#define HAVE_SIGFILLSET 1 + +/* Define if seeking to 64-bit file offsets requires the _llseek() call. */ +/* #undef HAVE_LLSEEK */ + +/* Define if seeking to 64-bit file offsets requires the _lseeki64() call. */ +/* #undef HAVE_LSEEKI */ + +/* Define if you have spinlocks. */ +/* #undef HAVE_SPINLOCKS */ + +/* Define if you want to use mc68020/gcc assembly spinlocks. */ +/* #undef HAVE_ASSEM_MC68020_GCC */ + +/* Define if you want to use sparc/gcc assembly spinlocks. 
*/ +/* #undef HAVE_ASSEM_SPARC_GCC */ + +/* Define if you want to use uts4/cc assembly spinlocks. */ +/* #undef HAVE_ASSEM_UTS4_CC */ + +/* Define if you want to use x86/gcc assembly spinlocks. */ +/* #undef HAVE_ASSEM_X86_GCC */ + +/* Define if you have the AIX _check_lock spinlocks. */ +/* #undef HAVE_FUNC_AIX */ + +/* Define if you have the OSF1 or HPPA msemaphore spinlocks. */ +/* #undef HAVE_FUNC_MSEM */ + +/* Define if you have the SGI abilock_t spinlocks. */ +/* #undef HAVE_FUNC_SGI */ + +/* Define if you have the Solaris mutex_t spinlocks. */ +/* #undef HAVE_FUNC_SOLARIS */ + +/* Define if your sprintf returns a pointer, not a length. */ +/* #undef SPRINTF_RET_CHARPNT */ + +/* Define if you have the getcwd function. */ +#define HAVE_GETCWD 1 + +/* Define if you have the getopt function. */ +#define HAVE_GETOPT 1 + +/* Define if you have the getuid function. */ +#define HAVE_GETUID 1 + +/* Define if you have the memcmp function. */ +#define HAVE_MEMCMP 1 + +/* Define if you have the memcpy function. */ +#define HAVE_MEMCPY 1 + +/* Define if you have the memmove function. */ +#define HAVE_MEMMOVE 1 + +/* Define if you have the mmap function. */ +#define HAVE_MMAP 1 + +/* Define if you have the raise function. */ +#define HAVE_RAISE 1 + +/* Define if you have the select function. */ +#define HAVE_SELECT 1 + +/* Define if you have the snprintf function. */ +#define HAVE_SNPRINTF 1 + +/* Define if you have the strdup function. */ +#define HAVE_STRDUP 1 + +/* Define if you have the strerror function. */ +#define HAVE_STRERROR 1 + +/* Define if you have the strsep function. */ +#define HAVE_STRSEP 1 + +/* Define if you have the vsnprintf function. */ +#define HAVE_VSNPRINTF 1 + +/* Define if you have the <dirent.h> header file. */ +#define HAVE_DIRENT_H 1 + +/* Define if you have the <ndir.h> header file. */ +/* #undef HAVE_NDIR_H */ + +/* Define if you have the <sys/dir.h> header file. 
*/ +/* #undef HAVE_SYS_DIR_H */ + +/* Define if you have the <sys/ndir.h> header file. */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define if you have the <sys/select.h> header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define if you have the <sys/time.h> header file. */ +#define HAVE_SYS_TIME_H 1 + +#include_next <config.h> diff --git a/db2/db.h b/db2/db.h new file mode 100644 index 0000000000..3769579c58 --- /dev/null +++ b/db2/db.h @@ -0,0 +1,796 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db.h.src 10.67 (Sleepycat) 8/25/97 + */ + +#ifndef _DB_H_ +#define _DB_H_ + +#ifndef __NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#endif + +/* + * XXX + * MacOS: ensure that Metrowerks C makes enumeration types int sized. + */ +#ifdef __MWERKS__ +#pragma enumsalwaysint on +#endif + +/* + * XXX + * Handle function prototypes and the keyword "const". This steps on name + * space that DB doesn't control, but all of the other solutions are worse. + */ +#undef __P +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* ANSI C prototypes */ +#else +#define const +#define __P(protos) () /* K&R C preprocessor */ +#endif + +/* + * !!! + * DB needs basic information about specifically sized types. If they're + * not provided by the system, typedef them here. + * + * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__, + * as does BIND and Kerberos, since we don't know for sure what #include + * files the user is using. + * + * !!! + * We also provide the standard u_int, u_long etc., if they're not provided + * by the system. This isn't completely necessary, but the example programs + * need them. 
+ */ +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ + + + + + +#endif + + + + + + +#define DB_VERSION_MAJOR 2 +#define DB_VERSION_MINOR 3 +#define DB_VERSION_PATCH 4 +#define DB_VERSION_STRING "Sleepycat Software: DB 2.3.4: (8/20/97)" + +typedef u_int32_t db_pgno_t; /* Page number type. */ +typedef u_int16_t db_indx_t; /* Page offset type. */ +#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ + +typedef u_int32_t db_recno_t; /* Record number type. */ +typedef size_t DB_LOCK; /* Object returned by lock manager. */ +#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ + +#define DB_FILE_ID_LEN 20 /* DB file ID length. */ + +/* Forward structure declarations, so applications get type checking. */ +struct __db; typedef struct __db DB; +#ifdef DB_DBM_HSEARCH + typedef struct __db DBM; +#endif +struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT; +struct __db_dbt; typedef struct __db_dbt DBT; +struct __db_env; typedef struct __db_env DB_ENV; +struct __db_info; typedef struct __db_info DB_INFO; +struct __db_lockregion; typedef struct __db_lockregion DB_LOCKREGION; +struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ; +struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB; +struct __db_log; typedef struct __db_log DB_LOG; +struct __db_lsn; typedef struct __db_lsn DB_LSN; +struct __db_mpool; typedef struct __db_mpool DB_MPOOL; +struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT; +struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT; +struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE; +struct __db_txn; typedef struct __db_txn DB_TXN; +struct __db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE; +struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT; +struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR; +struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION; +struct __dbc; typedef struct __dbc DBC; + +/* Key/data structure -- 
a Data-Base Thang. */ +struct __db_dbt { + void *data; /* key/data */ + u_int32_t size; /* key/data length */ + u_int32_t ulen; /* RO: length of user buffer. */ + u_int32_t dlen; /* RO: get/put record length. */ + u_int32_t doff; /* RO: get/put record offset. */ + +#define DB_DBT_INTERNAL 0x01 /* Perform any mallocs using regular + malloc, not the user's malloc. */ +#define DB_DBT_MALLOC 0x02 /* Return in allocated memory. */ +#define DB_DBT_PARTIAL 0x04 /* Partial put/get. */ +#define DB_DBT_USERMEM 0x08 /* Return in user's memory. */ + u_int32_t flags; +}; + +/* + * Database configuration and initialization. + */ + /* + * Flags understood by both db_open(3) and db_appinit(3). + */ +#define DB_CREATE 0x00001 /* O_CREAT: create file as necessary. */ +#define DB_NOMMAP 0x00002 /* Don't mmap underlying file. */ +#define DB_THREAD 0x00004 /* Free-thread DB package handles. */ + +/* + * Flags understood by db_appinit(3). + * + * DB_APP_INIT and DB_MUTEXDEBUG are internal only, and not documented. + */ +/* 0x00007 COMMON MASK. */ +#define DB_APP_INIT 0x00008 /* Appinit called, paths initialized. */ +#define DB_INIT_LOCK 0x00010 /* Initialize locking. */ +#define DB_INIT_LOG 0x00020 /* Initialize logging. */ +#define DB_INIT_MPOOL 0x00040 /* Initialize mpool. */ +#define DB_INIT_TXN 0x00080 /* Initialize transactions. */ +#define DB_MPOOL_PRIVATE 0x00100 /* Mpool: private memory pool. */ +#define DB_MUTEXDEBUG 0x00200 /* Do not get/set mutexes in regions. */ +#define DB_RECOVER 0x00400 /* Run normal recovery. */ +#define DB_RECOVER_FATAL 0x00800 /* Run catastrophic recovery. */ +#define DB_TXN_NOSYNC 0x01000 /* Do not sync log on commit. */ +#define DB_USE_ENVIRON 0x02000 /* Use the environment. */ +#define DB_USE_ENVIRON_ROOT 0x04000 /* Use the environment if root. */ + +/* CURRENTLY UNUSED LOCK FLAGS. */ +#define DB_TXN_LOCK_2PL 0x00000 /* Two-phase locking. */ +#define DB_TXN_LOCK_OPTIMISTIC 0x00000 /* Optimistic locking. 
*/ +#define DB_TXN_LOCK_MASK 0x00000 /* Lock flags mask. */ + +/* CURRENTLY UNUSED LOG FLAGS. */ +#define DB_TXN_LOG_REDO 0x00000 /* Redo-only logging. */ +#define DB_TXN_LOG_UNDO 0x00000 /* Undo-only logging. */ +#define DB_TXN_LOG_UNDOREDO 0x00000 /* Undo/redo write-ahead logging. */ +#define DB_TXN_LOG_MASK 0x00000 /* Log flags mask. */ + +/* + * Flags understood by db_open(3). + * + * DB_EXCL and DB_TEMPORARY are internal only, and not documented. + * DB_SEQUENTIAL is currently internal, but likely to be exported some day. + */ +/* 0x00007 COMMON MASK. */ +/* 0x07fff ALREADY USED. */ +#define DB_EXCL 0x08000 /* O_EXCL: exclusive open. */ +#define DB_RDONLY 0x10000 /* O_RDONLY: read-only. */ +#define DB_SEQUENTIAL 0x20000 /* Indicate sequential access. */ +#define DB_TEMPORARY 0x40000 /* Remove on last close. */ +#define DB_TRUNCATE 0x80000 /* O_TRUNC: replace existing DB. */ + +/* + * Deadlock detector modes; used in the DB_ENV structure to configure the + * locking subsystem. + */ +#define DB_LOCK_NORUN 0x0 +#define DB_LOCK_DEFAULT 0x1 +#define DB_LOCK_OLDEST 0x2 +#define DB_LOCK_RANDOM 0x3 +#define DB_LOCK_YOUNGEST 0x4 + +struct __db_env { + int db_lorder; /* Byte order. */ + + /* Error message callback. */ + void (*db_errcall) __P((const char *, char *)); + FILE *db_errfile; /* Error message file stream. */ + const char *db_errpfx; /* Error message prefix. */ + int db_verbose; /* Generate debugging messages. */ + + /* User paths. */ + char *db_home; /* Database home. */ + char *db_log_dir; /* Database log file directory. */ + char *db_tmp_dir; /* Database tmp file directory. */ + + char **db_data_dir; /* Database data file directories. */ + int data_cnt; /* Database data file slots. */ + int data_next; /* Next Database data file slot. */ + + /* Locking. */ + DB_LOCKTAB *lk_info; /* Return from lock_open(). */ + u_int8_t *lk_conflicts; /* Two dimensional conflict matrix. */ + int lk_modes; /* Number of lock modes in table. 
*/ + unsigned int lk_max; /* Maximum number of locks. */ + u_int32_t lk_detect; /* Deadlock detect on every conflict. */ + int (*db_yield) __P((void)); /* Yield function for threads. */ + + /* Logging. */ + DB_LOG *lg_info; /* Return from log_open(). */ + u_int32_t lg_max; /* Maximum file size. */ + + /* Memory pool. */ + DB_MPOOL *mp_info; /* Return from memp_open(). */ + size_t mp_mmapsize; /* Maximum file size for mmap. */ + size_t mp_size; /* Bytes in the mpool cache. */ + + /* Transactions. */ + DB_TXNMGR *tx_info; /* Return from txn_open(). */ + unsigned int tx_max; /* Maximum number of transactions. */ + int (*tx_recover) /* Dispatch function for recovery. */ + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + + u_int32_t flags; /* Flags. */ +}; + +/******************************************************* + * Access methods. + *******************************************************/ +typedef enum { + DB_BTREE=1, /* B+tree. */ + DB_HASH, /* Extended Linear Hashing. */ + DB_RECNO, /* Fixed and variable-length records. */ + DB_UNKNOWN /* Figure it out on open. */ +} DBTYPE; + +#define DB_BTREEVERSION 6 /* Current btree version. */ +#define DB_BTREEOLDVER 6 /* Oldest btree version supported. */ +#define DB_BTREEMAGIC 0x053162 + +#define DB_HASHVERSION 5 /* Current hash version. */ +#define DB_HASHOLDVER 4 /* Oldest hash version supported. */ +#define DB_HASHMAGIC 0x061561 + +#define DB_LOGVERSION 2 /* Current log version. */ +#define DB_LOGOLDVER 2 /* Oldest log version supported. */ +#define DB_LOGMAGIC 0x040988 + +struct __db_info { + int db_lorder; /* Byte order. */ + size_t db_cachesize; /* Underlying cache size. */ + size_t db_pagesize; /* Underlying page size. */ + + /* Local heap allocation. */ + void *(*db_malloc) __P((size_t)); + + /* Btree access method. */ + int bt_maxkey; /* Maximum keys per page. */ + int bt_minkey; /* Minimum keys per page. */ + int (*bt_compare) /* Comparison function. 
*/ + __P((const DBT *, const DBT *)); + size_t (*bt_prefix) /* Prefix function. */ + __P((const DBT *, const DBT *)); + + /* Hash access method. */ + unsigned int h_ffactor; /* Fill factor. */ + unsigned int h_nelem; /* Number of elements. */ + u_int32_t (*h_hash) /* Hash function. */ + __P((const void *, u_int32_t)); + + /* Recno access method. */ + int re_pad; /* Fixed-length padding byte. */ + int re_delim; /* Variable-length delimiting byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + char *re_source; /* Source file name. */ + +#define DB_DELIMITER 0x0001 /* Recno: re_delim set. */ +#define DB_DUP 0x0002 /* Btree, Hash: duplicate keys. */ +#define DB_FIXEDLEN 0x0004 /* Recno: fixed-length records. */ +#define DB_PAD 0x0008 /* Recno: re_pad set. */ +#define DB_RECNUM 0x0010 /* Btree: record numbers. */ +#define DB_RENUMBER 0x0020 /* Recno: renumber on insert/delete. */ +#define DB_SNAPSHOT 0x0040 /* Recno: snapshot the input. */ + u_int32_t flags; +}; + +/* + * DB access method and cursor operation codes. These are implemented as + * bit fields for future flexibility, but currently only a single one may + * be specified to any function. 
+ */ +#define DB_AFTER 0x000001 /* c_put() */ +#define DB_APPEND 0x000002 /* put() */ +#define DB_BEFORE 0x000004 /* c_put() */ +#define DB_CHECKPOINT 0x000008 /* log_put(), log_get() */ +#define DB_CURRENT 0x000010 /* c_get(), c_put(), log_get() */ +#define DB_FIRST 0x000020 /* c_get(), log_get() */ +#define DB_FLUSH 0x000040 /* log_put() */ +#define DB_GET_RECNO 0x000080 /* c_get() */ +#define DB_KEYFIRST 0x000100 /* c_put() */ +#define DB_KEYLAST 0x000200 /* c_put() */ +#define DB_LAST 0x000400 /* c_get(), log_get() */ +#define DB_NEXT 0x000800 /* c_get(), log_get() */ +#define DB_NOOVERWRITE 0x001000 /* put() */ +#define DB_NOSYNC 0x002000 /* close() */ +#define DB_PREV 0x004000 /* c_get(), log_get() */ +#define DB_RECORDCOUNT 0x008000 /* stat() */ +#define DB_SET 0x010000 /* c_get(), log_get() */ +#define DB_SET_RANGE 0x020000 /* c_get() */ +#define DB_SET_RECNO 0x040000 /* get(), c_get() */ + +/* DB (user visible) error return codes. */ +#define DB_INCOMPLETE ( -1) /* Sync didn't finish. */ +#define DB_KEYEMPTY ( -2) /* The key/data pair was deleted or + was never created by the user. */ +#define DB_KEYEXIST ( -3) /* The key/data pair already exists. */ +#define DB_LOCK_DEADLOCK ( -4) /* Locker killed to resolve deadlock. */ +#define DB_LOCK_NOTGRANTED ( -5) /* Lock unavailable, no-wait set. */ +#define DB_LOCK_NOTHELD ( -6) /* Lock not held by locker. */ +#define DB_NOTFOUND ( -7) /* Key/data pair not found (EOF). */ + +/* DB (private) error return codes. */ +#define DB_DELETED ( -8) /* Recovery file marked deleted. */ +#define DB_NEEDSPLIT ( -9) /* Page needs to be split. */ +#define DB_REGISTERED (-10) /* Entry was previously registered. */ +#define DB_SWAPBYTES (-11) /* Database needs byte swapping. */ + +struct __db_ilock { /* Internal DB access method lock. */ + db_pgno_t pgno; /* Page being locked. */ + /* File id. */ + u_int8_t fileid[DB_FILE_ID_LEN]; +}; + +/* DB access method description structure. 
*/ +struct __db { + void *mutex; /* Synchronization for free threading */ + DBTYPE type; /* DB access method. */ + DB_ENV *dbenv; /* DB_ENV structure. */ + DB_ENV *mp_dbenv; /* DB_ENV for local mpool creation. */ + + DB *master; /* Original DB created by db_open. */ + void *internal; /* Access method private. */ + + DB_MPOOL *mp; /* The access method's mpool. */ + DB_MPOOLFILE *mpf; /* The access method's mpool file. */ + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * TAILQ_HEAD(curs_queue, __dbc); + */ + struct { + struct __dbc *tqh_first; + struct __dbc **tqh_last; + } curs_queue; + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * LIST_HEAD(handleq, __db); + * LIST_ENTRY(__db); + */ + struct { + struct __db *lh_first; + } handleq; /* List of handles for this DB. */ + struct { + struct __db *le_next; + struct __db **le_prev; + } links; /* Links for the handle list. */ + + u_int32_t log_fileid; /* Logging file id. */ + + DB_TXN *txn; /* Current transaction. */ + u_int32_t locker; /* Default process' locker id. */ + DBT lock_dbt; /* DBT referencing lock. */ + struct __db_ilock lock; /* Lock. */ + + size_t pgsize; /* Logical page size of file. */ + + /* Local heap allocation. */ + void *(*db_malloc) __P((size_t)); + + /* Functions. */ + int (*close) __P((DB *, int)); + int (*cursor) __P((DB *, DB_TXN *, DBC **)); + int (*del) __P((DB *, DB_TXN *, DBT *, int)); + int (*fd) __P((DB *, int *)); + int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, int)); + int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, int)); + int (*stat) __P((DB *, void *, void *(*)(size_t), int)); + int (*sync) __P((DB *, int)); + +#define DB_AM_DUP 0x000001 /* DB_DUP (internal). */ +#define DB_AM_INMEM 0x000002 /* In-memory; no sync on close. */ +#define DB_AM_LOCKING 0x000004 /* Perform locking. */ +#define DB_AM_LOGGING 0x000008 /* Perform logging. */ +#define DB_AM_MLOCAL 0x000010 /* Database memory pool is local. 
*/ +#define DB_AM_PGDEF 0x000020 /* Page size was defaulted. */ +#define DB_AM_RDONLY 0x000040 /* Database is readonly. */ +#define DB_AM_RECOVER 0x000080 /* In recovery (do not log or lock). */ +#define DB_AM_SWAP 0x000100 /* Pages need to be byte-swapped. */ +#define DB_AM_THREAD 0x000200 /* DB is multi-threaded. */ +#define DB_BT_RECNUM 0x000400 /* DB_RECNUM (internal) */ +#define DB_HS_DIRTYMETA 0x000800 /* Hash: Metadata page modified. */ +#define DB_RE_DELIMITER 0x001000 /* DB_DELIMITER (internal). */ +#define DB_RE_FIXEDLEN 0x002000 /* DB_FIXEDLEN (internal). */ +#define DB_RE_PAD 0x004000 /* DB_PAD (internal). */ +#define DB_RE_RENUMBER 0x008000 /* DB_RENUMBER (internal). */ +#define DB_RE_SNAPSHOT 0x010000 /* DB_SNAPSHOT (internal). */ + + u_int32_t flags; +}; + +/* Cursor description structure. */ +struct __dbc { + DB *dbp; /* Related DB access method. */ + DB_TXN *txn; /* Associated transaction. */ + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * TAILQ_ENTRY(__dbc); + */ + struct { + struct __dbc *tqe_next; + struct __dbc **tqe_prev; + } links; + + void *internal; /* Access method private. */ + + int (*c_close) __P((DBC *)); + int (*c_del) __P((DBC *, int)); + int (*c_get) __P((DBC *, DBT *, DBT *, int)); + int (*c_put) __P((DBC *, DBT *, DBT *, int)); +}; + +/* Btree/recno statistics structure. */ +struct __db_bt_stat { + u_int32_t bt_flags; /* Open flags. */ + u_int32_t bt_maxkey; /* Maxkey value. */ + u_int32_t bt_minkey; /* Minkey value. */ + u_int32_t bt_re_len; /* Fixed-length record length. */ + u_int32_t bt_re_pad; /* Fixed-length record pad. */ + u_int32_t bt_pagesize; /* Page size. */ + u_int32_t bt_levels; /* Tree levels. */ + u_int32_t bt_nrecs; /* Number of records. */ + u_int32_t bt_int_pg; /* Internal pages. */ + u_int32_t bt_leaf_pg; /* Leaf pages. */ + u_int32_t bt_dup_pg; /* Duplicate pages. */ + u_int32_t bt_over_pg; /* Overflow pages. */ + u_int32_t bt_free; /* Pages on the free list. 
*/ + u_int32_t bt_freed; /* Pages freed for reuse. */ + u_int32_t bt_int_pgfree; /* Bytes free in internal pages. */ + u_int32_t bt_leaf_pgfree; /* Bytes free in leaf pages. */ + u_int32_t bt_dup_pgfree; /* Bytes free in duplicate pages. */ + u_int32_t bt_over_pgfree; /* Bytes free in overflow pages. */ + u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ + u_int32_t bt_split; /* Total number of splits. */ + u_int32_t bt_rootsplit; /* Root page splits. */ + u_int32_t bt_fastsplit; /* Fast splits. */ + u_int32_t bt_added; /* Items added. */ + u_int32_t bt_deleted; /* Items deleted. */ + u_int32_t bt_get; /* Items retrieved. */ + u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ + u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int db_appinit __P((const char *, char * const *, DB_ENV *, int)); +int db_appexit __P((DB_ENV *)); +int db_open __P((const char *, DBTYPE, int, int, DB_ENV *, DB_INFO *, DB **)); +const char *db_version __P((int *, int *, int *)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Locking + *******************************************************/ +#define DB_LOCKVERSION 1 +#define DB_LOCKMAGIC 0x090193 + +/* Flag values for lock_vec(). */ +#define DB_LOCK_NOWAIT 0x01 /* Don't wait on unavailable lock. */ + +/* Flag values for lock_detect(). */ +#define DB_LOCK_CONFLICT 0x01 /* Run on any conflict. */ + +/* Request types. */ +typedef enum { + DB_LOCK_DUMP, /* Display held locks. */ + DB_LOCK_GET, /* Get the lock. */ + DB_LOCK_PUT, /* Release the lock. */ + DB_LOCK_PUT_ALL, /* Release locker's locks. */ + DB_LOCK_PUT_OBJ /* Release locker's locks on obj. */ +} db_lockop_t; + +/* Simple R/W lock modes and for multi-granularity intention locking. */ +typedef enum { + DB_LOCK_NG=0, /* Not granted. */ + DB_LOCK_READ, /* Shared/read. */ + DB_LOCK_WRITE, /* Exclusive/write. 
*/ + DB_LOCK_IREAD, /* Intent to share/read. */ + DB_LOCK_IWRITE, /* Intent exclusive/write. */ + DB_LOCK_IWR /* Intent to read and write. */ +} db_lockmode_t; + +/* Lock request structure. */ +struct __db_lockreq { + db_lockop_t op; /* Operation. */ + db_lockmode_t mode; /* Requested mode. */ + u_int32_t locker; /* Locker identity. */ + DBT *obj; /* Object being locked. */ + DB_LOCK lock; /* Lock returned. */ +}; + +/* + * Commonly used conflict matrices. + * + * Standard Read/Write (or exclusive/shared) locks. + */ +#define DB_LOCK_RW_N 3 +extern const u_int8_t db_rw_conflicts[]; + +/* Multi-granularity locking. */ +#define DB_LOCK_RIW_N 6 +extern const u_int8_t db_riw_conflicts[]; + +#if defined(__cplusplus) +extern "C" { +#endif +int lock_close __P((DB_LOCKTAB *)); +int lock_detect __P((DB_LOCKTAB *, int, u_int32_t)); +int lock_get __P((DB_LOCKTAB *, + u_int32_t, int, const DBT *, db_lockmode_t, DB_LOCK *)); +int lock_id __P((DB_LOCKTAB *, u_int32_t *)); +int lock_open __P((const char *, int, int, DB_ENV *, DB_LOCKTAB **)); +int lock_put __P((DB_LOCKTAB *, DB_LOCK)); +int lock_unlink __P((const char *, int, DB_ENV *)); +int lock_vec __P((DB_LOCKTAB *, + u_int32_t, int, DB_LOCKREQ *, int, DB_LOCKREQ **)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Logging. + *******************************************************/ +/* Flag values for log_archive(). */ +#define DB_ARCH_ABS 0x001 /* Absolute pathnames. */ +#define DB_ARCH_DATA 0x002 /* Data files. */ +#define DB_ARCH_LOG 0x004 /* Log files. */ + +/* + * A DB_LSN has two parts, a fileid which identifies a specific file, and an + * offset within that file. The fileid is an unsigned 4-byte quantity that + * uniquely identifies a file within the log directory -- currently a simple + * counter inside the log. The offset is also an unsigned 4-byte value. 
The + * log manager guarantees the offset is never more than 4 bytes by switching + * to a new log file before the maximum length imposed by an unsigned 4-byte + * offset is reached. + */ +struct __db_lsn { + u_int32_t file; /* File ID. */ + u_int32_t offset; /* File offset. */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int log_archive __P((DB_LOG *, char **[], int, void *(*)(size_t))); +int log_close __P((DB_LOG *)); +int log_compare __P((const DB_LSN *, const DB_LSN *)); +int log_file __P((DB_LOG *, const DB_LSN *, char *, size_t)); +int log_flush __P((DB_LOG *, const DB_LSN *)); +int log_get __P((DB_LOG *, DB_LSN *, DBT *, int)); +int log_open __P((const char *, int, int, DB_ENV *, DB_LOG **)); +int log_put __P((DB_LOG *, DB_LSN *, const DBT *, int)); +int log_register __P((DB_LOG *, DB *, const char *, DBTYPE, u_int32_t *)); +int log_unlink __P((const char *, int, DB_ENV *)); +int log_unregister __P((DB_LOG *, u_int32_t)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Mpool + *******************************************************/ +/* Flag values for memp_fget(). */ +#define DB_MPOOL_CREATE 0x001 /* Create a page. */ +#define DB_MPOOL_LAST 0x002 /* Return the last page. */ +#define DB_MPOOL_NEW 0x004 /* Create a new page. */ + +/* Flag values for memp_fput(), memp_fset(). */ +#define DB_MPOOL_CLEAN 0x001 /* Clear modified bit. */ +#define DB_MPOOL_DIRTY 0x002 /* Page is modified. */ +#define DB_MPOOL_DISCARD 0x004 /* Don't cache the page. */ + +/* Mpool statistics structure. */ +struct __db_mpool_stat { + size_t st_cachesize; /* Cache size. */ + unsigned long st_cache_hit; /* Pages found in the cache. */ + unsigned long st_cache_miss; /* Pages not found in the cache. */ + unsigned long st_map; /* Pages from mapped files. */ + unsigned long st_page_create; /* Pages created in the cache. */ + unsigned long st_page_in; /* Pages read in. */ + unsigned long st_page_out; /* Pages written out. 
*/ + unsigned long st_ro_evict; /* Read-only pages evicted. */ + unsigned long st_rw_evict; /* Read-write pages evicted. */ + unsigned long st_hash_buckets; /* Number of hash buckets. */ + unsigned long st_hash_searches; /* Total hash chain searches. */ + unsigned long st_hash_longest; /* Longest hash chain searched. */ + unsigned long st_hash_examined; /* Total hash entries searched. */ +}; + +/* Mpool file statistics structure. */ +struct __db_mpool_fstat { + char *file_name; /* File name. */ + size_t st_pagesize; /* Page size. */ + unsigned long st_cache_hit; /* Pages found in the cache. */ + unsigned long st_cache_miss; /* Pages not found in the cache. */ + unsigned long st_map; /* Pages from mapped files. */ + unsigned long st_page_create; /* Pages created in the cache. */ + unsigned long st_page_in; /* Pages read in. */ + unsigned long st_page_out; /* Pages written out. */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int memp_close __P((DB_MPOOL *)); +int memp_fclose __P((DB_MPOOLFILE *)); +int memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, unsigned long, void *)); +int memp_fopen __P((DB_MPOOL *, const char *, + int, int, int, size_t, int, DBT *, u_int8_t *, DB_MPOOLFILE **)); +int memp_fput __P((DB_MPOOLFILE *, void *, unsigned long)); +int memp_fset __P((DB_MPOOLFILE *, void *, unsigned long)); +int memp_fsync __P((DB_MPOOLFILE *)); +int memp_open __P((const char *, int, int, DB_ENV *, DB_MPOOL **)); +int memp_register __P((DB_MPOOL *, int, + int (*)(db_pgno_t, void *, DBT *), + int (*)(db_pgno_t, void *, DBT *))); +int memp_stat __P((DB_MPOOL *, + DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, void *(*)(size_t))); +int memp_sync __P((DB_MPOOL *, DB_LSN *)); +int memp_unlink __P((const char *, int, DB_ENV *)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Transactions. 
+ *******************************************************/ +#define DB_TXNVERSION 1 +#define DB_TXNMAGIC 0x041593 + +/* Operations values to the tx_recover() function. */ +#define DB_TXN_BACKWARD_ROLL 1 /* Read the log backwards. */ +#define DB_TXN_FORWARD_ROLL 2 /* Read the log forwards. */ +#define DB_TXN_OPENFILES 3 /* Read for open files. */ +#define DB_TXN_REDO 4 /* Redo the operation. */ +#define DB_TXN_UNDO 5 /* Undo the operation. */ + +/* Internal transaction status values. */ + +/* Transaction statistics structure. */ +struct __db_txn_active { + u_int32_t txnid; /* Transaction ID */ + DB_LSN lsn; /* Lsn of the begin record */ +}; + +struct __db_txn_stat { + DB_LSN st_last_ckp; /* lsn of the last checkpoint */ + DB_LSN st_pending_ckp; /* last checkpoint did not finish */ + time_t st_time_ckp; /* time of last checkpoint */ + u_int32_t st_last_txnid; /* last transaction id given out */ + u_int32_t st_maxtxns; /* maximum number of active txns */ + u_int32_t st_naborts; /* number of aborted transactions */ + u_int32_t st_nbegins; /* number of begun transactions */ + u_int32_t st_ncommits; /* number of committed transactions */ + u_int32_t st_nactive; /* number of active transactions */ + DB_TXN_ACTIVE *st_txnarray; /* array of active transactions */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int txn_abort __P((DB_TXN *)); +int txn_begin __P((DB_TXNMGR *, DB_TXN *, DB_TXN **)); +int txn_checkpoint __P((const DB_TXNMGR *, long, long)); +int txn_commit __P((DB_TXN *)); +int txn_close __P((DB_TXNMGR *)); +u_int32_t txn_id __P((DB_TXN *)); +int txn_open __P((const char *, int, int, DB_ENV *, DB_TXNMGR **)); +int txn_prepare __P((DB_TXN *)); +int txn_stat __P((DB_TXNMGR *, DB_TXN_STAT **, void *(*)(size_t))); +int txn_unlink __P((const char *, int, DB_ENV *)); +#if defined(__cplusplus) +}; +#endif + +#ifdef DB_DBM_HSEARCH +/******************************************************* + * Dbm/Ndbm historic interfaces. 
+ *******************************************************/ +#define DBM_INSERT 0 /* Flags to dbm_store(). */ +#define DBM_REPLACE 1 + +/* + * The db(3) support for ndbm(3) always appends this suffix to the + * file name to avoid overwriting the user's original database. + */ +#define DBM_SUFFIX ".db" + +typedef struct { + char *dptr; + int dsize; +} datum; + +#if defined(__cplusplus) +extern "C" { +#endif +int dbminit __P((char *)); +#if !defined(__cplusplus) +int delete __P((datum)); +#endif +datum fetch __P((datum)); +datum firstkey __P((void)); +datum nextkey __P((datum)); +int store __P((datum, datum)); + +/* + * !!! + * Don't prototype: + * + * dbm_clearerr(DBM *db); + * dbm_dirfno(DBM *db); + * dbm_error(DBM *db); + * dbm_pagfno(DBM *db); + * dbm_rdonly(DBM *db); + * + * they weren't documented and were historically implemented as #define's. + */ +void dbm_close __P((DBM *)); +int dbm_delete __P((DBM *, datum)); +datum dbm_fetch __P((DBM *, datum)); +datum dbm_firstkey __P((DBM *)); +long dbm_forder __P((DBM *, datum)); +datum dbm_nextkey __P((DBM *)); +DBM *dbm_open __P((const char *, int, int)); +int dbm_store __P((DBM *, datum, datum, int)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Hsearch historic interface. + *******************************************************/ +typedef enum { + FIND, ENTER +} ACTION; + +typedef struct entry { + char *key; + void *data; +} ENTRY; + +#if defined(__cplusplus) +extern "C" { +#endif +int hcreate __P((unsigned int)); +void hdestroy __P((void)); +ENTRY *hsearch __P((ENTRY, ACTION)); +#if defined(__cplusplus) +}; +#endif +#endif /* DB_DBM_HSEARCH */ + +/* + * XXX + * MacOS: Reset Metrowerks C enum sizes. 
+ */ +#ifdef __MWERKS__ +#pragma enumsalwaysint reset +#endif +#endif /* !_DB_H_ */ diff --git a/db2/db/db.c b/db2/db/db.c new file mode 100644 index 0000000000..df3a9d2d21 --- /dev/null +++ b/db2/db/db.c @@ -0,0 +1,818 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db.c 10.37 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_shash.h" +#include "db_swap.h" +#include "btree.h" +#include "hash.h" +#include "mp.h" +#include "db_am.h" +#include "common_ext.h" + +static int db_close __P((DB *, int)); +static int db_fd __P((DB *, int *)); + +/* + * If the metadata page has the flag set, set the local flag. If the page + * does NOT have the flag set, return EINVAL if the user's dbinfo argument + * caused us to already set the local flag. + */ +#define DBINFO_FCHK(dbp, fn, meta_flags, m_name, dbp_name) { \ + if ((meta_flags) & (m_name)) \ + F_SET(dbp, dbp_name); \ + else \ + if (F_ISSET(dbp, dbp_name)) { \ + __db_err(dbenv, \ + "%s: %s specified in dbinfo argument but not set in file", \ + fname, fn); \ + goto einval; \ + } \ +} + +/* + * db_open -- + * Main library interface to the DB access methods. 
+ */ +int +db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) + const char *fname; + DBTYPE type; + int flags, mode; + DB_ENV *dbenv; + DB_INFO *dbinfo; + DB **dbpp; +{ + BTMETA *btm; + DB *dbp; + DBT pgcookie; + DB_ENV *envp, t_dbenv; + DB_PGINFO pginfo; + HASHHDR *hashm; + off_t io; + size_t cachesize; + ssize_t nr; + int fd, ftype, need_fileid, restore, ret, retry_cnt, swapped; + char *real_name, mbuf[512]; + + /* Validate arguments. */ +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_NOMMAP | DB_RDONLY | DB_THREAD | DB_TRUNCATE) +#else +#define OKFLAGS (DB_CREATE | DB_NOMMAP | DB_RDONLY | DB_TRUNCATE) +#endif + if ((ret = __db_fchk(dbenv, "db_open", flags, OKFLAGS)) != 0) + return (ret); + + /* Initialize for error return. */ + fd = -1; + need_fileid = 1; + real_name = NULL; + + /* Allocate the DB structure, reference the DB_ENV structure. */ + if ((dbp = (DB *)calloc(1, sizeof(DB))) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); + } + dbp->dbenv = dbenv; + + /* Convert the dbinfo flags. */ + if (dbinfo != NULL) { + /* + * !!! + * We can't check for illegal flags until we know what type + * of open we're doing. + */ + if (F_ISSET(dbinfo, DB_DELIMITER)) + F_SET(dbp, DB_RE_DELIMITER); + if (F_ISSET(dbinfo, DB_DUP)) + F_SET(dbp, DB_AM_DUP); + if (F_ISSET(dbinfo, DB_FIXEDLEN)) + F_SET(dbp, DB_RE_FIXEDLEN); + if (F_ISSET(dbinfo, DB_PAD)) + F_SET(dbp, DB_RE_PAD); + if (F_ISSET(dbinfo, DB_RECNUM)) + F_SET(dbp, DB_BT_RECNUM); + if (F_ISSET(dbinfo, DB_RENUMBER)) + F_SET(dbp, DB_RE_RENUMBER); + if (F_ISSET(dbinfo, DB_SNAPSHOT)) + F_SET(dbp, DB_RE_SNAPSHOT); + } + + /* Set based on the open(2) flags. */ + if (LF_ISSET(DB_RDONLY)) + F_SET(dbp, DB_AM_RDONLY); + + /* Check threading fields. 
*/ + if (LF_ISSET(DB_THREAD)) { + if ((dbp->mutex = + (db_mutex_t *)malloc(sizeof(db_mutex_t))) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + ret = ENOMEM; + goto err; + } + __db_mutex_init(dbp->mutex, 0); + + F_SET(dbp, DB_AM_THREAD); + } + + /* + * Always set the master and initialize the queues, so we can + * use these fields without checking the thread bit. + */ + dbp->master = dbp; + LIST_INIT(&dbp->handleq); + LIST_INSERT_HEAD(&dbp->handleq, dbp, links); + TAILQ_INIT(&dbp->curs_queue); + + /* + * Set based on the dbenv fields, although no logging or transactions + * are possible for temporary files. + */ + if (dbp->dbenv != NULL) { + if (dbenv->lk_info != NULL) + F_SET(dbp, DB_AM_LOCKING); + if (fname != NULL && dbenv->lg_info != NULL) + F_SET(dbp, DB_AM_LOGGING); + } + + /* Set the common fields. */ + if (dbinfo == NULL) { + dbp->pgsize = 0; + dbp->db_malloc = NULL; + } else { + dbp->pgsize = dbinfo->db_pagesize; + dbp->db_malloc = dbinfo->db_malloc; + } + + /* Fill in the default file mode. */ + if (mode == 0) + mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; + + /* Check if the user wants us to swap byte order. */ + if (dbinfo != NULL) + switch (ret = __db_byteorder(dbenv, dbinfo->db_lorder)) { + case 0: + break; + case DB_SWAPBYTES: + F_SET(dbp, DB_AM_SWAP); + break; + default: + goto err; + } + + /* + * If we have a file name, try and read the first page, figure out + * what type of file it is, and initialize everything we can based + * on that file's meta-data page. + * + * XXX + * We don't actually expect zero-length strings as arguments. We + * do the check, permitting them, because scripting languages, e.g., + * the Tcl test suite, don't know anything about passing NULLs. + */ + if (fname != NULL && fname[0] != '\0') { + /* Get the real file name. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, NULL, fname, NULL, &real_name)) != 0) + goto err; + + /* + * Open the backing file. 
We need to make sure that multiple + * processes attempting to create the file at the same time + * are properly ordered so that only one of them creates the + * "unique" file id, so we open it O_EXCL and O_CREAT so two + * simultaneous attempts to create the region will return + * failure in one of the attempts. If we're one of the ones + * that fail, we simply retry without the O_CREAT flag, which + * will require that the meta-data page exist. + */ +#undef OKFLAGS +#define OKFLAGS \ + DB_CREATE | DB_NOMMAP | DB_RDONLY | DB_THREAD | DB_TRUNCATE + retry_cnt = 0; +open_retry: if (LF_ISSET(DB_CREATE)) { + if ((ret = __db_fdopen(real_name, flags | DB_EXCL, + OKFLAGS | DB_EXCL, mode, &fd)) != 0) + if (ret == EEXIST) { + LF_CLR(DB_CREATE); + goto open_retry; + } else { + __db_err(dbenv, + "%s: %s", fname, strerror(ret)); + goto err; + } + } else + if ((ret = __db_fdopen(real_name, + flags, OKFLAGS, mode, &fd)) != 0) { + __db_err(dbenv, "%s: %s", fname, strerror(ret)); + goto err; + } + + /* + * Use the optimum I/O size as the pagesize if a pagesize not + * specified. Some filesystems have 64K as their optimum I/O + * size, but as that results in impossibly large default cache + * sizes, we limit the default pagesize to 16K. + */ + if (dbp->pgsize == 0) { + if ((ret = __db_stat(dbp->dbenv, + real_name, fd, NULL, &io)) != 0) + goto err; + if (io < 512) + io = 512; + if (io > 16 * 1024) + io = 16 * 1024; + dbp->pgsize = io; + F_SET(dbp, DB_AM_PGDEF); + } + + /* + * Try and read the first disk sector -- this code assumes + * that the meta-data for all access methods fits in 512 + * bytes, and that no database will be smaller than that. + */ + if ((ret = __db_read(fd, mbuf, sizeof(mbuf), &nr)) != 0) + goto err; + + /* The fd is no longer needed. 
*/ + (void)__db_close(fd); + fd = -1; + + if (nr != sizeof(mbuf)) { + if (nr != 0) { + __db_err(dbenv, + "%s: unexpected file format", fname); + goto einval; + } + /* + * The only way we can reach here with the DB_CREATE + * flag set is if we created the file. If we didn't + * create the file, there's a chance that someone else + * is busily doing so. Sleep and give them a chance, + * because we need the metadata page their going to + * write. + */ + if (!LF_ISSET(DB_CREATE) && retry_cnt++ < 3) { + __db_sleep(1, 0); + goto open_retry; + } + if (type == DB_UNKNOWN) { + __db_err(dbenv, + "%s: DBTYPE of unknown with empty file", + fname); + goto einval; + } + goto empty; + } + + /* + * A found file overrides some user information. We'll check + * for possible error conditions based on conflicts between + * the file and the user's arguments below. + */ + swapped = 0; + F_CLR(dbp, DB_AM_SWAP); + +retry: switch (((BTMETA *)mbuf)->magic) { + case DB_BTREEMAGIC: + if (type != DB_BTREE && + type != DB_RECNO && type != DB_UNKNOWN) + goto einval; + + btm = (BTMETA *)mbuf; + if (swapped && (ret = __bam_mswap((PAGE *)btm)) != 0) + goto err; + + if (btm->version < DB_BTREEOLDVER || + btm->version > DB_BTREEVERSION) { + __db_err(dbenv, + "%s: unsupported btree version number %lu", + fname, (u_long)btm->version); + goto einval; + } + dbp->pgsize = btm->pagesize; + F_CLR(dbp, DB_AM_PGDEF); + + if ((ret = __db_fchk(dbenv, + "db_open", btm->flags, BTM_MASK)) != 0) + goto err; + DBINFO_FCHK(dbp, "DB_DUP", + btm->flags, BTM_DUP, DB_AM_DUP); + if (F_ISSET(btm, BTM_RECNO)) { + DBINFO_FCHK(dbp, "DB_FIXEDLEN", + btm->flags, BTM_FIXEDLEN, DB_RE_FIXEDLEN); + DBINFO_FCHK(dbp, "DB_RENUMBER", + btm->flags, BTM_RENUMBER, DB_RE_RENUMBER); + type = DB_RECNO; + } else { + DBINFO_FCHK(dbp, "DB_RECNUM", + btm->flags, BTM_RECNUM, DB_BT_RECNUM); + type = DB_BTREE; + } + + /* Copy the file's unique id. 
*/ + need_fileid = 0; + memcpy(dbp->lock.fileid, btm->uid, DB_FILE_ID_LEN); + break; + case DB_HASHMAGIC: + if (type != DB_HASH && type != DB_UNKNOWN) + goto einval; + + hashm = (HASHHDR *)mbuf; + if (swapped && (ret = __ham_mswap((PAGE *)hashm)) != 0) + goto err; + + if (hashm->version < DB_HASHOLDVER || + hashm->version > DB_HASHVERSION) { + __db_err(dbenv, + "%s: unsupported hash version number %lu", + fname, hashm->version); + goto einval; + } + dbp->pgsize = hashm->pagesize; + F_CLR(dbp, DB_AM_PGDEF); + + if ((ret = __db_fchk(dbenv, + "db_open", hashm->flags, DB_HASH_DUP)) != 0) + goto err; + DBINFO_FCHK(dbp, "DB_DUP", + hashm->flags, DB_HASH_DUP, DB_AM_DUP); + type = DB_HASH; + + /* Copy the file's unique id. */ + need_fileid = 0; + memcpy(dbp->lock.fileid, hashm->uid, DB_FILE_ID_LEN); + break; + default: + if (swapped) { + __db_err(dbenv, "unrecognized file type"); + goto einval; + } + M_32_SWAP(((BTMETA *)mbuf)->magic); + F_SET(dbp, DB_AM_SWAP); + + swapped = 1; + goto retry; + } + } else { + fname = real_name = NULL; + + if (type == DB_UNKNOWN) { + __db_err(dbenv, + "DBTYPE of unknown without existing file"); + goto einval; + } + F_SET(dbp, DB_AM_INMEM); + } + +empty: /* + * By the time we get here we've either set the type or we're taking + * it from the user. + */ + dbp->type = type; + + /* + * Set the page size to the best value for I/O to this file. Don't + * overflow the page offset type. The page size must be db_indx_t + * aligned and >= MIN_PAGE_SIZE. + * + * XXX + * Should we be checking for a page size that's not a multiple of 512? + */ + if (dbp->pgsize == 0) { + F_SET(dbp, DB_AM_PGDEF); + dbp->pgsize = 8 * 1024; + } + if (dbp->pgsize < DB_MIN_PGSIZE || + dbp->pgsize > DB_MAX_PGSIZE || + dbp->pgsize & (sizeof(db_indx_t) - 1)) { + __db_err(dbenv, "illegal page size"); + goto einval; + } + + /* + * Set and/or correct the cache size; must be a multiple of the + * page size. 
+ */ + if (dbinfo == NULL || dbinfo->db_cachesize == 0) + cachesize = dbp->pgsize * DB_MINCACHE; + else { + cachesize = dbinfo->db_cachesize; + if (cachesize & (dbp->pgsize - 1)) + cachesize += (~cachesize & (dbp->pgsize - 1)) + 1; + if (cachesize < dbp->pgsize * DB_MINCACHE) + cachesize = dbp->pgsize * DB_MINCACHE; + if (cachesize < 20 * 1024) + cachesize = 20 * 1024; + } + + /* + * If no mpool supplied by the application, attach to a local, + * created buffer pool. + * + * XXX + * If the user has a DB_ENV structure, we have to use a temporary + * one so that we don't step on their values. If the user doesn't, + * we have to create one, and keep it around until the call to the + * memp_close() function. This is all so the mpool functions get + * the error stuff right. + */ + if (dbenv == NULL || dbenv->mp_info == NULL) { + F_SET(dbp, DB_AM_MLOCAL); + + if (dbenv == NULL) { + if ((dbp->mp_dbenv = + (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + ret = ENOMEM; + goto err; + } + + envp = dbp->mp_dbenv; + restore = 0; + } else { + t_dbenv = *dbenv; + + envp = dbenv; + restore = 1; + } + envp->mp_size = cachesize; + F_SET(envp, DB_MPOOL_PRIVATE); + if ((ret = memp_open(NULL, + DB_CREATE, S_IRUSR | S_IWUSR, envp, &dbp->mp)) != 0) + goto err; + if (restore) + *dbenv = t_dbenv; + } else + dbp->mp = dbenv->mp_info; + + /* Register DB's pgin/pgout functions. */ + if ((ret = memp_register(dbp->mp, + DB_FTYPE_BTREE, __bam_pgin, __bam_pgout)) != 0) + goto err; + if ((ret = memp_register(dbp->mp, + DB_FTYPE_HASH, __ham_pgin, __ham_pgout)) != 0) + goto err; + + /* + * If we don't already have one, get a unique file ID. If the file + * is a temporary file, then we have to create a unique file ID -- + * no backing file will be created until the mpool cache is filled + * forcing it to go to disk. 
The created ID must never match any + * potential real file ID -- we know it won't because real file IDs + * contain a time stamp after the dev/ino pair, and we're simply + * storing a 4-byte locker ID. + * + * XXX + * Store the file id in the locker structure -- we can get it from + * there as necessary, and it saves having two copies. + */ + if (need_fileid) + if (fname == NULL) { + memset(dbp->lock.fileid, 0, DB_FILE_ID_LEN); + if (F_ISSET(dbp, DB_AM_LOCKING) && + (ret = lock_id(dbenv->lk_info, + (u_int32_t *)dbp->lock.fileid)) != 0) + goto err; + } else + if ((ret = __db_fileid(dbenv, + real_name, 1, dbp->lock.fileid)) != 0) + goto err; + + /* No further use for the real name. */ + if (real_name != NULL) + FREES(real_name); + real_name = NULL; + + /* + * Open a backing file in the memory pool. + * + * If we need to process the file's pages on I/O, set the file type. + * If it's a hash file, always call pgin and pgout routines. This + * means that hash files can never be mapped into process memory. If + * it's a btree file and requires swapping, we need to page the file + * in and out. This has to be right -- we can't mmap files that are + * being paged in and out. + */ + if (type == DB_HASH) + ftype = DB_FTYPE_HASH; + else + ftype = F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_BTREE : 0; + pginfo.db_pagesize = dbp->pgsize; + pginfo.needswap = F_ISSET(dbp, DB_AM_SWAP); + pgcookie.data = &pginfo; + pgcookie.size = sizeof(DB_PGINFO); + + if ((ret = memp_fopen(dbp->mp, fname, ftype, + F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0, 0, dbp->pgsize, + 0, &pgcookie, dbp->lock.fileid, &dbp->mpf)) != 0) + goto err; + + /* Get a log file id. */ + if (F_ISSET(dbp, DB_AM_LOGGING) && + (ret = log_register(dbenv->lg_info, + dbp, fname, type, &dbp->log_fileid)) != 0) + goto err; + + /* + * Get a locker id for this DB, and build the lock cookie: the first + * db_pgno_t bytes are the page number, the next N bytes are the file + * id. 
+ */ + if (F_ISSET(dbp, DB_AM_LOCKING)) { + if ((ret = lock_id(dbenv->lk_info, &dbp->locker)) != 0) + goto err; + dbp->lock_dbt.size = sizeof(dbp->lock); + dbp->lock_dbt.data = &dbp->lock; + } + + /* Call the real open function. */ + switch (type) { + case DB_BTREE: + if (dbinfo != NULL && (ret = __db_fchk(dbenv, + "db_open", dbinfo->flags, DB_RECNUM | DB_DUP)) != 0) + goto err; + if (dbinfo != NULL && (ret = __db_fcchk(dbenv, + "db_open", dbinfo->flags, DB_DUP, DB_RECNUM)) != 0) + goto err; + if ((ret = __bam_open(dbp, type, dbinfo)) != 0) + goto err; + break; + case DB_HASH: + if (dbinfo != NULL && (ret = __db_fchk(dbenv, + "db_open", dbinfo->flags, DB_DUP)) != 0) + goto err; + if ((ret = __ham_open(dbp, dbinfo)) != 0) + goto err; + break; + case DB_RECNO: +#define DB_INFO_FLAGS \ + (DB_DELIMITER | DB_FIXEDLEN | DB_PAD | DB_RENUMBER | DB_SNAPSHOT) + if (dbinfo != NULL && (ret = __db_fchk(dbenv, + "db_open", dbinfo->flags, DB_INFO_FLAGS)) != 0) + goto err; + if ((ret = __ram_open(dbp, type, dbinfo)) != 0) + goto err; + break; + default: + abort(); + } + + /* Call a local close routine. */ + dbp->close = db_close; + dbp->fd = db_fd; + + *dbpp = dbp; + return (0); + +einval: ret = EINVAL; +err: /* Close the file descriptor. */ + if (fd != -1) + (void)__db_close(fd); + + /* Discard the log file id. */ + if (dbp->log_fileid != 0) + (void)log_unregister(dbenv->lg_info, dbp->log_fileid); + + /* Close the memory pool file. */ + if (dbp->mpf != NULL) + (void)memp_fclose(dbp->mpf); + + /* If the memory pool was local, close it. */ + if (F_ISSET(dbp, DB_AM_MLOCAL) && dbp->mp != NULL) + (void)memp_close(dbp->mp); + + /* If we allocated a DB_ENV, discard it. */ + if (dbp->mp_dbenv != NULL) + FREE(dbp->mp_dbenv, sizeof(DB_ENV)); + + if (real_name != NULL) + FREES(real_name); + if (dbp != NULL) + FREE(dbp, sizeof(DB)); + + return (ret); +} + +/* + * db_close -- + * Close a DB tree. 
+ */ +static int +db_close(dbp, flags) + DB *dbp; + int flags; +{ + DBC *dbc; + DB *tdbp; + int ret, t_ret; + + ret = 0; + + /* Sync the underlying file. */ + if (!LF_ISSET(DB_NOSYNC) && + (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0) + ret = t_ret; + + /* + * Call the underlying access method close routine for all the + * cursors and handles. + */ + for (tdbp = LIST_FIRST(&dbp->handleq); + tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) { + + while ((dbc = TAILQ_FIRST(&tdbp->curs_queue)) != NULL) + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + switch (tdbp->type) { + case DB_BTREE: + if ((t_ret = __bam_close(tdbp)) != 0 && ret == 0) + ret = t_ret; + break; + case DB_HASH: + if ((t_ret = __ham_close(tdbp)) != 0 && ret == 0) + ret = t_ret; + break; + case DB_RECNO: + if ((t_ret = __ram_close(tdbp)) != 0 && ret == 0) + ret = t_ret; + break; + default: + abort(); + } + + } + + /* Sync the memory pool. */ + if ((t_ret = memp_fsync(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + + /* Close the memory pool file. */ + if ((t_ret = memp_fclose(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + + /* If the memory pool was local, close it. */ + if (F_ISSET(dbp, DB_AM_MLOCAL) && + (t_ret = memp_close(dbp->mp)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the mutex. */ + if (dbp->mutex != NULL) + FREE(dbp->mutex, sizeof(db_mutex_t)); + + /* Discard the log file id. */ + if (F_ISSET(dbp, DB_AM_LOGGING)) + (void)log_unregister(dbp->dbenv->lg_info, dbp->log_fileid); + + /* Discard the lock cookie for all handles. */ + for (tdbp = LIST_FIRST(&dbp->handleq); + tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) + if (F_ISSET(tdbp, DB_AM_LOCKING)) { +#ifdef DEBUG + DB_LOCKREQ request; + + /* + * If we're running tests, display any locks currently + * held. It's possible that some applications may hold + * locks for long periods, e.g., conference room locks, + * but the DB tests should never close holding locks. 
+ */ + request.op = DB_LOCK_DUMP; + if ((t_ret = lock_vec(tdbp->dbenv->lk_info, + tdbp->locker, 0, &request, 1, NULL)) != 0 && + ret == 0) + ret = EAGAIN; +#endif + } + + /* If we allocated a DB_ENV, discard it. */ + if (dbp->mp_dbenv != NULL) + FREE(dbp->mp_dbenv, sizeof(DB_ENV)); + + /* Free all of the DB's. */ + LIST_REMOVE(dbp, links); + while ((tdbp = LIST_FIRST(&dbp->handleq)) != NULL) { + LIST_REMOVE(tdbp, links); + FREE(tdbp, sizeof(*tdbp)); + } + FREE(dbp, sizeof(*dbp)); + + return (ret); +} + +/* + * db_fd -- + * Return a file descriptor for flock'ing. + */ +static int +db_fd(dbp, fdp) + DB *dbp; + int *fdp; +{ + /* In-memory database can't have a file descriptor. */ + if (F_ISSET(dbp, DB_AM_INMEM)) + return (ENOENT); + + /* + * XXX + * Truly spectacular layering violation. As we don't open the + * underlying file until we need it, it may not be initialized. + */ + if ((*fdp = dbp->mpf->fd) == -1) + return (ENOENT); + return (0); +} + +/* + * __db_pgerr -- + * Error when unable to retrieve a specified page. + * + * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t)); + */ +int +__db_pgerr(dbp, pgno) + DB *dbp; + db_pgno_t pgno; +{ + __db_err(dbp->dbenv, + "unable to create/retrieve page %lu", (u_long)pgno); + return (__db_panic(dbp)); +} + +/* + * __db_pgfmt -- + * Error when a page has the wrong format. + * + * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t)); + */ +int +__db_pgfmt(dbp, pgno) + DB *dbp; + db_pgno_t pgno; +{ + __db_err(dbp->dbenv, + "page %lu: illegal page type or format", (u_long)pgno); + return (__db_panic(dbp)); +} diff --git a/db2/db/db.src b/db2/db/db.src new file mode 100644 index 0000000000..a3e2f7b75c --- /dev/null +++ b/db2/db/db.src @@ -0,0 +1,154 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * @(#)db.src 10.3 (Sleepycat) 8/18/97 + */ +#include "config.h" + +PREFIX db + +/* + * addrem -- Add or remove an entry from a duplicate page. 
+ * + * opcode: identifies if this is an add or delete. + * fileid: file identifier of the file being modified. + * pgno: duplicate page number. + * indx: location at which to insert or delete. + * nbytes: number of bytes added/removed to/from the page. + * hdr: header for the data item. + * dbt: data that is deleted or is to be added. + * pagelsn: former lsn of the page. + * + * If the hdr was NULL then, the dbt is a regular B_KEYDATA. + * If the dbt was NULL then the hdr is a complete item to be + * pasted on the page. + */ +BEGIN addrem +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +ARG indx u_int32_t lu +ARG nbytes size_t lu +DBT hdr DBT s +DBT dbt DBT s +POINTER pagelsn DB_LSN * lu +END + +/* + * split -- Handles the split of a duplicate page. + * + * opcode: defines whether we are splitting from or splitting onto + * fileid: file identifier of the file being modified. + * pgno: page number being split. + * pageimage: entire page contents. + * pagelsn: former lsn of the page. + */ +BEGIN split +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +DBT pageimage DBT s +POINTER pagelsn DB_LSN * lu +END + +/* + * big -- Handles addition and deletion of big key/data items. + * + * opcode: identifies get/put. + * fileid: file identifier of the file being modified. + * pgno: page onto which data is being added/removed. + * prev_pgno: the page before the one we are logging. + * next_pgno: the page after the one we are logging. + * dbt: data being written onto the page. + * pagelsn: former lsn of the orig_page. + * prevlsn: former lsn of the prev_pgno. + * nextlsn: former lsn of the next_pgno. This is not currently used, but + * may be used later if we actually do overwrites of big key/ + * data items in place. 
+ */ +BEGIN big +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +ARG prev_pgno db_pgno_t lu +ARG next_pgno db_pgno_t lu +DBT dbt DBT s +POINTER pagelsn DB_LSN * lu +POINTER prevlsn DB_LSN * lu +POINTER nextlsn DB_LSN * lu +END + +/* + * ovref -- Handles increment of overflow page reference count. + * + * fileid: identifies the file being modified. + * pgno: page number being incremented. + * lsn: the page's original lsn. + */ +BEGIN ovref +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +END + +/* + * relink -- Handles relinking around a page. + * + * pgno: the page being changed. + * lsn: the page's original lsn. + * prev: the previous page. + * lsn_prev: the previous page's original lsn. + * next: the next page. + * lsn_next: the next page's original lsn. + */ +BEGIN relink +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG prev db_pgno_t lu +POINTER lsn_prev DB_LSN * lu +ARG next db_pgno_t lu +POINTER lsn_next DB_LSN * lu +END + +/* + * Addpage -- Handles adding a new duplicate page onto the end of + * an existing duplicate page. + * fileid: identifies the file being changed. + * pgno: page number to which a new page is being added. + * lsn: lsn of pgno. + * nextpgno: new page number being added. + * nextlsn: lsn of nextpgno. + */ +BEGIN addpage +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG nextpgno db_pgno_t lu +POINTER nextlsn DB_LSN * lu +END + +/* + * Debug -- log an operation upon entering an access method. + * op: Operation (cursor, c_close, c_get, c_put, c_del, + * get, put, delete). + * fileid: identifies the file being acted upon. + * key: key parameter + * data: data parameter + * flags: flags parameter + */ +BEGIN debug +DBT op DBT s +ARG fileid u_int32_t lu +DBT key DBT s +DBT data DBT s +ARG arg_flags u_int32_t lu +END + +/* + * noop -- do nothing, but get an LSN. 
+ */ +BEGIN noop +END diff --git a/db2/db/db_auto.c b/db2/db/db_auto.c new file mode 100644 index 0000000000..4684f1a39f --- /dev/null +++ b/db2/db/db_auto.c @@ -0,0 +1,1462 @@ +/* Do not edit: automatically built by dist/db_gen.sh. */ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __db_addrem_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, u_int32_t, + * PUBLIC: size_t, DBT *, DBT *, DB_LSN *)); + */ +int __db_addrem_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, pgno, indx, nbytes, hdr, + dbt, pagelsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + u_int32_t indx; + size_t nbytes; + DBT *hdr; + DBT *dbt; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_addrem; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(indx) + + sizeof(nbytes) + + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size) + + sizeof(u_int32_t) + (dbt == NULL ? 
0 : dbt->size) + + sizeof(*pagelsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &nbytes, sizeof(nbytes)); + bp += sizeof(nbytes); + if (hdr == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &hdr->size, sizeof(hdr->size)); + bp += sizeof(hdr->size); + memcpy(bp, hdr->data, hdr->size); + bp += hdr->size; + } + if (dbt == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &dbt->size, sizeof(dbt->size)); + bp += sizeof(dbt->size); + memcpy(bp, dbt->data, dbt->size); + bp += dbt->size; + } + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_addrem_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_addrem_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_addrem_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_addrem_read(dbtp->data, &argp)) != 0) + return (ret); + 
printf("[%lu][%lu]db_addrem: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tnbytes: %lu\n", (u_long)argp->nbytes); + printf("\thdr: "); + for (i = 0; i < argp->hdr.size; i++) { + c = ((char *)argp->hdr.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tdbt: "); + for (i = 0; i < argp->dbt.size; i++) { + c = ((char *)argp->dbt.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_addrem_read __P((void *, __db_addrem_args **)); + */ +int +__db_addrem_read(recbuf, argpp) + void *recbuf; + __db_addrem_args **argpp; +{ + __db_addrem_args *argp; + u_int8_t *bp; + + argp = (__db_addrem_args *)malloc(sizeof(__db_addrem_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->nbytes, bp, sizeof(argp->nbytes)); + bp += 
sizeof(argp->nbytes); + memcpy(&argp->hdr.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->hdr.data = bp; + bp += argp->hdr.size; + memcpy(&argp->dbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->dbt.data = bp; + bp += argp->dbt.size; + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_split_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, DBT *, + * PUBLIC: DB_LSN *)); + */ +int __db_split_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, pgno, pageimage, pagelsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + DBT *pageimage; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_split; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(u_int32_t) + (pageimage == NULL ? 
0 : pageimage->size) + + sizeof(*pagelsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pageimage == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pageimage->size, sizeof(pageimage->size)); + bp += sizeof(pageimage->size); + memcpy(bp, pageimage->data, pageimage->size); + bp += pageimage->size; + } + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_split_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_split_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_split_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_split_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", 
(u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpageimage: "); + for (i = 0; i < argp->pageimage.size; i++) { + c = ((char *)argp->pageimage.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_split_read __P((void *, __db_split_args **)); + */ +int +__db_split_read(recbuf, argpp) + void *recbuf; + __db_split_args **argpp; +{ + __db_split_args *argp; + u_int8_t *bp; + + argp = (__db_split_args *)malloc(sizeof(__db_split_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->pageimage.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pageimage.data = bp; + bp += argp->pageimage.size; + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_big_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, + * PUBLIC: db_pgno_t, DBT *, DB_LSN *, DB_LSN *, + * PUBLIC: DB_LSN *)); + */ +int __db_big_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, pgno, prev_pgno, next_pgno, dbt, + pagelsn, prevlsn, nextlsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t 
fileid; + db_pgno_t pgno; + db_pgno_t prev_pgno; + db_pgno_t next_pgno; + DBT *dbt; + DB_LSN * pagelsn; + DB_LSN * prevlsn; + DB_LSN * nextlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_big; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(prev_pgno) + + sizeof(next_pgno) + + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size) + + sizeof(*pagelsn) + + sizeof(*prevlsn) + + sizeof(*nextlsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &prev_pgno, sizeof(prev_pgno)); + bp += sizeof(prev_pgno); + memcpy(bp, &next_pgno, sizeof(next_pgno)); + bp += sizeof(next_pgno); + if (dbt == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &dbt->size, sizeof(dbt->size)); + bp += sizeof(dbt->size); + memcpy(bp, dbt->data, dbt->size); + bp += dbt->size; + } + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + if (prevlsn != NULL) + memcpy(bp, prevlsn, sizeof(*prevlsn)); + else + memset(bp, 0, sizeof(*prevlsn)); + bp += sizeof(*prevlsn); + if (nextlsn != NULL) + memcpy(bp, nextlsn, sizeof(*nextlsn)); + else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); 
+#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_big_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_big_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_big_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_big_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_big: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno); + printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno); + printf("\tdbt: "); + for (i = 0; i < argp->dbt.size; i++) { + c = ((char *)argp->dbt.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tprevlsn: [%lu][%lu]\n", + (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); + printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_big_read __P((void *, __db_big_args **)); + */ +int +__db_big_read(recbuf, argpp) + void *recbuf; + __db_big_args **argpp; +{ + __db_big_args *argp; + u_int8_t *bp; + + argp = (__db_big_args 
*)malloc(sizeof(__db_big_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->prev_pgno, bp, sizeof(argp->prev_pgno)); + bp += sizeof(argp->prev_pgno); + memcpy(&argp->next_pgno, bp, sizeof(argp->next_pgno)); + bp += sizeof(argp->next_pgno); + memcpy(&argp->dbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->dbt.data = bp; + bp += argp->dbt.size; + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->prevlsn, bp, sizeof(argp->prevlsn)); + bp += sizeof(argp->prevlsn); + memcpy(&argp->nextlsn, bp, sizeof(argp->nextlsn)); + bp += sizeof(argp->nextlsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_ovref_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *)); + */ +int __db_ovref_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_ovref; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_ovref_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_ovref_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_ovref_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_ovref_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_ovref: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * 
PUBLIC: int __db_ovref_read __P((void *, __db_ovref_args **)); + */ +int +__db_ovref_read(recbuf, argpp) + void *recbuf; + __db_ovref_args **argpp; +{ + __db_ovref_args *argp; + u_int8_t *bp; + + argp = (__db_ovref_args *)malloc(sizeof(__db_ovref_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_relink_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: DB_LSN *, db_pgno_t, DB_LSN *)); + */ +int __db_relink_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, prev, lsn_prev, next, + lsn_next) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + db_pgno_t prev; + DB_LSN * lsn_prev; + db_pgno_t next; + DB_LSN * lsn_next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_relink; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(prev) + + sizeof(*lsn_prev) + + sizeof(next) + + sizeof(*lsn_next); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &prev, sizeof(prev)); + bp += sizeof(prev); + if (lsn_prev != NULL) + memcpy(bp, lsn_prev, sizeof(*lsn_prev)); + else + memset(bp, 0, sizeof(*lsn_prev)); + bp += sizeof(*lsn_prev); + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); + if (lsn_next != NULL) + memcpy(bp, lsn_next, sizeof(*lsn_next)); + else + memset(bp, 0, sizeof(*lsn_next)); + bp += sizeof(*lsn_next); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_relink_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_relink_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_relink_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_relink_read(dbtp->data, &argp)) != 0) + return (ret); + 
printf("[%lu][%lu]db_relink: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tprev: %lu\n", (u_long)argp->prev); + printf("\tlsn_prev: [%lu][%lu]\n", + (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\tlsn_next: [%lu][%lu]\n", + (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_relink_read __P((void *, __db_relink_args **)); + */ +int +__db_relink_read(recbuf, argpp) + void *recbuf; + __db_relink_args **argpp; +{ + __db_relink_args *argp; + u_int8_t *bp; + + argp = (__db_relink_args *)malloc(sizeof(__db_relink_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->prev, bp, sizeof(argp->prev)); + bp += sizeof(argp->prev); + memcpy(&argp->lsn_prev, bp, sizeof(argp->lsn_prev)); + bp += sizeof(argp->lsn_prev); + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + memcpy(&argp->lsn_next, bp, sizeof(argp->lsn_next)); + bp += sizeof(argp->lsn_next); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int 
__db_addpage_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: DB_LSN *)); + */ +int __db_addpage_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, nextpgno, nextlsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + db_pgno_t nextpgno; + DB_LSN * nextlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_addpage; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(nextpgno) + + sizeof(*nextlsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &nextpgno, sizeof(nextpgno)); + bp += sizeof(nextpgno); + if (nextlsn != NULL) + memcpy(bp, nextlsn, sizeof(*nextlsn)); + else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_addpage_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int 
+__db_addpage_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_addpage_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_addpage_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_addpage: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tnextpgno: %lu\n", (u_long)argp->nextpgno); + printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_addpage_read __P((void *, __db_addpage_args **)); + */ +int +__db_addpage_read(recbuf, argpp) + void *recbuf; + __db_addpage_args **argpp; +{ + __db_addpage_args *argp; + u_int8_t *bp; + + argp = (__db_addpage_args *)malloc(sizeof(__db_addpage_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->nextpgno, bp, sizeof(argp->nextpgno)); + bp += sizeof(argp->nextpgno); + memcpy(&argp->nextlsn, bp, sizeof(argp->nextlsn)); + bp += 
sizeof(argp->nextlsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_debug_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: DBT *, u_int32_t, DBT *, DBT *, + * PUBLIC: u_int32_t)); + */ +int __db_debug_log(logp, txnid, ret_lsnp, flags, + op, fileid, key, data, arg_flags) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + DBT *op; + u_int32_t fileid; + DBT *key; + DBT *data; + u_int32_t arg_flags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_debug; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + (op == NULL ? 0 : op->size) + + sizeof(fileid) + + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size) + + sizeof(arg_flags); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + if (op == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &op->size, sizeof(op->size)); + bp += sizeof(op->size); + memcpy(bp, op->data, op->size); + bp += op->size; + } + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + if (key == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &key->size, sizeof(key->size)); + bp += sizeof(key->size); + memcpy(bp, key->data, key->size); + bp += key->size; + } + if (data == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &data->size, sizeof(data->size)); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + bp += data->size; + } + memcpy(bp, &arg_flags, sizeof(arg_flags)); + bp += sizeof(arg_flags); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_debug_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_debug_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_debug_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_debug_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_debug: rec: %lu txnid %lx prevlsn 
[%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\top: "); + for (i = 0; i < argp->op.size; i++) { + c = ((char *)argp->op.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tkey: "); + for (i = 0; i < argp->key.size; i++) { + c = ((char *)argp->key.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + c = ((char *)argp->data.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\targ_flags: %lu\n", (u_long)argp->arg_flags); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __db_debug_read __P((void *, __db_debug_args **)); + */ +int +__db_debug_read(recbuf, argpp) + void *recbuf; + __db_debug_args **argpp; +{ + __db_debug_args *argp; + u_int8_t *bp; + + argp = (__db_debug_args *)malloc(sizeof(__db_debug_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->op.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->op.data = bp; + bp += argp->op.size; + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->key.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->key.data = bp; + bp += argp->key.size; + memcpy(&argp->data.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + memcpy(&argp->arg_flags, 
bp, sizeof(argp->arg_flags)); + bp += sizeof(argp->arg_flags); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_noop_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t)); + */ +int __db_noop_log(logp, txnid, ret_lsnp, flags) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_db_noop; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __db_noop_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__db_noop_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __db_noop_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __db_noop_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]db_noop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * 
PUBLIC: int __db_noop_read __P((void *, __db_noop_args **)); + */ +int +__db_noop_read(recbuf, argpp) + void *recbuf; + __db_noop_args **argpp; +{ + __db_noop_args *argp; + u_int8_t *bp; + + argp = (__db_noop_args *)malloc(sizeof(__db_noop_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __db_init_print __P((DB_ENV *)); + */ +int +__db_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __db_addrem_print, DB_db_addrem)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_split_print, DB_db_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_big_print, DB_db_big)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_ovref_print, DB_db_ovref)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_relink_print, DB_db_relink)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_addpage_print, DB_db_addpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_debug_print, DB_db_debug)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_noop_print, DB_db_noop)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __db_init_recover __P((DB_ENV *)); + */ +int +__db_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __db_addrem_recover, DB_db_addrem)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_split_recover, DB_db_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_big_recover, DB_db_big)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + 
__db_ovref_recover, DB_db_ovref)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_relink_recover, DB_db_relink)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_addpage_recover, DB_db_addpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_debug_recover, DB_db_debug)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __db_noop_recover, DB_db_noop)) != 0) + return (ret); + return (0); +} + diff --git a/db2/db/db_conv.c b/db2/db/db_conv.c new file mode 100644 index 0000000000..39527c6804 --- /dev/null +++ b/db2/db/db_conv.c @@ -0,0 +1,219 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_conv.c 10.4 (Sleepycat) 8/15/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "db_am.h" + +static int __db_convert __P((db_pgno_t, void *, int)); + +/* + * __db_pgin, __db_pgout -- + * + * PUBLIC: int __db_pgin __P((db_pgno_t, void *)); + * PUBLIC: int __db_pgout __P((db_pgno_t, void *)); + */ +int +__db_pgin(pg, pp) + db_pgno_t pg; + void *pp; +{ + return (__db_convert(pg, pp, 1)); +} + +int +__db_pgout(pg, pp) + db_pgno_t pg; + void *pp; +{ + return (__db_convert(pg, pp, 0)); +} + +/* + * __db_convert -- + * Actually convert a page. + */ +static int +__db_convert(pg, pp, pgin) + db_pgno_t pg; /* Unused, but left for the future. 
*/ + void *pp; + int pgin; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + HKEYDATA *hk; + PAGE *h; + RINTERNAL *ri; + db_indx_t i; + u_int8_t *p; + + h = pp; + if (pgin) { + M_32_SWAP(h->lsn.file); + M_32_SWAP(h->lsn.offset); + M_32_SWAP(h->pgno); + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + M_16_SWAP(h->entries); + M_16_SWAP(h->hf_offset); + } + + switch (h->type) { + case P_HASH: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(h->inp[i]); + + hk = GET_HKEYDATA(h, i); + switch (hk->type) { + case H_KEYDATA: + break; + case H_DUPLICATE: + case H_OFFPAGE: + p = (u_int8_t *)hk + sizeof(u_int8_t); + ++p; + SWAP32(p); /* tlen */ + SWAP32(p); /* pgno */ + SWAP16(p); /* offset */ + SWAP16(p); /* len */ + break; + } + + if (!pgin) + M_16_SWAP(h->inp[i]); + } + break; + case P_LBTREE: + case P_LRECNO: + case P_DUPLICATE: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(h->inp[i]); + + bk = GET_BKEYDATA(h, i); + switch (bk->type) { + case B_KEYDATA: + M_16_SWAP(bk->len); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bo = (BOVERFLOW *)bk; + M_32_SWAP(bo->tlen); + M_32_SWAP(bo->pgno); + break; + } + + if (!pgin) + M_16_SWAP(h->inp[i]); + } + break; + case P_IBTREE: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(h->inp[i]); + + bi = GET_BINTERNAL(h, i); + switch (bi->type) { + case B_KEYDATA: + M_16_SWAP(bi->len); + M_32_SWAP(bi->pgno); + M_32_SWAP(bi->nrecs); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bo = (BOVERFLOW *)bi; + M_32_SWAP(bo->tlen); + M_32_SWAP(bo->pgno); + break; + } + + if (!pgin) + M_16_SWAP(h->inp[i]); + } + break; + case P_IRECNO: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(h->inp[i]); + + ri = GET_RINTERNAL(h, i); + M_32_SWAP(ri->pgno); + M_32_SWAP(ri->nrecs); + + if (!pgin) + M_16_SWAP(h->inp[i]); + } + case P_OVERFLOW: + case P_INVALID: + /* Nothing to do. */ + break; + default: + return (EINVAL); + } + + if (!pgin) { + /* Swap the header information. 
*/ + M_32_SWAP(h->lsn.file); + M_32_SWAP(h->lsn.offset); + M_32_SWAP(h->pgno); + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + M_16_SWAP(h->entries); + M_16_SWAP(h->hf_offset); + } + return (0); +} diff --git a/db2/db/db_dispatch.c b/db2/db/db_dispatch.c new file mode 100644 index 0000000000..3d7b162d75 --- /dev/null +++ b/db2/db/db_dispatch.c @@ -0,0 +1,270 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_dispatch.c 10.5 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * Data structures to manage the DB dispatch table. The dispatch table + * is a dynamically allocated array of pointers to dispatch functions. + * The dispatch_size is the number of entries possible in the current + * dispatch table and the dispatch_valid is the number of valid entries + * in the dispatch table. + */ +static int (**dispatch_table) __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +static u_int32_t dispatch_size = 0; + +/* + * __db_dispatch -- + * + * This is the transaction dispatch function used by the db access methods. + * It is designed to handle the record format used by all the access + * methods (the one automatically generated by the db_{h,log,read}.sh + * scripts in the tools directory). An application using a different + * recovery paradigm will supply a different dispatch function to txn_open. + * + * PUBLIC: int __db_dispatch __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_dispatch(logp, db, lsnp, redo, info) + DB_LOG *logp; /* The log file. 
*/ + DBT *db; /* The log record upon which to dispatch. */ + DB_LSN *lsnp; /* The lsn of the record being dispatched. */ + int redo; /* Redo this op (or undo it). */ + void *info; +{ + u_int32_t rectype, txnid; + + memcpy(&rectype, db->data, sizeof(rectype)); + memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid)); + + switch (redo) { + case TXN_REDO: + case TXN_UNDO: + return ((dispatch_table[rectype])(logp, db, lsnp, redo, info)); + case TXN_OPENFILES: + if (rectype < DB_txn_BEGIN ) + return ((dispatch_table[rectype])(logp, + db, lsnp, redo, info)); + break; + case TXN_BACKWARD_ROLL: + /* + * Running full recovery in the backward pass. If we've + * seen this txnid before and added to it our commit list, + * then we do nothing during this pass. If we've never + * seen it, then we call the appropriate recovery routine + * in "abort mode". + */ + if (__db_txnlist_find(info, txnid) == DB_NOTFOUND) + return ((dispatch_table[rectype])(logp, + db, lsnp, TXN_UNDO, info)); + break; + case TXN_FORWARD_ROLL: + /* + * In the forward pass, if we haven't seen the transaction, + * do nothing, else recovery it. + */ + if (__db_txnlist_find(info, txnid) != DB_NOTFOUND) + return ((dispatch_table[rectype])(logp, + db, lsnp, TXN_REDO, info)); + break; + default: + abort(); + } + return (0); +} + +/* + * __db_add_recovery -- + * + * PUBLIC: int __db_add_recovery __P((DB_ENV *, + * PUBLIC: int (*)(DB_LOG *, DBT *, DB_LSN *, int, void *), u_int32_t)); + */ +int +__db_add_recovery(dbenv, func, ndx) + DB_ENV *dbenv; + int (*func) __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + u_int32_t ndx; +{ + u_int32_t i; + + /* Check if function is already registered. */ + if (dispatch_table && ndx < dispatch_size && + dispatch_table[ndx] != 0 && dispatch_table[ndx] != func) + return (DB_REGISTERED); + + /* Check if we have to grow the table. 
*/ + if (ndx >= dispatch_size) { + if (dispatch_table == NULL) + dispatch_table = (int (**) + __P((DB_LOG *, DBT *, DB_LSN *, int, void *))) + malloc(DB_user_BEGIN * sizeof(dispatch_table[0])); + else + dispatch_table = (int (**) + __P((DB_LOG *, DBT *, DB_LSN *, int, void *))) + realloc(dispatch_table, (DB_user_BEGIN + + dispatch_size) * sizeof(dispatch_table[0])); + if (dispatch_table == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); + } + for (i = dispatch_size, + dispatch_size += DB_user_BEGIN; i < dispatch_size; ++i) + dispatch_table[i] = NULL; + } + + dispatch_table[ndx] = func; + return (0); +} + +/* + * __db_txnlist_init -- + * Initialize transaction linked list. + * + * PUBLIC: int __db_txnlist_init __P((void *)); + */ +int +__db_txnlist_init(retp) + void *retp; +{ + __db_txnhead *headp; + + if ((headp = + (struct __db_txnhead *)malloc(sizeof(struct __db_txnhead))) == NULL) + return (ENOMEM); + + LIST_INIT(&headp->head); + headp->maxid = 0; + + *(void **)retp = headp; + return (0); +} + +/* + * __db_txnlist_add -- + * Add an element to our transaction linked list. + * + * PUBLIC: int __db_txnlist_add __P((void *, u_int32_t)); + */ +int +__db_txnlist_add(listp, txnid) + void *listp; + u_int32_t txnid; +{ + __db_txnhead *hp; + __db_txnlist *elp; + + if ((elp = (__db_txnlist *)malloc(sizeof(__db_txnlist))) == NULL) + return (ENOMEM); + + elp->txnid = txnid; + hp = (struct __db_txnhead *)listp; + LIST_INSERT_HEAD(&hp->head, elp, links); + if (txnid > hp->maxid) + hp->maxid = txnid; + + return (0); +} + +/* + * __db_txnlist_find -- + * Checks to see if txnid is in the txnid list, returns 1 if found, + * 0 if not found. 
+ * + * PUBLIC: int __db_txnlist_find __P((void *, u_int32_t)); + */ +int +__db_txnlist_find(listp, txnid) + void *listp; + u_int32_t txnid; +{ + __db_txnlist *p; + __db_txnhead *hp; + + if ((hp = (struct __db_txnhead *)listp) == NULL) + return (DB_NOTFOUND); + + if (hp->maxid < txnid) { + hp->maxid = txnid; + return (DB_NOTFOUND); + } + + for (p = hp->head.lh_first; p != NULL; p = p->links.le_next) + if (p->txnid == txnid) + return (0); + + return (DB_NOTFOUND); +} + +#ifdef DEBUG +void +__db_txnlist_print(listp) + void *listp; +{ + __db_txnlist *p; + __db_txnhead *hp; + + hp = (struct __db_txnhead *)listp; + printf("Maxid: %lu\n", (u_long)hp->maxid); + for (p = hp->head.lh_first; p != NULL; p = p->links.le_next) + printf("TXNID: %lu\n", (u_long)p->txnid); +} +#endif diff --git a/db2/db/db_dup.c b/db2/db/db_dup.c new file mode 100644 index 0000000000..8d364d518e --- /dev/null +++ b/db2/db/db_dup.c @@ -0,0 +1,680 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_dup.c 10.8 (Sleepycat) 7/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "btree.h" +#include "db_am.h" +#include "common_ext.h" + +static int __db_addpage __P((DB *, + PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); +static int __db_dsplit __P((DB *, + PAGE **, db_indx_t *, u_int32_t, int (*)(DB *, u_int32_t, PAGE **))); + +/* + * __db_dput -- + * Put a duplicate item onto a duplicate page at the given index. 
+ * + * PUBLIC: int __db_dput __P((DB *, + * PUBLIC: DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); + */ +int +__db_dput(dbp, dbt, pp, indxp, newfunc) + DB *dbp; + DBT *dbt; + PAGE **pp; + db_indx_t *indxp; + int (*newfunc) __P((DB *, u_int32_t, PAGE **)); +{ + BOVERFLOW bo; + DBT *data_dbtp, hdr_dbt, *hdr_dbtp; + PAGE *pagep; + db_indx_t size, isize; + db_pgno_t pgno; + int ret; + + /* + * We need some access method independent threshold for when we put + * a duplicate item onto an overflow page. + */ + if (dbt->size > 0.25 * dbp->pgsize) { + if ((ret = __db_poff(dbp, dbt, &pgno, newfunc)) != 0) + return (ret); + bo.deleted = 0; + bo.type = B_OVERFLOW; + bo.tlen = dbt->size; + bo.pgno = pgno; + hdr_dbt.data = &bo; + hdr_dbt.size = isize = BOVERFLOW_SIZE; + hdr_dbtp = &hdr_dbt; + size = BOVERFLOW_PSIZE; + data_dbtp = NULL; + } else { + size = BKEYDATA_PSIZE(dbt->size); + isize = BKEYDATA_SIZE(dbt->size); + hdr_dbtp = NULL; + data_dbtp = dbt; + } + + pagep = *pp; + if (size > P_FREESPACE(pagep)) { + if (*indxp == NUM_ENT(*pp) && NEXT_PGNO(*pp) == PGNO_INVALID) + ret = __db_addpage(dbp, pp, indxp, newfunc); + else + ret = __db_dsplit(dbp, pp, indxp, isize, newfunc); + if (ret != 0) + /* XXX: Pages not returned to free list. */ + return (ret); + pagep = *pp; + } + + /* + * Now, pagep references the page on which to insert and indx is the + * the location to insert. + */ + if ((ret = __db_pitem(dbp, + pagep, (u_int32_t)*indxp, isize, hdr_dbtp, data_dbtp)) != 0) + return (ret); + + (void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY); + return (0); +} + +/* + * __db_drem -- + * Remove a duplicate at the given index on the given page. + * + * PUBLIC: int __db_drem __P((DB *, + * PUBLIC: PAGE **, u_int32_t, int (*)(DB *, PAGE *))); + */ +int +__db_drem(dbp, pp, indx, freefunc) + DB *dbp; + PAGE **pp; + u_int32_t indx; + int (*freefunc) __P((DB *, PAGE *)); +{ + PAGE *pagep; + int ret; + + pagep = *pp; + + /* Check if we are freeing a big item. 
*/ + if (GET_BKEYDATA(pagep, indx)->type == B_OVERFLOW) { + if ((ret = __db_doff(dbp, + GET_BOVERFLOW(pagep, indx)->pgno, freefunc)) != 0) + return (ret); + ret = __db_ditem(dbp, pagep, indx, BOVERFLOW_SIZE); + } else + ret = __db_ditem(dbp, pagep, indx, + BKEYDATA_SIZE(GET_BKEYDATA(pagep, indx)->len)); + if (ret != 0) + return (ret); + + if (NUM_ENT(pagep) == 0) { + /* + * If the page is emptied, then the page is freed and the pp + * parameter is set to reference the next, locked page in the + * duplicate chain, if one exists. If there was no such page, + * then it is set to NULL. + * + * !!! + * __db_relink will set the dirty bit for us. + */ + if ((ret = __db_relink(dbp, pagep, pp, 0)) != 0) + return (ret); + if ((ret = freefunc(dbp, pagep)) != 0) + return (ret); + } else + (void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY); + + return (0); +} + +/* + * __db_dend -- + * Find the last page in a set of offpage duplicates. + * + * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **)); + */ +int +__db_dend(dbp, pgno, pagep) + DB *dbp; + db_pgno_t pgno; + PAGE **pagep; +{ + PAGE *h; + int ret; + + /* + * This implements DB_KEYLAST. The last page is returned in pp; pgno + * should be the page number of the first page of the duplicate chain. + */ + for (;;) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID) + break; + (void)memp_fput(dbp->mpf, h, 0); + } + + *pagep = h; + return (0); +} + +/* + * __db_dsplit -- + * Split a page of duplicates, calculating the split point based + * on an element of size "size" being added at "*indxp". + * On entry hp contains a pointer to the page-pointer of the original + * page. On exit, it returns a pointer to the page containing "*indxp" + * and "indxp" has been modified to reflect the index on the new page + * where the element should be added. The function returns with + * the page on which the insert should happen, not yet put. 
+ */ +static int +__db_dsplit(dbp, hp, indxp, size, newfunc) + DB *dbp; + PAGE **hp; + db_indx_t *indxp; + u_int32_t size; + int (*newfunc) __P((DB *, u_int32_t, PAGE **)); +{ + PAGE *h, *np, *tp; + BKEYDATA *bk; + DBT page_dbt; + db_indx_t indx, nindex, oindex, sum; + db_indx_t halfbytes, i, lastsum; + int did_indx, ret, s; + + h = *hp; + indx = *indxp; + + /* Create a temporary page to do compaction onto. */ + if ((tp = (PAGE *)malloc(dbp->pgsize)) == NULL) + return (ENOMEM); +#ifdef DEBUG + memset(tp, 0xff, dbp->pgsize); +#endif + /* Create new page for the split. */ + if ((ret = newfunc(dbp, P_DUPLICATE, &np)) != 0) { + FREE(tp, dbp->pgsize); + return (ret); + } + + P_INIT(np, dbp->pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0, + P_DUPLICATE); + P_INIT(tp, dbp->pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0, + P_DUPLICATE); + + /* Figure out the split point */ + halfbytes = (dbp->pgsize - HOFFSET(h)) / 2; + did_indx = 0; + for (sum = 0, lastsum = 0, i = 0; i < NUM_ENT(h); i++) { + if (i == indx) { + sum += size; + if (lastsum < halfbytes && sum >= halfbytes) { + /* We've crossed the halfway point. */ + if ((db_indx_t)(halfbytes - lastsum) < + (db_indx_t)(sum - halfbytes)) { + *hp = np; + *indxp = 0; + i--; + } else + *indxp = i; + break; + } + *indxp = i; + lastsum = sum; + did_indx = 1; + } + if (GET_BKEYDATA(h, i)->type == B_KEYDATA) + sum += BKEYDATA_SIZE(GET_BKEYDATA(h, i)->len); + else + sum += BOVERFLOW_SIZE; + + if (lastsum < halfbytes && sum >= halfbytes) { + /* We've crossed the halfway point. */ + if ((db_indx_t)(halfbytes - lastsum) < + (db_indx_t)(sum - halfbytes)) + i--; + break; + } + } + + /* + * Check if we have set the return values of the index pointer and + * page pointer. 
+ */ + if (!did_indx) { + *hp = np; + *indxp = indx - i - 1; + } + + if (DB_LOGGING(dbp)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = h; + if ((ret = __db_split_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid, + PGNO(h), &page_dbt, &LSN(h))) != 0) { + FREE(tp, dbp->pgsize); + return (ret); + } + LSN(tp) = LSN(h); + } + + /* + * If it's a btree, adjust the cursors. + * + * i is the index of the last element to stay on the page. + */ + if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) + __bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i + 1, 0); + + for (nindex = 0, oindex = i + 1; oindex < NUM_ENT(h); oindex++) { + bk = GET_BKEYDATA(h, oindex); + if (bk->type == B_KEYDATA) + s = BKEYDATA_SIZE(bk->len); + else + s = BOVERFLOW_SIZE; + + np->inp[nindex++] = HOFFSET(np) -= s; + memcpy((u_int8_t *)np + HOFFSET(np), bk, s); + NUM_ENT(np)++; + } + + /* + * Now do data compaction by copying the remaining stuff onto the + * temporary page and then copying it back to the real page. + */ + for (nindex = 0, oindex = 0; oindex <= i; oindex++) { + bk = GET_BKEYDATA(h, oindex); + if (bk->type == B_KEYDATA) + s = BKEYDATA_SIZE(bk->len); + else + s = BOVERFLOW_SIZE; + + tp->inp[nindex++] = HOFFSET(tp) -= s; + memcpy((u_int8_t *)tp + HOFFSET(tp), bk, s); + NUM_ENT(tp)++; + } + + /* + * This page (the temporary) should be only half full, so we do two + * memcpy's, one for the top of the page and one for the bottom of + * the page. This way we avoid copying the middle which should be + * about half a page. 
+ */ + memcpy(h, tp, LOFFSET(tp)); + memcpy((u_int8_t *)h + HOFFSET(tp), + (u_int8_t *)tp + HOFFSET(tp), dbp->pgsize - HOFFSET(tp)); + FREE(tp, dbp->pgsize); + + if (DB_LOGGING(dbp)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = h; + if ((ret = __db_split_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid, + PGNO(h), &page_dbt, &LSN(h))) != 0) + return (ret); + + page_dbt.size = dbp->pgsize; + page_dbt.data = np; + if ((ret = __db_split_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid, + PGNO(np), &page_dbt, &LSN(np))) != 0) + return (ret); + } + + /* + * Figure out if the location we're interested in is on the new + * page, and if so, reset the callers' pointer. Push the other + * page back to the store. + */ + if (*hp == h) + ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); + else + ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + + return (ret); +} + +/* + * __db_ditem -- + * Remove an item from a page. + * + * PUBLIC: int __db_ditem __P((DB *, PAGE *, int, u_int32_t)); + */ +int +__db_ditem(dbp, pagep, indx, nbytes) + DB *dbp; + PAGE *pagep; + int indx; + u_int32_t nbytes; +{ + DBT ldbt; + db_indx_t cnt, offset; + int ret; + u_int8_t *from; + + if (DB_LOGGING(dbp)) { + ldbt.data = P_ENTRY(pagep, indx); + ldbt.size = nbytes; + if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep), + (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0) + return (ret); + } + + /* + * If there's only a single item on the page, we don't have to + * work hard. + */ + if (NUM_ENT(pagep) == 1) { + NUM_ENT(pagep) = 0; + HOFFSET(pagep) = dbp->pgsize; + return (0); + } + + /* + * Pack the remaining key/data items at the end of the page. Use + * memmove(3), the regions may overlap. + */ + from = (u_int8_t *)pagep + HOFFSET(pagep); + memmove(from + nbytes, from, pagep->inp[indx] - HOFFSET(pagep)); + HOFFSET(pagep) += nbytes; + + /* Adjust the indices' offsets. 
*/ + offset = pagep->inp[indx]; + for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt) + if (pagep->inp[cnt] < offset) + pagep->inp[cnt] += nbytes; + + /* Shift the indices down. */ + --NUM_ENT(pagep); + if (indx != NUM_ENT(pagep)) + memmove(&pagep->inp[indx], &pagep->inp[indx + 1], + sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); + + /* If it's a btree, adjust the cursors. */ + if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) + __bam_ca_di(dbp, PGNO(pagep), indx, -1); + + return (0); +} + +/* + * __db_pitem -- + * Put an item on a page. + * + * PUBLIC: int __db_pitem + * PUBLIC: __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); + */ +int +__db_pitem(dbp, pagep, indx, nbytes, hdr, data) + DB *dbp; + PAGE *pagep; + u_int32_t indx; + u_int32_t nbytes; + DBT *hdr, *data; +{ + BKEYDATA bk; + DBT thdr; + int ret; + u_int8_t *p; + + /* + * Put a single item onto a page. The logic figuring out where to + * insert and whether it fits is handled in the caller. All we do + * here is manage the page shuffling. We cheat a little bit in that + * we don't want to copy the dbt on a normal put twice. If hdr is + * NULL, we create a BKEYDATA structure on the page, otherwise, just + * copy the caller's information onto the page. + * + * This routine is also used to put entries onto the page where the + * entry is pre-built, e.g., during recovery. In this case, the hdr + * will point to the entry, and the data argument will be NULL. + * + * !!! + * There's a tremendous potential for off-by-one errors here, since + * the passed in header sizes must be adjusted for the structure's + * placeholder for the trailing variable-length data field. + */ + if (DB_LOGGING(dbp)) + if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep), + (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0) + return (ret); + + if (hdr == NULL) { + bk.deleted = 0; + bk.type = B_KEYDATA; + bk.len = data == NULL ? 
0 : data->size; + + thdr.data = &bk; + thdr.size = SSZA(BKEYDATA, data); + hdr = &thdr; + } + + /* Adjust the index table, then put the item on the page. */ + if (indx != NUM_ENT(pagep)) + memmove(&pagep->inp[indx + 1], &pagep->inp[indx], + sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); + HOFFSET(pagep) -= nbytes; + pagep->inp[indx] = HOFFSET(pagep); + ++NUM_ENT(pagep); + + p = P_ENTRY(pagep, indx); + memcpy(p, hdr->data, hdr->size); + if (data != NULL) + memcpy(p + hdr->size, data->data, data->size); + + /* If it's a btree, adjust the cursors. */ + if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) + __bam_ca_di(dbp, PGNO(pagep), indx, 1); + + return (0); +} + +/* + * __db_relink -- + * Relink around a deleted page. + * + * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int)); + */ +int +__db_relink(dbp, pagep, new_next, needlock) + DB *dbp; + PAGE *pagep, **new_next; + int needlock; +{ + PAGE *np, *pp; + DB_LOCK npl, ppl; + DB_LSN *nlsnp, *plsnp; + int ret; + + ret = 0; + np = pp = NULL; + npl = ppl = LOCK_INVALID; + nlsnp = plsnp = NULL; + + /* Retrieve and lock the two pages. */ + if (pagep->next_pgno != PGNO_INVALID) { + if (needlock && (ret = __bam_lget(dbp, + 0, pagep->next_pgno, DB_LOCK_WRITE, &npl)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, + &pagep->next_pgno, 0, &np)) != 0) { + (void)__db_pgerr(dbp, pagep->next_pgno); + goto err; + } + nlsnp = &np->lsn; + } + if (pagep->prev_pgno != PGNO_INVALID) { + if (needlock && (ret = __bam_lget(dbp, + 0, pagep->prev_pgno, DB_LOCK_WRITE, &ppl)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, + &pagep->prev_pgno, 0, &pp)) != 0) { + (void)__db_pgerr(dbp, pagep->next_pgno); + goto err; + } + plsnp = &pp->lsn; + } + + /* Log the change. 
*/ + if (DB_LOGGING(dbp)) { + if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbp->txn, + &pagep->lsn, 0, dbp->log_fileid, pagep->pgno, &pagep->lsn, + pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0) + goto err; + if (np != NULL) + np->lsn = pagep->lsn; + if (pp != NULL) + pp->lsn = pagep->lsn; + } + + /* + * Modify and release the two pages. + * + * !!! + * The parameter new_next gets set to the page following the page we + * are removing. If there is no following page, then new_next gets + * set to NULL. + */ + if (np != NULL) { + np->prev_pgno = pagep->prev_pgno; + if (new_next == NULL) + ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); + else { + *new_next = np; + ret = memp_fset(dbp->mpf, np, DB_MPOOL_DIRTY); + } + if (ret != 0) + goto err; + if (needlock) + (void)__bam_lput(dbp, npl); + } else if (new_next != NULL) + *new_next = NULL; + + if (pp != NULL) { + pp->next_pgno = pagep->next_pgno; + if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto err; + if (needlock) + (void)__bam_lput(dbp, ppl); + } + return (0); + +err: if (np != NULL) + (void)memp_fput(dbp->mpf, np, 0); + if (needlock && npl != LOCK_INVALID) + (void)__bam_lput(dbp, npl); + if (pp != NULL) + (void)memp_fput(dbp->mpf, pp, 0); + if (needlock && ppl != LOCK_INVALID) + (void)__bam_lput(dbp, ppl); + return (ret); +} + +/* + * __db_ddup -- + * Delete an offpage chain of duplicates. 
+ * + * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); + */ +int +__db_ddup(dbp, pgno, freefunc) + DB *dbp; + db_pgno_t pgno; + int (*freefunc) __P((DB *, PAGE *)); +{ + PAGE *pagep; + DBT tmp_dbt; + int ret; + + do { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + + if (DB_LOGGING(dbp)) { + tmp_dbt.data = pagep; + tmp_dbt.size = dbp->pgsize; + if ((ret = __db_split_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(pagep), 0, DB_SPLITOLD, dbp->log_fileid, + PGNO(pagep), &tmp_dbt, &LSN(pagep))) != 0) + return (ret); + } + pgno = pagep->next_pgno; + if ((ret = freefunc(dbp, pagep)) != 0) + return (ret); + } while (pgno != PGNO_INVALID); + + return (0); +} + +/* + * __db_addpage -- + * Create a new page and link it onto the next_pgno field of the + * current page. + */ +static int +__db_addpage(dbp, hp, indxp, newfunc) + DB *dbp; + PAGE **hp; + db_indx_t *indxp; + int (*newfunc) __P((DB *, u_int32_t, PAGE **)); +{ + PAGE *newpage; + int ret; + + if ((ret = newfunc(dbp, P_DUPLICATE, &newpage)) != 0) + return (ret); + + if (DB_LOGGING(dbp)) { + if ((ret = __db_addpage_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(*hp), 0, dbp->log_fileid, + PGNO(*hp), &LSN(*hp), PGNO(newpage), &LSN(newpage))) != 0) { + return (ret); + } + LSN(newpage) = LSN(*hp); + } + + PREV_PGNO(newpage) = PGNO(*hp); + NEXT_PGNO(*hp) = PGNO(newpage); + + if ((ret = memp_fput(dbp->mpf, *hp, DB_MPOOL_DIRTY)) != 0) + return (ret); + *hp = newpage; + *indxp = 0; + return (0); +} diff --git a/db2/db/db_overflow.c b/db2/db/db_overflow.c new file mode 100644 index 0000000000..2340e9e358 --- /dev/null +++ b/db2/db/db_overflow.c @@ -0,0 +1,383 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. 
+ */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_overflow.c 10.4 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * Big key/data code. + * + * Big key and data entries are stored on linked lists of pages. The initial + * reference is a structure with the total length of the item and the page + * number where it begins. Each entry in the linked list contains a pointer + * to the next page of data, and so on. + */ + +/* + * __db_goff -- + * Get an offpage item. + * + * PUBLIC: int __db_goff __P((DB *, DBT *, + * PUBLIC: u_int32_t, db_pgno_t, void **, u_int32_t *)); + */ +int +__db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) + DB *dbp; + DBT *dbt; + u_int32_t tlen; + db_pgno_t pgno; + void **bpp; + u_int32_t *bpsz; +{ + PAGE *h; + db_indx_t bytes; + int ret; + u_int32_t curoff, needed, start; + u_int8_t *p, *src; + + /* + * Check if the buffer is big enough; if it is not and we are + * allowed to malloc space, then we'll malloc it. If we are + * not (DB_DBT_USERMEM), then we'll set the dbt and return + * appropriately. + */ + if (F_ISSET(dbt, DB_DBT_PARTIAL)) { + start = dbt->doff; + needed = dbt->dlen; + } else { + start = 0; + needed = tlen; + } + + /* + * Allocate any necessary memory. + * + * XXX: Never allocate 0 bytes; + */ + if (F_ISSET(dbt, DB_DBT_USERMEM)) { + if (needed > dbt->ulen) { + dbt->size = needed; + return (ENOMEM); + } + } else if (F_ISSET(dbt, DB_DBT_MALLOC)) { + dbt->data = dbp->db_malloc == NULL ? + (void *)malloc(needed + 1) : + (void *)dbp->db_malloc(needed + 1); + if (dbt->data == NULL) + return (ENOMEM); + } else if (*bpsz == 0 || *bpsz < needed) { + *bpp = (*bpp == NULL ? 
+ (void *)malloc(needed + 1) : + (void *)realloc(*bpp, needed + 1)); + if (*bpp == NULL) + return (ENOMEM); + *bpsz = needed + 1; + dbt->data = *bpp; + } else + dbt->data = *bpp; + + /* + * Step through the linked list of pages, copying the data on each + * one into the buffer. Never copy more than the total data length. + */ + dbt->size = needed; + for (curoff = 0, p = dbt->data; pgno != P_INVALID && needed > 0;) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + /* Check if we need any bytes from this page. */ + if (curoff + OV_LEN(h) >= start) { + src = (u_int8_t *)h + P_OVERHEAD; + bytes = OV_LEN(h); + if (start > curoff) { + src += start - curoff; + bytes -= start - curoff; + } + if (bytes > needed) + bytes = needed; + memcpy(p, src, bytes); + p += bytes; + needed -= bytes; + } + curoff += OV_LEN(h); + pgno = h->next_pgno; + memp_fput(dbp->mpf, h, 0); + } + return (0); +} + +/* + * __db_poff -- + * Put an offpage item. + * + * PUBLIC: int __db_poff __P((DB *, const DBT *, db_pgno_t *, + * PUBLIC: int (*)(DB *, u_int32_t, PAGE **))); + */ +int +__db_poff(dbp, dbt, pgnop, newfunc) + DB *dbp; + const DBT *dbt; + db_pgno_t *pgnop; + int (*newfunc) __P((DB *, u_int32_t, PAGE **)); +{ + PAGE *pagep, *lastp; + DB_LSN new_lsn, null_lsn; + DBT tmp_dbt; + db_indx_t pagespace; + u_int32_t sz; + u_int8_t *p; + int ret; + + /* + * Allocate pages and copy the key/data item into them. Calculate the + * number of bytes we get for pages we fill completely with a single + * item. + */ + pagespace = P_MAXSPACE(dbp->pgsize); + + lastp = NULL; + for (p = dbt->data, + sz = dbt->size; sz > 0; p += pagespace, sz -= pagespace) { + /* + * Reduce pagespace so we terminate the loop correctly and + * don't copy too much data. + */ + if (sz < pagespace) + pagespace = sz; + + /* + * Allocate and initialize a new page and copy all or part of + * the item onto the page. If sz is less than pagespace, we + * have a partial record. 
+ */ + if ((ret = newfunc(dbp, P_OVERFLOW, &pagep)) != 0) + return (ret); + if (DB_LOGGING(dbp)) { + tmp_dbt.data = p; + tmp_dbt.size = pagespace; + ZERO_LSN(null_lsn); + if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn, + &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid, + PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID, + PGNO_INVALID, &tmp_dbt, &LSN(pagep), + lastp == NULL ? &null_lsn : &LSN(lastp), + &null_lsn)) != 0) + return (ret); + + /* Move lsn onto page. */ + if (lastp) + LSN(lastp) = new_lsn; + LSN(pagep) = new_lsn; + } + + P_INIT(pagep, dbp->pgsize, + PGNO(pagep), PGNO_INVALID, PGNO_INVALID, 0, P_OVERFLOW); + OV_LEN(pagep) = pagespace; + OV_REF(pagep) = 1; + memcpy((u_int8_t *)pagep + P_OVERHEAD, p, pagespace); + + /* + * If this is the first entry, update the user's info. + * Otherwise, update the entry on the last page filled + * in and release that page. + */ + if (lastp == NULL) + *pgnop = PGNO(pagep); + else { + lastp->next_pgno = PGNO(pagep); + pagep->prev_pgno = PGNO(lastp); + (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY); + } + lastp = pagep; + } + (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY); + return (0); +} + +/* + * __db_ioff -- + * Increment the reference count on an overflow page. + * + * PUBLIC: int __db_ioff __P((DB *, db_pgno_t)); + */ +int +__db_ioff(dbp, pgno) + DB *dbp; + db_pgno_t pgno; +{ + PAGE *h; + int ret; + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + + ++OV_REF(h); + if (DB_LOGGING(dbp) && (ret = __db_ovref_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(h), 0, dbp->log_fileid, h->pgno, &LSN(h))) != 0) + return (ret); + + (void)memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + return (0); +} + +/* + * __db_doff -- + * Delete an offpage chain of overflow pages. 
+ * + * PUBLIC: int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); + */ +int +__db_doff(dbp, pgno, freefunc) + DB *dbp; + db_pgno_t pgno; + int (*freefunc) __P((DB *, PAGE *)); +{ + PAGE *pagep; + DB_LSN null_lsn; + DBT tmp_dbt; + int ret; + + do { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + + /* + * If it's an overflow page and it's referenced by more than + * one key/data item, decrement the reference count and return. + */ + if (TYPE(pagep) == P_OVERFLOW && OV_REF(pagep) > 1) { + --OV_REF(pagep); + (void)memp_fput(dbp->mpf, pagep, DB_MPOOL_DIRTY); + return (0); + } + + if (DB_LOGGING(dbp)) { + tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD; + tmp_dbt.size = OV_LEN(pagep); + ZERO_LSN(null_lsn); + if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid, + PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep), + &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0) + return (ret); + } + pgno = pagep->next_pgno; + if ((ret = freefunc(dbp, pagep)) != 0) + return (ret); + } while (pgno != PGNO_INVALID); + + return (0); +} + +/* + * __db_moff -- + * Match on overflow pages. + * + * Given a starting page number and a key, return <0, 0, >0 to indicate if the + * key on the page is less than, equal to or greater than the key specified. + * + * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t)); + */ +int +__db_moff(dbp, dbt, pgno) + DB *dbp; + const DBT *dbt; + db_pgno_t pgno; +{ + PAGE *pagep; + u_int32_t cmp_bytes, key_left; + int ret; + u_int8_t *p1, *p2; + + /* While there are both keys to compare. */ + for (ret = 0, p1 = dbt->data, + key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { + if (memp_fget(dbp->mpf, &pgno, 0, &pagep) != 0) { + (void)__db_pgerr(dbp, pgno); + return (0); /* No system error return. */ + } + + cmp_bytes = OV_LEN(pagep) < key_left ? 
OV_LEN(pagep) : key_left; + key_left -= cmp_bytes; + for (p2 = + (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2) + if (*p1 != *p2) { + ret = (long)*p1 - (long)*p2; + break; + } + pgno = NEXT_PGNO(pagep); + (void)memp_fput(dbp->mpf, pagep, 0); + if (ret != 0) + return (ret); + } + if (key_left > 0) /* DBT is longer than page key. */ + return (-1); + if (pgno != PGNO_INVALID) /* DBT is shorter than page key. */ + return (1); + return (0); +} diff --git a/db2/db/db_pr.c b/db2/db/db_pr.c new file mode 100644 index 0000000000..c103b10e4f --- /dev/null +++ b/db2/db/db_pr.c @@ -0,0 +1,785 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_pr.c 10.14 (Sleepycat) 8/17/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "hash.h" +#include "db_am.h" + +static void __db_proff __P((void *)); +static void __db_psize __P((DB_MPOOLFILE *)); + +/* + * __db_loadme -- + * Force loading of this file. + * + * PUBLIC: void __db_loadme __P((void)); + */ +void +__db_loadme() +{ + getpid(); +} + +static FILE *set_fp; + +/* + * 64K is the maximum page size, so by default we check for offsets + * larger than that, and, where possible, we refine the test. + */ +#define PSIZE_BOUNDARY (64 * 1024 + 1) +static size_t set_psize = PSIZE_BOUNDARY; + +/* + * __db_prinit -- + * Initialize tree printing routines. + * + * PUBLIC: FILE *__db_prinit __P((FILE *)); + */ +FILE * +__db_prinit(fp) + FILE *fp; +{ + if (set_fp == NULL) + set_fp = fp == NULL ? stdout : fp; + return (set_fp); +} + +/* + * __db_dump -- + * Dump the tree to a file. 
+ * + * PUBLIC: int __db_dump __P((DB *, char *, int)); + */ +int +__db_dump(dbp, name, all) + DB *dbp; + char *name; + int all; +{ + FILE *fp, *save_fp; + + save_fp = NULL; /* XXX: Shut the compiler up. */ + + if (set_psize == PSIZE_BOUNDARY) + __db_psize(dbp->mpf); + + if (name != NULL) { + if ((fp = fopen(name, "w")) == NULL) + return (errno); + save_fp = set_fp; + set_fp = fp; + } else + fp = __db_prinit(NULL); + + (void)__db_prdb(dbp); + if (dbp->type == DB_HASH) + (void)__db_prhash(dbp); + else + (void)__db_prbtree(dbp); + fprintf(fp, "%s\n", DB_LINE); + __db_prtree(dbp->mpf, all); + + if (name != NULL) { + (void)fclose(fp); + set_fp = save_fp; + } + return (0); +} + +/* + * __db_prdb -- + * Print out the DB structure information. + * + * PUBLIC: int __db_prdb __P((DB *)); + */ +int +__db_prdb(dbp) + DB *dbp; +{ + static const FN fn[] = { + { DB_AM_DUP, "duplicates" }, + { DB_AM_INMEM, "in-memory" }, + { DB_AM_LOCKING, "locking" }, + { DB_AM_LOGGING, "logging" }, + { DB_AM_MLOCAL, "local mpool" }, + { DB_AM_PGDEF, "default page size" }, + { DB_AM_RDONLY, "read-only" }, + { DB_AM_RECOVER, "recover" }, + { DB_AM_SWAP, "needswap" }, + { DB_AM_THREAD, "thread" }, + { DB_BT_RECNUM, "btree:records" }, + { DB_HS_DIRTYMETA, "hash:dirty-meta" }, + { DB_RE_DELIMITER, "recno:delimiter" }, + { DB_RE_FIXEDLEN, "recno:fixed-length" }, + { DB_RE_PAD, "recno:pad" }, + { DB_RE_RENUMBER, "recno:renumber" }, + { DB_RE_SNAPSHOT, "recno:snapshot" }, + { 0 }, + }; + FILE *fp; + const char *t; + + fp = __db_prinit(NULL); + + switch (dbp->type) { + case DB_BTREE: + t = "btree"; + break; + case DB_HASH: + t = "hash"; + break; + case DB_RECNO: + t = "recno"; + break; + default: + t = "UNKNOWN"; + break; + } + + fprintf(fp, "%s ", t); + __db_prflags(dbp->flags, fn); + fprintf(fp, "\n"); + + return (0); +} + +/* + * __db_prbtree -- + * Print out the btree internal information. 
+ * + * PUBLIC: int __db_prbtree __P((DB *)); + */ +int +__db_prbtree(dbp) + DB *dbp; +{ + static const FN mfn[] = { + { BTM_DUP, "duplicates" }, + { BTM_RECNO, "recno" }, + { 0 }, + }; + BTMETA *mp; + BTREE *t; + DB_LOCK lock; + EPG *sp; + FILE *fp; + RECNO *rp; + db_pgno_t i; + int ret; + + t = dbp->internal; + fp = __db_prinit(NULL); + + (void)fprintf(fp, "%s\nOn-page metadata:\n", DB_LINE); + i = PGNO_METADATA; + if ((ret = __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_READ, &lock)) != 0) + return (ret); + + if ((ret = __bam_pget(dbp, (PAGE **)&mp, &i, 0)) != 0) + return (ret); + + (void)fprintf(fp, "magic %#lx\n", (u_long)mp->magic); + (void)fprintf(fp, "version %lu\n", (u_long)mp->version); + (void)fprintf(fp, "pagesize %lu\n", (u_long)mp->pagesize); + (void)fprintf(fp, "maxkey: %lu minkey: %lu\n", + (u_long)mp->maxkey, (u_long)mp->minkey); + (void)fprintf(fp, "free %lu\n", (u_long)mp->free); + (void)fprintf(fp, "flags %lu", (u_long)mp->flags); + __db_prflags(mp->flags, mfn); + (void)fprintf(fp, "\n"); + (void)memp_fput(dbp->mpf, mp, 0); + (void)__bam_lput(dbp, lock); + + (void)fprintf(fp, "%s\nDB_INFO:\n", DB_LINE); + (void)fprintf(fp, "bt_maxkey: %lu bt_minkey: %lu\n", + (u_long)t->bt_maxkey, (u_long)t->bt_minkey); + (void)fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n", + (u_long)t->bt_compare, (u_long)t->bt_prefix); + if ((rp = t->bt_recno) != NULL) { + (void)fprintf(fp, + "re_delim: %#lx re_pad: %#lx re_len: %lu re_source: %s\n", + (u_long)rp->re_delim, (u_long)rp->re_pad, + (u_long)rp->re_len, + rp->re_source == NULL ? 
"" : rp->re_source); + (void)fprintf(fp, + "cmap: %#lx smap: %#lx emap: %#lx msize: %lu\n", + (u_long)rp->re_cmap, (u_long)rp->re_smap, + (u_long)rp->re_emap, (u_long)rp->re_msize); + } + (void)fprintf(fp, "stack:"); + for (sp = t->bt_stack; sp < t->bt_sp; ++sp) + (void)fprintf(fp, " %lu", (u_long)sp->page->pgno); + (void)fprintf(fp, "\n"); + (void)fprintf(fp, "ovflsize: %lu\n", (u_long)t->bt_ovflsize); + (void)fflush(fp); + return (0); +} + +/* + * __db_prhash -- + * Print out the hash internal information. + * + * PUBLIC: int __db_prhash __P((DB *)); + */ +int +__db_prhash(dbp) + DB *dbp; +{ + FILE *fp; + HTAB *t; + int i, put_page, ret; + db_pgno_t pgno; + + t = dbp->internal; + + fp = __db_prinit(NULL); + + fprintf(fp, "\thash_accesses %lu\n", (u_long)t->hash_accesses); + fprintf(fp, "\thash_collisions %lu\n", (u_long)t->hash_collisions); + fprintf(fp, "\thash_expansions %lu\n", (u_long)t->hash_expansions); + fprintf(fp, "\thash_overflows %lu\n", (u_long)t->hash_overflows); + fprintf(fp, "\thash_bigpages %lu\n", (u_long)t->hash_bigpages); + fprintf(fp, "\n"); + + if (t->hdr == NULL) { + pgno = PGNO_METADATA; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &t->hdr)) != 0) + return (ret); + put_page = 1; + } else + put_page = 0; + + fprintf(fp, "\tmagic %#lx\n", (u_long)t->hdr->magic); + fprintf(fp, "\tversion %lu\n", (u_long)t->hdr->version); + fprintf(fp, "\tpagesize %lu\n", (u_long)t->hdr->pagesize); + fprintf(fp, "\tovfl_point %lu\n", (u_long)t->hdr->ovfl_point); + fprintf(fp, "\tlast_freed %lu\n", (u_long)t->hdr->last_freed); + fprintf(fp, "\tmax_bucket %lu\n", (u_long)t->hdr->max_bucket); + fprintf(fp, "\thigh_mask %#lx\n", (u_long)t->hdr->high_mask); + fprintf(fp, "\tlow_mask %#lx\n", (u_long)t->hdr->low_mask); + fprintf(fp, "\tffactor %lu\n", (u_long)t->hdr->ffactor); + fprintf(fp, "\tnelem %lu\n", (u_long)t->hdr->nelem); + fprintf(fp, "\th_charkey %#lx\n", (u_long)t->hdr->h_charkey); + + for (i = 0; i < NCACHED; i++) + fprintf(fp, "%lu ", 
(u_long)t->hdr->spares[i]); + fprintf(fp, "\n"); + + (void)fflush(fp); + if (put_page) { + (void)memp_fput(dbp->mpf, (PAGE *)t->hdr, 0); + t->hdr = NULL; + } + return (0); +} + +/* + * __db_prtree -- + * Print out the entire tree. + * + * PUBLIC: int __db_prtree __P((DB_MPOOLFILE *, int)); + */ +int +__db_prtree(mpf, all) + DB_MPOOLFILE *mpf; + int all; +{ + PAGE *h; + db_pgno_t i; + int ret, t_ret; + + if (set_psize == PSIZE_BOUNDARY) + __db_psize(mpf); + + ret = 0; + for (i = PGNO_ROOT;; ++i) { + if ((ret = memp_fget(mpf, &i, 0, &h)) != 0) + break; + if (TYPE(h) != P_INVALID) + if ((t_ret = __db_prpage(h, all)) != 0 && ret == 0) + ret = t_ret; + (void)memp_fput(mpf, h, 0); + } + (void)fflush(__db_prinit(NULL)); + return (ret); +} + +/* + * __db_prnpage + * -- Print out a specific page. + * + * PUBLIC: int __db_prnpage __P((DB_MPOOLFILE *, db_pgno_t)); + */ +int +__db_prnpage(mpf, pgno) + DB_MPOOLFILE *mpf; + db_pgno_t pgno; +{ + PAGE *h; + int ret; + + if (set_psize == PSIZE_BOUNDARY) + __db_psize(mpf); + + if ((ret = memp_fget(mpf, &pgno, 0, &h)) != 0) + return (ret); + + ret = __db_prpage(h, 1); + (void)fflush(__db_prinit(NULL)); + + (void)memp_fput(mpf, h, 0); + return (ret); +} + +/* + * __db_prpage + * -- Print out a page. + * + * PUBLIC: int __db_prpage __P((PAGE *, int)); + */ +int +__db_prpage(h, all) + PAGE *h; + int all; +{ + BINTERNAL *bi; + BKEYDATA *bk; + HKEYDATA *hkd; + HOFFPAGE a_hkd; + FILE *fp; + RINTERNAL *ri; + db_indx_t dlen, len, i; + db_pgno_t pgno; + u_int8_t *p; + int deleted, ret; + const char *s; + + bi = NULL; /* XXX: Shut the compiler up. 
*/ + bk = NULL; + hkd = NULL; + ri = NULL; + + fp = __db_prinit(NULL); + + switch (TYPE(h)) { + case P_DUPLICATE: + s = "duplicate"; + break; + case P_HASH: + s = "hash"; + break; + case P_IBTREE: + s = "btree internal"; + break; + case P_INVALID: + s = "invalid"; + break; + case P_IRECNO: + s = "recno internal"; + break; + case P_LBTREE: + s = "btree leaf"; + break; + case P_LRECNO: + s = "recno leaf"; + break; + case P_OVERFLOW: + s = "overflow"; + break; + default: + fprintf(fp, "ILLEGAL PAGE TYPE: page: %lu type: %lu\n", + (u_long)h->pgno, (u_long)TYPE(h)); + return (1); + } + fprintf(fp, "page %4lu: (%s)\n", (u_long)h->pgno, s); + fprintf(fp, " lsn.file: %lu lsn.offset: %lu", + (u_long)LSN(h).file, (u_long)LSN(h).offset); + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO || + (TYPE(h) == P_LRECNO && h->pgno == PGNO_ROOT)) + fprintf(fp, " total records: %4lu", (u_long)RE_NREC(h)); + fprintf(fp, "\n"); + if (TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO) + fprintf(fp, " prev: %4lu next: %4lu", + (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h)); + if (TYPE(h) == P_IBTREE || TYPE(h) == P_LBTREE) + fprintf(fp, " level: %2lu", (u_long)h->level); + if (TYPE(h) == P_OVERFLOW) { + fprintf(fp, " ref cnt: %4lu ", (u_long)OV_REF(h)); + __db_pr((u_int8_t *)h + P_OVERHEAD, OV_LEN(h)); + return (0); + } + fprintf(fp, " entries: %4lu", (u_long)NUM_ENT(h)); + fprintf(fp, " offset: %4lu\n", (u_long)HOFFSET(h)); + + if (!all || TYPE(h) == P_INVALID) + return (0); + + ret = 0; + for (i = 0; i < NUM_ENT(h); i++) { + if (P_ENTRY(h, i) - (u_int8_t *)h < P_OVERHEAD || + (size_t)(P_ENTRY(h, i) - (u_int8_t *)h) >= set_psize) { + fprintf(fp, + "ILLEGAL PAGE OFFSET: indx: %lu of %lu\n", + (u_long)i, (u_long)h->inp[i]); + ret = EINVAL; + continue; + } + deleted = 0; + switch (TYPE(h)) { + case P_HASH: + hkd = GET_HKEYDATA(h, i); + break; + case P_IBTREE: + bi = GET_BINTERNAL(h, i); + break; + case P_IRECNO: + ri = GET_RINTERNAL(h, i); + break; + case P_LBTREE: + bk = GET_BKEYDATA(h, i); + deleted 
= i % 2 == 0 && + GET_BKEYDATA(h, i + O_INDX)->deleted; + break; + case P_LRECNO: + case P_DUPLICATE: + bk = GET_BKEYDATA(h, i); + deleted = GET_BKEYDATA(h, i)->deleted; + break; + default: + fprintf(fp, + "ILLEGAL PAGE ITEM: %lu\n", (u_long)TYPE(h)); + ret = EINVAL; + continue; + } + fprintf(fp, " %s[%03lu] %4lu ", + deleted ? "D" : " ", (u_long)i, (u_long)h->inp[i]); + switch (TYPE(h)) { + case P_HASH: + switch (hkd->type) { + case H_OFFDUP: + memcpy(&pgno, + (u_int8_t *)hkd + SSZ(HOFFDUP, pgno), + sizeof(db_pgno_t)); + fprintf(fp, + "%4lu [offpage dups]\n", (u_long)pgno); + break; + case H_DUPLICATE: + /* + * If this is the first item on a page, then + * we cannot figure out how long it is, so + * we only print the first one in the duplicate + * set. + */ + if (i != 0) + len = LEN_HKEYDATA(h, 0, i); + else + len = 1; + + fprintf(fp, "Duplicates:\n"); + for (p = hkd->data; p < hkd->data + len;) { + memcpy(&dlen, p, sizeof(db_indx_t)); + p += sizeof(db_indx_t); + fprintf(fp, "\t\t"); + __db_pr(p, dlen); + p += sizeof(db_indx_t) + dlen; + } + break; + case H_KEYDATA: + if (i != 0) + __db_pr(hkd->data, + LEN_HKEYDATA(h, 0, i)); + else + fprintf(fp, "%s\n", hkd->data); + break; + case H_OFFPAGE: + memcpy(&a_hkd, hkd, HOFFPAGE_SIZE); + fprintf(fp, + "overflow: total len: %4lu page: %4lu\n", + (u_long)a_hkd.tlen, (u_long)a_hkd.pgno); + break; + } + break; + case P_IBTREE: + fprintf(fp, "count: %4lu pgno: %4lu ", + (u_long)bi->nrecs, (u_long)bi->pgno); + switch (bi->type) { + case B_KEYDATA: + __db_pr(bi->data, bi->len); + break; + case B_DUPLICATE: + case B_OVERFLOW: + __db_proff(bi->data); + break; + default: + fprintf(fp, "ILLEGAL BINTERNAL TYPE: %lu\n", + (u_long)bi->type); + ret = EINVAL; + break; + } + break; + case P_IRECNO: + fprintf(fp, "entries %4lu pgno %4lu\n", + (u_long)ri->nrecs, (u_long)ri->pgno); + break; + case P_LBTREE: + case P_LRECNO: + case P_DUPLICATE: + switch (bk->type) { + case B_KEYDATA: + __db_pr(bk->data, bk->len); + break; + case 
B_DUPLICATE: + case B_OVERFLOW: + __db_proff(bk); + break; + default: + fprintf(fp, + "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu\n", + (u_long)bk->type); + ret = EINVAL; + break; + } + break; + } + } + (void)fflush(fp); + return (ret); +} + +/* + * __db_isbad + * -- Decide if a page is corrupted. + * + * PUBLIC: int __db_isbad __P((PAGE *, int)); + */ +int +__db_isbad(h, die) + PAGE *h; + int die; +{ + BINTERNAL *bi; + BKEYDATA *bk; + HKEYDATA *hkd; + FILE *fp; + db_indx_t i; + + bi = NULL; /* XXX: Shut the compiler up. */ + bk = NULL; + hkd = NULL; + + fp = __db_prinit(NULL); + + switch (TYPE(h)) { + case P_DUPLICATE: + case P_HASH: + case P_IBTREE: + case P_INVALID: + case P_IRECNO: + case P_LBTREE: + case P_LRECNO: + case P_OVERFLOW: + break; + default: + fprintf(fp, "ILLEGAL PAGE TYPE: page: %lu type: %lu\n", + (u_long)h->pgno, (u_long)TYPE(h)); + goto bad; + } + + for (i = 0; i < NUM_ENT(h); i++) { + if (P_ENTRY(h, i) - (u_int8_t *)h < P_OVERHEAD || + (size_t)(P_ENTRY(h, i) - (u_int8_t *)h) >= set_psize) { + fprintf(fp, + "ILLEGAL PAGE OFFSET: indx: %lu of %lu\n", + (u_long)i, (u_long)h->inp[i]); + goto bad; + } + switch (TYPE(h)) { + case P_HASH: + hkd = GET_HKEYDATA(h, i); + if (hkd->type != H_OFFDUP && + hkd->type != H_DUPLICATE && + hkd->type != H_KEYDATA && + hkd->type != H_OFFPAGE) { + fprintf(fp, "ILLEGAL HASH TYPE: %lu\n", + (u_long)hkd->type); + goto bad; + } + break; + case P_IBTREE: + bi = GET_BINTERNAL(h, i); + if (bi->type != B_KEYDATA && + bi->type != B_DUPLICATE && + bi->type != B_OVERFLOW) { + fprintf(fp, "ILLEGAL BINTERNAL TYPE: %lu\n", + (u_long)bi->type); + goto bad; + } + break; + case P_IRECNO: + case P_LBTREE: + case P_LRECNO: + break; + case P_DUPLICATE: + bk = GET_BKEYDATA(h, i); + if (bk->type != B_KEYDATA && + bk->type != B_DUPLICATE && + bk->type != B_OVERFLOW) { + fprintf(fp, + "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu\n", + (u_long)bk->type); + goto bad; + } + break; + default: + fprintf(fp, + "ILLEGAL PAGE ITEM: %lu\n", 
(u_long)TYPE(h)); + goto bad; + } + } + return (0); + +bad: if (die) { + abort(); + /* NOTREACHED */ + } + return (1); +} + +/* + * __db_pr -- + * Print out a data element. + * + * PUBLIC: void __db_pr __P((u_int8_t *, u_int32_t)); + */ +void +__db_pr(p, len) + u_int8_t *p; + u_int32_t len; +{ + FILE *fp; + int i, lastch; + + fp = __db_prinit(NULL); + + fprintf(fp, "len: %3lu", (u_long)len); + lastch = '.'; + if (len != 0) { + fprintf(fp, " data: "); + for (i = len <= 20 ? len : 20; i > 0; --i, ++p) { + lastch = *p; + if (isprint(*p) || *p == '\n') + fprintf(fp, "%c", *p); + else + fprintf(fp, "%#x", (u_int)*p); + } + if (len > 20) { + fprintf(fp, "..."); + lastch = '.'; + } + } + if (lastch != '\n') + fprintf(fp, "\n"); +} + +/* + * __db_proff -- + * Print out an off-page element. + */ +static void +__db_proff(vp) + void *vp; +{ + FILE *fp; + BOVERFLOW *p; + + fp = __db_prinit(NULL); + + p = vp; + switch (p->type) { + case B_OVERFLOW: + fprintf(fp, "overflow: total len: %4lu page: %4lu\n", + (u_long)p->tlen, (u_long)p->pgno); + break; + case B_DUPLICATE: + fprintf(fp, "duplicate: page: %4lu\n", (u_long)p->pgno); + break; + } +} + +/* + * __db_prflags -- + * Print out flags values. + * + * PUBLIC: void __db_prflags __P((u_int32_t, const FN *)); + */ +void +__db_prflags(flags, fn) + u_int32_t flags; + FN const *fn; +{ + FILE *fp; + const FN *fnp; + int found; + const char *sep; + + fp = __db_prinit(NULL); + + sep = " ("; + for (found = 0, fnp = fn; fnp->mask != 0; ++fnp) + if (fnp->mask & flags) { + fprintf(fp, "%s%s", sep, fnp->name); + sep = ", "; + found = 1; + } + if (found) + fprintf(fp, ")"); +} + +/* + * __db_psize -- + * Get the page size. 
+ */ +static void +__db_psize(mpf) + DB_MPOOLFILE *mpf; +{ + BTMETA *mp; + db_pgno_t pgno; + + set_psize = PSIZE_BOUNDARY - 1; + + pgno = PGNO_METADATA; + if (memp_fget(mpf, &pgno, 0, &mp) != 0) + return; + + switch (mp->magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + set_psize = mp->pagesize; + break; + } + (void)memp_fput(mpf, mp, 0); +} diff --git a/db2/db/db_rec.c b/db2/db/db_rec.c new file mode 100644 index 0000000000..900b0ed579 --- /dev/null +++ b/db2/db/db_rec.c @@ -0,0 +1,623 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_rec.c 10.8 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#endif +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "log.h" +#include "hash.h" +#include "btree.h" + +/* + * PUBLIC: int __db_addrem_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + * + * This log message is generated whenever we add or remove a duplicate + * to/from a duplicate page. On recover, we just do the opposite. + */ +int +__db_addrem_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_addrem_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int change, cmp_n, cmp_p, ret; + + REC_PRINT(__db_addrem_print); + REC_INTRO(__db_addrem_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else + if ((ret = memp_fget(mpf, + &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + change = 0; + if ((cmp_p == 0 && redo && argp->opcode == DB_ADD_DUP) || + (cmp_n == 0 && !redo && argp->opcode == DB_REM_DUP)) { + + /* Need to redo an add, or undo a delete. */ + if ((ret = __db_pitem(file_dbp, pagep, argp->indx, argp->nbytes, + argp->hdr.size == 0 ? NULL : &argp->hdr, + argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0) + goto out; + + change = DB_MPOOL_DIRTY; + + } else if ((cmp_n == 0 && !redo && argp->opcode == DB_ADD_DUP) || + (cmp_p == 0 && redo && argp->opcode == DB_REM_DUP)) { + /* Need to undo an add, or redo a delete. */ + if ((ret = __db_ditem(file_dbp, pagep, argp->indx, + argp->nbytes)) != 0) + goto out; + change = DB_MPOOL_DIRTY; + } + + if (change) + if (redo) + LSN(pagep) = *lsnp; + else + LSN(pagep) = argp->pagelsn; + + if ((ret = memp_fput(mpf, pagep, change)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * PUBLIC: int __db_split_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_split_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_split_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int change, cmp_n, cmp_p, ret; + + REC_PRINT(__db_split_print); + REC_INTRO(__db_split_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else + if ((ret = memp_fget(mpf, + &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + /* + * There are two types of log messages here, one for the old page + * and one for the new pages created. The original image in the + * SPLITOLD record is used for undo. The image in the SPLITNEW + * is used for redo. We should never have a case where there is + * a redo operation and the SPLITOLD record is on disk, but not + * the SPLITNEW record. Therefore, we only redo NEW messages + * and only undo OLD messages. + */ + + change = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + if (cmp_p == 0 && redo) { + if (argp->opcode == DB_SPLITNEW) { + /* Need to redo the split described. */ + memcpy(pagep, + argp->pageimage.data, argp->pageimage.size); + } + LSN(pagep) = *lsnp; + change = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && !redo) { + if (argp->opcode == DB_SPLITOLD) { + /* Put back the old image. */ + memcpy(pagep, + argp->pageimage.data, argp->pageimage.size); + } + LSN(pagep) = argp->pagelsn; + change = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, change)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * PUBLIC: int __db_big_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_big_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_big_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int change, cmp_n, cmp_p, ret; + + REC_PRINT(__db_big_print); + REC_INTRO(__db_big_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + ret = 0; + goto ppage; + } else + if ((ret = memp_fget(mpf, + &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + /* + * There are three pages we need to check. The one on which we are + * adding data, the previous one whose next_pointer may have + * been updated, and the next one whose prev_pointer may have + * been updated. + */ + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + change = 0; + if ((cmp_p == 0 && redo && argp->opcode == DB_ADD_BIG) || + (cmp_n == 0 && !redo && argp->opcode == DB_REM_BIG)) { + /* We are either redo-ing an add, or undoing a delete. */ + P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno, + argp->next_pgno, 0, P_OVERFLOW); + OV_LEN(pagep) = argp->dbt.size; + OV_REF(pagep) = 1; + memcpy((u_int8_t *)pagep + P_OVERHEAD, argp->dbt.data, + argp->dbt.size); + PREV_PGNO(pagep) = argp->prev_pgno; + change = DB_MPOOL_DIRTY; + } else if ((cmp_n == 0 && !redo && argp->opcode == DB_ADD_BIG) || + (cmp_p == 0 && redo && argp->opcode == DB_REM_BIG)) { + /* + * We are either undo-ing an add or redo-ing a delete. + * The page is about to be reclaimed in either case, so + * there really isn't anything to do here. + */ + change = DB_MPOOL_DIRTY; + } + if (change) + LSN(pagep) = redo ? *lsnp : argp->pagelsn; + + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + + /* Now check the previous page. */ +ppage: if (argp->prev_pgno != PGNO_INVALID) { + change = 0; + if ((ret = memp_fget(mpf, &argp->prev_pgno, 0, &pagep)) != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto npage; + } else + if ((ret = memp_fget(mpf, &argp->prev_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->prevlsn); + + if ((cmp_p == 0 && redo && argp->opcode == DB_ADD_BIG) || + (cmp_n == 0 && !redo && argp->opcode == DB_REM_BIG)) { + /* Redo add, undo delete. */ + NEXT_PGNO(pagep) = argp->pgno; + change = DB_MPOOL_DIRTY; + } else if ((cmp_n == 0 && + !redo && argp->opcode == DB_ADD_BIG) || + (cmp_p == 0 && redo && argp->opcode == DB_REM_BIG)) { + /* Redo delete, undo add. */ + NEXT_PGNO(pagep) = argp->next_pgno; + change = DB_MPOOL_DIRTY; + } + if (change) + LSN(pagep) = redo ? *lsnp : argp->prevlsn; + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + } + + /* Now check the next page. Can only be set on a delete. */ +npage: if (argp->next_pgno != PGNO_INVALID) { + change = 0; + if ((ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep)) != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. + */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else + if ((ret = memp_fget(mpf, &argp->next_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nextlsn); + if (cmp_p == 0 && redo) { + PREV_PGNO(pagep) = PGNO_INVALID; + change = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && !redo) { + PREV_PGNO(pagep) = argp->pgno; + change = DB_MPOOL_DIRTY; + } + if (change) + LSN(pagep) = redo ? *lsnp : argp->nextlsn; + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + } + + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __db_ovref_recover -- + * Recovery function for __db_ioff(). 
+ * + * PUBLIC: int __db_ovref_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_ovref_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_ovref_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int modified, ret; + + REC_PRINT(__db_ovref_print); + REC_INTRO(__db_ovref_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + + modified = 0; + if (log_compare(lsnp, &argp->lsn) == 0 && redo) { + /* Need to redo update described. */ + ++OV_REF(pagep); + + pagep->lsn = *lsnp; + modified = 1; + } else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) { + /* Need to undo update described. */ + --OV_REF(pagep); + + pagep->lsn = argp->lsn; + modified = 1; + } + ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0); + + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __db_relink_recover -- + * Recovery function for relink. + * + * PUBLIC: int __db_relink_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_relink_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_relink_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int modified, ret; + + REC_PRINT(__db_relink_print); + REC_INTRO(__db_relink_read); + + /* + * There are three pages we need to check -- the page, and the + * previous and next pages, if they existed. + */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (redo) { + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + goto next; + } + modified = 0; + if (log_compare(lsnp, &argp->lsn) == 0 && redo) { + /* Redo the relink. */ + pagep->lsn = *lsnp; + modified = 1; + } else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) { + /* Undo the relink. 
*/ + pagep->next_pgno = argp->next; + pagep->prev_pgno = argp->prev; + + pagep->lsn = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + +next: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) { + if (redo) { + (void)__db_pgerr(file_dbp, argp->next); + goto out; + } + goto prev; + } + modified = 0; + if (log_compare(lsnp, &argp->lsn_next) == 0 && redo) { + /* Redo the relink. */ + pagep->prev_pgno = argp->prev; + + pagep->lsn = *lsnp; + modified = 1; + } else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) { + /* Undo the relink. */ + pagep->prev_pgno = argp->pgno; + + pagep->lsn = argp->lsn_next; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + +prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) { + if (redo) { + (void)__db_pgerr(file_dbp, argp->prev); + goto out; + } + goto done; + } + modified = 0; + if (log_compare(lsnp, &argp->lsn_prev) == 0 && redo) { + /* Redo the relink. */ + pagep->next_pgno = argp->next; + + pagep->lsn = *lsnp; + modified = 1; + } else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) { + /* Undo the relink. */ + pagep->next_pgno = argp->pgno; + + pagep->lsn = argp->lsn_prev; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) { + (void) __db_panic(file_dbp); + goto out; + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * PUBLIC: int __db_addpage_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_addpage_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_addpage_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int change, cmp_n, cmp_p, ret; + + REC_PRINT(__db_addpage_print); + REC_INTRO(__db_addpage_read); + + /* + * We need to check two pages: the old one and the new one onto + * which we're going to add duplicates. Do the old one first. + */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) + goto out; + + change = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + NEXT_PGNO(pagep) = argp->nextpgno; + + LSN(pagep) = *lsnp; + change = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && !redo) { + NEXT_PGNO(pagep) = PGNO_INVALID; + + LSN(pagep) = argp->lsn; + change = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + + if ((ret = memp_fget(mpf, &argp->nextpgno, 0, &pagep)) != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + ret = 0; + goto out; + } else + if ((ret = memp_fget(mpf, + &argp->nextpgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + change = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nextlsn); + if (cmp_p == 0 && redo) { + PREV_PGNO(pagep) = argp->pgno; + + LSN(pagep) = *lsnp; + change = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && !redo) { + PREV_PGNO(pagep) = PGNO_INVALID; + + LSN(pagep) = argp->nextlsn; + change = DB_MPOOL_DIRTY; + } + ret = memp_fput(mpf, pagep, change); + +out: if (ret == 0) + *lsnp = argp->prev_lsn; + REC_CLOSE; +} + +/* + * __db_debug_recover -- + * Recovery function for debug. + * + * PUBLIC: int __db_debug_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_debug_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_debug_args *argp; + int ret; + + REC_PRINT(__db_debug_print); + REC_NOOP_INTRO(__db_debug_read); + + *lsnp = argp->prev_lsn; + ret = 0; + + REC_NOOP_CLOSE; +} + +/* + * __db_noop_recover -- + * Recovery function for noop. + * + * PUBLIC: int __db_noop_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__db_noop_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __db_noop_args *argp; + int ret; + + REC_PRINT(__db_noop_print); + REC_NOOP_INTRO(__db_noop_read); + + *lsnp = argp->prev_lsn; + ret = 0; + + REC_NOOP_CLOSE; +} diff --git a/db2/db/db_ret.c b/db2/db/db_ret.c new file mode 100644 index 0000000000..ddeb26eb94 --- /dev/null +++ b/db2/db/db_ret.c @@ -0,0 +1,149 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_ret.c 10.5 (Sleepycat) 7/12/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "hash.h" +#include "db_am.h" + +/* + * __db_ret -- + * Build return DBT. + * + * PUBLIC: int __db_ret __P((DB *, + * PUBLIC: PAGE *, u_int32_t, DBT *, void **, u_int32_t *)); + */ +int +__db_ret(dbp, h, indx, dbt, memp, memsize) + DB *dbp; + PAGE *h; + u_int32_t indx; + DBT *dbt; + void **memp; + u_int32_t *memsize; +{ + BKEYDATA *bk; + HOFFPAGE ho; + BOVERFLOW *bo; + u_int32_t len; + void *data, *hk; + + switch (TYPE(h)) { + case P_HASH: + hk = P_ENTRY(h, indx); + if (((HKEYDATA *)hk)->type == H_OFFPAGE) { + memcpy(&ho, hk, sizeof(HOFFPAGE)); + return (__db_goff(dbp, dbt, + ho.tlen, ho.pgno, memp, memsize)); + } + len = LEN_HKEYDATA(h, dbp->pgsize, indx); + data = ((HKEYDATA *)hk)->data; + break; + case P_DUPLICATE: + case P_LBTREE: + case P_LRECNO: + bk = GET_BKEYDATA(h, indx); + if (bk->type == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + return (__db_goff(dbp, dbt, + bo->tlen, bo->pgno, memp, memsize)); + } + len = bk->len; + data = bk->data; + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + + return (__db_retcopy(dbt, data, len, memp, memsize, + F_ISSET(dbt, DB_DBT_INTERNAL) ? NULL : dbp->db_malloc)); +} + +/* + * __db_retcopy -- + * Copy the returned data into the user's DBT, handling special flags. + * + * PUBLIC: int __db_retcopy __P((DBT *, + * PUBLIC: void *, u_int32_t, void **, u_int32_t *, void *(*)(size_t))); + */ +int +__db_retcopy(dbt, data, len, memp, memsize, db_malloc) + DBT *dbt; + void *data; + u_int32_t len; + void **memp; + u_int32_t *memsize; + void *(*db_malloc) __P((size_t)); +{ + /* If returning a partial record, reset the length. 
*/ + if (F_ISSET(dbt, DB_DBT_PARTIAL)) { + data = (u_int8_t *)data + dbt->doff; + if (len > dbt->doff) { + len -= dbt->doff; + if (len > dbt->dlen) + len = dbt->dlen; + } else + len = 0; + } + + /* + * Return the length of the returned record in the DBT size field. + * This satisfies the requirement that if we're using user memory + * and insufficient memory was provided, return the amount necessary + * in the size field. + */ + dbt->size = len; + + /* + * Allocate any necessary memory. + * + * XXX: Never allocate 0 bytes. + */ + if (F_ISSET(dbt, DB_DBT_MALLOC)) { + dbt->data = db_malloc == NULL ? + (void *)malloc(len + 1) : + (void *)db_malloc(len + 1); + if (dbt->data == NULL) + return (ENOMEM); + } else if (F_ISSET(dbt, DB_DBT_USERMEM)) { + if (dbt->ulen < len) + return (ENOMEM); + } else if (memp == NULL || memsize == NULL) { + return (EINVAL); + } else { + if (*memsize == 0 || *memsize < len) { + *memp = *memp == NULL ? + (void *)malloc(len + 1) : + (void *)realloc(*memp, len + 1); + if (*memp == NULL) { + *memsize = 0; + return (ENOMEM); + } + *memsize = len + 1; + } + dbt->data = *memp; + } + + memcpy(dbt->data, data, len); + return (0); +} diff --git a/db2/db/db_thread.c b/db2/db/db_thread.c new file mode 100644 index 0000000000..e956e809d9 --- /dev/null +++ b/db2/db/db_thread.c @@ -0,0 +1,125 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_thread.c 8.11 (Sleepycat) 8/18/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "shqueue.h" +#include "db_am.h" + +static int __db_getlockid __P((DB *, DB *)); + +/* + * __db_gethandle -- + * Called by db access method routines when the DB_THREAD flag is set. 
+ * This routine returns a handle, either an existing handle from the + * chain of handles, or creating one if necessary. + * + * PUBLIC: int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **)); + */ +int +__db_gethandle(dbp, am_func, dbpp) + DB *dbp, **dbpp; + int (*am_func) __P((DB *, DB *)); +{ + DB *ret_dbp; + int ret, t_ret; + + if ((ret = __db_mutex_lock((db_mutex_t *)dbp->mutex, -1, + dbp->dbenv == NULL ? NULL : dbp->dbenv->db_yield)) != 0) + return (ret); + + if ((ret_dbp = LIST_FIRST(&dbp->handleq)) != NULL) + /* Simply take one off the list. */ + LIST_REMOVE(ret_dbp, links); + else { + /* Allocate a new handle. */ + if ((ret_dbp = (DB *)malloc(sizeof(*dbp))) == NULL) { + ret = ENOMEM; + goto err; + } + memcpy(ret_dbp, dbp, sizeof(*dbp)); + ret_dbp->internal = NULL; + TAILQ_INIT(&ret_dbp->curs_queue); + + /* Set the locker, the lock structure and the lock DBT. */ + if ((ret = __db_getlockid(dbp, ret_dbp)) != 0) + goto err; + + /* Finally, call the access method specific dup function. */ + if ((ret = am_func(dbp, ret_dbp)) != 0) + goto err; + } + + *dbpp = ret_dbp; + + if (0) { +err: if (ret_dbp != NULL) + FREE(ret_dbp, sizeof(*ret_dbp)); + } + if ((t_ret = + __db_mutex_unlock((db_mutex_t *)dbp->mutex, -1)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_puthandle -- + * Return a DB handle to the pool for later use. + * + * PUBLIC: int __db_puthandle __P((DB *)); + */ +int +__db_puthandle(dbp) + DB *dbp; +{ + DB *master; + int ret; + + master = dbp->master; + if ((ret = __db_mutex_lock((db_mutex_t *)master->mutex, -1, + dbp->dbenv == NULL ? NULL : dbp->dbenv->db_yield)) != 0) + return (ret); + + LIST_INSERT_HEAD(&master->handleq, dbp, links); + + return (__db_mutex_unlock((db_mutex_t *)master->mutex, -1)); +} + +/* + * __db_getlockid -- + * Create a new locker ID and copy the file lock information from + * the old DB into the new one. 
+ */ +static int +__db_getlockid(dbp, new_dbp) + DB *dbp, *new_dbp; +{ + int ret; + + if (F_ISSET(dbp, DB_AM_LOCKING)) { + if ((ret = lock_id(dbp->dbenv->lk_info, &new_dbp->locker)) != 0) + return (ret); + memcpy(new_dbp->lock.fileid, dbp->lock.fileid, DB_FILE_ID_LEN); + new_dbp->lock_dbt.size = sizeof(new_dbp->lock); + new_dbp->lock_dbt.data = &new_dbp->lock; + } + return (0); +} diff --git a/db2/db185/db185.c b/db2/db185/db185.c new file mode 100644 index 0000000000..933f55c813 --- /dev/null +++ b/db2/db185/db185.c @@ -0,0 +1,472 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db185.c 8.13 (Sleepycat) 8/24/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db185_int.h" +#include "common_ext.h" + +static int db185_close __P((DB185 *)); +static int db185_del __P((const DB185 *, const DBT185 *, u_int)); +static int db185_fd __P((const DB185 *)); +static int db185_get __P((const DB185 *, const DBT185 *, DBT185 *, u_int)); +static int db185_put __P((const DB185 *, DBT185 *, const DBT185 *, u_int)); +static int db185_seq __P((const DB185 *, DBT185 *, DBT185 *, u_int)); +static int db185_sync __P((const DB185 *, u_int)); + +DB185 * +__dbopen(file, oflags, mode, type, openinfo) + const char *file; + int oflags, mode; + DBTYPE type; + const void *openinfo; +{ + const BTREEINFO *bi; + const HASHINFO *hi; + const RECNOINFO *ri; + DB *dbp; + DB185 *db185p; + DB_INFO dbinfo, *dbinfop; + int s_errno; + + if ((db185p = (DB185 *)calloc(1, sizeof(DB185))) == NULL) + return (NULL); + dbinfop = NULL; + memset(&dbinfo, 0, 
sizeof(dbinfo)); + + /* + * !!! + * The DBTYPE enum wasn't initialized in DB 185, so it's off-by-one + * from DB 2.0. + */ + switch (type) { + case 0: /* DB_BTREE */ + type = DB_BTREE; + if ((bi = openinfo) != NULL) { + dbinfop = &dbinfo; + if (bi->flags & ~R_DUP) + goto einval; + if (bi->flags & R_DUP) + dbinfop->flags |= DB_DUP; + dbinfop->db_cachesize = bi->cachesize; + dbinfop->bt_maxkey = bi->maxkeypage; + dbinfop->bt_minkey = bi->minkeypage; + dbinfop->db_pagesize = bi->psize; + /* + * !!! + * Comparisons and prefix calls work because the DBT + * structures in 1.85 and 2.0 have the same initial + * fields. + */ + dbinfop->bt_compare = bi->compare; + dbinfop->bt_prefix = bi->prefix; + dbinfop->db_lorder = bi->lorder; + } + break; + case 1: /* DB_HASH */ + type = DB_HASH; + if ((hi = openinfo) != NULL) { + dbinfop = &dbinfo; + dbinfop->db_pagesize = hi->bsize; + dbinfop->h_ffactor = hi->ffactor; + dbinfop->h_nelem = hi->nelem; + dbinfop->db_cachesize = hi->cachesize; + dbinfop->h_hash = hi->hash; + dbinfop->db_lorder = hi->lorder; + } + + break; + case 2: /* DB_RECNO */ + type = DB_RECNO; + dbinfop = &dbinfo; + + /* DB 1.85 did renumbering by default. */ + dbinfop->flags |= DB_RENUMBER; + + /* + * !!! + * The file name given to DB 1.85 recno is the name of the DB + * 2.0 backing file. If the file doesn't exist, create it if + * the user has the O_CREAT flag set, DB 1.85 did it for you, + * and DB 2.0 doesn't. + * + * !!! + * Note, the file name in DB 1.85 was a const -- we don't do + * that in DB 2.0, so do that cast. + */ + if (file != NULL) { + if (oflags & O_CREAT && __db_exists(file, NULL) != 0) + (void)close(open(file, oflags, mode)); + dbinfop->re_source = (char *)file; + file = NULL; + } + + if ((ri = openinfo) != NULL) { + /* + * !!! + * We can't support the bfname field. 
+ */ +#define BFMSG "DB: DB 1.85's recno bfname field is not supported.\n" + if (ri->bfname != NULL) { + (void)write(2, BFMSG, sizeof(BFMSG) - 1); + goto einval; + } + + if (ri->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT)) + goto einval; + if (ri->flags & R_FIXEDLEN) { + dbinfop->flags |= DB_FIXEDLEN; + if (ri->bval != 0) { + dbinfop->flags |= DB_PAD; + dbinfop->re_pad = ri->bval; + } + } else + if (ri->bval != 0) { + dbinfop->flags |= DB_DELIMITER; + dbinfop->re_delim = ri->bval; + } + + /* + * !!! + * We ignore the R_NOKEY flag, but that's okay, it was + * only an optimization that was never implemented. + */ + + if (ri->flags & R_SNAPSHOT) + dbinfop->flags |= DB_SNAPSHOT; + + dbinfop->db_cachesize = ri->cachesize; + dbinfop->db_pagesize = ri->psize; + dbinfop->db_lorder = ri->lorder; + dbinfop->re_len = ri->reclen; + } + break; + default: + goto einval; + } + + db185p->close = db185_close; + db185p->del = db185_del; + db185p->fd = db185_fd; + db185p->get = db185_get; + db185p->put = db185_put; + db185p->seq = db185_seq; + db185p->sync = db185_sync; + + /* + * !!! + * Store the returned pointer to the real DB 2.0 structure in the + * internal pointer. Ugly, but we're not going for pretty, here. + */ + if ((errno = db_open(file, + type, __db_oflags(oflags), mode, NULL, dbinfop, &dbp)) != 0) { + free(db185p); + return (NULL); + } + + /* Create the cursor used for sequential ops. */ + if ((errno = dbp->cursor(dbp, NULL, &((DB185 *)db185p)->dbc)) != 0) { + s_errno = errno; + (void)dbp->close(dbp, 0); + free(db185p); + errno = s_errno; + return (NULL); + } + + db185p->internal = dbp; + return (db185p); + +einval: free(db185p); + errno = EINVAL; + return (NULL); +} +weak_alias (__dbopen, dbopen) + +static int +db185_close(db185p) + DB185 *db185p; +{ + DB *dbp; + + dbp = (DB *)db185p->internal; + + errno = dbp->close(dbp, 0); + + free(db185p); + + return (errno == 0 ? 
0 : -1); +} + +static int +db185_del(db185p, key185, flags) + const DB185 *db185p; + const DBT185 *key185; + u_int flags; +{ + DB *dbp; + DBT key; + + dbp = (DB *)db185p->internal; + + memset(&key, 0, sizeof(key)); + key.data = key185->data; + key.size = key185->size; + + if (flags & ~R_CURSOR) + goto einval; + if (flags & R_CURSOR) + errno = db185p->dbc->c_del(db185p->dbc, 0); + else + errno = dbp->del(dbp, NULL, &key, 0); + + switch (errno) { + case 0: + return (0); + case DB_NOTFOUND: + return (1); + } + return (-1); + +einval: errno = EINVAL; + return (-1); +} + +static int +db185_fd(db185p) + const DB185 *db185p; +{ + DB *dbp; + int fd; + + dbp = (DB *)db185p->internal; + + return ((errno = dbp->fd(dbp, &fd)) == 0 ? fd : -1); +} + +static int +db185_get(db185p, key185, data185, flags) + const DB185 *db185p; + const DBT185 *key185; + DBT185 *data185; + u_int flags; +{ + DB *dbp; + DBT key, data; + + dbp = (DB *)db185p->internal; + + memset(&key, 0, sizeof(key)); + key.data = key185->data; + key.size = key185->size; + memset(&data, 0, sizeof(data)); + data.data = data185->data; + data.size = data185->size; + + if (flags) + goto einval; + + switch (errno = dbp->get(dbp, NULL, &key, &data, 0)) { + case 0: + data185->data = data.data; + data185->size = data.size; + return (0); + case DB_NOTFOUND: + return (1); + } + return (-1); + +einval: errno = EINVAL; + return (-1); +} + +static int +db185_put(db185p, key185, data185, flags) + const DB185 *db185p; + DBT185 *key185; + const DBT185 *data185; + u_int flags; +{ + DB *dbp; + DBC *dbcp_put; + DBT key, data; + int s_errno; + + dbp = (DB *)db185p->internal; + + memset(&key, 0, sizeof(key)); + key.data = key185->data; + key.size = key185->size; + memset(&data, 0, sizeof(data)); + data.data = data185->data; + data.size = data185->size; + + switch (flags) { + case 0: + errno = dbp->put(dbp, NULL, &key, &data, 0); + break; + case R_CURSOR: + errno = + db185p->dbc->c_put(db185p->dbc, &key, &data, DB_CURRENT); + break; + 
case R_IAFTER: + case R_IBEFORE: + if (dbp->type != DB_RECNO) + goto einval; + + if ((errno = dbp->cursor(dbp, NULL, &dbcp_put)) != 0) + return (-1); + if ((errno = + dbcp_put->c_get(dbcp_put, &key, &data, DB_SET)) != 0) { + s_errno = errno; + (void)dbcp_put->c_close(dbcp_put); + errno = s_errno; + return (-1); + } + memset(&data, 0, sizeof(data)); + data.data = data185->data; + data.size = data185->size; + errno = dbcp_put->c_put(dbcp_put, + &key, &data, flags == R_IAFTER ? DB_AFTER : DB_BEFORE); + s_errno = errno; + (void)dbcp_put->c_close(dbcp_put); + errno = s_errno; + break; + case R_NOOVERWRITE: + errno = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE); + break; + case R_SETCURSOR: + if (dbp->type != DB_BTREE && dbp->type != DB_RECNO) + goto einval; + + if ((errno = dbp->put(dbp, NULL, &key, &data, 0)) != 0) + break; + errno = + db185p->dbc->c_get(db185p->dbc, &key, &data, DB_SET_RANGE); + break; + default: + goto einval; + } + + switch (errno) { + case 0: + key185->data = key.data; + key185->size = key.size; + return (0); + case DB_KEYEXIST: + return (1); + } + return (-1); + +einval: errno = EINVAL; + return (-1); +} + +static int +db185_seq(db185p, key185, data185, flags) + const DB185 *db185p; + DBT185 *key185, *data185; + u_int flags; +{ + DB *dbp; + DBT key, data; + + dbp = (DB *)db185p->internal; + + memset(&key, 0, sizeof(key)); + key.data = key185->data; + key.size = key185->size; + memset(&data, 0, sizeof(data)); + data.data = data185->data; + data.size = data185->size; + + switch (flags) { + case R_CURSOR: + flags = DB_SET_RANGE; + break; + case R_FIRST: + flags = DB_FIRST; + break; + case R_LAST: + if (dbp->type != DB_BTREE && dbp->type != DB_RECNO) + goto einval; + flags = DB_LAST; + break; + case R_NEXT: + flags = DB_NEXT; + break; + case R_PREV: + if (dbp->type != DB_BTREE && dbp->type != DB_RECNO) + goto einval; + flags = DB_PREV; + break; + default: + goto einval; + } + switch (errno = db185p->dbc->c_get(db185p->dbc, &key, &data, flags)) { + 
case 0: + key185->data = key.data; + key185->size = key.size; + data185->data = data.data; + data185->size = data.size; + return (0); + case DB_NOTFOUND: + return (1); + } + return (-1); + +einval: errno = EINVAL; + return (-1); +} + +static int +db185_sync(db185p, flags) + const DB185 *db185p; + u_int flags; +{ + DB *dbp; + + dbp = (DB *)db185p->internal; + + switch (flags) { + case 0: + break; + case R_RECNOSYNC: + /* + * !!! + * We can't support the R_RECNOSYNC flag. + */ +#define RSMSG "DB: DB 1.85's R_RECNOSYNC sync flag is not supported.\n" + (void)write(2, RSMSG, sizeof(RSMSG) - 1); + goto einval; + default: + goto einval; + } + + return ((errno = dbp->sync(dbp, 0)) == 0 ? 0 : -1); + +einval: errno = EINVAL; + return (-1); +} diff --git a/db2/db185/db185_int.h b/db2/db185/db185_int.h new file mode 100644 index 0000000000..656dfddf78 --- /dev/null +++ b/db2/db185/db185_int.h @@ -0,0 +1,137 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)db185_int.h 8.4 (Sleepycat) 7/27/97 + */ + +#ifndef _DB185_H_ +#define _DB185_H_ + +/* Routine flags. */ +#define R_CURSOR 1 /* del, put, seq */ +#define __R_UNUSED 2 /* UNUSED */ +#define R_FIRST 3 /* seq */ +#define R_IAFTER 4 /* put (RECNO) */ +#define R_IBEFORE 5 /* put (RECNO) */ +#define R_LAST 6 /* seq (BTREE, RECNO) */ +#define R_NEXT 7 /* seq */ +#define R_NOOVERWRITE 8 /* put */ +#define R_PREV 9 /* seq (BTREE, RECNO) */ +#define R_SETCURSOR 10 /* put (RECNO) */ +#define R_RECNOSYNC 11 /* sync (RECNO) */ + +typedef struct { + void *data; /* data */ + size_t size; /* data length */ +} DBT185; + +/* Access method description structure. */ +typedef struct __db185 { + DBTYPE type; /* Underlying db type. 
*/ + int (*close) __P((struct __db185 *)); + int (*del) __P((const struct __db185 *, const DBT185 *, u_int)); + int (*get) + __P((const struct __db185 *, const DBT185 *, DBT185 *, u_int)); + int (*put) + __P((const struct __db185 *, DBT185 *, const DBT185 *, u_int)); + int (*seq) + __P((const struct __db185 *, DBT185 *, DBT185 *, u_int)); + int (*sync) __P((const struct __db185 *, u_int)); + void *internal; /* Access method private. */ + int (*fd) __P((const struct __db185 *)); + + /* + * !!! + * Added to the end of the DB 1.85 DB structure, it's needed to + * hold the DB 2.0 cursor used for DB 1.85 sequential operations. + */ + DBC *dbc; /* DB 1.85 sequential cursor. */ +} DB185; + +/* Structure used to pass parameters to the btree routines. */ +typedef struct { +#define R_DUP 0x01 /* duplicate keys */ + u_long flags; + u_int cachesize; /* bytes to cache */ + int maxkeypage; /* maximum keys per page */ + int minkeypage; /* minimum keys per page */ + u_int psize; /* page size */ + int (*compare) /* comparison function */ + __P((const DBT *, const DBT *)); + size_t (*prefix) /* prefix function */ + __P((const DBT *, const DBT *)); + int lorder; /* byte order */ +} BTREEINFO; + +/* Structure used to pass parameters to the hashing routines. */ +typedef struct { + u_int bsize; /* bucket size */ + u_int ffactor; /* fill factor */ + u_int nelem; /* number of elements */ + u_int cachesize; /* bytes to cache */ + u_int32_t /* hash function */ + (*hash) __P((const void *, size_t)); + int lorder; /* byte order */ +} HASHINFO; + +/* Structure used to pass parameters to the record routines. 
*/ +typedef struct { +#define R_FIXEDLEN 0x01 /* fixed-length records */ +#define R_NOKEY 0x02 /* key not required */ +#define R_SNAPSHOT 0x04 /* snapshot the input */ + u_long flags; + u_int cachesize; /* bytes to cache */ + u_int psize; /* page size */ + int lorder; /* byte order */ + size_t reclen; /* record length (fixed-length records) */ + u_char bval; /* delimiting byte (variable-length records */ + char *bfname; /* btree file name */ +} RECNOINFO; + +#if defined(__cplusplus) +extern "C" { +#endif +DB185 *dbopen __P((const char *, int, int, DBTYPE, const void *)); +#if defined(__cplusplus) +}; +#endif +#endif /* !_DB185_H_ */ diff --git a/db2/db_185.h b/db2/db_185.h new file mode 100644 index 0000000000..650d365a60 --- /dev/null +++ b/db2/db_185.h @@ -0,0 +1,171 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)db_185.h.src 8.3 (Sleepycat) 7/27/97 + */ + +#ifndef _DB_185_H_ +#define _DB_185_H_ + +#include <sys/types.h> + +#include <limits.h> + +/* + * XXX + * Handle function prototypes and the keyword "const". This steps on name + * space that DB doesn't control, but all of the other solutions are worse. + */ +#undef __P +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* ANSI C prototypes */ +#else +#define const +#define __P(protos) () /* K&R C preprocessor */ +#endif + +#define RET_ERROR -1 /* Return values. */ +#define RET_SUCCESS 0 +#define RET_SPECIAL 1 + +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ + + + + + +#endif + +#define MAX_PAGE_NUMBER 0xffffffff /* >= # of pages in a file */ +typedef u_int32_t pgno_t; +#define MAX_PAGE_OFFSET 65535 /* >= # of bytes in a page */ +typedef u_int16_t indx_t; +#define MAX_REC_NUMBER 0xffffffff /* >= # of records in a tree */ +typedef u_int32_t recno_t; + +/* Key/data structure -- a Data-Base Thang. */ +typedef struct { + void *data; /* data */ + size_t size; /* data length */ +} DBT; + +/* Routine flags. 
*/ +#define R_CURSOR 1 /* del, put, seq */ +#define __R_UNUSED 2 /* UNUSED */ +#define R_FIRST 3 /* seq */ +#define R_IAFTER 4 /* put (RECNO) */ +#define R_IBEFORE 5 /* put (RECNO) */ +#define R_LAST 6 /* seq (BTREE, RECNO) */ +#define R_NEXT 7 /* seq */ +#define R_NOOVERWRITE 8 /* put */ +#define R_PREV 9 /* seq (BTREE, RECNO) */ +#define R_SETCURSOR 10 /* put (RECNO) */ +#define R_RECNOSYNC 11 /* sync (RECNO) */ + +typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE; + +/* Access method description structure. */ +typedef struct __db { + DBTYPE type; /* Underlying db type. */ + int (*close) __P((struct __db *)); + int (*del) __P((const struct __db *, const DBT *, u_int)); + int (*get) __P((const struct __db *, const DBT *, DBT *, u_int)); + int (*put) __P((const struct __db *, DBT *, const DBT *, u_int)); + int (*seq) __P((const struct __db *, DBT *, DBT *, u_int)); + int (*sync) __P((const struct __db *, u_int)); + void *internal; /* Access method private. */ + int (*fd) __P((const struct __db *)); +} DB; + +#define BTREEMAGIC 0x053162 +#define BTREEVERSION 3 + +/* Structure used to pass parameters to the btree routines. */ +typedef struct { +#define R_DUP 0x01 /* duplicate keys */ + u_long flags; + u_int cachesize; /* bytes to cache */ + int maxkeypage; /* maximum keys per page */ + int minkeypage; /* minimum keys per page */ + u_int psize; /* page size */ + int (*compare) /* comparison function */ + __P((const DBT *, const DBT *)); + size_t (*prefix) /* prefix function */ + __P((const DBT *, const DBT *)); + int lorder; /* byte order */ +} BTREEINFO; + +#define HASHMAGIC 0x061561 +#define HASHVERSION 2 + +/* Structure used to pass parameters to the hashing routines. 
*/ +typedef struct { + u_int bsize; /* bucket size */ + u_int ffactor; /* fill factor */ + u_int nelem; /* number of elements */ + u_int cachesize; /* bytes to cache */ + u_int32_t /* hash function */ + (*hash) __P((const void *, size_t)); + int lorder; /* byte order */ +} HASHINFO; + +/* Structure used to pass parameters to the record routines. */ +typedef struct { +#define R_FIXEDLEN 0x01 /* fixed-length records */ +#define R_NOKEY 0x02 /* key not required */ +#define R_SNAPSHOT 0x04 /* snapshot the input */ + u_long flags; + u_int cachesize; /* bytes to cache */ + u_int psize; /* page size */ + int lorder; /* byte order */ + size_t reclen; /* record length (fixed-length records) */ + u_char bval; /* delimiting byte (variable-length records */ + char *bfname; /* btree file name */ +} RECNOINFO; + +#if defined(__cplusplus) +extern "C" { +#endif +DB *__dbopen __P((const char *, int, int, DBTYPE, const void *)); +DB *dbopen __P((const char *, int, int, DBTYPE, const void *)); + +#if defined(__cplusplus) +}; +#endif +#endif /* !_DB_185_H_ */ diff --git a/db2/db_int.h b/db2/db_int.h new file mode 100644 index 0000000000..23fb106755 --- /dev/null +++ b/db2/db_int.h @@ -0,0 +1,332 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db_int.h.src 10.28 (Sleepycat) 8/20/97 + */ + +#ifndef _DB_INTERNAL_H_ +#define _DB_INTERNAL_H_ + +#include "db.h" /* Standard DB include file. */ +#include "queue.h" +#include "os_ext.h" + +/******************************************************* + * General purpose constants and macros. + *******************************************************/ +#define UINT32_T_MAX 0xffffffff /* Maximum 32 bit unsigned. */ +#define UINT16_T_MAX 0xffff /* Maximum 16 bit unsigned. */ + +#define DB_MIN_PGSIZE 0x000200 /* Minimum page size. */ +#define DB_MAX_PGSIZE 0x010000 /* Maximum page size. 
*/ + +#define DB_MINCACHE 10 /* Minimum cached pages */ + +/* + * Aligning items to particular sizes or in pages or memory. ALIGNP is a + * separate macro, as we've had to cast the pointer to different integral + * types on different architectures. + * + * We cast pointers into unsigned longs when manipulating them because C89 + * guarantees that u_long is the largest available integral type and further, + * to never generate overflows. However, neither C89 or C9X requires that + * any integer type be large enough to hold a pointer, although C9X created + * the intptr_t type, which is guaranteed to hold a pointer but may or may + * not exist. At some point in the future, we should test for intptr_t and + * use it where available. + */ +#undef ALIGNTYPE +#define ALIGNTYPE u_long +#undef ALIGNP +#define ALIGNP(value, bound) ALIGN((ALIGNTYPE)value, bound) +#undef ALIGN +#define ALIGN(value, bound) (((value) + (bound) - 1) & ~((bound) - 1)) + +/* + * There are several on-page structures that are declared to have a number of + * fields followed by a variable length array of items. The structure size + * without including the variable length array or the address of the first of + * those elements can be found using SSZ. + * + * This macro can also be used to find the offset of a structure element in a + * structure. This is used in various places to copy structure elements from + * unaligned memory references, e.g., pointers into a packed page. + * + * There are two versions because compilers object if you take the address of + * an array. + */ +#undef SSZ +#define SSZ(name, field) ((int)&(((name *)0)->field)) + +#undef SSZA +#define SSZA(name, field) ((int)&(((name *)0)->field[0])) + +/* Free and free-string macros that overwrite memory during debugging. 
*/ +#ifdef DEBUG +#undef FREE +#define FREE(p, len) { \ + memset(p, 0xff, len); \ + free(p); \ +} +#undef FREES +#define FREES(p) { \ + FREE(p, strlen(p)); \ +} +#else +#undef FREE +#define FREE(p, len) { \ + free(p); \ +} +#undef FREES +#define FREES(p) { \ + free(p); \ +} +#endif + +/* Structure used to print flag values. */ +typedef struct __fn { + u_int32_t mask; /* Flag value. */ + const char *name; /* Flag name. */ +} FN; + +/* Set, clear and test flags. */ +#define F_SET(p, f) (p)->flags |= (f) +#define F_CLR(p, f) (p)->flags &= ~(f) +#define F_ISSET(p, f) ((p)->flags & (f)) +#define LF_SET(f) (flags |= (f)) +#define LF_CLR(f) (flags &= ~(f)) +#define LF_ISSET(f) (flags & (f)) + +/* Display separator string. */ +#undef DB_LINE +#define DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + +/******************************************************* + * Files. + *******************************************************/ +#ifndef MAXPATHLEN /* Maximum path length. */ +#ifdef PATH_MAX +#define MAXPATHLEN PATH_MAX +#else +#define MAXPATHLEN 1024 +#endif +#endif + +#define PATH_DOT "." /* Current working directory. */ +#define PATH_SEPARATOR "/" /* Path separator character. */ + +#ifndef S_IRUSR /* UNIX specific file permissions. */ +#define S_IRUSR 0000400 /* R for owner */ +#define S_IWUSR 0000200 /* W for owner */ +#define S_IRGRP 0000040 /* R for group */ +#define S_IWGRP 0000020 /* W for group */ +#define S_IROTH 0000004 /* R for other */ +#define S_IWOTH 0000002 /* W for other */ +#endif + +#ifndef S_ISDIR /* UNIX specific: directory test. */ +#define S_ISDIR(m) ((m & 0170000) == 0040000) +#endif + +/******************************************************* + * Mutex support. + *******************************************************/ +typedef unsigned char tsl_t; + + + +/* + * !!! + * Various systems require different alignments for mutexes (the worst we've + * seen so far is 16-bytes on some HP architectures). 
The mutex (tsl_t) must + * be first in the db_mutex_t structure, which must itself be first in the + * region. This ensures the alignment is as returned by mmap(2), which should + * be sufficient. All other mutex users must ensure proper alignment locally. + */ +#define MUTEX_ALIGNMENT 1 + +/* + * The offset of a mutex in memory. + */ +#define MUTEX_LOCK_OFFSET(a, b) ((off_t)((u_int8_t *)b - (u_int8_t *)a)) + +typedef struct _db_mutex_t { +#ifdef HAVE_SPINLOCKS + tsl_t tsl_resource; /* Resource test and set. */ +#ifdef DEBUG + u_long pid; /* Lock holder: 0 or process pid. */ +#endif +#else + off_t off; /* Backing file offset. */ + u_long pid; /* Lock holder: 0 or process pid. */ +#endif +#ifdef MUTEX_STATISTICS + u_long mutex_set_wait; /* Blocking mutex: required waiting. */ + u_long mutex_set_nowait; /* Blocking mutex: without waiting. */ +#endif +} db_mutex_t; + +#include "mutex_ext.h" + +/******************************************************* + * Access methods. + *******************************************************/ +/* Lock/unlock a DB thread. */ +#define DB_THREAD_LOCK(dbp) \ + (F_ISSET(dbp, DB_AM_THREAD) ? \ + __db_mutex_lock((db_mutex_t *)(dbp)->mutex, -1, \ + (dbp)->dbenv == NULL ? NULL : (dbp)->dbenv->db_yield) : 0) +#define DB_THREAD_UNLOCK(dbp) \ + (F_ISSET(dbp, DB_AM_THREAD) ? \ + __db_mutex_unlock((db_mutex_t *)(dbp)->mutex, -1) : 0) + +/* Btree/recno local statistics structure. */ +struct __db_bt_lstat; typedef struct __db_bt_lstat DB_BTREE_LSTAT; +struct __db_bt_lstat { + u_int32_t bt_freed; /* Pages freed for reuse. */ + u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ + u_int32_t bt_split; /* Total number of splits. */ + u_int32_t bt_rootsplit; /* Root page splits. */ + u_int32_t bt_fastsplit; /* Fast splits. */ + u_int32_t bt_added; /* Items added. */ + u_int32_t bt_deleted; /* Items deleted. */ + u_int32_t bt_get; /* Items retrieved. */ + u_int32_t bt_cache_hit; /* Hits in fast-insert code. 
*/ + u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ +}; + +/******************************************************* + * Environment. + *******************************************************/ +/* Type passed to __db_appname(). */ +typedef enum { + DB_APP_NONE=0, /* No type (region). */ + DB_APP_DATA, /* Data file. */ + DB_APP_LOG, /* Log file. */ + DB_APP_TMP /* Temporary file. */ +} APPNAME; + +/******************************************************* + * Regions. + *******************************************************/ +/* + * The shared memory regions share an initial structure so that the general + * region code can handle races between the region being deleted and other + * processes waiting on the region mutex. + * + * !!! + * Note, the mutex must be the first entry in the region; see comment above. + */ +typedef struct _rlayout { + db_mutex_t lock; /* Region mutex. */ + u_int32_t refcnt; /* Region reference count. */ + size_t size; /* Region length. */ + int majver; /* Major version number. */ + int minver; /* Minor version number. */ + int patch; /* Patch version number. */ + +#define DB_R_DELETED 0x01 /* Region was deleted. */ + u_int32_t flags; +} RLAYOUT; + +/******************************************************* + * Mpool. + *******************************************************/ +/* + * File types for DB access methods. Negative numbers are reserved to DB. + */ +#define DB_FTYPE_BTREE -1 /* Btree. */ +#define DB_FTYPE_HASH -2 /* Hash. */ + +/* Structure used as the DB pgin/pgout pgcookie. */ +typedef struct __dbpginfo { + size_t db_pagesize; /* Underlying page size. */ + int needswap; /* If swapping required. */ +} DB_PGINFO; + +/******************************************************* + * Log. + *******************************************************/ +/* Initialize an LSN to 'zero'. */ +#define ZERO_LSN(LSN) { \ + (LSN).file = 0; \ + (LSN).offset = 0; \ +} + +/* Return 1 if LSN is a 'zero' lsn, otherwise return 0. 
*/ +#define IS_ZERO_LSN(LSN) ((LSN).file == 0) + +/* Test if we need to log a change. */ +#define DB_LOGGING(dbp) \ + (F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER)) + +#ifdef DEBUG +/* + * Debugging macro to log operations. + * If DEBUG_WOP is defined, log operations that modify the database. + * If DEBUG_ROP is defined, log operations that read the database. + * + * D dbp + * T txn + * O operation (string) + * K key + * A data + * F flags + */ +#define LOG_OP(D, T, O, K, A, F) { \ + DB_LSN _lsn; \ + DBT _op; \ + if (DB_LOGGING((D))) { \ + memset(&_op, 0, sizeof(_op)); \ + _op.data = O; \ + _op.size = strlen(O) + 1; \ + (void)__db_debug_log((D)->dbenv->lg_info, \ + T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F); \ + } \ +} +#ifdef DEBUG_ROP +#define DEBUG_LREAD(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#else +#define DEBUG_LREAD(D, T, O, K, A, F) +#endif +#ifdef DEBUG_WOP +#define DEBUG_LWRITE(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#else +#define DEBUG_LWRITE(D, T, O, K, A, F) +#endif +#else +#define DEBUG_LREAD(D, T, O, K, A, F) +#define DEBUG_LWRITE(D, T, O, K, A, F) +#endif /* DEBUG */ + +/******************************************************* + * Transactions and recovery. + *******************************************************/ +/* + * The locker id space is divided between the transaction manager and the lock + * manager. Lockid's start at 0 and go to MAX_LOCKER_ID. Txn Id's start at + * MAX_LOCKER_ID + 1 and go up to MAX_TXNID. + */ +#define MAX_LOCKER_ID 0x0fffffff +#define MAX_TXNID 0xffffffff + +/* + * Out of band value for a lock. The locks are returned to callers as offsets + * into the lock regions. Since the RLAYOUT structure begins all regions, an + * offset of 0 is guaranteed not to be a valid lock. + */ +#define LOCK_INVALID 0 + +/* The structure allocated for every transaction. */ +struct __db_txn { + DB_TXNMGR *mgrp; /* Pointer to transaction manager. */ + DB_TXN *parent; /* Pointer to transaction's parent. 
*/ + DB_LSN last_lsn; /* Lsn of last log write. */ + u_int32_t txnid; /* Unique transaction id. */ + size_t off; /* Detail structure within region. */ + TAILQ_ENTRY(__db_txn) links; +}; +#endif /* !_DB_INTERNAL_H_ */ diff --git a/db2/dbm/dbm.c b/db2/dbm/dbm.c new file mode 100644 index 0000000000..8daa980f5a --- /dev/null +++ b/db2/dbm/dbm.c @@ -0,0 +1,410 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)dbm.c 10.5 (Sleepycat) 7/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/param.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#endif + +#define DB_DBM_HSEARCH +#include "db_int.h" + +#include "db_page.h" +#include "hash.h" + +/* + * + * This package provides dbm and ndbm compatible interfaces to DB. + * + * The DBM routines, which call the NDBM routines. + */ +static DBM *__cur_db; + +static void __db_no_open __P((void)); + +/* Provide prototypes here since there are none in db.h. 
*/ +int dbm_error __P((DBM *)); +int dbm_clearerr __P((DBM *)); +int dbm_dirfno __P((DBM *)); +int dbm_pagfno __P((DBM *)); + +/* + * dbminit -- + * Make file the current database of the old dbm interface, closing any + * database that is already open. The file is opened read-write (and + * created if necessary) when possible, otherwise read-only. + * + * Returns: + * 0 on success + * -1 on failure + */ +int +dbminit(file) + char *file; +{ + if (__cur_db != NULL) + (void)dbm_close(__cur_db); + if ((__cur_db = + dbm_open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)) != NULL) + return (0); + if ((__cur_db = dbm_open(file, O_RDONLY, 0)) != NULL) + return (0); + return (-1); +} + +/* + * fetch -- + * Look key up in the current database. + * + * Returns the datum produced by dbm_fetch, or a datum with a NULL dptr and + * a zero dsize if no database is open. + */ +datum +fetch(key) + datum key; +{ + datum item; + + if (__cur_db == NULL) { + __db_no_open(); + /* Fill in both members; the datum is returned by value. */ + item.dptr = NULL; + item.dsize = 0; + return (item); + } + return (dbm_fetch(__cur_db, key)); +} + +/* + * firstkey -- + * Return the first key of the current database. + * + * Returns the datum produced by dbm_firstkey, or a datum with a NULL dptr + * and a zero dsize if no database is open. + */ +datum +firstkey() +{ + datum item; + + if (__cur_db == NULL) { + __db_no_open(); + /* Fill in both members; the datum is returned by value. */ + item.dptr = NULL; + item.dsize = 0; + return (item); + } + return (dbm_firstkey(__cur_db)); +} + +/* + * nextkey -- + * Return the next key of the current database. The key argument is + * part of the historic interface but is not consulted here; the call is + * forwarded to dbm_nextkey. + * + * Returns the datum produced by dbm_nextkey, or a datum with a NULL dptr + * and a zero dsize if no database is open. + */ +datum +nextkey(key) + datum key; +{ + datum item; + + if (__cur_db == NULL) { + __db_no_open(); + /* Fill in both members; the datum is returned by value. */ + item.dptr = NULL; + item.dsize = 0; + return (item); + } + return (dbm_nextkey(__cur_db)); +} + +/* + * delete -- + * Remove key from the current database and, if that succeeded, call + * the database's sync method. + * + * Returns -1 if no database is open, otherwise the result of dbm_delete + * or, when the delete succeeded, of the sync. + */ +int +delete(key) + datum key; +{ + int ret; + + if (__cur_db == NULL) { + __db_no_open(); + return (-1); + } + ret = dbm_delete(__cur_db, key); + if (ret == 0) + ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0); + return (ret); +} + +/* + * store -- + * Store dat under key in the current database, replacing any existing + * entry, and, if that succeeded, call the database's sync method. + * + * Returns -1 if no database is open, otherwise the result of dbm_store + * or, when the store succeeded, of the sync. + */ +int +store(key, dat) + datum key, dat; +{ + int ret; + + if (__cur_db == NULL) { + __db_no_open(); + return (-1); + } + ret = dbm_store(__cur_db, key, dat, DBM_REPLACE); + if (ret == 0) + ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0); + return (ret); +} + +/* Complain on stderr that the old dbm interface was used before dbminit. */ +static void +__db_no_open() +{ + (void)fprintf(stderr, "dbm: no open database.\n"); +} + +/* + * This package provides dbm and ndbm compatible interfaces to DB. + * + * The NDBM routines, which call the DB routines. 
+ */ +/* + * Returns: + * *DBM on success + * NULL on failure + */ +DBM * +dbm_open(file, oflags, mode) + const char *file; + int oflags, mode; +{ + DB *dbp; + DB_INFO dbinfo; + char path[MAXPATHLEN]; + + memset(&dbinfo, 0, sizeof(dbinfo)); + dbinfo.db_pagesize = 4096; + dbinfo.h_ffactor = 40; + dbinfo.h_nelem = 1; + + (void)snprintf(path, sizeof(path), "%s%s", file, DBM_SUFFIX); + if ((errno = db_open(path, + DB_HASH, __db_oflags(oflags), mode, NULL, &dbinfo, &dbp)) != 0) + return (NULL); + return ((DBM *)dbp); +} + +/* + * Returns: + * Nothing. + */ +void +dbm_close(db) + DBM *db; +{ + (void)db->close(db, 0); +} + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_fetch(db, key) + DBM *db; + datum key; +{ + DBT _key, _data; + datum data; + int status; + + memset(&_key, 0, sizeof(DBT)); + memset(&_data, 0, sizeof(DBT)); + _key.size = key.dsize; + _key.data = key.dptr; + status = db->get((DB *)db, NULL, &_key, &_data, 0); + if (status) { + data.dptr = NULL; + data.dsize = 0; + } else { + data.dptr = _data.data; + data.dsize = _data.size; + } + return (data); +} + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_firstkey(db) + DBM *db; +{ + DBT _key, _data; + datum key; + int status; + + DBC *cp; + + if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL) + if ((errno = db->cursor(db, NULL, &cp)) != 0) { + memset(&key, 0, sizeof(key)); + return (key); + } + + memset(&_key, 0, sizeof(DBT)); + memset(&_data, 0, sizeof(DBT)); + status = (cp->c_get)(cp, &_key, &_data, DB_FIRST); + if (status) { + key.dptr = NULL; + key.dsize = 0; + } else { + key.dptr = _key.data; + key.dsize = _key.size; + } + return (key); +} + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_nextkey(db) + DBM *db; +{ + DBC *cp; + DBT _key, _data; + datum key; + int status; + + if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL) + if ((errno = db->cursor(db, NULL, &cp)) != 0) { + memset(&key, 0, sizeof(key)); + return (key); + } + + 
memset(&_key, 0, sizeof(DBT)); + memset(&_data, 0, sizeof(DBT)); + status = (cp->c_get)(cp, &_key, &_data, DB_NEXT); + if (status) { + key.dptr = NULL; + key.dsize = 0; + } else { + key.dptr = _key.data; + key.dsize = _key.size; + } + return (key); +} + +/* + * Returns: + * 0 on success + * <0 failure + */ +int +dbm_delete(db, key) + DBM *db; + datum key; +{ + DBT _key; + int ret; + + memset(&_key, 0, sizeof(DBT)); + _key.data = key.dptr; + _key.size = key.dsize; + ret = (((DB *)db)->del)((DB *)db, NULL, &_key, 0); + if (ret < 0) + errno = ENOENT; + else if (ret > 0) { + errno = ret; + ret = -1; + } + return (ret); +} + +/* + * Returns: + * 0 on success + * <0 failure + * 1 if DBM_INSERT and entry exists + */ +int +dbm_store(db, key, data, flags) + DBM *db; + datum key, data; + int flags; +{ + DBT _key, _data; + + memset(&_key, 0, sizeof(DBT)); + memset(&_data, 0, sizeof(DBT)); + _key.data = key.dptr; + _key.size = key.dsize; + _data.data = data.dptr; + _data.size = data.dsize; + return (db->put((DB *)db, + NULL, &_key, &_data, (flags == DBM_INSERT) ? DB_NOOVERWRITE : 0)); +} + +int +dbm_error(db) + DBM *db; +{ + HTAB *hp; + + hp = (HTAB *)db->internal; + return (hp->local_errno); +} + +int +dbm_clearerr(db) + DBM *db; +{ + HTAB *hp; + + hp = (HTAB *)db->internal; + hp->local_errno = 0; + return (0); +} + +/* + * XXX + * We only have a single file descriptor that we can return, not two. Return + * the same one for both files. Hopefully, the user is using it for locking + * and picked one to use at random. + */ +int +dbm_dirfno(db) + DBM *db; +{ + int fd; + + (void)db->fd(db, &fd); + return (fd); +} + +int +dbm_pagfno(db) + DBM *db; +{ + int fd; + + (void)db->fd(db, &fd); + return (fd); +} diff --git a/db2/hash/hash.c b/db2/hash/hash.c new file mode 100644 index 0000000000..6d8c40057d --- /dev/null +++ b/db2/hash/hash.c @@ -0,0 +1,1440 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. 
All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash.c 10.25 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "shqueue.h" +#include "db_int.h" +#include "db_page.h" +#include "db_am.h" +#include "db_ext.h" +#include "hash.h" +#include "log.h" + +static int __ham_c_close __P((DBC *)); +static int __ham_c_del __P((DBC *, int)); +static int __ham_c_get __P((DBC *, DBT *, DBT *, int)); +static int __ham_c_put __P((DBC *, DBT *, DBT *, int)); +static int __ham_c_init __P((DB *, DB_TXN *, DBC **)); +static int __ham_cursor __P((DB *, DB_TXN *, DBC **)); +static int __ham_delete __P((DB *, DB_TXN *, DBT *, int)); +static int __ham_dup_return __P((HTAB *, HASH_CURSOR *, DBT *, int)); +static int __ham_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static void __ham_init_htab __P((HTAB *)); +static int __ham_lookup __P((HTAB *, + HASH_CURSOR *, const DBT *, u_int32_t, db_lockmode_t)); +static int __ham_overwrite __P((HTAB *, HASH_CURSOR *, DBT *)); +static int __ham_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __ham_sync __P((DB *, int)); + +/************************** INTERFACE ROUTINES ***************************/ +/* OPEN/CLOSE */ + +/* + * __ham_open -- + * + * PUBLIC: int __ham_open 
__P((DB *, DB_INFO *)); + */ +int +__ham_open(dbp, dbinfo) + DB *dbp; + DB_INFO *dbinfo; +{ + DB_ENV *dbenv; + DBC *curs; + HTAB *hashp; + int file_existed, ret; + + dbenv = dbp->dbenv; + + if ((hashp = (HTAB *)calloc(1, sizeof(HTAB))) == NULL) + return (ENOMEM); + hashp->dbp = dbp; + + /* Set the hash function if specified by the user. */ + if (dbinfo != NULL && dbinfo->h_hash != NULL) + hashp->hash = dbinfo->h_hash; + + /* + * Initialize the remaining fields of the dbp. The type, close and + * fd functions are all set in db_open. + */ + dbp->internal = hashp; + dbp->cursor = __ham_cursor; + dbp->del = __ham_delete; + dbp->get = __ham_get; + dbp->put = __ham_put; + dbp->sync = __ham_sync; + + /* If locking is turned on, lock the meta data page. */ + if (F_ISSET(dbp, DB_AM_LOCKING)) { + dbp->lock.pgno = BUCKET_INVALID; + if ((ret = lock_get(dbenv->lk_info, dbp->locker, + 0, &dbp->lock_dbt, DB_LOCK_READ, &hashp->hlock)) != 0) { + if (ret < 0) + ret = EAGAIN; + goto out; + } + } + + /* + * Now, we can try to read the meta-data page and figure out + * if we set up locking and get the meta-data page properly. + * If this is a new file, initialize it, and put it back dirty. + */ + if ((ret = __ham_get_page(hashp->dbp, 0, (PAGE **)&hashp->hdr)) != 0) + goto out; + + /* Initialize the hashp structure */ + if (hashp->hdr->magic == DB_HASHMAGIC) { + file_existed = 1; + /* File exists, verify the data in the header. */ + if (hashp->hash == NULL) + hashp->hash = + hashp->hdr->version < 5 ? __ham_func4 : __ham_func5; + if (hashp->hash(CHARKEY, sizeof(CHARKEY)) != + hashp->hdr->h_charkey) { + __db_err(hashp->dbp->dbenv, + "hash: incompatible hash function"); + ret = EINVAL; + goto out; + } + if (F_ISSET(hashp->hdr, DB_HASH_DUP)) + F_SET(dbp, DB_AM_DUP); + } else { + /* + * File does not exist, we must initialize the header. If + * locking is enabled that means getting a write lock first. 
+ */ + file_existed = 0; + if (F_ISSET(dbp, DB_AM_LOCKING) && + ((ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0 || + (ret = lock_get(dbenv->lk_info, dbp->locker, 0, + &dbp->lock_dbt, DB_LOCK_WRITE, &hashp->hlock)) != 0)) { + if (ret < 0) + ret = EAGAIN; + goto out; + } + + hashp->hdr->nelem = dbinfo != NULL ? dbinfo->h_nelem : 0; + hashp->hdr->ffactor = + dbinfo != NULL && dbinfo->h_ffactor ? dbinfo->h_ffactor : 0; + __ham_init_htab(hashp); + if (F_ISSET(dbp, DB_AM_DUP)) + F_SET(hashp->hdr, DB_HASH_DUP); + if ((ret = __ham_dirty_page(hashp, (PAGE *)hashp->hdr)) != 0) + goto out; + } + + /* Initialize the default cursor. */ + __ham_c_init(dbp, NULL, &curs); + TAILQ_INSERT_TAIL(&dbp->curs_queue, curs, links); + + /* Allocate memory for our split buffer. */ + if ((hashp->split_buf = (PAGE *)malloc(dbp->pgsize)) == NULL) { + ret = ENOMEM; + goto out; + } + +#ifdef NO_STATISTICS_FOR_DB_ERR + __db_err(dbp->dbenv, + "%s%lx\n%s%ld\n%s%ld\n%s%ld\n%s%ld\n%s0x%lx\n%s0x%lx\n%s%ld\n%s%ld\n%s0x%lx", + "TABLE POINTER ", (long)hashp, + "BUCKET SIZE ", (long)hashp->hdr->pagesize, + "FILL FACTOR ", (long)hashp->hdr->ffactor, + "MAX BUCKET ", (long)hashp->hdr->max_bucket, + "OVFL POINT ", (long)hashp->hdr->ovfl_point, + "LAST FREED ", (long)hashp->hdr->last_freed, + "HIGH MASK ", (long)hashp->hdr->high_mask, + "LOW MASK ", (long)hashp->hdr->low_mask, + "NELEM ", (long)hashp->hdr->nelem, + "FLAGS ", (long)hashp->hdr->flags); +#endif + + /* Release the meta data page */ + (void)__ham_put_page(hashp->dbp, (PAGE *)hashp->hdr, 0); + if (F_ISSET(dbp, DB_AM_LOCKING) && + (ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0) { + if (ret < 0) + ret = EAGAIN; + goto out; + } + + hashp->hlock = 0; + hashp->hdr = NULL; + /* Sync the file so that we know that the meta data goes to disk. 
*/ + if (!file_existed && (ret = dbp->sync(dbp, 0)) != 0) + goto out; + return (0); + +out: (void)__ham_close(dbp); + return (ret); +} + +/* + * __ham_close -- + * Release everything the hash access method holds for this DB handle: + * the split buffer, the meta-data page and its lock if they are still + * held, and the HTAB itself. Returns 0, or the first error reported by + * __ham_put_page or lock_put. + * + * PUBLIC: int __ham_close __P((DB *)); + */ +int +__ham_close(dbp) + DB *dbp; +{ + HTAB *hashp; + int ret, t_ret; + + DEBUG_LWRITE(dbp, NULL, "ham_close", NULL, NULL, 0); + hashp = (HTAB *)dbp->internal; + ret = 0; + + /* Free the split page. */ + if (hashp->split_buf) + FREE(hashp->split_buf, dbp->pgsize); + + if (hashp->hdr && (t_ret = __ham_put_page(hashp->dbp, + (PAGE *)hashp->hdr, 0)) != 0 && ret == 0) + ret = t_ret; + if (hashp->hlock && (t_ret = lock_put(hashp->dbp->dbenv->lk_info, + hashp->hlock)) != 0 && ret == 0) + ret = t_ret; + + FREE(hashp, sizeof(HTAB)); + dbp->internal = NULL; + return (ret); +} + +/************************** LOCAL CREATION ROUTINES **********************/ +/* + * __ham_init_htab -- + * Fill in the meta-data header of a newly created hash file. The caller + * must already have stored nelem and ffactor in the header; they size the + * initial table. This function is void and cannot report failure. + */ +static void +__ham_init_htab(hashp) + HTAB *hashp; +{ + u_int32_t nelem; + int32_t l2, nbuckets; + + nelem = hashp->hdr->nelem; + hashp->hdr->pagesize = hashp->dbp->pgsize; + ZERO_LSN(hashp->hdr->lsn); + hashp->hdr->magic = DB_HASHMAGIC; + hashp->hdr->version = DB_HASHVERSION; + if (hashp->hash == NULL) + hashp->hash = + hashp->hdr->version < 5 ? __ham_func4 : __ham_func5; + hashp->hdr->h_charkey = hashp->hash(CHARKEY, sizeof(CHARKEY)); + if (nelem != 0 && hashp->hdr->ffactor != 0) { + nelem = (nelem - 1) / hashp->hdr->ffactor + 1; + l2 = __db_log2(nelem > 2 ? 
nelem : 2); + } else + l2 = 2; + + nbuckets = 1 << l2; + + hashp->hdr->spares[l2] = 0; + hashp->hdr->spares[l2 + 1] = 0; + hashp->hdr->ovfl_point = l2; + hashp->hdr->last_freed = PGNO_INVALID; + + hashp->hdr->max_bucket = hashp->hdr->high_mask = nbuckets - 1; + hashp->hdr->low_mask = (nbuckets >> 1) - 1; + memcpy(hashp->hdr->uid, hashp->dbp->lock.fileid, DB_FILE_ID_LEN); +} + +/********************** DESTROY/CLOSE ROUTINES ************************/ + + +/* + * Write modified pages to disk + * + * Returns: + * 0 == OK + * -1 ERROR + */ +static int +__ham_sync(dbp, flags) + DB *dbp; + int flags; +{ + int ret; + + DEBUG_LWRITE(dbp, NULL, "ham_sync", NULL, NULL, flags); + if ((ret = __db_syncchk(dbp, flags)) != 0) + return (ret); + if (F_ISSET(dbp, DB_AM_RDONLY)) + return (0); + + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) + ret = 0; + + return (ret); +} + +/*******************************SEARCH ROUTINES *****************************/ +/* + * All the access routines return + * + * Returns: + * 0 on SUCCESS + * 1 to indicate an external ERROR (i.e. key not found, etc) + * -1 to indicate an internal ERROR (i.e. 
out of memory, etc) + */ + +static int +__ham_get(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + DBT *data; + int flags; +{ + DB *ldbp; + DBC *cp; + HTAB *hashp; + HASH_CURSOR *hcp; + int ret, t_ret; + + DEBUG_LREAD(dbp, txn, "ham_get", key, NULL, flags); + if ((ret = __db_getchk(dbp, key, data, flags)) != 0) + return (ret); + + ldbp = dbp; + if (F_ISSET(dbp, DB_AM_THREAD) && + (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + + hashp = (HTAB *)ldbp->internal; + SET_LOCKER(ldbp, txn); + GET_META(ldbp, hashp); + cp = TAILQ_FIRST(&ldbp->curs_queue); + + hashp->hash_accesses++; + hcp = (HASH_CURSOR *)TAILQ_FIRST(&ldbp->curs_queue)->internal; + if ((ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ)) == 0) + if (F_ISSET(hcp, H_OK)) + ret = __ham_dup_return(hashp, hcp, data, DB_FIRST); + else /* Key was not found */ + ret = DB_NOTFOUND; + + if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) + ret = t_ret; + RELEASE_META(ldbp, hashp); + if (F_ISSET(dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +static int +__ham_put(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + DBT *data; + int flags; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp; + DBT tmp_val, *myval; + int ret, t_ret; + u_int32_t nbytes; + + DEBUG_LWRITE(dbp, txn, "ham_put", key, data, flags); + if ((ret = __db_putchk(dbp, key, data, + flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0) + return (ret); + + ldbp = dbp; + if (F_ISSET(dbp, DB_AM_THREAD) && + (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + + hashp = (HTAB *)ldbp->internal; + SET_LOCKER(ldbp, txn); + GET_META(ldbp, hashp); + hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal; + + nbytes = (ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(key->size)) + + (ISBIG(hashp, data->size) ? 
HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(data->size)); + + hashp->hash_accesses++; + ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE); + + if (ret == DB_NOTFOUND) { + ret = 0; + if (hcp->seek_found_page != PGNO_INVALID && + hcp->seek_found_page != hcp->pgno) { + if ((ret = __ham_item_done(hashp, hcp, 0)) != 0) + goto out; + hcp->pgno = hcp->seek_found_page; + hcp->bndx = NDX_INVALID; + } + + if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) { + /* + * Doing a partial put, but the key does not exist + * and we are not beginning the write at 0. We + * must create a data item padded up to doff and + * then write the new bytes represented by val. + */ + ret = __ham_init_dbt(&tmp_val, data->size + data->doff, + &hcp->big_data, &hcp->big_datalen); + if (ret == 0) { + memset(tmp_val.data, 0, data->doff); + memcpy((u_int8_t *)tmp_val.data + data->doff, + data->data, data->size); + myval = &tmp_val; + } + } else + myval = (DBT *)data; + + if (ret == 0) + ret = __ham_add_el(hashp, hcp, key, myval, H_KEYDATA); + } else if (ret == 0 && F_ISSET(hcp, H_OK)) { + if (flags == DB_NOOVERWRITE) + ret = DB_KEYEXIST; + else if (F_ISSET(ldbp, DB_AM_DUP)) + ret = __ham_add_dup(hashp, hcp, data, DB_KEYLAST); + else + ret = __ham_overwrite(hashp, hcp, data); + } + + /* Free up all the cursor pages. */ + if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + ret = t_ret; + /* Now check if we have to grow. 
*/ +out: if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { + ret = __ham_expand_table(hashp); + F_CLR(hcp, H_EXPAND); + } + + if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + ret = t_ret; + RELEASE_META(ldbp, hashp); + if (F_ISSET(dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +static int +__ham_cursor(dbp, txnid, dbcp) + DB *dbp; + DB_TXN *txnid; + DBC **dbcp; +{ + int ret; + + DEBUG_LWRITE(dbp, txnid, "ham_cursor", NULL, NULL, 0); + if ((ret = __ham_c_init(dbp, txnid, dbcp)) != 0) + return (ret); + + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_TAIL(&dbp->curs_queue, *dbcp, links); + DB_THREAD_UNLOCK(dbp); + return (ret); +} + +static int +__ham_c_init(dbp, txnid, dbcp) + DB *dbp; + DB_TXN *txnid; + DBC **dbcp; +{ + DBC *db_curs; + HASH_CURSOR *new_curs; + + if ((db_curs = (DBC *)calloc(sizeof(DBC), 1)) == NULL) + return (ENOMEM); + + if ((new_curs = + (HASH_CURSOR *)calloc(sizeof(struct cursor_t), 1)) == NULL) { + FREE(db_curs, sizeof(DBC)); + return (ENOMEM); + } + + db_curs->internal = new_curs; + db_curs->c_close = __ham_c_close; + db_curs->c_del = __ham_c_del; + db_curs->c_get = __ham_c_get; + db_curs->c_put = __ham_c_put; + db_curs->txn = txnid; + db_curs->dbp = dbp; + + new_curs->db_cursor = db_curs; + __ham_item_init(new_curs); + + if (dbcp != NULL) + *dbcp = db_curs; + return (0); +} + +static int +__ham_delete(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + int flags; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp; + int ret, t_ret; + + DEBUG_LWRITE(dbp, txn, "ham_delete", key, NULL, flags); + if ((ret = __db_delchk(dbp, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); + + ldbp = dbp; + if (F_ISSET(dbp, DB_AM_THREAD) && + (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + hashp = (HTAB *)ldbp->internal; + SET_LOCKER(ldbp, txn); + GET_META(ldbp, hashp); + hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal; + + hashp->hash_accesses++; + if ((ret = __ham_lookup(hashp, hcp, key, 0, 
DB_LOCK_WRITE)) == 0) + if (F_ISSET(hcp, H_OK)) + ret = __ham_del_pair(hashp, hcp); + else + ret = DB_NOTFOUND; + + if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + ret = t_ret; + RELEASE_META(ldbp, hashp); + if (F_ISSET(dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +/* ****************** CURSORS ********************************** */ +static int +__ham_c_close(cursor) + DBC *cursor; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp; + int ret; + + DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_close", NULL, NULL, 0); + /* + * If the pagep, dpagep, and lock fields of the cursor are all NULL, + * then there really isn't a need to get a handle here. However, + * the normal case is that at least one of those fields is non-NULL, + * and putting those checks in here would couple the ham_item_done + * functionality with cursor close which would be pretty disgusting. + * Instead, we pay the overhead here of always getting the handle. + */ + ldbp = cursor->dbp; + if (F_ISSET(cursor->dbp, DB_AM_THREAD) && + (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + hashp = (HTAB *)ldbp->internal; + hcp = (HASH_CURSOR *)cursor->internal; + ret = __ham_item_done(hashp, hcp, 0); + + if (hcp->big_key) + FREE(hcp->big_key, hcp->big_keylen); + if (hcp->big_data) + FREE(hcp->big_data, hcp->big_datalen); + + /* + * All cursors (except the default ones) are linked off the master. + * Therefore, when we close the cursor, we have to remove it from + * the master, not the local one. When we are closing the file in + * its entirety, then we clear the THREAD bit and the master and + * local are identical, so we remove the correct one. 
+ */ + DB_THREAD_LOCK(cursor->dbp); + TAILQ_REMOVE(&cursor->dbp->curs_queue, cursor, links); + DB_THREAD_UNLOCK(cursor->dbp); + + if (F_ISSET(cursor->dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + + FREE(hcp, sizeof(HASH_CURSOR)); + FREE(cursor, sizeof(DBC)); + return (ret); +} + +static int +__ham_c_del(cursor, flags) + DBC *cursor; + int flags; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp; + HASH_CURSOR save_curs; + db_pgno_t ppgno, chg_pgno; + int ret, t_ret; + + DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_del", NULL, NULL, flags); + ldbp = cursor->dbp; + if (F_ISSET(cursor->dbp, DB_AM_THREAD) && + (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + hashp = (HTAB *)ldbp->internal; + hcp = (HASH_CURSOR *)cursor->internal; + save_curs = *hcp; + if ((ret = __db_cdelchk(ldbp, flags, + F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) + return (ret); + if (F_ISSET(hcp, H_DELETED)) + return (DB_NOTFOUND); + + SET_LOCKER(hashp->dbp, cursor->txn); + GET_META(hashp->dbp, hashp); + hashp->hash_accesses++; + if ((ret = __ham_get_cpage(hashp, hcp, DB_LOCK_WRITE)) != 0) + goto out; + if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID) { + ppgno = PREV_PGNO(hcp->dpagep); + + /* Remove item from duplicate page. */ + chg_pgno = hcp->dpgno; + if ((ret = __db_drem(hashp->dbp, + &hcp->dpagep, hcp->dndx, __ham_del_page)) != 0) + goto out; + + /* + * There are 4 cases. + * 1. We removed an item on a page, but nothing else changed. + * 2. We removed the last item on a page, but there is a + * following page of duplicates. + * 3. We removed the last item on a page, this page was the + * last page in a duplicate set, but there were dups before + * it. + * 4. We removed the last item on a page, removing the last + * duplicate. + * In case 1 hcp->dpagep is unchanged. + * In case 2 hcp->dpagep comes back pointing to the next dup + * page. + * In case 3 hcp->dpagep comes back NULL. + * In case 4 hcp->dpagep comes back NULL. 
+ */ + if (hcp->dpagep == NULL) { + if (ppgno != PGNO_INVALID) { /* Case 3 */ + hcp->dpgno = ppgno; + if ((ret = __ham_get_cpage(hashp, hcp, + DB_LOCK_READ)) != 0) + goto out; + hcp->dndx = NUM_ENT(hcp->dpagep); + F_SET(hcp, H_DELETED); + } else { /* Case 4 */ + ret = __ham_del_pair(hashp, hcp); + hcp->dpgno = PGNO_INVALID; + /* + * Delpair updated the cursor queue, so we + * don't have to do that here. + */ + chg_pgno = PGNO_INVALID; + } + } else if (PGNO(hcp->dpagep) != hcp->dpgno) { + hcp->dndx = 0; /* Case 2 */ + hcp->dpgno = PGNO(hcp->dpagep); + if (ppgno == PGNO_INVALID) + memcpy(P_ENTRY(hcp->pagep, + H_DATAINDEX(hcp->bndx)) + + SSZ(HOFFDUP, pgno), &hcp->dpgno, + sizeof(db_pgno_t)); + F_SET(hcp, H_DELETED); + } else /* Case 1 */ + F_SET(hcp, H_DELETED); + if (chg_pgno != PGNO_INVALID) + __ham_c_update(hashp, hcp, chg_pgno, 0, 0, 1); + } else if (F_ISSET(hcp, H_ISDUP)) { /* on page */ + if (hcp->dup_off == 0 && DUP_SIZE(hcp->dup_len) == + LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx)) + ret = __ham_del_pair(hashp, hcp); + else { + DBT repldbt; + + repldbt.flags = 0; + F_SET(&repldbt, DB_DBT_PARTIAL); + repldbt.doff = hcp->dup_off; + repldbt.dlen = DUP_SIZE(hcp->dup_len); + repldbt.size = 0; + ret = __ham_replpair(hashp, hcp, &repldbt, 0); + hcp->dup_tlen -= DUP_SIZE(hcp->dup_len); + __ham_c_update(hashp, hcp, hcp->pgno, + DUP_SIZE(hcp->dup_len), 0, 1); + F_SET(hcp, H_DELETED); + } + + } else + /* Not a duplicate */ + ret = __ham_del_pair(hashp, hcp); + +out: if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + t_ret = ret; + if (ret != 0) + *hcp = save_curs; + RELEASE_META(hashp->dbp, hashp); + if (F_ISSET(cursor->dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +static int +__ham_c_get(cursor, key, data, flags) + DBC *cursor; + DBT *key; + DBT *data; + int flags; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp, save_curs; + int get_key, ret, t_ret; + + DEBUG_LREAD(cursor->dbp, cursor->txn, "ham_c_get", + flags == 
DB_SET || flags == DB_SET_RANGE ? key : NULL, + NULL, flags); + ldbp = cursor->dbp; + if (F_ISSET(cursor->dbp, DB_AM_THREAD) && + (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + hashp = (HTAB *)(ldbp->internal); + hcp = (HASH_CURSOR *)cursor->internal; + save_curs = *hcp; + if ((ret = + __db_cgetchk(hashp->dbp, key, data, flags, IS_VALID(hcp))) != 0) + return (ret); + + SET_LOCKER(hashp->dbp, cursor->txn); + GET_META(hashp->dbp, hashp); + hashp->hash_accesses++; + + hcp->seek_size = 0; + + ret = 0; + get_key = 1; + switch (flags) { + case DB_PREV: + if (hcp->bucket != BUCKET_INVALID) { + ret = __ham_item_prev(hashp, hcp, DB_LOCK_READ); + break; + } + /* FALL THROUGH */ + case DB_LAST: + ret = __ham_item_last(hashp, hcp, DB_LOCK_READ); + break; + case DB_FIRST: + ret = __ham_item_first(hashp, hcp, DB_LOCK_READ); + break; + case DB_NEXT: + if (hcp->bucket == BUCKET_INVALID) + hcp->bucket = 0; + ret = __ham_item_next(hashp, hcp, DB_LOCK_READ); + break; + case DB_SET: + case DB_SET_RANGE: + ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ); + get_key = 0; + break; + case DB_CURRENT: + if (F_ISSET(hcp, H_DELETED)) { + ret = DB_KEYEMPTY; + goto out; + } + + ret = __ham_item(hashp, hcp, DB_LOCK_READ); + break; + } + + /* + * Must always enter this loop to do error handling and + * check for big key/data pair. + */ + while (1) { + if (ret != 0 && ret != DB_NOTFOUND) + goto out1; + else if (F_ISSET(hcp, H_OK)) { + /* Get the key. */ + if (get_key && (ret = __db_ret(hashp->dbp, hcp->pagep, + H_KEYINDEX(hcp->bndx), key, &hcp->big_key, + &hcp->big_keylen)) != 0) + goto out1; + + ret = __ham_dup_return(hashp, hcp, data, flags); + break; + } else if (!F_ISSET(hcp, H_NOMORE)) { + abort(); + break; + } + + /* + * Ran out of entries in a bucket; change buckets. 
+ */ + switch (flags) { + case DB_LAST: + case DB_PREV: + ret = __ham_item_done(hashp, hcp, 0); + if (hcp->bucket == 0) { + ret = DB_NOTFOUND; + goto out1; + } + hcp->bucket--; + hcp->bndx = NDX_INVALID; + if (ret == 0) + ret = __ham_item_prev(hashp, + hcp, DB_LOCK_READ); + break; + case DB_FIRST: + case DB_NEXT: + ret = __ham_item_done(hashp, hcp, 0); + hcp->bndx = NDX_INVALID; + hcp->bucket++; + hcp->pgno = PGNO_INVALID; + hcp->pagep = NULL; + if (hcp->bucket > hashp->hdr->max_bucket) { + ret = DB_NOTFOUND; + goto out1; + } + if (ret == 0) + ret = __ham_item_next(hashp, + hcp, DB_LOCK_READ); + break; + case DB_SET: + case DB_SET_RANGE: + /* Key not found. */ + ret = DB_NOTFOUND; + goto out1; + } + } +out1: if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) + t_ret = ret; +out: if (ret) + *hcp = save_curs; + RELEASE_META(hashp->dbp, hashp); + if (F_ISSET(cursor->dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +static int +__ham_c_put(cursor, key, data, flags) + DBC *cursor; + DBT *key; + DBT *data; + int flags; +{ + DB *ldbp; + HTAB *hashp; + HASH_CURSOR *hcp, save_curs; + int ret, t_ret; + u_int32_t nbytes; + + DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, + NULL, flags); + ldbp = cursor->dbp; + if (F_ISSET(cursor->dbp, DB_AM_THREAD) && + (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) + return (ret); + hashp = (HTAB *)(ldbp->internal); + hcp = (HASH_CURSOR *)cursor->internal; + save_curs = *hcp; + + if ((ret = __db_cputchk(hashp->dbp, key, data, flags, + F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) + return (ret); + if (F_ISSET(hcp, H_DELETED)) + return (DB_NOTFOUND); + + SET_LOCKER(hashp->dbp, cursor->txn); + GET_META(hashp->dbp, hashp); + ret = 0; + + switch (flags) { + case DB_KEYLAST: + case DB_KEYFIRST: + nbytes = (ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(key->size)) + + (ISBIG(hashp, data->size) ? 
HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(data->size)); + ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE); + break; + case DB_BEFORE: + case DB_AFTER: + case DB_CURRENT: + ret = __ham_item(hashp, hcp, DB_LOCK_WRITE); + break; + } + + if (ret == 0) { + if (flags == DB_CURRENT && !F_ISSET(ldbp, DB_AM_DUP)) + ret = __ham_overwrite(hashp, hcp, data); + else + ret = __ham_add_dup(hashp, hcp, data, flags); + } + + if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { + ret = __ham_expand_table(hashp); + F_CLR(hcp, H_EXPAND); + } + + if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + *hcp = save_curs; + RELEASE_META(hashp->dbp, hashp); + if (F_ISSET(cursor->dbp, DB_AM_THREAD)) + __db_puthandle(ldbp); + return (ret); +} + +/********************************* UTILITIES ************************/ + +/* + * __ham_expand_table -- + * + * PUBLIC: int __ham_expand_table __P((HTAB *)); + */ +int +__ham_expand_table(hashp) + HTAB *hashp; +{ + u_int32_t old_bucket, new_bucket; + u_int32_t spare_ndx; + int ret; + + ret = 0; + DIRTY_META(hashp, ret); + if (ret) + return (ret); + + if (DB_LOGGING(hashp->dbp)) { + DB_LSN new_lsn; + + if ((ret = __ham_splitmeta_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, + hashp->hdr->max_bucket, hashp->hdr->ovfl_point, + hashp->hdr->spares[hashp->hdr->ovfl_point], + &hashp->hdr->lsn)) != 0) + return (ret); + + hashp->hdr->lsn = new_lsn; + } + + hashp->hash_expansions++; + new_bucket = ++hashp->hdr->max_bucket; + old_bucket = (hashp->hdr->max_bucket & hashp->hdr->low_mask); + + /* + * If the split point is increasing (hdr.max_bucket's log base 2 + * increases), max sure that we have enough extra pages, then + * copy the current contents of the spare split bucket to the + * next bucket. + */ + spare_ndx = __db_log2(hashp->hdr->max_bucket + 1); + if (spare_ndx > hashp->hdr->ovfl_point) { + /* + * We are about to shift the split point. 
Make sure that + * if the next doubling is going to be big (more than 8 + * pages), we have some extra pages around. + */ + if (hashp->hdr->spares[hashp->hdr->ovfl_point] == 0 && + new_bucket >= 8) + __ham_init_ovflpages(hashp); + + hashp->hdr->spares[spare_ndx] = + hashp->hdr->spares[hashp->hdr->ovfl_point]; + hashp->hdr->ovfl_point = spare_ndx; + } + + if (new_bucket > hashp->hdr->high_mask) { + /* Starting a new doubling */ + hashp->hdr->low_mask = hashp->hdr->high_mask; + hashp->hdr->high_mask = new_bucket | hashp->hdr->low_mask; + } + + if (BUCKET_TO_PAGE(hashp, new_bucket) > MAX_PAGES(hashp)) { + __db_err(hashp->dbp->dbenv, + "hash: Cannot allocate new bucket. Pages exhausted."); + return (ENOSPC); + } + + /* Relocate records to the new bucket */ + return (__ham_split_page(hashp, old_bucket, new_bucket)); +} + +/* + * PUBLIC: u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t)); + */ +u_int32_t +__ham_call_hash(hashp, k, len) + HTAB *hashp; + u_int8_t *k; + int32_t len; +{ + u_int32_t n, bucket; + + n = (u_int32_t)hashp->hash(k, len); + bucket = n & hashp->hdr->high_mask; + if (bucket > hashp->hdr->max_bucket) + bucket = bucket & hashp->hdr->low_mask; + return (bucket); +} + +/* + * Check for duplicates, and call __db_ret appropriately. Release + * everything held by the cursor. + */ +static int +__ham_dup_return(hashp, hcp, val, flags) + HTAB *hashp; + HASH_CURSOR *hcp; + DBT *val; + int flags; +{ + HKEYDATA *hk; + PAGE *pp; + DBT *myval, tmp_val; + db_indx_t ndx; + db_pgno_t pgno; + u_int8_t type; + int indx, ret; + db_indx_t len; + + /* Check for duplicate and return the first one. */ + ndx = H_DATAINDEX(hcp->bndx); + type = GET_HKEYDATA(hcp->pagep, ndx)->type; + pp = hcp->pagep; + myval = val; + + /* + * There are 3 cases: + * 1. We are not in duplicate, simply call db_ret. + * 2. We are looking at keys and stumbled onto a duplicate. + * 3. We are in the middle of a duplicate set. 
(ISDUP set) + */ + + /* + * Here we check for the case where we just stumbled onto a + * duplicate. In this case, we do initialization and then + * let the normal duplicate code handle it. + */ + if (!F_ISSET(hcp, H_ISDUP)) + if (type == H_DUPLICATE) { + F_SET(hcp, H_ISDUP); + hcp->dup_tlen = LEN_HDATA(hcp->pagep, + hashp->hdr->pagesize, hcp->bndx); + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + if (flags == DB_LAST || flags == DB_PREV) { + hcp->dndx = 0; + hcp->dup_off = 0; + do { + memcpy(&len, hk->data + hcp->dup_off, + sizeof(db_indx_t)); + hcp->dup_off += DUP_SIZE(len); + hcp->dndx++; + } while (hcp->dup_off < hcp->dup_tlen); + hcp->dup_off -= DUP_SIZE(len); + hcp->dndx--; + } else { + memcpy(&len, hk->data, sizeof(db_indx_t)); + hcp->dup_off = 0; + hcp->dndx = 0; + } + hcp->dup_len = len; + } else if (type == H_OFFDUP) { + F_SET(hcp, H_ISDUP); + memcpy(&pgno, + P_ENTRY(hcp->pagep, ndx) + SSZ(HOFFDUP, pgno), + sizeof(db_pgno_t)); + if (flags == DB_LAST || flags == DB_PREV) { + indx = (int)hcp->dndx; + if ((ret = __db_dend(hashp->dbp, + pgno, &hcp->dpagep)) != 0) + return (ret); + hcp->dpgno = PGNO(hcp->dpagep); + hcp->dndx = NUM_ENT(hcp->dpagep) - 1; + } else if ((ret = __ham_next_cpage(hashp, + hcp, pgno, 0, H_ISDUP)) != 0) + return (ret); + } + + + /* + * Now, everything is initialized, grab a duplicate if + * necessary. + */ + if (F_ISSET(hcp, H_ISDUP)) + if (hcp->dpgno != PGNO_INVALID) { + pp = hcp->dpagep; + ndx = hcp->dndx; + } else { + /* + * Copy the DBT in case we are retrieving into + * user memory and we need the parameters for + * it. + */ + memcpy(&tmp_val, val, sizeof(*val)); + F_SET(&tmp_val, DB_DBT_PARTIAL); + tmp_val.dlen = hcp->dup_len; + tmp_val.doff = hcp->dup_off + sizeof(db_indx_t); + myval = &tmp_val; + } + + + /* + * Finally, if we had a duplicate, pp, ndx, and myval should be + * set appropriately. 
+ */ + if ((ret = __db_ret(hashp->dbp, pp, ndx, myval, &hcp->big_data, + &hcp->big_datalen)) != 0) + return (ret); + + /* + * In case we sent a temporary off to db_ret, set the real + * return values. + */ + val->data = myval->data; + val->size = myval->size; + + return (0); +} + +static int +__ham_overwrite(hashp, hcp, nval) + HTAB *hashp; + HASH_CURSOR *hcp; + DBT *nval; +{ + DBT *myval, tmp_val; + HKEYDATA *hk; + + if (F_ISSET(hashp->dbp, DB_AM_DUP)) + return (__ham_add_dup(hashp, hcp, nval, DB_KEYLAST)); + else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { + /* Put/overwrite */ + memcpy(&tmp_val, nval, sizeof(*nval)); + F_SET(&tmp_val, DB_DBT_PARTIAL); + tmp_val.doff = 0; + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + if (hk->type == H_OFFPAGE) + memcpy(&tmp_val.dlen, + (u_int8_t *)hk + SSZ(HOFFPAGE, tlen), + sizeof(u_int32_t)); + else + tmp_val.dlen = LEN_HDATA(hcp->pagep, + hashp->hdr->pagesize,hcp->bndx); + myval = &tmp_val; + } else /* Regular partial put */ + myval = nval; + + return (__ham_replpair(hashp, hcp, myval, 0)); +} + +/* + * Given a key and a cursor, sets the cursor to the page/ndx on which + * the key resides. If the key is found, the cursor H_OK flag is set + * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set. + * If the key is not found, the H_OK flag is not set. If the sought + * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields + * are set indicating where an add might take place. If it is 0, + * non of the cursor pointer field are valid. + */ +static int +__ham_lookup(hashp, hcp, key, sought, mode) + HTAB *hashp; + HASH_CURSOR *hcp; + const DBT *key; + u_int32_t sought; + db_lockmode_t mode; +{ + HKEYDATA *hk; + db_pgno_t pgno; + u_int32_t tlen; + int match, ret, t_ret; + + /* + * Set up cursor so that we're looking for space to add an item + * as we cycle through the pages looking for the key. 
+ */ + if ((ret = __ham_item_reset(hashp, hcp)) != 0) + return (ret); + hcp->seek_size = sought; + + hcp->bucket = __ham_call_hash(hashp, (u_int8_t *)key->data, key->size); + while (1) { + if ((ret = __ham_item_next(hashp, hcp, mode)) != 0) + return (ret); + + if (F_ISSET(hcp, H_NOMORE)) + break; + + hk = H_PAIRKEY(hcp->pagep, hcp->bndx); + switch (hk->type) { + case H_OFFPAGE: + memcpy(&tlen, (u_int8_t *)hk + SSZ(HOFFPAGE, tlen), + sizeof(u_int32_t)); + if (tlen == key->size) { + memcpy(&pgno, + (u_int8_t *)hk + SSZ(HOFFPAGE, pgno), + sizeof(db_pgno_t)); + match = __db_moff(hashp->dbp, key, pgno); + if (match == 0) { + F_SET(hcp, H_OK); + return (0); + } + } + break; + case H_KEYDATA: + if (key->size == LEN_HKEY(hcp->pagep, + hashp->hdr->pagesize, hcp->bndx) && + memcmp(key->data, hk->data, key->size) == 0) { + F_SET(hcp, H_OK); + return (0); + } + break; + case H_DUPLICATE: + case H_OFFDUP: + /* + * These are errors because keys are never + * duplicated, only data items are. + */ + return (__db_pgfmt(hashp->dbp, PGNO(hcp->pagep))); + } + hashp->hash_collisions++; + } + + /* + * Item was not found, adjust cursor properly. + */ + + if (sought != 0) + return (ret); + + if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * Initialize a dbt using some possibly already allocated storage + * for items. + * PUBLIC: int __ham_init_dbt __P((DBT *, u_int32_t, void **, u_int32_t *)); + */ +int +__ham_init_dbt(dbt, size, bufp, sizep) + DBT *dbt; + u_int32_t size; + void **bufp; + u_int32_t *sizep; +{ + memset(dbt, 0, sizeof(*dbt)); + if (*sizep < size) { + if ((*bufp = (void *)(*bufp == NULL ? + malloc(size) : realloc(*bufp, size))) == NULL) { + *sizep = 0; + return (ENOMEM); + } + *sizep = size; + } + dbt->data = *bufp; + dbt->size = size; + return (0); +} + +/* + * Adjust the cursor after an insert or delete. The cursor passed is + * the one that was operated upon; we just need to check any of the + * others. 
+ * + * len indicates the length of the item added/deleted + * add indicates if the item indicated by the cursor has just been + * added (add == 1) or deleted (add == 0). + * dup indicates if the addition occurred into a duplicate set. + * + * PUBLIC: void __ham_c_update __P((HTAB *, + * PUBLIC: HASH_CURSOR *, db_pgno_t, u_int32_t, int, int)); + */ +void +__ham_c_update(hashp, hcp, chg_pgno, len, add, dup) + HTAB *hashp; + HASH_CURSOR *hcp; + db_pgno_t chg_pgno; + u_int32_t len; + int add; + int dup; +{ + DBC *cp; + HTAB *hp; + HASH_CURSOR *lcp; + int page_deleted; + + /* + * Regular adds are always at the end of a given page, + * so we never have to adjust anyone's cursor after + * a regular add. + */ + if (!dup && add) + return; + + page_deleted = chg_pgno != PGNO_INVALID && + ((!dup && chg_pgno != hcp->pgno) || + (dup && chg_pgno != hcp->dpgno)); + + hp = hcp->db_cursor->dbp->master->internal; + DB_THREAD_LOCK(hp->dbp); + + for (cp = TAILQ_FIRST(&hp->dbp->curs_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) { + if (cp->internal == hcp) + continue; + + lcp = (HASH_CURSOR *)cp->internal; + + if (!dup && lcp->pgno != chg_pgno) + continue; + + if (dup && F_ISSET(hcp, H_DELETED) && lcp->pgno != chg_pgno) + continue; + + if (dup && !F_ISSET(hcp, H_DELETED) && lcp->dpgno != chg_pgno) + continue; + + if (page_deleted) { + if (dup) { + lcp->dpgno = hcp->dpgno; + lcp->dndx = hcp->dndx; + } else { + lcp->pgno = hcp->pgno; + lcp->bndx = hcp->bndx; + lcp->bucket = hcp->bucket; + } + F_CLR(lcp, H_ISDUP); + continue; + } + + if (!dup && lcp->bndx > hcp->bndx) + lcp->bndx--; + else if (!dup && lcp->bndx == hcp->bndx) + F_SET(lcp, H_DELETED); + else if (dup && lcp->bndx == hcp->bndx) { + /* Assign dpgno in case there was page conversion. 
*/ + lcp->dpgno = hcp->dpgno; + if (add && lcp->dndx >= hcp->dndx ) + lcp->dndx++; + else if (!add && lcp->dndx > hcp->dndx) + lcp->dndx--; + else if (!add && lcp->dndx == hcp->dndx) + F_SET(lcp, H_DELETED); + + /* Now adjust on-page information. */ + if (lcp->dpgno == PGNO_INVALID) + if (add) { + lcp->dup_tlen += len; + if (lcp->dndx > hcp->dndx) + lcp->dup_off += len; + } else { + lcp->dup_tlen -= len; + if (lcp->dndx > hcp->dndx) + lcp->dup_off -= len; + } + } + } + DB_THREAD_UNLOCK(hp->dbp); +} + +/* + * __ham_hdup -- + * This function gets called when we create a duplicate handle for a + * threaded DB. It should create the private part of the DB structure. + * PUBLIC: int __ham_hdup __P((DB *, DB *)); + */ +int +__ham_hdup(orig, new) + DB *orig, *new; +{ + HTAB *hashp; + DBC *curs; + int ret; + + if ((hashp = (HTAB *)malloc(sizeof(HTAB))) == NULL) + return (ENOMEM); + + new->internal = hashp; + + hashp->dbp = new; + hashp->hlock = 0; + hashp->hdr = NULL; + hashp->hash = ((HTAB *)orig->internal)->hash; + if ((hashp->split_buf = (PAGE *)malloc(orig->pgsize)) == NULL) + return (ENOMEM); + hashp->local_errno = 0; + hashp->hash_accesses = 0; + hashp->hash_collisions = 0; + hashp->hash_expansions = 0; + hashp->hash_overflows = 0; + hashp->hash_bigpages = 0; + /* Initialize the cursor queue. */ + ret = __ham_c_init(new, NULL, &curs); + TAILQ_INSERT_TAIL(&new->curs_queue, curs, links); + return (ret); +} diff --git a/db2/hash/hash.src b/db2/hash/hash.src new file mode 100644 index 0000000000..04a98d3cb3 --- /dev/null +++ b/db2/hash/hash.src @@ -0,0 +1,211 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hash.src 10.1 (Sleepycat) 4/12/97 + */ + +#include "config.h" + +/* + * This is the source file used to create the logging functions for the + * hash package. 
Each access method (or set of routines wishing to register + * record types with the transaction system) should have a file like this. + * Each type of log record and its parameters is defined. The basic + * format of a record definition is: + * + * BEGIN <RECORD_TYPE> + * ARG|DBT|POINTER <variable name> <variable type> <printf format> + * ... + * END + * ARG the argument is a simple parameter of the type specified. + * DBT the argument is a DBT (db.h) containing a length and pointer. + * POINTER the argument is a pointer to the data type specified; the entire + * type should be logged. + * + * There is a set of shell scripts of the form xxx.sh that generate C + * code and/or header files to process these. (This is probably better done + * in a single PERL script, but for now, this works.) + * + * The DB recovery system requires the following three fields appear in + * every record, and will assign them to the per-record-type structures + * as well as making them the first parameters to the appropriate logging + * call. + * rectype: record-type, identifies the structure and log/read call + * txnid: transaction id, a DBT in this implementation + * prev: the last LSN for this transaction + */ + +/* + * Use the argument of PREFIX as the prefix for all record types, + * routines, id numbers, etc. + */ +PREFIX ham + +/* + * HASH-insdel: used for hash to insert/delete a pair of entries onto a master + * page. The pair might be regular key/data pairs or they might be the + * structures that refer to off page items, duplicates or offpage duplicates. 
+ * opcode - PUTPAIR/DELPAIR + big masks + * fileid - identifies the file referenced + * pgno - page within file + * ndx - index on the page of the item being added (item index) + * pagelsn - lsn on the page before the update + * key - the key being inserted + * data - the data being inserted + */ +BEGIN insdel +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +ARG ndx u_int32_t lu +POINTER pagelsn DB_LSN * lu +DBT key DBT s +DBT data DBT s +END + +/* + * Used to add and remove overflow pages. + * prev_pgno is the previous page that is going to get modified to + * point to this one. If this is the first page in a chain + * then prev_pgno should be PGNO_INVALID. + * new_pgno is the page being allocated. + * next_pgno is the page that follows this one. On allocation, + * this should be PGNO_INVALID. For deletes, it may exist. + * pagelsn is the old lsn on the page. + */ +BEGIN newpage +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG prev_pgno db_pgno_t lu +POINTER prevlsn DB_LSN * lu +ARG new_pgno db_pgno_t lu +POINTER pagelsn DB_LSN * lu +ARG next_pgno db_pgno_t lu +POINTER nextlsn DB_LSN * lu +END + +/* + * Splitting requires two types of log messages. The first + * logs the meta-data of the split. The second logs the + * data on the original page. To redo the split, we have + * to visit the new page (pages) and add the items back + * on the page if they are not yet there. + * For the meta-data split + * bucket: max_bucket in table before split + * ovflpoint: overflow point before split. + * spares: spares[ovflpoint] before split. + */ +BEGIN splitmeta +ARG fileid u_int32_t lu +ARG bucket u_int32_t lu +ARG ovflpoint u_int32_t lu +ARG spares u_int32_t lu +POINTER metalsn DB_LSN * lu +END + +BEGIN splitdata +ARG fileid u_int32_t lu +ARG opcode u_int32_t lu +ARG pgno db_pgno_t lu +DBT pageimage DBT s +POINTER pagelsn DB_LSN * lu +END + +/* + * HASH-replace: is used for hash to handle partial puts that only + * affect a single master page. 
+ * fileid - identifies the file referenced + * pgno - page within file + * ndx - index on the page of the item being modified (item index) + * pagelsn - lsn on the page before the update + * off - offset in the old item where the new item is going. + * olditem - DBT that describes the part of the item being replaced. + * newitem - DBT of the new item. + * makedup - this was a replacement that made an item a duplicate. + */ +BEGIN replace +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +ARG ndx u_int32_t lu +POINTER pagelsn DB_LSN * lu +ARG off int32_t ld +DBT olditem DBT s +DBT newitem DBT s +ARG makedup u_int32_t lu +END + +/* + * HASH-newpgno: is used to record getting/deleting a new page number. + * This doesn't require much data modification, just modifying the + * meta-data. + * pgno is the page being allocated/freed. + * free_pgno is the next_pgno on the free list. + * old_type was the type of a page being deallocated. + * old_pgno was the next page number before the deallocation. We use it + * to indicate whether we incremented the spares count or not + * during this allocation. + */ +BEGIN newpgno +ARG opcode u_int32_t lu +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +ARG free_pgno db_pgno_t lu +ARG old_type u_int32_t lu +ARG old_pgno db_pgno_t lu +ARG new_type u_int32_t lu +POINTER pagelsn DB_LSN * lu +POINTER metalsn DB_LSN * lu +END + +/* + * ovfl: initialize a set of overflow pages. + */ +BEGIN ovfl +ARG fileid u_int32_t lu +ARG start_pgno db_pgno_t lu +ARG npages u_int32_t lu +ARG free_pgno db_pgno_t lu +POINTER metalsn DB_LSN * lu +END diff --git a/db2/hash/hash_auto.c b/db2/hash/hash_auto.c new file mode 100644 index 0000000000..f8ab80c8ee --- /dev/null +++ b/db2/hash/hash_auto.c @@ -0,0 +1,1343 @@ +/* Do not edit: automatically built by dist/db_gen.sh. 
*/ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "hash.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __ham_insdel_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, u_int32_t, + * PUBLIC: DB_LSN *, DBT *, DBT *)); + */ +int __ham_insdel_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, pgno, ndx, pagelsn, key, + data) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + u_int32_t ndx; + DB_LSN * pagelsn; + DBT *key; + DBT *data; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_insdel; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(ndx) + + sizeof(*pagelsn) + + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ndx, sizeof(ndx)); + bp += sizeof(ndx); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + if (key == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &key->size, sizeof(key->size)); + bp += sizeof(key->size); + memcpy(bp, key->data, key->size); + bp += key->size; + } + if (data == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &data->size, sizeof(data->size)); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + bp += data->size; + } +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_insdel_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_insdel_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_insdel_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __ham_insdel_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_insdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + 
(u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tndx: %lu\n", (u_long)argp->ndx); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tkey: "); + for (i = 0; i < argp->key.size; i++) { + c = ((char *)argp->key.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + c = ((char *)argp->data.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_insdel_read __P((void *, __ham_insdel_args **)); + */ +int +__ham_insdel_read(recbuf, argpp) + void *recbuf; + __ham_insdel_args **argpp; +{ + __ham_insdel_args *argp; + u_int8_t *bp; + + argp = (__ham_insdel_args *)malloc(sizeof(__ham_insdel_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ndx, bp, sizeof(argp->ndx)); + bp += sizeof(argp->ndx); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->key.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->key.data 
= bp; + bp += argp->key.size; + memcpy(&argp->data.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_newpage_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, DB_LSN *, + * PUBLIC: db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *)); + */ +int __ham_newpage_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, prev_pgno, prevlsn, new_pgno, pagelsn, + next_pgno, nextlsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t prev_pgno; + DB_LSN * prevlsn; + db_pgno_t new_pgno; + DB_LSN * pagelsn; + db_pgno_t next_pgno; + DB_LSN * nextlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_newpage; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(prev_pgno) + + sizeof(*prevlsn) + + sizeof(new_pgno) + + sizeof(*pagelsn) + + sizeof(next_pgno) + + sizeof(*nextlsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &prev_pgno, sizeof(prev_pgno)); + bp += sizeof(prev_pgno); + if (prevlsn != NULL) + memcpy(bp, prevlsn, sizeof(*prevlsn)); + else + memset(bp, 0, sizeof(*prevlsn)); + bp += sizeof(*prevlsn); + memcpy(bp, &new_pgno, sizeof(new_pgno)); + bp += 
sizeof(new_pgno); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + memcpy(bp, &next_pgno, sizeof(next_pgno)); + bp += sizeof(next_pgno); + if (nextlsn != NULL) + memcpy(bp, nextlsn, sizeof(*nextlsn)); + else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_newpage_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_newpage_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_newpage_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __ham_newpage_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_newpage: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno); + printf("\tprevlsn: [%lu][%lu]\n", + (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); + printf("\tnew_pgno: %lu\n", (u_long)argp->new_pgno); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno); + printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_newpage_read 
__P((void *, __ham_newpage_args **)); + */ +int +__ham_newpage_read(recbuf, argpp) + void *recbuf; + __ham_newpage_args **argpp; +{ + __ham_newpage_args *argp; + u_int8_t *bp; + + argp = (__ham_newpage_args *)malloc(sizeof(__ham_newpage_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->prev_pgno, bp, sizeof(argp->prev_pgno)); + bp += sizeof(argp->prev_pgno); + memcpy(&argp->prevlsn, bp, sizeof(argp->prevlsn)); + bp += sizeof(argp->prevlsn); + memcpy(&argp->new_pgno, bp, sizeof(argp->new_pgno)); + bp += sizeof(argp->new_pgno); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->next_pgno, bp, sizeof(argp->next_pgno)); + bp += sizeof(argp->next_pgno); + memcpy(&argp->nextlsn, bp, sizeof(argp->nextlsn)); + bp += sizeof(argp->nextlsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_splitmeta_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t, + * PUBLIC: DB_LSN *)); + */ +int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags, + fileid, bucket, ovflpoint, spares, metalsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + u_int32_t bucket; + u_int32_t ovflpoint; + u_int32_t spares; + DB_LSN * metalsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_splitmeta; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(bucket) + + sizeof(ovflpoint) + + sizeof(spares) + + sizeof(*metalsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &bucket, sizeof(bucket)); + bp += sizeof(bucket); + memcpy(bp, &ovflpoint, sizeof(ovflpoint)); + bp += sizeof(ovflpoint); + memcpy(bp, &spares, sizeof(spares)); + bp += sizeof(spares); + if (metalsn != NULL) + memcpy(bp, metalsn, sizeof(*metalsn)); + else + memset(bp, 0, sizeof(*metalsn)); + bp += sizeof(*metalsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_splitmeta_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_splitmeta_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_splitmeta_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __ham_splitmeta_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_splitmeta: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + 
printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tbucket: %lu\n", (u_long)argp->bucket); + printf("\tovflpoint: %lu\n", (u_long)argp->ovflpoint); + printf("\tspares: %lu\n", (u_long)argp->spares); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_splitmeta_read __P((void *, __ham_splitmeta_args **)); + */ +int +__ham_splitmeta_read(recbuf, argpp) + void *recbuf; + __ham_splitmeta_args **argpp; +{ + __ham_splitmeta_args *argp; + u_int8_t *bp; + + argp = (__ham_splitmeta_args *)malloc(sizeof(__ham_splitmeta_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->bucket, bp, sizeof(argp->bucket)); + bp += sizeof(argp->bucket); + memcpy(&argp->ovflpoint, bp, sizeof(argp->ovflpoint)); + bp += sizeof(argp->ovflpoint); + memcpy(&argp->spares, bp, sizeof(argp->spares)); + bp += sizeof(argp->spares); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_splitdata_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, DBT *, + * PUBLIC: DB_LSN *)); + */ +int __ham_splitdata_log(logp, txnid, ret_lsnp, flags, + fileid, opcode, pgno, pageimage, pagelsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + u_int32_t opcode; + db_pgno_t pgno; + DBT *pageimage; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + 
int ret; + u_int8_t *bp; + + rectype = DB_ham_splitdata; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(opcode) + + sizeof(pgno) + + sizeof(u_int32_t) + (pageimage == NULL ? 0 : pageimage->size) + + sizeof(*pagelsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pageimage == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pageimage->size, sizeof(pageimage->size)); + bp += sizeof(pageimage->size); + memcpy(bp, pageimage->data, pageimage->size); + bp += pageimage->size; + } + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_splitdata_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_splitdata_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_splitdata_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + 
notused4 = NULL; + + if((ret = __ham_splitdata_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_splitdata: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpageimage: "); + for (i = 0; i < argp->pageimage.size; i++) { + c = ((char *)argp->pageimage.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_splitdata_read __P((void *, __ham_splitdata_args **)); + */ +int +__ham_splitdata_read(recbuf, argpp) + void *recbuf; + __ham_splitdata_args **argpp; +{ + __ham_splitdata_args *argp; + u_int8_t *bp; + + argp = (__ham_splitdata_args *)malloc(sizeof(__ham_splitdata_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->pageimage.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pageimage.data = bp; + bp += argp->pageimage.size; + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + *argpp = argp; + return (0); +} + +/* + * 
PUBLIC: int __ham_replace_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, DB_LSN *, + * PUBLIC: int32_t, DBT *, DBT *, u_int32_t)); + */ +int __ham_replace_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, ndx, pagelsn, off, olditem, + newitem, makedup) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + u_int32_t ndx; + DB_LSN * pagelsn; + int32_t off; + DBT *olditem; + DBT *newitem; + u_int32_t makedup; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_replace; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(ndx) + + sizeof(*pagelsn) + + sizeof(off) + + sizeof(u_int32_t) + (olditem == NULL ? 0 : olditem->size) + + sizeof(u_int32_t) + (newitem == NULL ? 
0 : newitem->size) + + sizeof(makedup); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ndx, sizeof(ndx)); + bp += sizeof(ndx); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + memcpy(bp, &off, sizeof(off)); + bp += sizeof(off); + if (olditem == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &olditem->size, sizeof(olditem->size)); + bp += sizeof(olditem->size); + memcpy(bp, olditem->data, olditem->size); + bp += olditem->size; + } + if (newitem == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &newitem->size, sizeof(newitem->size)); + bp += sizeof(newitem->size); + memcpy(bp, newitem->data, newitem->size); + bp += newitem->size; + } + memcpy(bp, &makedup, sizeof(makedup)); + bp += sizeof(makedup); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_replace_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_replace_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_replace_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = 
__ham_replace_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_replace: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tndx: %lu\n", (u_long)argp->ndx); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\toff: %ld\n", (long)argp->off); + printf("\tolditem: "); + for (i = 0; i < argp->olditem.size; i++) { + c = ((char *)argp->olditem.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tnewitem: "); + for (i = 0; i < argp->newitem.size; i++) { + c = ((char *)argp->newitem.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tmakedup: %lu\n", (u_long)argp->makedup); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_replace_read __P((void *, __ham_replace_args **)); + */ +int +__ham_replace_read(recbuf, argpp) + void *recbuf; + __ham_replace_args **argpp; +{ + __ham_replace_args *argp; + u_int8_t *bp; + + argp = (__ham_replace_args *)malloc(sizeof(__ham_replace_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ndx, bp, sizeof(argp->ndx)); + bp += sizeof(argp->ndx); + memcpy(&argp->pagelsn, bp, 
sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->off, bp, sizeof(argp->off)); + bp += sizeof(argp->off); + memcpy(&argp->olditem.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->olditem.data = bp; + bp += argp->olditem.size; + memcpy(&argp->newitem.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->newitem.data = bp; + bp += argp->newitem.size; + memcpy(&argp->makedup, bp, sizeof(argp->makedup)); + bp += sizeof(argp->makedup); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_newpgno_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, DB_LSN *, + * PUBLIC: DB_LSN *)); + */ +int __ham_newpgno_log(logp, txnid, ret_lsnp, flags, + opcode, fileid, pgno, free_pgno, old_type, old_pgno, + new_type, pagelsn, metalsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + db_pgno_t free_pgno; + u_int32_t old_type; + db_pgno_t old_pgno; + u_int32_t new_type; + DB_LSN * pagelsn; + DB_LSN * metalsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_newpgno; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(free_pgno) + + sizeof(old_type) + + sizeof(old_pgno) + + sizeof(new_type) + + sizeof(*pagelsn) + + sizeof(*metalsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &free_pgno, sizeof(free_pgno)); + bp += sizeof(free_pgno); + memcpy(bp, &old_type, sizeof(old_type)); + bp += sizeof(old_type); + memcpy(bp, &old_pgno, sizeof(old_pgno)); + bp += sizeof(old_pgno); + memcpy(bp, &new_type, sizeof(new_type)); + bp += sizeof(new_type); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + if (metalsn != NULL) + memcpy(bp, metalsn, sizeof(*metalsn)); + else + memset(bp, 0, sizeof(*metalsn)); + bp += sizeof(*metalsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_newpgno_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_newpgno_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_newpgno_args *argp; + u_int32_t i; + 
int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __ham_newpgno_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_newpgno: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tfree_pgno: %lu\n", (u_long)argp->free_pgno); + printf("\told_type: %lu\n", (u_long)argp->old_type); + printf("\told_pgno: %lu\n", (u_long)argp->old_pgno); + printf("\tnew_type: %lu\n", (u_long)argp->new_type); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_newpgno_read __P((void *, __ham_newpgno_args **)); + */ +int +__ham_newpgno_read(recbuf, argpp) + void *recbuf; + __ham_newpgno_args **argpp; +{ + __ham_newpgno_args *argp; + u_int8_t *bp; + + argp = (__ham_newpgno_args *)malloc(sizeof(__ham_newpgno_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->free_pgno, bp, sizeof(argp->free_pgno)); + bp += sizeof(argp->free_pgno); + 
memcpy(&argp->old_type, bp, sizeof(argp->old_type)); + bp += sizeof(argp->old_type); + memcpy(&argp->old_pgno, bp, sizeof(argp->old_pgno)); + bp += sizeof(argp->old_pgno); + memcpy(&argp->new_type, bp, sizeof(argp->new_type)); + bp += sizeof(argp->new_type); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_ovfl_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, + * PUBLIC: DB_LSN *)); + */ +int __ham_ovfl_log(logp, txnid, ret_lsnp, flags, + fileid, start_pgno, npages, free_pgno, metalsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t start_pgno; + u_int32_t npages; + db_pgno_t free_pgno; + DB_LSN * metalsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_ovfl; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(start_pgno) + + sizeof(npages) + + sizeof(free_pgno) + + sizeof(*metalsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &start_pgno, sizeof(start_pgno)); + bp += sizeof(start_pgno); + memcpy(bp, &npages, sizeof(npages)); + bp += sizeof(npages); + memcpy(bp, &free_pgno, sizeof(free_pgno)); + bp += sizeof(free_pgno); + if (metalsn != NULL) + memcpy(bp, metalsn, sizeof(*metalsn)); + else + memset(bp, 0, sizeof(*metalsn)); + bp += sizeof(*metalsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __ham_ovfl_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__ham_ovfl_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __ham_ovfl_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __ham_ovfl_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_ovfl: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + 
printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tstart_pgno: %lu\n", (u_long)argp->start_pgno); + printf("\tnpages: %lu\n", (u_long)argp->npages); + printf("\tfree_pgno: %lu\n", (u_long)argp->free_pgno); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __ham_ovfl_read __P((void *, __ham_ovfl_args **)); + */ +int +__ham_ovfl_read(recbuf, argpp) + void *recbuf; + __ham_ovfl_args **argpp; +{ + __ham_ovfl_args *argp; + u_int8_t *bp; + + argp = (__ham_ovfl_args *)malloc(sizeof(__ham_ovfl_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->start_pgno, bp, sizeof(argp->start_pgno)); + bp += sizeof(argp->start_pgno); + memcpy(&argp->npages, bp, sizeof(argp->npages)); + bp += sizeof(argp->npages); + memcpy(&argp->free_pgno, bp, sizeof(argp->free_pgno)); + bp += sizeof(argp->free_pgno); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __ham_init_print __P((DB_ENV *)); + */ +int +__ham_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __ham_insdel_print, DB_ham_insdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpage_print, DB_ham_newpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitmeta_print, DB_ham_splitmeta)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitdata_print, DB_ham_splitdata)) != 0) + return (ret); + if ((ret = 
__db_add_recovery(dbenv, + __ham_replace_print, DB_ham_replace)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpgno_print, DB_ham_newpgno)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_ovfl_print, DB_ham_ovfl)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __ham_init_recover __P((DB_ENV *)); + */ +int +__ham_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __ham_insdel_recover, DB_ham_insdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpage_recover, DB_ham_newpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitmeta_recover, DB_ham_splitmeta)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitdata_recover, DB_ham_splitdata)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_replace_recover, DB_ham_replace)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpgno_recover, DB_ham_newpgno)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_ovfl_recover, DB_ham_ovfl)) != 0) + return (ret); + return (0); +} + diff --git a/db2/hash/hash_conv.c b/db2/hash/hash_conv.c new file mode 100644 index 0000000000..22901af950 --- /dev/null +++ b/db2/hash/hash_conv.c @@ -0,0 +1,101 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_conv.c 10.3 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" + +/* + * __h_pgin, __ham_pgout -- + * Convert host-specific page layout to/from the host-independent + * format stored on disk. 
+ * + * PUBLIC: int __ham_pgin __P((db_pgno_t, void *, DBT *)); + * PUBLIC: int __ham_pgout __P((db_pgno_t, void *, DBT *)); + */ +int +__ham_pgin(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + u_int32_t tpgno; + + pginfo = (DB_PGINFO *)cookie->data; + tpgno = PGNO((PAGE *)pp); + if (pginfo->needswap) + M_32_SWAP(tpgno); + + if (pg != PGNO_METADATA && pg != tpgno) { + P_INIT(pp, pginfo->db_pagesize, + pg, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + return (0); + } + + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? __ham_mswap(pp) : __db_pgin(pg, pp)); +} + +int +__ham_pgout(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? __ham_mswap(pp) : __db_pgout(pg, pp)); +} + +/* + * __ham_mswap -- + * Swap the bytes on the hash metadata page. + * + * PUBLIC: int __ham_mswap __P((void *)); + */ +int +__ham_mswap(pg) + void *pg; +{ + u_int8_t *p; + int i; + + p = (u_int8_t *)pg; + SWAP32(p); /* lsn part 1 */ + SWAP32(p); /* lsn part 2 */ + SWAP32(p); /* pgno */ + SWAP32(p); /* magic */ + SWAP32(p); /* version */ + SWAP32(p); /* pagesize */ + SWAP32(p); /* ovfl_point */ + SWAP32(p); /* last_freed */ + SWAP32(p); /* max_bucket */ + SWAP32(p); /* high_mask */ + SWAP32(p); /* low_mask */ + SWAP32(p); /* ffactor */ + SWAP32(p); /* nelem */ + SWAP32(p); /* h_charkey */ + SWAP32(p); /* flags */ + for (i = 0; i < NCACHED; ++i) + SWAP32(p); /* spares */ + return (0); +} diff --git a/db2/hash/hash_debug.c b/db2/hash/hash_debug.c new file mode 100644 index 0000000000..979ddd7b87 --- /dev/null +++ b/db2/hash/hash_debug.c @@ -0,0 +1,96 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995 + * The President and Fellows of Harvard University. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Jeremy Rassen. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_debug.c 10.2 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifdef DEBUG +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Debug routines. + * + * ROUTINES: + * + * External + * __dump_bucket + */ +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" + +/* + * __ham_dump_bucket -- + * + * PUBLIC: #ifdef DEBUG + * PUBLIC: void __ham_dump_bucket __P((HTAB *, u_int32_t)); + * PUBLIC: #endif + */ +void +__ham_dump_bucket(hashp, bucket) + HTAB *hashp; + u_int32_t bucket; +{ + PAGE *p; + db_pgno_t pgno; + int ret; + + for (pgno = BUCKET_TO_PAGE(hashp, bucket); pgno != PGNO_INVALID;) { + if ((ret = memp_fget(hashp->dbp->mpf, &pgno, 0, &p)) != 0) + break; + (void)__db_prpage(p, 1); + pgno = p->next_pgno; + (void)memp_fput(hashp->dbp->mpf, p, 0); + } +} +#endif /* DEBUG */ diff --git a/db2/hash/hash_dup.c b/db2/hash/hash_dup.c new file mode 100644 index 0000000000..059eec6f92 --- /dev/null +++ b/db2/hash/hash_dup.c @@ -0,0 +1,544 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_dup.c 10.5 (Sleepycat) 7/27/97"; +#endif /* not lint */ + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Manipulation of duplicates for the hash package. 
+ * + * ROUTINES: + * + * External + * __add_dup + * Internal + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" + +static int __ham_check_move __P((HTAB *, HASH_CURSOR *, int32_t)); +static int __ham_dup_convert __P((HTAB *, HASH_CURSOR *)); +static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *)); + +/* + * Called from hash_access to add a duplicate key. nval is the new + * value that we want to add. The flags correspond to the flag values + * to cursor_put indicating where to add the new element. + * There are 4 cases. + * Case 1: The existing duplicate set already resides on a separate page. + * We can use common code for this. + * Case 2: The element is small enough to just be added to the existing set. + * Case 3: The element is large enough to be a big item, so we're going to + * have to push the set onto a new page. + * Case 4: The element is large enough to push the duplicate set onto a + * separate page. + * + * PUBLIC: int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, int)); + */ +int +__ham_add_dup(hashp, hcp, nval, flags) + HTAB *hashp; + HASH_CURSOR *hcp; + DBT *nval; + int flags; +{ + DBT pval, tmp_val; + HKEYDATA *hk; + u_int32_t del_len, new_size; + int ret; + + if (flags == DB_CURRENT && hcp->dpgno == PGNO_INVALID) + del_len = hcp->dup_len; + else + del_len = 0; + + if ((ret = __ham_check_move(hashp, hcp, + (int32_t)DUP_SIZE(nval->size) - (int32_t)del_len)) != 0) + return (ret); + + /* + * Check if resulting duplicate set is going to need to go + * onto a separate duplicate page. If so, convert the + * duplicate set and add the new one. After conversion, + * hcp->dndx is the first free ndx or the index of the + * current pointer into the duplicate set. 
+ */ + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + new_size = DUP_SIZE(nval->size) - del_len + LEN_HKEYDATA(hcp->pagep, + hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); + + /* + * We convert to off-page duplicates if the item is a big item, + * the addition of the new item will make the set large, or + * if there isn't enough room on this page to add the next item. + */ + if (hk->type != H_OFFDUP && + (hk->type == H_OFFPAGE || ISBIG(hashp, new_size) || + DUP_SIZE(nval->size) - del_len > P_FREESPACE(hcp->pagep))) { + + if ((ret = __ham_dup_convert(hashp, hcp)) != 0) + return (ret); + else + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + } + + /* There are two separate cases here: on page and off page. */ + if (hk->type != H_OFFDUP) { + if (hk->type != H_DUPLICATE) { + hk->type = H_DUPLICATE; + pval.flags = 0; + pval.data = hk->data; + pval.size = LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, + hcp->bndx); + if ((ret = __ham_make_dup(&pval, &tmp_val, &hcp->big_data, + &hcp->big_datalen)) != 0 || + (ret = __ham_replpair(hashp, hcp, &tmp_val, 1)) != 0) + return (ret); + } + + /* Now make the new entry a duplicate. */ + if ((ret = __ham_make_dup(nval, + &tmp_val, &hcp->big_data, &hcp->big_datalen)) != 0) + return (ret); + + tmp_val.dlen = 0; + switch (flags) { /* On page. */ + case DB_KEYFIRST: + tmp_val.doff = 0; + break; + case DB_KEYLAST: + tmp_val.doff = LEN_HDATA(hcp->pagep, + hashp->hdr->pagesize, hcp->bndx); + break; + case DB_CURRENT: + tmp_val.doff = hcp->dup_off; + tmp_val.dlen = DUP_SIZE(hcp->dup_len); + break; + case DB_BEFORE: + tmp_val.doff = hcp->dup_off; + break; + case DB_AFTER: + tmp_val.doff = hcp->dup_off + DUP_SIZE(hcp->dup_len); + break; + } + /* Add the duplicate. */ + ret = __ham_replpair(hashp, hcp, &tmp_val, 0); + if (ret == 0) + ret = __ham_dirty_page(hashp, hcp->pagep); + __ham_c_update(hashp, hcp, hcp->pgno, tmp_val.size, 1, 1); + return (ret); + } + + /* If we get here, then we're on duplicate pages. 
*/ + if (hcp->dpgno == PGNO_INVALID) { + memcpy(&hcp->dpgno, + (u_int8_t *)hk + SSZ(HOFFDUP, pgno), sizeof(db_pgno_t)); + hcp->dndx = 0; + } + + switch (flags) { + case DB_KEYFIRST: + /* + * The only way that we are already on a dup page is + * if we just converted the on-page representation. + * In that case, we've only got one page of duplicates. + */ + if (hcp->dpagep == NULL && (ret = + __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) + return (ret); + hcp->dndx = 0; + break; + case DB_KEYLAST: + if (hcp->dpagep == NULL && (ret = + __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) + return (ret); + hcp->dpgno = PGNO(hcp->dpagep); + hcp->dndx = NUM_ENT(hcp->dpagep); + break; + case DB_CURRENT: + if ((ret = __db_ditem(hashp->dbp, hcp->dpagep, hcp->dndx, + BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep, hcp->dndx)->len))) + != 0) + return (ret); + break; + case DB_BEFORE: /* The default behavior is correct. */ + break; + case DB_AFTER: + hcp->dndx++; + break; + } + + ret = __db_dput(hashp->dbp, + nval, &hcp->dpagep, &hcp->dndx, __ham_overflow_page); + hcp->pgno = PGNO(hcp->pagep); + __ham_c_update(hashp, hcp, hcp->pgno, nval->size, 1, 1); + return (ret); +} + +/* + * Convert an on-page set of duplicates to an offpage set of duplicates. + */ +static int +__ham_dup_convert(hashp, hcp) + HTAB *hashp; + HASH_CURSOR *hcp; +{ + BOVERFLOW bo; + DBT dbt; + HOFFPAGE ho; + db_indx_t dndx, len; + int ret; + u_int8_t *p, *pend; + + /* + * Create a new page for the duplicates. + */ + if ((ret = + __ham_overflow_page(hashp->dbp, P_DUPLICATE, &hcp->dpagep)) != 0) + return (ret); + hcp->dpagep->type = P_DUPLICATE; + hcp->dpgno = PGNO(hcp->dpagep); + + /* + * Now put the duplicates onto the new page. + */ + dbt.flags = 0; + switch (((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->type) { + case H_KEYDATA: + /* Simple case, one key on page; move it to dup page. 
*/ + dndx = 0; + dbt.size = + LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx); + dbt.data = + ((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->data; + ret = __db_pitem(hashp->dbp, hcp->dpagep, + (u_int32_t)dndx, BKEYDATA_SIZE(dbt.size), NULL, &dbt); + if (ret == 0) + __ham_dirty_page(hashp, hcp->dpagep); + break; + case H_OFFPAGE: + /* Simple case, one key on page; move it to dup page. */ + dndx = 0; + memcpy(&ho, + P_ENTRY(hcp->pagep, H_DATAINDEX(hcp->bndx)), HOFFPAGE_SIZE); + bo.deleted = 0; + bo.type = ho.type; + bo.pgno = ho.pgno; + bo.tlen = ho.tlen; + dbt.size = BOVERFLOW_SIZE; + dbt.data = &bo; + + ret = __db_pitem(hashp->dbp, hcp->dpagep, + (u_int32_t)dndx, dbt.size, &dbt, NULL); + if (ret == 0) + __ham_dirty_page(hashp, hcp->dpagep); + break; + case H_DUPLICATE: + p = ((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->data; + pend = p + + LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx); + + for (dndx = 0; p < pend; dndx++) { + memcpy(&len, p, sizeof(db_indx_t)); + dbt.size = len; + p += sizeof(db_indx_t); + dbt.data = p; + p += len + sizeof(db_indx_t); + ret = __db_dput(hashp->dbp, &dbt, + &hcp->dpagep, &dndx, __ham_overflow_page); + if (ret != 0) + break; + } + break; + default: + ret = __db_pgfmt(hashp->dbp, (u_long)hcp->pgno); + } + if (ret == 0) { + /* + * Now attach this to the source page in place of + * the old duplicate item. + */ + __ham_move_offpage(hashp, hcp->pagep, + (u_int32_t)H_DATAINDEX(hcp->bndx), hcp->dpgno); + + /* Can probably just do a "put" here. 
*/ + ret = __ham_dirty_page(hashp, hcp->pagep); + } else { + (void)__ham_del_page(hashp->dbp, hcp->dpagep); + hcp->dpagep = NULL; + } + return (ret); +} + +static int +__ham_make_dup(notdup, dup, bufp, sizep) + const DBT *notdup; + DBT *dup; + void **bufp; + u_int32_t *sizep; +{ + db_indx_t tsize, item_size; + int ret; + u_int8_t *p; + + item_size = (db_indx_t)notdup->size; + tsize = DUP_SIZE(item_size); + if ((ret = __ham_init_dbt(dup, tsize, bufp, sizep)) != 0) + return (ret); + + dup->dlen = 0; + dup->flags = notdup->flags; + F_SET(dup, DB_DBT_PARTIAL); + + p = dup->data; + memcpy(p, &item_size, sizeof(db_indx_t)); + p += sizeof(db_indx_t); + memcpy(p, notdup->data, notdup->size); + p += notdup->size; + memcpy(p, &item_size, sizeof(db_indx_t)); + + dup->doff = 0; + dup->dlen = notdup->size; + + return (0); +} + +static int +__ham_check_move(hashp, hcp, add_len) + HTAB *hashp; + HASH_CURSOR *hcp; + int32_t add_len; +{ + DBT k, d; + DB_LSN new_lsn; + HKEYDATA *hk; + PAGE *next_pagep; + db_pgno_t next_pgno; + int rectype, ret; + u_int32_t new_datalen, old_len; + + /* + * Check if we can do whatever we need to on this page. If not, + * then we'll have to move the current element to a new page. + */ + + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + + /* + * If the item is already off page duplicates or an offpage item, + * then we know we can do whatever we need to do in-place + */ + if (hk->type == H_OFFDUP || hk->type == H_OFFPAGE) + return (0); + + old_len = + LEN_HITEM(hcp->pagep, hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); + new_datalen = old_len - HKEYDATA_SIZE(0) + add_len; + + /* + * We need to add a new page under two conditions: + * 1. The addition makes the total data length cross the BIG + * threshold and the OFFDUP structure won't fit on this page. + * 2. The addition does not make the total data cross the + * threshold, but the new data won't fit on the page. + * If neither of these is true, then we can return. 
+ */ + if (ISBIG(hashp, new_datalen) && (old_len > HOFFDUP_SIZE || + HOFFDUP_SIZE - old_len <= P_FREESPACE(hcp->pagep))) + return (0); + + if (!ISBIG(hashp, new_datalen) && + add_len <= (int32_t)P_FREESPACE(hcp->pagep)) + return (0); + + /* + * If we get here, then we need to move the item to a new page. + * Check if there are more pages in the chain. + */ + + new_datalen = ISBIG(hashp, new_datalen) ? + HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen); + + next_pagep = NULL; + for (next_pgno = NEXT_PGNO(hcp->pagep); next_pgno != PGNO_INVALID; + next_pgno = NEXT_PGNO(next_pagep)) { + if (next_pagep != NULL && + (ret = __ham_put_page(hashp->dbp, next_pagep, 0)) != 0) + return (ret); + + if ((ret = __ham_get_page(hashp->dbp, next_pgno, &next_pagep)) != 0) + return (ret); + + if (P_FREESPACE(next_pagep) >= new_datalen) + break; + } + + /* No more pages, add one. */ + if (next_pagep == NULL && + (ret = __ham_add_ovflpage(hashp, hcp->pagep, 0, &next_pagep)) != 0) + return (ret); + + /* Add new page at the end of the chain. */ + if (P_FREESPACE(next_pagep) < new_datalen && + (ret = __ham_add_ovflpage(hashp, next_pagep, 1, &next_pagep)) != 0) + return (ret); + + /* Copy the item to the new page. 
*/ + if (DB_LOGGING(hashp->dbp)) { + rectype = PUTPAIR; + k.flags = 0; + d.flags = 0; + if (H_PAIRKEY(hcp->pagep, hcp->bndx)->type == H_OFFPAGE) { + rectype |= PAIR_KEYMASK; + k.data = H_PAIRKEY(hcp->pagep, hcp->bndx); + k.size = HOFFPAGE_SIZE; + } else { + k.data = H_PAIRKEY(hcp->pagep, hcp->bndx)->data; + k.size = LEN_HKEY(hcp->pagep, + hashp->hdr->pagesize, hcp->bndx); + } + + if (hk->type == H_OFFPAGE) { + rectype |= PAIR_DATAMASK; + d.data = H_PAIRDATA(hcp->pagep, hcp->bndx); + d.size = HOFFPAGE_SIZE; + } else { + d.data = H_PAIRDATA(hcp->pagep, hcp->bndx)->data; + d.size = LEN_HDATA(hcp->pagep, + hashp->hdr->pagesize, hcp->bndx); + } + + + if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype, + hashp->dbp->log_fileid, PGNO(next_pagep), + (u_int32_t)H_NUMPAIRS(next_pagep), &LSN(next_pagep), + &k, &d)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(next_pagep) = new_lsn; /* Structure assignment. */ + } + + __ham_copy_item(hashp, hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep); + __ham_copy_item(hashp, hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep); + + /* Now delete the pair from the current page. */ + ret = __ham_del_pair(hashp, hcp); + + (void)__ham_put_page(hashp->dbp, hcp->pagep, 1); + hcp->pagep = next_pagep; + hcp->pgno = PGNO(hcp->pagep); + hcp->bndx = H_NUMPAIRS(hcp->pagep) - 1; + F_SET(hcp, H_EXPAND); + return (ret); +} + +/* + * Replace an onpage set of duplicates with the OFFDUP structure that + * references the duplicate page. + * XXX This is really just a special case of __onpage_replace; we should + * probably combine them. 
+ * PUBLIC: void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t)); + */ +void +__ham_move_offpage(hashp, pagep, ndx, pgno) + HTAB *hashp; + PAGE *pagep; + u_int32_t ndx; + db_pgno_t pgno; +{ + DBT new_dbt; + DBT old_dbt; + HOFFDUP od; + db_indx_t i; + int32_t shrink; + u_int8_t *src; + + od.type = H_OFFDUP; + od.pgno = pgno; + + if (DB_LOGGING(hashp->dbp)) { + new_dbt.data = &od; + new_dbt.size = HOFFDUP_SIZE; + old_dbt.data = P_ENTRY(pagep, ndx); + old_dbt.size = LEN_HITEM(pagep, hashp->hdr->pagesize, ndx); + (void)__ham_replace_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &LSN(pagep), 0, + hashp->dbp->log_fileid, PGNO(pagep), (u_int32_t)ndx, + &LSN(pagep), -1, &old_dbt, &new_dbt, 0); + } + + shrink = + LEN_HITEM(pagep, hashp->hdr->pagesize, ndx) - HOFFDUP_SIZE; + + if (shrink != 0) { + /* Copy data. */ + src = (u_int8_t *)(pagep) + HOFFSET(pagep); + memmove(src + shrink, src, pagep->inp[ndx] - HOFFSET(pagep)); + HOFFSET(pagep) += shrink; + + /* Update index table. */ + for (i = ndx; i < NUM_ENT(pagep); i++) + pagep->inp[i] += shrink; + } + + /* Now copy the offdup entry onto the page. */ + memcpy(P_ENTRY(pagep, ndx), &od, HOFFDUP_SIZE); +} diff --git a/db2/hash/hash_func.c b/db2/hash/hash_func.c new file mode 100644 index 0000000000..2ef47afb57 --- /dev/null +++ b/db2/hash/hash_func.c @@ -0,0 +1,219 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_func.c 10.6 (Sleepycat) 7/26/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" + +/* + * __ham_func2 -- + * Phong Vo's linear congruential hash. 
/*
 * PUBLIC: u_int32_t __ham_func2 __P((const void *, u_int32_t));
 *
 * Hashes the len bytes at key with the recurrence
 *	h = 0x63c63cd9 * h + 0x9c39c33d + byte	(mod 2^32)
 * starting from h = 0.  Every byte, including zero bytes, is mixed in;
 * an empty key hashes to 0.
 */
#define	dcharhash(h, c)	((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c))

u_int32_t
__ham_func2(key, len)
	const void *key;
	u_int32_t len;
{
	const u_int8_t *e, *k;
	u_int32_t h;

	k = key;
	e = k + len;
	/*
	 * The loop stops exactly at e, so no separate end-of-buffer test
	 * is needed inside the body.
	 */
	for (h = 0; k != e; ++k)
		dcharhash(h, *k);
	return (h);
}

/*
 * __ham_func3 --
 *	Ozan Yigit's original sdbm hash.
 *
 * Folds each byte into the running value as n = byte + 65599 * n
 * (mod 2^32), starting from 0; an empty key hashes to 0.  The bytes are
 * walked with a plain pointer loop, so the trip count is exactly len
 * for every value of len.
 *
 * PUBLIC: u_int32_t __ham_func3 __P((const void *, u_int32_t));
 */
u_int32_t
__ham_func3(key, len)
	const void *key;
	u_int32_t len;
{
	const u_int8_t *e, *k;
	u_int32_t n;

	k = key;
	e = k + len;
	for (n = 0; k != e; ++k)
		n = *k + 65599 * n;
	return (n);
}

/*
 * __ham_func4 --
 *	Chris Torek's hash function.  Although this function performs only
 *	slightly worse than __ham_func5 on strings, it performs horribly on
 *	numbers.
 */
+ * + * PUBLIC: u_int32_t __ham_func4 __P((const void *, u_int32_t)); + */ +u_int32_t +__ham_func4(key, len) + const void *key; + u_int32_t len; +{ + const u_int8_t *k; + u_int32_t h, loop; + + if (len == 0) + return (0); + +#define HASH4a h = (h << 5) - h + *k++; +#define HASH4b h = (h << 5) + h + *k++; +#define HASH4 HASH4b + h = 0; + k = key; + + loop = (len + 8 - 1) >> 3; + switch (len & (8 - 1)) { + case 0: + do { + HASH4; + case 7: + HASH4; + case 6: + HASH4; + case 5: + HASH4; + case 4: + HASH4; + case 3: + HASH4; + case 2: + HASH4; + case 1: + HASH4; + } while (--loop); + } + return (h); +} + +/* + * Fowler/Noll/Vo hash + * + * The basis of the hash algorithm was taken from an idea sent by email to the + * IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and + * Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com) + * later improved on their algorithm. + * + * The magic is in the interesting relationship between the special prime + * 16777619 (2^24 + 403) and 2^32 and 2^8. + * + * This hash produces the fewest collisions of any function that we've seen so + * far, and works well on both numbers and strings. + * + * PUBLIC: u_int32_t __ham_func5 __P((const void *, u_int32_t)); + */ +u_int32_t +__ham_func5(key, len) + const void *key; + u_int32_t len; +{ + const u_int8_t *k, *e; + u_int32_t h; + + k = key; + e = k + len; + for (h = 0; k < e; ++k) { + h *= 16777619; + h ^= *k; + } + return (h); +} diff --git a/db2/hash/hash_page.c b/db2/hash/hash_page.c new file mode 100644 index 0000000000..68c31b14f9 --- /dev/null +++ b/db2/hash/hash_page.c @@ -0,0 +1,1775 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_page.c 10.18 (Sleepycat) 8/21/97"; +#endif /* not lint */ + + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Page manipulation for hashing package. + * + * ROUTINES: + * + * External + * __get_page + * __add_ovflpage + * __overflow_page + * Internal + * open_temp + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" + +static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t)); + +#ifdef DEBUG_SLOW +static void account_page(HTAB *, db_pgno_t, int); +#endif + +/* + * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + db_pgno_t next_pgno; + int ret; + + if (F_ISSET(cursorp, H_DELETED)) + return (EINVAL); + F_CLR(cursorp, H_OK | H_NOMORE); + + /* Check if we need to get a page for this cursor. */ + if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + + /* Check if we are looking for space in which to insert an item. */ + if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID + && cursorp->seek_size < P_FREESPACE(cursorp->pagep)) + cursorp->seek_found_page = cursorp->pgno; + + /* Check if we need to go on to the next page. */ + if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID) + /* + * ISDUP is set, and offset is at the beginning of the datum. + * We need to grab the length of the datum, then set the datum + * pointer to be the beginning of the datum. + */ + memcpy(&cursorp->dup_len, + H_PAIRDATA(cursorp->pagep, cursorp->bndx)->data + + cursorp->dup_off, sizeof(db_indx_t)); + else if (F_ISSET(cursorp, H_ISDUP)) { + /* Make sure we're not about to run off the page. 
*/ + if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp, + cursorp->dpgno, &cursorp->dpagep)) != 0) + return (ret); + + if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) { + if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) { + if ((ret = __ham_put_page(hashp->dbp, + cursorp->dpagep, 0)) != 0) + return (ret); + F_CLR(cursorp, H_ISDUP); + cursorp->dpagep = NULL; + cursorp->dpgno = PGNO_INVALID; + cursorp->dndx = NDX_INVALID; + cursorp->bndx++; + } else if ((ret = __ham_next_cpage(hashp, cursorp, + NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + return (ret); + } + } + + if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) { + /* Fetch next page. */ + if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) { + F_SET(cursorp, H_NOMORE); + if (cursorp->dpagep != NULL && + (ret = __ham_put_page(hashp->dbp, + cursorp->dpagep, 0)) != 0) + return (ret); + cursorp->dpgno = PGNO_INVALID; + return (DB_NOTFOUND); + } + next_pgno = NEXT_PGNO(cursorp->pagep); + cursorp->bndx = 0; + if ((ret = __ham_next_cpage(hashp, + cursorp, next_pgno, 0, 0)) != 0) + return (ret); + } + + F_SET(cursorp, H_OK); + return (0); +} + +/* + * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *)); + */ +int +__ham_item_reset(hashp, cursorp) + HTAB *hashp; + HASH_CURSOR *cursorp; +{ + int ret; + + if (cursorp->pagep) + ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0); + else + ret = 0; + + __ham_item_init(cursorp); + return (ret); +} + +/* + * PUBLIC: void __ham_item_init __P((HASH_CURSOR *)); + */ +void +__ham_item_init(cursorp) + HASH_CURSOR *cursorp; +{ + cursorp->pagep = NULL; + cursorp->bucket = BUCKET_INVALID; + cursorp->lock = 0; + cursorp->bndx = NDX_INVALID; + cursorp->pgno = PGNO_INVALID; + cursorp->dpgno = PGNO_INVALID; + cursorp->dndx = NDX_INVALID; + cursorp->dpagep = NULL; + cursorp->flags = 0; + cursorp->seek_size = 0; + cursorp->seek_found_page = PGNO_INVALID; +} + +/* + * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int)); + */ +int +__ham_item_done(hashp, 
cursorp, dirty) + HTAB *hashp; + HASH_CURSOR *cursorp; + int dirty; +{ + int ret, t_ret; + + t_ret = ret = 0; + + if (cursorp->pagep) + ret = __ham_put_page(hashp->dbp, cursorp->pagep, + dirty && cursorp->dpagep == NULL); + cursorp->pagep = NULL; + + if (cursorp->dpagep) + t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty); + cursorp->dpagep = NULL; + + if (ret == 0 && t_ret != 0) + ret = t_ret; + + /* + * If we are running with transactions, then we must + * not relinquish locks explicitly. + */ + if (cursorp->lock && hashp->dbp->txn == NULL) + t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock); + cursorp->lock = 0; + + + /* + * We don't throw out the page number since we might want to + * continue getting on this page. + */ + return (ret != 0 ? ret : t_ret); +} + +/* + * Returns the last item in a bucket. + * + * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_last(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + int ret; + + if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + return (ret); + + cursorp->bucket = hashp->hdr->max_bucket; + F_SET(cursorp, H_OK); + return (__ham_item_prev(hashp, cursorp, mode)); +} +/* + * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_first(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + int ret; + + if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + return (ret); + F_SET(cursorp, H_OK); + cursorp->bucket = 0; + return (__ham_item_next(hashp, cursorp, mode)); +} + +/* + * Returns a pointer to key/data pair on a page. In the case of bigkeys, + * just returns the page number and index of the bigkey pointer pair. 
+ * + * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_prev(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + db_pgno_t next_pgno; + int ret; + + /* + * There are N cases for backing up in a hash file. + * Case 1: In the middle of a page, no duplicates, just dec the index. + * Case 2: In the middle of a duplicate set, back up one. + * Case 3: At the beginning of a duplicate set, get out of set and + * back up to next key. + * Case 4: At the beginning of a page; go to previous page. + * Case 5: At the beginning of a bucket; go to prev bucket. + */ + F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED); + + /* + * First handle the duplicates. Either you'll get the key here + * or you'll exit the duplicate set and drop into the code below + * to handle backing up through keys. + */ + if (F_ISSET(cursorp, H_ISDUP)) { + if (cursorp->dpgno == PGNO_INVALID) { + /* Duplicates are on-page. */ + if (cursorp->dup_off != 0) + if ((ret = __ham_get_cpage(hashp, + cursorp, mode)) != 0) + return (ret); + else { + HASH_CURSOR *h; + h = cursorp; + memcpy(&h->dup_len, + H_PAIRDATA(h->pagep, h->bndx)->data + + h->dup_off - sizeof(db_indx_t), + sizeof(db_indx_t)); + cursorp->dup_off -= + DUP_SIZE(cursorp->dup_len); + cursorp->dndx--; + return (__ham_item(hashp, + cursorp, mode)); + } + } else if (cursorp->dndx > 0) { /* Duplicates are off-page. 
*/ + cursorp->dndx--; + return (__ham_item(hashp, cursorp, mode)); + } else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) { + F_CLR(cursorp, H_ISDUP); /* End of dups */ + cursorp->dpgno = PGNO_INVALID; + if (cursorp->dpagep != NULL) + (void)__ham_put_page(hashp->dbp, + cursorp->dpagep, 0); + cursorp->dpagep = NULL; + } else if ((ret = __ham_next_cpage(hashp, cursorp, + PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + return (ret); + else { + cursorp->dndx = NUM_ENT(cursorp->pagep) - 1; + return (__ham_item(hashp, cursorp, mode)); + } + } + + /* + * If we get here, we are not in a duplicate set, and just need + * to back up the cursor. There are still three cases: + * midpage, beginning of page, beginning of bucket. + */ + + if (cursorp->bndx == 0) { /* Beginning of page. */ + if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + return (ret); + cursorp->pgno = PREV_PGNO(cursorp->pagep); + if (cursorp->pgno == PGNO_INVALID) { + /* Beginning of bucket. */ + F_SET(cursorp, H_NOMORE); + return (DB_NOTFOUND); + } else if ((ret = __ham_next_cpage(hashp, + cursorp, cursorp->pgno, 0, 0)) != 0) + return (ret); + else + cursorp->bndx = H_NUMPAIRS(cursorp->pagep); + } + + /* + * Either we've got the cursor set up to be decremented, or we + * have to find the end of a bucket. + */ + if (cursorp->bndx == NDX_INVALID) { + if (cursorp->pagep == NULL) + next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket); + else + goto got_page; + + do { + if ((ret = __ham_next_cpage(hashp, + cursorp, next_pgno, 0, 0)) != 0) + return (ret); +got_page: next_pgno = NEXT_PGNO(cursorp->pagep); + cursorp->bndx = H_NUMPAIRS(cursorp->pagep); + } while (next_pgno != PGNO_INVALID); + + if (cursorp->bndx == 0) { + /* Bucket was empty. 
*/ + F_SET(cursorp, H_NOMORE); + return (DB_NOTFOUND); + } + } + + cursorp->bndx--; + + return (__ham_item(hashp, cursorp, mode)); +} + +/* + * Sets the cursor to the next key/data pair on a page. + * + * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_item_next(hashp, cursorp, mode) + HTAB *hashp; + HASH_CURSOR *cursorp; + db_lockmode_t mode; +{ + /* + * Deleted on-page duplicates are a weird case. If we delete the last + * one, then our cursor is at the very end of a duplicate set and + * we actually need to go on to the next key. + */ + if (F_ISSET(cursorp, H_DELETED)) { + if (cursorp->bndx != NDX_INVALID && + F_ISSET(cursorp, H_ISDUP) && + cursorp->dpgno == PGNO_INVALID && + cursorp->dup_tlen == cursorp->dup_off) { + F_CLR(cursorp, H_ISDUP); + cursorp->dpgno = PGNO_INVALID; + cursorp->bndx++; + } + F_CLR(cursorp, H_DELETED); + } else if (cursorp->bndx == NDX_INVALID) { + cursorp->bndx = 0; + cursorp->dpgno = PGNO_INVALID; + F_CLR(cursorp, H_ISDUP); + } else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID) + cursorp->dndx++; + else if (F_ISSET(cursorp, H_ISDUP)) { + cursorp->dndx++; + cursorp->dup_off += DUP_SIZE(cursorp->dup_len); + if (cursorp->dup_off >= cursorp->dup_tlen) { + F_CLR(cursorp, H_ISDUP); + cursorp->dpgno = PGNO_INVALID; + cursorp->bndx++; + } + } else + cursorp->bndx++; + + return (__ham_item(hashp, cursorp, mode)); +} + +/* + * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int)); + * + * This is a little bit sleazy in that we're overloading the meaning + * of the H_OFFPAGE type here. When we recover deletes, we have the + * entire entry instead of having only the DBT, so we'll pass type + * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing + * an H_KEYDATA around it. + */ +void +__ham_putitem(p, dbt, type) + PAGE *p; + const DBT *dbt; + int type; +{ + u_int16_t n, off; + + n = NUM_ENT(p); + + /* Put the item element on the page. 
*/ + if (type == H_OFFPAGE) { + off = HOFFSET(p) - dbt->size; + HOFFSET(p) = p->inp[n] = off; + memcpy(P_ENTRY(p, n), dbt->data, dbt->size); + } else { + off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size); + HOFFSET(p) = p->inp[n] = off; + PUT_HKEYDATA(GET_HKEYDATA(p, n), dbt->data, dbt->size, type); + } + + /* Adjust page info. */ + NUM_ENT(p) += 1; +} + + +/* + * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *)); + * XXX TODO: if the item is an offdup, delete the other pages and + * then remove the pair. If the offpage page is 0, then you can + * just remove the pair. + */ +int +__ham_del_pair(hashp, cursorp) + HTAB *hashp; + HASH_CURSOR *cursorp; +{ + DBT data_dbt, key_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn, *n_lsn; + PAGE *p; + db_indx_t ndx; + db_pgno_t chg_pgno, pgno; + int ret, tret; + + dbenv = hashp->dbp->dbenv; + ndx = cursorp->bndx; + if (cursorp->pagep == NULL && (ret = + __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0) + return (ret); + + p = cursorp->pagep; + + /* + * We optimize for the normal case which is when neither the key nor + * the data are large. In this case, we write a single log record + * and do the delete. If either is large, we'll call __big_delete + * to remove the big item and then update the page to remove the + * entry referring to the big item. 
+ */ + ret = 0; + if (H_PAIRKEY(p, ndx)->type == H_OFFPAGE) { + memcpy(&pgno, (u_int8_t *)GET_HOFFPAGE(p, H_KEYINDEX(ndx)) + + SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t)); + ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + } + + if (ret == 0) + switch (H_PAIRDATA(p, ndx)->type) { + case H_OFFPAGE: + memcpy(&pgno, + (u_int8_t *)GET_HOFFPAGE(p, H_DATAINDEX(ndx)) + + SSZ(HOFFPAGE, pgno), sizeof(db_pgno_t)); + ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + break; + case H_OFFDUP: + memcpy(&pgno, + (u_int8_t *)GET_HOFFDUP(p, H_DATAINDEX(ndx)) + + SSZ(HOFFDUP, pgno), sizeof(db_pgno_t)); + ret = __db_ddup(hashp->dbp, pgno, __ham_del_page); + break; + } + + if (ret) + return (ret); + + /* Now log the delete off this page. */ + if (DB_LOGGING(hashp->dbp)) { + key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx)); + key_dbt.size = + LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx)); + data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx)); + data_dbt.size = + LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx)); + + if ((ret = __ham_insdel_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR, + hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx, + &LSN(p), &key_dbt, &data_dbt)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(p) = new_lsn; + } + + __ham_dpair(hashp->dbp, p, ndx); + + /* + * If we are locking, we will not maintain this. + * XXXX perhaps we can retain incremental numbers and apply them + * later. + */ + if (!F_ISSET(hashp->dbp, DB_AM_LOCKING)) + --hashp->hdr->nelem; + + /* + * Check if the page is empty. There are two cases. If it's + * empty and it's not the first chain in the bucket (i.e., the + * bucket page) then we can simply remove it. If it is the first + * chain in the bucket, then we need to copy the second page into + * it and remove the second page. 
+ */ + if (NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID && + NEXT_PGNO(p) != PGNO_INVALID) { + PAGE *n_pagep, *nn_pagep; + db_pgno_t tmp_pgno; + + /* + * First page in chain is empty and we know that there + * are more pages in the chain. + * XXX Need to log this. + */ + if ((ret = + __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0) + return (ret); + + if (NEXT_PGNO(n_pagep) != PGNO_INVALID) { + if ((ret = + __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep), + &nn_pagep)) != 0) { + (void) __ham_put_page(hashp->dbp, n_pagep, 0); + return (ret); + } + PREV_PGNO(nn_pagep) = PGNO(p); + (void)__ham_put_page(hashp->dbp, nn_pagep, 1); + } + + tmp_pgno = PGNO(p); + memcpy(p, n_pagep, hashp->hdr->pagesize); + PGNO(p) = tmp_pgno; + PREV_PGNO(p) = PGNO_INVALID; + + /* + * Cursor is advanced to the beginning of the next page. + */ + cursorp->bndx = NDX_INVALID; + cursorp->pgno = PGNO(p); + chg_pgno = PGNO(p); + if ((ret = __ham_dirty_page(hashp, p)) != 0 || + (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0) + return (ret); + } else if (NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) { + PAGE *n_pagep, *p_pagep; + + if ((ret = + __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0) + return (ret); + + if (NEXT_PGNO(p) != PGNO_INVALID) { + if ((ret = __ham_get_page(hashp->dbp, + NEXT_PGNO(p), &n_pagep)) != 0) { + (void)__ham_put_page(hashp->dbp, p_pagep, 0); + return (ret); + } + n_lsn = &LSN(n_pagep); + } else { + n_pagep = NULL; + n_lsn = NULL; + } + + NEXT_PGNO(p_pagep) = NEXT_PGNO(p); + if (n_pagep != NULL) + PREV_PGNO(n_pagep) = PGNO(p_pagep); + + if (DB_LOGGING(hashp->dbp)) { + if ((ret = __ham_newpage_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL, + hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep), + PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(p_pagep) = new_lsn; /* Structure assignment. 
*/ + if (n_pagep) + LSN(n_pagep) = new_lsn; + LSN(p) = new_lsn; + } + cursorp->pgno = NEXT_PGNO(p); + cursorp->bndx = 0; + /* + * Since we are about to delete the cursor page and we have + * just moved the cursor, we need to make sure that the + * old page pointer isn't left hanging around in the cursor. + */ + cursorp->pagep = NULL; + chg_pgno = PGNO(p); + ret = __ham_del_page(hashp->dbp, p); + if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 && + ret == 0) + ret = tret; + if (n_pagep != NULL && + (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 && + ret == 0) + ret = tret; + if (ret != 0) + return (ret); + } else { + /* + * Mark item deleted so that we don't try to return it, and + * so that we update the cursor correctly on the next call + * to next. + */ + F_SET(cursorp, H_DELETED); + chg_pgno = cursorp->pgno; + ret = __ham_dirty_page(hashp, p); + } + __ham_c_update(hashp, cursorp, chg_pgno, 0, 0, 0); + + F_CLR(cursorp, H_OK); + return (ret); +} +/* + * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); + * Given the key data indicated by the cursor, replace part/all of it + * according to the fields in the dbt. + */ +int +__ham_replpair(hashp, hcp, dbt, make_dup) + HTAB *hashp; + HASH_CURSOR *hcp; + DBT *dbt; + u_int32_t make_dup; +{ + DBT old_dbt, tmp; + DB_LSN new_lsn; + HKEYDATA *hk; + u_int32_t len; + int32_t change; + int is_big, ret, type; + u_int8_t *beg, *dest, *end, *src; + + /* + * Big item replacements are handled in generic code. + * Items that fit on the current page fall into 4 classes. + * 1. On-page element, same size + * 2. On-page element, new is bigger (fits) + * 3. On-page element, new is bigger (does not fit) + * 4. On-page element, old is bigger + * Numbers 1, 2, and 4 are essentially the same (and should + * be the common case). We handle case 3 as a delete and + * add. + */ + + /* + * We need to compute the number of bytes that we are adding or + * removing from the entry. 
Normally, we can simply substract + * the number of bytes we are replacing (dbt->dlen) from the + * number of bytes we are inserting (dbt->size). However, if + * we are doing a partial put off the end of a record, then this + * formula doesn't work, because we are essentially adding + * new bytes. + */ + change = dbt->size - dbt->dlen; + + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + is_big = hk->type == H_OFFPAGE; + + if (is_big) + memcpy(&len, (u_int8_t *)hk + SSZ(HOFFPAGE, tlen), + sizeof(u_int32_t)); + else + len = LEN_HKEYDATA(hcp->pagep, + hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx)); + + if (dbt->doff + dbt->dlen > len) + change += dbt->doff + dbt->dlen - len; + + + if (change > (int)P_FREESPACE(hcp->pagep) || is_big) { + /* + * Case 3 -- two subcases. + * A. This is not really a partial operation, but an overwrite. + * Simple del and add works. + * B. This is a partial and we need to construct the data that + * we are really inserting (yuck). + * In both cases, we need to grab the key off the page (in + * some cases we could do this outside of this routine; for + * cleanliness we do it here. If you happen to be on a big + * key, this could be a performance hit). + */ + tmp.flags = 0; + F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL); + if ((ret = + __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx), + &tmp, &hcp->big_key, &hcp->big_keylen)) != 0) + return (ret); + + type = hk->type; + if (dbt->doff == 0 && dbt->dlen == len) { + ret = __ham_del_pair(hashp, hcp); + if (ret == 0) + ret = __ham_add_el(hashp, hcp, &tmp, dbt, type); + } else { /* Case B */ + DBT tdata; + tdata.flags = 0; + F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL); + + if ((ret = __db_ret(hashp->dbp, hcp->pagep, + H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data, + &hcp->big_datalen)) != 0) + goto err; + + /* Now we can delete the item. */ + if ((ret = __ham_del_pair(hashp, hcp)) != 0) { + free(tdata.data); + goto err; + } + + /* Now shift old data around to make room for new. 
*/ + if (change > 0) { + tdata.data = (void *) + realloc(tdata.data, tdata.size + change); + memset((u_int8_t *)tdata.data + tdata.size, + 0, change); + } + if (tdata.data == NULL) + return (ENOMEM); + end = (u_int8_t *)tdata.data + tdata.size; + + src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen; + if (src < end && tdata.size > dbt->doff + dbt->dlen) { + len = tdata.size - dbt->doff - dbt->dlen; + dest = src + change; + memmove(dest, src, len); + } + memcpy((u_int8_t *)tdata.data + dbt->doff, + dbt->data, dbt->size); + tdata.size += change; + + /* Now add the pair. */ + ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type); + free(tdata.data); + } +err: free(tmp.data); + return (ret); + } + + /* + * Set up pointer into existing data. Do it before the log + * message so we can use it inside of the log setup. + */ + beg = H_PAIRDATA(hcp->pagep, hcp->bndx)->data; + beg += dbt->doff; + + /* + * If we are going to have to move bytes at all, figure out + * all the parameters here. Then log the call before moving + * anything around. + */ + if (DB_LOGGING(hashp->dbp)) { + old_dbt.data = beg; + old_dbt.size = dbt->dlen; + if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, PGNO(hcp->pagep), + (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep), + (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0) + return (ret); + + LSN(hcp->pagep) = new_lsn; /* Structure assignment. */ + } + + __ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize, + (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt); + + return (0); +} + +/* + * Replace data on a page with new data, possibly growing or shrinking what's + * there. This is called on two different occasions. On one (from replpair) + * we are interested in changing only the data. On the other (from recovery) + * we are replacing the entire data (header and all) with a new element. In + * the latter case, the off argument is negative. 
+ * pagep: the page that we're changing + * ndx: page index of the element that is growing/shrinking. + * off: Offset at which we are beginning the replacement. + * change: the number of bytes (+ or -) that the element is growing/shrinking. + * dbt: the new data that gets written at beg. + * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t, + * PUBLIC: int32_t, DBT *)); + */ +void +__ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt) + PAGE *pagep; + size_t pgsize; + u_int32_t ndx; + int32_t off; + int32_t change; + DBT *dbt; +{ + db_indx_t i; + int32_t len; + u_int8_t *src, *dest; + int zero_me; + + if (change != 0) { + zero_me = 0; + src = (u_int8_t *)(pagep) + HOFFSET(pagep); + if (off < 0) + len = pagep->inp[ndx] - HOFFSET(pagep); + else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) { + len = GET_HKEYDATA(pagep, ndx)->data + + LEN_HKEYDATA(pagep, pgsize, ndx) - src; + zero_me = 1; + } else + len = (GET_HKEYDATA(pagep, ndx)->data + off) - src; + dest = src - change; + memmove(dest, src, len); + if (zero_me) + memset(dest + len, 0, change); + + /* Now update the indices. 
*/ + for (i = ndx; i < NUM_ENT(pagep); i++) + pagep->inp[i] -= change; + HOFFSET(pagep) -= change; + } + if (off >= 0) + memcpy(GET_HKEYDATA(pagep, ndx)->data + off, + dbt->data, dbt->size); + else + memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size); +} + +/* + * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t)); + */ +int +__ham_split_page(hashp, obucket, nbucket) + HTAB *hashp; + u_int32_t obucket, nbucket; +{ + DBT key, val, page_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn; + PAGE **pp, *old_pagep, *temp_pagep, *new_pagep; + db_indx_t n; + db_pgno_t bucket_pgno, next_pgno; + u_int32_t big_len, len; + int ret, tret; + void *big_buf; + + dbenv = hashp->dbp->dbenv; + temp_pagep = old_pagep = new_pagep = NULL; + + bucket_pgno = BUCKET_TO_PAGE(hashp, obucket); + if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0) + return (ret); + if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH, + &new_pagep)) != 0) + goto err; + + temp_pagep = hashp->split_buf; + memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize); + + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = old_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep), + &page_dbt, &LSN(old_pagep))) != 0) + goto err; + } + + P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID, + PGNO_INVALID, 0, P_HASH); + + if (DB_LOGGING(hashp->dbp)) + LSN(old_pagep) = new_lsn; /* Structure assignment. 
*/ + + big_len = 0; + big_buf = NULL; + val.flags = key.flags = 0; + while (temp_pagep != NULL) { + for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) { + if ((ret = + __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n), + &key, &big_buf, &big_len)) != 0) + goto err; + + if (__ham_call_hash(hashp, key.data, key.size) + == obucket) + pp = &old_pagep; + else + pp = &new_pagep; + + /* + * Figure out how many bytes we need on the new + * page to store the key/data pair. + */ + + len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + H_DATAINDEX(n)) + + LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + H_KEYINDEX(n)) + + 2 * sizeof(db_indx_t); + + if (P_FREESPACE(*pp) < len) { + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = *pp; + if ((ret = __ham_splitdata_log( + dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, + &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, + PGNO(*pp), &page_dbt, + &LSN(*pp))) != 0) + goto err; + LSN(*pp) = new_lsn; + } + if ((ret = __ham_add_ovflpage(hashp, + *pp, 1, pp)) != 0) + goto err; + } + __ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp); + __ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp); + } + next_pgno = NEXT_PGNO(temp_pagep); + + /* Clear temp_page; if it's a link overflow page, free it. 
*/ + if (PGNO(temp_pagep) != bucket_pgno && (ret = + __ham_del_page(hashp->dbp, temp_pagep)) != 0) + goto err; + + if (next_pgno == PGNO_INVALID) + temp_pagep = NULL; + else if ((ret = + __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0) + goto err; + + if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = temp_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep), + &page_dbt, &LSN(temp_pagep))) != 0) + goto err; + LSN(temp_pagep) = new_lsn; + } + } + if (big_buf != NULL) + free(big_buf); + + /* + * If the original bucket spanned multiple pages, then we've got + * a pointer to a page that used to be on the bucket chain. It + * should be deleted. + */ + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno && + (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0) + goto err; + + /* + * Write new buckets out. + */ + if (DB_LOGGING(hashp->dbp)) { + page_dbt.size = hashp->hdr->pagesize; + page_dbt.data = old_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep), + &page_dbt, &LSN(old_pagep))) != 0) + goto err; + LSN(old_pagep) = new_lsn; + + page_dbt.data = new_pagep; + if ((ret = __ham_splitdata_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, + hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep), + &page_dbt, &LSN(new_pagep))) != 0) + goto err; + LSN(new_pagep) = new_lsn; + } + ret = __ham_put_page(hashp->dbp, old_pagep, 1); + if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 && + ret == 0) + ret = tret; + +err: if (0) { + if (old_pagep != NULL) + (void)__ham_put_page(hashp->dbp, old_pagep, 1); + if (new_pagep != NULL) + (void)__ham_put_page(hashp->dbp, new_pagep, 1); + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno) + (void)__ham_put_page(hashp->dbp, temp_pagep, 1); + } + return 
(ret); +} + +/* + * Add the given pair to the page. The page in question may already be + * held (i.e. it was already gotten). If it is, then the page is passed + * in via the pagep parameter. On return, pagep will contain the page + * to which we just added something. This allows us to link overflow + * pages and return the new page having correctly put the last page. + * + * PUBLIC: int __ham_add_el __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, + * PUBLIC: int)); + */ +int +__ham_add_el(hashp, hcp, key, val, type) + HTAB *hashp; + HASH_CURSOR *hcp; + const DBT *key, *val; + int type; +{ + DBT *pkey, *pdata, key_dbt, data_dbt; + DB_LSN new_lsn; + HOFFPAGE doff, koff; + db_pgno_t next_pgno; + u_int32_t data_size, key_size, pairsize; + int do_expand, is_keybig, is_databig, rectype, ret; + int key_type, data_type; + + do_expand = 0; + + if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp, + hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page : + hcp->pgno, &hcp->pagep)) != 0) + return (ret); + + key_size = HKEYDATA_PSIZE(key->size); + data_size = HKEYDATA_PSIZE(val->size); + is_keybig = ISBIG(hashp, key->size); + is_databig = ISBIG(hashp, val->size); + if (is_keybig) + key_size = HOFFPAGE_PSIZE; + if (is_databig) + data_size = HOFFPAGE_PSIZE; + + pairsize = key_size + data_size; + + /* Advance to first page in chain with room for item. */ + while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) != + PGNO_INVALID) { + /* + * This may not be the end of the chain, but the pair may fit + * anyway. Check if it's a bigpair that fits or a regular + * pair that fits. + */ + if (P_FREESPACE(hcp->pagep) >= pairsize) + break; + next_pgno = NEXT_PGNO(hcp->pagep); + if ((ret = + __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0) + return (ret); + } + + /* + * Check if we need to allocate a new page. 
+ */ + if (P_FREESPACE(hcp->pagep) < pairsize) { + do_expand = 1; + if ((ret = __ham_add_ovflpage(hashp, + hcp->pagep, 1, &hcp->pagep)) != 0) + return (ret); + hcp->pgno = PGNO(hcp->pagep); + } + + /* + * Update cursor. + */ + hcp->bndx = H_NUMPAIRS(hcp->pagep); + F_CLR(hcp, H_DELETED); + if (is_keybig) { + if ((ret = __db_poff(hashp->dbp, + key, &koff.pgno, __ham_overflow_page)) != 0) + return (ret); + koff.type = H_OFFPAGE; + koff.tlen = key->size; + key_dbt.data = &koff; + key_dbt.size = sizeof(koff); + pkey = &key_dbt; + key_type = H_OFFPAGE; + } else { + pkey = (DBT *)key; + key_type = H_KEYDATA; + } + + if (is_databig) { + if ((ret = __db_poff(hashp->dbp, + val, &doff.pgno, __ham_overflow_page)) != 0) + return (ret); + doff.type = H_OFFPAGE; + doff.tlen = val->size; + data_dbt.data = &doff; + data_dbt.size = sizeof(doff); + pdata = &data_dbt; + data_type = H_OFFPAGE; + } else { + pdata = (DBT *)val; + data_type = type; + } + + if (DB_LOGGING(hashp->dbp)) { + rectype = PUTPAIR; + if (is_databig) + rectype |= PAIR_DATAMASK; + if (is_keybig) + rectype |= PAIR_KEYMASK; + + if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype, + hashp->dbp->log_fileid, PGNO(hcp->pagep), + (u_int32_t)H_NUMPAIRS(hcp->pagep), + &LSN(hcp->pagep), pkey, pdata)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(hcp->pagep) = new_lsn; /* Structure assignment. */ + } + + __ham_putitem(hcp->pagep, pkey, key_type); + __ham_putitem(hcp->pagep, pdata, data_type); + + /* + * For splits, we are going to update item_info's page number + * field, so that we can easily return to the same page the + * next time we come in here. For other operations, this shouldn't + * matter, since odds are this is the last thing that happens before + * we return to the user program. 
+ */ + hcp->pgno = PGNO(hcp->pagep); + + /* + * XXX Maybe keep incremental numbers here + */ + if (!F_ISSET(hashp->dbp, DB_AM_LOCKING)) + hashp->hdr->nelem++; + + if (do_expand || (hashp->hdr->ffactor != 0 && + (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor)) + F_SET(hcp, H_EXPAND); + return (0); +} + + +/* + * Special __putitem call used in splitting -- copies one entry to + * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA, + * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we + * do not need to do any logging here. + * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, int, PAGE *)); + */ +void +__ham_copy_item(hashp, src_page, src_ndx, dest_page) + HTAB *hashp; + PAGE *src_page; + int src_ndx; + PAGE *dest_page; +{ + u_int32_t len; + void *src, *dest; + + /* + * Copy the key and data entries onto this new page. + */ + src = P_ENTRY(src_page, src_ndx); + + /* Set up space on dest. */ + len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx); + HOFFSET(dest_page) -= len; + dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page); + dest = P_ENTRY(dest_page, NUM_ENT(dest_page)); + NUM_ENT(dest_page)++; + + memcpy(dest, src, len); +} + +/* + * + * Returns: + * pointer on success + * NULL on error + * + * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **)); + */ +int +__ham_add_ovflpage(hashp, pagep, release, pp) + HTAB *hashp; + PAGE *pagep; + int release; + PAGE **pp; +{ + DB_ENV *dbenv; + DB_LSN new_lsn; + PAGE *new_pagep; + int ret; + + dbenv = hashp->dbp->dbenv; + + if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0) + return (ret); + + if (DB_LOGGING(hashp->dbp)) { + if ((ret = __ham_newpage_log(dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL, + hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep), + PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0) + return (ret); + + /* Move lsn onto page. 
*/ + LSN(pagep) = LSN(new_pagep) = new_lsn; + } + NEXT_PGNO(pagep) = PGNO(new_pagep); + PREV_PGNO(new_pagep) = PGNO(pagep); + + if (release) + ret = __ham_put_page(hashp->dbp, pagep, 1); + + hashp->hash_overflows++; + *pp = new_pagep; + return (ret); +} + + +/* + * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **)); + */ +int +__ham_new_page(hashp, addr, type, pp) + HTAB *hashp; + u_int32_t addr, type; + PAGE **pp; +{ + PAGE *pagep; + int ret; + + if ((ret = memp_fget(hashp->dbp->mpf, + &addr, DB_MPOOL_CREATE, &pagep)) != 0) + return (ret); + +#ifdef DEBUG_SLOW + account_page(hashp, addr, 1); +#endif + /* This should not be necessary because page-in should do it. */ + P_INIT(pagep, + hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type); + + *pp = pagep; + return (0); +} + +/* + * PUBLIC: int __ham_del_page __P((DB *, PAGE *)); + */ +int +__ham_del_page(dbp, pagep) + DB *dbp; + PAGE *pagep; +{ + DB_LSN new_lsn; + HTAB *hashp; + int ret; + + hashp = (HTAB *)dbp->internal; + ret = 0; + DIRTY_META(hashp, ret); + if (ret != 0) { + if (ret != EAGAIN) + __db_err(hashp->dbp->dbenv, + "free_ovflpage: unable to lock meta data page %s\n", + strerror(ret)); + /* + * If we are going to return an error, then we should free + * the page, so it doesn't stay pinned forever. 
+ */ + (void)__ham_put_page(hashp->dbp, pagep, 0); + return (ret); + } + + if (DB_LOGGING(hashp->dbp)) { + if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO, + hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed, + (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID, + &LSN(pagep), &hashp->hdr->lsn)) != 0) + return (ret); + + hashp->hdr->lsn = new_lsn; + LSN(pagep) = new_lsn; + } + +#ifdef DEBUG + { + db_pgno_t __pgno; + DB_LSN __lsn; + __pgno = pagep->pgno; + __lsn = pagep->lsn; + memset(pagep, 0xff, dbp->pgsize); + pagep->pgno = __pgno; + pagep->lsn = __lsn; + } +#endif + TYPE(pagep) = P_INVALID; + NEXT_PGNO(pagep) = hashp->hdr->last_freed; + hashp->hdr->last_freed = PGNO(pagep); + + return (__ham_put_page(hashp->dbp, pagep, 1)); +} + + +/* + * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t)); + */ +int +__ham_put_page(dbp, pagep, is_dirty) + DB *dbp; + PAGE *pagep; + int32_t is_dirty; +{ +#ifdef DEBUG_SLOW + account_page((HTAB *)dbp->cookie, + ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1); +#endif + return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0))); +} + +/* + * __ham_dirty_page -- + * Mark a page dirty. 
+ * + * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *)); + */ +int +__ham_dirty_page(hashp, pagep) + HTAB *hashp; + PAGE *pagep; +{ + return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY)); +} + +/* + * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **)); + */ +int +__ham_get_page(dbp, addr, pagep) + DB *dbp; + db_pgno_t addr; + PAGE **pagep; +{ + int ret; + + ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep); +#ifdef DEBUG_SLOW + if (*pagep != NULL) + account_page((HTAB *)dbp->internal, addr, 1); +#endif + return (ret); +} + +/* + * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **)); + */ +int +__ham_overflow_page(dbp, type, pp) + DB *dbp; + u_int32_t type; + PAGE **pp; +{ + DB_LSN *lsnp, new_lsn; + HTAB *hashp; + PAGE *p; + db_pgno_t new_addr, next_free, newalloc_flag; + u_int32_t offset, splitnum; + int ret; + + hashp = (HTAB *)dbp->internal; + + ret = 0; + DIRTY_META(hashp, ret); + if (ret != 0) + return (ret); + + /* + * This routine is split up into two parts. First we have + * to figure out the address of the new page that we are + * allocating. Then we have to log the allocation. Only + * after the log do we get to complete allocation of the + * new page. + */ + new_addr = hashp->hdr->last_freed; + if (new_addr != PGNO_INVALID) { + if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0) + return (ret); + next_free = NEXT_PGNO(p); + lsnp = &LSN(p); + newalloc_flag = 0; + } else { + splitnum = hashp->hdr->ovfl_point; + hashp->hdr->spares[splitnum]++; + offset = hashp->hdr->spares[splitnum] - + (splitnum ? 
hashp->hdr->spares[splitnum - 1] : 0); + new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset); + if (new_addr > MAX_PAGES(hashp)) { + __db_err(hashp->dbp->dbenv, "hash: out of file pages"); + hashp->hdr->spares[splitnum]--; + return (ENOMEM); + } + next_free = PGNO_INVALID; + p = NULL; + lsnp = NULL; + newalloc_flag = 1; + } + + if (DB_LOGGING(hashp->dbp)) { + if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info, + (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO, + hashp->dbp->log_fileid, new_addr, next_free, + 0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0) + return (ret); + + hashp->hdr->lsn = new_lsn; + if (lsnp != NULL) + *lsnp = new_lsn; + } + + if (p != NULL) { + /* We just took something off the free list, initialize it. */ + hashp->hdr->last_freed = next_free; + P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID, + PGNO_INVALID, 0, (u_int8_t)type); + } else { + /* Get the new page. */ + if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0) + return (ret); + } + if (DB_LOGGING(hashp->dbp)) + LSN(p) = new_lsn; + + *pp = p; + return (0); +} + +#ifdef DEBUG +/* + * PUBLIC: #ifdef DEBUG + * PUBLIC: int bucket_to_page __P((HTAB *, int)); + * PUBLIC: #endif + */ +int +bucket_to_page(hashp, n) + HTAB *hashp; + int n; +{ + int ret_val; + + ret_val = n + 1; + if (n != 0) + ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1]; + return (ret_val); +} +#endif + + +/* + * Create a bunch of overflow pages at the current split point. 
+ * PUBLIC: void __ham_init_ovflpages __P((HTAB *)); + */ +void +__ham_init_ovflpages(hp) + HTAB *hp; +{ + DB_LSN new_lsn; + PAGE *p; + db_pgno_t last_pgno; + u_int32_t i, numpages; + + numpages = hp->hdr->ovfl_point + 1; + + last_pgno = hp->hdr->last_freed; + if (DB_LOGGING(hp->dbp)) { + (void)__ham_ovfl_log(hp->dbp->dbenv->lg_info, + (DB_TXN *)hp->dbp->txn, &new_lsn, 0, + hp->dbp->log_fileid, PGNO_OF(hp, hp->hdr->ovfl_point, 1), + numpages, last_pgno, &hp->hdr->lsn); + hp->hdr->lsn = new_lsn; + } else + ZERO_LSN(new_lsn); + + hp->hdr->spares[hp->hdr->ovfl_point] += numpages; + for (i = numpages; i > 0; i--) { + if (__ham_new_page(hp, + PGNO_OF(hp, hp->hdr->ovfl_point, i), P_INVALID, &p) != 0) + break; + LSN(p) = new_lsn; + NEXT_PGNO(p) = last_pgno; + last_pgno = PGNO(p); + (void)__ham_put_page(hp->dbp, p, 1); + } + hp->hdr->last_freed = last_pgno; +} + +/* + * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + */ +int +__ham_get_cpage(hashp, hcp, mode) + HTAB *hashp; + HASH_CURSOR *hcp; + db_lockmode_t mode; +{ + int ret; + + if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) && + (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0) + return (ret); + + if (hcp->pagep == NULL) { + if (hcp->pgno == PGNO_INVALID) { + hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket); + hcp->bndx = 0; + } + + if ((ret = + __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0) + return (ret); + } + + if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL) + if ((ret = + __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) + return (ret); + return (0); +} + +/* + * Get a new page at the cursor, putting the last page if necessary. + * If the flag is set to H_ISDUP, then we are talking about the + * duplicate page, not the main page. 
+ * PUBLIC: int __ham_next_cpage __P((HTAB *, HASH_CURSOR *, db_pgno_t, + * PUBLIC: int, int)); + */ +int +__ham_next_cpage(hashp, hcp, pgno, dirty, flags) + HTAB *hashp; + HASH_CURSOR *hcp; + db_pgno_t pgno; + int dirty; + int flags; +{ + PAGE *p; + int ret; + + if (flags & H_ISDUP && hcp->dpagep != NULL && + (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0) + return (ret); + else if (!(flags & H_ISDUP) && hcp->pagep != NULL && + (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0) + return (ret); + + if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0) + return (ret); + + if (flags & H_ISDUP) { + hcp->dpagep = p; + hcp->dpgno = pgno; + hcp->dndx = 0; + } else { + hcp->pagep = p; + hcp->pgno = pgno; + hcp->bndx = 0; + } + + return (0); +} + +/* + * __ham_lock_bucket -- + * Get the lock on a particular bucket. + */ +static int +__ham_lock_bucket(dbp, hcp, mode) + DB *dbp; + HASH_CURSOR *hcp; + db_lockmode_t mode; +{ + int ret; + + /* + * What a way to trounce on the memory system. It might be + * worth copying the lk_info into the hashp. + */ + ret = 0; + dbp->lock.pgno = (db_pgno_t)(hcp->bucket); + ret = lock_get(dbp->dbenv->lk_info, + dbp->txn == NULL ? dbp->locker : dbp->txn->txnid, 0, + &dbp->lock_dbt, mode, &hcp->lock); + + return (ret < 0 ? EAGAIN : ret); +} + +/* + * __ham_dpair -- + * Delete a pair on a page, paying no attention to what the pair + * represents. The caller is responsible for freeing up duplicates + * or offpage entries that might be referenced by this pair. + * + * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t)); + */ +void +__ham_dpair(dbp, p, pndx) + DB *dbp; + PAGE *p; + u_int32_t pndx; +{ + db_indx_t delta, n; + u_int8_t *dest, *src; + + /* + * Compute "delta", the amount we have to shift all of the + * offsets. To find the delta, we just need to calculate + * the size of the pair of elements we are removing. 
+ */ + delta = H_PAIRSIZE(p, dbp->pgsize, pndx); + + /* + * The hard case: we want to remove something other than + * the last item on the page. We need to shift data and + * offsets down. + */ + if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) { + /* + * Move the data: src is the first occupied byte on + * the page. (Length is delta.) + */ + src = (u_int8_t *)p + HOFFSET(p); + + /* + * Destination is delta bytes beyond src. This might + * be an overlapping copy, so we have to use memmove. + */ + dest = src + delta; + memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p)); + } + + /* Adjust the offsets. */ + for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) { + p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta; + p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta; + } + + /* Adjust page metadata. */ + HOFFSET(p) = HOFFSET(p) + delta; + NUM_ENT(p) = NUM_ENT(p) - 2; +} + +#ifdef DEBUG_SLOW +static void +account_page(hashp, pgno, inout) + HTAB *hashp; + db_pgno_t pgno; + int inout; +{ + static struct { + db_pgno_t pgno; + int times; + } list[100]; + static int last; + int i, j; + + if (inout == -1) /* XXX: Kluge */ + inout = 0; + + /* Find page in list. */ + for (i = 0; i < last; i++) + if (list[i].pgno == pgno) + break; + /* Not found. */ + if (i == last) { + list[last].times = inout; + list[last].pgno = pgno; + last++; + } + list[i].times = inout; + if (list[i].times == 0) { + for (j = i; j < last; j++) + list[j] = list[j + 1]; + last--; + } + for (i = 0; i < last; i++, list[i].times++) + if (list[i].times > 20 && !is_bitmap_pgno(hashp, list[i].pgno)) + (void)fprintf(stderr, + "Warning: pg %lu has been out for %d times\n", + (u_long)list[i].pgno, list[i].times); +} +#endif /* DEBUG_SLOW */ diff --git a/db2/hash/hash_rec.c b/db2/hash/hash_rec.c new file mode 100644 index 0000000000..81d9bb5ea8 --- /dev/null +++ b/db2/hash/hash_rec.c @@ -0,0 +1,810 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_rec.c 10.12 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "hash.h" +#include "btree.h" +#include "log.h" +#include "db_dispatch.h" +#include "common_ext.h" + +/* + * __ham_insdel_recover -- + * + * PUBLIC: int __ham_insdel_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_insdel_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_insdel_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + PAGE *pagep; + u_int32_t op; + int cmp_n, cmp_p, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_insdel_print); + REC_INTRO(__ham_insdel_read); + + ret = memp_fget(mpf, &argp->pgno, 0, &pagep); + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + /* + * Two possible things going on: + * redo a delete/undo a put: delete the item from the page. + * redo a put/undo a delete: add the item to the page. + * If we are undoing a delete, then the information logged is the + * entire entry off the page, not just the data of a dbt. In + * this case, we want to copy it back onto the page verbatim. + * We do this by calling __putitem with the type H_OFFPAGE instead + * of H_KEYDATA. + */ + op = OPCODE_OF(argp->opcode); + + if ((op == DELPAIR && cmp_n == 0 && !redo) || + (op == PUTPAIR && cmp_p == 0 && redo)) { + /* Need to redo a PUT or undo a delete. */ + __ham_putitem(pagep, &argp->key, + !redo || PAIR_ISKEYBIG(argp->opcode) ? + H_OFFPAGE : H_KEYDATA); + __ham_putitem(pagep, &argp->data, + !redo || PAIR_ISDATABIG(argp->opcode) ? + H_OFFPAGE : H_KEYDATA); + + LSN(pagep) = redo ? *lsnp : argp->pagelsn; + if ((ret = __ham_put_page(file_dbp, pagep, 1)) != 0) + goto out; + + } else if ((op == DELPAIR && cmp_p == 0 && redo) + || (op == PUTPAIR && cmp_n == 0 && !redo)) { + /* Need to undo a put or redo a delete. */ + __ham_dpair(file_dbp, pagep, argp->ndx); + LSN(pagep) = redo ? *lsnp : argp->pagelsn; + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) + goto out; + } else + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) + goto out; + + /* Return the previous LSN. */ + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} + +/* + * __ham_newpage_recover -- + * This log message is used when we add/remove overflow pages. This + * message takes care of the pointer chains, not the data on the pages. 
+ * + * PUBLIC: int __ham_newpage_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_newpage_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_newpage_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + PAGE *pagep; + int cmp_n, cmp_p, change, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_newpage_print); + REC_INTRO(__ham_newpage_read); + + ret = memp_fget(mpf, &argp->new_pgno, 0, &pagep); + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + ret = 0; + goto ppage; + } else if ((ret = memp_fget(mpf, &argp->new_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + /* + * There are potentially three pages we need to check: the one + * that we created/deleted, the one before it and the one after + * it. + */ + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + change = 0; + + if ((cmp_p == 0 && redo && argp->opcode == PUTOVFL) || + (cmp_n == 0 && !redo && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. */ + P_INIT(pagep, file_dbp->pgsize, argp->new_pgno, + argp->prev_pgno, argp->next_pgno, 0, P_HASH); + change = 1; + } else if ((cmp_p == 0 && redo && argp->opcode == DELOVFL) || + (cmp_n == 0 && !redo && argp->opcode == PUTOVFL)) { + /* + * Redo a delete or undo a create new page. All we + * really need to do is change the LSN. + */ + change = 1; + } + + if (!change) { + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) + goto out; + } else { + LSN(pagep) = redo ? 
*lsnp : argp->pagelsn; + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) + goto out; + } + + /* Now do the prev page. */ +ppage: if (argp->prev_pgno != PGNO_INVALID) { + ret = memp_fget(mpf, &argp->prev_pgno, 0, &pagep); + + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. + */ + ret = 0; + goto npage; + } else if ((ret = + memp_fget(mpf, &argp->prev_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->prevlsn); + change = 0; + + if ((cmp_p == 0 && redo && argp->opcode == PUTOVFL) || + (cmp_n == 0 && !redo && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. */ + pagep->next_pgno = argp->new_pgno; + change = 1; + } else if ((cmp_p == 0 && redo && argp->opcode == DELOVFL) || + (cmp_n == 0 && !redo && argp->opcode == PUTOVFL)) { + /* Redo a delete or undo a create new page. */ + pagep->next_pgno = argp->next_pgno; + change = 1; + } + + if (!change) { + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) + goto out; + } else { + LSN(pagep) = redo ? *lsnp : argp->prevlsn; + if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) + goto out; + } + } + + /* Now time to do the next page */ +npage: if (argp->next_pgno != PGNO_INVALID) { + ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep); + + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else if ((ret = + memp_fget(mpf, &argp->next_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nextlsn); + change = 0; + + if ((cmp_p == 0 && redo && argp->opcode == PUTOVFL) || + (cmp_n == 0 && !redo && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. */ + pagep->prev_pgno = argp->new_pgno; + change = 1; + } else if ((cmp_p == 0 && redo && argp->opcode == DELOVFL) || + (cmp_n == 0 && !redo && argp->opcode == PUTOVFL)) { + /* Redo a delete or undo a create new page. */ + pagep->prev_pgno = argp->prev_pgno; + change = 1; + } + + if (!change) { + if ((ret = + __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) + goto out; + } else { + LSN(pagep) = redo ? *lsnp : argp->nextlsn; + if ((ret = + __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) + goto out; + } + } + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} + + +/* + * __ham_replace_recover -- + * This log message refers to partial puts that are local to a single + * page. You can think of them as special cases of the more general + * insdel log message. + * + * PUBLIC: int __ham_replace_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_replace_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_replace_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + DBT dbt; + HKEYDATA *hk; + HTAB *hashp; + PAGE *pagep; + int32_t grow; + int change, cmp_n, cmp_p, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_replace_print); + REC_INTRO(__ham_replace_read); + + ret = memp_fget(mpf, &argp->pgno, 0, &pagep); + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. 
That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + + if (cmp_p == 0 && redo) { + change = 1; + /* Reapply the change as specified. */ + dbt.data = argp->newitem.data; + dbt.size = argp->newitem.size; + grow = argp->newitem.size - argp->olditem.size; + LSN(pagep) = *lsnp; + } else if (cmp_n == 0 && !redo) { + change = 1; + /* Undo the already applied change. */ + dbt.data = argp->olditem.data; + dbt.size = argp->olditem.size; + grow = argp->olditem.size - argp->newitem.size; + LSN(pagep) = argp->pagelsn; + } else { + change = 0; + grow = 0; + } + + if (change) { + __ham_onpage_replace(pagep, + file_dbp->pgsize, argp->ndx, argp->off, grow, &dbt); + if (argp->makedup) { + hk = GET_HKEYDATA(pagep, argp->ndx); + if (redo) + hk->type = H_DUPLICATE; + else + hk->type = H_KEYDATA; + } + } + + if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) + goto out; + + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} + +/* + * __ham_newpgno_recover -- + * This log message is used when allocating or deleting an overflow + * page. It takes care of modifying the meta data. + * + * PUBLIC: int __ham_newpgno_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_newpgno_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_newpgno_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + PAGE *pagep; + int change, cmp_n, cmp_p, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. 
*/ + REC_PRINT(__ham_newpgno_print); + REC_INTRO(__ham_newpgno_read); + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + /* + * There are two phases to the recovery here. First we need + * to update the meta data; then we need to update the page. + * We'll do the meta-data first. + */ + cmp_n = log_compare(lsnp, &hashp->hdr->lsn); + cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + + change = 0; + if ((cmp_p == 0 && redo && argp->opcode == ALLOCPGNO) || + (cmp_n == 0 && !redo && argp->opcode == DELPGNO)) { + /* Need to redo an allocation or undo a deletion. */ + hashp->hdr->last_freed = argp->free_pgno; + if (redo && argp->old_pgno != 0) /* Must be ALLOCPGNO */ + hashp->hdr->spares[hashp->hdr->ovfl_point]++; + change = 1; + } else if (cmp_p == 0 && redo && argp->opcode == DELPGNO) { + /* Need to redo a deletion */ + hashp->hdr->last_freed = argp->pgno; + change = 1; + } else if (cmp_n == 0 && !redo && argp->opcode == ALLOCPGNO) { + /* undo an allocation. */ + if (argp->old_pgno == 0) + hashp->hdr->last_freed = argp->pgno; + else { + hashp->hdr->spares[hashp->hdr->ovfl_point]--; + hashp->hdr->last_freed = 0; + } + change = 1; + } + if (change) { + hashp->hdr->lsn = redo ? *lsnp : argp->metalsn; + F_SET(file_dbp, DB_HS_DIRTYMETA); + } + + + /* Now check the newly allocated/freed page. */ + ret = memp_fget(mpf, &argp->pgno, 0, &pagep); + + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + + change = 0; + if (cmp_p == 0 && redo && argp->opcode == ALLOCPGNO) { + /* Need to redo an allocation. 
*/ + P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID, + PGNO_INVALID, 0, argp->new_type); + change = 1; + } else if (cmp_n == 0 && !redo && argp->opcode == DELPGNO) { + /* Undoing a delete. */ + P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID, + argp->old_pgno, 0, argp->old_type); + change = 1; + } else if ((cmp_p == 0 && redo && argp->opcode == DELPGNO) || + (cmp_n == 0 && !redo && argp->opcode == ALLOCPGNO)) { + /* Need to redo a deletion or undo an allocation. */ + NEXT_PGNO(pagep) = argp->free_pgno; + TYPE(pagep) = P_INVALID; + change = 1; + } + if (change) + LSN(pagep) = redo ? *lsnp : argp->pagelsn; + + if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) + goto out; + + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; + +} + +/* + * __ham_splitmeta_recover -- + * This is the meta-data part of the split. Records the new and old + * bucket numbers and the new/old mask information. + * + * PUBLIC: int __ham_splitmeta_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_splitmeta_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_splitmeta_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + int change, cmp_n, cmp_p, getmeta, ret; + u_int32_t pow; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_splitmeta_print); + REC_INTRO(__ham_splitmeta_read); + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + /* + * There are two phases to the recovery here. First we need + * to update the meta data; then we need to update the page. + * We'll do the meta-data first. + */ + cmp_n = log_compare(lsnp, &hashp->hdr->lsn); + cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + + change = 0; + if (cmp_p == 0 && redo) { + /* Need to redo the split information. 
*/ + hashp->hdr->max_bucket = argp->bucket + 1; + pow = __db_log2(hashp->hdr->max_bucket + 1); + if (pow > hashp->hdr->ovfl_point) { + hashp->hdr->spares[pow] = + hashp->hdr->spares[hashp->hdr->ovfl_point]; + hashp->hdr->ovfl_point = pow; + } + if (hashp->hdr->max_bucket > hashp->hdr->high_mask) { + hashp->hdr->low_mask = hashp->hdr->high_mask; + hashp->hdr->high_mask = + hashp->hdr->max_bucket | hashp->hdr->low_mask; + } + change = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo the split information. */ + hashp->hdr->max_bucket = argp->bucket; + hashp->hdr->ovfl_point = argp->ovflpoint; + hashp->hdr->spares[hashp->hdr->ovfl_point] = argp->spares; + pow = 1 << __db_log2(hashp->hdr->max_bucket + 1); + hashp->hdr->high_mask = pow - 1; + hashp->hdr->low_mask = (pow >> 1) - 1; + change = 1; + } + if (change) { + hashp->hdr->lsn = redo ? *lsnp : argp->metalsn; + F_SET(file_dbp, DB_HS_DIRTYMETA); + } + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} + +/* + * __ham_splitdata_recover -- + * + * PUBLIC: int __ham_splitdata_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_splitdata_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_splitdata_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + PAGE *pagep; + int change, cmp_n, cmp_p, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_splitdata_print); + REC_INTRO(__ham_splitdata_read); + + ret = memp_fget(mpf, &argp->pgno, 0, &pagep); + if (ret != 0) + if (!redo) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + + /* + * There are two types of log messages here, one for the old page + * and one for the new pages created. The original image in the + * SPLITOLD record is used for undo. The image in the SPLITNEW + * is used for redo. We should never have a case where there is + * a redo operation and the SPLITOLD record is on disk, but not + * the SPLITNEW record. Therefore, we only have work to do when + * redo NEW messages and undo OLD messages, but we have to update + * LSNs in both cases. + */ + change = 0; + if (cmp_p == 0 && redo) { + if (argp->opcode == SPLITNEW) + /* Need to redo the split described. */ + memcpy(pagep, argp->pageimage.data, + argp->pageimage.size); + LSN(pagep) = *lsnp; + change = 1; + } else if (cmp_n == 0 && !redo) { + if (argp->opcode == SPLITOLD) { + /* Put back the old image. */ + memcpy(pagep, argp->pageimage.data, + argp->pageimage.size); + } else + P_INIT(pagep, file_dbp->pgsize, argp->pgno, + PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + LSN(pagep) = argp->pagelsn; + change = 1; + } + if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) + goto out; + + *lsnp = argp->prev_lsn; + +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} + +/* + * __ham_ovfl_recover -- + * This message is generated when we initialize a set of overflow pages. 
+ * + * PUBLIC: int __ham_ovfl_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__ham_ovfl_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __ham_ovfl_args *argp; + DB *mdbp, *file_dbp; + DB_MPOOLFILE *mpf; + HTAB *hashp; + PAGE *pagep; + db_pgno_t max_pgno, pgno; + int cmp_n, cmp_p, getmeta, ret; + + getmeta = 0; + hashp = NULL; /* XXX: shut the compiler up. */ + REC_PRINT(__ham_ovfl_print); + REC_INTRO(__ham_ovfl_read); + + hashp = (HTAB *)file_dbp->internal; + GET_META(file_dbp, hashp); + getmeta = 1; + file_dbp = NULL; + + cmp_n = log_compare(lsnp, &hashp->hdr->lsn); + cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + + if (cmp_p == 0 && redo) { + /* Redo the allocation. */ + hashp->hdr->last_freed = argp->start_pgno; + hashp->hdr->spares[argp->npages - 1] += argp->npages; + hashp->hdr->lsn = *lsnp; + F_SET(file_dbp, DB_HS_DIRTYMETA); + } else if (cmp_n == 0 && !redo) { + hashp->hdr->last_freed = argp->free_pgno; + hashp->hdr->spares[argp->npages - 1] -= argp->npages; + hashp->hdr->lsn = argp->metalsn; + F_SET(file_dbp, DB_HS_DIRTYMETA); + } + + max_pgno = argp->start_pgno + argp->npages - 1; + ret = 0; + for (pgno = argp->start_pgno; pgno <= max_pgno; pgno++) { + ret = memp_fget(mpf, &pgno, 0, &pagep); + if (ret != 0) { + if (redo && (ret = memp_fget(mpf, &pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + else if (!redo) { + (void)__ham_put_page(file_dbp, pagep, 0); + continue; + } + } + if (redo && log_compare((const DB_LSN *)lsnp, + (const DB_LSN *)&LSN(pagep)) > 0) { + P_INIT(pagep, file_dbp->pgsize, pgno, PGNO_INVALID, + pgno == max_pgno ? 
argp->free_pgno : pgno + 1, + 0, P_HASH); + LSN(pagep) = *lsnp; + ret = __ham_put_page(file_dbp, pagep, 1); + } else if (!redo) { + ZERO_LSN(pagep->lsn); + ret = __ham_put_page(file_dbp, pagep, 1); + } else + ret = __ham_put_page(file_dbp, pagep, 0); + if (ret) + goto out; + } + + *lsnp = argp->prev_lsn; +out: if (getmeta) + RELEASE_META(file_dbp, hashp); + REC_CLOSE; +} diff --git a/db2/hash/hash_stat.c b/db2/hash/hash_stat.c new file mode 100644 index 0000000000..99c6078d86 --- /dev/null +++ b/db2/hash/hash_stat.c @@ -0,0 +1,58 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)hash_stat.c 10.6 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" +#include "common_ext.h" + +/* + * __ham_stat -- + * Gather/print the hash statistics. 
+ * + * PUBLIC: int __ham_stat __P((DB *, FILE *)); + */ +int +__ham_stat(dbp, fp) + DB *dbp; + FILE *fp; +{ + HTAB *hashp; + int i; + + hashp = (HTAB *)dbp->internal; + + fprintf(fp, "hash: accesses %lu collisions %lu\n", + hashp->hash_accesses, hashp->hash_collisions); + fprintf(fp, "hash: expansions %lu\n", hashp->hash_expansions); + fprintf(fp, "hash: overflows %lu\n", hashp->hash_overflows); + fprintf(fp, "hash: big key/data pages %lu\n", hashp->hash_bigpages); + + SET_LOCKER(dbp, NULL); + GET_META(dbp, hashp); + fprintf(fp, "keys %lu maxp %lu\n", + (u_long)hashp->hdr->nelem, (u_long)hashp->hdr->max_bucket); + + for (i = 0; i < NCACHED; i++) + fprintf(fp, + "spares[%d] = %lu\n", i, (u_long)hashp->hdr->spares[i]); + + RELEASE_META(dbp, hashp); + return (0); +} diff --git a/db2/include/btree.h b/db2/include/btree.h new file mode 100644 index 0000000000..5cf4224ae6 --- /dev/null +++ b/db2/include/btree.h @@ -0,0 +1,312 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)btree.h 10.16 (Sleepycat) 8/24/97 + */ + +/* Forward structure declarations. */ +struct __btree; typedef struct __btree BTREE; +struct __cursor; typedef struct __cursor CURSOR; +struct __epg; typedef struct __epg EPG; +struct __rcursor; typedef struct __rcursor RCURSOR; +struct __recno; typedef struct __recno RECNO; + +#undef DEFMINKEYPAGE /* Minimum keys per page */ +#define DEFMINKEYPAGE (2) + +#undef ISINTERNAL /* If an internal page. */ +#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) +#undef ISLEAF /* If a leaf page. */ +#define ISLEAF(p) (TYPE(p) == P_LBTREE || TYPE(p) == P_LRECNO) + +/* Allocate and discard thread structures. 
*/ +#define GETHANDLE(dbp, set_txn, dbpp, ret) { \ + if (F_ISSET(dbp, DB_AM_THREAD)) { \ + if ((ret = __db_gethandle(dbp, __bam_bdup, dbpp)) != 0) \ + return (ret); \ + } else \ + *dbpp = dbp; \ + *dbpp->txn = set_txn; \ +} +#define PUTHANDLE(dbp) { \ + dbp->txn = NULL; \ + if (F_ISSET(dbp, DB_AM_THREAD)) \ + __db_puthandle(dbp); \ +} + +/* + * If doing transactions we have to hold the locks associated with a data item + * from a page for the entire transaction. However, we don't have to hold the + * locks associated with walking the tree. Distinguish between the two so that + * we don't tie up the internal pages of the tree longer than necessary. + */ +#define __BT_LPUT(dbp, lock) \ + (F_ISSET((dbp), DB_AM_LOCKING) ? \ + lock_put((dbp)->dbenv->lk_info, lock) : 0) +#define __BT_TLPUT(dbp, lock) \ + (F_ISSET((dbp), DB_AM_LOCKING) && (dbp)->txn == NULL ? \ + lock_put((dbp)->dbenv->lk_info, lock) : 0) + +/* + * Flags to __bt_search() and __rec_search(). + * + * Note, internal page searches must find the largest record less than key in + * the tree so that descents work. Leaf page searches must find the smallest + * record greater than key so that the returned index is the record's correct + * position for insertion. + * + * The flags parameter to the search routines describes three aspects of the + * search: the type of locking required (including if we're locking a pair of + * pages), the item to return in the presence of duplicates and whether or not + * to return deleted entries. To simplify both the mnemonic representation + * and the code that checks for various cases, we construct a set of bitmasks. + */ +#define S_READ 0x0001 /* Read locks. */ +#define S_WRITE 0x0002 /* Write locks. */ + +#define S_APPEND 0x0040 /* Append to the tree. */ +#define S_DELNO 0x0080 /* Don't return deleted items. */ +#define S_DUPFIRST 0x0100 /* Return first duplicate. */ +#define S_DUPLAST 0x0200 /* Return last duplicate. */ +#define S_EXACT 0x0400 /* Exact items only. 
*/ +#define S_PARENT 0x0800 /* Lock page pair. */ + +#define S_DELETE (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT) +#define S_FIND (S_READ | S_DUPFIRST | S_DELNO) +#define S_INSERT (S_WRITE | S_DUPLAST) +#define S_KEYFIRST (S_WRITE | S_DUPFIRST) +#define S_KEYLAST (S_WRITE | S_DUPLAST) +#define S_WRPAIR (S_WRITE | S_DUPLAST | S_PARENT) + +/* + * Flags to __bam_iitem(). + */ +#define BI_NEWKEY 0x01 /* New key. */ +#define BI_DELETED 0x02 /* Key/data pair only placeholder. */ + +/* + * Various routines pass around page references. A page reference can be a + * pointer to the page or a page number; for either, an indx can designate + * an item on the page. + */ +struct __epg { + PAGE *page; /* The page. */ + db_indx_t indx; /* The index on the page. */ + DB_LOCK lock; /* The page's lock. */ +}; + +/* + * Btree cursor. + * + * Arguments passed to __bam_ca_replace(). + */ +typedef enum { + REPLACE_SETUP, + REPLACE_SUCCESS, + REPLACE_FAILED +} ca_replace_arg; +struct __cursor { + DBC *dbc; /* Enclosing DBC. */ + + PAGE *page; /* Cursor page. */ + + db_pgno_t pgno; /* Page. */ + db_indx_t indx; /* Page item ref'd by the cursor. */ + + db_pgno_t dpgno; /* Duplicate page. */ + db_indx_t dindx; /* Page item ref'd by the cursor. */ + + DB_LOCK lock; /* Cursor read lock. */ + db_lockmode_t mode; /* Lock mode. */ + + /* + * If a cursor record is deleted, the key/data pair has to remain on + * the page so that subsequent inserts/deletes don't interrupt the + * cursor progression through the file. This results in interesting + * cases when "standard" operations, e.g., dbp->put() are done in the + * context of "deleted" cursors. + * + * C_DELETED -- The item referenced by the cursor has been "deleted" + * but not physically removed from the page. + * C_REPLACE -- The "deleted" item referenced by a cursor has been + * replaced by a dbp->put(), so the cursor is no longer + * responsible for physical removal from the page. 
+ * C_REPLACE_SETUP -- + * We are about to overwrite a "deleted" item, flag any + * cursors referencing it for transition to C_REPLACE + * state. + */ +#define C_DELETED 0x0001 +#define C_REPLACE 0x0002 +#define C_REPLACE_SETUP 0x0004 + u_int32_t flags; +}; + +/* + * Recno cursor. + * + * Arguments passed to __ram_ca(). + */ +typedef enum { + CA_DELETE, + CA_IAFTER, + CA_IBEFORE +} ca_recno_arg; +struct __rcursor { + DBC *dbc; /* Enclosing DBC. */ + + db_recno_t recno; /* Current record number. */ + + /* + * Cursors referencing "deleted" records are positioned between + * two records, and so must be specially adjusted until they are + * moved. + */ +#define CR_DELETED 0x0001 /* Record deleted. */ + u_int32_t flags; +}; + +/* + * We maintain a stack of the pages that we're locking in the tree. Btree's + * (currently) only save two levels of the tree at a time, so the default + * stack is always large enough. Recno trees have to lock the entire tree to + * do inserts/deletes, however. Grow the stack as necessary. + */ +#undef BT_STK_CLR +#define BT_STK_CLR(t) \ + ((t)->bt_csp = (t)->bt_sp) + +#undef BT_STK_ENTER +#define BT_STK_ENTER(t, pagep, page_indx, lock, ret) do { \ + if ((ret = \ + (t)->bt_csp == (t)->bt_esp ? __bam_stkgrow(t) : 0) == 0) { \ + (t)->bt_csp->page = pagep; \ + (t)->bt_csp->indx = page_indx; \ + (t)->bt_csp->lock = lock; \ + } \ +} while (0) + +#undef BT_STK_PUSH +#define BT_STK_PUSH(t, pagep, page_indx, lock, ret) do { \ + BT_STK_ENTER(t, pagep, page_indx, lock, ret); \ + ++(t)->bt_csp; \ +} while (0) + +#undef BT_STK_POP +#define BT_STK_POP(t) \ + ((t)->bt_csp == (t)->bt_stack ? NULL : --(t)->bt_csp) + +/* + * The in-memory recno data structure. + * + * !!! + * These fields are ignored as far as multi-threading is concerned. There + * are no transaction semantics associated with backing files, nor is there + * any thread protection. + */ +#undef RECNO_OOB +#define RECNO_OOB 0 /* Illegal record number. 
*/ + +struct __recno { + int re_delim; /* Variable-length delimiting byte. */ + int re_pad; /* Fixed-length padding byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + + char *re_source; /* Source file name. */ + int re_fd; /* Source file descriptor */ + db_recno_t re_last; /* Last record number read. */ + void *re_cmap; /* Current point in mapped space. */ + void *re_smap; /* Start of mapped space. */ + void *re_emap; /* End of mapped space. */ + size_t re_msize; /* Size of mapped region. */ + /* Recno input function. */ + int (*re_irec) __P((DB *, db_recno_t)); + +#define RECNO_EOF 0x0001 /* EOF on backing source file. */ +#define RECNO_MODIFIED 0x0002 /* Tree was modified. */ + u_int32_t flags; +}; + +/* + * The in-memory btree data structure. + */ +struct __btree { +/* + * These fields are per-thread and are initialized when the BTREE structure + * is created. + */ + db_pgno_t bt_lpgno; /* Last insert location. */ + + DBT bt_rkey; /* Returned key. */ + DBT bt_rdata; /* Returned data. */ + + EPG *bt_sp; /* Stack pointer. */ + EPG *bt_csp; /* Current stack entry. */ + EPG *bt_esp; /* End stack pointer. */ + EPG bt_stack[5]; + + RECNO *bt_recno; /* Private recno structure. */ + + DB_BTREE_LSTAT lstat; /* Btree local statistics. */ + +/* + * These fields are copied from the original BTREE structure and never + * change. + */ + db_indx_t bt_maxkey; /* Maximum keys per page. */ + db_indx_t bt_minkey; /* Minimum keys per page. */ + + int (*bt_compare) /* Comparison function. */ + __P((const DBT *, const DBT *)); + size_t(*bt_prefix) /* Prefix function. */ + __P((const DBT *, const DBT *)); + + db_indx_t bt_ovflsize; /* Maximum key/data on-page size. 
*/ +}; + +#include "btree_auto.h" +#include "btree_ext.h" +#include "db_am.h" +#include "common_ext.h" diff --git a/db2/include/btree_auto.h b/db2/include/btree_auto.h new file mode 100644 index 0000000000..b422e1db1b --- /dev/null +++ b/db2/include/btree_auto.h @@ -0,0 +1,108 @@ +/* Do not edit: automatically built by dist/db_gen.sh. */ +#ifndef bam_AUTO_H +#define bam_AUTO_H + +#define DB_bam_pg_alloc (DB_bam_BEGIN + 1) + +typedef struct _bam_pg_alloc_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + DB_LSN meta_lsn; + DB_LSN page_lsn; + db_pgno_t pgno; + u_int32_t ptype; + db_pgno_t next; +} __bam_pg_alloc_args; + + +#define DB_bam_pg_free (DB_bam_BEGIN + 2) + +typedef struct _bam_pg_free_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN meta_lsn; + DBT header; + db_pgno_t next; +} __bam_pg_free_args; + + +#define DB_bam_split (DB_bam_BEGIN + 3) + +typedef struct _bam_split_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t left; + DB_LSN llsn; + db_pgno_t right; + DB_LSN rlsn; + u_int32_t indx; + db_pgno_t npgno; + DB_LSN nlsn; + DBT pg; +} __bam_split_args; + + +#define DB_bam_rsplit (DB_bam_BEGIN + 4) + +typedef struct _bam_rsplit_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DBT pgdbt; + DBT rootent; + DB_LSN rootlsn; +} __bam_rsplit_args; + + +#define DB_bam_adj (DB_bam_BEGIN + 5) + +typedef struct _bam_adj_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; + u_int32_t indx; + u_int32_t indx_copy; + u_int32_t is_insert; +} __bam_adj_args; + + +#define DB_bam_cadjust (DB_bam_BEGIN + 6) + +typedef struct _bam_cadjust_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; + u_int32_t indx; + int32_t adjust; + int32_t total; +} __bam_cadjust_args; + + 
+#define DB_bam_cdel (DB_bam_BEGIN + 7) + +typedef struct _bam_cdel_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; + u_int32_t indx; +} __bam_cdel_args; + +#endif diff --git a/db2/include/btree_ext.h b/db2/include/btree_ext.h new file mode 100644 index 0000000000..dab0f5be4e --- /dev/null +++ b/db2/include/btree_ext.h @@ -0,0 +1,121 @@ +/* Do not edit: automatically built by dist/distrib. */ +int __bam_close __P((DB *)); +int __bam_sync __P((DB *, int)); +int __bam_cmp __P((DB *, const DBT *, EPG *)); +int __bam_defcmp __P((const DBT *, const DBT *)); +size_t __bam_defpfx __P((const DBT *, const DBT *)); +int __bam_pgin __P((db_pgno_t, void *, DBT *)); +int __bam_pgout __P((db_pgno_t, void *, DBT *)); +int __bam_mswap __P((PAGE *)); +int __bam_cursor __P((DB *, DB_TXN *, DBC **)); +int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); +int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); +int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *)); +void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); +void __bam_ca_dup __P((DB *, + db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); +void __bam_ca_move __P((DB *, BTREE *, db_pgno_t, db_pgno_t)); +void __bam_ca_replace + __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); +void __bam_ca_split __P((DB *, + db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); +int __bam_delete __P((DB *, DB_TXN *, DBT *, int)); +int __ram_delete __P((DB *, DB_TXN *, DBT *, int)); +int __bam_ditem __P((DB *, PAGE *, u_int32_t)); +int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int)); +int __bam_dpage __P((DB *, const DBT *)); +int __bam_open __P((DB *, DBTYPE, DB_INFO *)); +int __bam_bdup __P((DB *, DB *)); +int __bam_new __P((DB *, u_int32_t, PAGE **)); +int __bam_free __P((DB *, PAGE *)); +int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); +int __bam_lput __P((DB *, DB_LOCK)); +int __bam_pget __P((DB *, PAGE **, db_pgno_t 
*, int)); +int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); +int __bam_iitem __P((DB *, + PAGE **, db_indx_t *, DBT *, DBT *, int, int)); +int __bam_pg_alloc_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_pg_free_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_split_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_rsplit_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_adj_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_cadjust_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_cdel_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __ram_open __P((DB *, DBTYPE, DB_INFO *)); +int __ram_cursor __P((DB *, DB_TXN *, DBC **)); +int __ram_close __P((DB *)); +void __ram_ca __P((DB *, db_recno_t, ca_recno_arg)); +int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); +int __ram_snapshot __P((DB *)); +int __bam_rsearch __P((DB *, db_recno_t *, u_int, int, int *)); +int __bam_adjust __P((DB *, BTREE *, int)); +int __bam_nrecs __P((DB *, db_recno_t *)); +db_recno_t __bam_total __P((PAGE *)); +int __bam_search __P((DB *, + const DBT *, u_int, int, db_recno_t *, int *)); +int __bam_stkrel __P((DB *)); +int __bam_stkgrow __P((BTREE *)); +int __bam_split __P((DB *, void *)); +int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); +int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); +int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); +int __bam_stat __P((DB *, void *, void *(*)(size_t), int)); +void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); +int __bam_pg_alloc_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t, + u_int32_t, db_pgno_t)); +int __bam_pg_alloc_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_pg_alloc_read __P((void *, __bam_pg_alloc_args **)); +int __bam_pg_free_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, 
DB_LSN *, DBT *, + db_pgno_t)); +int __bam_pg_free_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_pg_free_read __P((void *, __bam_pg_free_args **)); +int __bam_split_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + DB_LSN *, u_int32_t, db_pgno_t, DB_LSN *, + DBT *)); +int __bam_split_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_split_read __P((void *, __bam_split_args **)); +int __bam_rsplit_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DBT *, DBT *, + DB_LSN *)); +int __bam_rsplit_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_rsplit_read __P((void *, __bam_rsplit_args **)); +int __bam_adj_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + u_int32_t, u_int32_t)); +int __bam_adj_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_adj_read __P((void *, __bam_adj_args **)); +int __bam_cadjust_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + int32_t, int32_t)); +int __bam_cadjust_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_cadjust_read __P((void *, __bam_cadjust_args **)); +int __bam_cdel_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, u_int32_t)); +int __bam_cdel_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __bam_cdel_read __P((void *, __bam_cdel_args **)); +int __bam_init_print __P((DB_ENV *)); +int __bam_init_recover __P((DB_ENV *)); diff --git a/db2/include/clib_ext.h b/db2/include/clib_ext.h new file mode 100644 index 0000000000..8ccd2b559f --- /dev/null +++ b/db2/include/clib_ext.h @@ -0,0 +1,65 @@ +/* Do not edit: automatically built by dist/distrib. 
*/ +#ifdef __STDC__ +void err __P((int eval, const char *, ...)); +#else +void err(); +#endif +#ifdef __STDC__ +void errx __P((int eval, const char *, ...)); +#else +void errx(); +#endif +#ifdef __STDC__ +void warn __P((const char *, ...)); +#else +void warn(); +#endif +#ifdef __STDC__ +void warnx __P((const char *, ...)); +#else +void warnx(); +#endif +#ifndef HAVE_GETCWD +char *getcwd __P((char *, size_t)); +#endif +void get_long __P((char *, long, long, long *)); +#ifndef HAVE_GETOPT +int getopt __P((int, char * const *, const char *)); +#endif +#ifndef HAVE_MEMCMP +int memcmp __P((const void *, const void *, size_t)); +#endif +#ifndef HAVE_MEMCPY +void *memcpy __P((void *, const void *, size_t)); +#endif +#ifndef HAVE_MEMMOVE +void *memmove __P((void *, const void *, size_t)); +#endif +#ifndef HAVE_MEMCPY +void *memcpy __P((void *, const void *, size_t)); +#endif +#ifndef HAVE_MEMMOVE +void *memmove __P((void *, const void *, size_t)); +#endif +#ifndef HAVE_RAISE +int raise __P((int)); +#endif +#ifndef HAVE_SNPRINTF +#ifdef __STDC__ +int snprintf __P((char *, size_t, const char *, ...)); +#else +int snprintf(); +#endif +#endif +#ifndef HAVE_STRDUP +char *strdup __P((const char *)); +#endif +#ifndef HAVE_STRERROR +char *strerror __P((int)); +#endif +#ifndef HAVE_STRSEP +char *strsep __P((char **, const char *)); +#endif +#ifndef HAVE_VSNPRINTF +int vsnprintf(); +#endif diff --git a/db2/include/common_ext.h b/db2/include/common_ext.h new file mode 100644 index 0000000000..9840162a12 --- /dev/null +++ b/db2/include/common_ext.h @@ -0,0 +1,41 @@ +/* Do not edit: automatically built by dist/distrib. 
*/ +int __db_appname __P((DB_ENV *, + APPNAME, const char *, const char *, int *, char **)); +int __db_apprec __P((DB_ENV *, int)); +int __db_byteorder __P((DB_ENV *, int)); +#ifdef __STDC__ +void __db_err __P((const DB_ENV *dbenv, const char *fmt, ...)); +#else +void __db_err(); +#endif +int __db_panic __P((DB *)); +int __db_fchk __P((DB_ENV *, const char *, int, int)); +int __db_fcchk __P((DB_ENV *, const char *, int, int, int)); +int __db_cdelchk __P((const DB *, int, int, int)); +int __db_cgetchk __P((const DB *, DBT *, DBT *, int, int)); +int __db_cputchk __P((const DB *, + const DBT *, DBT *, int, int, int)); +int __db_delchk __P((const DB *, int, int)); +int __db_getchk __P((const DB *, const DBT *, DBT *, int)); +int __db_putchk __P((const DB *, DBT *, const DBT *, int, int, int)); +int __db_statchk __P((const DB *, int)); +int __db_syncchk __P((const DB *, int)); +int __db_ferr __P((const DB_ENV *, const char *, int)); +u_int32_t __db_log2 __P((u_int32_t)); +int __db_rcreate __P((DB_ENV *, APPNAME, + const char *, const char *, int, size_t, int *, void *)); +int __db_ropen __P((DB_ENV *, + APPNAME, const char *, const char *, int, int *, void *)); +int __db_rclose __P((DB_ENV *, int, void *)); +int __db_runlink __P((DB_ENV *, + APPNAME, const char *, const char *, int)); +int __db_rgrow __P((DB_ENV *, int, size_t)); +int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *)); +void __db_shalloc_init __P((void *, size_t)); +int __db_shalloc __P((void *, size_t, size_t, void *)); +void __db_shalloc_free __P((void *, void *)); +size_t __db_shalloc_count __P((void *)); +size_t __db_shsizeof __P((void *)); +void __db_shalloc_dump __P((FILE *, void *)); +int __db_tablesize __P((int)); +void __db_hashinit __P((void *, int)); diff --git a/db2/include/cxx_int.h b/db2/include/cxx_int.h new file mode 100644 index 0000000000..bf7a09602d --- /dev/null +++ b/db2/include/cxx_int.h @@ -0,0 +1,118 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)cxx_int.h 10.4 (Sleepycat) 8/22/97 + */ + +#ifndef _CXX_INT_H_ +#define _CXX_INT_H_ + +// private data structures known to the implementation only + +#include <assert.h> // used by defines below + +// +// Using FooImp classes will allow the implementation to change in the +// future without any modification to user code or even to header files +// that the user includes. FooImp * is just like void * except that it +// provides a little extra protection, since you cannot randomly assign +// any old pointer to a FooImp* as you can with void *. Currently, a +// pointer to such an opaque class is always just a pointer to the +// appropriate underlying implementation struct. These are converted +// back and forth using the various overloaded wrap()/unwrap() methods. +// This is essentially a use of the "Bridge" Design Pattern. +// +// WRAPPED_CLASS implements the appropriate wrap() and unwrap() methods +// for a wrapper class that has an underlying pointer representation. +// +#define WRAPPED_CLASS(_WRAPPER_CLASS, _IMP_CLASS, _WRAPPED_TYPE) \ + \ + class _IMP_CLASS {}; \ + \ + inline _WRAPPED_TYPE unwrap(_WRAPPER_CLASS *val) \ + { \ + if (!val) return 0; \ + return (_WRAPPED_TYPE)(val->imp()); \ + } \ + \ + inline const _WRAPPED_TYPE unwrapConst(const _WRAPPER_CLASS *val) \ + { \ + if (!val) return 0; \ + return (const _WRAPPED_TYPE)(val->imp()); \ + } \ + \ + inline _IMP_CLASS *wrap(_WRAPPED_TYPE val) \ + { \ + return (_IMP_CLASS*)val; \ + } + +WRAPPED_CLASS(DbLockTab, DbLockTabImp, DB_LOCKTAB*) +WRAPPED_CLASS(DbLog, DbLogImp, DB_LOG*) +WRAPPED_CLASS(DbMpool, DbMpoolImp, DB_MPOOL*) +WRAPPED_CLASS(DbMpoolFile, DbMpoolFileImp, DB_MPOOLFILE*) +WRAPPED_CLASS(Db, DbImp, DB*) +WRAPPED_CLASS(DbTxn, DbTxnImp, DB_TXN*) +WRAPPED_CLASS(DbTxnMgr, DbTxnMgrImp, DB_TXNMGR*) + +// Macros that handle detected errors, in case we want to +// change the default behavior. 
runtime_error() throws an +// exception by default. +// +// Since it's unusual to throw an exception in a destructor, +// we have a separate macro. For now, we silently ignore such +// detected errors. +// +#define DB_ERROR(caller, ecode) \ + DbEnv::runtime_error(caller, ecode) + +#define DB_DESTRUCTOR_ERROR(caller, ecode) \ + DbEnv::runtime_error(caller, ecode, 1) + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// These defines are for tedious flag or field set/get access methods. +// + +// Define setName() and getName() methods that twiddle +// the _flags field. +// +#define DB_FLAG_METHODS(_class, _flags, _cxx_name, _flag_name) \ + \ +void _class::set##_cxx_name(int onOrOff) \ +{ \ + if (onOrOff) \ + _flags |= _flag_name; \ + else \ + _flags &= ~(_flag_name); \ +} \ + \ +int _class::get##_cxx_name() const \ +{ \ + return (_flags & _flag_name) ? 1 : 0; \ +} + + +#define DB_RO_ACCESS(_class, _type, _cxx_name, _field) \ + \ +_type _class::get_##_cxx_name() const \ +{ \ + return _field; \ +} + +#define DB_WO_ACCESS(_class, _type, _cxx_name, _field) \ + \ +void _class::set_##_cxx_name(_type value) \ +{ \ + _field = value; \ +} \ + +#define DB_RW_ACCESS(_class, _type, _cxx_name, _field) \ + DB_RO_ACCESS(_class, _type, _cxx_name, _field) \ + DB_WO_ACCESS(_class, _type, _cxx_name, _field) + +#endif /* !_CXX_INT_H_ */ diff --git a/db2/include/db.h.src b/db2/include/db.h.src new file mode 100644 index 0000000000..f9b29fa2af --- /dev/null +++ b/db2/include/db.h.src @@ -0,0 +1,796 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db.h.src 10.67 (Sleepycat) 8/25/97 + */ + +#ifndef _DB_H_ +#define _DB_H_ + +#ifndef __NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#endif + +/* + * XXX + * MacOS: ensure that Metrowerks C makes enumeration types int sized. 
+ */ +#ifdef __MWERKS__ +#pragma enumsalwaysint on +#endif + +/* + * XXX + * Handle function prototypes and the keyword "const". This steps on name + * space that DB doesn't control, but all of the other solutions are worse. + */ +#undef __P +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* ANSI C prototypes */ +#else +#define const +#define __P(protos) () /* K&R C preprocessor */ +#endif + +/* + * !!! + * DB needs basic information about specifically sized types. If they're + * not provided by the system, typedef them here. + * + * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__, + * as does BIND and Kerberos, since we don't know for sure what #include + * files the user is using. + * + * !!! + * We also provide the standard u_int, u_long etc., if they're not provided + * by the system. This isn't completely necessary, but the example programs + * need them. + */ +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ +@u_int8_decl@ +@int16_decl@ +@u_int16_decl@ +@int32_decl@ +@u_int32_decl@ +#endif + +@u_char_decl@ +@u_short_decl@ +@u_int_decl@ +@u_long_decl@ + +#define DB_VERSION_MAJOR 2 +#define DB_VERSION_MINOR 3 +#define DB_VERSION_PATCH 4 +#define DB_VERSION_STRING "Sleepycat Software: DB 2.3.4: (8/20/97)" + +typedef u_int32_t db_pgno_t; /* Page number type. */ +typedef u_int16_t db_indx_t; /* Page offset type. */ +#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ + +typedef u_int32_t db_recno_t; /* Record number type. */ +typedef size_t DB_LOCK; /* Object returned by lock manager. */ +#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ + +#define DB_FILE_ID_LEN 20 /* DB file ID length. */ + +/* Forward structure declarations, so applications get type checking. 
*/ +struct __db; typedef struct __db DB; +#ifdef DB_DBM_HSEARCH + typedef struct __db DBM; +#endif +struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT; +struct __db_dbt; typedef struct __db_dbt DBT; +struct __db_env; typedef struct __db_env DB_ENV; +struct __db_info; typedef struct __db_info DB_INFO; +struct __db_lockregion; typedef struct __db_lockregion DB_LOCKREGION; +struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ; +struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB; +struct __db_log; typedef struct __db_log DB_LOG; +struct __db_lsn; typedef struct __db_lsn DB_LSN; +struct __db_mpool; typedef struct __db_mpool DB_MPOOL; +struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT; +struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT; +struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE; +struct __db_txn; typedef struct __db_txn DB_TXN; +struct __db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE; +struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT; +struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR; +struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION; +struct __dbc; typedef struct __dbc DBC; + +/* Key/data structure -- a Data-Base Thang. */ +struct __db_dbt { + void *data; /* key/data */ + u_int32_t size; /* key/data length */ + u_int32_t ulen; /* RO: length of user buffer. */ + u_int32_t dlen; /* RO: get/put record length. */ + u_int32_t doff; /* RO: get/put record offset. */ + +#define DB_DBT_INTERNAL 0x01 /* Perform any mallocs using regular + malloc, not the user's malloc. */ +#define DB_DBT_MALLOC 0x02 /* Return in allocated memory. */ +#define DB_DBT_PARTIAL 0x04 /* Partial put/get. */ +#define DB_DBT_USERMEM 0x08 /* Return in user's memory. */ + u_int32_t flags; +}; + +/* + * Database configuration and initialization. + */ + /* + * Flags understood by both db_open(3) and db_appinit(3). 
+ */ +#define DB_CREATE 0x00001 /* O_CREAT: create file as necessary. */ +#define DB_NOMMAP 0x00002 /* Don't mmap underlying file. */ +#define DB_THREAD 0x00004 /* Free-thread DB package handles. */ + +/* + * Flags understood by db_appinit(3). + * + * DB_APP_INIT and DB_MUTEXDEBUG are internal only, and not documented. + */ +/* 0x00007 COMMON MASK. */ +#define DB_APP_INIT 0x00008 /* Appinit called, paths initialized. */ +#define DB_INIT_LOCK 0x00010 /* Initialize locking. */ +#define DB_INIT_LOG 0x00020 /* Initialize logging. */ +#define DB_INIT_MPOOL 0x00040 /* Initialize mpool. */ +#define DB_INIT_TXN 0x00080 /* Initialize transactions. */ +#define DB_MPOOL_PRIVATE 0x00100 /* Mpool: private memory pool. */ +#define DB_MUTEXDEBUG 0x00200 /* Do not get/set mutexes in regions. */ +#define DB_RECOVER 0x00400 /* Run normal recovery. */ +#define DB_RECOVER_FATAL 0x00800 /* Run catastrophic recovery. */ +#define DB_TXN_NOSYNC 0x01000 /* Do not sync log on commit. */ +#define DB_USE_ENVIRON 0x02000 /* Use the environment. */ +#define DB_USE_ENVIRON_ROOT 0x04000 /* Use the environment if root. */ + +/* CURRENTLY UNUSED LOCK FLAGS. */ +#define DB_TXN_LOCK_2PL 0x00000 /* Two-phase locking. */ +#define DB_TXN_LOCK_OPTIMISTIC 0x00000 /* Optimistic locking. */ +#define DB_TXN_LOCK_MASK 0x00000 /* Lock flags mask. */ + +/* CURRENTLY UNUSED LOG FLAGS. */ +#define DB_TXN_LOG_REDO 0x00000 /* Redo-only logging. */ +#define DB_TXN_LOG_UNDO 0x00000 /* Undo-only logging. */ +#define DB_TXN_LOG_UNDOREDO 0x00000 /* Undo/redo write-ahead logging. */ +#define DB_TXN_LOG_MASK 0x00000 /* Log flags mask. */ + +/* + * Flags understood by db_open(3). + * + * DB_EXCL and DB_TEMPORARY are internal only, and not documented. + * DB_SEQUENTIAL is currently internal, but likely to be exported some day. + */ +/* 0x00007 COMMON MASK. */ +/* 0x07fff ALREADY USED. */ +#define DB_EXCL 0x08000 /* O_EXCL: exclusive open. */ +#define DB_RDONLY 0x10000 /* O_RDONLY: read-only. 
*/ +#define DB_SEQUENTIAL 0x20000 /* Indicate sequential access. */ +#define DB_TEMPORARY 0x40000 /* Remove on last close. */ +#define DB_TRUNCATE 0x80000 /* O_TRUNC: replace existing DB. */ + +/* + * Deadlock detector modes; used in the DB_ENV structure to configure the + * locking subsystem. + */ +#define DB_LOCK_NORUN 0x0 +#define DB_LOCK_DEFAULT 0x1 +#define DB_LOCK_OLDEST 0x2 +#define DB_LOCK_RANDOM 0x3 +#define DB_LOCK_YOUNGEST 0x4 + +struct __db_env { + int db_lorder; /* Byte order. */ + + /* Error message callback. */ + void (*db_errcall) __P((const char *, char *)); + FILE *db_errfile; /* Error message file stream. */ + const char *db_errpfx; /* Error message prefix. */ + int db_verbose; /* Generate debugging messages. */ + + /* User paths. */ + char *db_home; /* Database home. */ + char *db_log_dir; /* Database log file directory. */ + char *db_tmp_dir; /* Database tmp file directory. */ + + char **db_data_dir; /* Database data file directories. */ + int data_cnt; /* Database data file slots. */ + int data_next; /* Next Database data file slot. */ + + /* Locking. */ + DB_LOCKTAB *lk_info; /* Return from lock_open(). */ + u_int8_t *lk_conflicts; /* Two dimensional conflict matrix. */ + int lk_modes; /* Number of lock modes in table. */ + unsigned int lk_max; /* Maximum number of locks. */ + u_int32_t lk_detect; /* Deadlock detect on every conflict. */ + int (*db_yield) __P((void)); /* Yield function for threads. */ + + /* Logging. */ + DB_LOG *lg_info; /* Return from log_open(). */ + u_int32_t lg_max; /* Maximum file size. */ + + /* Memory pool. */ + DB_MPOOL *mp_info; /* Return from memp_open(). */ + size_t mp_mmapsize; /* Maximum file size for mmap. */ + size_t mp_size; /* Bytes in the mpool cache. */ + + /* Transactions. */ + DB_TXNMGR *tx_info; /* Return from txn_open(). */ + unsigned int tx_max; /* Maximum number of transactions. */ + int (*tx_recover) /* Dispatch function for recovery. 
*/ + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + + u_int32_t flags; /* Flags. */ +}; + +/******************************************************* + * Access methods. + *******************************************************/ +typedef enum { + DB_BTREE=1, /* B+tree. */ + DB_HASH, /* Extended Linear Hashing. */ + DB_RECNO, /* Fixed and variable-length records. */ + DB_UNKNOWN /* Figure it out on open. */ +} DBTYPE; + +#define DB_BTREEVERSION 6 /* Current btree version. */ +#define DB_BTREEOLDVER 6 /* Oldest btree version supported. */ +#define DB_BTREEMAGIC 0x053162 + +#define DB_HASHVERSION 5 /* Current hash version. */ +#define DB_HASHOLDVER 4 /* Oldest hash version supported. */ +#define DB_HASHMAGIC 0x061561 + +#define DB_LOGVERSION 2 /* Current log version. */ +#define DB_LOGOLDVER 2 /* Oldest log version supported. */ +#define DB_LOGMAGIC 0x040988 + +struct __db_info { + int db_lorder; /* Byte order. */ + size_t db_cachesize; /* Underlying cache size. */ + size_t db_pagesize; /* Underlying page size. */ + + /* Local heap allocation. */ + void *(*db_malloc) __P((size_t)); + + /* Btree access method. */ + int bt_maxkey; /* Maximum keys per page. */ + int bt_minkey; /* Minimum keys per page. */ + int (*bt_compare) /* Comparison function. */ + __P((const DBT *, const DBT *)); + size_t (*bt_prefix) /* Prefix function. */ + __P((const DBT *, const DBT *)); + + /* Hash access method. */ + unsigned int h_ffactor; /* Fill factor. */ + unsigned int h_nelem; /* Number of elements. */ + u_int32_t (*h_hash) /* Hash function. */ + __P((const void *, u_int32_t)); + + /* Recno access method. */ + int re_pad; /* Fixed-length padding byte. */ + int re_delim; /* Variable-length delimiting byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + char *re_source; /* Source file name. */ + +#define DB_DELIMITER 0x0001 /* Recno: re_delim set. */ +#define DB_DUP 0x0002 /* Btree, Hash: duplicate keys. */ +#define DB_FIXEDLEN 0x0004 /* Recno: fixed-length records. 
*/ +#define DB_PAD 0x0008 /* Recno: re_pad set. */ +#define DB_RECNUM 0x0010 /* Btree: record numbers. */ +#define DB_RENUMBER 0x0020 /* Recno: renumber on insert/delete. */ +#define DB_SNAPSHOT 0x0040 /* Recno: snapshot the input. */ + u_int32_t flags; +}; + +/* + * DB access method and cursor operation codes. These are implemented as + * bit fields for future flexibility, but currently only a single one may + * be specified to any function. + */ +#define DB_AFTER 0x000001 /* c_put() */ +#define DB_APPEND 0x000002 /* put() */ +#define DB_BEFORE 0x000004 /* c_put() */ +#define DB_CHECKPOINT 0x000008 /* log_put(), log_get() */ +#define DB_CURRENT 0x000010 /* c_get(), c_put(), log_get() */ +#define DB_FIRST 0x000020 /* c_get(), log_get() */ +#define DB_FLUSH 0x000040 /* log_put() */ +#define DB_GET_RECNO 0x000080 /* c_get() */ +#define DB_KEYFIRST 0x000100 /* c_put() */ +#define DB_KEYLAST 0x000200 /* c_put() */ +#define DB_LAST 0x000400 /* c_get(), log_get() */ +#define DB_NEXT 0x000800 /* c_get(), log_get() */ +#define DB_NOOVERWRITE 0x001000 /* put() */ +#define DB_NOSYNC 0x002000 /* close() */ +#define DB_PREV 0x004000 /* c_get(), log_get() */ +#define DB_RECORDCOUNT 0x008000 /* stat() */ +#define DB_SET 0x010000 /* c_get(), log_get() */ +#define DB_SET_RANGE 0x020000 /* c_get() */ +#define DB_SET_RECNO 0x040000 /* get(), c_get() */ + +/* DB (user visible) error return codes. */ +#define DB_INCOMPLETE ( -1) /* Sync didn't finish. */ +#define DB_KEYEMPTY ( -2) /* The key/data pair was deleted or + was never created by the user. */ +#define DB_KEYEXIST ( -3) /* The key/data pair already exists. */ +#define DB_LOCK_DEADLOCK ( -4) /* Locker killed to resolve deadlock. */ +#define DB_LOCK_NOTGRANTED ( -5) /* Lock unavailable, no-wait set. */ +#define DB_LOCK_NOTHELD ( -6) /* Lock not held by locker. */ +#define DB_NOTFOUND ( -7) /* Key/data pair not found (EOF). */ + +/* DB (private) error return codes. */ +#define DB_DELETED ( -8) /* Recovery file marked deleted. 
*/ +#define DB_NEEDSPLIT ( -9) /* Page needs to be split. */ +#define DB_REGISTERED (-10) /* Entry was previously registered. */ +#define DB_SWAPBYTES (-11) /* Database needs byte swapping. */ + +struct __db_ilock { /* Internal DB access method lock. */ + db_pgno_t pgno; /* Page being locked. */ + /* File id. */ + u_int8_t fileid[DB_FILE_ID_LEN]; +}; + +/* DB access method description structure. */ +struct __db { + void *mutex; /* Synchronization for free threading */ + DBTYPE type; /* DB access method. */ + DB_ENV *dbenv; /* DB_ENV structure. */ + DB_ENV *mp_dbenv; /* DB_ENV for local mpool creation. */ + + DB *master; /* Original DB created by db_open. */ + void *internal; /* Access method private. */ + + DB_MPOOL *mp; /* The access method's mpool. */ + DB_MPOOLFILE *mpf; /* The access method's mpool file. */ + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * TAILQ_HEAD(curs_queue, __dbc); + */ + struct { + struct __dbc *tqh_first; + struct __dbc **tqh_last; + } curs_queue; + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * LIST_HEAD(handleq, __db); + * LIST_ENTRY(__db); + */ + struct { + struct __db *lh_first; + } handleq; /* List of handles for this DB. */ + struct { + struct __db *le_next; + struct __db **le_prev; + } links; /* Links for the handle list. */ + + u_int32_t log_fileid; /* Logging file id. */ + + DB_TXN *txn; /* Current transaction. */ + u_int32_t locker; /* Default process' locker id. */ + DBT lock_dbt; /* DBT referencing lock. */ + struct __db_ilock lock; /* Lock. */ + + size_t pgsize; /* Logical page size of file. */ + + /* Local heap allocation. */ + void *(*db_malloc) __P((size_t)); + + /* Functions. 
*/ + int (*close) __P((DB *, int)); + int (*cursor) __P((DB *, DB_TXN *, DBC **)); + int (*del) __P((DB *, DB_TXN *, DBT *, int)); + int (*fd) __P((DB *, int *)); + int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, int)); + int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, int)); + int (*stat) __P((DB *, void *, void *(*)(size_t), int)); + int (*sync) __P((DB *, int)); + +#define DB_AM_DUP 0x000001 /* DB_DUP (internal). */ +#define DB_AM_INMEM 0x000002 /* In-memory; no sync on close. */ +#define DB_AM_LOCKING 0x000004 /* Perform locking. */ +#define DB_AM_LOGGING 0x000008 /* Perform logging. */ +#define DB_AM_MLOCAL 0x000010 /* Database memory pool is local. */ +#define DB_AM_PGDEF 0x000020 /* Page size was defaulted. */ +#define DB_AM_RDONLY 0x000040 /* Database is readonly. */ +#define DB_AM_RECOVER 0x000080 /* In recovery (do not log or lock). */ +#define DB_AM_SWAP 0x000100 /* Pages need to be byte-swapped. */ +#define DB_AM_THREAD 0x000200 /* DB is multi-threaded. */ +#define DB_BT_RECNUM 0x000400 /* DB_RECNUM (internal) */ +#define DB_HS_DIRTYMETA 0x000800 /* Hash: Metadata page modified. */ +#define DB_RE_DELIMITER 0x001000 /* DB_DELIMITER (internal). */ +#define DB_RE_FIXEDLEN 0x002000 /* DB_FIXEDLEN (internal). */ +#define DB_RE_PAD 0x004000 /* DB_PAD (internal). */ +#define DB_RE_RENUMBER 0x008000 /* DB_RENUMBER (internal). */ +#define DB_RE_SNAPSHOT 0x010000 /* DB_SNAPSHOT (internal). */ + + u_int32_t flags; +}; + +/* Cursor description structure. */ +struct __dbc { + DB *dbp; /* Related DB access method. */ + DB_TXN *txn; /* Associated transaction. */ + + /* + * XXX + * Explicit representations of structures in queue.h. + * + * TAILQ_ENTRY(__dbc); + */ + struct { + struct __dbc *tqe_next; + struct __dbc **tqe_prev; + } links; + + void *internal; /* Access method private. 
*/ + + int (*c_close) __P((DBC *)); + int (*c_del) __P((DBC *, int)); + int (*c_get) __P((DBC *, DBT *, DBT *, int)); + int (*c_put) __P((DBC *, DBT *, DBT *, int)); +}; + +/* Btree/recno statistics structure. */ +struct __db_bt_stat { + u_int32_t bt_flags; /* Open flags. */ + u_int32_t bt_maxkey; /* Maxkey value. */ + u_int32_t bt_minkey; /* Minkey value. */ + u_int32_t bt_re_len; /* Fixed-length record length. */ + u_int32_t bt_re_pad; /* Fixed-length record pad. */ + u_int32_t bt_pagesize; /* Page size. */ + u_int32_t bt_levels; /* Tree levels. */ + u_int32_t bt_nrecs; /* Number of records. */ + u_int32_t bt_int_pg; /* Internal pages. */ + u_int32_t bt_leaf_pg; /* Leaf pages. */ + u_int32_t bt_dup_pg; /* Duplicate pages. */ + u_int32_t bt_over_pg; /* Overflow pages. */ + u_int32_t bt_free; /* Pages on the free list. */ + u_int32_t bt_freed; /* Pages freed for reuse. */ + u_int32_t bt_int_pgfree; /* Bytes free in internal pages. */ + u_int32_t bt_leaf_pgfree; /* Bytes free in leaf pages. */ + u_int32_t bt_dup_pgfree; /* Bytes free in duplicate pages. */ + u_int32_t bt_over_pgfree; /* Bytes free in overflow pages. */ + u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ + u_int32_t bt_split; /* Total number of splits. */ + u_int32_t bt_rootsplit; /* Root page splits. */ + u_int32_t bt_fastsplit; /* Fast splits. */ + u_int32_t bt_added; /* Items added. */ + u_int32_t bt_deleted; /* Items deleted. */ + u_int32_t bt_get; /* Items retrieved. */ + u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ + u_int32_t bt_cache_miss; /* Misses in fast-insert code. 
*/ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int db_appinit __P((const char *, char * const *, DB_ENV *, int)); +int db_appexit __P((DB_ENV *)); +int db_open __P((const char *, DBTYPE, int, int, DB_ENV *, DB_INFO *, DB **)); +char *db_version __P((int *, int *, int *)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Locking + *******************************************************/ +#define DB_LOCKVERSION 1 +#define DB_LOCKMAGIC 0x090193 + +/* Flag values for lock_vec(). */ +#define DB_LOCK_NOWAIT 0x01 /* Don't wait on unavailable lock. */ + +/* Flag values for lock_detect(). */ +#define DB_LOCK_CONFLICT 0x01 /* Run on any conflict. */ + +/* Request types. */ +typedef enum { + DB_LOCK_DUMP, /* Display held locks. */ + DB_LOCK_GET, /* Get the lock. */ + DB_LOCK_PUT, /* Release the lock. */ + DB_LOCK_PUT_ALL, /* Release locker's locks. */ + DB_LOCK_PUT_OBJ /* Release locker's locks on obj. */ +} db_lockop_t; + +/* Simple R/W lock modes and for multi-granularity intention locking. */ +typedef enum { + DB_LOCK_NG=0, /* Not granted. */ + DB_LOCK_READ, /* Shared/read. */ + DB_LOCK_WRITE, /* Exclusive/write. */ + DB_LOCK_IREAD, /* Intent to share/read. */ + DB_LOCK_IWRITE, /* Intent exclusive/write. */ + DB_LOCK_IWR /* Intent to read and write. */ +} db_lockmode_t; + +/* Lock request structure. */ +struct __db_lockreq { + db_lockop_t op; /* Operation. */ + db_lockmode_t mode; /* Requested mode. */ + u_int32_t locker; /* Locker identity. */ + DBT *obj; /* Object being locked. */ + DB_LOCK lock; /* Lock returned. */ +}; + +/* + * Commonly used conflict matrices. + * + * Standard Read/Write (or exclusive/shared) locks. + */ +#define DB_LOCK_RW_N 3 +extern const u_int8_t db_rw_conflicts[]; + +/* Multi-granularity locking. 
*/ +#define DB_LOCK_RIW_N 6 +extern const u_int8_t db_riw_conflicts[]; + +#if defined(__cplusplus) +extern "C" { +#endif +int lock_close __P((DB_LOCKTAB *)); +int lock_detect __P((DB_LOCKTAB *, int, u_int32_t)); +int lock_get __P((DB_LOCKTAB *, + u_int32_t, int, const DBT *, db_lockmode_t, DB_LOCK *)); +int lock_id __P((DB_LOCKTAB *, u_int32_t *)); +int lock_open __P((const char *, int, int, DB_ENV *, DB_LOCKTAB **)); +int lock_put __P((DB_LOCKTAB *, DB_LOCK)); +int lock_unlink __P((const char *, int, DB_ENV *)); +int lock_vec __P((DB_LOCKTAB *, + u_int32_t, int, DB_LOCKREQ *, int, DB_LOCKREQ **)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Logging. + *******************************************************/ +/* Flag values for log_archive(). */ +#define DB_ARCH_ABS 0x001 /* Absolute pathnames. */ +#define DB_ARCH_DATA 0x002 /* Data files. */ +#define DB_ARCH_LOG 0x004 /* Log files. */ + +/* + * A DB_LSN has two parts, a fileid which identifies a specific file, and an + * offset within that file. The fileid is an unsigned 4-byte quantity that + * uniquely identifies a file within the log directory -- currently a simple + * counter inside the log. The offset is also an unsigned 4-byte value. The + * log manager guarantees the offset is never more than 4 bytes by switching + * to a new log file before the maximum length imposed by an unsigned 4-byte + * offset is reached. + */ +struct __db_lsn { + u_int32_t file; /* File ID. */ + u_int32_t offset; /* File offset. 
*/ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int log_archive __P((DB_LOG *, char **[], int, void *(*)(size_t))); +int log_close __P((DB_LOG *)); +int log_compare __P((const DB_LSN *, const DB_LSN *)); +int log_file __P((DB_LOG *, const DB_LSN *, char *, size_t)); +int log_flush __P((DB_LOG *, const DB_LSN *)); +int log_get __P((DB_LOG *, DB_LSN *, DBT *, int)); +int log_open __P((const char *, int, int, DB_ENV *, DB_LOG **)); +int log_put __P((DB_LOG *, DB_LSN *, const DBT *, int)); +int log_register __P((DB_LOG *, DB *, const char *, DBTYPE, u_int32_t *)); +int log_unlink __P((const char *, int, DB_ENV *)); +int log_unregister __P((DB_LOG *, u_int32_t)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Mpool + *******************************************************/ +/* Flag values for memp_fget(). */ +#define DB_MPOOL_CREATE 0x001 /* Create a page. */ +#define DB_MPOOL_LAST 0x002 /* Return the last page. */ +#define DB_MPOOL_NEW 0x004 /* Create a new page. */ + +/* Flag values for memp_fput(), memp_fset(). */ +#define DB_MPOOL_CLEAN 0x001 /* Clear modified bit. */ +#define DB_MPOOL_DIRTY 0x002 /* Page is modified. */ +#define DB_MPOOL_DISCARD 0x004 /* Don't cache the page. */ + +/* Mpool statistics structure. */ +struct __db_mpool_stat { + size_t st_cachesize; /* Cache size. */ + unsigned long st_cache_hit; /* Pages found in the cache. */ + unsigned long st_cache_miss; /* Pages not found in the cache. */ + unsigned long st_map; /* Pages from mapped files. */ + unsigned long st_page_create; /* Pages created in the cache. */ + unsigned long st_page_in; /* Pages read in. */ + unsigned long st_page_out; /* Pages written out. */ + unsigned long st_ro_evict; /* Read-only pages evicted. */ + unsigned long st_rw_evict; /* Read-write pages evicted. */ + unsigned long st_hash_buckets; /* Number of hash buckets. */ + unsigned long st_hash_searches; /* Total hash chain searches. 
*/ + unsigned long st_hash_longest; /* Longest hash chain searched. */ + unsigned long st_hash_examined; /* Total hash entries searched. */ +}; + +/* Mpool file statistics structure. */ +struct __db_mpool_fstat { + char *file_name; /* File name. */ + size_t st_pagesize; /* Page size. */ + unsigned long st_cache_hit; /* Pages found in the cache. */ + unsigned long st_cache_miss; /* Pages not found in the cache. */ + unsigned long st_map; /* Pages from mapped files. */ + unsigned long st_page_create; /* Pages created in the cache. */ + unsigned long st_page_in; /* Pages read in. */ + unsigned long st_page_out; /* Pages written out. */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int memp_close __P((DB_MPOOL *)); +int memp_fclose __P((DB_MPOOLFILE *)); +int memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, unsigned long, void *)); +int memp_fopen __P((DB_MPOOL *, const char *, + int, int, int, size_t, int, DBT *, u_int8_t *, DB_MPOOLFILE **)); +int memp_fput __P((DB_MPOOLFILE *, void *, unsigned long)); +int memp_fset __P((DB_MPOOLFILE *, void *, unsigned long)); +int memp_fsync __P((DB_MPOOLFILE *)); +int memp_open __P((const char *, int, int, DB_ENV *, DB_MPOOL **)); +int memp_register __P((DB_MPOOL *, int, + int (*)(db_pgno_t, void *, DBT *), + int (*)(db_pgno_t, void *, DBT *))); +int memp_stat __P((DB_MPOOL *, + DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, void *(*)(size_t))); +int memp_sync __P((DB_MPOOL *, DB_LSN *)); +int memp_unlink __P((const char *, int, DB_ENV *)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Transactions. + *******************************************************/ +#define DB_TXNVERSION 1 +#define DB_TXNMAGIC 0x041593 + +/* Operations values to the tx_recover() function. */ +#define DB_TXN_BACKWARD_ROLL 1 /* Read the log backwards. */ +#define DB_TXN_FORWARD_ROLL 2 /* Read the log forwards. */ +#define DB_TXN_OPENFILES 3 /* Read for open files. 
*/ +#define DB_TXN_REDO 4 /* Redo the operation. */ +#define DB_TXN_UNDO 5 /* Undo the operation. */ + +/* Internal transaction status values. */ + +/* Transaction statistics structure. */ +struct __db_txn_active { + u_int32_t txnid; /* Transaction ID */ + DB_LSN lsn; /* Lsn of the begin record */ +}; + +struct __db_txn_stat { + DB_LSN st_last_ckp; /* lsn of the last checkpoint */ + DB_LSN st_pending_ckp; /* last checkpoint did not finish */ + time_t st_time_ckp; /* time of last checkpoint */ + u_int32_t st_last_txnid; /* last transaction id given out */ + u_int32_t st_maxtxns; /* maximum number of active txns */ + u_int32_t st_naborts; /* number of aborted transactions */ + u_int32_t st_nbegins; /* number of begun transactions */ + u_int32_t st_ncommits; /* number of committed transactions */ + u_int32_t st_nactive; /* number of active transactions */ + DB_TXN_ACTIVE *st_txnarray; /* array of active transactions */ +}; + +#if defined(__cplusplus) +extern "C" { +#endif +int txn_abort __P((DB_TXN *)); +int txn_begin __P((DB_TXNMGR *, DB_TXN *, DB_TXN **)); +int txn_checkpoint __P((const DB_TXNMGR *, long, long)); +int txn_commit __P((DB_TXN *)); +int txn_close __P((DB_TXNMGR *)); +u_int32_t txn_id __P((DB_TXN *)); +int txn_open __P((const char *, int, int, DB_ENV *, DB_TXNMGR **)); +int txn_prepare __P((DB_TXN *)); +int txn_stat __P((DB_TXNMGR *, DB_TXN_STAT **, void *(*)(size_t))); +int txn_unlink __P((const char *, int, DB_ENV *)); +#if defined(__cplusplus) +}; +#endif + +#ifdef DB_DBM_HSEARCH +/******************************************************* + * Dbm/Ndbm historic interfaces. + *******************************************************/ +#define DBM_INSERT 0 /* Flags to dbm_store(). */ +#define DBM_REPLACE 1 + +/* + * The db(3) support for ndbm(3) always appends this suffix to the + * file name to avoid overwriting the user's original database. 
+ */ +#define DBM_SUFFIX ".db" + +typedef struct { + char *dptr; + int dsize; +} datum; + +#if defined(__cplusplus) +extern "C" { +#endif +int dbminit __P((char *)); +#if !defined(__cplusplus) +int delete __P((datum)); +#endif +datum fetch __P((datum)); +datum firstkey __P((void)); +datum nextkey __P((datum)); +int store __P((datum, datum)); + +/* + * !!! + * Don't prototype: + * + * dbm_clearerr(DBM *db); + * dbm_dirfno(DBM *db); + * dbm_error(DBM *db); + * dbm_pagfno(DBM *db); + * dbm_rdonly(DBM *db); + * + * they weren't documented and were historically implemented as #define's. + */ +void dbm_close __P((DBM *)); +int dbm_delete __P((DBM *, datum)); +datum dbm_fetch __P((DBM *, datum)); +datum dbm_firstkey __P((DBM *)); +long dbm_forder __P((DBM *, datum)); +datum dbm_nextkey __P((DBM *)); +DBM *dbm_open __P((const char *, int, int)); +int dbm_store __P((DBM *, datum, datum, int)); +#if defined(__cplusplus) +}; +#endif + +/******************************************************* + * Hsearch historic interface. + *******************************************************/ +typedef enum { + FIND, ENTER +} ACTION; + +typedef struct entry { + char *key; + void *data; +} ENTRY; + +#if defined(__cplusplus) +extern "C" { +#endif +int hcreate __P((unsigned int)); +void hdestroy __P((void)); +ENTRY *hsearch __P((ENTRY, ACTION)); +#if defined(__cplusplus) +}; +#endif +#endif /* DB_DBM_HSEARCH */ + +/* + * XXX + * MacOS: Reset Metrowerks C enum sizes. + */ +#ifdef __MWERKS__ +#pragma enumsalwaysint reset +#endif +#endif /* !_DB_H_ */ diff --git a/db2/include/db_185.h.src b/db2/include/db_185.h.src new file mode 100644 index 0000000000..52fb3a0da1 --- /dev/null +++ b/db2/include/db_185.h.src @@ -0,0 +1,170 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)db_185.h.src 8.3 (Sleepycat) 7/27/97 + */ + +#ifndef _DB_185_H_ +#define _DB_185_H_ + +#include <sys/types.h> + +#include <limits.h> + +/* + * XXX + * Handle function prototypes and the keyword "const". 
This steps on name + * space that DB doesn't control, but all of the other solutions are worse. + */ +#undef __P +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* ANSI C prototypes */ +#else +#define const +#define __P(protos) () /* K&R C preprocessor */ +#endif + +#define RET_ERROR -1 /* Return values. */ +#define RET_SUCCESS 0 +#define RET_SPECIAL 1 + +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ +@u_int8_decl@ +@int16_decl@ +@u_int16_decl@ +@int32_decl@ +@u_int32_decl@ +#endif + +#define MAX_PAGE_NUMBER 0xffffffff /* >= # of pages in a file */ +typedef u_int32_t pgno_t; +#define MAX_PAGE_OFFSET 65535 /* >= # of bytes in a page */ +typedef u_int16_t indx_t; +#define MAX_REC_NUMBER 0xffffffff /* >= # of records in a tree */ +typedef u_int32_t recno_t; + +/* Key/data structure -- a Data-Base Thang. */ +typedef struct { + void *data; /* data */ + size_t size; /* data length */ +} DBT; + +/* Routine flags. */ +#define R_CURSOR 1 /* del, put, seq */ +#define __R_UNUSED 2 /* UNUSED */ +#define R_FIRST 3 /* seq */ +#define R_IAFTER 4 /* put (RECNO) */ +#define R_IBEFORE 5 /* put (RECNO) */ +#define R_LAST 6 /* seq (BTREE, RECNO) */ +#define R_NEXT 7 /* seq */ +#define R_NOOVERWRITE 8 /* put */ +#define R_PREV 9 /* seq (BTREE, RECNO) */ +#define R_SETCURSOR 10 /* put (RECNO) */ +#define R_RECNOSYNC 11 /* sync (RECNO) */ + +typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE; + +/* Access method description structure. */ +typedef struct __db { + DBTYPE type; /* Underlying db type. */ + int (*close) __P((struct __db *)); + int (*del) __P((const struct __db *, const DBT *, u_int)); + int (*get) __P((const struct __db *, const DBT *, DBT *, u_int)); + int (*put) __P((const struct __db *, DBT *, const DBT *, u_int)); + int (*seq) __P((const struct __db *, DBT *, DBT *, u_int)); + int (*sync) __P((const struct __db *, u_int)); + void *internal; /* Access method private. 
*/ + int (*fd) __P((const struct __db *)); +} DB; + +#define BTREEMAGIC 0x053162 +#define BTREEVERSION 3 + +/* Structure used to pass parameters to the btree routines. */ +typedef struct { +#define R_DUP 0x01 /* duplicate keys */ + u_long flags; + u_int cachesize; /* bytes to cache */ + int maxkeypage; /* maximum keys per page */ + int minkeypage; /* minimum keys per page */ + u_int psize; /* page size */ + int (*compare) /* comparison function */ + __P((const DBT *, const DBT *)); + size_t (*prefix) /* prefix function */ + __P((const DBT *, const DBT *)); + int lorder; /* byte order */ +} BTREEINFO; + +#define HASHMAGIC 0x061561 +#define HASHVERSION 2 + +/* Structure used to pass parameters to the hashing routines. */ +typedef struct { + u_int bsize; /* bucket size */ + u_int ffactor; /* fill factor */ + u_int nelem; /* number of elements */ + u_int cachesize; /* bytes to cache */ + u_int32_t /* hash function */ + (*hash) __P((const void *, size_t)); + int lorder; /* byte order */ +} HASHINFO; + +/* Structure used to pass parameters to the record routines. */ +typedef struct { +#define R_FIXEDLEN 0x01 /* fixed-length records */ +#define R_NOKEY 0x02 /* key not required */ +#define R_SNAPSHOT 0x04 /* snapshot the input */ + u_long flags; + u_int cachesize; /* bytes to cache */ + u_int psize; /* page size */ + int lorder; /* byte order */ + size_t reclen; /* record length (fixed-length records) */ + u_char bval; /* delimiting byte (variable-length records */ + char *bfname; /* btree file name */ +} RECNOINFO; + +#if defined(__cplusplus) +extern "C" { +#endif +DB *dbopen __P((const char *, int, int, DBTYPE, const void *)); + +#if defined(__cplusplus) +}; +#endif +#endif /* !_DB_185_H_ */ diff --git a/db2/include/db_am.h b/db2/include/db_am.h new file mode 100644 index 0000000000..3289eececa --- /dev/null +++ b/db2/include/db_am.h @@ -0,0 +1,87 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. 
All rights reserved. + * + * @(#)db_am.h 10.5 (Sleepycat) 8/22/97 + */ +#ifndef _DB_AM_H +#define _DB_AM_H + +#define DB_ISBIG 0x01 +#define DB_ADD_DUP 0x10 +#define DB_REM_DUP 0x20 +#define DB_ADD_BIG 0x30 +#define DB_REM_BIG 0x40 +#define DB_SPLITOLD 0x50 +#define DB_SPLITNEW 0x60 + +/* + * Standard initialization and shutdown macros for all recovery functions. + * + * Requires the following local variables: + * + * DB *file_dbp, *mdbp; + * DB_MPOOLFILE *mpf; + * int ret; + */ +#define REC_INTRO(func) { \ + file_dbp = mdbp = NULL; \ + if ((ret = func(dbtp->data, &argp)) != 0) \ + goto out; \ + if (__db_fileid_to_db(logp, &mdbp, argp->fileid)) { \ + if (ret == DB_DELETED) \ + ret = 0; \ + goto out; \ + } \ + if (mdbp == NULL) \ + goto out; \ + if (F_ISSET(mdbp, DB_AM_THREAD)) { \ + if ((ret = __db_gethandle(mdbp, \ + mdbp->type == DB_HASH ? __ham_hdup : __bam_bdup, \ + &file_dbp)) != 0) \ + goto out; \ + } else \ + file_dbp = mdbp; \ + F_SET(file_dbp, DB_AM_RECOVER); \ + mpf = file_dbp->mpf; \ +} +#define REC_CLOSE { \ + if (argp != NULL) \ + free (argp); \ + if (file_dbp != NULL) { \ + F_CLR(file_dbp, DB_AM_RECOVER); \ + if (F_ISSET(file_dbp, DB_AM_THREAD)) \ + __db_puthandle(file_dbp); \ + } \ + return (ret); \ +} + +/* + * No-op versions of the same macros. + */ +#define REC_NOOP_INTRO(func) { \ + if ((ret = func(dbtp->data, &argp)) != 0) \ + return (ret); \ +} +#define REC_NOOP_CLOSE { \ + if (argp != NULL) \ + free (argp); \ + return (ret); \ +} + +/* + * Standard debugging macro for all recovery functions. + */ +#ifdef DEBUG_RECOVER +#define REC_PRINT(func) \ + (void)func(logp, dbtp, lsnp, redo, info); +#else +#define REC_PRINT(func) \ + info = info; /* XXX: Shut the compiler up. 
*/ +#endif + +#include "db_auto.h" +#include "db_ext.h" +#endif diff --git a/db2/include/db_auto.h b/db2/include/db_auto.h new file mode 100644 index 0000000000..7478173740 --- /dev/null +++ b/db2/include/db_auto.h @@ -0,0 +1,118 @@ +/* Do not edit: automatically built by dist/db_gen.sh. */ +#ifndef db_AUTO_H +#define db_AUTO_H + +#define DB_db_addrem (DB_db_BEGIN + 1) + +typedef struct _db_addrem_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + u_int32_t indx; + size_t nbytes; + DBT hdr; + DBT dbt; + DB_LSN pagelsn; +} __db_addrem_args; + + +#define DB_db_split (DB_db_BEGIN + 2) + +typedef struct _db_split_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + DBT pageimage; + DB_LSN pagelsn; +} __db_split_args; + + +#define DB_db_big (DB_db_BEGIN + 3) + +typedef struct _db_big_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; + u_int32_t fileid; + db_pgno_t pgno; + db_pgno_t prev_pgno; + db_pgno_t next_pgno; + DBT dbt; + DB_LSN pagelsn; + DB_LSN prevlsn; + DB_LSN nextlsn; +} __db_big_args; + + +#define DB_db_ovref (DB_db_BEGIN + 4) + +typedef struct _db_ovref_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; +} __db_ovref_args; + + +#define DB_db_relink (DB_db_BEGIN + 5) + +typedef struct _db_relink_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; + db_pgno_t prev; + DB_LSN lsn_prev; + db_pgno_t next; + DB_LSN lsn_next; +} __db_relink_args; + + +#define DB_db_addpage (DB_db_BEGIN + 6) + +typedef struct _db_addpage_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN lsn; + db_pgno_t nextpgno; + DB_LSN nextlsn; +} __db_addpage_args; + + +#define DB_db_debug (DB_db_BEGIN + 7) + +typedef struct _db_debug_args { + 
u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + DBT op; + u_int32_t fileid; + DBT key; + DBT data; + u_int32_t arg_flags; +} __db_debug_args; + + +#define DB_db_noop (DB_db_BEGIN + 8) + +typedef struct _db_noop_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; +} __db_noop_args; + +#endif diff --git a/db2/include/db_cxx.h b/db2/include/db_cxx.h new file mode 100644 index 0000000000..506aed845c --- /dev/null +++ b/db2/include/db_cxx.h @@ -0,0 +1,888 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db_cxx.h 10.7 (Sleepycat) 8/22/97 + */ + +#ifndef _DB_CXX_H_ +#define _DB_CXX_H_ + +// +// C++ assumptions: +// +// To ensure portability to many platforms, both new and old, we make +// few assumptions about the C++ compiler and library. For example, +// we do not expect STL, templates or namespaces to be available. The +// "newest" C++ feature used is exceptions, which are used liberally +// to transmit error information. Even the use of exceptions can be +// disabled at runtime, see setErrorModel(). +// +// C++ naming conventions: +// +// - All top level class names start with Db. +// - All class members start with lower case letter. +// - All private data members are suffixed with underscore. +// - Use underscores to divide names into multiple words. +// - Simple data accessors are named with get_ or set_ prefix. +// - All method names are taken from names of functions in the C +// layer of db (usually by dropping a prefix like "db_"). +// These methods have the same argument types and order, +// other than dropping the explicit arg that acts as "this". +// +// As a rule, each DbFoo object has exactly one underlying DB_FOO struct +// (defined in db.h) associated with it. In many cases, we inherit directly +// from the DB_FOO structure to make this relationship explicit. 
Often, +// the underlying C layer allocates and deallocates these structures, so +// there is no easy way to add any data to the DbFoo class. When you see +// a comment about whether data is permitted to be added, this is what +// is going on. Of course, if we need to add data to such C++ classes +// in the future, we will arrange to have an indirect pointer to the +// DB_FOO struct (as some of the classes already have). +// + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Forward declarations +// + +#include "db.h" + +class Db; // forward +class Dbc; // forward +class DbEnv; // forward +class DbException; // forward +class DbInfo; // forward +class DbLock; // forward +class DbLockTab; // forward +class DbLog; // forward +class DbLsn; // forward +class DbMpool; // forward +class DbMpoolFile; // forward +class Dbt; // forward +class DbTxn; // forward +class DbTxnMgr; // forward + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Mechanisms for declaring classes +// + +// +// Every class defined in this file has an _exported next to the class name. +// This is needed for WinTel machines so that the class methods can +// be exported or imported in a DLL as appropriate. Users of the DLL +// use the define DB_USE_DLL. When the DLL is built, DB_CREATE_DLL +// must be defined. +// +#if defined(_MSC_VER) + +# if defined(DB_CREATE_DLL) +# define _exported __declspec(dllexport) // creator of dll +# elif defined(DB_USE_DLL) +# define _exported __declspec(dllimport) // user of dll +# else +# define _exported // static lib creator or user +# endif + +#else + +# define _exported + +#endif + +// DEFINE_DB_CLASS defines an imp_ data member and imp() accessor. 
+// The underlying type is a pointer to an opaque *Imp class, that +// gets converted to the correct implementation class by the implementation. +// +// Since these defines use "private/public" labels, and leave the access +// being "private", we always use these by convention before any data +// members in the private section of a class. Keeping them in the +// private section also emphasizes that they are off limits to user code. +// +#define DEFINE_DB_CLASS(name) \ + public: class name##Imp* imp() { return imp_; } \ + public: const class name##Imp* imp() const { return imp_; } \ + private: class name##Imp* imp_ + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Turn off inappropriate compiler warnings +// + +#ifdef _MSC_VER + +// These are level 4 warnings that are explicitly disabled. +// With Visual C++, by default you do not see above level 3 unless +// you use /W4. But we like to compile with the highest level +// warnings to catch other errors. +// +// 4201: nameless struct/union +// triggered by standard include file <winnt.h> +// +// 4514: unreferenced inline function has been removed +// certain include files in MSVC define methods that are not called +// +#pragma warning(disable: 4201 4514) + +#endif + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Exception classes +// + +// Almost any error in the DB library throws a DbException. +// Every exception should be considered an abnormality +// (e.g. bug, misuse of DB, file system error). +// +// NOTE: We would like to inherit from class exception and +// let it handle what(), but there are +// MSVC++ problems when <exception> is included. 
+// +class _exported DbException +{ +public: + virtual ~DbException(); + DbException(int err); + DbException(const char *description); + DbException(const char *prefix, int err); + DbException(const char *prefix1, const char *prefix2, int err); + const int get_errno(); + virtual const char *what() const; + + DbException(const DbException &); + DbException &operator = (const DbException &); + +private: + char *what_; + int err_; // errno +}; + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Lock classes +// + +class _exported DbLock +{ + friend DbLockTab; + +public: + DbLock(unsigned int); + DbLock(); + + unsigned int get_lock_id(); + void set_lock_id(unsigned int); + + int put(DbLockTab *locktab); + + DbLock(const DbLock &); + DbLock &operator = (const DbLock &); + +protected: + // We can add data to this class if needed + // since its contained class is not allocated by db. + // (see comment at top) + + DB_LOCK lock_; +}; + +class _exported DbLockTab +{ +friend DbEnv; +public: + int close(); + int detect(int atype, u_int32_t flags); + int get(u_int32_t locker, int flags, const Dbt *obj, + db_lockmode_t lock_mode, DbLock *lock); + int id(u_int32_t *idp); + int vec(u_int32_t locker, int flags, DB_LOCKREQ list[], + int nlist, DB_LOCKREQ **elistp); + + // Create or remove new locktab files + // + static int open(const char *dir, int flags, int mode, + DbEnv* dbenv, DbLockTab **regionp); + static int unlink(const char *dir, int force, DbEnv* dbenv); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // copying not allowed + // + DbLockTab(const DbLockTab &); + DbLockTab &operator = (const DbLockTab &); + + // Note: use DbLockTab::open() or DbEnv::get_lk_info() + // to get pointers to a DbLockTab, + // and call DbLockTab::close() rather than delete to release them. 
+ // + DbLockTab(); + ~DbLockTab(); + + DEFINE_DB_CLASS(DbLockTab); +}; + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Log classes +// + +class _exported DbLsn : protected DB_LSN +{ + friend DbLog; // friendship needed to cast to base class + friend DbMpool; +}; + +class _exported DbLog +{ +friend DbEnv; +public: + int archive(char **list[], int flags, void *(*db_malloc)(size_t)); + int close(); + static int compare(const DbLsn *lsn0, const DbLsn *lsn1); + int file(DbLsn *lsn, char *namep, int len); + int flush(const DbLsn *lsn); + int get(DbLsn *lsn, Dbt *data, int flags); + int put(DbLsn *lsn, const Dbt *data, int flags); + + // Normally these would be called register and unregister to + // parallel the C interface, but "register" is a reserved word. + // + int db_register(Db *dbp, const char *name, u_int32_t *fidp); + int db_unregister(u_int32_t fid); + + // Create or remove new log files + // + static int open(const char *dir, int flags, int mode, + DbEnv* dbenv, DbLog **regionp); + static int unlink(const char *dir, int force, DbEnv* dbenv); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbLog::open() or DbEnv::get_lg_info() + // to get pointers to a DbLog, + // and call DbLog::close() rather than delete to release them. 
+ // + DbLog(); + ~DbLog(); + + // no copying + DbLog(const DbLog &); + operator = (const DbLog &); + + DEFINE_DB_CLASS(DbLog); +}; + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Memory pool classes +// + +class _exported DbMpoolFile +{ +public: + int close(); + int get(db_pgno_t *pgnoaddr, unsigned long flags, void *pagep); + int put(void *pgaddr, unsigned long flags); + int set(void *pgaddr, unsigned long flags); + int sync(); + + static int open(DbMpool *mp, const char *file, + int ftype, int flags, int mode, + size_t pagesize, int lsn_offset, + Dbt *pgcookie, u_int8_t *uid, DbMpoolFile **mpf); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbMpoolFile::open() + // to get pointers to a DbMpoolFile, + // and call DbMpoolFile::close() rather than delete to release them. + // + DbMpoolFile(); + + // Shut g++ up. +protected: + ~DbMpoolFile(); + +private: + // no copying + DbMpoolFile(const DbMpoolFile &); + operator = (const DbMpoolFile &); + + DEFINE_DB_CLASS(DbMpoolFile); +}; + +class _exported DbMpool +{ +friend DbEnv; +public: + int close(); + + // access to low level interface + // Normally this would be called register to parallel + // the C interface, but "register" is a reserved word. + // + int db_register(int ftype, + int (*pgin)(db_pgno_t pgno, void *pgaddr, DBT *pgcookie), + int (*pgout)(db_pgno_t pgno, void *pgaddr, DBT *pgcookie)); + + int stat(DB_MPOOL_STAT **gsp, DB_MPOOL_FSTAT ***fsp, + void *(*db_malloc)(size_t)); + int sync(DbLsn *lsn); + + // Create or remove new mpool files + // + static int open(const char *dir, int flags, int mode, + DbEnv* dbenv, DbMpool **regionp); + static int unlink(const char *dir, int force, DbEnv* dbenv); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. 
+ // (see comment at top) + + // Note: use DbMpool::open() or DbEnv::get_mp_info() + // to get pointers to a DbMpool, + // and call DbMpool::close() rather than delete to release them. + // + DbMpool(); + ~DbMpool(); + + // no copying + DbMpool(const DbMpool &); + DbMpool &operator = (const DbMpool &); + + DEFINE_DB_CLASS(DbMpool); +}; + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Transaction classes +// + +class _exported DbTxnMgr +{ +friend DbEnv; +public: + int begin(DbTxn *pid, DbTxn **tid); + int checkpoint(long kbyte, long min) const; + int close(); + int stat(DB_TXN_STAT **statp, void *(*db_malloc)(size_t)); + + // Create or remove new txnmgr files + // + static int open(const char *dir, int flags, int mode, + DbEnv* dbenv, DbTxnMgr **regionp); + static int unlink(const char *dir, int force, DbEnv* dbenv); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbTxnMgr::open() or DbEnv::get_tx_info() + // to get pointers to a DbTxnMgr, + // and call DbTxnMgr::close() rather than delete to release them. + // + DbTxnMgr(); + ~DbTxnMgr(); + + // no copying + DbTxnMgr(const DbTxnMgr &); + DbTxnMgr &operator = (const DbTxnMgr &); + + DEFINE_DB_CLASS(DbTxnMgr); +}; + +class _exported DbTxn +{ +friend DbTxnMgr; +public: + int abort(); + int commit(); + u_int32_t id(); + int prepare(); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbTxnMgr::begin() to get pointers to a DbTxn, + // and call DbTxn::abort() or DbTxn::commit() rather than + // delete to release them. 
+ // + DbTxn(); + ~DbTxn(); + + // no copying + DbTxn(const DbTxn &); + DbTxn &operator = (const DbTxn &); + + DEFINE_DB_CLASS(DbTxn); +}; + + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Application classes +// + +// +// A set of application options - define how this application uses +// the db library. +// +class _exported DbInfo : protected DB_INFO +{ + friend DbEnv; + friend Db; + +public: + DbInfo(); + ~DbInfo(); + + // Byte order. + int get_lorder() const; + void set_lorder(int); + + // Underlying cache size. + size_t get_cachesize() const; + void set_cachesize(size_t); + + // Underlying page size. + size_t get_pagesize() const; + void set_pagesize(size_t); + + // Local heap allocation. + typedef void *(*db_malloc_fcn)(size_t); + db_malloc_fcn get_malloc() const; + void set_malloc(db_malloc_fcn); + + //////////////////////////////////////////////////////////////// + // Btree access method. + + // Maximum keys per page. + int get_bt_maxkey() const; + void set_bt_maxkey(int); + + // Minimum keys per page. + int get_bt_minkey() const; + void set_bt_minkey(int); + + // Comparison function. + typedef int (*bt_compare_fcn)(const DBT *, const DBT *); + bt_compare_fcn get_bt_compare() const; + void set_bt_compare(bt_compare_fcn); + + // Prefix function. + typedef size_t (*bt_prefix_fcn)(const DBT *, const DBT *); + bt_prefix_fcn get_bt_prefix() const; + void set_bt_prefix(bt_prefix_fcn); + + //////////////////////////////////////////////////////////////// + // Hash access method. + + // Fill factor. + unsigned int get_h_ffactor() const; + void set_h_ffactor(unsigned int); + + // Number of elements. + unsigned int get_h_nelem() const; + void set_h_nelem(unsigned int); + + // Hash function. 
+ typedef u_int32_t (*h_hash_fcn)(const void *, u_int32_t); + h_hash_fcn get_h_hash() const; + void set_h_hash(h_hash_fcn); + + //////////////////////////////////////////////////////////////// + // Recno access method. + + // Fixed-length padding byte. + int get_re_pad() const; + void set_re_pad(int); + + // Variable-length delimiting byte. + int get_re_delim() const; + void set_re_delim(int); + + // Length for fixed-length records. + u_int32_t get_re_len() const; + void set_re_len(u_int32_t); + + // Source file name. + char *get_re_source() const; + void set_re_source(char *); + + // Note: some flags are set as side effects of calling + // above "set" methods. + // + u_int32_t get_flags() const; + void set_flags(u_int32_t); + + + // (deep) copying of this object is allowed. + // + DbInfo(const DbInfo &); + DbInfo &operator = (const DbInfo &); + +private: + // We can add data to this class if needed + // since parent class is not allocated by db. + // (see comment at top) +}; + +// +// Base application class. Provides functions for opening a database. +// User of this library can use this class as a starting point for +// developing a DB application - derive their application class from +// this one, add application control logic. +// +// Note that if you use the default constructor, you must explicitly +// call appinit() before any other db activity (e.g. opening files) +// +class _exported DbEnv : protected DB_ENV +{ +friend DbTxnMgr; +friend DbLog; +friend DbLockTab; +friend DbMpool; +friend Db; + +public: + + ~DbEnv(); + + // This constructor can be used to immediately initialize the + // application with these arguments. Do not use it if you + // need to set other parameters via the access methods. + // + DbEnv(const char *homeDir, char *const *db_config, int flags); + + // Use this constructor if you wish to *delay* the initialization + // of the db library. This is useful if you need to set + // any particular parameters via the access methods below. 
+ // Then call appinit() to complete the initialization. + // + DbEnv(); + + // Used in conjunction with the default constructor to + // complete the initialization of the db library. + // + int appinit(const char *homeDir, char *const *db_config, int flags); + + //////////////////////////////////////////////////////////////// + // simple get/set access methods + // + // If you are calling set_ methods, you need to + // use the default constructor along with appinit(). + + // Byte order. + int get_lorder() const; + void set_lorder(int); + + // Error message callback. + typedef void (*db_errcall_fcn)(const char *, char *); + db_errcall_fcn get_errcall() const; + void set_errcall(db_errcall_fcn); + + // Error message file stream. + FILE *get_errfile() const; + void set_errfile(FILE *); + + // Error message prefix. + const char *get_errpfx() const; + void set_errpfx(const char *); + + // Generate debugging messages. + int get_verbose() const; + void set_verbose(int); + + //////////////////////////////////////////////////////////////// + // User paths. + + // Database home. + char *get_home() const; + void set_home(char *); + + // Database log file directory. + char *get_log_dir() const; + void set_log_dir(char *); + + // Database tmp file directory. + char *get_tmp_dir() const; + void set_tmp_dir(char *); + + // Database data file directories. + char **get_data_dir() const; + void set_data_dir(char **); + + // Database data file slots. + int get_data_cnt() const; + void set_data_cnt(int); + + // Next Database data file slot. + int get_data_next() const; + void set_data_next(int); + + + //////////////////////////////////////////////////////////////// + // Locking. + + // Return from lock_open(). + DbLockTab *get_lk_info() const; + + // Two dimensional conflict matrix. + u_int8_t *get_lk_conflicts() const; + void set_lk_conflicts(u_int8_t *); + + // Number of lock modes in table. + int get_lk_modes() const; + void set_lk_modes(int); + + // Maximum number of locks. 
+ unsigned int get_lk_max() const; + void set_lk_max(unsigned int); + + // Deadlock detect on every conflict. + u_int32_t get_lk_detect() const; + void set_lk_detect(u_int32_t); + + // Yield function for threads. + typedef int (*db_yield_fcn) (void); + db_yield_fcn get_yield() const; + void set_yield(db_yield_fcn); + + + //////////////////////////////////////////////////////////////// + // Logging. + + // Return from log_open(). + DbLog *get_lg_info() const; + + // Maximum file size. + u_int32_t get_lg_max() const; + void set_lg_max(u_int32_t); + + + //////////////////////////////////////////////////////////////// + // Memory pool. + + // Return from memp_open(). + DbMpool *get_mp_info() const; + + // Maximum file size for mmap. + size_t get_mp_mmapsize() const; + void set_mp_mmapsize(size_t); + + // Bytes in the mpool cache. + size_t get_mp_size() const; + void set_mp_size(size_t); + + + //////////////////////////////////////////////////////////////// + // Transactions. + + // Return from txn_open(). + DbTxnMgr *get_tx_info() const; + + // Maximum number of transactions. + unsigned int get_tx_max() const; + void set_tx_max(unsigned int); + + // Dispatch function for recovery. + typedef int (*tx_recover_fcn)(DB_LOG *, DBT *, DB_LSN *, int, void *); + tx_recover_fcn get_tx_recover() const; + void set_tx_recover(tx_recover_fcn); + + // Flags. + u_int32_t get_flags() const; + void set_flags(u_int32_t); + + //////////////////////////////////////////////////////////////// + // The default error model is to throw an exception whenever + // an error occurs. This generally allows for cleaner logic + // for transaction processing, as a try block can surround a + // single transaction. Alternatively, since almost every method + // returns an error code (errno), the error model can be set to + // not throw exceptions, and instead return the appropriate code. 
+ // + enum ErrorModel { Exception, ErrorReturn }; + void set_error_model(ErrorModel); + ErrorModel get_error_model() const; + + // If an error is detected and the error call function + // or stream is set, a message is dispatched or printed. + // If a prefix is set, each message is prefixed. + // + // You can use set_errcall() or set_errfile() above to control + // error functionality using a C model. Alternatively, you can + // call set_error_stream() to force all errors to a C++ stream. + // It is unwise to mix these approaches. + // + class ostream* get_error_stream() const; + void set_error_stream(class ostream*); + + // used internally + static int runtime_error(const char *caller, int err, int in_destructor = 0); + +private: + // We can add data to this class if needed + // since parent class is not allocated by db. + // (see comment at top) + + // no copying + DbEnv(const DbEnv &); + operator = (const DbEnv &); + + ErrorModel error_model_; + static void stream_error_function(const char *, char *); + static ostream *error_stream_; +}; + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Table access classes +// + +// +// Represents a database table = a set of keys with associated values. +// +class _exported Db +{ + friend DbEnv; + +public: + int close(int flags); + int cursor(DbTxn *txnid, Dbc **cursorp); + int del(Dbt *key, DbTxn *txnid); + int fd(int *fdp); + int get(DbTxn *txnid, Dbt *key, Dbt *data, int flags); + int put(DbTxn *txnid, Dbt *key, Dbt *data, int flags); + int stat(void *sp, void *(*db_malloc)(size_t), int flags); + int sync(int flags); + + DBTYPE get_type() const; + + static int open(const char *fname, DBTYPE type, int flags, + int mode, DbEnv *dbenv, DbInfo *info, Db **dbpp); + +private: + // We can add data to this class if needed + // since it is implemented via a pointer. 
+ // (see comment at top) + + // Note: use Db::open() to get initialized pointers to a Db, + // and call Db::close() rather than delete to release them. + Db(); + ~Db(); + + // no copying + Db(const Db &); + Db &operator = (const Db &); + + DEFINE_DB_CLASS(Db); +}; + +// +// A chunk of data, maybe a key or value. +// +class _exported Dbt : private DBT +{ + friend Dbc; + friend Db; + friend DbLog; + friend DbMpoolFile; + friend DbLockTab; + +public: + + // key/data + void *get_data() const; + void set_data(void *); + + // key/data length + u_int32_t get_size() const; + void set_size(u_int32_t); + + // RO: length of user buffer. + u_int32_t get_ulen() const; + void set_ulen(u_int32_t); + + // RO: get/put record length. + u_int32_t get_dlen() const; + void set_dlen(u_int32_t); + + // RO: get/put record offset. + u_int32_t get_doff() const; + void set_doff(u_int32_t); + + // flags + u_int32_t get_flags() const; + void set_flags(u_int32_t); + + Dbt(void *data, size_t size); + Dbt(); + ~Dbt(); + Dbt(const Dbt &); + Dbt &operator = (const Dbt &); + +private: + // We can add data to this class if needed + // since parent class is not allocated by db. + // (see comment at top) +}; + +class _exported Dbc : protected DBC +{ + friend Db; + +public: + int close(); + int del(int flags); + int get(Dbt* key, Dbt *data, int flags); + int put(Dbt* key, Dbt *data, int flags); + +private: + // No data is permitted in this class (see comment at top) + + // Note: use Db::cursor() to get pointers to a Dbc, + // and call Dbc::close() rather than delete to release them. + // + Dbc(); + ~Dbc(); + + // no copying + Dbc(const Dbc &); + Dbc &operator = (const Dbc &); +}; + +#endif /* !_DB_CXX_H_ */ diff --git a/db2/include/db_dispatch.h b/db2/include/db_dispatch.h new file mode 100644 index 0000000000..b93ec39b54 --- /dev/null +++ b/db2/include/db_dispatch.h @@ -0,0 +1,73 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. 
All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)db_dispatch.h 10.1 (Sleepycat) 4/12/97 + */ + +#ifndef _DB_DISPATCH_H +#define _DB_DISPATCH_H + +/* + * Declarations and typedefs for the list of transaction IDs used during + * recovery. + */ + +typedef struct __db_txnhead { + LIST_HEAD(__db_headlink, _db_txnlist) head; + u_int32_t maxid; +} __db_txnhead; + +typedef struct _db_txnlist { + LIST_ENTRY(_db_txnlist) links; + u_int32_t txnid; +} __db_txnlist; + +#define DB_log_BEGIN 0 +#define DB_txn_BEGIN 5 +#define DB_ham_BEGIN 20 +#define DB_db_BEGIN 40 +#define DB_bam_BEGIN 50 +#define DB_ram_BEGIN 100 +#define DB_user_BEGIN 150 + +#define TXN_UNDO 0 +#define TXN_REDO 1 +#define TXN_BACKWARD_ROLL -1 +#define TXN_FORWARD_ROLL -2 +#define TXN_OPENFILES -3 +#endif diff --git a/db2/include/db_ext.h b/db2/include/db_ext.h new file mode 100644 index 0000000000..1cccb47617 --- /dev/null +++ b/db2/include/db_ext.h @@ -0,0 +1,114 @@ +/* Do not edit: automatically built by dist/distrib. */ +int __db_pgerr __P((DB *, db_pgno_t)); +int __db_pgfmt __P((DB *, db_pgno_t)); +int __db_addrem_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, u_int32_t, db_pgno_t, u_int32_t, + size_t, DBT *, DBT *, DB_LSN *)); +int __db_addrem_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_addrem_read __P((void *, __db_addrem_args **)); +int __db_split_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, u_int32_t, db_pgno_t, DBT *, + DB_LSN *)); +int __db_split_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_split_read __P((void *, __db_split_args **)); +int __db_big_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, + db_pgno_t, DBT *, DB_LSN *, DB_LSN *, + DB_LSN *)); +int __db_big_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_big_read __P((void *, __db_big_args **)); +int __db_ovref_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *)); +int __db_ovref_print + __P((DB_LOG *, 
DBT *, DB_LSN *, int, void *)); +int __db_ovref_read __P((void *, __db_ovref_args **)); +int __db_relink_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + DB_LSN *, db_pgno_t, DB_LSN *)); +int __db_relink_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_relink_read __P((void *, __db_relink_args **)); +int __db_addpage_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + DB_LSN *)); +int __db_addpage_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_addpage_read __P((void *, __db_addpage_args **)); +int __db_debug_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + DBT *, u_int32_t, DBT *, DBT *, + u_int32_t)); +int __db_debug_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_debug_read __P((void *, __db_debug_args **)); +int __db_noop_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t)); +int __db_noop_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_noop_read __P((void *, __db_noop_args **)); +int __db_init_print __P((DB_ENV *)); +int __db_init_recover __P((DB_ENV *)); +int __db_pgin __P((db_pgno_t, void *)); +int __db_pgout __P((db_pgno_t, void *)); +int __db_dispatch __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_add_recovery __P((DB_ENV *, + int (*)(DB_LOG *, DBT *, DB_LSN *, int, void *), u_int32_t)); +int __db_txnlist_init __P((void *)); +int __db_txnlist_add __P((void *, u_int32_t)); +int __db_txnlist_find __P((void *, u_int32_t)); +int __db_dput __P((DB *, + DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); +int __db_drem __P((DB *, + PAGE **, u_int32_t, int (*)(DB *, PAGE *))); +int __db_dend __P((DB *, db_pgno_t, PAGE **)); + int __db_ditem __P((DB *, PAGE *, int, u_int32_t)); +int __db_pitem + __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); +int __db_relink __P((DB *, PAGE *, PAGE **, int)); +int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); +int 
__db_goff __P((DB *, DBT *, + u_int32_t, db_pgno_t, void **, u_int32_t *)); +int __db_poff __P((DB *, const DBT *, db_pgno_t *, + int (*)(DB *, u_int32_t, PAGE **))); +int __db_ioff __P((DB *, db_pgno_t)); +int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); +int __db_moff __P((DB *, const DBT *, db_pgno_t)); +void __db_loadme __P((void)); +FILE *__db_prinit __P((FILE *)); +int __db_dump __P((DB *, char *, int)); +int __db_prdb __P((DB *)); +int __db_prbtree __P((DB *)); +int __db_prhash __P((DB *)); +int __db_prtree __P((DB_MPOOLFILE *, int)); +int __db_prnpage __P((DB_MPOOLFILE *, db_pgno_t)); +int __db_prpage __P((PAGE *, int)); +int __db_isbad __P((PAGE *, int)); +void __db_pr __P((u_int8_t *, u_int32_t)); +void __db_prflags __P((u_int32_t, const FN *)); +int __db_addrem_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_split_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_big_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_ovref_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_relink_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_addpage_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_debug_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_noop_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __db_ret __P((DB *, + PAGE *, u_int32_t, DBT *, void **, u_int32_t *)); +int __db_retcopy __P((DBT *, + void *, u_int32_t, void **, u_int32_t *, void *(*)(size_t))); +int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **)); +int __db_puthandle __P((DB *)); diff --git a/db2/include/db_int.h.src b/db2/include/db_int.h.src new file mode 100644 index 0000000000..b60e5002e5 --- /dev/null +++ b/db2/include/db_int.h.src @@ -0,0 +1,332 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ * + * @(#)db_int.h.src 10.28 (Sleepycat) 8/20/97 + */ + +#ifndef _DB_INTERNAL_H_ +#define _DB_INTERNAL_H_ + +#include "db.h" /* Standard DB include file. */ +#include "queue.h" +#include "os_ext.h" + +/******************************************************* + * General purpose constants and macros. + *******************************************************/ +#define UINT32_T_MAX 0xffffffff /* Maximum 32 bit unsigned. */ +#define UINT16_T_MAX 0xffff /* Maximum 16 bit unsigned. */ + +#define DB_MIN_PGSIZE 0x000200 /* Minimum page size. */ +#define DB_MAX_PGSIZE 0x010000 /* Maximum page size. */ + +#define DB_MINCACHE 10 /* Minimum cached pages */ + +/* + * Aligning items to particular sizes or in pages or memory. ALIGNP is a + * separate macro, as we've had to cast the pointer to different integral + * types on different architectures. + * + * We cast pointers into unsigned longs when manipulating them because C89 + * guarantees that u_long is the largest available integral type and further, + * to never generate overflows. However, neither C89 nor C9X requires that + * any integer type be large enough to hold a pointer, although C9X created + * the intptr_t type, which is guaranteed to hold a pointer but may or may + * not exist. At some point in the future, we should test for intptr_t and + * use it where available. + */ +#undef ALIGNTYPE +#define ALIGNTYPE u_long +#undef ALIGNP +#define ALIGNP(value, bound) ALIGN((ALIGNTYPE)(value), bound) +#undef ALIGN +#define ALIGN(value, bound) (((value) + (bound) - 1) & ~((bound) - 1)) + +/* + * There are several on-page structures that are declared to have a number of + * fields followed by a variable length array of items. The structure size + * without including the variable length array or the address of the first of + * those elements can be found using SSZ. + * + * This macro can also be used to find the offset of a structure element in a + * structure. 
This is used in various places to copy structure elements from + * unaligned memory references, e.g., pointers into a packed page. + * + * There are two versions because compilers object if you take the address of + * an array. + */ +#undef SSZ +#define SSZ(name, field) ((int)&(((name *)0)->field)) + +#undef SSZA +#define SSZA(name, field) ((int)&(((name *)0)->field[0])) + +/* Free and free-string macros that overwrite memory during debugging. */ +#ifdef DEBUG +#undef FREE +#define FREE(p, len) { \ + memset(p, 0xff, len); \ + free(p); \ +} +#undef FREES +#define FREES(p) { \ + FREE(p, strlen(p)); \ +} +#else +#undef FREE +#define FREE(p, len) { \ + free(p); \ +} +#undef FREES +#define FREES(p) { \ + free(p); \ +} +#endif + +/* Structure used to print flag values. */ +typedef struct __fn { + u_int32_t mask; /* Flag value. */ + char *name; /* Flag name. */ +} FN; + +/* Set, clear and test flags. */ +#define F_SET(p, f) (p)->flags |= (f) +#define F_CLR(p, f) (p)->flags &= ~(f) +#define F_ISSET(p, f) ((p)->flags & (f)) +#define LF_SET(f) (flags |= (f)) +#define LF_CLR(f) (flags &= ~(f)) +#define LF_ISSET(f) (flags & (f)) + +/* Display separator string. */ +#undef DB_LINE +#define DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + +/******************************************************* + * Files. + *******************************************************/ +#ifndef MAXPATHLEN /* Maximum path length. */ +#ifdef PATH_MAX +#define MAXPATHLEN PATH_MAX +#else +#define MAXPATHLEN 1024 +#endif +#endif + +#define PATH_DOT "." /* Current working directory. */ +#define PATH_SEPARATOR "/" /* Path separator character. */ + +#ifndef S_IRUSR /* UNIX specific file permissions. 
*/ +#define S_IRUSR 0000400 /* R for owner */ +#define S_IWUSR 0000200 /* W for owner */ +#define S_IRGRP 0000040 /* R for group */ +#define S_IWGRP 0000020 /* W for group */ +#define S_IROTH 0000004 /* R for other */ +#define S_IWOTH 0000002 /* W for other */ +#endif + +#ifndef S_ISDIR /* UNIX specific: directory test. */ +#define S_ISDIR(m) (((m) & 0170000) == 0040000) +#endif + +/******************************************************* + * Mutex support. + *******************************************************/ +@spin_line1@ +@spin_line2@ +@spin_line3@ + +/* + * !!! + * Various systems require different alignments for mutexes (the worst we've + * seen so far is 16-bytes on some HP architectures). The mutex (tsl_t) must + * be first in the db_mutex_t structure, which must itself be first in the + * region. This ensures the alignment is as returned by mmap(2), which should + * be sufficient. All other mutex users must ensure proper alignment locally. + */ +#define MUTEX_ALIGNMENT @mutex_align@ + +/* + * The offset of a mutex in memory. + */ +#define MUTEX_LOCK_OFFSET(a, b) ((off_t)((u_int8_t *)(b) - (u_int8_t *)(a))) + +typedef struct _db_mutex_t { +#ifdef HAVE_SPINLOCKS + tsl_t tsl_resource; /* Resource test and set. */ +#ifdef DEBUG + u_long pid; /* Lock holder: 0 or process pid. */ +#endif +#else + off_t off; /* Backing file offset. */ + u_long pid; /* Lock holder: 0 or process pid. */ +#endif +#ifdef MUTEX_STATISTICS + u_long mutex_set_wait; /* Blocking mutex: required waiting. */ + u_long mutex_set_nowait; /* Blocking mutex: without waiting. */ +#endif +} db_mutex_t; + +#include "mutex_ext.h" + +/******************************************************* + * Access methods. + *******************************************************/ +/* Lock/unlock a DB thread. */ +#define DB_THREAD_LOCK(dbp) \ + (F_ISSET(dbp, DB_AM_THREAD) ? \ + __db_mutex_lock((db_mutex_t *)(dbp)->mutex, -1, \ + (dbp)->dbenv == NULL ? 
NULL : (dbp)->dbenv->db_yield) : 0) +#define DB_THREAD_UNLOCK(dbp) \ + (F_ISSET(dbp, DB_AM_THREAD) ? \ + __db_mutex_unlock((db_mutex_t *)(dbp)->mutex, -1) : 0) + +/* Btree/recno local statistics structure. */ +struct __db_bt_lstat; typedef struct __db_bt_lstat DB_BTREE_LSTAT; +struct __db_bt_lstat { + u_int32_t bt_freed; /* Pages freed for reuse. */ + u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ + u_int32_t bt_split; /* Total number of splits. */ + u_int32_t bt_rootsplit; /* Root page splits. */ + u_int32_t bt_fastsplit; /* Fast splits. */ + u_int32_t bt_added; /* Items added. */ + u_int32_t bt_deleted; /* Items deleted. */ + u_int32_t bt_get; /* Items retrieved. */ + u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ + u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ +}; + +/******************************************************* + * Environment. + *******************************************************/ +/* Type passed to __db_appname(). */ +typedef enum { + DB_APP_NONE=0, /* No type (region). */ + DB_APP_DATA, /* Data file. */ + DB_APP_LOG, /* Log file. */ + DB_APP_TMP /* Temporary file. */ +} APPNAME; + +/******************************************************* + * Regions. + *******************************************************/ +/* + * The shared memory regions share an initial structure so that the general + * region code can handle races between the region being deleted and other + * processes waiting on the region mutex. + * + * !!! + * Note, the mutex must be the first entry in the region; see comment above. + */ +typedef struct _rlayout { + db_mutex_t lock; /* Region mutex. */ + u_int32_t refcnt; /* Region reference count. */ + size_t size; /* Region length. */ + int majver; /* Major version number. */ + int minver; /* Minor version number. */ + int patch; /* Patch version number. */ + +#define DB_R_DELETED 0x01 /* Region was deleted. 
*/ + u_int32_t flags; +} RLAYOUT; + +/******************************************************* + * Mpool. + *******************************************************/ +/* + * File types for DB access methods. Negative numbers are reserved to DB. + */ +#define DB_FTYPE_BTREE -1 /* Btree. */ +#define DB_FTYPE_HASH -2 /* Hash. */ + +/* Structure used as the DB pgin/pgout pgcookie. */ +typedef struct __dbpginfo { + size_t db_pagesize; /* Underlying page size. */ + int needswap; /* If swapping required. */ +} DB_PGINFO; + +/******************************************************* + * Log. + *******************************************************/ +/* Initialize an LSN to 'zero'. */ +#define ZERO_LSN(LSN) { \ + (LSN).file = 0; \ + (LSN).offset = 0; \ +} + +/* Return 1 if LSN is a 'zero' lsn, otherwise return 0. */ +#define IS_ZERO_LSN(LSN) ((LSN).file == 0) + +/* Test if we need to log a change. */ +#define DB_LOGGING(dbp) \ + (F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER)) + +#ifdef DEBUG +/* + * Debugging macro to log operations. + * If DEBUG_WOP is defined, log operations that modify the database. + * If DEBUG_ROP is defined, log operations that read the database. 
+ * + * D dbp + * T txn + * O operation (string) + * K key + * A data + * F flags + */ +#define LOG_OP(D, T, O, K, A, F) { \ + DB_LSN _lsn; \ + DBT _op; \ + if (DB_LOGGING((D))) { \ + memset(&_op, 0, sizeof(_op)); \ + _op.data = O; \ + _op.size = strlen(O) + 1; \ + (void)__db_debug_log((D)->dbenv->lg_info, \ + T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F); \ + } \ +} +#ifdef DEBUG_ROP +#define DEBUG_LREAD(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#else +#define DEBUG_LREAD(D, T, O, K, A, F) +#endif +#ifdef DEBUG_WOP +#define DEBUG_LWRITE(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#else +#define DEBUG_LWRITE(D, T, O, K, A, F) +#endif +#else +#define DEBUG_LREAD(D, T, O, K, A, F) +#define DEBUG_LWRITE(D, T, O, K, A, F) +#endif /* DEBUG */ + +/******************************************************* + * Transactions and recovery. + *******************************************************/ +/* + * The locker id space is divided between the transaction manager and the lock + * manager. Lockid's start at 0 and go to MAX_LOCKER_ID. Txn Id's start at + * MAX_LOCKER_ID + 1 and go up to MAX_TXNID. + */ +#define MAX_LOCKER_ID 0x0fffffff +#define MAX_TXNID 0xffffffff + +/* + * Out of band value for a lock. The locks are returned to callers as offsets + * into the lock regions. Since the RLAYOUT structure begins all regions, an + * offset of 0 is guaranteed not to be a valid lock. + */ +#define LOCK_INVALID 0 + +/* The structure allocated for every transaction. */ +struct __db_txn { + DB_TXNMGR *mgrp; /* Pointer to transaction manager. */ + DB_TXN *parent; /* Pointer to transaction's parent. */ + DB_LSN last_lsn; /* Lsn of last log write. */ + u_int32_t txnid; /* Unique transaction id. */ + size_t off; /* Detail structure within region. 
*/ + TAILQ_ENTRY(__db_txn) links; +}; +#endif /* !_DB_INTERNAL_H_ */ diff --git a/db2/include/db_page.h b/db2/include/db_page.h new file mode 100644 index 0000000000..9e78682c57 --- /dev/null +++ b/db2/include/db_page.h @@ -0,0 +1,535 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db_page.h 10.10 (Sleepycat) 8/18/97 + */ + +#ifndef _DB_PAGE_H_ +#define _DB_PAGE_H_ + +/* + * DB page formats. + * + * This implementation requires that values within the following structures + * NOT be padded -- note, ANSI C permits random padding within structures. + * If your compiler pads randomly you can just forget ever making DB run on + * your system. In addition, no data type can require larger alignment than + * its own size, e.g., a 4-byte data element may not require 8-byte alignment. + * + * Note that key/data lengths are often stored in db_indx_t's -- this is + * not accidental, nor does it limit the key/data size. If the key/data + * item fits on a page, it's guaranteed to be small enough to fit into a + * db_indx_t, and storing it in one saves space. + */ + +#define PGNO_METADATA 0 /* Metadata page number. */ +#define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */ +#define PGNO_ROOT 1 /* Root is page #1. */ + +/************************************************************************ + BTREE METADATA PAGE LAYOUT + ************************************************************************/ + +/* + * Btree metadata page layout: + * + * +-----------------------------------+ + * | lsn | pgno | magic | + * +-----------------------------------+ + * | version | pagesize | free | + * +-----------------------------------+ + * | flags | unused ... | + * +-----------------------------------+ + */ +typedef struct _btmeta { + DB_LSN lsn; /* 00-07: LSN. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + u_int32_t magic; /* 12-15: Magic number. 
*/ + u_int32_t version; /* 16-19: Version. */ + u_int32_t pagesize; /* 20-23: Pagesize. */ + u_int32_t maxkey; /* 24-27: Btree: Maxkey. */ + u_int32_t minkey; /* 28-31: Btree: Minkey. */ + u_int32_t free; /* 32-35: Free list page number. */ +#define BTM_DUP 0x001 /* Duplicates. */ +#define BTM_RECNO 0x002 /* Recno tree. */ +#define BTM_RECNUM 0x004 /* Btree: maintain record count. */ +#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */ +#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */ +#define BTM_MASK 0x01f + u_int32_t flags; /* 36-39: Flags. */ + u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */ + u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */ + /* 48-67: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; + + u_int32_t spare[13]; /* 68-123: Save some room for growth. */ + + DB_BTREE_LSTAT stat; /* 124-163: Statistics. */ +} BTMETA; + +/************************************************************************ + HASH METADATA PAGE LAYOUT + ************************************************************************/ + +/* + * Hash metadata page layout: + * + * +-----------------------------------+ + * | lsn | magic | version | + * +-----------------------------------+ + * | pagesize | ovfl_point| last_freed| + * +-----------------------------------+ + * | max_bucket| high_mask | low_mask | + * +-----------------------------------+ + * | ffactor | nelem | charkey | + * +-----------------------------------+ + * | spares[32]| flags | unused | + * +-----------------------------------+ + */ +/* Hash Table Information */ +typedef struct hashhdr { /* Disk resident portion */ + DB_LSN lsn; /* 00-07: LSN of the header page */ + db_pgno_t pgno; /* 08-11: Page number (btree compatibility). 
*/ + u_int32_t magic; /* 12-15: Magic NO for hash tables */ + u_int32_t version; /* 16-19: Version ID */ + u_int32_t pagesize; /* 20-23: Bucket/Page Size */ + u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */ + u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */ + u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */ + u_int32_t high_mask; /* 36-39: Modulo mask into table */ + u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */ + u_int32_t ffactor; /* 44-47: Fill factor */ + u_int32_t nelem; /* 48-51: Number of keys in hash table */ + u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */ +#define DB_HASH_DUP 0x01 + u_int32_t flags; /* 56-59: Allow duplicates. */ +#define NCACHED 32 /* number of spare points */ + /* 60-187: Spare pages for overflow */ + u_int32_t spares[NCACHED]; + /* 188-207: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; + + /* + * Minimum page size is 256. + */ +} HASHHDR; + +/************************************************************************ + MAIN PAGE LAYOUT + ************************************************************************/ + +/* + * +-----------------------------------+ + * | lsn | pgno | prev pgno | + * +-----------------------------------+ + * | next pgno | entries | hf offset | + * +-----------------------------------+ + * | level | type | index | + * +-----------------------------------+ + * | index | free --> | + * +-----------+-----------------------+ + * | F R E E A R E A | + * +-----------------------------------+ + * | <-- free | item | + * +-----------------------------------+ + * | item | item | item | + * +-----------------------------------+ + * + * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be + * two-byte aligned. + * + * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the + * key for inp[1]'s data. All other types of pages only contain single items. 
+ */ +typedef struct _db_page { + DB_LSN lsn; /* 00-07: Log sequence number. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + db_pgno_t prev_pgno; /* 12-15: Previous page number. */ + db_pgno_t next_pgno; /* 16-19: Next page number. */ + db_indx_t entries; /* 20-21: Number of item pairs on the page. */ + db_indx_t hf_offset; /* 22-23: High free byte page offset. */ + + /* + * The btree levels are numbered from the leaf to the root, starting + * with 1, so the leaf is level 1, its parent is level 2, and so on. + * We maintain this level on all btree pages, but the only place that + * we actually need it is on the root page. It would not be difficult + * to hide the byte on the root page once it becomes an internal page, + * so we could get this byte back if we needed it for something else. + */ +#define LEAFLEVEL 1 +#define MAXBTREELEVEL 255 + u_int8_t level; /* 24: Btree tree level. */ + +#define P_INVALID 0 /* Invalid page type. */ +#define P_DUPLICATE 1 /* Duplicate. */ +#define P_HASH 2 /* Hash. */ +#define P_IBTREE 3 /* Btree internal. */ +#define P_IRECNO 4 /* Recno internal. */ +#define P_LBTREE 5 /* Btree leaf. */ +#define P_LRECNO 6 /* Recno leaf. */ +#define P_OVERFLOW 7 /* Overflow. */ + u_int8_t type; /* 25: Page type. */ + db_indx_t inp[1]; /* Variable length index of items. */ +} PAGE; + +/* Element macros. */ +#define LSN(p) (((PAGE *)p)->lsn) +#define PGNO(p) (((PAGE *)p)->pgno) +#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno) +#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno) +#define NUM_ENT(p) (((PAGE *)p)->entries) +#define HOFFSET(p) (((PAGE *)p)->hf_offset) +#define LEVEL(p) (((PAGE *)p)->level) +#define TYPE(p) (((PAGE *)p)->type) + +/* + * !!! + * The next_pgno and prev_pgno fields are not maintained for btree and recno + * internal pages. It's a minor performance improvement, and more, it's + * hard to do when deleting internal pages, and it decreases the chance of + * deadlock during deletes and splits. + * + * !!! 
+ * The btree/recno access method needs db_recno_t bytes of space on the root + * page to specify how many records are stored in the tree. (The alternative + * is to store the number of records in the meta-data page, which will create + * a second hot spot in trees being actively modified, or recalculate it from + * the BINTERNAL fields on each access.) Overload the prev_pgno field. + */ +#define RE_NREC(p) \ + (TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : \ + TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p)) +#define RE_NREC_ADJ(p, adj) \ + PREV_PGNO(p) += adj; +#define RE_NREC_SET(p, num) \ + PREV_PGNO(p) = num; + +/* + * Initialize a page. + * + * !!! + * Don't modify the page's LSN, code depends on it being unchanged after a + * P_INIT call. + */ +#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \ + PGNO(pg) = n; \ + PREV_PGNO(pg) = pg_prev; \ + NEXT_PGNO(pg) = pg_next; \ + NUM_ENT(pg) = 0; \ + HOFFSET(pg) = pg_size; \ + LEVEL(pg) = btl; \ + TYPE(pg) = pg_type; \ +} while (0) + +/* Page header length (offset to first index). */ +#define P_OVERHEAD (SSZA(PAGE, inp)) + +/* First free byte. */ +#define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t)) + +/* Free space on the page. */ +#define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg)) + +/* Get a pointer to the bytes at a specific index. */ +#define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx]) + +/************************************************************************ + OVERFLOW PAGE LAYOUT + ************************************************************************/ + +/* + * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which + * store a page number (the first page of the overflow item) and a length + * (the total length of the overflow item). The overflow item consists of + * some number of overflow pages, linked by the next_pgno field of the page. + * A next_pgno field of PGNO_INVALID flags the end of the overflow item. 
+ * + * Overflow page overloads: + * The amount of overflow data stored on each page is stored in the + * hf_offset field. + * + * The implementation reference counts overflow items as it's possible + * for them to be promoted onto btree internal pages. The reference + * count is stored in the entries field. + */ +#define OV_LEN(p) (((PAGE *)p)->hf_offset) +#define OV_REF(p) (((PAGE *)p)->entries) + +/* Maximum number of bytes that you can put on an overflow page. */ +#define P_MAXSPACE(psize) ((psize) - P_OVERHEAD) + +/************************************************************************ + HASH PAGE LAYOUT + ************************************************************************/ + +/* Each index references a group of bytes on the page. */ +#define H_KEYDATA 1 /* Key/data item. */ +#define H_DUPLICATE 2 /* Duplicate key/data item. */ +#define H_OFFPAGE 3 /* Overflow key/data item. */ +#define H_OFFDUP 4 /* Overflow page of duplicates. */ + +/* + * The first and second types are H_KEYDATA and H_DUPLICATE, represented + * by the HKEYDATA structure: + * + * +-----------------------------------+ + * | type | key/data ... | + * +-----------------------------------+ + * + * For duplicates, the data field encodes duplicate elements in the data + * field: + * + * +---------------------------------------------------------------+ + * | type | len1 | element1 | len1 | len2 | element2 | len2 | + * +---------------------------------------------------------------+ + * + * Thus, by keeping track of the offset in the element, we can do both + * backward and forward traversal. + */ +typedef struct _hkeydata { + u_int8_t type; /* 00: Page type. */ + u_int8_t data[1]; /* Variable length key/data item. */ +} HKEYDATA; + +/* Get a HKEYDATA item for a specific index. */ +#define GET_HKEYDATA(pg, indx) \ + ((HKEYDATA *)P_ENTRY(pg, indx)) + +/* + * The length of any HKEYDATA item. Note that indx is an element index, + * not a PAIR index. 
+ */ +#define LEN_HITEM(pg, pgsize, indx) \ + (((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx]) + +#define LEN_HKEYDATA(pg, psize, indx) \ + (((indx) == 0 ? psize : pg->inp[indx - 1]) - \ + pg->inp[indx] - HKEYDATA_SIZE(0)) + +/* + * Page space required to add a new HKEYDATA item to the page, with and + * without the index value. + */ +#define HKEYDATA_SIZE(len) \ + ((len) + SSZA(HKEYDATA, data)) +#define HKEYDATA_PSIZE(len) \ + (HKEYDATA_SIZE(len) + sizeof(db_indx_t)) + +/* Put a HKEYDATA item at the location referenced by a page entry. */ +#define PUT_HKEYDATA(pe, kd, len, type) { \ + ((HKEYDATA *)pe)->type = type; \ + memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \ +} + +/* + * Macros that describe the page layout in terms of key-data pairs. + * The use of "pindx" indicates that the argument is the index + * expressed in pairs instead of individual elements. + */ +#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2) +#define H_KEYINDEX(pindx) (2 * (pindx)) +#define H_DATAINDEX(pindx) ((2 * (pindx)) + 1) +#define H_PAIRKEY(pg, pindx) GET_HKEYDATA(pg, H_KEYINDEX(pindx)) +#define H_PAIRDATA(pg, pindx) GET_HKEYDATA(pg, H_DATAINDEX(pindx)) +#define H_PAIRSIZE(pg, psize, pindx) \ + (LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) + \ + LEN_HITEM(pg, psize, H_DATAINDEX(pindx))) +#define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx)) +#define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx)) + +/* + * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure: + * + * +-----------------------------------+ + * | type | pgno_t | total len | + * +-----------------------------------+ + */ +typedef struct _hoffpage { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t unused[3]; /* 01-03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Offpage page number. */ + u_int32_t tlen; /* 08-11: Total length of item. */ +} HOFFPAGE; + +/* Get a HOFFPAGE item for a specific index. 
*/ +#define GET_HOFFPAGE(pg, indx) \ + ((HOFFPAGE *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new HOFFPAGE item to the page, with and + * without the index value. + */ +#define HOFFPAGE_SIZE (sizeof(HOFFPAGE)) +#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t)) + +/* + * The fourth type is H_OFFDUP represented by the HOFFDUP structure: + * + * +-----------------------+ + * | type | pgno_t | + * +-----------------------+ + */ +typedef struct _hoffdup { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t unused[3]; /* 01-03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Offpage page number. */ +} HOFFDUP; + +/* Get a HOFFDUP item for a specific index. */ +#define GET_HOFFDUP(pg, indx) \ + ((HOFFDUP *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new HOFFDUP item to the page, with and + * without the index value. + */ +#define HOFFDUP_SIZE (sizeof(HOFFDUP)) +#define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t)) + +/************************************************************************ + BTREE PAGE LAYOUT + ************************************************************************/ + +/* Each index references a group of bytes on the page. */ +#define B_KEYDATA 1 /* Key/data item. */ +#define B_DUPLICATE 2 /* Duplicate key/data item. */ +#define B_OVERFLOW 3 /* Overflow key/data item. */ + +/* + * The first type is B_KEYDATA, represented by the BKEYDATA structure: + * + * +-----------------------------------+ + * | length | type | key/data | + * +-----------------------------------+ + */ +typedef struct _bkeydata { + db_indx_t len; /* 00-01: Key/data item length. */ + u_int deleted :1; /* 02: Page type and delete flag. */ + u_int type :7; + u_int8_t data[1]; /* Variable length key/data item. */ +} BKEYDATA; + +/* Get a BKEYDATA item for a specific index. 
*/ +#define GET_BKEYDATA(pg, indx) \ + ((BKEYDATA *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new BKEYDATA item to the page, with and + * without the index value. + */ +#define BKEYDATA_SIZE(len) \ + ALIGN((len) + SSZA(BKEYDATA, data), 4) +#define BKEYDATA_PSIZE(len) \ + (BKEYDATA_SIZE(len) + sizeof(db_indx_t)) + +/* + * The second and third types are B_DUPLICATE and B_OVERFLOW, represented + * by the BOVERFLOW structure: + * + * +-----------------------------------+ + * | total len | type | unused | + * +-----------------------------------+ + * | nxt: page | nxt: off | nxt: len | + * +-----------------------------------+ + */ +typedef struct _boverflow { + db_indx_t unused1; /* 00-01: Padding, unused. */ + u_int deleted :1; /* 02: Page type and delete flag. */ + u_int type :7; + u_int8_t unused2; /* 03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Next page number. */ + u_int32_t tlen; /* 08-11: Total length of item. */ +} BOVERFLOW; + +/* Get a BOVERFLOW item for a specific index. */ +#define GET_BOVERFLOW(pg, indx) \ + ((BOVERFLOW *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new BOVERFLOW item to the page, with and + * without the index value. + */ +#define BOVERFLOW_SIZE \ + ALIGN(sizeof(BOVERFLOW), 4) +#define BOVERFLOW_PSIZE \ + (BOVERFLOW_SIZE + sizeof(db_indx_t)) + +/* + * Btree leaf and hash page layouts group indices in sets of two, one + * for the key and one for the data. Everything else does it in sets + * of one to save space. I use the following macros so that it's real + * obvious what's going on... + */ +#define O_INDX 1 +#define P_INDX 2 + +/************************************************************************ + BTREE INTERNAL PAGE LAYOUT + ************************************************************************/ + +/* + * Btree internal entry. + * + * +-----------------------------------+ + * | leaf pgno | type | data ... 
| + * +-----------------------------------+ + */ +typedef struct _binternal { + db_indx_t len; /* 00-01: Key/data item length. */ + u_int deleted :1; /* 02: Page type and delete flag. */ + u_int type :7; + u_int8_t unused; /* 03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Page number of referenced page. */ + db_recno_t nrecs; /* 08-11: Subtree record count. */ + u_int8_t data[1]; /* Variable length key item. */ +} BINTERNAL; + +/* Get a BINTERNAL item for a specific index. */ +#define GET_BINTERNAL(pg, indx) \ + ((BINTERNAL *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new BINTERNAL item to the page, with and + * without the index value. + */ +#define BINTERNAL_SIZE(len) \ + ALIGN((len) + SSZA(BINTERNAL, data), 4) +#define BINTERNAL_PSIZE(len) \ + (BINTERNAL_SIZE(len) + sizeof(db_indx_t)) + +/************************************************************************ + RECNO INTERNAL PAGE LAYOUT + ************************************************************************/ + +/* + * The recno internal entry. + * + * +-----------------------+ + * | leaf pgno | # of recs | + * +-----------------------+ + * + * XXX + * Why not fold this into the db_indx_t structure, it's fixed length. + */ +typedef struct _rinternal { + db_pgno_t pgno; /* 00-03: Page number of referenced page. */ + db_recno_t nrecs; /* 04-07: Subtree record count. */ +} RINTERNAL; + +/* Get a RINTERNAL item for a specific index. */ +#define GET_RINTERNAL(pg, indx) \ + ((RINTERNAL *)P_ENTRY(pg, indx)) + +/* + * Page space required to add a new RINTERNAL item to the page, with and + * without the index value. + */ +#define RINTERNAL_SIZE \ + ALIGN(sizeof(RINTERNAL), 4) +#define RINTERNAL_PSIZE \ + (RINTERNAL_SIZE + sizeof(db_indx_t)) +#endif /* _DB_PAGE_H_ */ diff --git a/db2/include/db_shash.h b/db2/include/db_shash.h new file mode 100644 index 0000000000..f695a2bafa --- /dev/null +++ b/db2/include/db_shash.h @@ -0,0 +1,106 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)db_shash.h 10.1 (Sleepycat) 4/12/97 + */ + +/* Hash Headers */ +typedef SH_TAILQ_HEAD(hash_head) DB_HASHTAB; + +/* + * __db_hashlookup -- + * + * Look up something in a shared memory hash table. The "elt" argument + * should be a key, and cmp_func must know how to compare a key to whatever + * structure it is that appears in the hash table. The comparison function + * cmp_func is called as: cmp_func(lookup_elt, table_elt); + * begin: address of the beginning of the hash table. + * type: the structure type of the elements that are linked in each bucket. + * field: the name of the field by which the "type" structures are linked. + * elt: the item for which we are searching in the hash table. + * result: the variable into which we'll store the element if we find it. + * nelems: the number of buckets in the hash table. + * hash_func: the hash function that operates on elements of the type of elt + * cmp_func: compare elements of the type of elt with those in the table (of + * type "type"). + * + * If the element is not in the hash table, this macro exits with result + * set to NULL. + */ +#define __db_hashlookup(begin, type, field, elt, r, n, hash, cmp) do { \ + DB_HASHTAB *__bucket; \ + u_int32_t __ndx; \ + \ + __ndx = hash(elt) % (n); \ + __bucket = &begin[__ndx]; \ + for (r = SH_TAILQ_FIRST(__bucket, type); \ + r != NULL; r = SH_TAILQ_NEXT(r, field, type)) \ + if (cmp(elt, r)) \ + break; \ +} while(0) + +/* + * __db_hashinsert -- + * + * Insert a new entry into the hash table. This assumes that lookup has + * failed; don't call it if you haven't already called __db_hashlookup. + * begin: the beginning address of the hash table. + * type: the structure type of the elements that are linked in each bucket. + * field: the name of the field by which the "type" structures are linked. + * elt: the item to be inserted. + * nelems: the number of buckets in the hash table. 
+ * hash_func: the hash function that operates on elements of the type of elt + */ +#define __db_hashinsert(begin, type, field, elt, n, hash) do { \ + u_int32_t __ndx; \ + DB_HASHTAB *__bucket; \ + \ + __ndx = hash(elt) % (n); \ + __bucket = &begin[__ndx]; \ + SH_TAILQ_INSERT_HEAD(__bucket, elt, field, type); \ +} while(0) + +/* + * __db_hashremove -- + * Remove the entry with a key == elt. + * begin: address of the beginning of the hash table. + * type: the structure type of the elements that are linked in each bucket. + * field: the name of the field by which the "type" structures are linked. + * elt: the item to be deleted. + * nelems: the number of buckets in the hash table. + * hash_func: the hash function that operates on elements of the type of elt + * cmp_func: compare elements of the type of elt with those in the table (of + * type "type"). + */ +#define __db_hashremove(begin, type, field, elt, n, hash, cmp) { \ + u_int32_t __ndx; \ + DB_HASHTAB *__bucket; \ + SH_TAILQ_ENTRY *__entp; \ + \ + __ndx = hash(elt) % (n); \ + __bucket = &begin[__ndx]; \ + __db_hashlookup(begin, type, field, elt, __entp, n, hash, cmp); \ + SH_TAILQ_REMOVE(__bucket, __entp, field, type); \ +} + +/* + * __db_hashremove_el -- + * Given the object "obj" in the table, remove it. + * begin: address of the beginning of the hash table. + * type: the structure type of the elements that are linked in each bucket. + * field: the name of the field by which the "type" structures are linked. + * obj: the object in the table that we wish to delete. + * nelems: the number of buckets in the hash table. 
+ * hash_func: the hash function that operates on elements of the type of elt + */ +#define __db_hashremove_el(begin, type, field, obj, n, hash) { \ + u_int32_t __ndx; \ + DB_HASHTAB *__bucket; \ + \ + __ndx = hash(obj) % (n); \ + __bucket = &begin[__ndx]; \ + SH_TAILQ_REMOVE(__bucket, obj, field, type); \ +} diff --git a/db2/include/db_swap.h b/db2/include/db_swap.h new file mode 100644 index 0000000000..278282f5e4 --- /dev/null +++ b/db2/include/db_swap.h @@ -0,0 +1,105 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)db_swap.h 10.3 (Sleepycat) 6/10/97 + */ + +#ifndef _DB_SWAP_H_ +#define _DB_SWAP_H_ + +/* + * Little endian <==> big endian 32-bit swap macros. + * M_32_SWAP swap a memory location + * P_32_COPY copy potentially unaligned 4 byte quantities + * P_32_SWAP swap a referenced memory location + */ +#define M_32_SWAP(a) { \ + u_int32_t _tmp; \ + _tmp = a; \ + ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[3]; \ + ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[2]; \ + ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[0]; \ +} +#define P_32_COPY(a, b) { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \ + ((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \ + ((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \ +} +#define P_32_SWAP(a) { \ + u_int32_t _tmp; \ + P_32_COPY(a, &_tmp); \ + ((u_int8_t *)a)[0] = ((u_int8_t *)&_tmp)[3]; \ + ((u_int8_t *)a)[1] = ((u_int8_t *)&_tmp)[2]; \ + ((u_int8_t *)a)[2] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)a)[3] = ((u_int8_t *)&_tmp)[0]; \ +} + +/* + * Little endian <==> big endian 16-bit swap macros. 
+ * M_16_SWAP swap a memory location + * P_16_COPY copy potentially unaligned from one location to another + * P_16_SWAP swap a referenced memory location + */ +#define M_16_SWAP(a) { \ + u_int16_t _tmp; \ + _tmp = (u_int16_t)a; \ + ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[0]; \ +} +#define P_16_COPY(a, b) { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \ +} +#define P_16_SWAP(a) { \ + u_int16_t _tmp; \ + P_16_COPY(a, &_tmp); \ + ((u_int8_t *)a)[0] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)a)[1] = ((u_int8_t *)&_tmp)[0]; \ +} + +#define SWAP32(p) { \ + P_32_SWAP(p); \ + (p) += sizeof(u_int32_t); \ +} +#define SWAP16(p) { \ + P_16_SWAP(p); \ + (p) += sizeof(u_int16_t); \ +} +#endif /* !_DB_SWAP_H_ */ diff --git a/db2/include/hash.h b/db2/include/hash.h new file mode 100644 index 0000000000..cb8ea350f5 --- /dev/null +++ b/db2/include/hash.h @@ -0,0 +1,211 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hash.h 10.6 (Sleepycat) 8/18/97 + */ + +/* Cursor structure definitions. */ +typedef struct cursor_t { + DBC *db_cursor; + db_pgno_t bucket; /* Bucket we are traversing. */ + DB_LOCK lock; /* Lock held on the current bucket. */ + PAGE *pagep; /* The current page. */ + db_pgno_t pgno; /* Current page number. */ + db_indx_t bndx; /* Index within the current page. */ + PAGE *dpagep; /* Duplicate page pointer. */ + db_pgno_t dpgno; /* Duplicate page number. */ + db_indx_t dndx; /* Index within a duplicate set. */ + db_indx_t dup_off; /* Offset within a duplicate set. */ + db_indx_t dup_len; /* Length of current duplicate. */ + db_indx_t dup_tlen; /* Total length of duplicate entry. */ + u_int32_t seek_size; /* Number of bytes we need for add. 
*/ + db_pgno_t seek_found_page;/* Page on which we can insert. */ + u_int32_t big_keylen; /* Length of big_key buffer. */ + void *big_key; /* Temporary buffer for big keys. */ + u_int32_t big_datalen; /* Length of big_data buffer. */ + void *big_data; /* Temporary buffer for big data. */ +#define H_OK 0x0001 +#define H_NOMORE 0x0002 +#define H_DELETED 0x0004 +#define H_ISDUP 0x0008 +#define H_EXPAND 0x0020 + u_int32_t flags; /* Is cursor inside a dup set. */ +} HASH_CURSOR; + +#define IS_VALID(C) ((C)->bucket != BUCKET_INVALID) + + +typedef struct htab { /* Memory resident data structure. */ + DB *dbp; /* Pointer to parent db structure. */ + DB_LOCK hlock; /* Metadata page lock. */ + HASHHDR *hdr; /* Pointer to meta-data page. */ + u_int32_t (*hash) __P((const void *, u_int32_t)); /* Hash Function */ + PAGE *split_buf; /* Temporary buffer for splits. */ + int local_errno; /* Error Number -- for DBM compatability */ + u_long hash_accesses; /* Number of accesses to this table. */ + u_long hash_collisions; /* Number of collisions on search. */ + u_long hash_expansions; /* Number of times we added a bucket. */ + u_long hash_overflows; /* Number of overflow pages. */ + u_long hash_bigpages; /* Number of big key/data pages. */ +} HTAB; + +/* + * Macro used for interface functions to set the txnid in the DBP. + */ +#define SET_LOCKER(D, T) ((D)->txn = (T)) + +/* + * More interface macros used to get/release the meta data page. + */ +#define GET_META(D, H) { \ + int _r; \ + if (F_ISSET(D, DB_AM_LOCKING) && !F_ISSET(D, DB_AM_RECOVER)) { \ + (D)->lock.pgno = BUCKET_INVALID; \ + if ((_r = lock_get((D)->dbenv->lk_info, \ + (D)->txn == NULL ? (D)->locker : (D)->txn->txnid, \ + 0, &(D)->lock_dbt, DB_LOCK_READ, \ + &(H)->hlock)) != 0) \ + return (_r < 0 ? 
EAGAIN : _r); \ + } \ + if ((_r = __ham_get_page(D, 0, (PAGE **)&((H)->hdr))) != 0) { \ + if ((H)->hlock) { \ + (void)lock_put((D)->dbenv->lk_info, (H)->hlock);\ + (H)->hlock = 0; \ + } \ + return (_r); \ + } \ +} + +#define RELEASE_META(D, H) { \ + if (!F_ISSET(D, DB_AM_RECOVER) && \ + (D)->txn == NULL && (H)->hlock) \ + (void)lock_put((H)->dbp->dbenv->lk_info, (H)->hlock); \ + (H)->hlock = 0; \ + if ((H)->hdr) \ + (void)__ham_put_page(D, (PAGE *)(H)->hdr, \ + F_ISSET(D, DB_HS_DIRTYMETA) ? 1 : 0); \ + (H)->hdr = NULL; \ + F_CLR(D, DB_HS_DIRTYMETA); \ +} + +#define DIRTY_META(H, R) { \ + if (F_ISSET((H)->dbp, DB_AM_LOCKING) && \ + !F_ISSET((H)->dbp, DB_AM_RECOVER)) { \ + DB_LOCK _tmp; \ + (H)->dbp->lock.pgno = BUCKET_INVALID; \ + if (((R) = lock_get((H)->dbp->dbenv->lk_info, \ + (H)->dbp->txn ? (H)->dbp->txn->txnid : \ + (H)->dbp->locker, 0, &(H)->dbp->lock_dbt, \ + DB_LOCK_WRITE, &_tmp)) == 0) \ + (R) = lock_put((H)->dbp->dbenv->lk_info, \ + (H)->hlock); \ + else if ((R) < 0) \ + (R) = EAGAIN; \ + (H)->hlock = _tmp; \ + } \ + F_SET((H)->dbp, DB_HS_DIRTYMETA); \ +} + +/* Allocate and discard thread structures. */ +#define H_GETHANDLE(dbp, dbpp, ret) \ + if (F_ISSET(dbp, DB_AM_THREAD)) \ + ret = __db_gethandle(dbp, __ham_hdup, dbpp); \ + else { \ + ret = 0; \ + *dbpp = dbp; \ + } + +#define H_PUTHANDLE(dbp) { \ + if (F_ISSET(dbp, DB_AM_THREAD)) \ + __db_puthandle(dbp); \ +} + +/* Test string. */ +#define CHARKEY "%$sniglet^&" + +/* Overflow management */ +/* + * Overflow page numbers are allocated per split point. At each doubling of + * the table, we can allocate extra pages. We keep track of how many pages + * we've allocated at each point to calculate bucket to page number mapping. + */ +#define BUCKET_TO_PAGE(H, B) \ + ((B) + 1 + ((B) ? (H)->hdr->spares[__db_log2((B)+1)-1] : 0)) + +#define PGNO_OF(H, S, O) (BUCKET_TO_PAGE((H), (1 << (S)) - 1) + (O)) + +/* Constraints about number of pages and how much data goes on a page. 
*/ + +#define MAX_PAGES(H) UINT32_T_MAX +#define MINFILL 0.25 +#define ISBIG(H, N) (((N) > ((H)->hdr->pagesize * MINFILL)) ? 1 : 0) + +/* Shorthands for accessing structure */ +#define NDX_INVALID 0xFFFF +#define BUCKET_INVALID 0xFFFFFFFF + +/* On page duplicates are stored as a string of size-data-size triples. */ +#define DUP_SIZE(len) ((len) + 2 * sizeof(db_indx_t)) + +/* Log messages types (these are subtypes within a record type) */ +#define PAIR_KEYMASK 0x1 +#define PAIR_DATAMASK 0x2 +#define PAIR_ISKEYBIG(N) (N & PAIR_KEYMASK) +#define PAIR_ISDATABIG(N) (N & PAIR_DATAMASK) +#define OPCODE_OF(N) (N & ~(PAIR_KEYMASK | PAIR_DATAMASK)) + +#define PUTPAIR 0x20 +#define DELPAIR 0x30 +#define PUTOVFL 0x40 +#define DELOVFL 0x50 +#define ALLOCPGNO 0x60 +#define DELPGNO 0x70 +#define SPLITOLD 0x80 +#define SPLITNEW 0x90 + +#include "hash_auto.h" +#include "hash_ext.h" +#include "db_am.h" +#include "common_ext.h" diff --git a/db2/include/hash_auto.h b/db2/include/hash_auto.h new file mode 100644 index 0000000000..5ff1229115 --- /dev/null +++ b/db2/include/hash_auto.h @@ -0,0 +1,114 @@ +/* Do not edit: automatically built by dist/db_gen.sh. 
*/
+#ifndef ham_AUTO_H
+#define ham_AUTO_H
+
+#define	DB_ham_insdel	(DB_ham_BEGIN + 1)
+/* Arguments of a DB_ham_insdel record (see __ham_insdel_log/__ham_insdel_read). */
+typedef struct _ham_insdel_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	opcode;
+	u_int32_t	fileid;
+	db_pgno_t	pgno;
+	u_int32_t	ndx;
+	DB_LSN 	pagelsn;
+	DBT	key;
+	DBT	data;
+} __ham_insdel_args;
+
+
+#define	DB_ham_newpage	(DB_ham_BEGIN + 2)
+/* Arguments of a DB_ham_newpage record (see __ham_newpage_log/__ham_newpage_read). */
+typedef struct _ham_newpage_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	opcode;
+	u_int32_t	fileid;
+	db_pgno_t	prev_pgno;
+	DB_LSN 	prevlsn;
+	db_pgno_t	new_pgno;
+	DB_LSN 	pagelsn;
+	db_pgno_t	next_pgno;
+	DB_LSN 	nextlsn;
+} __ham_newpage_args;
+
+
+#define	DB_ham_splitmeta	(DB_ham_BEGIN + 3)
+/* Arguments of a DB_ham_splitmeta record (see __ham_splitmeta_log/__ham_splitmeta_read). */
+typedef struct _ham_splitmeta_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	fileid;
+	u_int32_t	bucket;
+	u_int32_t	ovflpoint;
+	u_int32_t	spares;
+	DB_LSN 	metalsn;
+} __ham_splitmeta_args;
+
+
+#define	DB_ham_splitdata	(DB_ham_BEGIN + 4)
+/* Arguments of a DB_ham_splitdata record (see __ham_splitdata_log/__ham_splitdata_read). */
+typedef struct _ham_splitdata_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	fileid;
+	u_int32_t	opcode;
+	db_pgno_t	pgno;
+	DBT	pageimage;
+	DB_LSN 	pagelsn;
+} __ham_splitdata_args;
+
+
+#define	DB_ham_replace	(DB_ham_BEGIN + 5)
+/* Arguments of a DB_ham_replace record (see __ham_replace_log/__ham_replace_read). */
+typedef struct _ham_replace_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	fileid;
+	db_pgno_t	pgno;
+	u_int32_t	ndx;
+	DB_LSN 	pagelsn;
+	int32_t	off;
+	DBT	olditem;
+	DBT	newitem;
+	u_int32_t	makedup;
+} __ham_replace_args;
+
+
+#define	DB_ham_newpgno	(DB_ham_BEGIN + 6)
+/* Arguments of a DB_ham_newpgno record (see __ham_newpgno_log/__ham_newpgno_read). */
+typedef struct _ham_newpgno_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	opcode;
+	u_int32_t	fileid;
+	db_pgno_t	pgno;
+	db_pgno_t	free_pgno;
+	u_int32_t	old_type;
+	db_pgno_t	old_pgno;
+	u_int32_t	new_type;
+	DB_LSN 	pagelsn;
+	DB_LSN 	metalsn;
+} __ham_newpgno_args;
+
+
+#define	DB_ham_ovfl	(DB_ham_BEGIN + 7)
+/* Arguments of a DB_ham_ovfl record (see __ham_ovfl_log/__ham_ovfl_read). */
+typedef struct _ham_ovfl_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	fileid;
+	db_pgno_t	start_pgno;
+	u_int32_t	npages;
+	db_pgno_t	free_pgno;
+	DB_LSN 	metalsn;
+} __ham_ovfl_args;
+
+#endif
diff --git a/db2/include/hash_ext.h b/db2/include/hash_ext.h
new file mode 100644
index 0000000000..5ae63dc6ad
--- /dev/null
+++ b/db2/include/hash_ext.h
@@ -0,0 +1,120 @@
+/*
+ * Do not edit: automatically built by dist/distrib.
+ *
+ * Prototypes of the hash access method's internal ("__ham_") functions.
+ * Each record type declared in hash_auto.h (insdel, newpage, splitmeta,
+ * splitdata, replace, newpgno, ovfl) has a matching _log, _print, _read and
+ * _recover function here; the _print and _recover functions all share the
+ * signature (DB_LOG *, DBT *, DB_LSN *, int, void *).  __ham_pgin and
+ * __ham_pgout have the (db_pgno_t, void *, DBT *) signature of the
+ * pgin/pgout members of struct __db_mpreg in mp.h.  __ham_dump_bucket and
+ * bucket_to_page are declared only when DEBUG is defined.
+ */
+int __ham_open __P((DB *, DB_INFO *));
+int  __ham_close __P((DB *));
+int __ham_expand_table __P((HTAB *));
+u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t));
+int  __ham_init_dbt __P((DBT *, u_int32_t, void **, u_int32_t *));
+void __ham_c_update __P((HTAB *,
+   HASH_CURSOR *, db_pgno_t, u_int32_t, int, int));
+int  __ham_hdup __P((DB *, DB *));
+int __ham_insdel_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t, db_pgno_t, u_int32_t,
+    DB_LSN *, DBT *, DBT *));
+int __ham_insdel_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_insdel_read __P((void *, __ham_insdel_args **));
+int __ham_newpage_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t, db_pgno_t, DB_LSN *,
+    db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *));
+int __ham_newpage_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_newpage_read __P((void *, __ham_newpage_args **));
+int __ham_splitmeta_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+    DB_LSN *));
+int __ham_splitmeta_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_splitmeta_read __P((void *, __ham_splitmeta_args **));
+int __ham_splitdata_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t, db_pgno_t, DBT *,
+    DB_LSN *));
+int __ham_splitdata_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_splitdata_read __P((void *, __ham_splitdata_args **));
+int __ham_replace_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, db_pgno_t, u_int32_t, DB_LSN *,
+    int32_t, DBT *, DBT *, u_int32_t));
+int __ham_replace_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_replace_read __P((void *, __ham_replace_args **));
+int __ham_newpgno_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, u_int32_t, db_pgno_t, db_pgno_t,
+    u_int32_t, db_pgno_t, u_int32_t, DB_LSN *,
+    DB_LSN *));
+int __ham_newpgno_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_newpgno_read __P((void *, __ham_newpgno_args **));
+int __ham_ovfl_log
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t, db_pgno_t, u_int32_t, db_pgno_t,
+    DB_LSN *));
+int __ham_ovfl_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_ovfl_read __P((void *, __ham_ovfl_args **));
+int __ham_init_print __P((DB_ENV *));
+int __ham_init_recover __P((DB_ENV *));
+int __ham_pgin __P((db_pgno_t, void *, DBT *));
+int __ham_pgout __P((db_pgno_t, void *, DBT *));
+int __ham_mswap __P((void *));
+#ifdef DEBUG
+void __ham_dump_bucket __P((HTAB *, u_int32_t));
+#endif
+int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, int));
+void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t));
+u_int32_t __ham_func2 __P((const void *, u_int32_t));
+u_int32_t __ham_func3 __P((const void *, u_int32_t));
+u_int32_t __ham_func4 __P((const void *, u_int32_t));
+u_int32_t __ham_func5 __P((const void *, u_int32_t));
+int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_item_reset __P((HTAB *, HASH_CURSOR *));
+void __ham_item_init __P((HASH_CURSOR *));
+int __ham_item_done __P((HTAB *, HASH_CURSOR *, int));
+int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+void __ham_putitem __P((PAGE *p, const DBT *, int));
+int __ham_del_pair __P((HTAB *, HASH_CURSOR *));
+int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
+void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
+    int32_t,  DBT *));
+int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t));
+int __ham_add_el __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *,
+    int));
+void __ham_copy_item __P((HTAB *, PAGE *, int, PAGE *));
+int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
+int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
+int __ham_del_page __P((DB *, PAGE *));
+int __ham_put_page __P((DB *, PAGE *, int32_t));
+int __ham_dirty_page __P((HTAB *, PAGE *));
+int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
+int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
+#ifdef DEBUG
+int bucket_to_page __P((HTAB *, int));
+#endif
+void __ham_init_ovflpages __P((HTAB *));
+int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
+int __ham_next_cpage __P((HTAB *, HASH_CURSOR *, db_pgno_t,
+    int, int));
+void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+int __ham_insdel_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_newpage_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_replace_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_newpgno_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_splitmeta_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_splitdata_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_ovfl_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __ham_stat __P((DB *, FILE *));
diff --git a/db2/include/lock.h b/db2/include/lock.h
new file mode 100644
index 0000000000..18d29e8740
--- /dev/null
+++ b/db2/include/lock.h
@@ -0,0 +1,194 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ * + * @(#)lock.h 10.7 (Sleepycat) 7/29/97 + */ + +typedef struct __db_lockobj DB_LOCKOBJ; + +#define DB_DEFAULT_LOCK_FILE "__db_lock.share" +#define DB_LOCK_DEFAULT_N 5000 +#define DB_LOCK_MAXID 0x7fffffff + +/* + * The lock region consists of: + * The DB_LOCKREGION structure (sizeof(DB_LOCKREGION)). + * The conflict matrix of nmodes * nmodes bytes (nmodes * nmodes). + * The hash table for object lookup (hashsize * sizeof(DB_OBJ *)). + * The locks themselves (maxlocks * sizeof(struct __db_lock). + * The objects being locked (maxlocks * sizeof(DB_OBJ)). + * String space to represent the DBTs that are the objects being locked. + */ +struct __db_lockregion { + RLAYOUT hdr; /* Shared region header. */ + u_int32_t magic; /* lock magic number */ + u_int32_t version; /* version number */ + u_int32_t id; /* unique id generator */ + u_int32_t need_dd; /* flag for deadlock detector */ + u_int32_t detect; /* run dd on every conflict */ + SH_TAILQ_HEAD(lock_header) free_locks; /* free lock header */ + SH_TAILQ_HEAD(obj_header) free_objs; /* free obj header */ + u_int32_t maxlocks; /* maximum number of locks in table */ + u_int32_t table_size; /* size of hash table */ + u_int32_t nmodes; /* number of lock modes */ + u_int32_t numobjs; /* number of objects */ + u_int32_t nlockers; /* number of lockers */ + size_t increment; /* how much to grow region */ + size_t hash_off; /* offset of hash table */ + size_t mem_off; /* offset of memory region */ + size_t mem_bytes; /* number of bytes in memory region */ + u_int32_t nconflicts; /* number of lock conflicts */ + u_int32_t nrequests; /* number of lock gets */ + u_int32_t nreleases; /* number of lock puts */ + u_int32_t ndeadlocks; /* number of deadlocks */ +}; + +/* Macros to lock/unlock the region. */ +#define LOCK_LOCKREGION(lt) \ + (void)__db_mutex_lock(&(lt)->region->hdr.lock,(lt)->fd, \ + (lt)->dbenv == NULL ? 
NULL : (lt)->dbenv->db_yield) +#define UNLOCK_LOCKREGION(lt) \ + (void)__db_mutex_unlock(&(lt)->region->hdr.lock, (lt)->fd) + +/* + * Since we will be keeping DBTs in shared memory, we need the equivalent + * of a DBT that will work in shared memory. + */ +typedef struct __sh_dbt { + u_int32_t size; + ssize_t off; +} SH_DBT; + +#define SH_DBT_PTR(p) ((void *)(((u_int8_t *)(p)) + (p)->off)) + +/* + * The lock table is the per-process cookie returned from a lock_open call. + */ +struct __db_lockobj { + SH_DBT lockobj; /* Identifies object locked. */ + SH_TAILQ_ENTRY links; /* Links for free list. */ + union { + SH_TAILQ_HEAD(_wait) _waiters; /* List of waiting locks. */ + u_int32_t _dd_id; /* Deadlock detector id. */ + } wlinks; + union { + SH_LIST_HEAD(_held) _heldby; /* Locks held by this locker. */ + SH_TAILQ_HEAD(_hold) _holders; /* List of held locks. */ + } dlinks; +#define DB_LOCK_OBJTYPE 1 +#define DB_LOCK_LOCKER 2 + u_int8_t type; /* Real object or locker id. */ +}; + + +#define dd_id wlinks._dd_id +#define waiters wlinks._waiters +#define holders dlinks._holders +#define heldby dlinks._heldby + +struct __db_locktab { + DB_ENV *dbenv; /* Environment. */ + int fd; /* mapped file descriptor */ + DB_LOCKREGION *region; /* address of shared memory region */ + DB_HASHTAB *hashtab; /* Beginning of hash table. */ + size_t reg_size; /* last known size of lock region */ + void *mem; /* Beginning of string space. */ + u_int8_t *conflicts; /* Pointer to conflict matrix. */ +}; + +/* Test for conflicts. */ +#define CONFLICTS(T, HELD, WANTED) \ + T->conflicts[HELD * T->region->nmodes + WANTED] + +/* + * Status of a lock. + */ +typedef enum { + DB_LSTAT_ABORTED, /* Lock belongs to an aborted txn. */ + DB_LSTAT_ERR, /* Lock is bad. */ + DB_LSTAT_FREE, /* Lock is unallocated. */ + DB_LSTAT_HELD, /* Lock is currently held. */ + DB_LSTAT_NOGRANT, /* Lock was not granted. 
*/ + DB_LSTAT_PENDING, /* Lock was waiting and has been + * promoted; waiting for the owner + * to run and upgrade it to held. */ + DB_LSTAT_WAITING /* Lock is on the wait queue. */ +} db_status_t; + +/* + * Resources in the lock region. Used to indicate which resource + * is running low when we need to grow the region. + */ +typedef enum { + DB_LOCK_MEM, DB_LOCK_OBJ, DB_LOCK_LOCK +} db_resource_t; + +struct __db_lock { + /* + * Wait on mutex to wait on lock. You reference your own mutex with + * ID 0 and others reference your mutex with ID 1. + */ + db_mutex_t mutex; + + u_int32_t holder; /* Who holds this lock. */ + SH_TAILQ_ENTRY links; /* Free or holder/waiter list. */ + SH_LIST_ENTRY locker_links; /* List of locks held by a locker. */ + u_int32_t refcount; /* Reference count the lock. */ + db_lockmode_t mode; /* What sort of lock. */ + ssize_t obj; /* Relative offset of object struct. */ + db_status_t status; /* Status of this lock. */ +}; + +/* + * We cannot return pointers to the user (else we cannot easily grow regions), + * so we return offsets in the region. These must be converted to and from + * regular pointers. Always use the macros below. + */ +#define OFFSET_TO_LOCK(lt, off) \ + ((struct __db_lock *)((u_int8_t *)((lt)->region) + (off))) +#define LOCK_TO_OFFSET(lt, lock) \ + ((size_t)((u_int8_t *)(lock) - (u_int8_t *)lt->region)) +#define OFFSET_TO_OBJ(lt, off) \ + ((DB_LOCKOBJ *)((u_int8_t *)((lt)->region) + (off))) +#define OBJ_TO_OFFSET(lt, obj) \ + ((size_t)((u_int8_t *)(obj) - (u_int8_t *)lt->region)) + +/* + * The lock header contains the region structure and the conflict matrix. + * Aligned to a large boundary because we don't know what the underlying + * type of the hash table elements are. 
+ */ +#define LOCK_HASH_ALIGN 8 +#define LOCK_HEADER_SIZE(M) \ + ((size_t)(sizeof(DB_LOCKREGION) + ALIGN((M * M), LOCK_HASH_ALIGN))) + +/* + * For the full region, we need to add the locks, the objects, the hash table + * and the string space (which is 16 bytes per lock). + */ +#define STRING_SIZE(N) (16 * N) + +#define LOCK_REGION_SIZE(M, N, H) \ + (ALIGN(LOCK_HEADER_SIZE(M) + \ + (H) * sizeof(DB_HASHTAB), MUTEX_ALIGNMENT) + \ + (N) * ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT) + \ + ALIGN((N) * sizeof(DB_LOCKOBJ), sizeof(size_t)) + \ + ALIGN(STRING_SIZE(N), sizeof(size_t))) + +#ifdef DEBUG +#define LOCK_DEBUG_LOCKERS 0x0001 +#define LOCK_DEBUG_LOCK 0x0002 +#define LOCK_DEBUG_OBJ 0x0004 +#define LOCK_DEBUG_CONF 0x0008 +#define LOCK_DEBUG_MEM 0x0010 +#define LOCK_DEBUG_BUCKET 0x0020 +#define LOCK_DEBUG_OBJECTS 0x0040 +#define LOCK_DEBUG_ALL 0xFFFF + +#define LOCK_DEBUG_NOMUTEX 0x0100 +#endif + +#include "lock_ext.h" diff --git a/db2/include/lock_ext.h b/db2/include/lock_ext.h new file mode 100644 index 0000000000..59d5072bc4 --- /dev/null +++ b/db2/include/lock_ext.h @@ -0,0 +1,8 @@ +/* Do not edit: automatically built by dist/distrib. */ +int __lock_getobj __P((DB_LOCKTAB *, + u_int32_t, DBT *, u_int32_t type, DB_LOCKOBJ **)); +int __lock_cmp __P((DBT *, DB_LOCKOBJ *)); +int __lock_locker_cmp __P((u_int32_t, DB_LOCKOBJ *)); +int __lock_ohash __P((DBT *)); +u_int32_t __lock_locker_hash __P((u_int32_t)); +u_int32_t __lock_lhash __P((DB_LOCKOBJ *)); diff --git a/db2/include/log.h b/db2/include/log.h new file mode 100644 index 0000000000..970dfd153a --- /dev/null +++ b/db2/include/log.h @@ -0,0 +1,157 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ *
+ *	@(#)log.h	10.8 (Sleepycat) 8/18/97
+ */
+
+#ifndef _LOG_H_
+#define	_LOG_H_
+
+struct __fname;		typedef struct __fname FNAME;
+struct __hdr;		typedef struct __hdr HDR;
+struct __log;		typedef struct __log LOG;
+struct __log_persist;	typedef struct __log_persist LOGP;
+
+#define	MAXLFNAME	99999		/* Maximum log file name. */
+#define	LFNAME		"log.%05d"	/* Log file name template. */
+
+					/* Default log name. */
+#define	DB_DEFAULT_LOG_FILE	"__db_log.share"
+
+#define	DEFAULT_MAX	(10 * 1048576)	/* 10 Mb. */
+
+/*
+ * Macros to return per-process address, offsets.
+ *
+ * ADDR adds `offset' bytes to base->addr; OFFSET is the inverse.  The same
+ * two macros are defined with identical text in mp.h.
+ * NOTE(review): `offset' is not parenthesized in ADDR, so pass only simple
+ * expressions; any change must be made to both definitions together.
+ */
+#define	ADDR(base, offset)	((void *)((u_int8_t *)((base)->addr) + offset))
+#define	OFFSET(base, p)		((u_int8_t *)(p) - (u_int8_t *)(base)->addr)
+
+/*
+ * Macros to lock/unlock the region and threads.
+ *
+ * The *_LOGTHREAD macros act only when DB_AM_THREAD is set on the handle
+ * and use the per-handle mutex; the *_LOGREGION macros always lock the
+ * mutex at the start of the shared region (dblp->lp viewed as RLAYOUT).
+ * The *_LOGTHREAD macros expand to a bare `if' statement, so use them only
+ * as complete statements and never directly before an `else'.
+ * NOTE(review): UNLOCK_LOGTHREAD ends in a semicolon while LOCK_LOGTHREAD
+ * does not -- confirm against callers before normalizing.
+ */
+#define	LOCK_LOGTHREAD(dblp)						\
+	if (F_ISSET(dblp, DB_AM_THREAD))				\
+		(void)__db_mutex_lock(&(dblp)->mutex, -1,		\
+		    (dblp)->dbenv == NULL ? NULL : (dblp)->dbenv->db_yield)
+#define	UNLOCK_LOGTHREAD(dblp)						\
+	if (F_ISSET(dblp, DB_AM_THREAD))				\
+		(void)__db_mutex_unlock(&(dblp)->mutex, -1);
+#define	LOCK_LOGREGION(dblp)						\
+	(void)__db_mutex_lock(&((RLAYOUT *)(dblp)->lp)->lock,		\
+	    (dblp)->fd, (dblp)->dbenv == NULL ? NULL : (dblp)->dbenv->db_yield)
+#define	UNLOCK_LOGREGION(dblp)						\
+	(void)__db_mutex_unlock(&((RLAYOUT *)(dblp)->lp)->lock, (dblp)->fd)
+
+/*
+ * The per-process table that maps log file-id's to DB structures.
+ */
+typedef struct __db_entry {
+	DB	*dbp;			/* Associated DB structure. */
+	int	refcount;		/* Reference counted. */
+	int	deleted;		/* File was not found during open. */
+} DB_ENTRY;
+
+/*
+ * DB_LOG
+ *	Per-process log structure.
+ */
+struct __db_log {
+/* These fields need to be protected for multi-threaded support. */
+	db_mutex_t	mutex;		/* Mutex for thread protection. */
+
+	DB_ENTRY *dbentry;		/* Recovery file-id mapping. */
+#define	DB_GROW_SIZE	64
+	u_int32_t dbentry_cnt;		/* Entries.  Grows by DB_GROW_SIZE. */
+
+/*
+ * These fields are always accessed while the region lock is held, so they do
+ * not have to be protected by the thread lock as well OR, they are only used
+ * when threads are not being used, i.e. most cursor operations are disallowed
+ * on threaded logs.
+ */
+	u_int32_t lfname;		/* Log file "name". */
+	int	  lfd;			/* Log file descriptor. */
+
+	DB_LSN	  c_lsn;		/* Cursor: current LSN. */
+	DBT	  c_dbt;		/* Cursor: return DBT structure. */
+	int	  c_fd;			/* Cursor: file descriptor. */
+	u_int32_t c_off;		/* Cursor: previous record offset. */
+	u_int32_t c_len;		/* Cursor: current record length. */
+
+/* These fields are not protected. */
+	LOG	 *lp;			/* Address of the shared LOG. */
+
+	DB_ENV	 *dbenv;		/* Reference to error information. */
+
+	void	 *maddr;		/* Address of mmap'd region. */
+	void	 *addr;			/* Address of shalloc() region. */
+	int	  fd;			/* Region file descriptor. */
+
+	u_int32_t flags;		/* Support the DB_AM_XXX flags. */
+};
+
+/*
+ * HDR --
+ *	Log record header.
+ */
+struct __hdr {
+	u_int32_t prev;			/* Previous offset. */
+	u_int32_t cksum;		/* Current checksum. */
+	u_int32_t len;			/* Current length. */
+};
+/* LOGP -- the "persistent information" member (persist) of struct __log. */
+struct __log_persist {
+	u_int32_t magic;		/* DB_LOGMAGIC */
+	u_int32_t version;		/* DB_LOGVERSION */
+
+	u_int32_t lg_max;		/* Maximum file size. */
+	int	  mode;			/* Log file mode. */
+};
+
+/*
+ * LOG --
+ *	Shared log region.  One of these is allocated in shared memory,
+ *	and describes the log.
+ */
+struct __log {
+	RLAYOUT	  rlayout;		/* General region information. */
+
+	LOGP	  persist;		/* Persistent information. */
+
+	SH_TAILQ_HEAD(__fq) fq;		/* List of file names. */
+
+	DB_LSN	  lsn;			/* LSN at current file offset. */
+	DB_LSN	  c_lsn;		/* LSN of the last checkpoint. */
+	DB_LSN	  s_lsn;		/* LSN of the last sync. */
+	DB_LSN	  span_lsn;		/* LSN spanning buffer write. */
+
+	u_int32_t len;			/* Length of the last record. */
+
+	size_t	  b_off;		/* Current offset in the buffer. */
+	u_int32_t w_off;		/* Current write offset in the file. */
+
+	time_t	  chkpt;		/* Time of the last checkpoint. */
+	u_int32_t written;		/* Bytes written since checkpoint. */
+
+	u_int8_t buf[4 * 1024];		/* Log buffer. */
+};
+
+/*
+ * FNAME --
+ *	File name and id.
+ */
+struct __fname {
+	SH_TAILQ_ENTRY q;		/* File name queue. */
+
+	u_int16_t ref;			/* Reference count. */
+
+	u_int32_t id;			/* Logging file id. */
+	DBTYPE	  s_type;		/* Saved DB type. */
+
+	u_int32_t fileid_off;		/* Unique file id offset. */
+
+	size_t	  name_off;		/* Name offset. */
+};
+
+#include "log_auto.h"
+#include "log_ext.h"
+#endif /* _LOG_H_ */
diff --git a/db2/include/log_auto.h b/db2/include/log_auto.h
new file mode 100644
index 0000000000..820aac6acf
--- /dev/null
+++ b/db2/include/log_auto.h
@@ -0,0 +1,27 @@
+/* Do not edit: automatically built by dist/db_gen.sh. */
+#ifndef log_AUTO_H
+#define log_AUTO_H
+
+#define	DB_log_register	(DB_log_BEGIN + 1)
+/* Arguments of a DB_log_register record (see __log_register_log in log_ext.h). */
+typedef struct _log_register_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	DBT	name;
+	DBT	uid;
+	u_int32_t	id;
+	DBTYPE	ftype;
+} __log_register_args;
+
+
+#define	DB_log_unregister	(DB_log_BEGIN + 2)
+/* Arguments of a DB_log_unregister record (see __log_unregister_log in log_ext.h). */
+typedef struct _log_unregister_args {
+	u_int32_t type;
+	DB_TXN *txnid;
+	DB_LSN prev_lsn;
+	u_int32_t	id;
+} __log_unregister_args;
+
+#endif
diff --git a/db2/include/log_ext.h b/db2/include/log_ext.h
new file mode 100644
index 0000000000..d5c9dd6e72
--- /dev/null
+++ b/db2/include/log_ext.h
@@ -0,0 +1,29 @@
+/* Do not edit: automatically built by dist/distrib.
*/
+int __log_find __P((DB_ENV *, LOG *, int *));
+int __log_valid __P((DB_ENV *, LOG *, int));
+int __log_register_log	/* _log/_print/_read for the register record */
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    DBT *, DBT *, u_int32_t, DBTYPE));
+int __log_register_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __log_register_read __P((void *, __log_register_args **));
+int __log_unregister_log	/* _log/_print/_read for the unregister record */
+    __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t,
+    u_int32_t));
+int __log_unregister_print
+   __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __log_unregister_read __P((void *, __log_unregister_args **));
+int __log_init_print __P((DB_ENV *));
+int __log_init_recover __P((DB_ENV *));
+int __log_findckp __P((DB_LOG *, DB_LSN *));
+int __log_get __P((DB_LOG *, DB_LSN *, DBT *, int, int));
+int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, int));
+int __log_name __P((DB_ENV *, int, char **));
+int __log_register_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __log_unregister_recover
+    __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
+int __log_add_logid __P((DB_LOG *, DB *, u_int32_t));
+int __db_fileid_to_db __P((DB_LOG *, DB **, u_int32_t));
+void __log_close_files __P((DB_LOG *));
+void __log_rem_logid __P((DB_LOG *, u_int32_t));
diff --git a/db2/include/mp.h b/db2/include/mp.h
new file mode 100644
index 0000000000..4872596f83
--- /dev/null
+++ b/db2/include/mp.h
@@ -0,0 +1,266 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ *	Sleepycat Software.  All rights reserved.
+ *
+ *	@(#)mp.h	10.14 (Sleepycat) 8/18/97
+ */
+/* Forward declarations and short type names for the structures defined later in mp.h. */
+struct __bh;		typedef struct __bh BH;
+struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
+struct __mpool;		typedef struct __mpool MPOOL;
+struct __mpoolfile;	typedef struct __mpoolfile MPOOLFILE;
+
+					/* Default mpool name. */
+#define	DB_DEFAULT_MPOOL_FILE	"__db_mpool.share"
+
+/*
+ * We default to 128K (16 8K pages) if the user doesn't specify, and
+ * require a minimum of 20K.
+ */
+#define	DB_CACHESIZE_DEF	(128 * 1024)
+#define	DB_CACHESIZE_MIN	( 20 * 1024)
+
+/*
+ * Macro to return per-process address, offsets.
+ *
+ * ADDR adds `offset' bytes to base->addr; OFFSET is the inverse.  The same
+ * two macros are defined with identical text in log.h.
+ * NOTE(review): `offset' is not parenthesized in ADDR, so pass only simple
+ * expressions; any change must be made to both definitions together.
+ */
+#define	ADDR(base, offset)	((void *)((u_int8_t *)((base)->addr) + offset))
+#define	OFFSET(base, p)		((u_int8_t *)(p) - (u_int8_t *)(base)->addr)
+
+#define	INVALID		0		/* Invalid shared memory offset. */
+#define	TEMPORARY	"<tmp>"		/* Temporary file name. */
+
+/*
+ * There are two kinds of locks in the mpool code.  The first is the region
+ * lock, used to serialize modifications to all data structures.  The second
+ * is a per-buffer header lock.  The locking order is as follows:
+ *
+ * Process searching for a buffer:
+ *	Acquire the region lock.
+ *	Find the buffer header.
+ *	Increment the reference count (guarantee the buffer stays).
+ *	If the BH_LOCKED flag is set:
+ *		Release the region lock.
+ *		Acquire the buffer lock.
+ *		Release the buffer lock.
+ *		Acquire the region lock.
+ *	Return the buffer.
+ *
+ * Process reading/writing a buffer:
+ *	Acquire the region lock.
+ *	Find/create the buffer header.
+ *	If reading, increment the reference count (guarantee the buffer stays).
+ *	Set the BH_LOCKED flag.
+ *	Acquire the buffer lock (guaranteed not to block).
+ *	Release the region lock.
+ *	Do the I/O and/or initialize buffer contents.
+ *	Acquire the region lock.
+ *	Clear the BH_LOCKED flag.
+ *	Release the region lock.
+ *	Release the buffer lock.
+ *	If reading, return the buffer.
+ *
+ * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not
+ * reacquired when a region lock is reacquired because they couldn't have been
+ * closed/discarded and because they never move in memory.
+ *
+ * Each macro below expands to a bare `if' statement that tests a flag on
+ * the DB_MPOOL handle and discards the mutex call's return value, so use
+ * them only as complete statements and never directly before an `else'.
+ * LOCKBUFFER/UNLOCKBUFFER are conditional on MP_LOCKREGION, the same flag
+ * that controls LOCKREGION/UNLOCKREGION.
+ * NOTE(review): LOCKINIT passes (dbmp)->fd as the second argument of
+ * __db_mutex_init(), which mutex_ext.h declares as an off_t -- confirm that
+ * a file descriptor is the intended value.
+ */
+#define	LOCKINIT(dbmp, mutexp)						\
+	if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION))		\
+		(void)__db_mutex_init(mutexp, (dbmp)->fd)
+
+#define	LOCKHANDLE(dbmp, mutexp)					\
+	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
+		(void)__db_mutex_lock(mutexp, (dbmp)->fd,		\
+		    (dbmp)->dbenv == NULL ? NULL : (dbmp)->dbenv->db_yield)
+#define	UNLOCKHANDLE(dbmp, mutexp)					\
+	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
+		(void)__db_mutex_unlock(mutexp, (dbmp)->fd)
+
+#define	LOCKREGION(dbmp)						\
+	if (F_ISSET(dbmp, MP_LOCKREGION))				\
+		(void)__db_mutex_lock(&((RLAYOUT *)(dbmp)->mp)->lock,	\
+		    (dbmp)->fd,						\
+		    (dbmp)->dbenv == NULL ? NULL : (dbmp)->dbenv->db_yield)
+#define	UNLOCKREGION(dbmp)						\
+	if (F_ISSET(dbmp, MP_LOCKREGION))				\
+		(void)__db_mutex_unlock(&((RLAYOUT *)(dbmp)->mp)->lock,	\
+		(dbmp)->fd)
+
+#define	LOCKBUFFER(dbmp, bhp)						\
+	if (F_ISSET(dbmp, MP_LOCKREGION))				\
+		(void)__db_mutex_lock(&(bhp)->mutex, (dbmp)->fd,	\
+		    (dbmp)->dbenv == NULL ? NULL : (dbmp)->dbenv->db_yield)
+#define	UNLOCKBUFFER(dbmp, bhp)						\
+	if (F_ISSET(dbmp, MP_LOCKREGION))				\
+		(void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->fd)
+
+/*
+ * DB_MPOOL --
+ *	Per-process memory pool structure.
+ */
+struct __db_mpool {
+/* These fields need to be protected for multi-threaded support. */
+	db_mutex_t	mutex;		/* Structure lock. */
+
+					/* List of pgin/pgout routines. */
+	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
+
+					/* List of DB_MPOOLFILE's. */
+	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
+
+/* These fields are not protected. */
+	DB_ENV     *dbenv;		/* Reference to error information. */
+
+	MPOOL	   *mp;			/* Address of the shared MPOOL. */
+
+	void	   *maddr;		/* Address of mmap'd region. */
+	void	   *addr;		/* Address of shalloc() region. */
+
+	DB_HASHTAB *htab;		/* Hash table of bucket headers. */
+
+	int	    fd;			/* Underlying mmap'd fd. */
+
+
+#define	MP_ISPRIVATE	0x01		/* Private, so local memory. */
+#define	MP_LOCKHANDLE	0x02		/* Threaded, lock handles and region. */
+#define	MP_LOCKREGION	0x04		/* Concurrent access, lock region. */
+	u_int32_t  flags;
+};
+
+/*
+ * DB_MPREG --
+ *	DB_MPOOL registry of pgin/pgout functions.
+ */
+struct __db_mpreg {
+	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */
+
+	int ftype;			/* File type. */
+					/* Pgin, pgout routines. */
+	int (*pgin) __P((db_pgno_t, void *, DBT *));
+	int (*pgout) __P((db_pgno_t, void *, DBT *));
+};
+
+/*
+ * DB_MPOOLFILE --
+ *	Per-process DB_MPOOLFILE information.
+ */
+struct __db_mpoolfile {
+/* These fields need to be protected for multi-threaded support. */
+	db_mutex_t	mutex;		/* Structure lock. */
+
+	int	   fd;			/* Underlying file descriptor. */
+
+	u_int32_t pinref;		/* Pinned block reference count. */
+
+/* These fields are not protected. */
+	TAILQ_ENTRY(__db_mpoolfile) q;	/* Linked list of DB_MPOOLFILE's. */
+
+	char	  *path;		/* Initial file path. */
+	DB_MPOOL  *dbmp;		/* Overlying DB_MPOOL. */
+	MPOOLFILE *mfp;			/* Underlying MPOOLFILE. */
+
+	void	  *addr;		/* Address of mmap'd region. */
+	size_t	   len;			/* Length of mmap'd region. */
+
+#define	MP_PATH_ALLOC	0x01		/* Path is allocated memory. */
+#define	MP_PATH_TEMP	0x02		/* Backing file is a temporary. */
+#define	MP_READONLY	0x04		/* File is readonly. */
+	u_int32_t  flags;
+};
+
+/*
+ * MPOOL --
+ *	Shared memory pool region.  One of these is allocated in shared
+ *	memory, and describes the pool.
+ */
+struct __mpool {
+	RLAYOUT	    rlayout;		/* General region information. */
+
+	SH_TAILQ_HEAD(__bhq) bhq;	/* LRU list of buckets. */
+	SH_TAILQ_HEAD(__bhfq) bhfq;	/* Free buckets. */
+	SH_TAILQ_HEAD(__mpfq) mpfq;	/* List of MPOOLFILEs. */
+
+	/*
+	 * We make the assumption that the early pages of the file are far
+	 * more likely to be retrieved than the later pages, which means
+	 * that the top bits are more interesting for hashing since they're
+	 * less likely to collide.  On the other hand, since 512 4K pages
+	 * represents a 2MB file, only the bottom 9 bits of the page number
+	 * are likely to be set.  We XOR in the offset in the MPOOL of the
+	 * MPOOLFILE that backs this particular page, since that should also
+	 * be unique for the page.
+	 *
+	 * BUCKET yields a hash table index in [0, htab_buckets).
+	 */
+#define	BUCKET(mp, mf_offset, pgno)					\
+	(((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets)
+
+	size_t	    htab;		/* Hash table offset. */
+	size_t	    htab_buckets;	/* Number of hash table entries. */
+
+	DB_LSN	    lsn;		/* Maximum checkpoint LSN. */
+	int	    lsn_cnt;		/* Checkpoint buffers left to write. */
+
+	DB_MPOOL_STAT stat;		/* Global mpool statistics. */
+
+#define	MP_LSN_RETRY	0x01		/* Retry all BH_WRITE buffers. */
+	u_int32_t  flags;
+};
+
+/*
+ * MPOOLFILE --
+ *	Shared DB_MPOOLFILE information.
+ */
+struct __mpoolfile {
+	SH_TAILQ_ENTRY  q;		/* List of MPOOLFILEs */
+
+	u_int32_t ref;			/* Reference count. */
+
+	int	  ftype;		/* File type. */
+	int	  can_mmap;		/* If the file can be mmap'd. */
+
+	int	  lsn_off;		/* Page's LSN offset. */
+
+	size_t	  path_off;		/* File name location. */
+
+	size_t	  fileid_off;		/* File identification location. */
+
+	size_t	  pgcookie_len;		/* Pgin/pgout cookie length. */
+	size_t	  pgcookie_off;		/* Pgin/pgout cookie location. */
+
+	int	  lsn_cnt;		/* Checkpoint buffers left to write. */
+
+	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */
+};
+
+/*
+ * BH --
+ *	Buffer header.
+ */
+struct __bh {
+	db_mutex_t	mutex;		/* Structure lock. */
+
+	u_int16_t	ref;		/* Reference count. */
+
+#define	BH_CALLPGIN	0x001		/* Page needs to be reworked... */
+#define	BH_DIRTY	0x002		/* Page was modified. */
+#define	BH_DISCARD	0x004		/* Page is useless. */
+#define	BH_LOCKED	0x008		/* Page is locked (I/O in progress). */
+#define	BH_TRASH	0x010		/* Page is garbage. */
+#define	BH_WRITE	0x020		/* Page scheduled for writing. */
+	u_int16_t  flags;
+
+	SH_TAILQ_ENTRY	q;		/* LRU list of bucket headers. */
+	SH_TAILQ_ENTRY	mq;		/* MPOOLFILE list of bucket headers. */
+
+	db_pgno_t pgno;			/* Underlying MPOOLFILE page number. */
+	size_t	  mf_offset;		/* Associated MPOOLFILE offset. */
+
+	/*
+	 * !!!
+	 * This array must be size_t aligned -- the DB access methods put PAGE
+	 * and other structures into it, and expect to be able to access them
+	 * directly.  (We guarantee size_t alignment in the db_mpool(3) manual
+	 * page as well.)
+	 *
+	 * buf is declared with one element but holds variable-length data,
+	 * so it must remain the last member of the structure.
+	 */
+	u_int8_t   buf[1];		/* Variable length data. */
+};
+
+#include "mp_ext.h"
diff --git a/db2/include/mp_ext.h b/db2/include/mp_ext.h
new file mode 100644
index 0000000000..3934c130a8
--- /dev/null
+++ b/db2/include/mp_ext.h
@@ -0,0 +1,14 @@
+/*
+ * Do not edit: automatically built by dist/distrib.
+ *
+ * Prototypes of the memory pool's internal ("__memp_") functions.
+ */
+int __memp_bhwrite
+    __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
+int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
+void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
+int __memp_fopen __P((DB_MPOOL *, const char *, int, int,
+   int, size_t, int, DBT *, u_int8_t *, int, DB_MPOOLFILE **));
+void __memp_debug __P((DB_MPOOL *, FILE *, int));
+int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *));
+int __memp_ropen
+   __P((DB_MPOOL *, const char *, size_t, int, int));
+int __memp_rclose __P((DB_MPOOL *));
diff --git a/db2/include/mutex_ext.h b/db2/include/mutex_ext.h
new file mode 100644
index 0000000000..ff46b6a404
--- /dev/null
+++ b/db2/include/mutex_ext.h
@@ -0,0 +1,4 @@
+/*
+ * Do not edit: automatically built by dist/distrib.
+ *
+ * Mutex primitives called by the LOCK and UNLOCK macros in lock.h, log.h
+ * and mp.h.  The third argument of __db_mutex_lock is the value those
+ * macros take from dbenv->db_yield (NULL when there is no environment).
+ */
+void __db_mutex_init __P((db_mutex_t *, off_t));
+int __db_mutex_lock __P((db_mutex_t *, int, int (*)(void)));
+int __db_mutex_unlock __P((db_mutex_t *, int));
diff --git a/db2/include/os_ext.h b/db2/include/os_ext.h
new file mode 100644
index 0000000000..59d72acf12
--- /dev/null
+++ b/db2/include/os_ext.h
@@ -0,0 +1,19 @@
+/* Do not edit: automatically built by dist/distrib.
*/ +int __db_abspath __P((const char *)); +char *__db_rpath __P((const char *)); +int __db_dir __P((DB_ENV *, const char *, char ***, int *)); +void __db_dirf __P((DB_ENV *, char **, int)); +int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); +int __db_lseek __P((int, size_t, db_pgno_t, u_long, int)); +int __db_mmap __P((int, size_t, int, int, void *)); +int __db_munmap __P((void *, size_t)); +int __db_oflags __P((int)); +int __db_fdopen __P((const char *, int, int, int, int *)); +int __db_fsync __P((int)); +int __db_close __P((int)); +int __db_read __P((int, void *, size_t, ssize_t *)); +int __db_write __P((int, void *, size_t, ssize_t *)); +int __db_sleep __P((u_long, u_long)); +int __db_exists __P((const char *, int *)); +int __db_stat __P((DB_ENV *, const char *, int, off_t *, off_t *)); +int __db_unlink __P((const char *)); diff --git a/db2/include/queue.h b/db2/include/queue.h new file mode 100644 index 0000000000..0909c86c60 --- /dev/null +++ b/db2/include/queue.h @@ -0,0 +1,275 @@ +/* BSDI $Id$ */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines three types of data structures: lists, tail queues, + * and circular queues. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. 
A tail queue may only be traversed in the forward direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. + * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * List definitions. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +#define LIST_FIRST(head) ((head)->lh_first) +#define LIST_NEXT(elm, field) ((elm)->field.le_next) +#define LIST_END(head) NULL + +/* + * List functions. 
+ */ +#define LIST_INIT(head) { \ + (head)->lh_first = NULL; \ +} + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if (((elm)->field.le_next = (listelm)->field.le_next) != NULL) \ + (listelm)->field.le_next->field.le_prev = \ + &(elm)->field.le_next; \ + (listelm)->field.le_next = (elm); \ + (elm)->field.le_prev = &(listelm)->field.le_next; \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + (elm)->field.le_next = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &(elm)->field.le_next; \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.le_next = (head)->lh_first) != NULL) \ + (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ + (head)->lh_first = (elm); \ + (elm)->field.le_prev = &(head)->lh_first; \ +} while (0) + +#define LIST_REMOVE(elm, field) do { \ + if ((elm)->field.le_next != NULL) \ + (elm)->field.le_next->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = (elm)->field.le_next; \ +} while (0) + +/* + * Tail queue definitions. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +#define TAILQ_FIRST(head) ((head)->tqh_first) +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) +#define TAILQ_END(head) NULL + +/* + * Tail queue functions. 
+ */ +#define TAILQ_INIT(head) do { \ + (head)->tqh_first = NULL; \ + (head)->tqh_last = &(head)->tqh_first; \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \ + (head)->tqh_first->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (head)->tqh_first = (elm); \ + (elm)->field.tqe_prev = &(head)->tqh_first; \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.tqe_next = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &(elm)->field.tqe_next; \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\ + (elm)->field.tqe_next->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (listelm)->field.tqe_next = (elm); \ + (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + (elm)->field.tqe_next = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \ +} while (0) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if (((elm)->field.tqe_next) != NULL) \ + (elm)->field.tqe_next->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = (elm)->field.tqe_next; \ +} while (0) + +/* + * Circular queue definitions. 
+ */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) +#define CIRCLEQ_LAST(head) ((head)->cqh_last) +#define CIRCLEQ_END(head) ((void *)(head)) +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +/* + * Circular queue functions. + */ +#define CIRCLEQ_INIT(head) do { \ + (head)->cqh_first = (void *)(head); \ + (head)->cqh_last = (void *)(head); \ +} while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm)->field.cqe_next; \ + (elm)->field.cqe_prev = (listelm); \ + if ((listelm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (listelm)->field.cqe_next->field.cqe_prev = (elm); \ + (listelm)->field.cqe_next = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm); \ + (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ + if ((listelm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (listelm)->field.cqe_prev->field.cqe_next = (elm); \ + (listelm)->field.cqe_prev = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.cqe_next = (head)->cqh_first; \ + (elm)->field.cqe_prev = (void *)(head); \ + if ((head)->cqh_last == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (head)->cqh_first->field.cqe_prev = (elm); \ + (head)->cqh_first = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.cqe_next = (void *)(head); \ + (elm)->field.cqe_prev = (head)->cqh_last; \ + if ((head)->cqh_first == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + 
(head)->cqh_last->field.cqe_next = (elm); \ + (head)->cqh_last = (elm); \ +} while (0) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if ((elm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm)->field.cqe_prev; \ + else \ + (elm)->field.cqe_next->field.cqe_prev = \ + (elm)->field.cqe_prev; \ + if ((elm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm)->field.cqe_next; \ + else \ + (elm)->field.cqe_prev->field.cqe_next = \ + (elm)->field.cqe_next; \ +} while (0) +#endif /* !_SYS_QUEUE_H_ */ diff --git a/db2/include/shqueue.h b/db2/include/shqueue.h new file mode 100644 index 0000000000..c3e2f4aecc --- /dev/null +++ b/db2/include/shqueue.h @@ -0,0 +1,361 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)shqueue.h 8.11 (Sleepycat) 7/27/97 + */ + +#ifndef _SYS_SHQUEUE_H_ +#define _SYS_SHQUEUE_H_ + +/* + * This file defines three types of data structures: lists, tail queues, and + * circular queues, similarly to the include file <sys/queue.h>. + * + * The difference is that this set of macros can be used for structures that + * reside in shared memory that may be mapped at different addresses in each + * process. In most cases, the macros for shared structures exactly mirror + * the normal macros, although the macro calls require an additional type + * parameter, only used by the HEAD and ENTRY macros of the standard macros. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * Shared list definitions. + */ +#define SH_LIST_HEAD(name) \ +struct name { \ + ssize_t slh_first; /* first element */ \ +} + +#define SH_LIST_ENTRY \ +struct { \ + ssize_t sle_next; /* relative offset next element */ \ + ssize_t sle_prev; /* relative offset of prev element */ \ +} + +/* + * Shared list functions. Since we use relative offsets for pointers, + * 0 is a valid offset. Therefore, we use -1 to indicate end of list. 
+ * The macros ending in "P" return pointers without checking for end + * of list, the others check for end of list and evaluate to either a + * pointer or NULL. + */ + +#define SH_LIST_FIRSTP(head, type) \ + ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first)) + +#define SH_LIST_FIRST(head, type) \ + ((head)->slh_first == -1 ? NULL : \ + ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first))) + +#define SH_LIST_NEXTP(elm, field, type) \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)) + +#define SH_LIST_NEXT(elm, field, type) \ + ((elm)->field.sle_next == -1 ? NULL : \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))) + +#define SH_LIST_PREV(elm, field) \ + ((ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.sle_prev)) + +#define SH_PTR_TO_OFF(src, dest) \ + ((ssize_t)(((u_int8_t *)(dest)) - ((u_int8_t *)(src)))) + +#define SH_LIST_END(head) NULL + +/* + * Take the element's next pointer and calculate what the corresponding + * Prev pointer should be -- basically it is the negation plus the offset + * of the next field in the structure. 
+ */ +#define SH_LIST_NEXT_TO_PREV(elm, field) \ + (-(elm)->field.sle_next + SH_PTR_TO_OFF(elm, &(elm)->field.sle_next)) + +#define SH_LIST_INIT(head) (head)->slh_first = -1 + +#define SH_LIST_INSERT_AFTER(listelm, elm, field, type) do { \ + if ((listelm)->field.sle_next != -1) { \ + (elm)->field.sle_next = SH_PTR_TO_OFF(elm, \ + SH_LIST_NEXTP(listelm, field, type)); \ + SH_LIST_NEXTP(listelm, field, type)->field.sle_prev = \ + SH_LIST_NEXT_TO_PREV(elm, field); \ + } else \ + (elm)->field.sle_next = -1; \ + (listelm)->field.sle_next = SH_PTR_TO_OFF(listelm, elm); \ + (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(listelm, field); \ +} while (0) + +#define SH_LIST_INSERT_HEAD(head, elm, field, type) do { \ + if ((head)->slh_first != -1) { \ + (elm)->field.sle_next = \ + (head)->slh_first - SH_PTR_TO_OFF(head, elm); \ + SH_LIST_FIRSTP(head, type)->field.sle_prev = \ + SH_LIST_NEXT_TO_PREV(elm, field); \ + } else \ + (elm)->field.sle_next = -1; \ + (head)->slh_first = SH_PTR_TO_OFF(head, elm); \ + (elm)->field.sle_prev = SH_PTR_TO_OFF(elm, &(head)->slh_first); \ +} while (0) + +#define SH_LIST_REMOVE(elm, field, type) do { \ + if ((elm)->field.sle_next != -1) { \ + SH_LIST_NEXTP(elm, field, type)->field.sle_prev = \ + (elm)->field.sle_prev - (elm)->field.sle_next; \ + *SH_LIST_PREV(elm, field) += (elm)->field.sle_next; \ + } else \ + *SH_LIST_PREV(elm, field) = -1; \ +} while (0) + +/* + * Shared tail queue definitions. + */ +#define SH_TAILQ_HEAD(name) \ +struct name { \ + ssize_t stqh_first; /* relative offset of first element */ \ + ssize_t stqh_last; /* relative offset of last's next */ \ +} + +#define SH_TAILQ_ENTRY \ +struct { \ + ssize_t stqe_next; /* relative offset of next element */ \ + ssize_t stqe_prev; /* relative offset of prev's next */ \ +} + +/* + * Shared tail queue functions. + */ +#define SH_TAILQ_FIRSTP(head, type) \ + ((struct type *)((u_int8_t *)(head) + (head)->stqh_first)) + +#define SH_TAILQ_FIRST(head, type) \ + ((head)->stqh_first == -1 ? 
NULL : SH_TAILQ_FIRSTP(head, type)) + +#define SH_TAILQ_NEXTP(elm, field, type) \ + ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next)) + +#define SH_TAILQ_NEXT(elm, field, type) \ + ((elm)->field.stqe_next == -1 ? NULL : SH_TAILQ_NEXTP(elm, field, type)) + +#define SH_TAILQ_PREVP(elm, field) \ + ((ssize_t *)((u_int8_t *)(elm) + (elm)->field.stqe_prev)) + +#define SH_TAILQ_LAST(head) \ + ((ssize_t *)(((u_int8_t *)(head)) + (head)->stqh_last)) + +#define SH_TAILQ_NEXT_TO_PREV(elm, field) \ + (-(elm)->field.stqe_next + SH_PTR_TO_OFF(elm, &(elm)->field.stqe_next)) + +#define SH_TAILQ_END(head) NULL + +#define SH_TAILQ_INIT(head) { \ + (head)->stqh_first = -1; \ + (head)->stqh_last = SH_PTR_TO_OFF(head, &(head)->stqh_first); \ +} + +#define SH_TAILQ_INSERT_HEAD(head, elm, field, type) do { \ + if ((head)->stqh_first != -1) { \ + (elm)->field.stqe_next = \ + (head)->stqh_first - SH_PTR_TO_OFF(head, elm); \ + SH_TAILQ_FIRSTP(head, type)->field.stqe_prev = \ + SH_TAILQ_NEXT_TO_PREV(elm, field); \ + } else { \ + (elm)->field.stqe_next = -1; \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \ + } \ + (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \ + (elm)->field.stqe_prev = \ + SH_PTR_TO_OFF(elm, &(head)->stqh_first); \ +} while (0) + +#define SH_TAILQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.stqe_next = -1; \ + (elm)->field.stqe_prev = \ + -SH_PTR_TO_OFF(head, elm) + (head)->stqh_last; \ + if ((head)->stqh_last == \ + SH_PTR_TO_OFF((head), &(head)->stqh_first)) \ + (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \ + else \ + *SH_TAILQ_LAST(head) = -(head)->stqh_last + \ + SH_PTR_TO_OFF((elm), &(elm)->field.stqe_next) + \ + SH_PTR_TO_OFF(head, elm); \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &((elm)->field.stqe_next)); \ +} while (0) + +#define SH_TAILQ_INSERT_AFTER(head, listelm, elm, field, type) do { \ + if ((listelm)->field.stqe_next != -1) { \ + (elm)->field.stqe_next = (listelm)->field.stqe_next - \ + 
SH_PTR_TO_OFF(listelm, elm); \ + SH_TAILQ_NEXTP(listelm, field, type)->field.stqe_prev = \ + SH_TAILQ_NEXT_TO_PREV(elm, field); \ + } else { \ + (elm)->field.stqe_next = -1; \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \ + } \ + (listelm)->field.stqe_next = SH_PTR_TO_OFF(listelm, elm); \ + (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV(listelm, field); \ +} while (0) + +/* + * SH_TAILQ_REMOVE -- + * Unlink elm from the shared tail queue. If elm has a successor, the + * successor's prev offset and the preceding next slot are rebased past + * elm; otherwise the head's last offset is moved back to the preceding + * next slot, which is then set to -1 (end of list). + */ +#define SH_TAILQ_REMOVE(head, elm, field, type) do { \ + if ((elm)->field.stqe_next != -1) { \ + SH_TAILQ_NEXTP(elm, field, type)->field.stqe_prev = \ + (elm)->field.stqe_prev + \ + SH_PTR_TO_OFF(SH_TAILQ_NEXTP(elm, \ + field, type), elm); \ + *SH_TAILQ_PREVP(elm, field) += (elm)->field.stqe_next; \ + } else { \ + (head)->stqh_last = (elm)->field.stqe_prev + \ + SH_PTR_TO_OFF(head, elm); \ + *SH_TAILQ_PREVP(elm, field) = -1; \ + } \ +} while (0) + +/* + * Shared circular queue definitions. + */ +#define SH_CIRCLEQ_HEAD(name) \ +struct name { \ + ssize_t scqh_first; /* first element */ \ + ssize_t scqh_last; /* last element */ \ +} + +#define SH_CIRCLEQ_ENTRY \ +struct { \ + ssize_t scqe_next; /* next element */ \ + ssize_t scqe_prev; /* previous element */ \ +} + +/* + * Shared circular queue functions. SH_CIRCLEQ_FIRST and SH_CIRCLEQ_LAST + * evaluate to (void *)(head) when the stored offset is -1. + */ +#define SH_CIRCLEQ_FIRSTP(head, type) \ + ((struct type *)(((u_int8_t *)(head)) + (head)->scqh_first)) + +#define SH_CIRCLEQ_FIRST(head, type) \ + ((head)->scqh_first == -1 ? \ + (void *)(head) : SH_CIRCLEQ_FIRSTP(head, type)) + +#define SH_CIRCLEQ_LASTP(head, type) \ + ((struct type *)(((u_int8_t *)(head)) + (head)->scqh_last)) + +#define SH_CIRCLEQ_LAST(head, type) \ + ((head)->scqh_last == -1 ? (void *)(head) : SH_CIRCLEQ_LASTP(head, type)) + +#define SH_CIRCLEQ_NEXTP(elm, field, type) \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.scqe_next)) + +#define SH_CIRCLEQ_NEXT(head, elm, field, type) \ + ((elm)->field.scqe_next == SH_PTR_TO_OFF(elm, head) ? 
\ + (void *)(head) : SH_CIRCLEQ_NEXTP(elm, field, type)) + +#define SH_CIRCLEQ_PREVP(elm, field, type) \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.scqe_prev)) + +#define SH_CIRCLEQ_PREV(head, elm, field, type) \ + ((elm)->field.scqe_prev == SH_PTR_TO_OFF(elm, head) ? \ + (void *)(head) : SH_CIRCLEQ_PREVP(elm, field, type)) + +#define SH_CIRCLEQ_END(head) ((void *)(head)) + +#define SH_CIRCLEQ_INIT(head) { \ + (head)->scqh_first = 0; \ + (head)->scqh_last = 0; \ +} + +#define SH_CIRCLEQ_INSERT_AFTER(head, listelm, elm, field, type) do { \ + (elm)->field.scqe_prev = SH_PTR_TO_OFF(elm, listelm); \ + (elm)->field.scqe_next = (listelm)->field.scqe_next + \ + (elm)->field.scqe_prev; \ + if (SH_CIRCLEQ_NEXTP(listelm, field, type) == (void *)(head)) \ + (head)->scqh_last = SH_PTR_TO_OFF(head, elm); \ + else \ + SH_CIRCLEQ_NEXTP(listelm, \ + field, type)->field.scqe_prev = \ + SH_PTR_TO_OFF(SH_CIRCLEQ_NEXTP(listelm, \ + field, type), elm); \ + (listelm)->field.scqe_next = -(elm)->field.scqe_prev; \ +} while (0) + +/* + * SH_CIRCLEQ_INSERT_BEFORE -- + * Link elm in front of listelm. elm's prev offset is derived from + * listelm's own prev offset (elm-to-listelm plus listelm-to-predecessor), + * so it is also correct when listelm is the first element and its + * predecessor is the queue head, which has no entry field to read. + */ +#define SH_CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field, type) do { \ + (elm)->field.scqe_next = SH_PTR_TO_OFF(elm, listelm); \ + (elm)->field.scqe_prev = (elm)->field.scqe_next + \ + (listelm)->field.scqe_prev; \ + if (SH_CIRCLEQ_PREVP(listelm, field, type) == (void *)(head)) \ + (head)->scqh_first = SH_PTR_TO_OFF(head, elm); \ + else \ + SH_CIRCLEQ_PREVP(listelm, \ + field, type)->field.scqe_next = \ + SH_PTR_TO_OFF(SH_CIRCLEQ_PREVP(listelm, \ + field, type), elm); \ + (listelm)->field.scqe_prev = -(elm)->field.scqe_next; \ +} while (0) + +#define SH_CIRCLEQ_INSERT_HEAD(head, elm, field, type) do { \ + (elm)->field.scqe_prev = SH_PTR_TO_OFF(elm, head); \ + (elm)->field.scqe_next = (head)->scqh_first + \ + (elm)->field.scqe_prev; \ + if ((head)->scqh_last == 0) \ + (head)->scqh_last = -(elm)->field.scqe_prev; \ + else \ + SH_CIRCLEQ_FIRSTP(head, type)->field.scqe_prev = \ + SH_PTR_TO_OFF(SH_CIRCLEQ_FIRSTP(head, type), elm); \ 
+ (head)->scqh_first = -(elm)->field.scqe_prev; \ +} while (0) + +#define SH_CIRCLEQ_INSERT_TAIL(head, elm, field, type) do { \ + (elm)->field.scqe_next = SH_PTR_TO_OFF(elm, head); \ + (elm)->field.scqe_prev = (head)->scqh_last + \ + (elm)->field.scqe_next; \ + if ((head)->scqh_first == 0) \ + (head)->scqh_first = -(elm)->field.scqe_next; \ + else \ + SH_CIRCLEQ_LASTP(head, type)->field.scqe_next = \ + SH_PTR_TO_OFF(SH_CIRCLEQ_LASTP(head, type), elm); \ + (head)->scqh_last = -(elm)->field.scqe_next; \ +} while (0) + +#define SH_CIRCLEQ_REMOVE(head, elm, field, type) do { \ + if (SH_CIRCLEQ_NEXTP(elm, field, type) == (void *)(head)) \ + (head)->scqh_last += (elm)->field.scqe_prev; \ + else \ + SH_CIRCLEQ_NEXTP(elm, field, type)->field.scqe_prev += \ + (elm)->field.scqe_prev; \ + if (SH_CIRCLEQ_PREVP(elm, field, type) == (void *)(head)) \ + (head)->scqh_first += (elm)->field.scqe_next; \ + else \ + SH_CIRCLEQ_PREVP(elm, field, type)->field.scqe_next += \ + (elm)->field.scqe_next; \ +} while (0) +#endif /* !_SYS_SHQUEUE_H_ */ diff --git a/db2/include/txn.h b/db2/include/txn.h new file mode 100644 index 0000000000..f4e0999b36 --- /dev/null +++ b/db2/include/txn.h @@ -0,0 +1,112 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)txn.h 10.6 (Sleepycat) 7/29/97 + */ +#ifndef _TXN_H_ +#define _TXN_H_ + +/* + * The name of the transaction shared memory region is DEFAULT_TXN_FILE and + * the region is always created group RW of the group owning the directory. + */ +#define DEFAULT_TXN_FILE "__db_txn.share" +#define TXN_INVALID 0xffffffff /* Maximum number of txn ids. */ +#define TXN_MINIMUM 0x80000000 /* First transaction id */ + +/* + * Transaction type declarations. + */ + +/* + * Internal data maintained in shared memory for each transaction. 
+ */ +typedef struct __txn_detail { + u_int32_t txnid; /* current transaction id + used to link free list also */ + DB_LSN last_lsn; /* last lsn written for this txn */ + DB_LSN begin_lsn; /* lsn of begin record */ + size_t last_lock; /* offset in lock region of last lock + for this transaction. */ +#define TXN_UNALLOC 0 +#define TXN_RUNNING 1 +#define TXN_ABORTED 2 +#define TXN_PREPARED 3 + u_int32_t status; /* status of the transaction */ +} TXN_DETAIL; + +/* + * The transaction manager encapsulates the transaction system. It contains + * references to the log and lock managers as well as the state that keeps + * track of the shared memory region. + */ +struct __db_txnmgr { +/* These fields need to be protected for multi-threaded support. */ + db_mutex_t mutex; /* Synchronization. */ + /* list of active transactions */ + TAILQ_HEAD(_chain, __db_txn) txn_chain; + +/* These fields are not protected. */ + DB_ENV *dbenv; /* Environment. */ + int (*recover) /* Recovery dispatch routine */ + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + int fd; /* mapped file descriptor */ + u_int flags; /* DB_TXN_NOSYNC, DB_THREAD */ + size_t reg_size; /* how large we think the region is */ + DB_TXNREGION *region; /* address of shared memory region */ +}; + +/* + * Layout of the shared memory region. + * + */ +struct __db_txnregion { + RLAYOUT hdr; /* Shared memory region header. 
*/ + u_int32_t magic; /* transaction magic number */ + u_int32_t version; /* version number */ + u_int32_t maxtxns; /* maximum number of active txns */ + u_int32_t last_txnid; /* last transaction id given out */ + u_int32_t free_txn; /* head of transaction free list */ + DB_LSN pending_ckp; /* last checkpoint did not finish */ + DB_LSN last_ckp; /* lsn of the last checkpoint */ + time_t time_ckp; /* time of last checkpoint */ + u_int32_t logtype; /* type of logging */ + u_int32_t locktype; /* lock type */ + u_int32_t naborts; /* number of aborted transactions */ + u_int32_t ncommits; /* number of committed transactions */ + u_int32_t nbegins; /* number of begun transactions */ + TXN_DETAIL table[1]; /* array of TXN structures */ +}; + +/* + * Bytes for the region header plus N additional table slots; N is + * parenthesized so an expression argument is multiplied as a whole. + * NOTE(review): slots are sized with sizeof(DB_TXN) although table[] is + * declared as TXN_DETAIL -- confirm against the region create/grow code + * before changing it. + */ +#define TXN_REGION_SIZE(N) \ + (sizeof(DB_TXNREGION) + (N) * sizeof(DB_TXN)) + +/* Macros to lock/unlock the region and threads. */ +#define LOCK_TXNTHREAD(tmgrp) \ + if (F_ISSET(tmgrp, DB_THREAD)) \ + (void)__db_mutex_lock(&(tmgrp)->mutex, -1, \ + (tmgrp)->dbenv == NULL ? NULL : (tmgrp)->dbenv->db_yield) +#define UNLOCK_TXNTHREAD(tmgrp) \ + if (F_ISSET(tmgrp, DB_THREAD)) \ + (void)__db_mutex_unlock(&(tmgrp)->mutex, -1) + +#define LOCK_TXNREGION(tmgrp) \ + (void)__db_mutex_lock(&(tmgrp)->region->hdr.lock,(tmgrp)->fd, \ + (tmgrp)->dbenv == NULL ? NULL : (tmgrp)->dbenv->db_yield) +#define UNLOCK_TXNREGION(tmgrp) \ + (void)__db_mutex_unlock(&(tmgrp)->region->hdr.lock, (tmgrp)->fd) + +/* + * Log record types. + */ +#define TXN_BEGIN 1 +#define TXN_COMMIT 2 +#define TXN_PREPARE 3 +#define TXN_CHECKPOINT 4 + +#include "txn_auto.h" +#include "txn_ext.h" +#endif /* !_TXN_H_ */ diff --git a/db2/include/txn_auto.h b/db2/include/txn_auto.h new file mode 100644 index 0000000000..fd5a456115 --- /dev/null +++ b/db2/include/txn_auto.h @@ -0,0 +1,25 @@ +/* Do not edit: automatically built by dist/db_gen.sh. 
*/ +#ifndef txn_AUTO_H +#define txn_AUTO_H + +#define DB_txn_regop (DB_txn_BEGIN + 1) + +typedef struct _txn_regop_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; +} __txn_regop_args; + + +#define DB_txn_ckp (DB_txn_BEGIN + 2) + +typedef struct _txn_ckp_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + DB_LSN ckp_lsn; + DB_LSN last_ckp; +} __txn_ckp_args; + +#endif diff --git a/db2/include/txn_ext.h b/db2/include/txn_ext.h new file mode 100644 index 0000000000..8ba0b0c44e --- /dev/null +++ b/db2/include/txn_ext.h @@ -0,0 +1,18 @@ +/* Do not edit: automatically built by dist/distrib. */ +int __txn_regop_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t)); +int __txn_regop_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_regop_read __P((void *, __txn_regop_args **)); +int __txn_ckp_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + DB_LSN *, DB_LSN *)); +int __txn_ckp_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_ckp_read __P((void *, __txn_ckp_args **)); +int __txn_init_print __P((DB_ENV *)); +int __txn_init_recover __P((DB_ENV *)); +int __txn_regop_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_ckp_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); diff --git a/db2/lock/lock.c b/db2/lock/lock.c new file mode 100644 index 0000000000..8fc91334a7 --- /dev/null +++ b/db2/lock/lock.c @@ -0,0 +1,1362 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)lock.c 10.31 (Sleepycat) 8/17/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_shash.h" +#include "lock.h" +#include "common_ext.h" +#include "db_am.h" + +static void __lock_checklocker __P((DB_LOCKTAB *, struct __db_lock *, int)); +static int __lock_count_locks __P((DB_LOCKREGION *)); +static int __lock_count_objs __P((DB_LOCKREGION *)); +static int __lock_create __P((const char *, int, DB_ENV *)); +static void __lock_freeobj __P((DB_LOCKTAB *, DB_LOCKOBJ *)); +static int __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, int, const DBT *, + db_lockmode_t, struct __db_lock **)); +static int __lock_grow_region __P((DB_LOCKTAB *, int, size_t)); +static int __lock_put_internal __P((DB_LOCKTAB *, struct __db_lock *, int)); +static void __lock_remove_waiter + __P((DB_LOCKTAB *, DB_LOCKOBJ *, struct __db_lock *, db_status_t)); +static void __lock_reset_region __P((DB_LOCKTAB *)); +static int __lock_validate_region __P((DB_LOCKTAB *)); +#ifdef DEBUG +static void __lock_dump_locker __P((DB_LOCKTAB *, DB_LOCKOBJ *)); +static void __lock_dump_object __P((DB_LOCKTAB *, DB_LOCKOBJ *)); +static void __lock_printlock __P((DB_LOCKTAB *, struct __db_lock *, int)); +#endif + +/* + * Create and initialize a lock region in shared memory. + */ + +/* + * __lock_create -- + * Create the lock region. Returns an errno. In most cases, + * the errno should be that returned by __db_ropen, in which case + * an EAGAIN means that we should retry, and an EEXIST means that + * the region exists and we didn't need to create it. 
Any other + * sort of errno should be treated as a system error, leading to a + * failure of the original interface call. + */ +static int +__lock_create(path, mode, dbenv) + const char *path; + int mode; + DB_ENV *dbenv; +{ + struct __db_lock *lp; + struct lock_header *tq_head; + struct obj_header *obj_head; + DB_LOCKOBJ *op; + DB_LOCKREGION *lrp; + u_int maxlocks; + u_int32_t i; + int fd, lock_modes, nelements, ret; + u_int8_t *conflicts, *curaddr; + + maxlocks = dbenv == NULL || dbenv->lk_max == 0 ? + DB_LOCK_DEFAULT_N : dbenv->lk_max; + lock_modes = dbenv == NULL || dbenv->lk_modes == 0 ? + DB_LOCK_RW_N : dbenv->lk_modes; + conflicts = dbenv == NULL || dbenv->lk_conflicts == NULL ? + (u_int8_t *)db_rw_conflicts : dbenv->lk_conflicts; + + if ((ret = + __db_rcreate(dbenv, DB_APP_NONE, path, DB_DEFAULT_LOCK_FILE, mode, + LOCK_REGION_SIZE(lock_modes, maxlocks, __db_tablesize(maxlocks)), + &fd, &lrp)) != 0) + return (ret); + + /* Region exists; now initialize it. */ + lrp->table_size = __db_tablesize(maxlocks); + lrp->magic = DB_LOCKMAGIC; + lrp->version = DB_LOCKVERSION; + lrp->id = 0; + lrp->maxlocks = maxlocks; + lrp->need_dd = 0; + lrp->detect = DB_LOCK_NORUN; + lrp->numobjs = maxlocks; + lrp->nlockers = 0; + lrp->mem_bytes = ALIGN(STRING_SIZE(maxlocks), sizeof(size_t)); + lrp->increment = lrp->hdr.size / 2; + lrp->nmodes = lock_modes; + lrp->nconflicts = 0; + lrp->nrequests = 0; + lrp->nreleases = 0; + lrp->ndeadlocks = 0; + + /* + * As we write the region, we've got to maintain the alignment + * for the structures that follow each chunk. This information + * ends up being encapsulated both in here as well as in the + * lock.h file for the XXX_SIZE macros. + */ + /* Initialize conflict matrix. */ + curaddr = (u_int8_t *)lrp + sizeof(DB_LOCKREGION); + memcpy(curaddr, conflicts, lock_modes * lock_modes); + curaddr += lock_modes * lock_modes; + + /* + * Initialize hash table. 
+ */ + curaddr = (u_int8_t *)ALIGNP(curaddr, LOCK_HASH_ALIGN); + lrp->hash_off = curaddr - (u_int8_t *)lrp; + nelements = lrp->table_size; + __db_hashinit(curaddr, nelements); + curaddr += nelements * sizeof(DB_HASHTAB); + + /* + * Initialize locks onto a free list. Since locks contains mutexes, + * we need to make sure that each lock is aligned on a MUTEX_ALIGNMENT + * boundary. + */ + curaddr = (u_int8_t *)ALIGNP(curaddr, MUTEX_ALIGNMENT); + tq_head = &lrp->free_locks; + SH_TAILQ_INIT(tq_head); + + for (i = 0; i++ < maxlocks; + curaddr += ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT)) { + lp = (struct __db_lock *)curaddr; + lp->status = DB_LSTAT_FREE; + SH_TAILQ_INSERT_HEAD(tq_head, lp, links, __db_lock); + } + + /* Initialize objects onto a free list. */ + obj_head = &lrp->free_objs; + SH_TAILQ_INIT(obj_head); + + for (i = 0; i++ < maxlocks; curaddr += sizeof(DB_LOCKOBJ)) { + op = (DB_LOCKOBJ *)curaddr; + SH_TAILQ_INSERT_HEAD(obj_head, op, links, __db_lockobj); + } + + /* + * Initialize the string space; as for all shared memory allocation + * regions, this requires size_t alignment, since we store the + * lengths of malloc'd areas in the area.. + */ + curaddr = (u_int8_t *)ALIGNP(curaddr, sizeof(size_t)); + lrp->mem_off = curaddr - (u_int8_t *)lrp; + __db_shalloc_init(curaddr, lrp->mem_bytes); + + /* Release the lock. */ + (void)__db_mutex_unlock(&lrp->hdr.lock, fd); + + /* Now unmap the region. */ + if ((ret = __db_rclose(dbenv, fd, lrp)) != 0) { + (void)lock_unlink(path, 1 /* force */, dbenv); + return (ret); + } + + return (0); +} + +int +lock_open(path, flags, mode, dbenv, ltp) + const char *path; + int flags, mode; + DB_ENV *dbenv; + DB_LOCKTAB **ltp; +{ + DB_LOCKTAB *lt; + int ret, retry_cnt; + + /* Validate arguments. */ +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_THREAD) +#else +#define OKFLAGS (DB_CREATE) +#endif + if ((ret = __db_fchk(dbenv, "lock_open", flags, OKFLAGS)) != 0) + return (ret); + + /* + * Create the lock table structure. 
+ */ + if ((lt = (DB_LOCKTAB *)calloc(1, sizeof(DB_LOCKTAB))) == NULL) { + __db_err(dbenv, "%s", strerror(errno)); + return (ENOMEM); + } + lt->dbenv = dbenv; + + /* + * Now, create the lock region if it doesn't already exist. + */ + retry_cnt = 0; +retry: if (LF_ISSET(DB_CREATE) && + (ret = __lock_create(path, mode, dbenv)) != 0) + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry; + } else if (ret == EEXIST) /* We did not create the region */ + LF_CLR(DB_CREATE); + else + goto out; + + /* + * Finally, open the region, map it in, and increment the + * reference count. + */ + retry_cnt = 0; +retry1: if ((ret = __db_ropen(dbenv, DB_APP_NONE, path, DB_DEFAULT_LOCK_FILE, + LF_ISSET(~(DB_CREATE | DB_THREAD)), <->fd, <->region)) != 0) { + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry1; + } + goto out; + } + + if (lt->region->magic != DB_LOCKMAGIC) { + __db_err(dbenv, "lock_open: Bad magic number"); + ret = EINVAL; + goto out; + } + + /* Check for automatic deadlock detection. */ + if (dbenv->lk_detect != DB_LOCK_NORUN) { + if (lt->region->detect != DB_LOCK_NORUN && + dbenv->lk_detect != DB_LOCK_DEFAULT && + lt->region->detect != dbenv->lk_detect) { + __db_err(dbenv, + "lock_open: incompatible deadlock detector mode"); + ret = EINVAL; + goto out; + } + if (lt->region->detect == DB_LOCK_NORUN) + lt->region->detect = dbenv->lk_detect; + } + + /* Set up remaining pointers into region. */ + lt->conflicts = (u_int8_t *)lt->region + sizeof(DB_LOCKREGION); + lt->hashtab = + (DB_HASHTAB *)((u_int8_t *)lt->region + lt->region->hash_off); + lt->mem = (void *)((u_int8_t *)lt->region + lt->region->mem_off); + lt->reg_size = lt->region->hdr.size; + + *ltp = lt; + return (0); + +/* Error handling. 
*/ +out: if (lt->region != NULL) + (void)__db_rclose(lt->dbenv, lt->fd, lt->region); + if (LF_ISSET(DB_CREATE)) + (void)lock_unlink(path, 1, lt->dbenv); + free(lt); + return (ret); +} + +int +lock_id (lt, idp) + DB_LOCKTAB *lt; + u_int32_t *idp; +{ + u_int32_t id; + + LOCK_LOCKREGION(lt); + if (lt->region->id >= DB_LOCK_MAXID) + lt->region->id = 0; + id = ++lt->region->id; + UNLOCK_LOCKREGION(lt); + + *idp = id; + return (0); +} + +int +lock_vec(lt, locker, flags, list, nlist, elistp) + DB_LOCKTAB *lt; + u_int32_t locker; + int flags, nlist; + DB_LOCKREQ *list, **elistp; +{ + struct __db_lock *lp; + DB_LOCKOBJ *sh_obj, *sh_locker; + int i, ret, run_dd; + + /* Validate arguments. */ + if ((ret = + __db_fchk(lt->dbenv, "lock_vec", flags, DB_LOCK_NOWAIT)) != 0) + return (ret); + + LOCK_LOCKREGION(lt); + + if ((ret = __lock_validate_region(lt)) != 0) { + UNLOCK_LOCKREGION(lt); + return (ret); + } + + ret = 0; + for (i = 0; i < nlist && ret == 0; i++) { + switch (list[i].op) { + case DB_LOCK_GET: + ret = __lock_get_internal(lt, locker, flags, + list[i].obj, list[i].mode, &lp); + if (ret == 0) + list[i].lock = LOCK_TO_OFFSET(lt, lp); + break; + case DB_LOCK_PUT: + lp = OFFSET_TO_LOCK(lt, list[i].lock); + if (lp->holder != locker) { + ret = DB_LOCK_NOTHELD; + break; + } + list[i].mode = lp->mode; + + /* XXX Need to copy the object. ??? */ + ret = __lock_put_internal(lt, lp, 0); + break; + case DB_LOCK_PUT_ALL: + /* Find the locker. */ + if ((ret = __lock_getobj(lt, locker, + NULL, DB_LOCK_LOCKER, &sh_locker)) != 0) + break; + + for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock); + lp != NULL; + lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) { + if ((ret = __lock_put_internal(lt, lp, 0)) != 0) + break; + } + __lock_freeobj(lt, sh_locker); + lt->region->nlockers--; + break; + case DB_LOCK_PUT_OBJ: + + /* Look up the object in the hash table. 
*/ + __db_hashlookup(lt->hashtab, __db_lockobj, links, + list[i].obj, sh_obj, lt->region->table_size, + __lock_ohash, __lock_cmp); + if (sh_obj == NULL) { + ret = EINVAL; + break; + } + /* + * Release waiters first, because they won't cause + * anyone else to be awakened. If we release the + * lockers first, all the waiters get awakened + * needlessly. + */ + for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock); + lp != NULL; + lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock)) { + lt->region->nreleases += lp->refcount; + __lock_remove_waiter(lt, sh_obj, lp, + DB_LSTAT_NOGRANT); + __lock_checklocker(lt, lp, 1); + } + + for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); + lp != NULL; + lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock)) { + + lt->region->nreleases += lp->refcount; + SH_LIST_REMOVE(lp, locker_links, __db_lock); + SH_TAILQ_REMOVE(&sh_obj->holders, lp, links, + __db_lock); + lp->status = DB_LSTAT_FREE; + SH_TAILQ_INSERT_HEAD(<->region->free_locks, + lp, links, __db_lock); + } + + /* Now free the object. */ + __lock_freeobj(lt, sh_obj); + break; +#ifdef DEBUG + case DB_LOCK_DUMP: + /* Find the locker. 
*/ + if ((ret = __lock_getobj(lt, locker, + NULL, DB_LOCK_LOCKER, &sh_locker)) != 0) + break; + + for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock); + lp != NULL; + lp = SH_LIST_NEXT(lp, locker_links, __db_lock)) { + __lock_printlock(lt, lp, 1); + ret = EINVAL; + } + if (ret == 0) { + __lock_freeobj(lt, sh_locker); + lt->region->nlockers--; + } + break; +#endif + default: + ret = EINVAL; + break; + } + } + + if (lt->region->need_dd && lt->region->detect != DB_LOCK_NORUN) { + run_dd = 1; + lt->region->need_dd = 0; + } else + run_dd = 0; + + UNLOCK_LOCKREGION(lt); + + if (ret == 0 && run_dd) + lock_detect(lt, 0, lt->region->detect); + + if (elistp && ret != 0) + *elistp = &list[i - 1]; + return (ret); +} + +int +lock_get(lt, locker, flags, obj, lock_mode, lock) + DB_LOCKTAB *lt; + u_int32_t locker; + int flags; + const DBT *obj; + db_lockmode_t lock_mode; + DB_LOCK *lock; +{ + struct __db_lock *lockp; + int ret; + + /* Validate arguments. */ + if ((ret = + __db_fchk(lt->dbenv, "lock_get", flags, DB_LOCK_NOWAIT)) != 0) + return (ret); + + LOCK_LOCKREGION(lt); + + ret = __lock_validate_region(lt); + if (ret == 0 && (ret = __lock_get_internal(lt, + locker, flags, obj, lock_mode, &lockp)) == 0) { + *lock = LOCK_TO_OFFSET(lt, lockp); + lt->region->nrequests++; + } + + UNLOCK_LOCKREGION(lt); + return (ret); +} + +int +lock_put(lt, lock) + DB_LOCKTAB *lt; + DB_LOCK lock; +{ + struct __db_lock *lockp; + int ret, run_dd; + + LOCK_LOCKREGION(lt); + + if ((ret = __lock_validate_region(lt)) != 0) + return (ret); + else { + lockp = OFFSET_TO_LOCK(lt, lock); + ret = __lock_put_internal(lt, lockp, 0); + } + + __lock_checklocker(lt, lockp, 0); + + if (lt->region->need_dd && lt->region->detect != DB_LOCK_NORUN) { + run_dd = 1; + lt->region->need_dd = 0; + } else + run_dd = 0; + + UNLOCK_LOCKREGION(lt); + + if (ret == 0 && run_dd) + lock_detect(lt, 0, lt->region->detect); + + return (ret); +} + +int +lock_close(lt) + DB_LOCKTAB *lt; +{ + int ret; + + if ((ret = 
__db_rclose(lt->dbenv, lt->fd, lt->region)) != 0) + return (ret); + + /* Free lock table. */ + free(lt); + return (0); +} + +int +lock_unlink(path, force, dbenv) + const char *path; + int force; + DB_ENV *dbenv; +{ + return (__db_runlink(dbenv, + DB_APP_NONE, path, DB_DEFAULT_LOCK_FILE, force)); +} + +/* + * XXX This looks like it could be void, but I'm leaving it returning + * an int because I think it will have to when we go through and add + * the appropriate error checking for the EINTR on mutexes. + */ +static int +__lock_put_internal(lt, lockp, do_all) + DB_LOCKTAB *lt; + struct __db_lock *lockp; + int do_all; +{ + struct __db_lock *lp_w, *lp_h, *next_waiter; + DB_LOCKOBJ *sh_obj; + int state_changed; + + if (lockp->refcount == 0 || (lockp->status != DB_LSTAT_HELD && + lockp->status != DB_LSTAT_WAITING) || lockp->obj == 0) { + __db_err(lt->dbenv, "lock_put: invalid lock %lu", + (u_long)((u_int8_t *)lockp - (u_int8_t *)lt->region)); + return (EINVAL); + } + + if (do_all) + lt->region->nreleases += lockp->refcount; + else + lt->region->nreleases++; + if (do_all == 0 && lockp->refcount > 1) { + lockp->refcount--; + return (0); + } + + /* Get the object associated with this lock. */ + sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj); + + /* Remove lock from locker list. */ + SH_LIST_REMOVE(lockp, locker_links, __db_lock); + + /* Remove this lock from its holders/waitlist. */ + if (lockp->status != DB_LSTAT_HELD) + __lock_remove_waiter(lt, sh_obj, lockp, DB_LSTAT_FREE); + else + SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock); + + /* + * We need to do lock promotion. We also need to determine if + * we're going to need to run the deadlock detector again. If + * we release locks, and there are waiters, but no one gets promoted, + * then we haven't fundamentally changed the lockmgr state, so + * we may still have a deadlock and we have to run again. 
However, + * if there were no waiters, or we actually promoted someone, then + * we are OK and we don't have to run it immediately. + */ + for (lp_w = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock), + state_changed = lp_w == NULL; + lp_w != NULL; + lp_w = next_waiter) { + next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock); + for (lp_h = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); + lp_h != NULL; + lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) { + if (CONFLICTS(lt, lp_h->mode, lp_w->mode) && + lp_h->holder != lp_w->holder) + break; + } + if (lp_h != NULL) /* Found a conflict. */ + break; + + /* No conflict, promote the waiting lock. */ + SH_TAILQ_REMOVE(&sh_obj->waiters, lp_w, links, __db_lock); + lp_w->status = DB_LSTAT_PENDING; + SH_TAILQ_INSERT_TAIL(&sh_obj->holders, lp_w, links); + + /* Wake up waiter. */ + (void)__db_mutex_unlock(&lp_w->mutex, lt->fd); + state_changed = 1; + } + + /* Check if object should be reclaimed. */ + if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL) { + __db_hashremove_el(lt->hashtab, __db_lockobj, links, sh_obj, + lt->region->table_size, __lock_lhash); + __db_shalloc_free(lt->mem, SH_DBT_PTR(&sh_obj->lockobj)); + SH_TAILQ_INSERT_HEAD(<->region->free_objs, sh_obj, links, + __db_lockobj); + state_changed = 1; + } + + /* Free lock. */ + lockp->status = DB_LSTAT_FREE; + SH_TAILQ_INSERT_HEAD(<->region->free_locks, lockp, links, __db_lock); + + /* + * If we did not promote anyone; we need to run the deadlock + * detector again. + */ + if (state_changed == 0) + lt->region->need_dd = 1; + + return (0); +} + +static int +__lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) + DB_LOCKTAB *lt; + u_int32_t locker; + int flags; + const DBT *obj; + db_lockmode_t lock_mode; + struct __db_lock **lockp; +{ + struct __db_lock *newl, *lp; + DB_LOCKOBJ *sh_obj, *sh_locker; + DB_LOCKREGION *lrp; + size_t newl_off; + int ret; + + ret = 0; + /* + * Check that lock mode is valid. 
+ */ + + lrp = lt->region; + if ((u_int32_t)lock_mode >= lrp->nmodes) { + __db_err(lt->dbenv, + "lock_get: invalid lock mode %lu\n", (u_long)lock_mode); + return (EINVAL); + } + + /* Allocate a new lock. Optimize for the common case of a grant. */ + if ((newl = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock)) == NULL) { + if ((ret = __lock_grow_region(lt, DB_LOCK_LOCK, 0)) != 0) + return (ret); + lrp = lt->region; + newl = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock); + } + newl_off = LOCK_TO_OFFSET(lt, newl); + + /* Optimize for common case of granting a lock. */ + SH_TAILQ_REMOVE(&lrp->free_locks, newl, links, __db_lock); + + newl->mode = lock_mode; + newl->status = DB_LSTAT_HELD; + newl->holder = locker; + newl->refcount = 1; + + if ((ret = + __lock_getobj(lt, 0, (DBT *)obj, DB_LOCK_OBJTYPE, &sh_obj)) != 0) + return (ret); + + lrp = lt->region; /* getobj might have grown */ + newl = OFFSET_TO_LOCK(lt, newl_off); + + /* Now make new lock point to object */ + newl->obj = SH_PTR_TO_OFF(newl, sh_obj); + + /* + * Now we have a lock and an object and we need to see if we should + * grant the lock. We use a FIFO ordering so we can only grant a + * new lock if it does not conflict with anyone on the holders list + * OR anyone on the waiters list. In case of conflict, we put the + * new lock on the end of the waiters list. + */ + for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); + lp != NULL; + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { + if (CONFLICTS(lt, lp->mode, lock_mode) && + locker != lp->holder) + break; + else if (lp->holder == locker && lp->mode == lock_mode && + lp->status == DB_LSTAT_HELD) { + /* Lock is already held, just inc the ref count. 
*/ + lp->refcount++; + SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, + __db_lock); + *lockp = lp; + return (0); + } + } + + if (lp == NULL) + for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock); + lp != NULL; + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { + if (CONFLICTS(lt, lp->mode, lock_mode) && + locker != lp->holder) + break; + } + if (lp == NULL) + SH_TAILQ_INSERT_TAIL(&sh_obj->holders, newl, links); + else if (!(flags & DB_LOCK_NOWAIT)) + SH_TAILQ_INSERT_TAIL(&sh_obj->waiters, newl, links); + else { + /* Free the lock and return an error. */ + newl->status = DB_LSTAT_FREE; + SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock); + return (DB_LOCK_NOTGRANTED); + } + + /* + * This is really a blocker for the process, so initialize it + * set. That way the current process will block when it tries + * to get it and the waking process will release it. + */ + (void)__db_mutex_init(&newl->mutex, + MUTEX_LOCK_OFFSET(lt->region, &newl->mutex)); + (void)__db_mutex_lock(&newl->mutex, lt->fd, + lt->dbenv == NULL ? NULL : lt->dbenv->db_yield); + + /* + * Now, insert the lock onto its locker's list. + */ + if ((ret = + __lock_getobj(lt, locker, NULL, DB_LOCK_LOCKER, &sh_locker)) != 0) + return (ret); + + lrp = lt->region; + SH_LIST_INSERT_HEAD(&sh_locker->heldby, newl, locker_links, __db_lock); + + if (lp != NULL) { + newl->status = DB_LSTAT_WAITING; + lrp->nconflicts++; + /* + * We are about to wait; must release the region mutex. + * Then, when we wakeup, we need to reacquire the region + * mutex before continuing. + */ + if (lrp->detect == DB_LOCK_NORUN) + lt->region->need_dd = 1; + UNLOCK_LOCKREGION(lt); + + /* + * We are about to wait; before waiting, see if the deadlock + * detector should be run. + */ + if (lrp->detect != DB_LOCK_NORUN) + ret = lock_detect(lt, 0, lrp->detect); + + (void)__db_mutex_lock(&newl->mutex, + lt->fd, lt->dbenv == NULL ? 
NULL : lt->dbenv->db_yield); + + LOCK_LOCKREGION(lt); + if (newl->status != DB_LSTAT_PENDING) { + /* Return to free list. */ + __lock_checklocker(lt, newl, 0); + SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, + __db_lock); + switch (newl->status) { + case DB_LSTAT_ABORTED: + ret = DB_LOCK_DEADLOCK; + break; + case DB_LSTAT_NOGRANT: + ret = DB_LOCK_NOTGRANTED; + break; + default: + ret = EINVAL; + break; + } + newl->status = DB_LSTAT_FREE; + newl = NULL; + } else + newl->status = DB_LSTAT_HELD; + } + + *lockp = newl; + return (ret); +} + +/* + * This is called at every interface to verify if the region + * has changed size, and if so, to remap the region in and + * reset the process pointers. + */ +static int +__lock_validate_region(lt) + DB_LOCKTAB *lt; +{ + int ret; + + if (lt->reg_size == lt->region->hdr.size) + return (0); + + /* Grow the region. */ + if ((ret = __db_rremap(lt->dbenv, lt->region, + lt->reg_size, lt->region->hdr.size, lt->fd, <->region)) != 0) + return (ret); + + __lock_reset_region(lt); + + return (0); +} + +/* + * We have run out of space; time to grow the region. + */ +static int +__lock_grow_region(lt, which, howmuch) + DB_LOCKTAB *lt; + int which; + size_t howmuch; +{ + struct __db_lock *newl; + struct lock_header *lock_head; + struct obj_header *obj_head; + DB_LOCKOBJ *op; + DB_LOCKREGION *lrp; + float lock_ratio, obj_ratio; + size_t incr, oldsize, used; + u_int32_t i, newlocks, newmem, newobjs; + int ret, usedlocks, usedmem, usedobjs; + u_int8_t *curaddr; + + lrp = lt->region; + oldsize = lrp->hdr.size; + incr = lrp->increment; + + /* Figure out how much of each sort of space we have. */ + usedmem = lrp->mem_bytes - __db_shalloc_count(lt->mem); + usedobjs = lrp->numobjs - __lock_count_objs(lrp); + usedlocks = lrp->maxlocks - __lock_count_locks(lrp); + + /* + * Figure out what fraction of the used space belongs to each + * different type of "thing" in the region. Then partition the + * new space up according to this ratio. 
+ */ + used = usedmem + + usedlocks * ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT) + + usedobjs * sizeof(DB_LOCKOBJ); + + lock_ratio = usedlocks * + ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT) / (float)used; + obj_ratio = usedobjs * sizeof(DB_LOCKOBJ) / (float)used; + + newlocks = (u_int32_t)(lock_ratio * + incr / ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT)); + newobjs = (u_int32_t)(obj_ratio * incr / sizeof(DB_LOCKOBJ)); + newmem = incr - + (newobjs * sizeof(DB_LOCKOBJ) + + newlocks * ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT)); + + /* + * Make sure we allocate enough memory for the object being + * requested. + */ + switch (which) { + case DB_LOCK_LOCK: + if (newlocks == 0) { + newlocks = 10; + incr += newlocks * sizeof(struct __db_lock); + } + break; + case DB_LOCK_OBJ: + if (newobjs == 0) { + newobjs = 10; + incr += newobjs * sizeof(DB_LOCKOBJ); + } + break; + case DB_LOCK_MEM: + if (newmem < howmuch * 2) { + incr += howmuch * 2 - newmem; + newmem = howmuch * 2; + } + break; + } + + newmem += ALIGN(incr, sizeof(size_t)) - incr; + incr = ALIGN(incr, sizeof(size_t)); + + /* + * Since we are going to be allocating locks at the beginning of the + * new chunk, we need to make sure that the chunk is MUTEX_ALIGNMENT + * aligned. We did not guarantee this when we created the region, so + * we may need to pad the old region by extra bytes to ensure this + * alignment. + */ + incr += ALIGN(oldsize, MUTEX_ALIGNMENT) - oldsize; + + __db_err(lt->dbenv, + "Growing lock region: %lu locks %lu objs %lu bytes", + (u_long)newlocks, (u_long)newobjs, (u_long)newmem); + + if ((ret = __db_rgrow(lt->dbenv, lt->fd, incr)) != 0) + return (ret); + if ((ret = __db_rremap(lt->dbenv, + lt->region, oldsize, oldsize + incr, lt->fd, <->region)) != 0) + return (ret); + __lock_reset_region(lt); + + /* Update region parameters. 
*/ + lrp = lt->region; + lrp->increment = incr << 1; + lrp->maxlocks += newlocks; + lrp->numobjs += newobjs; + lrp->mem_bytes += newmem; + + curaddr = (u_int8_t *)lrp + oldsize; + curaddr = (u_int8_t *)ALIGNP(curaddr, MUTEX_ALIGNMENT); + + /* Put new locks onto the free list. */ + lock_head = &lrp->free_locks; + for (i = 0; i++ < newlocks; + curaddr += ALIGN(sizeof(struct __db_lock), MUTEX_ALIGNMENT)) { + newl = (struct __db_lock *)curaddr; + SH_TAILQ_INSERT_HEAD(lock_head, newl, links, __db_lock); + } + + /* Put new objects onto the free list. */ + obj_head = &lrp->free_objs; + for (i = 0; i++ < newobjs; curaddr += sizeof(DB_LOCKOBJ)) { + op = (DB_LOCKOBJ *)curaddr; + SH_TAILQ_INSERT_HEAD(obj_head, op, links, __db_lockobj); + } + + *((size_t *)curaddr) = newmem - sizeof(size_t); + curaddr += sizeof(size_t); + __db_shalloc_free(lt->mem, curaddr); + + return (0); +} + +#ifdef DEBUG +void +__lock_dump_region(lt, flags) + DB_LOCKTAB *lt; + unsigned long flags; +{ + struct __db_lock *lp; + DB_LOCKOBJ *op; + DB_LOCKREGION *lrp; + u_int32_t i, j; + + lrp = lt->region; + + printf("Lock region parameters\n"); + printf("%s:0x%x\t%s:%lu\t%s:%lu\t%s:%lu\n%s:%lu\t%s:%lu\t%s:%lu\t\n", + "magic ", lrp->magic, + "version ", (u_long)lrp->version, + "processes ", (u_long)lrp->hdr.refcnt, + "maxlocks ", (u_long)lrp->maxlocks, + "table size ", (u_long)lrp->table_size, + "nmodes ", (u_long)lrp->nmodes, + "numobjs ", (u_long)lrp->numobjs); + printf("%s:%lu\t%s:%lu\t%s:%lu\n%s:%lu\t%s:%lu\t%s:%lu\n", + "size ", (u_long)lrp->hdr.size, + "nlockers ", (u_long)lrp->nlockers, + "hash_off ", (u_long)lrp->hash_off, + "increment ", (u_long)lrp->increment, + "mem_off ", (u_long)lrp->mem_off, + "mem_bytes ", (u_long)lrp->mem_bytes); +#ifndef HAVE_SPINLOCKS + printf("Mutex: off %lu", (u_long)lrp->hdr.lock.off); +#endif +#ifdef MUTEX_STATISTICS + printf(" waits %lu nowaits %lu", + (u_long)lrp->hdr.lock.mutex_set_wait, + (u_long)lrp->hdr.lock.mutex_set_nowait); +#endif + 
printf("\n%s:%lu\t%s:%lu\t%s:%lu\t%s:%lu\n", + "nconflicts ", (u_long)lrp->nconflicts, + "nrequests ", (u_long)lrp->nrequests, + "nreleases ", (u_long)lrp->nreleases, + "ndeadlocks ", (u_long)lrp->ndeadlocks); + printf("need_dd %lu\n", (u_long)lrp->need_dd); + if (flags & LOCK_DEBUG_CONF) { + printf("\nConflict matrix\n"); + + for (i = 0; i < lrp->nmodes; i++) { + for (j = 0; j < lrp->nmodes; j++) + printf("%lu\t", + (u_long)lt->conflicts[i * lrp->nmodes + j]); + printf("\n"); + } + } + + for (i = 0; i < lrp->table_size; i++) { + op = SH_TAILQ_FIRST(<->hashtab[i], __db_lockobj); + if (op != NULL && flags & LOCK_DEBUG_BUCKET) + printf("Bucket %lu:\n", (unsigned long)i); + while (op != NULL) { + if (op->type == DB_LOCK_LOCKER && + flags & LOCK_DEBUG_LOCKERS) + __lock_dump_locker(lt, op); + else if (flags & LOCK_DEBUG_OBJECTS && + op->type == DB_LOCK_OBJTYPE) + __lock_dump_object(lt, op); + op = SH_TAILQ_NEXT(op, links, __db_lockobj); + } + } + + if (flags & LOCK_DEBUG_LOCK) { + printf("\nLock Free List\n"); + for (lp = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock); + lp != NULL; + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { + printf("0x%x: %lu\t%lu\t%lu\t0x%x\n", (u_int)lp, + (u_long)lp->holder, (u_long)lp->mode, + (u_long)lp->status, (u_int)lp->obj); + } + } + + if (flags & LOCK_DEBUG_LOCK) { + printf("\nObject Free List\n"); + for (op = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj); + op != NULL; + op = SH_TAILQ_NEXT(op, links, __db_lockobj)) + printf("0x%x\n", (u_int)op); + } + + if (flags & LOCK_DEBUG_MEM) { + printf("\nMemory Free List\n"); + __db_shalloc_dump(stdout, lt->mem); + } +} + +static void +__lock_dump_locker(lt, op) + DB_LOCKTAB *lt; + DB_LOCKOBJ *op; +{ + struct __db_lock *lp; + u_int32_t locker; + void *ptr; + + ptr = SH_DBT_PTR(&op->lockobj); + memcpy(&locker, ptr, sizeof(u_int32_t)); + printf("L %lu", (u_long)locker); + + lp = SH_LIST_FIRST(&op->heldby, __db_lock); + if (lp == NULL) { + printf("\n"); + return; + } + for (; lp != NULL; lp = 
SH_LIST_NEXT(lp, locker_links, __db_lock)) + __lock_printlock(lt, lp, 0); +} + +static void +__lock_dump_object(lt, op) + DB_LOCKTAB *lt; + DB_LOCKOBJ *op; +{ + struct __db_lock *lp; + u_int32_t j; + char *ptr; + + ptr = SH_DBT_PTR(&op->lockobj); + for (j = 0; j < op->lockobj.size; ptr++, j++) + printf("%c", (int)*ptr); + printf("\n"); + + printf("H:"); + for (lp = + SH_TAILQ_FIRST(&op->holders, __db_lock); + lp != NULL; + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) + __lock_printlock(lt, lp, 0); + lp = SH_TAILQ_FIRST(&op->waiters, __db_lock); + if (lp != NULL) { + printf("\nW:"); + for (; lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) + __lock_printlock(lt, lp, 0); + } +} + +int +__lock_is_locked(lt, locker, dbt, mode) + DB_LOCKTAB *lt; + u_int32_t locker; + DBT *dbt; + db_lockmode_t mode; +{ + struct __db_lock *lp; + DB_LOCKOBJ *sh_obj; + DB_LOCKREGION *lrp; + + lrp = lt->region; + + /* Look up the object in the hash table. */ + __db_hashlookup(lt->hashtab, __db_lockobj, links, + dbt, sh_obj, lrp->table_size, __lock_ohash, __lock_cmp); + if (sh_obj == NULL) + return (0); + + for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); + lp != NULL; + lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock)) { + if (lp->holder == locker && lp->mode == mode) + return (1); + } + + return (0); +} + +static void +__lock_printlock(lt, lp, ispgno) + DB_LOCKTAB *lt; + struct __db_lock *lp; + int ispgno; +{ + DB_LOCKOBJ *lockobj; + db_pgno_t pgno; + size_t obj; + u_int8_t *ptr; + char *mode, *stat; + + switch (lp->mode) { + case DB_LOCK_IREAD: + mode = "IREAD"; + break; + case DB_LOCK_IWR: + mode = "IWR"; + break; + case DB_LOCK_IWRITE: + mode = "IWRITE"; + break; + case DB_LOCK_NG: + mode = "NG"; + break; + case DB_LOCK_READ: + mode = "READ"; + break; + case DB_LOCK_WRITE: + mode = "WRITE"; + break; + default: + mode = "UNKNOWN"; + break; + } + switch (lp->status) { + case DB_LSTAT_ABORTED: + stat = "ABORT"; + break; + case DB_LSTAT_ERR: + stat = "ERROR"; + break; + case 
DB_LSTAT_FREE: + stat = "FREE"; + break; + case DB_LSTAT_HELD: + stat = "HELD"; + break; + case DB_LSTAT_NOGRANT: + stat = "NONE"; + break; + case DB_LSTAT_WAITING: + stat = "WAIT"; + break; + case DB_LSTAT_PENDING: + stat = "PENDING"; + break; + default: + stat = "UNKNOWN"; + break; + } + printf("\t%lu\t%s\t%lu\t%s\t", + (u_long)lp->holder, mode, (u_long)lp->refcount, stat); + + lockobj = (DB_LOCKOBJ *)((u_int8_t *)lp + lp->obj); + ptr = SH_DBT_PTR(&lockobj->lockobj); + if (ispgno) { + /* Assume this is a DBT lock. */ + memcpy(&pgno, ptr, sizeof(db_pgno_t)); + printf("page %lu\n", (u_long)pgno); + } else { + obj = (u_int8_t *)lp + lp->obj - (u_int8_t *)lt->region; + printf("0x%lx ", (u_long)obj); + __db_pr(ptr, lockobj->lockobj.size); + printf("\n"); + } +} + +#endif + +static int +__lock_count_locks(lrp) + DB_LOCKREGION *lrp; +{ + struct __db_lock *newl; + int count; + + count = 0; + for (newl = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock); + newl != NULL; + newl = SH_TAILQ_NEXT(newl, links, __db_lock)) + count++; + + return (count); +} + +static int +__lock_count_objs(lrp) + DB_LOCKREGION *lrp; +{ + DB_LOCKOBJ *obj; + int count; + + count = 0; + for (obj = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj); + obj != NULL; + obj = SH_TAILQ_NEXT(obj, links, __db_lockobj)) + count++; + + return (count); +} + +/* + * PUBLIC: int __lock_getobj __P((DB_LOCKTAB *, + * PUBLIC: u_int32_t, DBT *, u_int32_t type, DB_LOCKOBJ **)); + */ +int +__lock_getobj(lt, locker, dbt, type, objp) + DB_LOCKTAB *lt; + u_int32_t locker, type; + DBT *dbt; + DB_LOCKOBJ **objp; +{ + DB_LOCKREGION *lrp; + DB_LOCKOBJ *sh_obj; + u_int32_t obj_size; + int ret; + void *p, *src; + + lrp = lt->region; + + /* Look up the object in the hash table. 
*/ + if (type == DB_LOCK_OBJTYPE) { + __db_hashlookup(lt->hashtab, __db_lockobj, links, dbt, sh_obj, + lrp->table_size, __lock_ohash, __lock_cmp); + obj_size = dbt->size; + } else { + __db_hashlookup(lt->hashtab, __db_lockobj, links, locker, + sh_obj, lrp->table_size, __lock_locker_hash, + __lock_locker_cmp); + obj_size = sizeof(locker); + } + + /* + * If we found the object, then we can just return it. If + * we didn't find the object, then we need to create it. + */ + if (sh_obj == NULL) { + /* Create new object and then insert it into hash table. */ + if ((sh_obj = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj)) + == NULL) { + if ((ret = __lock_grow_region(lt, DB_LOCK_OBJ, 0)) != 0) + return (ret); + lrp = lt->region; + sh_obj = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj); + } + if ((ret = __db_shalloc(lt->mem, obj_size, 0, &p)) != 0) { + if ((ret = __lock_grow_region(lt, + DB_LOCK_MEM, obj_size)) != 0) + return (ret); + lrp = lt->region; + /* Reacquire the head of the list. */ + sh_obj = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj); + (void)__db_shalloc(lt->mem, obj_size, 0, &p); + } + sh_obj->type = type; + src = type == DB_LOCK_OBJTYPE ? dbt->data : (void *)&locker; + memcpy(p, src, obj_size); + SH_TAILQ_REMOVE(&lrp->free_objs, sh_obj, links, __db_lockobj); + + SH_TAILQ_INIT(&sh_obj->waiters); + if (type == DB_LOCK_LOCKER) + SH_LIST_INIT(&sh_obj->heldby); + else + SH_TAILQ_INIT(&sh_obj->holders); + sh_obj->lockobj.size = obj_size; + sh_obj->lockobj.off = SH_PTR_TO_OFF(&sh_obj->lockobj, p); + + __db_hashinsert(lt->hashtab, __db_lockobj, links, sh_obj, + lrp->table_size, __lock_lhash); + + if (type == DB_LOCK_LOCKER) + lrp->nlockers++; + } + + *objp = sh_obj; + return (0); +} + +/* + * Any lock on the waitlist has a process waiting for it. Therefore, we + * can't return the lock to the freelist immediately. 
Instead, we can + * remove the lock from the list of waiters, set the status field of the + * lock, and then let the process waking up return the lock to the + * free list. + */ +static void +__lock_remove_waiter(lt, sh_obj, lockp, status) + DB_LOCKTAB *lt; + DB_LOCKOBJ *sh_obj; + struct __db_lock *lockp; + db_status_t status; +{ + SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock); + lockp->status = status; + + /* Wake whoever is waiting on this lock. */ + (void)__db_mutex_unlock(&lockp->mutex, lt->fd); +} + +static void +__lock_freeobj(lt, obj) + DB_LOCKTAB *lt; + DB_LOCKOBJ *obj; +{ + __db_hashremove_el(lt->hashtab, __db_lockobj, links, + obj, lt->region->table_size, __lock_lhash); + __db_shalloc_free(lt->mem, SH_DBT_PTR(&obj->lockobj)); + SH_TAILQ_INSERT_HEAD(<->region->free_objs, obj, links, __db_lockobj); +} + +static void +__lock_checklocker(lt, lockp, do_remove) + DB_LOCKTAB *lt; + struct __db_lock *lockp; + int do_remove; +{ + DB_LOCKOBJ *sh_locker; + + if (do_remove) + SH_LIST_REMOVE(lockp, locker_links, __db_lock); + + /* if the locker list is NULL, free up the object. */ + if (__lock_getobj(lt, lockp->holder, NULL, DB_LOCK_LOCKER, &sh_locker) + == 0 && SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL) { + __lock_freeobj(lt, sh_locker); + lt->region->nlockers--; + } +} + +static void +__lock_reset_region(lt) + DB_LOCKTAB *lt; +{ + lt->conflicts = (u_int8_t *)lt->region + sizeof(DB_LOCKREGION); + lt->hashtab = + (DB_HASHTAB *)((u_int8_t *)lt->region + lt->region->hash_off); + lt->mem = (void *)((u_int8_t *)lt->region + lt->region->mem_off); + lt->reg_size = lt->region->hdr.size; +} diff --git a/db2/lock/lock_conflict.c b/db2/lock/lock_conflict.c new file mode 100644 index 0000000000..ff0287f07e --- /dev/null +++ b/db2/lock/lock_conflict.c @@ -0,0 +1,39 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)lock_conflict.c 10.2 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" + +/* + * The conflict arrays are set up such that the row is the lock you + * are holding and the column is the lock that is desired. + */ +const u_int8_t db_rw_conflicts[] = { + /* N R W */ + /* N */ 0, 0, 0, + /* R */ 0, 0, 1, + /* W */ 0, 1, 1 +}; + +const u_int8_t db_riw_conflicts[] = { + /* N S X IS IX SIX */ + /* N */ 0, 0, 0, 0, 0, 0, + /* S */ 0, 0, 1, 0, 1, 1, + /* X */ 1, 1, 1, 1, 1, 1, + /* IS */ 0, 0, 1, 0, 0, 0, + /* IX */ 0, 1, 1, 0, 0, 0, + /* SIX */ 0, 1, 1, 0, 0, 0 +}; diff --git a/db2/lock/lock_deadlock.c b/db2/lock/lock_deadlock.c new file mode 100644 index 0000000000..54a73afd1b --- /dev/null +++ b/db2/lock/lock_deadlock.c @@ -0,0 +1,496 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. 
All rights reserved.\n"; +static const char sccsid[] = "@(#)lock_deadlock.c 10.20 (Sleepycat) 8/21/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "common_ext.h" + +#define ISSET_MAP(M, N) (M[(N) / 32] & (1 << (N) % 32)) + +#define CLEAR_MAP(M, N) { \ + u_int32_t __i; \ + for (__i = 0; __i < (N); __i++) \ + M[__i] = 0; \ +} + +#define SET_MAP(M, B) (M[(B) / 32] |= (1 << ((B) % 32))) +#define CLR_MAP(M, B) (M[(B) / 32] &= ~(1 << ((B) % 32))) + +#define OR_MAP(D, S, N) { \ + u_int32_t __i; \ + for (__i = 0; __i < (N); __i++) \ + D[__i] |= S[__i]; \ +} +#define BAD_KILLID 0xffffffff + +typedef struct { + int valid; + u_int32_t id; + DB_LOCK last_lock; +} locker_info; + +static int __dd_abort __P((DB_ENV *, locker_info *)); +static int __dd_build __P((DB_ENV *, u_int32_t **, int *, locker_info **)); +#ifdef DEBUG +static void __dd_debug __P((DB_ENV *, locker_info *, u_int32_t *, int)); +#endif +static u_int32_t + *__dd_find __P((u_int32_t *, locker_info *, u_int32_t)); + +int +lock_detect(lt, flags, atype) + DB_LOCKTAB *lt; + int flags; + u_int32_t atype; +{ + DB_ENV *dbenv; + locker_info *idmap; + u_int32_t *bitmap, *deadlock, killid; + int do_pass, i, nlockers, nentries, ret; + + /* Validate arguments. */ + if ((ret = + __db_fchk(lt->dbenv, "lock_detect", flags, DB_LOCK_CONFLICT)) != 0) + return (ret); + + /* Check if a detector run is necessary. */ + do_pass = 1; + dbenv = lt->dbenv; + if (LF_ISSET(DB_LOCK_CONFLICT)) { + /* Make a pass every time a lock waits. */ + LOCK_LOCKREGION(lt); + do_pass = dbenv->lk_info->region->need_dd != 0; + UNLOCK_LOCKREGION(lt); + } + + if (!do_pass) + return (0); + + /* Build the waits-for bitmap. 
*/ + if ((ret = __dd_build(dbenv, &bitmap, &nlockers, &idmap)) != 0) + return (ret); + + if (nlockers == 0) + return (0); +#ifdef DEBUG + if (dbenv->db_verbose != 0) + __dd_debug(dbenv, idmap, bitmap, nlockers); +#endif + /* Find a deadlock. */ + deadlock = __dd_find(bitmap, idmap, nlockers); + nentries = ALIGN(nlockers, 32) / 32; + killid = BAD_KILLID; + if (deadlock != NULL) { + /* Kill someone. */ + switch (atype) { + case DB_LOCK_OLDEST: + /* + * Find the first bit set in the current + * array and then look for a lower tid in + * the array. + */ + for (i = 0; i < nlockers; i++) + if (ISSET_MAP(deadlock, i)) + killid = i; + + if (killid == BAD_KILLID) { + __db_err(dbenv, + "warning: could not find %s", + "locker to abort"); + break; + } + + /* + * The oldest transaction has the lowest + * transaction id. + */ + for (i = killid + 1; i < nlockers; i++) + if (ISSET_MAP(deadlock, i) && + idmap[i].id < idmap[killid].id) + killid = i; + break; + case DB_LOCK_DEFAULT: + case DB_LOCK_RANDOM: + /* + * We are trying to calculate the id of the + * locker whose entry is indicated by deadlock. + * We know that this is less than nlockers, so + * the cast below is valid. + */ + killid = + (u_int32_t)((deadlock - bitmap) / nentries); + break; + case DB_LOCK_YOUNGEST: + /* + * Find the first bit set in the current + * array and then look for a lower tid in + * the array. + */ + for (i = 0; i < nlockers; i++) + if (ISSET_MAP(deadlock, i)) + killid = i; + + if (killid == BAD_KILLID) { + __db_err(dbenv, + "warning: could not find %s", + "locker to abort"); + break; + } + /* + * The youngest transaction has the highest + * transaction id. + */ + for (i = killid + 1; i < nlockers; i++) + if (ISSET_MAP(deadlock, i) && + idmap[i].id > idmap[killid].id) + killid = i; + break; + default: + killid = BAD_KILLID; + ret = EINVAL; + } + + /* Kill the locker with lockid idmap[killid]. 
*/ + if (dbenv->db_verbose != 0 && killid != BAD_KILLID) + __db_err(dbenv, "Aborting locker %lx", + (u_long)idmap[killid].id); + + if (killid != BAD_KILLID && + (ret = __dd_abort(dbenv, &idmap[killid])) != 0) + __db_err(dbenv, + "warning: unable to abort locker %lx", + (u_long)idmap[killid].id); + } + free(bitmap); + free(idmap); + + return (ret); +} + +/* + * ======================================================================== + * Utilities + */ +static int +__dd_build(dbenv, bmp, nlockers, idmap) + DB_ENV *dbenv; + u_int32_t **bmp; + int *nlockers; + locker_info **idmap; +{ + DB_LOCKTAB *lt; + DB_LOCKOBJ *op, *lockerp; + struct __db_lock *lp; + u_int32_t *bitmap, count, *entryp, i, id, nentries, *tmpmap; + locker_info *id_array; + int is_first, ret; + + lt = dbenv->lk_info; + + /* + * We'll check how many lockers there are, add a few more in for + * good measure and then allocate all the structures. Then we'll + * verify that we have enough room when we go back in and get the + * mutex the second time. + */ + LOCK_LOCKREGION(lt); +retry: count = lt->region->nlockers; + lt->region->need_dd = 0; + UNLOCK_LOCKREGION(lt); + + if (count == 0) { + *nlockers = 0; + return (0); + } + + if (dbenv->db_verbose) + __db_err(dbenv, "%lu lockers", (u_long)count); + + count += 10; + nentries = ALIGN(count, 32) / 32; + /* + * Allocate enough space for a count by count bitmap matrix. + * + * XXX + * We can probably save the malloc's between iterations just + * reallocing if necessary because count grew by too much. 
+ */ + if ((bitmap = (u_int32_t *)calloc((size_t)count, + sizeof(u_int32_t) * nentries)) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); + } + + if ((tmpmap = + (u_int32_t *)calloc(sizeof(u_int32_t), nentries)) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + free(bitmap); + return (ENOMEM); + } + + if ((id_array = (locker_info *)calloc((size_t)count, + sizeof(locker_info))) == NULL) { + __db_err(dbenv, "%s", strerror(ENOMEM)); + free(bitmap); + free(tmpmap); + return (ENOMEM); + } + + /* + * Now go back in and actually fill in the matrix. + */ + LOCK_LOCKREGION(lt); + if (lt->region->nlockers > count) { + free(bitmap); + free(tmpmap); + free(id_array); + goto retry; + } + + /* + * First we go through and assign each locker a deadlock detector id. + * Note that we fill in the idmap in the next loop since that's the + * only place where we conveniently have both the deadlock id and the + * actual locker. + */ + for (id = 0, i = 0; i < lt->region->table_size; i++) + for (op = SH_TAILQ_FIRST(&lt->hashtab[i], __db_lockobj); + op != NULL; op = SH_TAILQ_NEXT(op, links, __db_lockobj)) + if (op->type == DB_LOCK_LOCKER) + op->dd_id = id++; + /* + * We go through the hash table and find each object. For each object, + * we traverse the waiters list and add an entry in the waitsfor matrix + * for each waiter/holder combination. + */ + for (i = 0; i < lt->region->table_size; i++) { + for (op = SH_TAILQ_FIRST(&lt->hashtab[i], __db_lockobj); + op != NULL; op = SH_TAILQ_NEXT(op, links, __db_lockobj)) { + if (op->type != DB_LOCK_OBJTYPE) + continue; + CLEAR_MAP(tmpmap, nentries); + + /* + * First we go through and create a bit map that + * represents all the holders of this object. 
+ */ + for (lp = SH_TAILQ_FIRST(&op->holders, __db_lock); + lp != NULL; + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { + if ((errno = __lock_getobj(lt, lp->holder, + NULL, DB_LOCK_LOCKER, &lockerp)) != 0) { + __db_err(dbenv, + "warning unable to find object"); + continue; + } + id_array[lockerp->dd_id].id = lp->holder; + id_array[lockerp->dd_id].valid = 1; + + /* + * If the holder has already been aborted, then + * we should ignore it for now. + */ + if (lp->status == DB_LSTAT_HELD) + SET_MAP(tmpmap, lockerp->dd_id); + } + + /* + * Next, for each waiter, we set its row in the matrix + * equal to the map of holders we set up above. + */ + for (is_first = 1, + lp = SH_TAILQ_FIRST(&op->waiters, __db_lock); + lp != NULL; + is_first = 0, + lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { + if ((ret = __lock_getobj(lt, + lp->holder, NULL, DB_LOCK_LOCKER, &lockerp)) + != 0) { + __db_err(dbenv, + "warning unable to find object"); + continue; + } + id_array[lockerp->dd_id].id = lp->holder; + id_array[lockerp->dd_id].valid = 1; + + /* + * If the transaction is pending abortion, then + * ignore it on this iteration. + */ + if (lp->status != DB_LSTAT_WAITING) + continue; + + entryp = bitmap + (nentries * lockerp->dd_id); + OR_MAP(entryp, tmpmap, nentries); + /* + * If this is the first waiter on the queue, + * then we remove the waitsfor relationship + * with oneself. However, if it's anywhere + * else on the queue, then we have to keep + * it and we have an automatic deadlock. + */ + if (is_first) + CLR_MAP(entryp, lockerp->dd_id); + } + } + } + + /* Now for each locker; record its last lock. 
*/ + for (id = 0; id < count; id++) { + if (!id_array[id].valid) + continue; + if ((ret = __lock_getobj(lt, + id_array[id].id, NULL, DB_LOCK_LOCKER, &lockerp)) != 0) { + __db_err(dbenv, + "No locks for locker %lu", (u_long)id_array[id].id); + continue; + } + lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock); + if (lp != NULL) + id_array[id].last_lock = LOCK_TO_OFFSET(lt, lp); + } + + /* Pass complete, reset the deadlock detector bit. */ + lt->region->need_dd = 0; + UNLOCK_LOCKREGION(lt); + + /* + * Now we can release everything except the bitmap matrix that we + * created. + */ + *nlockers = id; + *idmap = id_array; + *bmp = bitmap; + free(tmpmap); + return (0); +} + +static u_int32_t * +__dd_find(bmp, idmap, nlockers) + u_int32_t *bmp; + locker_info *idmap; + u_int32_t nlockers; +{ + u_int32_t i, j, nentries, *mymap, *tmpmap; + + /* + * For each locker, or in the bits from the lockers + * on which that locker is waiting. + */ + nentries = ALIGN(nlockers, 32) / 32; + for (mymap = bmp, i = 0; i < nlockers; i++, mymap += nentries) { + if (!idmap[i].valid) + continue; + for (j = 0; j < nlockers; j++) { + if (ISSET_MAP(mymap, j)) { + /* Find the map for this bit. */ + tmpmap = bmp + (nentries * j); + OR_MAP(mymap, tmpmap, nentries); + if (ISSET_MAP(mymap, i)) + return (mymap); + } + } + } + return (NULL); +} + +static int +__dd_abort(dbenv, info) + DB_ENV *dbenv; + locker_info *info; +{ + DB_LOCKTAB *lt; + DB_LOCKOBJ *lockerp, *sh_obj; + struct __db_lock *lockp; + int ret; + + lt = dbenv->lk_info; + LOCK_LOCKREGION(lt); + + /* Find the locker's last lock. */ + if ((ret = + __lock_getobj(lt, info->id, NULL, DB_LOCK_LOCKER, &lockerp)) != 0) + goto out; + + lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock); + if (LOCK_TO_OFFSET(lt, lockp) != info->last_lock || + lockp == NULL || lockp->status != DB_LSTAT_WAITING) + goto out; + + /* Abort lock, take it off list, and wake up this lock. 
*/ + lockp->status = DB_LSTAT_ABORTED; + lt->region->ndeadlocks++; + SH_LIST_REMOVE(lockp, locker_links, __db_lock); + sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj); + SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock); + (void)__db_mutex_unlock(&lockp->mutex, lt->fd); + + ret = 0; + +out: UNLOCK_LOCKREGION(lt); + return (ret); +} + +#ifdef DEBUG +static void +__dd_debug(dbenv, idmap, bitmap, nlockers) + DB_ENV *dbenv; + locker_info *idmap; + u_int32_t *bitmap; + int nlockers; +{ + u_int32_t *mymap; + int i, j, nentries; + char *msgbuf; + + __db_err(dbenv, "Waitsfor array"); + __db_err(dbenv, "waiter\twaiting on"); + /* + * Alloc space to print 10 bytes per item waited on. + */ + if ((msgbuf = (char *)malloc((nlockers + 1) * 10 + 64)) == NULL) { + errno = ENOMEM; + __db_err(dbenv, "%s", strerror(errno)); + return; + } + + nentries = ALIGN(nlockers, 32) / 32; + for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nentries) { + if (!idmap[i].valid) + continue; + sprintf(msgbuf, "%lx\t\t", (u_long)idmap[i].id);/* Waiter. */ + for (j = 0; j < nlockers; j++) + if (ISSET_MAP(mymap, j)) + sprintf(msgbuf, "%s %lx", msgbuf, + (u_long)idmap[j].id); + (void)sprintf(msgbuf, + "%s %lu", msgbuf, (u_long)idmap[i].last_lock); + __db_err(dbenv, msgbuf); + } + + free(msgbuf); +} +#endif diff --git a/db2/lock/lock_util.c b/db2/lock/lock_util.c new file mode 100644 index 0000000000..4063849f28 --- /dev/null +++ b/db2/lock/lock_util.c @@ -0,0 +1,103 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)lock_util.c 10.4 (Sleepycat) 7/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_shash.h" +#include "hash.h" +#include "lock.h" + +/* + * This function is used to compare a DBT that is about to be entered + * into a hash table with an object already in the hash table. Note + * that it just returns true on equal and 0 on not-equal. Therefore this + * cannot be used as a sort function; its purpose is to be used as a + * hash comparison function. + * PUBLIC: int __lock_cmp __P((DBT *, DB_LOCKOBJ *)); + */ +int +__lock_cmp(dbt, lock_obj) + DBT *dbt; + DB_LOCKOBJ *lock_obj; +{ + void *obj_data; + + if (lock_obj->type != DB_LOCK_OBJTYPE) + return (0); + obj_data = SH_DBT_PTR(&lock_obj->lockobj); + return (dbt->size == lock_obj->lockobj.size && + memcmp(dbt->data, obj_data, dbt->size) == 0); +} + +/* + * PUBLIC: int __lock_locker_cmp __P((u_int32_t, DB_LOCKOBJ *)); + */ +int +__lock_locker_cmp(locker, lock_obj) + u_int32_t locker; + DB_LOCKOBJ *lock_obj; +{ + void *obj_data; + + if (lock_obj->type != DB_LOCK_LOCKER) + return (0); + + obj_data = SH_DBT_PTR(&lock_obj->lockobj); + return (memcmp(&locker, obj_data, sizeof(u_int32_t)) == 0); +} + +/* + * PUBLIC: int __lock_ohash __P((DBT *)); + */ +int +__lock_ohash(dbt) + DBT *dbt; +{ + return (__ham_func5(dbt->data, dbt->size)); +} + +/* + * PUBLIC: u_int32_t __lock_locker_hash __P((u_int32_t)); + */ +u_int32_t +__lock_locker_hash(locker) + u_int32_t locker; +{ + return (__ham_func5(&locker, sizeof(locker))); +} + +/* + * PUBLIC: u_int32_t __lock_lhash __P((DB_LOCKOBJ *)); + */ +u_int32_t +__lock_lhash(lock_obj) + DB_LOCKOBJ *lock_obj; +{ + void *obj_data; + + obj_data = SH_DBT_PTR(&lock_obj->lockobj); 
+ return (__ham_func5(obj_data, lock_obj->lockobj.size)); +} + diff --git a/db2/log/log.c b/db2/log/log.c new file mode 100644 index 0000000000..1684ce8cc8 --- /dev/null +++ b/db2/log/log.c @@ -0,0 +1,438 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log.c 10.24 (Sleepycat) 8/16/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "log.h" +#include "db_dispatch.h" +#include "txn_auto.h" +#include "common_ext.h" + +static int __log_recover __P((DB_ENV *, DB_LOG *)); + +/* + * log_open -- + * Initialize and/or join a log. + */ +int +log_open(path, flags, mode, dbenv, lpp) + const char *path; + int flags; + int mode; + DB_ENV *dbenv; + DB_LOG **lpp; +{ + DB_LOG *dblp; + LOG *lp; + size_t len; + int fd, newregion, ret, retry_cnt; + + /* Validate arguments. */ +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_THREAD) +#else +#define OKFLAGS (DB_CREATE) +#endif + if ((ret = __db_fchk(dbenv, "log_open", flags, OKFLAGS)) != 0) + return (ret); + + /* + * We store 4-byte offsets into the file, so the maximum file + * size can't be larger than that. + */ + if (dbenv != NULL && dbenv->lg_max > UINT32_T_MAX) { + __db_err(dbenv, "log_open: maximum file size too large"); + return (EINVAL); + } + + /* Create and initialize the DB_LOG structure. 
*/ + if ((dblp = (DB_LOG *)calloc(1, sizeof(DB_LOG))) == NULL) + return (ENOMEM); + + dblp->dbenv = dbenv; + dblp->lfd = -1; + ZERO_LSN(dblp->c_lsn); + dblp->c_fd = -1; + if (LF_ISSET(DB_THREAD)) { + F_SET(dblp, DB_AM_THREAD); + (void)__db_mutex_init(&dblp->mutex, -1); + } + + /* + * The log region isn't fixed size because we store the registered + * file names there. Make it fairly large so that we don't have to + * grow it. + */ + len = 30 * 1024; + + /* Map in the region. */ + retry_cnt = newregion = 0; +retry: if (LF_ISSET(DB_CREATE)) { + ret = __db_rcreate(dbenv, DB_APP_LOG, path, + DB_DEFAULT_LOG_FILE, mode, len, &fd, &dblp->maddr); + if (ret == 0) { + /* Put the LOG structure first in the region. */ + lp = dblp->maddr; + + /* Initialize the rest of the region as free space. */ + dblp->addr = (u_int8_t *)dblp->maddr + sizeof(LOG); + __db_shalloc_init(dblp->addr, len - sizeof(LOG)); + + /* Initialize the LOG structure. */ + lp->persist.lg_max = dbenv == NULL ? 0 : dbenv->lg_max; + if (lp->persist.lg_max == 0) + lp->persist.lg_max = DEFAULT_MAX; + lp->persist.magic = DB_LOGMAGIC; + lp->persist.version = DB_LOGVERSION; + lp->persist.mode = mode; + SH_TAILQ_INIT(&lp->fq); + + /* Initialize LOG LSNs. */ + lp->lsn.file = 1; + lp->lsn.offset = 0; + + newregion = 1; + } else if (ret != EEXIST) + return (ret); + } + + /* If we didn't or couldn't create the region, try and join it. */ + if (!newregion && + (ret = __db_ropen(dbenv, DB_APP_LOG, + path, DB_DEFAULT_LOG_FILE, 0, &fd, &dblp->maddr)) != 0) { + /* + * If we fail because the file isn't available, wait a + * second and try again. + */ + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry; + } + return (ret); + } + + /* Set up the common information. */ + dblp->lp = dblp->maddr; + dblp->addr = (u_int8_t *)dblp->maddr + sizeof(LOG); + dblp->fd = fd; + + /* + * If doing recovery, try and recover any previous log files + * before releasing the lock. 
+ */ + if (newregion) { + if ((ret = __log_recover(dbenv, dblp)) != 0) { + log_unlink(path, 1, dbenv); + return (ret); + } + UNLOCK_LOGREGION(dblp); + } + *lpp = dblp; + return (0); +} + +/* + * __log_recover -- + * Recover a log. + */ +static int +__log_recover(dbenv, dblp) + DB_ENV *dbenv; + DB_LOG *dblp; +{ + DBT dbt; + DB_LSN lsn; + LOG *lp; + u_int32_t chk; + int cnt, found_checkpoint, ret; + + lp = dblp->lp; + + /* + * Find a log file. If none exist, we simply return, leaving + * everything initialized to a new log. + */ + if ((ret = __log_find(dbenv, lp, &cnt)) != 0) + return (ret); + if (cnt == 0) + return (0); + + /* We have a log file name, find the last one. */ + while (cnt < MAXLFNAME) + if (__log_valid(dbenv, lp, ++cnt) != 0) { + --cnt; + break; + } + + /* + * We have the last useful log file and we've loaded any persistent + * information. Pretend that the log is larger than it can possibly + * be, and read this file, looking for a checkpoint and its end. + */ + dblp->c_lsn.file = cnt; + dblp->c_lsn.offset = 0; + lsn = dblp->c_lsn; + lp->lsn.file = cnt + 1; + lp->lsn.offset = 0; + + /* Set the cursor. Shouldn't fail, leave error messages on. */ + memset(&dbt, 0, sizeof(dbt)); + if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0) + return (ret); + + /* + * Read to the end of the file, saving checkpoints. This will fail + * at some point, so turn off error messages. + */ + found_checkpoint = 0; + while (__log_get(dblp, &lsn, &dbt, DB_NEXT, 1) == 0) { + if (dbt.size < sizeof(u_int32_t)) + continue; + memcpy(&chk, dbt.data, sizeof(u_int32_t)); + if (chk == DB_txn_ckp) { + lp->c_lsn = lsn; + found_checkpoint = 1; + } + } + + /* + * We know where the end of the log is. Since that record is on disk, + * it's also the last-synced LSN. + */ + lp->lsn = lsn; + lp->lsn.offset += dblp->c_len; + lp->s_lsn = lp->lsn; + + /* Set up the current buffer information, too. 
*/ + lp->len = dblp->c_len; + lp->b_off = 0; + lp->w_off = lp->lsn.offset; + + /* + * It's possible that we didn't find a checkpoint because there wasn't + * one in the last log file. Start searching. + */ + while (!found_checkpoint && cnt > 1) { + dblp->c_lsn.file = --cnt; + dblp->c_lsn.offset = 0; + lsn = dblp->c_lsn; + + /* Set the cursor. Shouldn't fail, leave error messages on. */ + if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0) + return (ret); + + /* + * Read to the end of the file, saving checkpoints. Shouldn't + * fail, leave error messages on. + */ + while (__log_get(dblp, &lsn, &dbt, DB_NEXT, 0) == 0) { + if (dbt.size < sizeof(u_int32_t)) + continue; + memcpy(&chk, dbt.data, sizeof(u_int32_t)); + if (chk == DB_txn_ckp) { + lp->c_lsn = lsn; + found_checkpoint = 1; + } + } + } + + /* If we never find a checkpoint, that's okay, just 0 it out. */ + if (!found_checkpoint) { + lp->c_lsn.file = 1; + lp->c_lsn.offset = 0; + } + + __db_err(dbenv, + "Recovering the log: last valid LSN: file: %lu offset %lu", + (u_long)lp->lsn.file, (u_long)lp->lsn.offset); + + /* Reset the cursor. */ + ZERO_LSN(dblp->c_lsn); + + return (0); +} + +/* + * __log_find -- + * Try to find a log file. + * + * PUBLIC: int __log_find __P((DB_ENV *, LOG *, int *)); + */ +int +__log_find(dbenv, lp, valp) + DB_ENV *dbenv; + LOG *lp; + int *valp; +{ + int cnt, fcnt, logval, ret; + const char *dir; + char **names, *p, *q; + + /* Find the directory name. */ + if ((ret = __log_name(dbenv, 1, &p)) != 0) + return (ret); + if ((q = __db_rpath(p)) == NULL) + dir = PATH_DOT; + else { + *q = '\0'; + dir = p; + } + + /* Get the list of file names. */ + ret = __db_dir(dbenv, dir, &names, &fcnt); + FREES(p); + if (ret != 0) + return (ret); + + /* + * Search for a valid log file name, return a value of 0 on + * failure. 
+ */ + *valp = 0; + for (cnt = fcnt, logval = 0; --cnt >= 0;) + if (strncmp(names[cnt], "log.", sizeof("log.") - 1) == 0) { + logval = atoi(names[cnt] + 4); + if (logval != 0 && + __log_valid(dbenv, lp, logval) == 0) { + *valp = logval; + break; + } + } + + /* Discard the list. */ + __db_dirf(dbenv, names, fcnt); + + return (ret); +} + +/* + * log_valid -- + * Validate a log file. + * + * PUBLIC: int __log_valid __P((DB_ENV *, LOG *, int)); + */ +int +__log_valid(dbenv, lp, cnt) + DB_ENV *dbenv; + LOG *lp; + int cnt; +{ + LOGP persist; + ssize_t nw; + int fd, ret; + char *p; + + if ((ret = __log_name(dbenv, cnt, &p)) != 0) + return (ret); + + fd = -1; + if ((ret = __db_fdopen(p, + DB_RDONLY | DB_SEQUENTIAL, + DB_RDONLY | DB_SEQUENTIAL, 0, &fd)) != 0 || + (ret = __db_lseek(fd, 0, 0, sizeof(HDR), SEEK_SET)) != 0 || + (ret = __db_read(fd, &persist, sizeof(LOGP), &nw)) != 0 || + nw != sizeof(LOGP)) { + if (ret == 0) + ret = EIO; + if (fd != -1) { + (void)__db_close(fd); + __db_err(dbenv, + "Ignoring log file: %s: %s", p, strerror(ret)); + } + goto err; + } + (void)__db_close(fd); + + if (persist.magic != DB_LOGMAGIC) { + __db_err(dbenv, + "Ignoring log file: %s: magic number %lx, not %lx", + p, (u_long)persist.magic, (u_long)DB_LOGMAGIC); + ret = EINVAL; + goto err; + } + if (persist.version < DB_LOGOLDVER || persist.version > DB_LOGVERSION) { + __db_err(dbenv, + "Ignoring log file: %s: unsupported log version %lu", + p, (u_long)persist.version); + ret = EINVAL; + goto err; + } + + if (lp != NULL) { + lp->persist.lg_max = persist.lg_max; + lp->persist.mode = persist.mode; + } + ret = 0; + +err: FREES(p); + return (ret); +} + +/* + * log_close -- + * Close a log. + */ +int +log_close(dblp) + DB_LOG *dblp; +{ + int ret, t_ret; + + ret = 0; + + /* Close the region. */ + if ((t_ret = + __db_rclose(dblp->dbenv, dblp->fd, dblp->maddr)) != 0 && ret == 0) + ret = t_ret; + + /* Close open files, release allocated memory. 
*/ + if (dblp->lfd != -1 && (t_ret = __db_close(dblp->lfd)) != 0 && ret == 0) + ret = t_ret; + if (dblp->c_dbt.data != NULL) + FREE(dblp->c_dbt.data, dblp->c_dbt.ulen); + if (dblp->c_fd != -1 && + (t_ret = __db_close(dblp->c_fd)) != 0 && ret == 0) + ret = t_ret; + + /* Free the structure. */ + if (dblp->dbentry != NULL) + FREE(dblp->dbentry, (dblp->dbentry_cnt * sizeof(DB_ENTRY))); + FREE(dblp, sizeof(DB_LOG)); + + return (ret); +} + +/* + * log_unlink -- + * Exit a log. + */ +int +log_unlink(path, force, dbenv) + const char *path; + int force; + DB_ENV *dbenv; +{ + return (__db_runlink(dbenv, + DB_APP_LOG, path, DB_DEFAULT_LOG_FILE, force)); +} diff --git a/db2/log/log.src b/db2/log/log.src new file mode 100644 index 0000000000..9f4829179b --- /dev/null +++ b/db2/log/log.src @@ -0,0 +1,53 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)log.src 10.3 (Sleepycat) 8/20/97 + * + * This is the source file used to create the logging functions for the + * log package. Each access method (or set of routines wishing to register + * record types with the transaction system) should have a file like this. + * Each type of log record and its parameters is defined. The basic + * format of a record definition is: + * + * BEGIN <RECORD_TYPE> + * ARG|STRING|POINTER <variable name> <variable type> <printf format> + * ... + * END + * ARG the argument is a simple parameter of the type * specified. + * DBT the argument is a DBT (db.h) containing a length and pointer. + * PTR the argument is a pointer to the data type specified; the entire + * type should be logged. + * + * There are a set of shell scripts of the form xxx.sh that generate c + * code and or h files to process these. (This is probably better done + * in a single PERL script, but for now, this works.) 
+ * + * The DB recovery system requires the following three fields appear in + * every record, and will assign them to the per-record-type structures + * as well as making them the first parameters to the appropriate logging + * call. + * rectype: record-type, identifies the structure and log/read call + * txnid: transaction id, a DBT in this implementation + * prev: the last LSN for this transaction + */ + +/* + * Use the argument of PREFIX as the prefix for all record types, + * routines, id numbers, etc. + */ +PREFIX log + +/* Used for registering new name/id translations. */ +BEGIN register +DBT name DBT s +DBT uid DBT s +ARG id u_int32_t lu +ARG ftype DBTYPE lx +END + +BEGIN unregister +ARG id u_int32_t lu +END diff --git a/db2/log/log_archive.c b/db2/log/log_archive.c new file mode 100644 index 0000000000..d70d4c64c0 --- /dev/null +++ b/db2/log/log_archive.c @@ -0,0 +1,413 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_archive.c 10.23 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_dispatch.h" +#include "shqueue.h" +#include "log.h" +#include "clib_ext.h" +#include "common_ext.h" + +static int absname __P((char *, char *, char **)); +static int build_data __P((DB_LOG *, char *, char ***, void *(*)(size_t))); +static int cmpfunc __P((const void *, const void *)); +static int usermem __P((char ***, void *(*)(size_t))); + +/* + * log_archive -- + * Supporting function for db_archive(1). 
+ */ +int +log_archive(logp, listp, flags, db_malloc) + DB_LOG *logp; + char ***listp; + int flags; + void *(*db_malloc) __P((size_t)); +{ + DBT rec; + DB_LSN stable_lsn; + u_int32_t fnum; + int array_size, n, ret; + char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN]; + + fnum = 0; /* XXX: Shut the compiler up. */ + +#define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG) + if (flags != 0) { + if ((ret = + __db_fchk(logp->dbenv, "log_archive", flags, OKFLAGS)) != 0) + return (ret); + if ((ret = + __db_fcchk(logp->dbenv, + "log_archive", flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0) + return (ret); + } + + /* + * Get the absolute pathname of the current directory. It would + * be nice to get the shortest pathname of the database directory, + * but that's just not possible. + */ + if (LF_ISSET(DB_ARCH_ABS)) { + errno = 0; + if ((pref = getcwd(buf, sizeof(buf))) == NULL) + return (errno == 0 ? ENOMEM : errno); + } else + pref = NULL; + + switch (LF_ISSET(~DB_ARCH_ABS)) { + case DB_ARCH_DATA: + return (build_data(logp, pref, listp, db_malloc)); + case DB_ARCH_LOG: + memset(&rec, 0, sizeof(rec)); + if (F_ISSET(logp, DB_AM_THREAD)) + F_SET(&rec, DB_DBT_MALLOC); + if ((ret = log_get(logp, &stable_lsn, &rec, DB_LAST)) != 0) + return (ret); + if (F_ISSET(logp, DB_AM_THREAD)) + free(rec.data); + fnum = stable_lsn.file; + break; + case 0: + if ((ret = __log_findckp(logp, &stable_lsn)) != 0) { + if (ret != DB_NOTFOUND) + return (ret); + *listp = NULL; + return (0); + } + /* Remove any log files before the last stable LSN. */ + fnum = stable_lsn.file - 1; + break; + } + +#define LIST_INCREMENT 64 + /* Get some initial space. */ + if ((array = + (char **)malloc(sizeof(char *) * (array_size = 10))) == NULL) + return (ENOMEM); + array[0] = NULL; + + /* Build an array of the file names. 
*/ + for (n = 0; fnum > 0; --fnum) { + if ((ret = __log_name(logp->dbenv, fnum, &name)) != 0) + goto err; + if (__db_exists(name, NULL) != 0) + break; + + if (n >= array_size - 1) { + array_size += LIST_INCREMENT; + if ((array = (char **)realloc(array, + sizeof(char *) * array_size)) == NULL) { + ret = ENOMEM; + goto err; + } + } + + if (LF_ISSET(DB_ARCH_ABS)) { + if ((ret = absname(pref, name, &array[n])) != 0) + goto err; + FREES(name); + } else if ((p = __db_rpath(name)) != NULL) { + if ((array[n] = (char *)strdup(p + 1)) == NULL) { + ret = ENOMEM; + goto err; + } + FREES(name); + } else + array[n] = name; + + array[++n] = NULL; + } + + /* If there's nothing to return, we're done. */ + if (n == 0) { + *listp = NULL; + ret = 0; + goto err; + } + + /* Sort the list. */ + qsort(array, (size_t)n, sizeof(char *), cmpfunc); + + /* Rework the memory. */ + if ((ret = usermem(&array, db_malloc)) != 0) + goto err; + + *listp = array; + return (0); + +err: if (array != NULL) { + for (arrayp = array; *arrayp != NULL; ++arrayp) + FREES(*arrayp); + free(array); + } + return (ret); +} + +/* + * build_data -- + * Build a list of datafiles for return. + */ +static int +build_data(logp, pref, listp, db_malloc) + DB_LOG *logp; + char *pref, ***listp; + void *(*db_malloc) __P((size_t)); +{ + DBT rec; + DB_LSN lsn; + __log_register_args *argp; + u_int32_t rectype; + int array_size, last, n, nxt, ret; + char **array, **arrayp, *p, *real_name; + + /* Get some initial space. 
*/ + if ((array = + (char **)malloc(sizeof(char *) * (array_size = 10))) == NULL) + return (ENOMEM); + array[0] = NULL; + + memset(&rec, 0, sizeof(rec)); + if (F_ISSET(logp, DB_AM_THREAD)) + F_SET(&rec, DB_DBT_MALLOC); + for (n = 0, ret = log_get(logp, &lsn, &rec, DB_FIRST); + ret == 0; ret = log_get(logp, &lsn, &rec, DB_NEXT)) { + if (rec.size < sizeof(rectype)) { + ret = EINVAL; + __db_err(logp->dbenv, "log_archive: bad log record"); + goto lg_free; + } + + memcpy(&rectype, rec.data, sizeof(rectype)); + if (rectype != DB_log_register) { + if (F_ISSET(logp, DB_AM_THREAD)) { + free(rec.data); + rec.data = NULL; + } + continue; + } + if ((ret = __log_register_read(rec.data, &argp)) != 0) { + ret = EINVAL; + __db_err(logp->dbenv, + "log_archive: unable to read log record"); + goto lg_free; + } + + if (n >= array_size - 1) { + array_size += LIST_INCREMENT; + if ((array = (char **)realloc(array, + sizeof(char *) * array_size)) == NULL) { + ret = ENOMEM; + goto lg_free; + } + } + + if ((array[n] = (char *)strdup(argp->name.data)) == NULL) { + ret = ENOMEM; +lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) + free(rec.data); + goto err1; + } + + array[++n] = NULL; + free(argp); + + if (F_ISSET(logp, DB_AM_THREAD)) { + free(rec.data); + rec.data = NULL; + } + } + + /* If there's nothing to return, we're done. */ + if (n == 0) { + ret = 0; + *listp = NULL; + goto err1; + } + + /* Sort the list. */ + qsort(array, (size_t)n, sizeof(char *), cmpfunc); + + /* + * Build the real pathnames, discarding nonexistent files and + * duplicates. + */ + for (last = nxt = 0; nxt < n;) { + /* + * Discard duplicates. Last is the next slot we're going + * to return to the user, nxt is the next slot that we're + * going to consider. + */ + if (last != nxt) { + array[last] = array[nxt]; + array[nxt] = NULL; + } + for (++nxt; nxt < n && + strcmp(array[last], array[nxt]) == 0; ++nxt) { + FREES(array[nxt]); + array[nxt] = NULL; + } + + /* Get the real name. 
*/ + if ((ret = __db_appname(logp->dbenv, + DB_APP_DATA, NULL, array[last], NULL, &real_name)) != 0) + goto err2; + + /* If the file doesn't exist, ignore it. */ + if (__db_exists(real_name, NULL) != 0) { + FREES(real_name); + FREES(array[last]); + array[last] = NULL; + continue; + } + + /* Rework the name as requested by the user. */ + FREES(array[last]); + array[last] = NULL; + if (pref != NULL) { + ret = absname(pref, real_name, &array[last]); + FREES(real_name); + if (ret != 0) + goto err2; + } else if ((p = __db_rpath(real_name)) != NULL) { + array[last] = (char *)strdup(p + 1); + FREES(real_name); + if (array[last] == NULL) + goto err2; + } else + array[last] = real_name; + ++last; + } + + /* NULL-terminate the list. */ + array[last] = NULL; + + /* Rework the memory. */ + if ((ret = usermem(&array, db_malloc)) != 0) + goto err1; + + *listp = array; + return (0); + +err2: /* + * XXX + * We've possibly inserted NULLs into the array list, so clean up a + * bit so that the other error processing works. + */ + if (array != NULL) + for (; nxt < n; ++nxt) + FREES(array[nxt]); + /* FALLTHROUGH */ + +err1: if (array != NULL) { + for (arrayp = array; *arrayp != NULL; ++arrayp) + FREES(*arrayp); + free(array); + } + return (ret); +} + +/* + * absname -- + * Return an absolute path name for the file. + */ +static int +absname(pref, name, newnamep) + char *pref, *name, **newnamep; +{ + size_t l_pref, l_name; + char *newname; + + l_pref = strlen(pref); + l_name = strlen(name); + + /* Malloc space for concatenating the two. */ + if ((newname = (char *)malloc(l_pref + l_name + 2)) == NULL) + return (ENOMEM); + + /* Build the name. */ + memcpy(newname, pref, l_pref); + if (strchr(PATH_SEPARATOR, newname[l_pref - 1]) == NULL) + newname[l_pref++] = PATH_SEPARATOR[0]; + memcpy(newname + l_pref, name, l_name + 1); + *newnamep = newname; + + return (0); +} + +/* + * usermem -- + * Create a single chunk of memory that holds the returned information. 
+ * If the user has their own malloc routine, use it. + */ +static int +usermem(listp, func) + char ***listp; + void *(*func) __P((size_t)); +{ + size_t len; + char **array, **arrayp, **orig, *strp; + + /* Find out how much space we need. */ + for (len = 0, orig = *listp; *orig != NULL; ++orig) + len += sizeof(char *) + strlen(*orig) + 1; + len += sizeof(char *); + + /* + * Allocate it and set up the pointers. + * + * XXX + * Don't simplify this expression, SunOS compilers don't like it. + */ + if (func == NULL) + array = (char **)malloc(len); + else + array = (char **)func(len); + if (array == NULL) + return (ENOMEM); + strp = (char *)(array + (orig - *listp) + 1); + + /* Copy the original information into the new memory. */ + for (orig = *listp, arrayp = array; *orig != NULL; ++orig, ++arrayp) { + len = strlen(*orig); + memcpy(strp, *orig, len + 1); + *arrayp = strp; + strp += len + 1; + + FREES(*orig); + } + + /* NULL-terminate the list. */ + *arrayp = NULL; + + free(*listp); + *listp = array; + + return (0); +} + +static int +cmpfunc(p1, p2) + const void *p1, *p2; +{ + return (strcmp(*((char **)p1), *((char **)p2))); +} diff --git a/db2/log/log_auto.c b/db2/log/log_auto.c new file mode 100644 index 0000000000..59400087ca --- /dev/null +++ b/db2/log/log_auto.c @@ -0,0 +1,351 @@ +/* Do not edit: automatically built by dist/db_gen.sh. 
*/ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "log.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __log_register_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: DBT *, DBT *, u_int32_t, DBTYPE)); + */ +int __log_register_log(logp, txnid, ret_lsnp, flags, + name, uid, id, ftype) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + DBT *name; + DBT *uid; + u_int32_t id; + DBTYPE ftype; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_log_register; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + (name == NULL ? 0 : name->size) + + sizeof(u_int32_t) + (uid == NULL ? 
0 : uid->size) + + sizeof(id) + + sizeof(ftype); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + if (name == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &name->size, sizeof(name->size)); + bp += sizeof(name->size); + memcpy(bp, name->data, name->size); + bp += name->size; + } + if (uid == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &uid->size, sizeof(uid->size)); + bp += sizeof(uid->size); + memcpy(bp, uid->data, uid->size); + bp += uid->size; + } + memcpy(bp, &id, sizeof(id)); + bp += sizeof(id); + memcpy(bp, &ftype, sizeof(ftype)); + bp += sizeof(ftype); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = __log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __log_register_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__log_register_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __log_register_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __log_register_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]log_register: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tname: "); + for (i = 0; i < argp->name.size; 
i++) { + c = ((char *)argp->name.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tuid: "); + for (i = 0; i < argp->uid.size; i++) { + c = ((char *)argp->uid.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tid: %lu\n", (u_long)argp->id); + printf("\tftype: 0x%lx\n", (u_long)argp->ftype); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __log_register_read __P((void *, __log_register_args **)); + */ +int +__log_register_read(recbuf, argpp) + void *recbuf; + __log_register_args **argpp; +{ + __log_register_args *argp; + u_int8_t *bp; + + argp = (__log_register_args *)malloc(sizeof(__log_register_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->name.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->name.data = bp; + bp += argp->name.size; + memcpy(&argp->uid.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->uid.data = bp; + bp += argp->uid.size; + memcpy(&argp->id, bp, sizeof(argp->id)); + bp += sizeof(argp->id); + memcpy(&argp->ftype, bp, sizeof(argp->ftype)); + bp += sizeof(argp->ftype); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __log_unregister_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t)); + */ +int __log_unregister_log(logp, txnid, ret_lsnp, flags, + id) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t id; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_log_unregister; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(id); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &id, sizeof(id)); + bp += sizeof(id); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = __log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __log_unregister_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__log_unregister_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __log_unregister_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __log_unregister_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]log_unregister: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tid: %lu\n", (u_long)argp->id); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __log_unregister_read __P((void *, __log_unregister_args **)); + */ +int +__log_unregister_read(recbuf, argpp) + void *recbuf; + __log_unregister_args **argpp; +{ + __log_unregister_args *argp; + u_int8_t *bp; + + argp = (__log_unregister_args *)malloc(sizeof(__log_unregister_args) + + sizeof(DB_TXN)); + if (argp 
== NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->id, bp, sizeof(argp->id)); + bp += sizeof(argp->id); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __log_init_print __P((DB_ENV *)); + */ +int +__log_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __log_register_print, DB_log_register)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __log_unregister_print, DB_log_unregister)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __log_init_recover __P((DB_ENV *)); + */ +int +__log_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __log_register_recover, DB_log_register)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __log_unregister_recover, DB_log_unregister)) != 0) + return (ret); + return (0); +} + diff --git a/db2/log/log_compare.c b/db2/log/log_compare.c new file mode 100644 index 0000000000..601b25c626 --- /dev/null +++ b/db2/log/log_compare.c @@ -0,0 +1,34 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_compare.c 10.2 (Sleepycat) 6/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" + +/* + * log_compare -- + * Compare two LSN's. + */ +int +log_compare(lsn0, lsn1) + const DB_LSN *lsn0, *lsn1; +{ + if (lsn0->file != lsn1->file) + return (lsn0->file < lsn1->file ? -1 : 1); + + if (lsn0->offset != lsn1->offset) + return (lsn0->offset < lsn1->offset ? 
-1 : 1); + + return (0); +} diff --git a/db2/log/log_findckp.c b/db2/log/log_findckp.c new file mode 100644 index 0000000000..67fe9c9f50 --- /dev/null +++ b/db2/log/log_findckp.c @@ -0,0 +1,130 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_findckp.c 10.10 (Sleepycat) 7/30/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "log.h" +#include "txn.h" +#include "common_ext.h" + +/* + * __log_findckp -- + * + * Looks for the most recent checkpoint that occurs before the most recent + * checkpoint LSN. This is the point from which recovery can start and the + * point up to which archival/truncation can take place. Checkpoints in + * the log look like: + * + * ------------------------------------------------------------------- + * | ckp A, ckplsn 100 | .... record .... | ckp B, ckplsn 600 | ... + * ------------------------------------------------------------------- + * LSN 500 LSN 1000 + * + * If we read what log returns from using the DB_CKP parameter to logput, + * we'll get the record at LSN 1000. The checkpoint LSN there is 600. + * Now we have to scan backwards looking for a checkpoint before LSN 600. + * We find one at 500. This means that we can truncate the log before + * 500 or run recovery beginning at 500. + * + * Returns 0 if we find a checkpoint. + * Returns errno on error. + * Returns DB_NOTFOUND if we could not find a suitable start point and + * we should start from the beginning. 
+ * + * PUBLIC: int __log_findckp __P((DB_LOG *, DB_LSN *)); + */ +int +__log_findckp(lp, lsnp) + DB_LOG *lp; + DB_LSN *lsnp; +{ + DBT data; + DB_LSN ckp_lsn, last_ckp, next_lsn; + __txn_ckp_args *ckp_args; + int ret, verbose; + + verbose = lp->dbenv != NULL && lp->dbenv->db_verbose != 0; + + /* + * Need to find the appropriate point from which to begin + * recovery. + */ + memset(&data, 0, sizeof(data)); + if (F_ISSET(lp, DB_AM_THREAD)) + F_SET(&data, DB_DBT_MALLOC); + if ((ret = log_get(lp, &last_ckp, &data, DB_CHECKPOINT)) != 0) + return (ret == ENOENT ? DB_NOTFOUND : ret); + ZERO_LSN(ckp_lsn); + + next_lsn = last_ckp; + do { + if (F_ISSET(lp, DB_AM_THREAD)) + free(data.data); + + if ((ret = log_get(lp, &next_lsn, &data, DB_SET)) != 0) + return (ret); + if ((ret = __txn_ckp_read(data.data, &ckp_args)) != 0) { + if (F_ISSET(lp, DB_AM_THREAD)) + free(data.data); + return (ret); + } + if (IS_ZERO_LSN(ckp_lsn)) + ckp_lsn = ckp_args->ckp_lsn; + if (verbose) { + __db_err(lp->dbenv, "Checkpoint at: [%lu][%lu]", + (u_long)last_ckp.file, (u_long)last_ckp.offset); + __db_err(lp->dbenv, "Checkpoint LSN: [%lu][%lu]", + (u_long)ckp_args->ckp_lsn.file, + (u_long)ckp_args->ckp_lsn.offset); + __db_err(lp->dbenv, "Previous checkpoint: [%lu][%lu]", + (u_long)ckp_args->last_ckp.file, + (u_long)ckp_args->last_ckp.offset); + } + last_ckp = next_lsn; + next_lsn = ckp_args->last_ckp; + free(ckp_args); + } while (!IS_ZERO_LSN(next_lsn) && + log_compare(&last_ckp, &ckp_lsn) > 0); + + if (F_ISSET(lp, DB_AM_THREAD)) + free(data.data); + + /* + * At this point, either, next_lsn is ZERO or ckp_lsn is the + * checkpoint lsn and last_ckp is the LSN of the last checkpoint + * before ckp_lsn. If the compare in the loop is still true, then + * next_lsn must be 0 and we need to roll forward from the + * beginning of the log. 
+ */ + if (log_compare(&last_ckp, &ckp_lsn) > 0) { + if ((ret = log_get(lp, &last_ckp, &data, DB_FIRST)) != 0) + return (ret); + if (F_ISSET(lp, DB_AM_THREAD)) + free(data.data); + } + *lsnp = last_ckp; + + if (verbose) + __db_err(lp->dbenv, "Rolling forward from [%lu][%lu]", + (u_long)last_ckp.file, (u_long)last_ckp.offset); + + return (IS_ZERO_LSN(last_ckp) ? DB_NOTFOUND : 0); +} diff --git a/db2/log/log_get.c b/db2/log/log_get.c new file mode 100644 index 0000000000..37eb5cb249 --- /dev/null +++ b/db2/log/log_get.c @@ -0,0 +1,355 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_get.c 10.16 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "log.h" +#include "hash.h" +#include "common_ext.h" + +/* + * log_get -- + * Get a log record. + */ +int +log_get(dblp, alsn, dbt, flags) + DB_LOG *dblp; + DB_LSN *alsn; + DBT *dbt; + int flags; +{ + LOG *lp; + int ret; + + /* Validate arguments. 
*/ +#define OKFLAGS (DB_CHECKPOINT | \ + DB_CURRENT | DB_FIRST | DB_LAST | DB_NEXT | DB_PREV | DB_SET) + if (flags != 0) { + if ((ret = + __db_fchk(dblp->dbenv, "log_get", flags, OKFLAGS)) != 0) + return (ret); + switch (flags) { + case DB_CHECKPOINT: + case DB_CURRENT: + case DB_FIRST: + case DB_LAST: + case DB_NEXT: + case DB_PREV: + case DB_SET: + case 0: + break; + default: + return (__db_ferr(dblp->dbenv, "log_get", 1)); + } + } + if (F_ISSET(dblp, DB_AM_THREAD)) { + if (LF_ISSET(DB_NEXT | DB_PREV | DB_CURRENT)) + return (__db_ferr(dblp->dbenv, "log_get", 1)); + if (!F_ISSET(dbt, DB_DBT_USERMEM | DB_DBT_MALLOC)) + return (__db_ferr(dblp->dbenv, "threaded data", 1)); + } + + lp = dblp->lp; + + LOCK_LOGREGION(dblp); + + /* + * If we get one of the log's header records, repeat the operation. + * This assumes that applications don't ever request the log header + * records by LSN, but that seems reasonable to me. + */ + ret = __log_get(dblp, alsn, dbt, flags, 0); + if (ret == 0 && alsn->offset == 0) { + switch (flags) { + case DB_FIRST: + flags = DB_NEXT; + break; + case DB_LAST: + flags = DB_PREV; + break; + } + ret = __log_get(dblp, alsn, dbt, flags, 0); + } + + UNLOCK_LOGREGION(dblp); + + return (ret); +} + +/* + * __log_get -- + * Get a log record; internal version. + * + * PUBLIC: int __log_get __P((DB_LOG *, DB_LSN *, DBT *, int, int)); + */ +int +__log_get(dblp, alsn, dbt, flags, silent) + DB_LOG *dblp; + DB_LSN *alsn; + DBT *dbt; + int flags, silent; +{ + DB_LSN nlsn; + HDR hdr; + LOG *lp; + size_t len; + ssize_t nr; + int cnt, ret; + const char *fail; + char *np, *tbuf; + void *p, *shortp; + + lp = dblp->lp; + fail = np = tbuf = NULL; + + nlsn = dblp->c_lsn; + switch (flags) { + case DB_CHECKPOINT: + nlsn = dblp->lp->c_lsn; + if (IS_ZERO_LSN(nlsn)) { + __db_err(dblp->dbenv, + "log_get: unable to find checkpoint record: no checkpoint set."); + ret = ENOENT; + goto err2; + } + break; + case DB_NEXT: /* Next log record. 
*/ + if (!IS_ZERO_LSN(nlsn)) { + /* Increment the cursor by the cursor record size. */ + nlsn.offset += dblp->c_len; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: /* Find the first log record. */ + /* + * Find any log file. Note, we may have only entered records + * in the buffer, and not yet written a log file. + */ + if ((ret = __log_find(dblp->dbenv, lp, &cnt)) != 0) { + __db_err(dblp->dbenv, + "log_get: unable to find the first record: no log files found."); + goto err2; + } + + /* If there's anything in the buffer, it belongs to file 1. */ + if (cnt == 0) + cnt = 1; + + /* Now go backwards to find the smallest one. */ + for (; cnt > 1; --cnt) + if (__log_valid(dblp->dbenv, NULL, cnt) != 0) { + ++cnt; + break; + } + nlsn.file = cnt; + nlsn.offset = 0; + break; + case DB_CURRENT: /* Current log record. */ + break; + case DB_PREV: /* Previous log record. */ + if (!IS_ZERO_LSN(nlsn)) { + /* If at start-of-file, move to the previous file. */ + if (nlsn.offset == 0) { + if (nlsn.file == 1) + return (DB_NOTFOUND); + + --nlsn.file; + nlsn.offset = dblp->c_off; + } else + nlsn.offset = dblp->c_off; + break; + } + /* FALLTHROUGH */ + case DB_LAST: /* Last log record. */ + nlsn.file = lp->lsn.file; + nlsn.offset = lp->lsn.offset - lp->len; + break; + case DB_SET: /* Set log record. */ + nlsn = *alsn; + break; + } + +retry: + /* Return 1 if the request is past end-of-file. */ + if (nlsn.file > lp->lsn.file || + (nlsn.file == lp->lsn.file && nlsn.offset >= lp->lsn.offset)) + return (DB_NOTFOUND); + + /* If we've switched files, discard the current fd. */ + if (dblp->c_lsn.file != nlsn.file && dblp->c_fd != -1) { + (void)__db_close(dblp->c_fd); + dblp->c_fd = -1; + } + + /* If the entire record is in the in-memory buffer, copy it out. */ + if (nlsn.file == lp->lsn.file && nlsn.offset >= lp->w_off) { + /* Copy the header. */ + p = lp->buf + (nlsn.offset - lp->w_off); + memcpy(&hdr, p, sizeof(HDR)); + + /* Copy the record. 
*/ + len = hdr.len - sizeof(HDR); + if ((ret = __db_retcopy(dbt, (u_int8_t *)p + sizeof(HDR), + len, &dblp->c_dbt.data, &dblp->c_dbt.ulen, NULL)) != 0) + goto err1; + goto cksum; + } + + /* + * Move the file descriptor to the page that has the hdr. We dealt + * with moving to a previous log file in the flags switch code, but + * we don't yet know if we'll need to move to a subsequent file. + * + * Acquire a file descriptor. + */ + if (dblp->c_fd == -1) { + if ((ret = __log_name(dblp->dbenv, nlsn.file, &np)) != 0) + goto err1; + if ((ret = __db_fdopen(np, DB_RDONLY | DB_SEQUENTIAL, + DB_RDONLY | DB_SEQUENTIAL, 0, &dblp->c_fd)) != 0) { + fail = np; + goto err1; + } + free(np); + np = NULL; + } + + /* Seek to the header offset and read the header. */ + if ((ret = __db_lseek(dblp->c_fd, 0, 0, nlsn.offset, SEEK_SET)) != 0) { + fail = "seek"; + goto err1; + } + if ((ret = __db_read(dblp->c_fd, &hdr, sizeof(HDR), &nr)) != 0) { + fail = "read"; + goto err1; + } + if (nr == sizeof(HDR)) + shortp = NULL; + else { + /* If read returns EOF, try the next file. */ + if (nr == 0) { + if (flags != DB_NEXT || nlsn.file == lp->lsn.file) + goto corrupt; + + /* Move to the next file. */ + ++nlsn.file; + nlsn.offset = 0; + goto retry; + } + + /* + * If read returns a short count the rest of the record has + * to be in the in-memory buffer. + */ + if (lp->b_off < sizeof(HDR) - nr) + goto corrupt; + + /* Get the rest of the header from the in-memory buffer. */ + memcpy((u_int8_t *)&hdr + nr, lp->buf, sizeof(HDR) - nr); + shortp = lp->buf + (sizeof(HDR) - nr); + } + + /* + * Check for buffers of 0's, that's what we usually see during + * recovery, although it's certainly not something on which we + * can depend. + */ + if (hdr.len <= sizeof(HDR)) + goto corrupt; + len = hdr.len - sizeof(HDR); + + /* If we've already moved to the in-memory buffer, fill from there. 
*/ + if (shortp != NULL) { + if (lp->b_off < ((u_int8_t *)shortp - lp->buf) + len) + goto corrupt; + if ((ret = __db_retcopy(dbt, shortp, len, + &dblp->c_dbt.data, &dblp->c_dbt.ulen, NULL)) != 0) + goto err1; + goto cksum; + } + + /* Allocate temporary memory to hold the record. */ + if ((tbuf = (char *)malloc(len)) == NULL) { + ret = ENOMEM; + goto err1; + } + + /* + * Read the record into the buffer. If read returns a short count, + * there was an error or the rest of the record is in the in-memory + * buffer. Note, the information may be garbage if we're in recovery, + * so don't read past the end of the buffer's memory. + */ + if ((ret = __db_read(dblp->c_fd, tbuf, len, &nr)) != 0) { + fail = "read"; + goto err1; + } + if (len - nr > sizeof(lp->buf)) + goto corrupt; + if (nr != (ssize_t)len) { + if (lp->b_off < len - nr) + goto corrupt; + + /* Get the rest of the record from the in-memory buffer. */ + memcpy((u_int8_t *)tbuf + nr, lp->buf, len - nr); + } + + /* Copy the record into the user's DBT. */ + if ((ret = __db_retcopy(dbt, tbuf, len, + &dblp->c_dbt.data, &dblp->c_dbt.ulen, NULL)) != 0) + goto err1; + free(tbuf); + +cksum: if (hdr.cksum != __ham_func4(dbt->data, dbt->size)) { + if (!silent) + __db_err(dblp->dbenv, "log_get: checksum mismatch"); + goto corrupt; + } + + /* Update the cursor and the return lsn. */ + dblp->c_off = hdr.prev; + dblp->c_len = hdr.len; + dblp->c_lsn = *alsn = nlsn; + + return (0); + +corrupt:/* + * This is the catchall -- for some reason we didn't find enough + * information or it wasn't reasonable information, and it wasn't + * because a system call failed. 
+ */ + ret = EIO; + fail = "read"; + +err1: if (!silent) + if (fail == NULL) + __db_err(dblp->dbenv, "log_get: %s", strerror(ret)); + else + __db_err(dblp->dbenv, + "log_get: %s: %s", fail, strerror(ret)); +err2: if (np != NULL) + free(np); + if (tbuf != NULL) + free(tbuf); + return (ret); +} diff --git a/db2/log/log_put.c b/db2/log/log_put.c new file mode 100644 index 0000000000..db31f9b0e1 --- /dev/null +++ b/db2/log/log_put.c @@ -0,0 +1,484 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_put.c 10.12 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "log.h" +#include "hash.h" +#include "common_ext.h" + +static int __log_fill __P((DB_LOG *, void *, u_int32_t)); +static int __log_newfd __P((DB_LOG *)); +static int __log_write __P((DB_LOG *, void *, u_int32_t)); +static int __log_putr __P((DB_LOG *, const DBT *, u_int32_t)); + +/* + * log_put -- + * Write a log record. + */ +int +log_put(dblp, lsn, dbt, flags) + DB_LOG *dblp; + DB_LSN *lsn; + const DBT *dbt; + int flags; +{ + int ret; + + /* Validate arguments. */ +#define OKFLAGS (DB_CHECKPOINT | DB_FLUSH) + if (flags != 0) { + if ((ret = + __db_fchk(dblp->dbenv, "log_put", flags, OKFLAGS)) != 0) + return (ret); + switch (flags) { + case DB_CHECKPOINT: + case DB_FLUSH: + case 0: + break; + default: + return (__db_ferr(dblp->dbenv, "log_put", 1)); + } + } + + LOCK_LOGREGION(dblp); + + ret = __log_put(dblp, lsn, dbt, flags); + + UNLOCK_LOGREGION(dblp); + + return (ret); +} + +/* + * __log_put -- + * Write a log record; internal version. 
+ * + * PUBLIC: int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, int)); + */ +int +__log_put(dblp, lsn, dbt, flags) + DB_LOG *dblp; + DB_LSN *lsn; + const DBT *dbt; + int flags; +{ + DBT t; + DBT fid_dbt; + DB_LSN r_unused; + FNAME *fnp; + LOG *lp; + u_int32_t lastoff; + int ret; + + lp = dblp->lp; + + /* If this information won't fit in the file, swap files. */ + if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) { + if (sizeof(HDR) + + sizeof(LOGP) + dbt->size > lp->persist.lg_max) { + __db_err(dblp->dbenv, + "log_put: record larger than maximum file size"); + return (EINVAL); + } + if (lp->b_off != 0) { + if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) + return (ret); + if ((ret = __db_fsync(dblp->lfd)) != 0) + return (ret); + lp->s_lsn.file = lp->lsn.file; + lp->s_lsn.offset = lp->lsn.offset - 1; + } + + /* + * Save the last known offset from the previous file, we'll + * need it to initialize the persistent header information. + */ + lastoff = lp->lsn.offset; + + ++lp->lsn.file; + lp->lsn.offset = 0; + lp->w_off = 0; + } else + lastoff = 0; + + /* + * Insert persistent information as the first record in every file. + * Note that the previous length is wrong for the very first record + * of the log, but that's okay, we check for it during retrieval. + */ + if (lp->lsn.offset == 0) { + t.data = &lp->persist; + t.size = sizeof(LOGP); + if ((ret = __log_putr(dblp, + &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0) + return (ret); + } + + /* Initialize the LSN information returned to the user. */ + lsn->file = lp->lsn.file; + lsn->offset = lp->lsn.offset; + + /* Put out the user's record. */ + if ((ret = __log_putr(dblp, dbt, lp->lsn.offset - lp->len)) != 0) + return (ret); + + /* + * On a checkpoint, we: + * Put out the checkpoint record (above). + * Save the LSN of the checkpoint in the shared region. + * Append the set of file name information into the log. + * Flush the current buffer contents to disk. + * Sync the log to disk. 
+ * Save the time the checkpoint was written. + * Reset the bytes written since the last checkpoint. + */ + if (flags == DB_CHECKPOINT) { + lp->c_lsn = *lsn; + + for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); + fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { + t.data = ADDR(dblp, fnp->name_off); + t.size = strlen(t.data) + 1; + memset(&fid_dbt, 0, sizeof(fid_dbt)); + fid_dbt.data = ADDR(dblp, fnp->fileid_off); + fid_dbt.size = DB_FILE_ID_LEN; + if ((ret = __log_register_log(dblp, NULL, &r_unused, + 0, &t, &fid_dbt, fnp->id, fnp->s_type)) != 0) + return (ret); + } + if (lp->b_off != 0 && + (ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) + return (ret); + (void)time(&lp->chkpt); + lp->written = 0; + + if ((ret = __db_fsync(dblp->lfd)) != 0) + return (ret); + lp->s_lsn.file = lp->lsn.file; + lp->s_lsn.offset = lp->lsn.offset - 1; + } + + /* We always flush on a checkpoint. */ + if (flags == DB_FLUSH || flags == DB_CHECKPOINT) { + if (lp->b_off != 0 && + (ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) + return (ret); + + if ((ret = __db_fsync(dblp->lfd)) != 0) + return (ret); + lp->s_lsn.file = lp->lsn.file; + lp->s_lsn.offset = lp->lsn.offset - 1; + } + + /* + * If we just did I/O, i.e., this LSN could have spanned the start of + * the in-core buffer, we remember it so that we can flush correctly + * during a sync. + */ + if (lsn->offset < lp->w_off && lsn->offset + lp->len > lp->w_off) + lp->span_lsn = *lsn; + return (0); +} + +/* + * __log_putr -- + * Actually put a record into the log. + */ +static int +__log_putr(dblp, dbt, prev) + DB_LOG *dblp; + const DBT *dbt; + u_int32_t prev; +{ + HDR hdr; + LOG *lp; + int ret; + + lp = dblp->lp; + + /* + * Initialize the header. If we just switched files, lsn.offset will + * be 0, and what we really want is the offset of the previous record + * in the previous file. Fortunately, prev holds the value we want. 
+ */ + hdr.prev = prev; + hdr.len = sizeof(HDR) + dbt->size; + hdr.cksum = __ham_func4(dbt->data, dbt->size); + + if ((ret = __log_fill(dblp, &hdr, sizeof(HDR))) != 0) + return (ret); + lp->lsn.offset += sizeof(HDR); + + if ((ret = __log_fill(dblp, dbt->data, dbt->size)) != 0) + return (ret); + lp->lsn.offset += dbt->size; + + lp->len = sizeof(HDR) + dbt->size; + return (0); +} + +/* + * log_flush -- + * Write all records less than or equal to the specified LSN. + */ +int +log_flush(dblp, lsn) + DB_LOG *dblp; + const DB_LSN *lsn; +{ + DB_LSN t_lsn; + LOG *lp; + int ret; + + ret = 0; + lp = dblp->lp; + + LOCK_LOGREGION(dblp); + + /* If no LSN specified, flush the entire log. */ + if (lsn == NULL) { + t_lsn.file = lp->lsn.file; + t_lsn.offset = lp->lsn.offset - lp->len; + lsn = &t_lsn; + } + + /* If it's a non-existent record, it's an error. */ + if (lsn->file > lp->lsn.file || + (lsn->file == lp->lsn.file && lsn->offset > lp->lsn.offset)) { + __db_err(dblp->dbenv, "log_flush: LSN past current end-of-log"); + ret = EINVAL; + goto ret1; + } + + /* + * If it's from a previous file, we're done because we sync each + * file when we move to a new one. + */ + if (lsn->file < lp->lsn.file) + goto ret1; + + /* + * If it's less than the last-sync'd offset, we've already sync'd + * this LSN. + */ + if (lsn->offset <= lp->s_lsn.offset) + goto ret1; + + /* + * We may need to write the current buffer. We have to write the + * current buffer if the sync LSN is greater than or equal to the + * saved spanning-LSN. + */ + if (lsn->file >= lp->span_lsn.file && + lsn->offset >= lp->span_lsn.offset) + if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) + goto ret1; + + /* Acquire a file descriptor if we don't have one. 
*/ + if (dblp->lfname != dblp->lp->lsn.file && + (ret = __log_newfd(dblp)) != 0) + goto ret1; + + if ((ret = __db_fsync(dblp->lfd)) != 0) + goto ret1; + + lp->s_lsn.file = lp->lsn.file; + lp->s_lsn.offset = lsn->offset; + +ret1: UNLOCK_LOGREGION(dblp); + return (ret); +} + +/* + * __log_fill -- + * Write information into the log. + */ +static int +__log_fill(dblp, addr, len) + DB_LOG *dblp; + void *addr; + u_int32_t len; +{ + LOG *lp; + u_int32_t nrec; + size_t nw, remain; + int ret; + + /* Copy out the data. */ + for (lp = dblp->lp; len > 0;) { + /* + * If we're on a buffer boundary and the data is big enough, + * copy as many records as we can directly from the data. + */ + if (lp->b_off == 0 && len >= sizeof(lp->buf)) { + nrec = len / sizeof(lp->buf); + if ((ret = __log_write(dblp, + addr, nrec * sizeof(lp->buf))) != 0) + return (ret); + addr = (u_int8_t *)addr + nrec * sizeof(lp->buf); + len -= nrec * sizeof(lp->buf); + continue; + } + + /* Figure out how many bytes we can copy this time. */ + remain = sizeof(lp->buf) - lp->b_off; + nw = remain > len ? len : remain; + memcpy(lp->buf + lp->b_off, addr, nw); + addr = (u_int8_t *)addr + nw; + len -= nw; + lp->b_off += nw; + + /* If we fill the buffer, flush it. */ + if (lp->b_off == sizeof(lp->buf) && + (ret = __log_write(dblp, lp->buf, sizeof(lp->buf))) != 0) + return (ret); + } + return (0); +} + +/* + * __log_write -- + * Write the log buffer to disk. + */ +static int +__log_write(dblp, addr, len) + DB_LOG *dblp; + void *addr; + u_int32_t len; +{ + LOG *lp; + ssize_t nw; + int ret; + + /* + * If we haven't opened the log file yet or the current one + * has changed, acquire a new log file. + */ + lp = dblp->lp; + if (dblp->lfd == -1 || dblp->lfname != lp->lsn.file) + if ((ret = __log_newfd(dblp)) != 0) + return (ret); + + /* + * Seek to the offset in the file (someone may have written it + * since we last did). 
+ */ + if ((ret = __db_lseek(dblp->lfd, 0, 0, lp->w_off, SEEK_SET)) != 0) + return (ret); + if ((ret = __db_write(dblp->lfd, addr, len, &nw)) != 0) + return (ret); + if (nw != (int32_t)len) + return (EIO); + + /* Update the seek offset and reset the buffer offset. */ + lp->b_off = 0; + lp->w_off += len; + lp->written += len; + + return (0); +} + +/* + * log_file -- + * Map a DB_LSN to a file name. + */ +int +log_file(dblp, lsn, namep, len) + DB_LOG *dblp; + const DB_LSN *lsn; + char *namep; + size_t len; +{ + int ret; + char *p; + + LOCK_LOGREGION(dblp); + + ret = __log_name(dblp->dbenv, lsn->file, &p); + + UNLOCK_LOGREGION(dblp); + + if (ret != 0) + return (ret); + + /* Check to make sure there's enough room and copy the name. */ + if (len < strlen(p)) { + *namep = '\0'; + return (ENOMEM); + } + (void)strcpy(namep, p); + free(p); + + return (0); +} + +/* + * __log_newfd -- + * Acquire a file descriptor for the current log file. + */ +static int +__log_newfd(dblp) + DB_LOG *dblp; +{ + int ret; + char *p; + + /* Close any previous file descriptor. */ + if (dblp->lfd != -1) { + (void)__db_close(dblp->lfd); + dblp->lfd = -1; + } + + /* Get the path of the new file and open it. */ + dblp->lfname = dblp->lp->lsn.file; + if ((ret = __log_name(dblp->dbenv, dblp->lfname, &p)) != 0) + return (ret); + if ((ret = __db_fdopen(p, + DB_CREATE | DB_SEQUENTIAL, + DB_CREATE | DB_SEQUENTIAL, + dblp->lp->persist.mode, &dblp->lfd)) != 0) + __db_err(dblp->dbenv, + "log_put: %s: %s", p, strerror(errno)); + FREES(p); + return (ret); +} + +/* + * __log_name -- + * Return the log name for a particular file. 
+ * + * PUBLIC: int __log_name __P((DB_ENV *, int, char **)); + */ +int +__log_name(dbenv, fn, np) + DB_ENV *dbenv; + int fn; + char **np; +{ + char name[sizeof(LFNAME) + 10]; + + (void)snprintf(name, sizeof(name), LFNAME, fn); + return (__db_appname(dbenv, DB_APP_LOG, NULL, name, NULL, np)); +} diff --git a/db2/log/log_rec.c b/db2/log/log_rec.c new file mode 100644 index 0000000000..dbc5960731 --- /dev/null +++ b/db2/log/log_rec.c @@ -0,0 +1,332 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_rec.c 10.11 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "log.h" +#include "db_dispatch.h" +#include "common_ext.h" + +static int __log_open_file __P((DB_LOG *, + u_int8_t *, char *, DBTYPE, u_int32_t)); + +/* + * PUBLIC: int __log_register_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__log_register_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __log_register_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + __log_register_print(logp, dbtp, lsnp, redo, info); +#endif + info = info; /* XXX: Shut the compiler up. 
*/ + lsnp = lsnp; + + F_SET(logp, DB_AM_RECOVER); + + if ((ret = __log_register_read(dbtp->data, &argp)) != 0) + goto out; + + ret = __log_open_file(logp, + argp->uid.data, argp->name.data, argp->ftype, argp->id); + if (ret == ENOENT) { + if (redo == TXN_OPENFILES) + __db_err(logp->dbenv, + "warning: file %s not found", argp->name.data); + ret = 0; + } + +out: F_CLR(logp, DB_AM_RECOVER); + if (argp != NULL) + free(argp); + return (ret); +} + +/* + * PUBLIC: int __log_unregister_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__log_unregister_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __log_unregister_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + __log_unregister_print(logp, dbtp, lsnp, redo, info); +#endif + info = info; /* XXX: Shut the compiler up. */ + lsnp = lsnp; + + if (redo == TXN_OPENFILES || + redo == TXN_BACKWARD_ROLL || redo == TXN_UNDO) + return (0); + + F_SET(logp, DB_AM_RECOVER); + if ((ret = __log_unregister_read(dbtp->data, &argp)) != 0) + goto out; + + LOCK_LOGTHREAD(logp); + if (logp->dbentry[argp->id].dbp == NULL) + ret = EINVAL; + else if (--logp->dbentry[argp->id].refcount == 0) { + ret = logp->dbentry[argp->id].dbp->close( + logp->dbentry[argp->id].dbp, 0); + logp->dbentry[argp->id].dbp = NULL; + } + UNLOCK_LOGTHREAD(logp); + +out: F_CLR(logp, DB_AM_RECOVER); + if (argp != NULL) + free(argp); + return (ret); +} + +/* Hand coded routines. */ + +/* + * Called during log_register recovery. Make sure that we have an + * entry in the dbentry table for this ndx. + * Returns 0 on success, non-zero on error. 
+ */ +static int +__log_open_file(lp, uid, name, ftype, ndx) + DB_LOG *lp; + u_int8_t *uid; + char *name; + DBTYPE ftype; + u_int32_t ndx; +{ + DB *dbp; + int ret; + + LOCK_LOGTHREAD(lp); + if (ndx < lp->dbentry_cnt && + (lp->dbentry[ndx].deleted == 1 || lp->dbentry[ndx].dbp != NULL)) { + lp->dbentry[ndx].refcount++; + + UNLOCK_LOGTHREAD(lp); + return (0); + } + UNLOCK_LOGTHREAD(lp); + + /* Need to open file. */ + dbp = NULL; + if ((ret = db_open(name, ftype, 0, 0, lp->dbenv, NULL, &dbp)) == 0) { + /* + * Verify that we are opening the same file that we were + * referring to when we wrote this log record. + */ + if (memcmp(uid, dbp->lock.fileid, DB_FILE_ID_LEN) != 0) { + (void)dbp->close(dbp, 0); + dbp = NULL; + ret = ENOENT; + } + } + + if (ret == 0 || ret == ENOENT) + (void)__log_add_logid(lp, dbp, ndx); + + return (ret); +} + +/* + * This function returns: + * 0 SUCCESS (the entry was not previously set and is now set or the + * entry was previously set and we just inced the ref count. + * >0 on system error (returns errno value). + * PUBLIC: int __log_add_logid __P((DB_LOG *, DB *, u_int32_t)); + */ +int +__log_add_logid(logp, dbp, ndx) + DB_LOG *logp; + DB *dbp; + u_int32_t ndx; +{ + DB_ENTRY *temp_entryp; + u_int32_t i; + int ret; + + ret = 0; + + LOCK_LOGTHREAD(logp); + /* + * Check if we need to grow the table. + */ + if (logp->dbentry_cnt <= ndx) { + if (logp->dbentry_cnt == 0) { + logp->dbentry = + (DB_ENTRY *)malloc(DB_GROW_SIZE * sizeof(DB_ENTRY)); + if (logp->dbentry == NULL) { + ret = ENOMEM; + goto err; + } + } else { + temp_entryp = (DB_ENTRY *)realloc(logp->dbentry, + (DB_GROW_SIZE + logp->dbentry_cnt) * + sizeof(DB_ENTRY)); + if (temp_entryp == NULL) { + ret = ENOMEM; + goto err; + } + logp->dbentry = temp_entryp; + + } + /* Initialize the new entries. 
*/ + for (i = logp->dbentry_cnt; + i < logp->dbentry_cnt + DB_GROW_SIZE; i++) { + logp->dbentry[i].dbp = NULL; + logp->dbentry[i].deleted = 0; + } + + logp->dbentry_cnt += DB_GROW_SIZE; + } + + if (logp->dbentry[ndx].deleted == 0 && logp->dbentry[ndx].dbp == NULL) { + logp->dbentry[ndx].dbp = dbp; + logp->dbentry[ndx].refcount = 1; + logp->dbentry[ndx].deleted = dbp == NULL; + } else + logp->dbentry[ndx].refcount++; + +err: UNLOCK_LOGTHREAD(logp); + return (ret); +} + + +/* + * __db_fileid_to_db -- + * Return the DB corresponding to the specified fileid. + * + * PUBLIC: int __db_fileid_to_db __P((DB_LOG *, DB **, u_int32_t)); + */ +int +__db_fileid_to_db(logp, dbpp, ndx) + DB_LOG *logp; + DB **dbpp; + u_int32_t ndx; +{ + int ret; + + ret = 0; + LOCK_LOGTHREAD(logp); + + /* + * Return DB_DELETED if the file has been deleted + * (it's not an error). + */ + if (logp->dbentry[ndx].deleted) { + ret = DB_DELETED; + goto err; + } + + /* + * Otherwise return 0, but if we don't have a corresponding DB, + * it's an error. + */ + if ((*dbpp = logp->dbentry[ndx].dbp) == NULL) + ret = ENOENT; + +err: UNLOCK_LOGTHREAD(logp); + return (ret); +} + +/* + * Close files that were opened by the recovery daemon. 
+ * + * PUBLIC: void __log_close_files __P((DB_LOG *)); + */ +void +__log_close_files(logp) + DB_LOG *logp; +{ + u_int32_t i; + + LOCK_LOGTHREAD(logp); + for (i = 0; i < logp->dbentry_cnt; i++) + if (logp->dbentry[i].dbp) + logp->dbentry[i].dbp->close(logp->dbentry[i].dbp, 0); + UNLOCK_LOGTHREAD(logp); +} + +/* + * PUBLIC: void __log_rem_logid __P((DB_LOG *, u_int32_t)); + */ +void +__log_rem_logid(logp, ndx) + DB_LOG *logp; + u_int32_t ndx; +{ + LOCK_LOGTHREAD(logp); + if (--logp->dbentry[ndx].refcount == 0) { + logp->dbentry[ndx].dbp = NULL; + logp->dbentry[ndx].deleted = 0; + } + UNLOCK_LOGTHREAD(logp); +} diff --git a/db2/log/log_register.c b/db2/log/log_register.c new file mode 100644 index 0000000000..582eab9408 --- /dev/null +++ b/db2/log/log_register.c @@ -0,0 +1,199 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)log_register.c 10.10 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "log.h" +#include "common_ext.h" + +/* + * log_register -- + * Register a file name. + */ +int +log_register(dblp, dbp, name, type, idp) + DB_LOG *dblp; + DB *dbp; + const char *name; + DBTYPE type; + u_int32_t *idp; +{ + DBT r_name; + DBT fid_dbt; + DB_LSN r_unused; + FNAME *fnp; + size_t len; + u_int32_t fid; + int inserted, ret; + char *fullname; + void *fidp, *namep; + + fid = 0; + inserted = 0; + fullname = NULL; + fnp = fidp = namep = NULL; + + /* Check the arguments. */ + if (type != DB_BTREE && type != DB_HASH && type != DB_RECNO) { + __db_err(dblp->dbenv, "log_register: unknown DB file type"); + return (EINVAL); + } + + /* Get the log file id. 
*/ + if ((ret = __db_appname(dblp->dbenv, + DB_APP_DATA, NULL, name, NULL, &fullname)) != 0) + return (ret); + + LOCK_LOGREGION(dblp); + + /* + * See if we've already got this file in the log, finding the + * next-to-lowest file id currently in use as we do it. + */ + for (fid = 1, fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); + fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { + if (fid <= fnp->id) + fid = fnp->id + 1; + if (!memcmp(dbp->lock.fileid, + ADDR(dblp, fnp->fileid_off), DB_FILE_ID_LEN)) { + ++fnp->ref; + fid = fnp->id; + if (!F_ISSET(dblp, DB_AM_RECOVER) && + (ret = __log_add_logid(dblp, dbp, fid) != 0)) + goto err; + goto ret1; + } + } + + /* Allocate a new file name structure. */ + if ((ret = __db_shalloc(dblp->addr, sizeof(FNAME), 0, &fnp)) != 0) + goto err; + fnp->ref = 1; + fnp->id = fid; + fnp->s_type = type; + + if ((ret = __db_shalloc(dblp->addr, DB_FILE_ID_LEN, 0, &fidp)) != 0) + goto err; + /* + * XXX Now that uids are fixed size, we can put them in the fnp + * structure. + */ + fnp->fileid_off = OFFSET(dblp, fidp); + memcpy(fidp, dbp->lock.fileid, DB_FILE_ID_LEN); + + len = strlen(name) + 1; + if ((ret = __db_shalloc(dblp->addr, len, 0, &namep)) != 0) + goto err; + fnp->name_off = OFFSET(dblp, namep); + memcpy(namep, name, len); + + SH_TAILQ_INSERT_HEAD(&dblp->lp->fq, fnp, q, __fname); + inserted = 1; + + /* Log the registry. */ + if (!F_ISSET(dblp, DB_AM_RECOVER)) { + r_name.data = (void *)name; /* XXX: Yuck! */ + r_name.size = strlen(name) + 1; + memset(&fid_dbt, 0, sizeof(fid_dbt)); + fid_dbt.data = dbp->lock.fileid; + fid_dbt.size = DB_FILE_ID_LEN; + if ((ret = __log_register_log(dblp, NULL, &r_unused, + 0, &r_name, &fid_dbt, fid, type)) != 0) + goto err; + if ((ret = __log_add_logid(dblp, dbp, fid)) != 0) + goto err; + } + + if (0) { +err: /* + * XXX + * We should grow the region. 
+ */ + if (inserted) + SH_TAILQ_REMOVE(&dblp->lp->fq, fnp, q, __fname); + if (namep != NULL) + __db_shalloc_free(dblp->addr, namep); + if (fidp != NULL) + __db_shalloc_free(dblp->addr, fidp); + if (fnp != NULL) + __db_shalloc_free(dblp->addr, fnp); + } + +ret1: UNLOCK_LOGREGION(dblp); + + if (fullname != NULL) + FREES(fullname); + + if (idp != NULL) + *idp = fid; + return (ret); +} + +/* + * log_unregister -- + * Discard a registered file name. + */ +int +log_unregister(dblp, fid) + DB_LOG *dblp; + u_int32_t fid; +{ + DB_LSN r_unused; + FNAME *fnp; + int ret; + + ret = 0; + LOCK_LOGREGION(dblp); + + /* Unlog the registry. */ + if (!F_ISSET(dblp, DB_AM_RECOVER) && + (ret = __log_unregister_log(dblp, NULL, &r_unused, 0, fid)) != 0) + return (ret); + + /* Find the entry in the log. */ + for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); + fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) + if (fid == fnp->id) + break; + if (fnp == NULL) { + __db_err(dblp->dbenv, "log_unregister: non-existent file id"); + ret = EINVAL; + goto ret1; + } + + /* If more than 1 reference, decrement the reference and return. */ + if (fnp->ref > 1) { + --fnp->ref; + goto ret1; + } + + /* Free the unique file information, name and structure. */ + __db_shalloc_free(dblp->addr, ADDR(dblp, fnp->fileid_off)); + __db_shalloc_free(dblp->addr, ADDR(dblp, fnp->name_off)); + SH_TAILQ_REMOVE(&dblp->lp->fq, fnp, q, __fname); + __db_shalloc_free(dblp->addr, fnp); + + /* Remove from the process local table. */ + __log_rem_logid(dblp, fid); + +ret1: UNLOCK_LOGREGION(dblp); + + return (ret); +} diff --git a/db2/makedb.c b/db2/makedb.c new file mode 100644 index 0000000000..68c9514882 --- /dev/null +++ b/db2/makedb.c @@ -0,0 +1,363 @@ +/* Create simple DB database from textual input. + Copyright (C) 1996, 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include <argp.h> +#include <ctype.h> +#include <db_185.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <libintl.h> +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* Get libc version number. */ +#include "../version.h" + +#define PACKAGE _libc_intl_domainname + +/* If non-zero convert key to lower case. */ +static int to_lowercase; + +/* If non-zero print content of input file, one entry per line. */ +static int do_undo; + +/* If non-zero do not print informational messages. */ +static int be_quiet; + +/* Name of output file. */ +static const char *output_name; + +/* Name and version of program. */ +static void print_version (FILE *stream, struct argp_state *state); +void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version; + +/* Definitions of arguments for argp functions. 
*/ +static const struct argp_option options[] = +{ + { "fold-case", 'f', NULL, 0, N_("Convert key to lower case") }, + { "output", 'o', N_("NAME"), 0, N_("Write output to file NAME") }, + { "quiet", 'q', NULL, 0, + N_("Do not print messages while building database") }, + { "undo", 'u', NULL, 0, + N_("Print content of database file, one entry a line") }, + { NULL, 0, NULL, 0, NULL } +}; + +/* Short description of program. */ +static const char doc[] = N_("Create simple DB database from textual input."); + +/* Strings for arguments in help texts. */ +static const char args_doc[] = N_("\ +INPUT-FILE OUTPUT-FILE\n-o OUTPUT-FILE INPUT-FILE\n-u INPUT-FILE"); + +/* Prototype for option handler. */ +static error_t parse_opt __P ((int key, char *arg, struct argp_state *state)); + +/* Function to print some extra text in the help message. */ +static char *more_help __P ((int key, const char *text, void *input)); + +/* Data structure to communicate with argp functions. */ +static struct argp argp = +{ + options, parse_opt, args_doc, doc, NULL, more_help +}; + + +/* Prototypes for local functions. */ +static int process_input __P ((FILE *input, const char *inname, DB *output, + int to_lowercase, int be_quiet)); +static int print_database __P ((DB *db)); +int main __P ((int argc, char *argv[])); + + +int +main (argc, argv) + int argc; + char *argv[]; +{ + const char *input_name; + FILE *input_file; + DB *db_file; + int status; + int remaining; + + /* Set locale via LC_ALL. */ + setlocale (LC_ALL, ""); + + /* Set the text message domain. */ + textdomain (_libc_intl_domainname); + + /* Initialize local variables. */ + input_name = NULL; + + /* Parse and process arguments. */ + argp_parse (&argp, argc, argv, 0, &remaining, NULL); + + /* Determine file names. 
*/ + if (do_undo || output_name != NULL) + { + if (remaining + 1 != argc) + { + wrong_arguments: + error (0, 0, gettext ("wrong number of arguments")); + argp_help (&argp, stdout, ARGP_HELP_SEE, + program_invocation_short_name); + exit (1); + } + input_name = argv[remaining]; + } + else + { + if (remaining + 2 != argc) + goto wrong_arguments; + + input_name = argv[remaining++]; + output_name = argv[remaining]; + } + + /* Special handling if we are asked to print the database. */ + if (do_undo) + { + db_file = dbopen (input_name, O_RDONLY, 0666, DB_BTREE, NULL); + if (db_file == NULL) + error (EXIT_FAILURE, 0, gettext ("cannot open database file `%s': %s"), + input_name, + errno == EINVAL ? gettext ("incorrectly formatted file") + : strerror (errno)); + + status = print_database (db_file); + + db_file->close (db_file); + + return status; + } + + /* Open input file. */ + if (strcmp (input_name, "-") == 0 || strcmp (input_name, "/dev/stdin") == 0) + input_file = stdin; + else + { + input_file = fopen (input_name, "r"); + if (input_file == NULL) + error (EXIT_FAILURE, errno, gettext ("cannot open input file `%s'"), + input_name); + } + + /* Open output file. This must not be standard output so we don't + handle "-" and "/dev/stdout" special. */ + db_file = dbopen (output_name, O_CREAT | O_RDWR | O_TRUNC, 0666, + DB_BTREE, NULL); + if (db_file == NULL) + error (EXIT_FAILURE, errno, gettext ("cannot open output file `%s'")); + + /* Start the real work. */ + status = process_input (input_file, input_name, db_file, to_lowercase, + be_quiet); + + /* Close files. */ + if (input_file != stdin) + fclose (input_file); + db_file->close (db_file); + + return status; +} + + +/* Handle program arguments. 
*/ +static error_t +parse_opt (int key, char *arg, struct argp_state *state) +{ + switch (key) + { + case 'f': + to_lowercase = 1; + break; + case 'o': + output_name = arg; + break; + case 'q': + be_quiet = 1; + break; + case 'u': + do_undo = 1; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + + +static char * +more_help (int key, const char *text, void *input) +{ + switch (key) + { + case ARGP_KEY_HELP_EXTRA: + /* We print some extra information. */ + return strdup (gettext ("\ +Report bugs using the `glibcbug' script to <bugs@gnu.ai.mit.edu>.\n")); + default: + break; + } + return (char *) text; +} + +/* Print the version information. */ +static void +print_version (FILE *stream, struct argp_state *state) +{ + fprintf (stream, "makedb (GNU %s) %s\n", PACKAGE, VERSION); + fprintf (stream, gettext ("\ +Copyright (C) %s Free Software Foundation, Inc.\n\ +This is free software; see the source for copying conditions. There is NO\n\ +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\ +"), "1996, 1997"); + fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper"); +} + + +static int +process_input (input, inname, output, to_lowercase, be_quiet) + FILE *input; + const char *inname; + DB *output; + int to_lowercase; + int be_quiet; +{ + char *line; + size_t linelen; + int status; + size_t linenr; + + line = NULL; + linelen = 0; + status = EXIT_SUCCESS; + linenr = 0; + + while (!feof (input)) + { + DBT key; + DBT val; + char *cp; + int n; + + n = getline (&line, &linelen, input); + if (n < 0) + /* This means end of file or some bug. */ + break; + if (n == 0) + /* Short read. Probably interrupted system call. */ + continue; + + ++linenr; + + if (line[n - 1] == '\n') + /* Remove trailing newline. */ + line[--n] = '\0'; + + cp = line; + while (isspace (*cp)) + ++cp; + + if (*cp == '#') + /* First non-space character in line '#': it's a comment. 
*/ + continue; + + key.data = cp; + while (*cp != '\0' && !isspace (*cp)) + { + if (to_lowercase) + *cp = tolower (*cp); + ++cp; + } + + if (key.data == cp) + /* It's an empty line. */ + continue; + + key.size = cp - (char *) key.data; + + while (isspace (*cp)) + ++cp; + + val.data = cp; + val.size = &line[n] - cp; + + /* Store the value. */ + status = output->put (output, &key, &val, R_NOOVERWRITE); + if (status != 0) + { + if (status == 1) + { + if (!be_quiet) + error_at_line (0, 0, inname, linenr, + gettext ("duplicate key")); + /* This is no real error. Just give a warning. */ + status = 0; + } + else + error (0, errno, gettext ("while writing data base file")); + + status = status ? EXIT_FAILURE : EXIT_SUCCESS; + + clearerr (input); + break; + } + } + + if (ferror (input)) + { + error (0, 0, gettext ("problems while reading `%s'")); + status = EXIT_FAILURE; + } + + return status; +} + + +static int +print_database (db) + DB *db; +{ + DBT key; + DBT val; + int no_more; + + no_more = db->seq (db, &key, &val, R_FIRST); + while (!no_more) + { + printf ("%.*s %.*s\n", (int) key.size, (char *) key.data, (int) val.size, + (char *) val.data); + + no_more = db->seq (db, &key, &val, R_NEXT); + } + + if (no_more == -1) + { + error (0, errno, gettext ("while reading database")); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c new file mode 100644 index 0000000000..e1b68ce450 --- /dev/null +++ b/db2/mp/mp_bh.c @@ -0,0 +1,437 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_bh.c 10.12 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * __memp_bhwrite -- + * Write the page associated with a given bucket header. + * + * PUBLIC: int __memp_bhwrite + * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int *restartp, *wrotep; +{ + DBT dbt; + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + + /* + * Walk the process' DB_MPOOLFILE list and try and find a file + * descriptor for this file. + */ + LOCKHANDLE(dbmp, &dbmp->mutex); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (dbmfp->mfp == mfp) + break; + UNLOCKHANDLE(dbmp, &dbmp->mutex); + if (dbmfp != NULL) + goto found; + + /* + * It's not a page from a file we've opened. If the file requires + * input/output processing, see if this process has ever registered + * information as to how to write this type of file. If not, there's + * nothing we can do. + */ + if (mfp->ftype != 0) { + LOCKHANDLE(dbmp, &dbmp->mutex); + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) + if (mpreg->ftype == mfp->ftype) + break; + UNLOCKHANDLE(dbmp, &dbmp->mutex); + if (mpreg == NULL) + return (0); + } + + /* + * Try and open the file; ignore any error, assume it's a permissions + * problem. 
+ */ + dbt.size = mfp->pgcookie_len; + dbt.data = ADDR(dbmp, mfp->pgcookie_off); + if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off), + mfp->ftype, 0, 0, mfp->stat.st_pagesize, + mfp->lsn_off, &dbt, ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0) + return (0); + +found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep)); +} + +/* + * __memp_pgread -- + * Read a page from a file. + * + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pgread(dbmfp, bhp, can_create) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int can_create; +{ + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + size_t pagesize; + ssize_t nr; + int ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED | BH_TRASH); + LOCKBUFFER(dbmp, bhp); + UNLOCKREGION(dbmp); + + /* + * Temporary files may not yet have been created. + * + * Seek to the page location. + */ + ret = 0; + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->fd == -1 || (ret = + __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) { + if (!can_create) { + if (dbmfp->fd == -1) + ret = EINVAL; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + __db_err(dbmp->dbenv, + "%s: page %lu doesn't exist, create flag not set", + dbmfp->path, (u_long)bhp->pgno); + goto err; + } + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + /* Clear any uninitialized data. */ + memset(bhp->buf, 0, pagesize); + goto pgin; + } + + /* + * Read the page; short reads are treated like creates, although + * any valid data is preserved. + */ + ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr); + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + if (ret != 0) + goto err; + + if (nr == (ssize_t)pagesize) + can_create = 0; + else { + if (!can_create) { + ret = EINVAL; + goto err; + } + + /* Clear any uninitialized data. */ + memset(bhp->buf + nr, 0, pagesize - nr); + } + + /* Call any pgin function. */ +pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); + + /* Reacquire the region lock. 
*/ + LOCKREGION(dbmp); + + /* If the pgin function succeeded, the data is now valid. */ + if (ret == 0) + F_CLR(bhp, BH_TRASH); + + /* Update the statistics. */ + if (can_create) { + ++dbmp->mp->stat.st_page_create; + ++mfp->stat.st_page_create; + } else { + ++dbmp->mp->stat.st_page_in; + ++mfp->stat.st_page_in; + } + + if (0) { +err: LOCKREGION(dbmp); + } + + /* Release the buffer. */ + F_CLR(bhp, BH_LOCKED); + UNLOCKBUFFER(dbmp, bhp); + + return (ret); +} + +/* + * __memp_pgwrite -- + * Write a page to a file. + * + * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_pgwrite(dbmfp, bhp, restartp, wrotep) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_ENV *dbenv; + DB_LOG *lg_info; + DB_LSN lsn; + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t pagesize; + ssize_t nw; + int callpgin, ret; + const char *fail; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + callpgin = 0; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED); + LOCKBUFFER(dbmp, bhp); + UNLOCKREGION(dbmp); + + if (restartp != NULL) + *restartp = 1; + + /* Copy the LSN off the page if we're going to need it. */ + lg_info = dbenv->lg_info; + if (lg_info != NULL || F_ISSET(bhp, BH_WRITE)) + memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); + + /* Ensure the appropriate log records are on disk. */ + if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0) + goto err; + + /* + * Call any pgout function. We set the callpgin flag so that on + * error we flag that the contents of the buffer may be trash. + */ + if (mfp->ftype == 0) + ret = 0; + else { + callpgin = 1; + if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) + goto err; + } + + /* Temporary files may not yet have been created. 
*/ + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, DB_APP_TMP, + NULL, NULL, &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) { + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + __db_err(dbenv, "unable to create temporary backing file"); + goto err; + } + + /* Write the page out. */ + if ((ret = + __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) + fail = "seek"; + else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0) + fail = "write"; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + if (ret != 0) { + /* + * XXX + * Shut the compiler up; it doesn't understand the correlation + * between the failing clauses to __db_lseek and __db_write and + * this ret != 0. + */ + fail = NULL; + goto syserr; + } + + if (nw != (ssize_t)pagesize) { + ret = EIO; + fail = "write"; + goto syserr; + } + + if (wrotep != NULL) + *wrotep = 1; + + /* Reacquire the region lock. */ + LOCKREGION(dbmp); + + /* Clean up the flags based on a successful write. */ + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_DIRTY | BH_LOCKED); + UNLOCKBUFFER(dbmp, bhp); + + /* + * If we wrote a buffer which a checkpoint is waiting for, update + * the count of pending buffers (both in the mpool as a whole and + * for this file). If the count for this file goes to zero, flush + * the writes. + * + * XXX: + * We ignore errors from the sync -- it makes no sense to return an + * error to the calling process, so set a flag causing the sync to + * be retried later. + * + * If the buffer we wrote has a LSN larger than the current largest + * we've written for this checkpoint, update the saved value. + */ + mp = dbmp->mp; + if (F_ISSET(bhp, BH_WRITE)) { + if (log_compare(&lsn, &mp->lsn) > 0) + mp->lsn = lsn; + F_CLR(bhp, BH_WRITE); + + --mp->lsn_cnt; + if (--mfp->lsn_cnt == 0) { + /* + * Don't lock -- there are no atomicity issues for + * fsync(2). + */ + if (__db_fsync(dbmfp->fd) != 0) + F_SET(mp, MP_LSN_RETRY); + } + } + + /* Update I/O statistics. 
*/ + ++mp->stat.st_page_out; + ++mfp->stat.st_page_out; + + return (0); + +syserr: __db_err(dbenv, + "%s: %s failed for page %lu", dbmfp->path, fail, (u_long)bhp->pgno); + +err: UNLOCKBUFFER(dbmp, bhp); + LOCKREGION(dbmp); + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_LOCKED); + return (ret); +} + +/* + * __memp_pg -- + * Call the pgin/pgout routine. + * + * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pg(dbmfp, bhp, is_pgin) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int is_pgin; +{ + DBT dbt, *dbtp; + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + MPOOLFILE *mfp; + int ftype, ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + + LOCKHANDLE(dbmp, &dbmp->mutex); + + ftype = mfp->ftype; + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { + if (ftype != mpreg->ftype) + continue; + if (mfp->pgcookie_len == 0) + dbtp = NULL; + else { + dbt.size = mfp->pgcookie_len; + dbt.data = ADDR(dbmp, mfp->pgcookie_off); + dbtp = &dbt; + } + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + if (is_pgin) { + if (mpreg->pgin != NULL && (ret = + mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + } else + if (mpreg->pgout != NULL && (ret = + mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + break; + } + + if (mpreg == NULL) + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + return (0); + +err: UNLOCKHANDLE(dbmp, &dbmp->mutex); + __db_err(dbmp->dbenv, "%s: %s failed for page %lu", + dbmfp->path, is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); + return (ret); +} + +/* + * __memp_bhfree -- + * Free a bucket header and its referenced data. + * + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int)); + */ +void +__memp_bhfree(dbmp, mfp, bhp, free_mem) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int free_mem; +{ + size_t off; + + /* Delete the buffer header from the MPOOL hash list. 
*/ + off = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno); + SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, mq, __bh); + + /* Delete the buffer header from the LRU chain. */ + SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); + + /* + * If we're not reusing it immediately, free the buffer header + * and data for real. + */ + if (free_mem) + __db_shalloc_free(dbmp->addr, bhp); +} diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c new file mode 100644 index 0000000000..418802a3b9 --- /dev/null +++ b/db2/mp/mp_fget.c @@ -0,0 +1,359 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fget.c 10.22 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +int __sleep_on_every_page_get; /* XXX: thread debugging option. */ + +/* + * memp_fget -- + * Get a page from the file. + */ +int +memp_fget(dbmfp, pgnoaddr, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + u_long flags; + void *addrp; +{ + BH *bhp, *tbhp; + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + db_pgno_t lastpgno; + size_t bucket, mf_offset; + off_t size; + u_long cnt; + int b_incr, b_inserted, readonly_alloc, ret; + void *addr; + + dbmp = dbmfp->dbmp; + + /* + * Validate arguments. + * + * !!! + * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly + * files here, and create non-existent pages in readonly files if the + * flags are set, later. The reason is that the hash access method + * wants to get empty pages that don't really exist in readonly files. 
+ * The only alternative is for hash to write the last "bucket" all the + * time, which we don't want to do because one of our big goals in life + * is to keep database files small. It's sleazy as hell, but we catch + * any attempt to actually write the file in memp_fput(). + */ +#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) + if (flags != 0) { + if ((ret = + __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0) + return (ret); + + switch (flags) { + case DB_MPOOL_CREATE: + case DB_MPOOL_LAST: + case DB_MPOOL_NEW: + case 0: + break; + default: + return (__db_ferr(dbmp->dbenv, "memp_fget", 1)); + } + } + +#ifdef DEBUG + /* + * XXX + * We want to switch threads as often as possible. Sleep every time + * we get a new page to make it more likely. + */ + if (__sleep_on_every_page_get && (dbmp->dbenv == NULL || + dbmp->dbenv->db_yield == NULL || dbmp->dbenv->db_yield() != 0)) + __db_sleep(0, 1); +#endif + + mp = dbmp->mp; + mfp = dbmfp->mfp; + mf_offset = OFFSET(dbmp, mfp); + addr = NULL; + bhp = NULL; + b_incr = b_inserted = readonly_alloc = ret = 0; + + LOCKREGION(dbmp); + + /* + * If mmap'ing the file, just return a pointer. However, if another + * process has opened the file for writing since we mmap'd it, start + * playing the game by their rules, i.e. everything goes through the + * cache. All pages previously returned should be safe, as long as + * a locking protocol was observed. + * + * XXX + * We don't discard the map because we don't know when all of the + * pages will have been discarded from the process' address space. + * It would be possible to do so by reference counting the open + * pages from the mmap, but it's unclear to me that it's worth it. + */ + if (dbmfp->addr != NULL && dbmfp->mfp->can_mmap) { + lastpgno = dbmfp->len == 0 ? + 0 : (dbmfp->len - 1) / mfp->stat.st_pagesize; + if (LF_ISSET(DB_MPOOL_LAST)) + *pgnoaddr = lastpgno; + else { + /* + * !!! + * Allocate a page that can never really exist. 
See + * the comment above about non-existent pages and the + * hash access method. + */ + if (LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) + readonly_alloc = 1; + else if (*pgnoaddr > lastpgno) { + __db_err(dbmp->dbenv, + "%s: page %lu doesn't exist", + dbmfp->path, (u_long)*pgnoaddr); + ret = EINVAL; + goto err; + } + } + if (!readonly_alloc) { + addr = ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + + ++mp->stat.st_map; + ++mfp->stat.st_map; + + goto mapret; + } + } + + /* + * If requesting the last page or a new page, find the last page. The + * tricky thing is that the user may have created a page already that's + * after any page that exists in the file. + */ + if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) { + /* + * Temporary files may not yet have been created. + * + * Don't lock -- there are no atomicity issues for stat(2). + */ + if (dbmfp->fd == -1) + size = 0; + else if ((ret = __db_stat(dbmp->dbenv, + dbmfp->path, dbmfp->fd, &size, NULL)) != 0) + goto err; + + *pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize; + + /* + * Walk the list of BH's, looking for later pages. Save the + * pointer if a later page is found so that we don't have to + * search the list twice. + * + * If requesting a new page, return the page one after the last + * page -- which we'll have to create. + */ + for (tbhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + tbhp != NULL; tbhp = SH_TAILQ_NEXT(tbhp, q, __bh)) + if (tbhp->pgno >= *pgnoaddr && + tbhp->mf_offset == mf_offset) { + bhp = tbhp; + *pgnoaddr = bhp->pgno; + } + if (LF_ISSET(DB_MPOOL_NEW)) + ++*pgnoaddr; + } + + /* If we already found the right buffer, return it. */ + if (LF_ISSET(DB_MPOOL_LAST) && bhp != NULL) { + addr = bhp->buf; + goto found; + } + + /* If we haven't checked the BH list yet, do the search. 
*/ + if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) { + ++mp->stat.st_hash_searches; + bucket = BUCKET(mp, mf_offset, *pgnoaddr); + for (cnt = 0, + bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) { + ++cnt; + if (bhp->pgno == *pgnoaddr && + bhp->mf_offset == mf_offset) { + addr = bhp->buf; + if (cnt > mp->stat.st_hash_longest) + mp->stat.st_hash_longest = cnt; + mp->stat.st_hash_examined += cnt; + goto found; + } + } + if (cnt > mp->stat.st_hash_longest) + mp->stat.st_hash_longest = cnt; + mp->stat.st_hash_examined += cnt; + } + + /* + * Allocate a new buffer header and data space, and mark the contents + * as useless. + */ + if ((ret = __memp_ralloc(dbmp, sizeof(BH) - + sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0) + goto err; + addr = bhp->buf; +#ifdef DEBUG + if ((ALIGNTYPE)addr & (sizeof(size_t) - 1)) { + __db_err(dbmp->dbenv, + "Internal error: BH data NOT size_t aligned."); + abort(); + } +#endif + memset(bhp, 0, sizeof(BH)); + LOCKINIT(dbmp, &bhp->mutex); + + /* + * Prepend the bucket header to the head of the appropriate MPOOL + * bucket hash list. Append the bucket header to the tail of the + * MPOOL LRU chain. + * + * We have to do this before we read in the page so we can discard + * our region lock without screwing up the world. + */ + bucket = BUCKET(mp, mf_offset, *pgnoaddr); + SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, mq, __bh); + SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q); + b_inserted = 1; + + /* Set the page number, and associated MPOOLFILE. */ + bhp->mf_offset = mf_offset; + bhp->pgno = *pgnoaddr; + + /* + * If we know we created the page, zero it out and continue. + * + * !!! + * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function. + * If DB_MPOOL_CREATE is used, then the application's pgin function + * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, + * it can detect all of its page creates, and not bother. 
+ * + * Otherwise, read the page into memory, optionally creating it if + * DB_MPOOL_CREATE is set. + * + * Increment the reference count for created buffers, but importantly, + * increment the reference count for buffers we're about to read so + * that the buffer can't move. + */ + ++bhp->ref; + b_incr = 1; + + if (LF_ISSET(DB_MPOOL_NEW)) + memset(addr, 0, mfp->stat.st_pagesize); + else { + /* + * It's possible for the read function to fail, which means + * that we fail as well. + */ +reread: if ((ret = __memp_pgread(dbmfp, + bhp, LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))) != 0) + goto err; + + /* + * !!! + * The __memp_pgread call discarded and reacquired the region + * lock. Because the buffer reference count was incremented + * before the region lock was discarded the buffer didn't move. + */ + ++mp->stat.st_cache_miss; + ++mfp->stat.st_cache_miss; + } + + if (0) { +found: /* Increment the reference count. */ + if (bhp->ref == UINT16_T_MAX) { + __db_err(dbmp->dbenv, + "%s: too many references to page %lu", + dbmfp->path, bhp->pgno); + ret = EAGAIN; + goto err; + } + ++bhp->ref; + b_incr = 1; + + /* + * Any found buffer might be trouble. + * + * BH_LOCKED -- + * I/O in progress, wait for it to finish. Because the buffer + * reference count was incremented before the region lock was + * discarded we know the buffer didn't move. + */ + if (F_ISSET(bhp, BH_LOCKED)) { + UNLOCKREGION(dbmp); + LOCKBUFFER(dbmp, bhp); + /* Waiting for I/O to finish... */ + UNLOCKBUFFER(dbmp, bhp); + LOCKREGION(dbmp); + } + + /* + * BH_TRASH -- + * The buffer is garbage. + */ + if (F_ISSET(bhp, BH_TRASH)) + goto reread; + + /* + * BH_CALLPGIN -- + * The buffer was written, and the contents need to be + * converted again. 
+ */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + goto err; + F_CLR(bhp, BH_CALLPGIN); + } + + ++mp->stat.st_cache_hit; + ++mfp->stat.st_cache_hit; + } + +mapret: LOCKHANDLE(dbmp, &dbmfp->mutex); + ++dbmfp->pinref; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + if (0) { +err: /* + * If no other process is already waiting on a created buffer, + * go ahead and discard it, it's not useful. + */ + if (b_incr) + --bhp->ref; + if (b_inserted && bhp->ref == 0) + __memp_bhfree(dbmp, mfp, bhp, 1); + } + + UNLOCKREGION(dbmp); + + *(void **)addrp = addr; + return (ret); +} diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c new file mode 100644 index 0000000000..7703847b73 --- /dev/null +++ b/db2/mp/mp_fopen.c @@ -0,0 +1,437 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fopen.c 10.24 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *)); +static int __memp_mf_open __P((DB_MPOOL *, DB_MPOOLFILE *, + int, int, size_t, int, DBT *, u_int8_t *, int, MPOOLFILE **)); + +/* + * memp_fopen -- + * Open a backing file for the memory pool. + */ +int +memp_fopen(dbmp, path, ftype, + flags, mode, pagesize, lsn_offset, pgcookie, fileid, retp) + DB_MPOOL *dbmp; + const char *path; + int ftype, flags, mode, lsn_offset; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + DB_MPOOLFILE **retp; +{ + int ret; + + /* Validate arguments. 
*/ + if ((ret = __db_fchk(dbmp->dbenv, + "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0) + return (ret); + + return (__memp_fopen(dbmp, path, ftype, + flags, mode, pagesize, lsn_offset, pgcookie, fileid, 1, retp)); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool; internal version. + * + * PUBLIC: int __memp_fopen __P((DB_MPOOL *, const char *, int, int, + * PUBLIC: int, size_t, int, DBT *, u_int8_t *, int, DB_MPOOLFILE **)); + */ +int +__memp_fopen(dbmp, path, + ftype, flags, mode, pagesize, lsn_offset, pgcookie, fileid, needlock, retp) + DB_MPOOL *dbmp; + const char *path; + int ftype, flags, mode, lsn_offset, needlock; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + DB_MPOOLFILE **retp; +{ + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + off_t size; + int ret; + + dbenv = dbmp->dbenv; + ret = 0; + + /* Require a non-zero pagesize. */ + if (pagesize == 0) { + __db_err(dbenv, "memp_fopen: pagesize not specified"); + return (EINVAL); + } + + /* Allocate and initialize the per-process structure. */ + if ((dbmfp = + (DB_MPOOLFILE *)calloc(1, sizeof(DB_MPOOLFILE))) == NULL) { + __db_err(dbenv, "%s: %s", + path == NULL ? TEMPORARY : path, strerror(ENOMEM)); + return (ENOMEM); + } + LOCKINIT(dbmp, &dbmfp->mutex); + dbmfp->dbmp = dbmp; + dbmfp->fd = -1; + if (LF_ISSET(DB_RDONLY)) + F_SET(dbmfp, MP_READONLY); + + if (path == NULL) { + if (LF_ISSET(DB_RDONLY)) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + ret = EINVAL; + goto err; + } + dbmfp->path = (char *) TEMPORARY; + F_SET(dbmfp, MP_PATH_TEMP); + } else { + /* Calculate the real name for this file. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, NULL, path, NULL, &dbmfp->path)) != 0) + goto err; + F_SET(dbmfp, MP_PATH_ALLOC); + + + /* Open the file. 
*/ + if ((ret = __db_fdopen(dbmfp->path, + LF_ISSET(DB_CREATE | DB_RDONLY), DB_CREATE | DB_RDONLY, + mode, &dbmfp->fd)) != 0) { + __db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret)); + goto err; + } + + /* Don't permit files that aren't a multiple of the pagesize. */ + if ((ret = __db_stat(dbenv, + dbmfp->path, dbmfp->fd, &size, NULL)) != 0) + goto err; + if (size % pagesize) { + __db_err(dbenv, + "%s: file size not a multiple of the pagesize", + dbmfp->path); + ret = EINVAL; + goto err; + } + } + + /* Find/allocate the shared file object. */ + if (needlock) + LOCKREGION(dbmp); + ret = __memp_mf_open(dbmp, dbmfp, ftype, + F_ISSET(dbmfp, MP_READONLY), pagesize, + lsn_offset, pgcookie, fileid, F_ISSET(dbmfp, MP_PATH_TEMP), &mfp); + if (needlock) + UNLOCKREGION(dbmp); + if (ret != 0) + goto err; + + dbmfp->mfp = mfp; + + /* + * If a file: + * + * + is read-only + * + doesn't require any pgin/pgout support + * + is less than mp_mmapsize bytes in size. + * + and the DB_NOMMAP flag wasn't set + * + * we can mmap it instead of reading/writing buffers. Don't do error + * checking based on the mmap call failure. We want to do normal I/O + * on the file if the reason we failed was because the file was on an + * NFS mounted partition, and we can fail in buffer I/O just as easily + * as here. + * + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ +#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ + dbmfp->addr = NULL; + mfp->can_mmap = F_ISSET(dbmfp, MP_READONLY) && + ftype == 0 && !LF_ISSET(DB_NOMMAP) && path != NULL && + size <= (dbenv == NULL || dbenv->mp_mmapsize == 0 ? 
+ DB_MAXMMAPSIZE : (off_t)dbenv->mp_mmapsize); + if (mfp->can_mmap) { + dbmfp->len = size; + if (__db_mmap(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) { + mfp->can_mmap = 0; + dbmfp->addr = NULL; + } + } + + LOCKHANDLE(dbmp, &dbmp->mutex); + TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + *retp = dbmfp; + return (0); + +err: if (F_ISSET(dbmfp, MP_PATH_ALLOC)) + FREES(dbmfp->path); + if (dbmfp->fd != -1) + (void)__db_close(dbmfp->fd); + if (dbmfp != NULL) + FREE(dbmfp, sizeof(DB_MPOOLFILE)); + return (ret); +} + +/* + * __memp_mf_open -- + * Open an MPOOLFILE. + */ +static int +__memp_mf_open(dbmp, dbmfp, + ftype, readonly, pagesize, lsn_offset, pgcookie, fileid, istemp, retp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + int ftype, readonly, lsn_offset, istemp; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + MPOOLFILE **retp; +{ + MPOOLFILE *mfp; + int ret; + u_int8_t idbuf[DB_FILE_ID_LEN]; + void *p; + + /* Temporary files can't match previous files. */ + if (istemp) + goto alloc; + + /* + * Get the file id if we weren't give one. Generated file id's don't + * use timestamps, otherwise there'd be no chance of anyone joining + * the party. + */ + if (fileid == NULL) { + if ((ret = + __db_fileid(dbmp->dbenv, dbmfp->path, 0, idbuf)) != 0) + return (ret); + fileid = idbuf; + } + + /* Walk the list of MPOOLFILE's, looking for a matching file. */ + for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + if (!memcmp(fileid, + ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) { + if (ftype != mfp->ftype || + pagesize != mfp->stat.st_pagesize) { + __db_err(dbmp->dbenv, + "%s: ftype or pagesize changed", + dbmfp->path); + ret = EINVAL; + mfp = NULL; + goto ret1; + } + /* + * Found it: increment the reference count and update + * the mmap-able status. + */ + ++mfp->ref; + if (!readonly) + mfp->can_mmap = 0; + goto ret1; + } + + /* Allocate a new MPOOLFILE. 
*/ +alloc: if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto ret1; + + /* Initialize the structure. */ + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->ref = 1; + mfp->ftype = ftype; + mfp->lsn_off = lsn_offset; + mfp->stat.st_pagesize = pagesize; + + /* Copy the file path into shared memory. */ + if ((ret = __memp_ralloc(dbmp, + strlen(dbmfp->path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, dbmfp->path, strlen(dbmfp->path) + 1); + + /* Copy the file identification string into shared memory. */ + if (istemp) + mfp->fileid_off = 0; + else { + if ((ret = __memp_ralloc(dbmp, + DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, fileid, DB_FILE_ID_LEN); + } + + /* Copy the page cookie into shared memory. */ + if (pgcookie == NULL || pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_ralloc(dbmp, + pgcookie->size, &mfp->pgcookie_off, &p)) != 0) + goto err; + memcpy(p, pgcookie->data, pgcookie->size); + mfp->pgcookie_len = pgcookie->size; + } + + /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ + SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile); + + if (0) { +err: if (mfp->path_off != 0) + __db_shalloc_free(dbmp->addr, + ADDR(dbmp, mfp->path_off)); + if (!istemp) + __db_shalloc_free(dbmp->addr, + ADDR(dbmp, mfp->fileid_off)); + if (mfp != NULL) + __db_shalloc_free(dbmp->addr, mfp); + mfp = NULL; + } + +ret1: *retp = mfp; + return (0); +} + +/* + * memp_fclose -- + * Close a backing file for the memory pool. + */ +int +memp_fclose(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_MPOOL *dbmp; + int ret, t_ret; + + dbmp = dbmfp->dbmp; + ret = 0; + + /* Complain if pinned blocks never returned. */ + if (dbmfp->pinref != 0) + __db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned", + dbmfp->path, (u_long)dbmfp->pinref); + + /* Remove the DB_MPOOLFILE structure from the list. 
*/ + LOCKHANDLE(dbmp, &dbmp->mutex); + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + /* Close the underlying MPOOLFILE. */ + (void)__memp_mf_close(dbmp, dbmfp); + + /* Discard any mmap information. */ + if (dbmfp->addr != NULL && + (ret = __db_munmap(dbmfp->addr, dbmfp->len)) != 0) + __db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(ret)); + + /* Close the file; temporary files may not yet have been created. */ + if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) { + __db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(t_ret)); + if (ret != 0) + t_ret = ret; + } + + /* Potentially allocated path. */ + if (F_ISSET(dbmfp, MP_PATH_ALLOC)) + FREES(dbmfp->path); + + /* Free the DB_MPOOLFILE structure. */ + FREE(dbmfp, sizeof(DB_MPOOLFILE)); + + return (ret); +} + +/* + * __memp_mf_close -- + * Close down an MPOOLFILE. + */ +static int +__memp_mf_close(dbmp, dbmfp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; +{ + BH *bhp, *nbhp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t mf_offset; + + mp = dbmp->mp; + mfp = dbmfp->mfp; + + LOCKREGION(dbmp); + + /* If more than a single reference, simply decrement. */ + if (mfp->ref > 1) { + --mfp->ref; + goto ret1; + } + + /* + * Move any BH's held by the file to the free list. We don't free the + * memory itself because we may be discarding the memory pool, and it's + * fairly expensive to reintegrate the buffers back into the region for + * no purpose. + */ + mf_offset = OFFSET(dbmp, mfp); + for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + +#ifdef DEBUG_NO_DIRTY + /* Complain if we find any blocks that were left dirty. 
*/ + if (F_ISSET(bhp, BH_DIRTY)) + __db_err(dbmp->dbenv, + "%s: close: pgno %lu left dirty; ref %lu", + dbmfp->path, (u_long)bhp->pgno, (u_long)bhp->ref); +#endif + + if (bhp->mf_offset == mf_offset) { + __memp_bhfree(dbmp, mfp, bhp, 0); + SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh); + } + } + + /* Delete from the list of MPOOLFILEs. */ + SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + + /* Free the space. */ + __db_shalloc_free(dbmp->addr, mfp); + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->fileid_off)); + if (mfp->pgcookie_off != 0) + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->pgcookie_off)); + +ret1: UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c new file mode 100644 index 0000000000..5fac8ae76b --- /dev/null +++ b/db2/mp/mp_fput.c @@ -0,0 +1,140 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fput.c 10.10 (Sleepycat) 7/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_fput -- + * Mpool file put function. + */ +int +memp_fput(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_long flags; +{ + BH *bhp; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + int wrote, ret; + + dbmp = dbmfp->dbmp; + + /* Validate arguments. 
*/ + if (flags) { + if ((ret = __db_fchk(dbmp->dbenv, "memp_fput", flags, + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbmp->dbenv, "memp_fput", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbmp->dbenv, + "%s: dirty flag set for readonly file page", + dbmfp->path); + return (EACCES); + } + } + + /* Decrement the pinned reference count. */ + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->pinref == 0) + __db_err(dbmp->dbenv, + "%s: put: more blocks returned than retrieved", + dbmfp->path); + else + --dbmfp->pinref; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + /* + * If we're mapping the file, there's nothing to do. Because we can + * quit mapping at any time, we have to check on each buffer to see + * if it's in the map region. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) + return (0); + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + LOCKREGION(dbmp); + + /* Set/clear the page bits. */ + if (LF_ISSET(DB_MPOOL_CLEAN)) + F_CLR(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DIRTY)) + F_SET(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + /* + * If more than one reference to the page, we're done. Ignore discard + * flags (for now) and leave it at its position in the LRU chain. The + * rest gets done at last reference close. + */ +#ifdef DEBUG + if (bhp->ref == 0) { + __db_err(dbmp->dbenv, + "Internal error: bhp->ref on page %lu went negative.", + (u_long)bhp->pgno); + abort(); + } +#endif + if (--bhp->ref > 0) { + UNLOCKREGION(dbmp); + return (0); + } + + /* Move the buffer to the head/tail of the LRU chain. 
*/ + SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); + if (F_ISSET(bhp, BH_DISCARD)) + SH_TAILQ_INSERT_HEAD(&dbmp->mp->bhq, bhp, q, __bh); + else + SH_TAILQ_INSERT_TAIL(&dbmp->mp->bhq, bhp, q); + + /* + * If this buffer is scheduled for writing because of a checkpoint, + * write it now. If we can't write it, set a flag so that the next + * time the memp_sync function is called we try writing it there, + * as the checkpoint application better be able to write all of the + * files. + */ + if (F_ISSET(bhp, BH_WRITE)) + if (F_ISSET(bhp, BH_DIRTY)) { + if (__memp_bhwrite(dbmp, + dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) + F_SET(dbmp->mp, MP_LSN_RETRY); + } else { + F_CLR(bhp, BH_WRITE); + + mfp = ADDR(dbmp, bhp->mf_offset); + --mfp->lsn_cnt; + + --dbmp->mp->lsn_cnt; + } + + UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c new file mode 100644 index 0000000000..588085a358 --- /dev/null +++ b/db2/mp/mp_fset.c @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fset.c 10.8 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_fset -- + * Mpool page set-flag routine. + */ +int +memp_fset(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_long flags; +{ + BH *bhp; + DB_MPOOL *dbmp; + int ret; + + dbmp = dbmfp->dbmp; + + /* Validate arguments. 
*/ + if (flags != 0) { + if ((ret = __db_fchk(dbmp->dbenv, "memp_fset", flags, + DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbmp->dbenv, "memp_fset", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbmp->dbenv, + "%s: dirty flag set for readonly file page", + dbmfp->path); + return (EACCES); + } + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + LOCKREGION(dbmp); + + if (LF_ISSET(DB_MPOOL_DIRTY)) + F_SET(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_CLEAN)) + F_CLR(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c new file mode 100644 index 0000000000..257ce1b9e9 --- /dev/null +++ b/db2/mp/mp_open.c @@ -0,0 +1,176 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_open.c 10.12 (Sleepycat) 7/6/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_open -- + * Initialize and/or join a memory pool. + */ +int +memp_open(path, flags, mode, dbenv, retp) + const char *path; + int flags, mode; + DB_ENV *dbenv; + DB_MPOOL **retp; +{ + DB_MPOOL *dbmp; + size_t cachesize; + int ret; + + /* Validate arguments. 
*/ +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD) +#else +#define OKFLAGS (DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP) +#endif + if ((ret = __db_fchk(dbenv, "memp_open", flags, OKFLAGS)) != 0) + return (ret); + + /* Extract fields from DB_ENV structure. */ + cachesize = dbenv == NULL ? 0 : dbenv->mp_size; + + /* Create and initialize the DB_MPOOL structure. */ + if ((dbmp = (DB_MPOOL *)calloc(1, sizeof(DB_MPOOL))) == NULL) + return (ENOMEM); + LOCKINIT(dbmp, &dbmp->mutex); + LIST_INIT(&dbmp->dbregq); + TAILQ_INIT(&dbmp->dbmfq); + + dbmp->dbenv = dbenv; + + /* Decide if it's possible for anyone else to access the pool. */ + if ((dbenv == NULL && path == NULL) || + (dbenv != NULL && F_ISSET(dbenv, DB_MPOOL_PRIVATE))) + F_SET(dbmp, MP_ISPRIVATE); + + /* + * Map in the region. We do locking regardless, as portions of it are + * implemented in common code (if we put the region in a file, that is). + */ + F_SET(dbmp, MP_LOCKREGION); + if ((ret = __memp_ropen(dbmp, path, cachesize, mode, flags)) != 0) + goto err; + F_CLR(dbmp, MP_LOCKREGION); + + /* + * If there's concurrent access, then we have to lock the region. + * If it's threaded, then we have to lock both the handles and the + * region. + */ + if (!F_ISSET(dbmp, MP_ISPRIVATE)) + F_SET(dbmp, MP_LOCKREGION); + if (LF_ISSET(DB_THREAD)) + F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION); + + *retp = dbmp; + return (0); + +err: if (dbmp != NULL) + FREE(dbmp, sizeof(DB_MPOOL)); + return (ret); +} + +/* + * memp_close -- + * Close a memory pool. + */ +int +memp_close(dbmp) + DB_MPOOL *dbmp; +{ + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + int ret, t_ret; + + ret = 0; + + /* Discard DB_MPREGs. */ + while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { + LIST_REMOVE(mpreg, q); + FREE(mpreg, sizeof(DB_MPREG)); + } + + /* Discard DB_MPOOLFILEs. 
*/ + while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) + if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0) + ret = t_ret; + + /* Close the region. */ + if ((t_ret = __memp_rclose(dbmp)) && ret == 0) + ret = t_ret; + + /* Free the structure. */ + FREE(dbmp, sizeof(DB_MPOOL)); + + return (ret); +} + +/* + * memp_unlink -- + * Exit a memory pool. + */ +int +memp_unlink(path, force, dbenv) + const char *path; + int force; + DB_ENV *dbenv; +{ + return (__db_runlink(dbenv, + DB_APP_NONE, path, DB_DEFAULT_MPOOL_FILE, force)); +} + +/* + * memp_register -- + * Register a file type's pgin, pgout routines. + */ +int +memp_register(dbmp, ftype, pgin, pgout) + DB_MPOOL *dbmp; + int ftype; + int (*pgin) __P((db_pgno_t, void *, DBT *)); + int (*pgout) __P((db_pgno_t, void *, DBT *)); +{ + DB_MPREG *mpr; + + if ((mpr = (DB_MPREG *)malloc(sizeof(DB_MPREG))) == NULL) + return (ENOMEM); + + mpr->ftype = ftype; + mpr->pgin = pgin; + mpr->pgout = pgout; + + /* + * Insert at the head. Because we do a linear walk, we'll find + * the most recent registry in the case of multiple entries, so + * we don't have to check for multiple registries. + */ + LOCKHANDLE(dbmp, &dbmp->mutex); + LIST_INSERT_HEAD(&dbmp->dbregq, mpr, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + return (0); +} diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c new file mode 100644 index 0000000000..94eabf5947 --- /dev/null +++ b/db2/mp/mp_pr.c @@ -0,0 +1,313 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_pr.c 10.12 (Sleepycat) 7/29/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" + +void __memp_debug __P((DB_MPOOL *, FILE *, int)); + +static void __memp_pbh __P((FILE *, DB_MPOOL *, BH *, int)); +static void __memp_pdbmf __P((FILE *, DB_MPOOLFILE *, int)); +static void __memp_pmf __P((FILE *, MPOOLFILE *, int)); +static void __memp_pmp __P((FILE *, DB_MPOOL *, MPOOL *, int)); + +/* + * memp_stat -- + * Display MPOOL statistics. + */ +int +memp_stat(dbmp, gspp, fspp, db_malloc) + DB_MPOOL *dbmp; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + void *(*db_malloc) __P((size_t)); +{ + DB_MPOOL_FSTAT **tfsp; + MPOOLFILE *mfp; + size_t len, nlen; + char *name; + + /* Allocate space for the global statistics. */ + if (gspp != NULL) { + *gspp = NULL; + + if ((*gspp = db_malloc == NULL ? + (DB_MPOOL_STAT *)malloc(sizeof(**gspp)) : + (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL) + return (ENOMEM); + + LOCKREGION(dbmp); + + /* Copy out the global statistics. */ + **gspp = dbmp->mp->stat; + (*gspp)->st_hash_buckets = dbmp->mp->htab_buckets; + + UNLOCKREGION(dbmp); + } + + if (fspp != NULL) { + *fspp = NULL; + + LOCKREGION(dbmp); + + /* Count the MPOOLFILE structures. */ + for (len = 0, + mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; + ++len, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)); + + UNLOCKREGION(dbmp); + + if (len == 0) + return (0); + + /* Allocate space for the pointers. */ + len = (len + 1) * sizeof(DB_MPOOL_FSTAT *); + if ((*fspp = db_malloc == NULL ? + (DB_MPOOL_FSTAT **)malloc(len) : + (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL) + return (ENOMEM); + + LOCKREGION(dbmp); + + /* Build each individual entry. 
*/ + for (tfsp = *fspp, + mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; + ++tfsp, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + name = ADDR(dbmp, mfp->path_off); + nlen = strlen(name); + len = sizeof(DB_MPOOL_FSTAT) + nlen + 1; + if ((*tfsp = db_malloc == NULL ? + (DB_MPOOL_FSTAT *)malloc(len) : + (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL) + return (ENOMEM); + **tfsp = mfp->stat; + (*tfsp)->file_name = (char *) + (u_int8_t *)*tfsp + sizeof(DB_MPOOL_FSTAT); + memcpy((*tfsp)->file_name, name, nlen + 1); + } + *tfsp = NULL; + + UNLOCKREGION(dbmp); + } + return (0); +} + +/* + * __memp_debug -- + * Display MPOOL structures. + * + * PUBLIC: void __memp_debug __P((DB_MPOOL *, FILE *, int)); + */ +void +__memp_debug(dbmp, fp, data) + DB_MPOOL *dbmp; + FILE *fp; + int data; +{ + DB_MPOOLFILE *dbmfp; + u_long cnt; + + /* Make it easy to call from the debugger. */ + if (fp == NULL) + fp = stderr; + + /* Welcome message. */ + (void)fprintf(fp, "%s\nMpool per-process (%lu) statistics\n", + DB_LINE, (u_long)getpid()); + + if (data) + (void)fprintf(fp, " fd: %d; addr %lx; maddr %lx\n", + dbmp->fd, (u_long)dbmp->addr, (u_long)dbmp->maddr); + + /* Display the DB_MPOOLFILE structures. */ + for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; ++cnt, dbmfp = TAILQ_NEXT(dbmfp, q)); + (void)fprintf(fp, "%lu process-local files\n", cnt); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) { + (void)fprintf(fp, "%s\n", dbmfp->path); + __memp_pdbmf(fp, dbmfp, data); + } + + /* Switch to global statistics. */ + (void)fprintf(fp, "\n%s\nMpool statistics\n", DB_LINE); + + /* Display the MPOOL structure. */ + __memp_pmp(fp, dbmp, dbmp->mp, data); + + /* Flush in case we're debugging. */ + (void)fflush(fp); +} + +/* + * __memp_pdbmf -- + * Display a DB_MPOOLFILE structure. 
+ */ +static void +__memp_pdbmf(fp, dbmfp, data) + FILE *fp; + DB_MPOOLFILE *dbmfp; + int data; +{ + if (!data) + return; + + (void)fprintf(fp, " fd: %d; %s\n", + dbmfp->fd, F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write"); +} + +/* + * __memp_pmp -- + * Display the MPOOL structure. + */ +static void +__memp_pmp(fp, dbmp, mp, data) + FILE *fp; + DB_MPOOL *dbmp; + MPOOL *mp; + int data; +{ + BH *bhp; + MPOOLFILE *mfp; + DB_HASHTAB *htabp; + size_t bucket; + int cnt; + const char *sep; + + (void)fprintf(fp, "references: %lu; cachesize: %lu\n", + (u_long)mp->rlayout.refcnt, (u_long)mp->stat.st_cachesize); + (void)fprintf(fp, + " %lu pages created\n", mp->stat.st_page_create); + (void)fprintf(fp, + " %lu mmap pages returned\n", mp->stat.st_map); + (void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n", + mp->stat.st_page_in + mp->stat.st_page_out, + mp->stat.st_page_in, mp->stat.st_page_out); + if (mp->stat.st_cache_hit + mp->stat.st_cache_miss != 0) + (void)fprintf(fp, + " %.0f%% cache hit rate (%lu hit, %lu miss)\n", + ((double)mp->stat.st_cache_hit / + (mp->stat.st_cache_hit + mp->stat.st_cache_miss)) * 100, + mp->stat.st_cache_hit, mp->stat.st_cache_miss); + + /* Display the MPOOLFILE structures. */ + for (cnt = 0, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)); + (void)fprintf(fp, "%d total files\n", cnt); + for (cnt = 1, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + (void)fprintf(fp, "file %d\n", cnt); + __memp_pmf(fp, mfp, data); + } + + if (!data) + return; + + /* Display the hash table list of BH's. 
*/ + (void)fprintf(fp, "%s\nHASH table of BH's (%lu buckets):\n", + DB_LINE, (u_long)mp->htab_buckets); + (void)fprintf(fp, + "longest chain searched %lu\n", mp->stat.st_hash_longest); + (void)fprintf(fp, "average chain searched %lu (total/calls: %lu/%lu)\n", + mp->stat.st_hash_examined / + (mp->stat.st_hash_searches ? mp->stat.st_hash_searches : 1), + mp->stat.st_hash_examined, mp->stat.st_hash_searches); + for (htabp = dbmp->htab, + bucket = 0; bucket < mp->htab_buckets; ++htabp, ++bucket) { + if (SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh) != NULL) + (void)fprintf(fp, "%lu:\n", (u_long)bucket); + for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) + __memp_pbh(fp, dbmp, bhp, data); + } + + /* Display the LRU list of BH's. */ + (void)fprintf(fp, "LRU list of BH's (pgno/offset):"); + for (sep = "\n ", bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh); + bhp != NULL; sep = ", ", bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + (void)fprintf(fp, "%s%lu/%lu", sep, + (u_long)bhp->pgno, (u_long)OFFSET(dbmp, bhp)); + (void)fprintf(fp, "\n"); +} + +/* + * __memp_pmf -- + * Display an MPOOLFILE structure. + */ +static void +__memp_pmf(fp, mfp, data) + FILE *fp; + MPOOLFILE *mfp; + int data; +{ + (void)fprintf(fp, " %lu pages created\n", mfp->stat.st_page_create); + (void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n", + mfp->stat.st_page_in + mfp->stat.st_page_out, + mfp->stat.st_page_in, mfp->stat.st_page_out); + if (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss != 0) + (void)fprintf(fp, + " %.0f%% cache hit rate (%lu hit, %lu miss)\n", + ((double)mfp->stat.st_cache_hit / + (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss)) * 100, + mfp->stat.st_cache_hit, mfp->stat.st_cache_miss); + if (!data) + return; + + (void)fprintf(fp, " %d references; %s; pagesize: %lu\n", mfp->ref, + mfp->can_mmap ? "mmap" : "read/write", + (u_long)mfp->stat.st_pagesize); +} + +/* + * __memp_pbh -- + * Display a BH structure. 
+ */ +static void +__memp_pbh(fp, dbmp, bhp, data) + FILE *fp; + DB_MPOOL *dbmp; + BH *bhp; + int data; +{ + const char *sep; + + if (!data) + return; + + (void)fprintf(fp, " BH @ %lu (mf: %lu): page %lu; ref %lu", + (u_long)OFFSET(dbmp, bhp), + (u_long)bhp->mf_offset, (u_long)bhp->pgno, (u_long)bhp->ref); + sep = "; "; + if (F_ISSET(bhp, BH_DIRTY)) { + (void)fprintf(fp, "%sdirty", sep); + sep = ", "; + } + if (F_ISSET(bhp, BH_WRITE)) { + (void)fprintf(fp, "%schk_write", sep); + sep = ", "; + } + (void)fprintf(fp, "\n"); +} diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c new file mode 100644 index 0000000000..a5c52123b9 --- /dev/null +++ b/db2/mp/mp_region.c @@ -0,0 +1,340 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_region.c 10.11 (Sleepycat) 8/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * __memp_ralloc -- + * Allocate some space in the mpool region. 
+ * + * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *)); + */ +int +__memp_ralloc(dbmp, len, offsetp, retp) + DB_MPOOL *dbmp; + size_t len, *offsetp; + void *retp; +{ + BH *bhp, *nbhp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t fsize, total; + int nomore, restart, ret, wrote; + void *p; + + mp = dbmp->mp; + + nomore = 0; +alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) { + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, p); + *(void **)retp = p; + return (0); + } + if (nomore) { + __db_err(dbmp->dbenv, "%s", strerror(ret)); + return (ret); + } + + /* Look for a buffer on the free list that's the right size. */ + for (bhp = + SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + if (__db_shsizeof(bhp) == len) { + SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh); + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, bhp); + *(void **)retp = bhp; + return (0); + } + } + + /* Discard from the free list until we've freed enough memory. */ + total = 0; + for (bhp = + SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh); + __db_shalloc_free(dbmp->addr, bhp); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce of memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + total += fsize = __db_shsizeof(bhp); + if (fsize >= len || total >= 3 * len) + goto alloc; + } + +retry: /* Find a buffer we can flush; pure LRU. */ + total = 0; + for (bhp = + SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + /* Ignore pinned or locked (I/O in progress) buffers. */ + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + /* Find the associated MPOOLFILE. */ + mfp = ADDR(dbmp, bhp->mf_offset); + + /* + * Write the page if it's dirty. 
+ * + * If we wrote the page, fall through and free the buffer. We + * don't have to rewalk the list to acquire the buffer because + * it was never available for any other process to modify it. + * If we didn't write the page, but we discarded and reacquired + * the region lock, restart the buffer list walk. If we neither + * wrote the buffer nor discarded the region lock, continue down + * the buffer list. + */ + if (F_ISSET(bhp, BH_DIRTY)) { + if ((ret = __memp_bhwrite(dbmp, + mfp, bhp, &restart, &wrote)) != 0) + return (ret); + + /* + * It's possible that another process wants this buffer + * and incremented the ref count while we were writing + * it. + */ + if (bhp->ref != 0) + goto retry; + + if (wrote) + ++mp->stat.st_rw_evict; + else { + if (restart) + goto retry; + else + continue; + } + } else + ++mp->stat.st_ro_evict; + + /* + * Check to see if the buffer is the size we're looking for. + * If it is, simply reuse it. + */ + total += fsize = __db_shsizeof(bhp); + if (fsize == len) { + __memp_bhfree(dbmp, mfp, bhp, 0); + + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, bhp); + *(void **)retp = bhp; + return (0); + } + + /* Free the buffer. */ + __memp_bhfree(dbmp, mfp, bhp, 1); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce of memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + if (fsize >= len || total >= 3 * len) + goto alloc; + + /* Restart the walk if we discarded the region lock. */ + if (restart) + goto retry; + } + nomore = 1; + goto alloc; +} + +/* + * __memp_ropen -- + * Attach to, and optionally create, the mpool region. 
+ * + * PUBLIC: int __memp_ropen + * PUBLIC: __P((DB_MPOOL *, const char *, size_t, int, int)); + */ +int +__memp_ropen(dbmp, path, cachesize, mode, flags) + DB_MPOOL *dbmp; + const char *path; + size_t cachesize; + int mode, flags; +{ + MPOOL *mp; + size_t rlen; + int fd, newregion, ret, retry_cnt; + + /* + * Unlike other DB subsystems, mpool can't simply grow the region + * because it returns pointers into the region to its clients. To + * "grow" the region, we'd have to allocate a new region and then + * store a region number in the structures that reference regional + * objects. It's reasonable that we fail regardless, as clients + * shouldn't have every page in the region pinned, so the only + * "failure" mode should be a performance penalty because we don't + * find a page in the cache that we'd like to have found. + * + * Up the user's cachesize by 25% to account for our overhead. + */ + if (cachesize < DB_CACHESIZE_MIN) + if (cachesize == 0) + cachesize = DB_CACHESIZE_DEF; + else + cachesize = DB_CACHESIZE_MIN; + rlen = cachesize + cachesize / 4; + + /* Map in the region. */ + retry_cnt = newregion = 0; +retry: if (LF_ISSET(DB_CREATE)) { + /* + * If it's a private mpool, use malloc, it's a lot faster than + * instantiating a region. + * + * XXX + * If we're doing locking and don't have spinlocks for this + * architecture, we'd have to instantiate the file, we need + * the file descriptor for locking. However, it should not + * be possible for DB_THREAD to be set if HAVE_SPINLOCKS aren't + * defined. + */ + if (F_ISSET(dbmp, MP_ISPRIVATE)) + ret = (dbmp->maddr = malloc(rlen)) == NULL ? ENOMEM : 0; + else + ret = __db_rcreate(dbmp->dbenv, DB_APP_NONE, path, + DB_DEFAULT_MPOOL_FILE, mode, rlen, &fd, + &dbmp->maddr); + if (ret == 0) { + /* Put the MPOOL structure first in the region. */ + mp = dbmp->maddr; + + SH_TAILQ_INIT(&mp->bhq); + SH_TAILQ_INIT(&mp->bhfq); + SH_TAILQ_INIT(&mp->mpfq); + + /* Initialize the rest of the region as free space. 
*/ + dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL); + __db_shalloc_init(dbmp->addr, rlen - sizeof(MPOOL)); + + /* + * + * Pretend that the cache will be broken up into 4K + * pages, and that we want to keep it under, say, 10 + * pages on each chain. This means a 256MB cache will + * allocate ~6500 offset pairs. + */ + mp->htab_buckets = + __db_tablesize((cachesize / (4 * 1024)) / 10); + + /* Allocate hash table space and initialize it. */ + if ((ret = __db_shalloc(dbmp->addr, + mp->htab_buckets * sizeof(DB_HASHTAB), + 0, &dbmp->htab)) != 0) + goto err; + __db_hashinit(dbmp->htab, mp->htab_buckets); + mp->htab = OFFSET(dbmp, dbmp->htab); + + memset(&mp->stat, 0, sizeof(mp->stat)); + mp->stat.st_cachesize = cachesize; + + mp->flags = 0; + + newregion = 1; + } else if (ret != EEXIST) + return (ret); + } + + /* If we didn't or couldn't create the region, try and join it. */ + if (!newregion && + (ret = __db_ropen(dbmp->dbenv, DB_APP_NONE, + path, DB_DEFAULT_MPOOL_FILE, 0, &fd, &dbmp->maddr)) != 0) { + /* + * If we failed because the file wasn't available, wait a + * second and try again. + */ + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry; + } + return (ret); + } + + /* Set up the common pointers. */ + dbmp->mp = dbmp->maddr; + dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL); + + /* + * If not already locked, lock the region -- if it's a new region, + * then either __db_rcreate() locked it for us or we malloc'd it + * instead of creating a region, neither of which requires locking + * here. + */ + if (!newregion) + LOCKREGION(dbmp); + + /* + * Get the hash table address; it's on the shared page, so we have + * to lock first. + */ + dbmp->htab = ADDR(dbmp, dbmp->mp->htab); + + dbmp->fd = fd; + + /* If we locked the region, release it now. 
*/ + if (!F_ISSET(dbmp, MP_ISPRIVATE)) + UNLOCKREGION(dbmp); + return (0); + +err: if (fd != -1) { + dbmp->fd = fd; + (void)__memp_rclose(dbmp); + } + + if (newregion) + (void)memp_unlink(path, 1, dbmp->dbenv); + return (ret); +} + +/* + * __memp_rclose -- + * Close the mpool region. + * + * PUBLIC: int __memp_rclose __P((DB_MPOOL *)); + */ +int +__memp_rclose(dbmp) + DB_MPOOL *dbmp; +{ + if (F_ISSET(dbmp, MP_ISPRIVATE)) { + free(dbmp->maddr); + return (0); + } + return (__db_rclose(dbmp->dbenv, dbmp->fd, dbmp->maddr)); +} diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c new file mode 100644 index 0000000000..4f1205661a --- /dev/null +++ b/db2/mp/mp_sync.c @@ -0,0 +1,205 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_sync.c 10.8 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_sync -- + * Mpool sync function. + */ +int +memp_sync(dbmp, lsnp) + DB_MPOOL *dbmp; + DB_LSN *lsnp; +{ + BH *bhp; + DB_ENV *dbenv; + MPOOL *mp; + MPOOLFILE *mfp; + int can_write, wrote, lsn_cnt, restart, ret; + + dbenv = dbmp->dbenv; + + if (dbmp->dbenv->lg_info == NULL) { + __db_err(dbenv, "memp_sync requires logging"); + return (EINVAL); + } + + LOCKREGION(dbmp); + + /* + * If the application is asking about a previous call, and we haven't + * found any buffers that the application holding the pin couldn't + * write, return yes or no based on the current count. Note, if the + * application is asking about a LSN *smaller* than one we've already + * handled, then we return based on the count for that LSN. 
+ */ + mp = dbmp->mp; + if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { + if (mp->lsn_cnt == 0) { + *lsnp = mp->lsn; + ret = 0; + } else + ret = DB_INCOMPLETE; + + UNLOCKREGION(dbmp); + return (ret); + } + + /* Else, it's a new checkpoint. */ + F_CLR(mp, MP_LSN_RETRY); + + /* + * Save the LSN. We know that it's a new LSN or larger than the one + * for which we were already doing a checkpoint. (BTW, I don't expect + * to see multiple LSN's from the same or multiple processes, but You + * Just Never Know. Responding as if they all called with the largest + * of the LSNs specified makes everything work.) + * + * We don't currently use the LSN we save. We could potentially save + * the last-written LSN in each buffer header and use it to determine + * what buffers need to be written. The problem with this is that it's + * sizeof(LSN) more bytes of buffer header. We currently write all the + * dirty buffers instead. + * + * Walk the list of shared memory segments clearing the count of + * buffers waiting to be written. + */ + mp->lsn = *lsnp; + mp->lsn_cnt = 0; + for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + mfp->lsn_cnt = 0; + + /* + * Walk the list of buffers and mark all dirty buffers to be written + * and all pinned buffers to be potentially written. We do this in a + * single fell swoop while holding the region locked so that processes + * can't make new buffers dirty, causing us to never finish. Since + * the application may have restarted the sync, clear any BH_WRITE + * flags that appear to be left over.
+ */ + can_write = lsn_cnt = 0; + for (lsn_cnt = 0, bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { + F_SET(bhp, BH_WRITE); + + if (bhp->ref == 0) + can_write = 1; + + mfp = ADDR(dbmp, bhp->mf_offset); + ++mfp->lsn_cnt; + + ++lsn_cnt; + } else + F_CLR(bhp, BH_WRITE); + + mp->lsn_cnt = lsn_cnt; + + /* If there no buffers we can write, we're done. */ + if (!can_write) { + UNLOCKREGION(dbmp); + return (mp->lsn_cnt ? DB_INCOMPLETE : 0); + } + + /* + * Write any buffers that we can. Restart the walk after each write, + * __memp_pgwrite() discards and reacquires the region lock during I/O. + */ +retry: for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + /* Ignore pinned or locked buffers. */ + if (!F_ISSET(bhp, BH_WRITE) || + bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + mfp = ADDR(dbmp, bhp->mf_offset); + if ((ret = + __memp_bhwrite(dbmp, mfp, bhp, &restart, &wrote)) != 0) + goto err; + if (wrote) { + if (restart) + goto retry; + continue; + } + __db_err(dbenv, "%s: unable to flush page: %lu", + ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno); + ret = EPERM; + goto err; + } + ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; + +err: UNLOCKREGION(dbmp); + return (ret); +} + +/* + * memp_fsync -- + * Mpool file sync function. + */ +int +memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + BH *bhp; + DB_MPOOL *dbmp; + size_t mf_offset; + int pincnt, restart, ret, wrote; + + /* We don't sync temporary files -- what's the use? */ + if (F_ISSET(dbmfp, MP_PATH_TEMP)) + return (0); + + dbmp = dbmfp->dbmp; + ret = 0; + + mf_offset = OFFSET(dbmp, dbmfp->mfp); + + LOCKREGION(dbmp); + + /* + * Walk the list of buffer headers for the MPOOLFILE, and write out any + * dirty buffers that we can. 
+ */ +retry: pincnt = 0; + for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + if (F_ISSET(bhp, BH_DIRTY) && bhp->mf_offset == mf_offset) { + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { + ++pincnt; + continue; + } + if ((ret = + __memp_pgwrite(dbmfp, bhp, &restart, &wrote)) != 0) + goto err; + if (!wrote) + ++pincnt; + if (restart) + goto retry; + } + + UNLOCKREGION(dbmp); + +err: return (ret == 0 ? (pincnt ? DB_INCOMPLETE : 0) : ret); +} diff --git a/db2/mutex/68020.gcc b/db2/mutex/68020.gcc new file mode 100644 index 0000000000..9d8be641d8 --- /dev/null +++ b/db2/mutex/68020.gcc @@ -0,0 +1,19 @@ +/* + * @(#)68020.gcc 10.1 (Sleepycat) 4/12/97 + * + * For gcc/68K, 0 is clear, 1 is set. + */ +#define TSL_SET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + int __r; \ + asm volatile("tas %1; \n \ + seq %0" \ + : "=dm" (__r), "=m" (*__l) \ + : "1" (*__l) \ + ); \ + __r & 1; \ +}) + +#define TSL_UNSET(tsl) (*(tsl) = 0) +#define TSL_INIT(tsl) TSL_UNSET(tsl) + diff --git a/db2/mutex/README b/db2/mutex/README new file mode 100644 index 0000000000..30d6b6a7d1 --- /dev/null +++ b/db2/mutex/README @@ -0,0 +1,105 @@ +# @(#)README 10.1 (Sleepycat) 4/12/97 + +Resource locking routines: lock based on a db_mutex_t. All this gunk +(including trying to make assembly code portable), is necessary because +System V semaphores require system calls for uncontested locks and we +don't want to make two system calls per resource lock. + +First, this is how it works. The db_mutex_t structure contains a resource +test-and-set lock (tsl), a file offset, a pid for debugging and statistics +information. + +If HAVE_SPINLOCKS is defined (i.e. we know how to do test-and-sets for +this compiler/architecture combination), we try and lock the resource tsl +TSL_DEFAULT_SPINS times. If we can't acquire the lock that way, we use +a system call to sleep for 10ms, 20ms, 40ms, etc. (The time is bounded +at 1 second, just in case.) 
Using the timer backoff means that there are +two assumptions: that locks are held for brief periods (never over system +calls or I/O) and that locks are not hotly contested. + +If HAVE_SPINLOCKS is not defined, i.e. we can't do test-and-sets, we use +a file descriptor to do byte locking on a file at a specified offset. In +this case, ALL of the locking is done in the kernel. Because file +descriptors are allocated per process, we have to provide the file +descriptor as part of the lock/unlock call. We still have to do timer +backoff because we need to be able to block ourselves, i.e. the lock +manager causes processes to wait by having the process acquire a mutex +and then attempting to re-acquire the mutex. There's no way to use kernel +locking to block yourself, i.e. if you hold a lock and attempt to +re-acquire it, the attempt will succeed. + +Next, let's talk about why it doesn't work the way a reasonable person +would think it should work. + +Ideally, we'd have the ability to try to lock the resource tsl, and if +that fails, increment a counter of waiting processes, then block in the +kernel until the tsl is released. The process holding the resource tsl +would see the wait counter when it went to release the resource tsl, and +would wake any waiting processes up after releasing the lock. This would +actually require both another tsl (call it the mutex tsl) and +synchronization between the call that blocks in the kernel and the actual +resource tsl. The mutex tsl would be used to protect accesses to the +db_mutex_t itself. Locking the mutex tsl would be done by a busy loop, +which is safe because processes would never block holding that tsl (all +they would do is try to obtain the resource tsl and set/check the wait +count). The problem in this model is that the blocking call into the +kernel requires a blocking semaphore, i.e. one whose normal state is +locked. 
+ +The only portable forms of locking under UNIX are fcntl(2) on a file +descriptor/offset, and System V semaphores. Neither of these locking +methods is sufficient to solve the problem. + +The problem with fcntl locking is that only the process that obtained the +lock can release it. Remember, we want the normal state of the kernel +semaphore to be locked. So, if the creator of the db_mutex_t were to +initialize the lock to "locked", then a second process locks the resource +tsl, and then a third process needs to block, waiting for the resource +tsl; when the second process wants to wake up the third process, it can't +because it's not the holder of the lock! For the second process to be +the holder of the lock, we would have to make a system call per +uncontested lock, which is what we were trying to get away from in the +first place. + +There are some hybrid schemes, such as signaling the holder of the lock, +or using a different blocking offset depending on which process is +holding the lock, but it gets complicated fairly quickly. I'm open to +suggestions, but I'm not holding my breath. + +Regardless, we use this form of locking when HAVE_SPINLOCKS is not +defined (i.e. we're locking in the kernel) because it doesn't have the +limitations found in System V semaphores, and because the normal state of +the kernel object in that case is unlocked, so the process releasing the +lock is also the holder of the lock. + +The System V semaphore design has a number of other limitations that make +it inappropriate for this task. Namely: + +First, the semaphore key name space is separate from the file system name +space (although there exist methods for using file names to create +semaphore keys). If we use a well-known key, there's no reason to believe +that any particular key will not already be in use, either by another +instance of the DB application or some other application, in which case +the DB application will fail.
 If we create a key, then we have to use a +file system name to rendezvous and pass around the key. + +Second, System V semaphores traditionally have compile-time, system-wide +limits on the number of semaphore keys that you can have. Typically, that +number is far too low for any practical purpose. Since the semaphores +permit more than a single slot per semaphore key, we could try and get +around that limit by using multiple slots, but that means that the file +that we're using for rendezvous is going to have to contain slot +information as well as semaphore key information, and we're going to be +reading/writing it on every db_mutex_t init or destroy operation. Anyhow, +similar compile-time, system-wide limits on the numbers of slots per +semaphore key kick in, and you're right back where you started. + +My fantasy is that once POSIX.1 standard mutexes are in wide-spread use, +we can switch to them. My guess is that it won't happen, because the +POSIX semaphores are only required to work for threads within a process, +and not independent processes. + +Note: there are races in the statistics code, but since it's just that, +I didn't bother fixing them. (The fix requires a mutex tsl, so, when/if +this code is fixed to do rational locking (see above), then change the +statistics update code to acquire/release the mutex tsl.) diff --git a/db2/mutex/alpha.dec b/db2/mutex/alpha.dec new file mode 100644 index 0000000000..83ed371136 --- /dev/null +++ b/db2/mutex/alpha.dec @@ -0,0 +1,25 @@ +/* + * @(#)alpha.dec 8.3 (Sleepycat Software) 1/18/97 + * + * The DEC C asm acts as a pseudo-call. The first argument is the assembly + * code, and the remaining arguments are assigned as in a procedure call, to + * r16, r17, etc. (represented in asm as %a0, %a1, and so forth). + * + * From: Dave Butenhof.
+ */ + +#include <c_asm.h> + +#define TSL_SET(tsl) (asm ("mb; \ + 10: ldl_l %v0,(%a0) ; \ + bne %v0,30f ; \ + or %v0,1,%r1 ; \ + stl_c %r1,(%a0) ; \ + beq %r1,20f ; \ + mb ; \ + br %r31,30f ; \ + 20: br %r31,10b ; \ + 30: ", (tsl))) + +THIS WAS NOT CONVERTED TO TAKE A POINTER AS AN ARGUMENT... +#define TSL_UNSET(tsl) (asm ("mb"), *(tsl) = 0) diff --git a/db2/mutex/alpha.gcc b/db2/mutex/alpha.gcc new file mode 100644 index 0000000000..247d04cf31 --- /dev/null +++ b/db2/mutex/alpha.gcc @@ -0,0 +1,52 @@ +/* + * @(#)alpha.gcc 10.1 (Sleepycat) 4/12/97 + * + * The code appearing below is taken from Richard L. Sites, ed. "Alpha + * Architecture Reference Manual", Digital Press, 1992, page 5-7 and 5-8. + * There are 2 modifications: + * + * 1. The jump from blbs __r1,30f to !__r1, which is dictated by the way the + * TSL_SET macro is used. The code suggested in Sites includes the main loop + * of the spin lock, whereas in this code the rest of the loop is specified in C. + * The generated code might be suboptimal if the compiler generates a forward + * branch for the usual case in which the mutex is uncontested. + * + * 2. At label 20, Sites suggests including code for testing for an excessive + * number of _processor_ lock conflicts. (The stq_c instruction stores its + * first argument provided that no other processor has written to a byte range + * including its memory-location argument.) Absent such checking the code + * below could conceivably stall silently on a multiprocessor alpha, depending + * on how often processor/processor conflicts occur in a particular byte range. + * + * Note that the mb ("memory-barrier") instruction in TSL_UNSET is critical to + * correct operation in a multiprocessor alpha (as is, of course, the mb in + * the TSL_SET macro).
Without the mb, changes to shared memory that occurred + * inside the critical section (before the TSL_UNSET) might reach shared memory + * _after_ the change of tsl to 0, thereby permitting another processor to see + * an inconsistent view of the data protected by the mutex. + * + * For gcc/alpha, 0 is clear, 1 is set. + */ +#define TSL_SET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + register tsl_t __r1, __r2; \ + __asm__ volatile(" \n\ + 10: ldq_l %0,(%2) \n\ + blbs %0,30f \n\ + or %0,1,%1 \n\ + stq_c %1,(%2) \n\ + beq %1,20f \n\ + mb \n\ + br 30f \n\ + 20: br 10b \n\ + 30: " \ + : "=&r" (__r1), "=&r" (__r2) \ + : "r" (__l)); \ + !__r1; \ +}) + +#define TSL_UNSET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + __asm__ volatile("mb; stq $31,(%0);" : : "r" (__l)); \ +}) +#define TSL_INIT(tsl) TSL_UNSET(tsl) diff --git a/db2/mutex/mutex.c b/db2/mutex/mutex.c new file mode 100644 index 0000000000..b23f738ad7 --- /dev/null +++ b/db2/mutex/mutex.c @@ -0,0 +1,280 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mutex.c 10.22 (Sleepycat) 8/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "common_ext.h" + +#ifdef HAVE_SPINLOCKS + +#ifdef HAVE_FUNC_AIX +#define TSL_INIT(x) +#define TSL_SET(x) (!_check_lock(x, 0, 1)) +#define TSL_UNSET(x) _clear_lock(x, 0) +#endif + +#ifdef HAVE_ASSEM_MC68020_GCC +#include "68020.gcc" +#endif + +#if defined(HAVE_FUNC_MSEM) +/* + * XXX + * Should we not use MSEM_IF_NOWAIT and let the system block for us? + * I've no idea if this will block all threads in the process or not. 
+ */ +#define TSL_INIT(x) msem_init(x, MSEM_UNLOCKED) +#define TSL_SET(x) (!msem_lock(x, MSEM_IF_NOWAIT)) +#define TSL_UNSET(x) msem_unlock(x, 0) +#endif + +#ifdef HAVE_FUNC_SGI +#define TSL_INIT(x) init_lock(x) +#define TSL_SET(x) (!acquire_lock(x)) +#define TSL_UNSET(x) release_lock(x) +#endif + +#ifdef HAVE_FUNC_SOLARIS +/* + * Semaphore calls don't work on Solaris 5.5. + * + * #define TSL_INIT(x) sema_init(x, 1, USYNC_PROCESS, NULL) + * #define TSL_SET(x) (sema_wait(x) == 0) + * #define TSL_UNSET(x) sema_post(x) + */ +#define TSL_INIT(x) +#define TSL_SET(x) (_lock_try(x)) +#define TSL_UNSET(x) _lock_clear(x) +#endif + +#ifdef HAVE_ASSEM_SPARC_GCC +#include "sparc.gcc" +#endif + +#ifdef HAVE_ASSEM_UTS4_CC +#define TSL_INIT(x) +#define TSL_SET(x) (!uts_lock(x, 1)) +#define TSL_UNSET(x) (*(x) = 0) +#endif + +#ifdef HAVE_ASSEM_X86_GCC +#include "x86.gcc" +#endif + +#if defined(_WIN32) +/* DBDB this needs to be byte-aligned!! */ +#define TSL_INIT(tsl) +#define TSL_SET(tsl) (!InterlockedExchange((PLONG)tsl, 1)) +#define TSL_UNSET(tsl) (*(tsl) = 0) +#endif + +#ifdef macintosh +/* Mac spinlocks are simple because we cannot possibly be preempted. */ +#define TSL_INIT(tsl) +#define TSL_SET(tsl) (*(tsl) = 1) +#define TSL_UNSET(tsl) (*(tsl) = 0) +#endif + +#endif /* HAVE_SPINLOCKS */ + +#ifdef MORE_THAN_ONE_PROCESSOR +#define TSL_DEFAULT_SPINS 5 /* Default spins before block. */ +#else +#define TSL_DEFAULT_SPINS 1 /* Default spins before block. */ +#endif + +/* + * __db_mutex_init -- + * Initialize a DB mutex structure. 
+ * + * PUBLIC: void __db_mutex_init __P((db_mutex_t *, off_t)); + */ +void +__db_mutex_init(mp, off) + db_mutex_t *mp; + off_t off; +{ +#ifdef DEBUG + if ((ALIGNTYPE)mp & (MUTEX_ALIGNMENT - 1)) { + (void)fprintf(stderr, + "MUTEX ERROR: mutex NOT %d-byte aligned!\n", + MUTEX_ALIGNMENT); + abort(); + } +#endif + memset(mp, 0, sizeof(db_mutex_t)); + +#ifdef HAVE_SPINLOCKS + TSL_INIT(&mp->tsl_resource); +#else + mp->off = off; +#endif +} + +#define MS(n) ((n) * 1000) /* Milliseconds to micro-seconds. */ +#define SECOND (MS(1000)) /* A second's worth of micro-seconds. */ + +/* + * __db_mutex_lock + * Lock on a mutex, logically blocking if necessary. + * + * PUBLIC: int __db_mutex_lock __P((db_mutex_t *, int, int (*)(void))); + */ +int +__db_mutex_lock(mp, fd, yield) + db_mutex_t *mp; + int fd; + int (*yield) __P((void)); +{ + u_long usecs; + +#ifdef HAVE_SPINLOCKS + int nspins; + + for (usecs = MS(10);;) { + /* + * Try and acquire the uncontested resource lock for + * TSL_DEFAULT_SPINS. + */ + for (nspins = TSL_DEFAULT_SPINS; nspins > 0; --nspins) + if (TSL_SET(&mp->tsl_resource)) { +#ifdef DEBUG + if (mp->pid != 0) { + (void)fprintf(stderr, + "MUTEX ERROR: __db_mutex_lock: lock currently locked\n"); + abort(); + } + mp->pid = getpid(); +#endif +#ifdef MUTEX_STATISTICS + if (usecs == MS(10)) + ++mp->mutex_set_nowait; + else + ++mp->mutex_set_wait; +#endif + return (0); + } + + /* Yield the processor; wait 10ms initially, up to 1 second. */ + if (yield == NULL || yield() != 0) { + (void)__db_sleep(0, usecs); + if ((usecs <<= 1) > SECOND) + usecs = SECOND; + } + } + /* NOTREACHED */ + +#else /* !HAVE_SPINLOCKS */ + struct flock k_lock; + pid_t mypid; + int locked; + + /* Initialize the lock. */ + k_lock.l_whence = SEEK_SET; + k_lock.l_start = mp->off; + k_lock.l_len = 1; + + for (locked = 0, mypid = getpid();;) { + /* + * Wait for the lock to become available; wait 10ms initially, + * up to 1 second. 
+ */ + for (usecs = MS(10); mp->pid != 0;) + if (yield == NULL || yield() != 0) { + (void)__db_sleep(0, usecs); + if ((usecs <<= 1) > SECOND) + usecs = SECOND; + } + + /* Acquire an exclusive kernel lock. */ + k_lock.l_type = F_WRLCK; + if (fcntl(fd, F_SETLKW, &k_lock)) + return (1); + + /* If the resource tsl is still available, it's ours. */ + if (mp->pid == 0) { + locked = 1; + mp->pid = mypid; + } + + /* Release the kernel lock. */ + k_lock.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &k_lock)) + return (1); + + /* + * If we got the resource tsl we're done. + * + * !!! + * We can't check to see if the lock is ours, because we may + * be trying to block ourselves in the lock manager, and so + * the holder of the lock that's preventing us from getting + * the lock may be us! (Seriously.) + */ + if (locked) + break; + } + +#ifdef MUTEX_STATISTICS + ++mp->mutex_set_wait; +#endif + return (0); +#endif /* !HAVE_SPINLOCKS */ +} + +/* + * __db_mutex_unlock -- + * Release a lock. + * + * PUBLIC: int __db_mutex_unlock __P((db_mutex_t *, int)); + */ +int +__db_mutex_unlock(mp, fd) + db_mutex_t *mp; + int fd; +{ +#ifdef DEBUG + if (mp->pid == 0) { + (void)fprintf(stderr, + "MUTEX ERROR: __db_mutex_unlock: lock already unlocked\n"); + abort(); + } +#endif + +#ifdef HAVE_SPINLOCKS +#ifdef DEBUG + mp->pid = 0; +#endif + + /* Release the resource tsl. */ + TSL_UNSET(&mp->tsl_resource); +#else + /* + * Release the resource tsl. We don't have to acquire any locks + * because processes trying to acquire the lock are checking for + * a pid of 0, not a specific value. + */ + mp->pid = 0; +#endif + return (0); +} diff --git a/db2/mutex/parisc.gcc b/db2/mutex/parisc.gcc new file mode 100644 index 0000000000..e15f6f2dba --- /dev/null +++ b/db2/mutex/parisc.gcc @@ -0,0 +1,40 @@ +/* + * @(#)parisc.gcc 8.5 (Sleepycat) 1/18/97 + * + * Copyright (c) 1996-1997, The University of Utah and the Computer Systems + * Laboratory at the University of Utah (CSL). All rights reserved. 
+ * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the Computer + * Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + */ + +/* + * The PA-RISC has a "load and clear" instead of a "test and set" instruction. + * The 32-bit word used by that instruction must be 16-byte aligned hence we + * allocate 16 bytes for a tsl_t and use the word that is properly aligned. + * We could use the "aligned" attribute in GCC but that doesn't work for stack + * variables. + */ +#define TSL_SET(tsl) ({ \ + int *__l = (int *)(((int)(tsl)+15)&~15); \ + int __r; \ + asm volatile("ldcws 0(%1),%0" : "=r" (__r) : "r" (__l)); \ + __r & 1; \ +}) + +#define TSL_UNSET(tsl) ({ \ + int *__l = (int *)(((int)(tsl)+15)&~15); \ + *__l = -1; \ +}) diff --git a/db2/mutex/parisc.hp b/db2/mutex/parisc.hp new file mode 100644 index 0000000000..d10807b7f1 --- /dev/null +++ b/db2/mutex/parisc.hp @@ -0,0 +1,29 @@ +/* + * @(#)parisc.hp 8.5 (Sleepycat) 1/18/97 + * + * Copyright (c) 1996-1997, The University of Utah and the Computer Systems + * Laboratory at the University of Utah (CSL). All rights reserved. 
+ * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the Computer + * Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + */ + +/* + * The PA-RISC has a "load and clear" instead of a "test and set" instruction. + * The 32-bit word used by that instruction must be 16-byte aligned hence we + * allocate 16 bytes for a tsl_t and use the word that is properly aligned. + */ +#define TSL_SET(tsl) tsl_set(tsl) +#define TSL_UNSET(tsl) tsl_unset(tsl) diff --git a/db2/mutex/sparc.gcc b/db2/mutex/sparc.gcc new file mode 100644 index 0000000000..8445a0629b --- /dev/null +++ b/db2/mutex/sparc.gcc @@ -0,0 +1,33 @@ +/* + * @(#)sparc.gcc 10.1 (Sleepycat) 4/12/97 + * + * The ldstub instruction takes the location specified by its first argument + * (a register containing a memory address) and loads its contents into its + * second argument (a register) and atomically sets the contents the location + * specified by its first argument to a byte of 1s. (The value in the second + * argument is never read, but only overwritten.) + * + * The membar instructions are needed to ensure that writes to the lock are + * correctly ordered with writes that occur later in the instruction stream. 
+ * + * For gcc/sparc, 0 is clear, 1 is set. + */ + +#if defined(__sparcv9__) +Does the following code need membar instructions for V9 processors? +#endif + +#define TSL_SET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + register tsl_t __r; \ + __asm__ volatile \ + ("ldstub [%1],%0" \ + : "=r"( __r) : "r" (__l)); \ + !__r; \ +}) + +#define TSL_UNSET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + __asm__ volatile ("stb %%g0,[%0]" : : "r" (__l)); \ +}) +#define TSL_INIT(tsl) TSL_UNSET(tsl) diff --git a/db2/mutex/uts4.cc.s b/db2/mutex/uts4.cc.s new file mode 100644 index 0000000000..ee5f4143bd --- /dev/null +++ b/db2/mutex/uts4.cc.s @@ -0,0 +1,21 @@ + / + / int uts_lock ( int *p, int i ); + / Update the lock word pointed to by p with the + / value i, using compare-and-swap. + / Returns 0 if update was successful. + / Returns 1 if update failed. + / + entry uts_lock + uts_lock: + using .,r15 + st r2,8(sp) / Save R2 + l r2,64+0(sp) / R2 -> word to update + slr r0, r0 / R0 = current lock value must be 0 + l r1,64+4(sp) / R1 = new lock value + cs r0,r1,0(r2) / Try the update ... + be x / ... Success. Return 0 + la r0,1 / ... Failure. Return 1 + x: / + l r2,8(sp) / Restore R2 + b 2(,r14) / Return to caller + drop r15 diff --git a/db2/mutex/x86.gcc b/db2/mutex/x86.gcc new file mode 100644 index 0000000000..886a6811a2 --- /dev/null +++ b/db2/mutex/x86.gcc @@ -0,0 +1,17 @@ +/* + * @(#)x86.gcc 10.2 (Sleepycat) 6/21/97 + * + * For gcc/x86, 0 is clear, 1 is set. + */ +#define TSL_SET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + int __r; \ + asm volatile("movl $1,%%eax; xchgb %1,%%al; xorl $1,%%eax" \ + : "=&a" (__r), "=m" (*__l) \ + : "1" (*__l) \ + ); \ + __r & 1; \ +}) + +#define TSL_UNSET(tsl) (*(tsl) = 0) +#define TSL_INIT(tsl) TSL_UNSET(tsl) diff --git a/db2/os/db_os_abs.c b/db2/os/db_os_abs.c new file mode 100644 index 0000000000..8795205839 --- /dev/null +++ b/db2/os/db_os_abs.c @@ -0,0 +1,82 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_abs.c 10.5 (Sleepycat) 7/5/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <string.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_abspath -- + * Return if a path is an absolute path. + * + * PUBLIC: int __db_abspath __P((const char *)); + */ +int +__db_abspath(path) + const char *path; +{ +#ifdef _WIN32 + /* + * !!! + * Check for drive specifications, e.g., "C:". In addition, the path + * separator used by the win32 DB (PATH_SEPARATOR) is \; look for both + * / and \ since these are user-input paths. + */ + if (isalpha(path[0]) && path[1] == ':') + path += 2; + return (path[0] == '/' || path[0] == '\\'); +#else +#ifdef macintosh + /* + * !!! + * Absolute pathnames always start with a volume name, which must be + * followed by a colon, thus they are of the form: + * volume: or volume:dir1:dir2:file + * + * Relative pathnames are either a single name without colons or a + * path starting with a colon, thus of the form: + * file or :file or :dir1:dir2:file + */ + return (strchr(path, ':') != NULL && path[0] != ':'); +#else + return (path[0] == '/'); +#endif +#endif +} + +/* + * __db_rpath -- + * Return the last path separator in the path or NULL if none found. + * + * PUBLIC: char *__db_rpath __P((const char *)); + */ +char * +__db_rpath(path) + const char *path; +{ + const char *s, *last; + + last = NULL; + if (PATH_SEPARATOR[1] != '\0') { + for (s = path; s[0] != '\0'; ++s) + if (strchr(PATH_SEPARATOR, s[0]) != NULL) + last = s; + } else + for (s = path; s[0] != '\0'; ++s) + if (s[0] == PATH_SEPARATOR[0]) + last = s; + return ((char *)last); +} diff --git a/db2/os/db_os_dir.c b/db2/os/db_os_dir.c new file mode 100644 index 0000000000..23a6a45919 --- /dev/null +++ b/db2/os/db_os_dir.c @@ -0,0 +1,136 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_dir.c 10.7 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#if HAVE_DIRENT_H +# include <dirent.h> +# define NAMLEN(dirent) strlen((dirent)->d_name) +#else +# define dirent direct +# define NAMLEN(dirent) (dirent)->d_namlen +# if HAVE_SYS_NDIR_H +# include <sys/ndir.h> +# endif +# if HAVE_SYS_DIR_H +# include <sys/dir.h> +# endif +# if HAVE_NDIR_H +# include <ndir.h> +# endif +#endif + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" +#include "common_ext.h" + +/* + * __db_dir -- + * Return a list of the files in a directory. + * + * PUBLIC: int __db_dir __P((DB_ENV *, char *, char ***, int *)); + */ +int +__db_dir(dbenv, dir, namesp, cntp) + DB_ENV *dbenv; + const char *dir; + char ***namesp; + int *cntp; +{ + int arraysz, cnt; + char **names; +#ifdef _WIN32 + struct _finddata_t fdata; + long dirhandle; + int finished; + + if ((dirhandle = _findfirst(dir,&fdata)) == -1) { + __db_err(dbenv, "%s: %s", dir, strerror(errno)); + return (errno); + } + + names = NULL; + finished = 0; + for (arraysz = cnt = 0; finished != 1; ++cnt) { + if (cnt >= arraysz) { + arraysz += 100; + names = (char **)(names == NULL ? 
+ malloc(arraysz * sizeof(names[0])) : + realloc(names, arraysz * sizeof(names[0]))); + if (names == NULL) + goto nomem; + } + if ((names[cnt] = (char *)strdup(fdata.name)) == NULL) + goto nomem; + if (_findnext(dirhandle,&fdata) != 0) + finished = 1; + } + _findclose(dirhandle); +#else /* !_WIN32 */ + struct dirent *dp; + DIR *dirp; + + if ((dirp = opendir(dir)) == NULL) { + __db_err(dbenv, "%s: %s", dir, strerror(errno)); + return (errno); + } + names = NULL; + for (arraysz = cnt = 0; (dp = readdir(dirp)) != NULL; ++cnt) { + if (cnt >= arraysz) { + arraysz += 100; + names = (char **)(names == NULL ? + malloc(arraysz * sizeof(names[0])) : + realloc(names, arraysz * sizeof(names[0]))); + if (names == NULL) + goto nomem; + } + if ((names[cnt] = (char *)strdup(dp->d_name)) == NULL) + goto nomem; + } + (void)closedir(dirp); +#endif /* !_WIN32 */ + + *namesp = names; + *cntp = cnt; + return (0); + +nomem: if (names != NULL) + __db_dirf(dbenv, names, cnt); + __db_err(dbenv, "%s", strerror(ENOMEM)); + return (ENOMEM); +} + +/* + * __db_dirf -- + * Free the list of files. + * + * PUBLIC: void __db_dirf __P((DB_ENV *, char **, int)); + */ +void +__db_dirf(dbenv, names, cnt) + DB_ENV *dbenv; + char **names; + int cnt; +{ + dbenv = dbenv; /* XXX: Shut the compiler up. */ + while (cnt > 0) + free(names[--cnt]); + free (names); +} diff --git a/db2/os/db_os_fid.c b/db2/os/db_os_fid.c new file mode 100644 index 0000000000..8fa55fa56c --- /dev/null +++ b/db2/os/db_os_fid.c @@ -0,0 +1,126 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_fid.c 10.7 (Sleepycat) 8/21/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "os_ext.h" +#include "common_ext.h" + +/* + * __db_fileid -- + * Return a unique identifier for a file. + * + * PUBLIC: int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); + */ +int +__db_fileid(dbenv, fname, timestamp, fidp) + DB_ENV *dbenv; + const char *fname; + int timestamp; + u_int8_t *fidp; +{ + time_t now; + u_int8_t *p; + unsigned int i; + +#ifdef _WIN32 + /* + * The documentation for GetFileInformationByHandle() states that the + * inode-type numbers are not constant between processes. Actually, + * they are, they're the NTFS MFT indexes. So, this works on NTFS, + * but perhaps not on other platforms, and perhaps not over a network. + * Can't think of a better solution right now. + */ + int fd = 0; + HANDLE fh = 0; + BY_HANDLE_FILE_INFORMATION fi; + BOOL retval = FALSE; + + /* Clear the buffer. */ + memset(fidp, 0, DB_FILE_ID_LEN); + + /* first we open the file, because we're not given a handle to it */ + fd = open(fname,_O_RDONLY,_S_IREAD); + if (-1 == fd) { + /* If we can't open it, we're in trouble */ + return (errno); + } + + /* File open, get its info */ + fh = (HANDLE)_get_osfhandle(fd); + if ((HANDLE)(-1) != fh) { + retval = GetFileInformationByHandle(fh,&fi); + } + close(fd); + + /* + * We want the three 32-bit words which tell us the volume ID and + * the file ID. We make a crude attempt to copy the bytes over to + * the callers buffer. + * + * DBDB: really we should ensure that the bytes get packed the same + * way on all compilers, platforms etc. 
+ */ + if ( ((HANDLE)(-1) != fh) && (TRUE == retval) ) { + memcpy(fidp, &fi.nFileIndexLow, sizeof(u_int32_t)); + fidp += sizeof(u_int32_t); + memcpy(fidp, &fi.nFileIndexHigh, sizeof(u_int32_t)); + fidp += sizeof(u_int32_t); + memcpy(fidp, &fi.dwVolumeSerialNumber, sizeof(u_int32_t)); + } +#else + struct stat sb; + + /* Clear the buffer. */ + memset(fidp, 0, DB_FILE_ID_LEN); + + /* Check for the unthinkable. */ + if (sizeof(sb.st_ino) + + sizeof(sb.st_dev) + sizeof(time_t) > DB_FILE_ID_LEN) + return (EINVAL); + + /* On UNIX, use a dev/inode pair. */ + if (stat(fname, &sb)) { + __db_err(dbenv, "%s: %s", fname, strerror(errno)); + return (errno); + } + + /* + * Use the inode first and in reverse order, hopefully putting the + * distinguishing information early in the string. + */ + for (p = (u_int8_t *)&sb.st_ino + + sizeof(sb.st_ino), i = 0; i < sizeof(sb.st_ino); ++i) + *fidp++ = *--p; + for (p = (u_int8_t *)&sb.st_dev + + sizeof(sb.st_dev), i = 0; i < sizeof(sb.st_dev); ++i) + *fidp++ = *--p; +#endif + if (timestamp) { + (void)time(&now); + for (p = (u_int8_t *)&now + + sizeof(now), i = 0; i < sizeof(now); ++i) + *fidp++ = *--p; + } + return (0); +} diff --git a/db2/os/db_os_lseek.c b/db2/os/db_os_lseek.c new file mode 100644 index 0000000000..cecf0e156b --- /dev/null +++ b/db2/os/db_os_lseek.c @@ -0,0 +1,60 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_lseek.c 10.3 (Sleepycat) 6/28/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_lseek -- + * Seek to a page/byte offset in the file. 
+ * + * PUBLIC: int __db_lseek __P((int, size_t, db_pgno_t, u_long, int)); + */ +int +__db_lseek(fd, pgsize, pageno, relative, whence) + int fd; + size_t pgsize; + db_pgno_t pageno; + u_long relative; + int whence; +{ + /* 64-bit offsets are done differently by different vendors. */ +#undef __LSEEK_SET +#ifdef HAVE_LLSEEK +#define __LSEEK_SET + offset_t offset; /* Solaris. */ + + offset = pgsize * pageno + relative; + return (llseek(fd, offset, whence) == -1 ? errno : 0); +#endif +#ifdef HAVE_LSEEKI +#define __LSEEK_SET + __int64 offset; /* WNT */ + + offset = pgsize * pageno + relative; + return (_lseeki64(fd, offset, whence) == -1 ? errno : 0); +#endif +#ifndef __LSEEK_SET + off_t offset; /* Default. */ + + offset = pgsize * pageno + relative; + return (lseek(fd, offset, whence) == -1 ? errno : 0); +#endif +} diff --git a/db2/os/db_os_mmap.c b/db2/os/db_os_mmap.c new file mode 100644 index 0000000000..0cd8fad0b0 --- /dev/null +++ b/db2/os/db_os_mmap.c @@ -0,0 +1,106 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_mmap.c 10.4 (Sleepycat) 6/28/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_mmap -- + * Map in some shared memory backed by a file descriptor. + * + * PUBLIC: int __db_mmap __P((int, size_t, int, int, void *)); + */ +int +__db_mmap(fd, len, is_private, rdonly, addr) + int fd, is_private, rdonly; + size_t len; + void *addr; +{ +#ifdef _WIN32 + /* We have not implemented copy-on-write here */ + void * pMemory = 0; + HANDLE hFile = (HANDLE)_get_osfhandle(fd); + HANDLE hMemory = CreateFileMapping( + hFile, + 0, + (rdonly ? 
PAGE_READONLY : PAGE_READWRITE), + 0, + len, /* This code fails if the library is ever compiled on a 64-bit machine */ + 0 + ); + if (NULL == hMemory) + { + return errno; + } + pMemory = MapViewOfFile( + hMemory, + (rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), + 0, + 0, + len + ); + CloseHandle(hMemory); + *(void **)addr = pMemory; + return 0; + +#else /* !_WIN32 */ + + void *p; + int flags, prot; + + flags = is_private ? MAP_PRIVATE : MAP_SHARED; +#ifdef MAP_HASSEMAPHORE + flags += MAP_HASSEMAPHORE; +#endif + prot = PROT_READ | (rdonly ? 0 : PROT_WRITE); + +#ifndef MAP_FAILED /* XXX: Mmap(2) failure return. */ +#define MAP_FAILED -1 +#endif + if ((p = + mmap(NULL, len, prot, flags, fd, (off_t)0)) == (void *)MAP_FAILED) + return (errno); + + *(void **)addr = p; + return (0); +#endif /* _WIN32 */ +} + +/* + * __db_unmap -- + * Release the specified shared memory. + * + * PUBLIC: int __db_munmap __P((void *, size_t)); + */ +int +__db_munmap(addr, len) + void *addr; + size_t len; +{ + /* + * !!! + * The argument len is always the same length as was mapped. + */ +#ifdef _WIN32 + return (!UnmapViewOfFile(addr) ? errno : 0); +#else + return (munmap(addr, len) ? errno : 0); +#endif +} diff --git a/db2/os/db_os_open.c b/db2/os/db_os_open.c new file mode 100644 index 0000000000..1d67ef9508 --- /dev/null +++ b/db2/os/db_os_open.c @@ -0,0 +1,147 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_open.c 10.14 (Sleepycat) 7/5/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_oflags -- + * Convert open(2) flags to DB flags. 
+ * + * PUBLIC: int __db_oflags __P((int)); + */ +int +__db_oflags(oflags) + int oflags; +{ + int dbflags; + + /* + * XXX + * Convert POSIX 1003.1 open(2) flags to DB flags. Not an exact + * science as most POSIX implementations don't have a flag value + * for O_RDONLY, it's simply the lack of a write flag. + */ + dbflags = 0; + if (oflags & O_CREAT) + dbflags |= DB_CREATE; + if (!(oflags & (O_RDWR | O_WRONLY)) || oflags & O_RDONLY) + dbflags |= DB_RDONLY; + if (oflags & O_TRUNC) + dbflags |= DB_TRUNCATE; + return (dbflags); +} + +/* + * __db_fdopen -- + * Open a file descriptor. + * + * PUBLIC: int __db_fdopen __P((const char *, int, int, int, int *)); + */ +int +__db_fdopen(name, arg_flags, ok_flags, mode, fdp) + const char *name; + int arg_flags, ok_flags, mode, *fdp; +{ + int fd, flags; + + if (arg_flags & ~ok_flags) + return (EINVAL); + + flags = 0; + if (arg_flags & DB_CREATE) + flags |= O_CREAT; + + if (arg_flags & DB_EXCL) + flags |= O_EXCL; + + if (arg_flags & DB_RDONLY) + flags |= O_RDONLY; + else + flags |= O_RDWR; + +#ifdef _WIN32 +#ifdef _MSC_VER + if (arg_flags & DB_SEQUENTIAL) + flags |= _O_SEQUENTIAL; + else + flags |= _O_RANDOM; + + if (arg_flags & DB_TEMPORARY) + flags |= _O_TEMPORARY; +#endif + flags |= O_BINARY | O_NOINHERIT; +#endif + + if (arg_flags & DB_TRUNCATE) + flags |= O_TRUNC; + + /* Open the file. */ + if ((fd = open(name, flags, mode)) == -1) + return (errno); + +#ifndef _WIN32 + /* Delete any temporary file; done for Win32 by _O_TEMPORARY. */ + if (arg_flags & DB_TEMPORARY) + (void)unlink(name); +#endif + +#if !defined(_WIN32) && !defined(macintosh) + /* + * Deny access to any child process; done for Win32 by O_NOINHERIT, + * MacOS has neither child processes nor fd inheritance. + */ + if (fcntl(fd, F_SETFD, 1) == -1) { + int ret = errno; + + (void)__db_close(fd); + return (ret); + } +#endif + *fdp = fd; + return (0); +} + +/* + * __db_fsync -- + * Flush a file descriptor. 
+ * + * PUBLIC: int __db_fsync __P((int)); + */ +int +__db_fsync(fd) + int fd; +{ + return (fsync(fd) ? errno : 0); +} + +/* + * __db_close -- + * Close a file descriptor. + * + * PUBLIC: int __db_close __P((int)); + */ +int +__db_close(fd) + int fd; +{ + return (close(fd) ? errno : 0); +} diff --git a/db2/os/db_os_rw.c b/db2/os/db_os_rw.c new file mode 100644 index 0000000000..5a6c2196fd --- /dev/null +++ b/db2/os/db_os_rw.c @@ -0,0 +1,75 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_rw.c 10.4 (Sleepycat) 6/28/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_read -- + * Read from a file handle. + * + * PUBLIC: int __db_read __P((int, void *, size_t, ssize_t *)); + */ +int +__db_read(fd, addr, len, nrp) + int fd; + void *addr; + size_t len; + ssize_t *nrp; +{ + size_t offset; + ssize_t nr; + u_int8_t *taddr; + + for (taddr = addr, + offset = 0; offset < len; taddr += nr, offset += nr) { + if ((nr = read(fd, taddr, len - offset)) < 0) + return (errno); + if (nr == 0) + break; + } + *nrp = taddr - (u_int8_t *)addr; + return (0); +} + +/* + * __db_write -- + * Write to a file handle. 
+ * + * PUBLIC: int __db_write __P((int, void *, size_t, ssize_t *)); + */ +int +__db_write(fd, addr, len, nwp) + int fd; + void *addr; + size_t len; + ssize_t *nwp; +{ + size_t offset; + ssize_t nw; + u_int8_t *taddr; + + for (taddr = addr, + offset = 0; offset < len; taddr += nw, offset += nw) + if ((nw = write(fd, taddr, len - offset)) < 0) + return (errno); + *nwp = len; + return (0); +} diff --git a/db2/os/db_os_sleep.c b/db2/os/db_os_sleep.c new file mode 100644 index 0000000000..5591789f51 --- /dev/null +++ b/db2/os/db_os_sleep.c @@ -0,0 +1,62 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_sleep.c 10.6 (Sleepycat) 6/28/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif + +#include <errno.h> +#ifndef HAVE_SYS_TIME_H +#include <time.h> +#endif +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_sleep -- + * Yield the processor for a period of time. + * + * PUBLIC: int __db_sleep __P((u_long, u_long)); + */ +int +__db_sleep(secs, usecs) + u_long secs, usecs; /* Seconds and microseconds. */ +{ +#ifndef _WIN32 + struct timeval t; +#endif + + /* Don't require that the values be normalized. */ + for (; usecs >= 1000000; ++secs, usecs -= 1000000); + + /* + * It's important that we yield the processor here so that other + * processes or threads are permitted to run. + */ +#ifdef _WIN32 + Sleep(secs * 1000 + usecs / 1000); + return (0); +#else + t.tv_sec = secs; + t.tv_usec = usecs; + return (select(0, NULL, NULL, NULL, &t) == -1 ? 
errno : 0); +#endif +} diff --git a/db2/os/db_os_stat.c b/db2/os/db_os_stat.c new file mode 100644 index 0000000000..7929b6b754 --- /dev/null +++ b/db2/os/db_os_stat.c @@ -0,0 +1,84 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_stat.c 10.6 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "os_ext.h" +#include "common_ext.h" + +/* + * __db_exists -- + * Return if the file exists. + * + * PUBLIC: int __db_exists __P((const char *, int *)); + */ +int +__db_exists(path, isdirp) + const char *path; + int *isdirp; +{ + struct stat sb; + + if (stat(path, &sb) != 0) + return (errno); + if (isdirp != NULL) + *isdirp = S_ISDIR(sb.st_mode); + return (0); +} + +/* + * __db_stat -- + * Return file size and I/O size; abstracted to make it easier + * to replace. + * + * PUBLIC: int __db_stat __P((DB_ENV *, const char *, int, off_t *, off_t *)); + */ +int +__db_stat(dbenv, path, fd, sizep, iop) + DB_ENV *dbenv; + const char *path; + int fd; + off_t *sizep, *iop; +{ + struct stat sb; + + if (fstat(fd, &sb) == -1) { + __db_err(dbenv, "%s: fstat: %s", path, strerror(errno)); + return (errno); + } + + /* Return the size of the file. */ + if (sizep != NULL) + *sizep = sb.st_size; + + /* + * Return the underlying filesystem blocksize, if available. Default + * to 8K on the grounds that most OS's use less than 8K as their VM + * page size. 
+ */ +#ifdef HAVE_ST_BLKSIZE + if (iop != NULL) + *iop = sb.st_blksize; +#else + if (iop != NULL) + *iop = 8 * 1024; +#endif + return (0); +} diff --git a/db2/os/db_os_unlink.c b/db2/os/db_os_unlink.c new file mode 100644 index 0000000000..872beba3cf --- /dev/null +++ b/db2/os/db_os_unlink.c @@ -0,0 +1,35 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_os_unlink.c 10.2 (Sleepycat) 6/28/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "os_ext.h" + +/* + * __db_unlink -- + * Remove a file. + * + * PUBLIC: int __db_unlink __P((const char *)); + */ +int +__db_unlink(path) + const char *path; +{ + return (unlink(path) == -1 ? errno : 0); +} diff --git a/db2/progs/db_archive/db_archive.c b/db2/progs/db_archive/db_archive.c new file mode 100644 index 0000000000..136cf2c360 --- /dev/null +++ b/db2/progs/db_archive/db_archive.c @@ -0,0 +1,165 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. 
All rights reserved.\n"; +static const char sccsid[] = "@(#)db_archive.c 10.12 (Sleepycat) 7/25/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <signal.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "log.h" +#include "db_dispatch.h" +#include "clib_ext.h" +#include "common_ext.h" + +DB_ENV *db_init __P((char *, int)); +void onint __P((int)); +void siginit __P((void)); +void usage __P((void)); +int main __P((int, char *[])); + +int interrupted; +const char *progname = "db_archive"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB_ENV *dbenv; + int ch, flags, verbose; + char *home, **list; + + flags = verbose = 0; + home = NULL; + while ((ch = getopt(argc, argv, "ah:lsv")) != EOF) + switch (ch) { + case 'a': + flags |= DB_ARCH_ABS; + break; + case 'h': + home = optarg; + break; + case 'l': + flags |= DB_ARCH_LOG; + break; + case 's': + flags |= DB_ARCH_DATA; + break; + case 'v': + verbose = 1; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 0) + usage(); + + /* Initialize the environment. */ + dbenv = db_init(home, verbose); + + /* Get the list of names. */ + if ((errno = log_archive(dbenv->lg_info, &list, flags, NULL)) != 0) { + (void)db_appexit(dbenv); + err(1, "log_archive"); + } + + /* Print the names. */ + if (list != NULL) + for (; *list != NULL; ++list) + printf("%s\n", *list); + + return (db_appexit(dbenv) ? 1 : 0); +} + +/* + * db_init -- + * Initialize the environment. 
+ */ +DB_ENV * +db_init(home, verbose) + char *home; + int verbose; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + dbenv->db_verbose = verbose; + + if ((errno = db_appinit(home, NULL, dbenv, + DB_CREATE | DB_INIT_LOG | DB_INIT_TXN | DB_USE_ENVIRON)) != 0) + err(1, "db_appinit"); + + siginit(); + + return (dbenv); +} + +/* + * siginit -- + * Initialize the set of signals for which we want to clean up. + * Generally, we try not to leave the shared regions locked if + * we can. + */ +void +siginit() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, onint); +#endif + (void)signal(SIGINT, onint); +#ifdef SIGKILL + (void)signal(SIGKILL, onint); +#endif + (void)signal(SIGTERM, onint); +} + +/* + * oninit -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + if ((interrupted = signo) == 0) + interrupted = SIGINT; +} + +void +usage() +{ + (void)fprintf(stderr, "usage: db_archive [-alsv] [-h home]\n"); + exit(1); +} diff --git a/db2/progs/db_checkpoint/db_checkpoint.c b/db2/progs/db_checkpoint/db_checkpoint.c new file mode 100644 index 0000000000..586b4b9686 --- /dev/null +++ b/db2/progs/db_checkpoint/db_checkpoint.c @@ -0,0 +1,246 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. 
All rights reserved.\n"; +static const char sccsid[] = "@(#)db_checkpoint.c 10.9 (Sleepycat) 7/4/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "log.h" +#include "btree.h" +#include "hash.h" +#include "clib_ext.h" +#include "common_ext.h" + +char *check __P((DB_ENV *, long, long)); +int checkpoint __P((DB_ENV *, char *, int)); +DB_ENV *db_init __P((char *)); +int logpid __P((char *, int)); +void onint __P((int)); +void siginit __P((void)); +void usage __P((void)); +int main __P((int, char *[])); + +int interrupted; +time_t now; /* Checkpoint time. */ +const char *progname = "db_checkpoint"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB_ENV *dbenv; + time_t now; + long kbytes, minutes, seconds; + int ch, rval, verbose; + char *home, *logfile; + + home = logfile = NULL; + kbytes = minutes = 0; + verbose = 0; + while ((ch = getopt(argc, argv, "h:k:L:p:v")) != EOF) + switch (ch) { + case 'h': + home = optarg; + break; + case 'k': + get_long(optarg, 1, LONG_MAX, &kbytes); + break; + case 'L': + logfile = optarg; + break; + case 'p': + get_long(optarg, 1, LONG_MAX, &minutes); + break; + case 'v': + verbose = 1; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 0) + usage(); + + if (kbytes == 0 && minutes == 0) { + warnx("at least one of -k and -p must be specified"); + usage(); + } + + /* Initialize the environment. */ + dbenv = db_init(home); + + if (logfile != NULL && logpid(logfile, 1)) { + (void)db_appexit(dbenv); + return (1); + } + + /* + * If we have only a time delay, then we'll sleep the right amount + * to wake up when a checkpoint is necessary. 
If we have a "kbytes" + * field set, then we'll check every 30 seconds. + */ + rval = 0; + seconds = kbytes != 0 ? 30 : minutes * 60; + while (!interrupted) { + (void)__db_sleep(seconds, 0); + + if (verbose) { + (void)time(&now); + printf("checkpoint: %s", ctime(&now)); + } + rval = txn_checkpoint(dbenv->tx_info, kbytes, minutes); + if (rval < 0) + break; + + while (rval > 0) { + if (verbose) + __db_err(dbenv, + "checkpoint did not finish, retrying"); + (void)__db_sleep(2, 0); + rval = txn_checkpoint(dbenv->tx_info, 0, 0); + } + if (rval < 0) + break; + } + + if (logfile != NULL && logpid(logfile, 0)) + rval = 1; + + if (interrupted) { + (void)signal(interrupted, SIG_DFL); + (void)raise(interrupted); + /* NOTREACHED */ + } + + return (db_appexit(dbenv) || rval ? 1 : 0); +} + +/* + * db_init -- + * Initialize the environment. + */ +DB_ENV * +db_init(home) + char *home; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + + if ((errno = db_appinit(home, NULL, dbenv, + DB_INIT_LOG | DB_INIT_TXN | DB_INIT_MPOOL | DB_USE_ENVIRON)) != 0) + err(1, "db_appinit"); + + if (memp_register(dbenv->mp_info, + DB_FTYPE_BTREE, __bam_pgin, __bam_pgout) || + memp_register(dbenv->mp_info, + DB_FTYPE_HASH, __ham_pgin, __ham_pgout)) { + (void)db_appexit(dbenv); + errx(1, + "db_appinit: failed to register access method functions"); + } + + siginit(); + + return (dbenv); +} + +/* + * logpid -- + * Log that we're running. + */ +int +logpid(fname, is_open) + char *fname; + int is_open; +{ + FILE *fp; + time_t now; + + if (is_open) { + if ((fp = fopen(fname, "w")) == NULL) { + warn("%s", fname); + return (1); + } + (void)time(&now); + fprintf(fp, + "%s: %lu %s", progname, (u_long)getpid(), ctime(&now)); + fclose(fp); + } else + (void)remove(fname); + return (0); +} + +/* + * siginit -- + * Initialize the set of signals for which we want to clean up. 
+ * Generally, we try not to leave the shared regions locked if + * we can. + */ +void +siginit() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, onint); +#endif + (void)signal(SIGINT, onint); +#ifdef SIGKILL + (void)signal(SIGKILL, onint); +#endif + (void)signal(SIGTERM, onint); +} + +/* + * oninit -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + if ((interrupted = signo) == 0) + interrupted = SIGINT; +} + +void +usage() +{ + (void)fprintf(stderr, + "usage: db_checkpoint [-v] [-h home] [-k kbytes] [-L file] [-p min]\n"); + exit(1); +} diff --git a/db2/progs/db_deadlock/db_deadlock.c b/db2/progs/db_deadlock/db_deadlock.c new file mode 100644 index 0000000000..9437e3552d --- /dev/null +++ b/db2/progs/db_deadlock/db_deadlock.c @@ -0,0 +1,236 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_deadlock.c 10.13 (Sleepycat) 7/20/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "clib_ext.h" +#include "common_ext.h" + +#define BAD_KILLID 0xffffffff + +DB_ENV *db_init __P((char *, int)); +void onint __P((int)); +void siginit __P((void)); +void usage __P((void)); +int logpid __P((char *, int)); +int main __P((int, char *[])); + +int interrupted; +const char *progname = "db_deadlock"; /* Program name. 
*/ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB_ENV *dbenv; + u_int32_t atype; + time_t now; + long seconds; + int ch, flags, verbose; + char *home, *logfile; + + atype = DB_LOCK_DEFAULT; + home = logfile = NULL; + seconds = 0; + flags = verbose = 0; + while ((ch = getopt(argc, argv, "a:h:L:t:vw")) != EOF) + switch (ch) { + case 'a': + switch (optarg[0]) { + case 'o': + atype = DB_LOCK_OLDEST; + break; + case 'y': + atype = DB_LOCK_YOUNGEST; + break; + default: + usage(); + /* NOTREACHED */ + } + if (optarg[1] != '\0') + usage(); + break; + case 'h': + home = optarg; + break; + case 'L': + logfile = optarg; + break; + case 't': + get_long(optarg, 1, LONG_MAX, &seconds); + break; + case 'v': + verbose = 1; + break; + case 'w': + LF_SET(DB_LOCK_CONFLICT); + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 0) + usage(); + + if (seconds == 0 && !LF_ISSET(DB_LOCK_CONFLICT)) { + warnx("at least one of -t and -w must be specified"); + usage(); + } + + /* + * We detect every second when we're running in DB_LOCK_CONFLICT mode. + */ + if (seconds == 0) + seconds = 1; + + /* Initialize the deadlock detector by opening the lock manager. */ + dbenv = db_init(home, verbose); + + if (logfile != NULL && logpid(logfile, 1)) { + (void)db_appexit(dbenv); + return (1); + } + + while (!interrupted) { + if (dbenv->db_verbose != 0) { + time(&now); + __db_err(dbenv, "Running at %s", ctime(&now)); + } + + if ((errno = lock_detect(dbenv->lk_info, flags, atype)) != 0) + break; + + /* Make a pass every "seconds" seconds. 
*/ + (void)__db_sleep(seconds, 0); + } + + if (logfile != NULL) + (void)logpid(logfile, 0); + + if (interrupted) { + (void)signal(interrupted, SIG_DFL); + (void)raise(interrupted); + /* NOTREACHED */ + } + + return (db_appexit(dbenv)); +} + +DB_ENV * +db_init(home, verbose) + char *home; + int verbose; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + dbenv->db_verbose = verbose; + + if ((errno = db_appinit(home, + NULL, dbenv, DB_INIT_LOCK | DB_USE_ENVIRON)) != 0) + err(1, "db_appinit"); + + siginit(); + + return (dbenv); +} + +/* + * logpid -- + * Log that we're running. + */ +int +logpid(fname, is_open) + char *fname; + int is_open; +{ + FILE *fp; + time_t now; + + if (is_open) { + if ((fp = fopen(fname, "w")) == NULL) { + warn("%s", fname); + return (1); + } + (void)time(&now); + fprintf(fp, + "%s: %lu %s", progname, (u_long)getpid(), ctime(&now)); + fclose(fp); + } else + (void)remove(fname); + return (0); +} + +/* + * siginit -- + * Initialize the set of signals for which we want to clean up. + * Generally, we try not to leave the shared regions locked if + * we can. + */ +void +siginit() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, onint); +#endif + (void)signal(SIGINT, onint); +#ifdef SIGKILL + (void)signal(SIGKILL, onint); +#endif + (void)signal(SIGTERM, onint); +} + +/* + * oninit -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + if ((interrupted = signo) == 0) + interrupted = SIGINT; +} + +void +usage() +{ + (void)fprintf(stderr, + "usage: db_deadlock [-vw] [-a m | o | y] [-h home] [-L file] [-t sec]\n"); + exit(1); +} diff --git a/db2/progs/db_dump/db_dump.c b/db2/progs/db_dump/db_dump.c new file mode 100644 index 0000000000..d60aa9b5c9 --- /dev/null +++ b/db2/progs/db_dump/db_dump.c @@ -0,0 +1,280 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_dump.c 10.13 (Sleepycat) 8/19/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "hash.h" +#include "clib_ext.h" + +void configure __P((char *)); +DB_ENV *db_init __P((char *)); +void dbt_dump __P((DBT *)); +void dbt_print __P((DBT *)); +void pheader __P((DB *, int)); +void usage __P((void)); +int main __P((int, char *[])); + +const char *progname = "db_dump"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB *dbp; + DBC *dbcp; + DBT key, data; + DB_ENV *dbenv; + int ch, dflag, pflag; + char *home; + + home = NULL; + dflag = pflag = 0; + while ((ch = getopt(argc, argv, "df:h:p")) != EOF) + switch (ch) { + case 'd': + dflag = 1; + break; + case 'f': + if (freopen(optarg, "w", stdout) == NULL) + err(1, "%s", optarg); + break; + case 'h': + home = optarg; + break; + case 'p': + pflag = 1; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + + if (dflag) { + if (home != NULL) + errx(1, + "the -d and -h options may not both be specified"); + if (pflag) + errx(1, + "the -d and -p options may not both be specified"); + } + /* Initialize the environment. */ + dbenv = dflag ? NULL : db_init(home); + + /* Open the DB file. */ + if ((errno = + db_open(argv[0], DB_UNKNOWN, DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0) + err(1, "%s", argv[0]); + + /* DB dump. 
*/ + if (dflag) { + (void)__db_dump(dbp, NULL, 1); + if ((errno = dbp->close(dbp, 0)) != 0) + err(1, "close"); + exit (0); + } + + /* Get a cursor and step through the database. */ + if ((errno = dbp->cursor(dbp, NULL, &dbcp)) != 0) { + (void)dbp->close(dbp, 0); + err(1, "cursor"); + } + + /* Print out the header. */ + pheader(dbp, pflag); + + /* Print out the key/data pairs. */ + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + if (pflag) + while ((errno = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) { + if (dbp->type != DB_RECNO) + dbt_print(&key); + dbt_print(&data); + } + else + while ((errno = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) { + if (dbp->type != DB_RECNO) + dbt_dump(&key); + dbt_dump(&data); + } + if (errno != DB_NOTFOUND) + err(1, "cursor get"); + + if ((errno = dbp->close(dbp, 0)) != 0) + err(1, "close"); + return (0); +} + +/* + * db_init -- + * Initialize the environment. + */ +DB_ENV * +db_init(home) + char *home; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + + if ((errno = + db_appinit(home, NULL, dbenv, DB_CREATE | DB_USE_ENVIRON)) != 0) + err(1, "db_appinit"); + return (dbenv); +} + +/* + * pheader -- + * Write out the header information. + */ +void +pheader(dbp, pflag) + DB *dbp; + int pflag; +{ + DB_BTREE_STAT *btsp; + HTAB *hashp; + HASHHDR *hdr; + db_pgno_t pgno; + + printf("format=%s\n", pflag ?
"print" : "bytevalue"); + switch (dbp->type) { + case DB_BTREE: + printf("type=btree\n"); + if ((errno = dbp->stat(dbp, &btsp, NULL, 0)) != 0) + err(1, "dbp->stat"); + if (F_ISSET(dbp, DB_BT_RECNUM)) + printf("recnum=1\n"); + if (btsp->bt_maxkey != 0) + printf("bt_maxkey=%lu\n", (u_long)btsp->bt_maxkey); + if (btsp->bt_minkey != 0) + printf("bt_minkey=%lu\n", (u_long)btsp->bt_minkey); + break; + case DB_HASH: + printf("type=hash\n"); + hashp = dbp->internal; + pgno = PGNO_METADATA; + if (memp_fget(dbp->mpf, &pgno, 0, &hdr) == 0) { + if (hdr->ffactor != 0) + printf("h_ffactor=%lu\n", (u_long)hdr->ffactor); + if (hdr->nelem != 0) + printf("h_nelem=%lu\n", (u_long)hdr->nelem); + (void)memp_fput(dbp->mpf, hdr, 0); + } + break; + case DB_RECNO: + printf("type=recno\n"); + /* + * btsp is only filled in by dbp->stat(); fetch it here as well + * before bt_re_len and bt_re_pad are read through it. + */ + if ((errno = dbp->stat(dbp, &btsp, NULL, 0)) != 0) + err(1, "dbp->stat"); + if (F_ISSET(dbp, DB_RE_RENUMBER)) + printf("renumber=1\n"); + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) + printf("re_len=%lu\n", (u_long)btsp->bt_re_len); + if (F_ISSET(dbp, DB_RE_PAD)) + printf("re_pad=%#x\n", btsp->bt_re_pad); + break; + case DB_UNKNOWN: + abort(); + /* NOTREACHED */ + } + + if (F_ISSET(dbp, DB_AM_DUP)) + printf("duplicates=1\n"); + + if (dbp->dbenv->db_lorder != 0) + printf("db_lorder=%lu\n", (u_long)dbp->dbenv->db_lorder); + + if (!F_ISSET(dbp, DB_AM_PGDEF)) + printf("db_pagesize=%lu\n", (u_long)dbp->pgsize); + + printf("HEADER=END\n"); +} + +static char hex[] = "0123456789abcdef"; + +/* + * dbt_dump -- + * Write out a key or data item using byte values. + */ +void +dbt_dump(dbtp) + DBT *dbtp; +{ + u_int32_t len; + u_int8_t *p; + + for (len = dbtp->size, p = dbtp->data; len--; ++p) + (void)printf("%c%c", + hex[(u_int8_t)(*p & 0xf0) >> 4], hex[*p & 0x0f]); + printf("\n"); +} + +/* + * dbt_print -- + * Write out a key or data item using printable characters.
+ */ +void +dbt_print(dbtp) + DBT *dbtp; +{ + u_int32_t len; + u_int8_t *p; + + for (len = dbtp->size, p = dbtp->data; len--; ++p) + if (isprint(*p)) { + if (*p == '\\') + (void)printf("\\"); + (void)printf("%c", *p); + } else + (void)printf("\\%c%c", + hex[(u_int8_t)(*p & 0xf0) >> 4], hex[*p & 0x0f]); + printf("\n"); +} + +/* + * usage -- + * Display the usage message. + */ +void +usage() +{ + (void)fprintf(stderr, + "usage: db_dump [-dp] [-f file] [-h home] db_file\n"); + exit(1); +} diff --git a/db2/progs/db_dump185/db_dump185.c b/db2/progs/db_dump185/db_dump185.c new file mode 100644 index 0000000000..f3c1187e45 --- /dev/null +++ b/db2/progs/db_dump185/db_dump185.c @@ -0,0 +1,322 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_dump185.c 10.5 (Sleepycat) 7/2/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_185.h" +#include "clib_ext.h" + +/* Hash Table Information */ +typedef struct hashhdr { /* Disk resident portion */ + int magic; /* Magic NO for hash tables */ + int version; /* Version ID */ + u_int32_t lorder; /* Byte Order */ + int bsize; /* Bucket/Page Size */ + int bshift; /* Bucket shift */ + int dsize; /* Directory Size */ + int ssize; /* Segment Size */ + int sshift; /* Segment shift */ + int ovfl_point; /* Where overflow pages are being + * allocated */ + int last_freed; /* Last overflow page freed */ + int max_bucket; /* ID of Maximum bucket in use */ + int high_mask; /* Mask to modulo into entire table */ + int low_mask; /* Mask to modulo into lower half of + * table */ + int 
ffactor; /* Fill factor */ + int nkeys; /* Number of keys in hash table */ +} HASHHDR; + +typedef struct htab { /* Memory resident data structure */ + HASHHDR hdr; /* Header */ +} HTAB; + +typedef struct _epgno { + u_int32_t pgno; /* the page number */ + u_int16_t index; /* the index on the page */ +} EPGNO; + +typedef struct _epg { + void *page; /* the (pinned) page */ + u_int16_t index; /* the index on the page */ +} EPG; + +typedef struct _cursor { + EPGNO pg; /* B: Saved tree reference. */ + DBT key; /* B: Saved key, or key.data == NULL. */ + u_int32_t rcursor; /* R: recno cursor (1-based) */ + +#define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */ +#define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */ +#define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */ +#define CURS_INIT 0x08 /* RB: Cursor initialized. */ + u_int8_t flags; +} CURSOR; + +/* The in-memory btree/recno data structure. */ +typedef struct _btree { + void *bt_mp; /* memory pool cookie */ + + void *bt_dbp; /* pointer to enclosing DB */ + + EPG bt_cur; /* current (pinned) page */ + void *bt_pinned; /* page pinned across calls */ + + CURSOR bt_cursor; /* cursor */ + + EPGNO bt_stack[50]; /* stack of parent pages */ + EPGNO *bt_sp; /* current stack pointer */ + + DBT bt_rkey; /* returned key */ + DBT bt_rdata; /* returned data */ + + int bt_fd; /* tree file descriptor */ + + u_int32_t bt_free; /* next free page */ + u_int32_t bt_psize; /* page size */ + u_int16_t bt_ovflsize; /* cut-off for key/data overflow */ + int bt_lorder; /* byte order */ + /* sorted order */ + enum { NOT, BACK, FORWARD } bt_order; + EPGNO bt_last; /* last insert */ + + /* B: key comparison function */ + int (*bt_cmp) __P((const DBT *, const DBT *)); + /* B: prefix comparison function */ + size_t (*bt_pfx) __P((const DBT *, const DBT *)); + /* R: recno input function */ + int (*bt_irec) __P((struct _btree *, u_int32_t)); + + FILE *bt_rfp; /* R: record FILE pointer */ + int bt_rfd; /* R: record file 
descriptor */ + + void *bt_cmap; /* R: current point in mapped space */ + void *bt_smap; /* R: start of mapped space */ + void *bt_emap; /* R: end of mapped space */ + size_t bt_msize; /* R: size of mapped region. */ + + u_int32_t bt_nrecs; /* R: number of records */ + size_t bt_reclen; /* R: fixed record length */ + u_char bt_bval; /* R: delimiting byte/pad character */ + +/* + * NB: + * B_NODUPS and R_RECNO are stored on disk, and may not be changed. + */ +#define B_INMEM 0x00001 /* in-memory tree */ +#define B_METADIRTY 0x00002 /* need to write metadata */ +#define B_MODIFIED 0x00004 /* tree modified */ +#define B_NEEDSWAP 0x00008 /* if byte order requires swapping */ +#define B_RDONLY 0x00010 /* read-only tree */ + +#define B_NODUPS 0x00020 /* no duplicate keys permitted */ +#define R_RECNO 0x00080 /* record oriented tree */ + +#define R_CLOSEFP 0x00040 /* opened a file pointer */ +#define R_EOF 0x00100 /* end of input file reached. */ +#define R_FIXLEN 0x00200 /* fixed length records */ +#define R_MEMMAPPED 0x00400 /* memory mapped file. */ +#define R_INMEM 0x00800 /* in-memory file */ +#define R_MODIFIED 0x01000 /* modified file */ +#define R_RDONLY 0x02000 /* read-only file */ + +#define B_DB_LOCK 0x04000 /* DB_LOCK specified. */ +#define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */ +#define B_DB_TXN 0x10000 /* DB_TXN specified. */ + u_int32_t flags; +} BTREE; + +void db_185_btree __P((DB *, int)); +void db_185_hash __P((DB *, int)); +void dbt_dump __P((DBT *)); +void dbt_print __P((DBT *)); +void usage __P((void)); +int main __P((int, char *[])); + +const char *progname = "db_dump185"; /* Program name. 
*/ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB *dbp; + DBT key, data; + int ch, pflag, rval; + + pflag = 0; + while ((ch = getopt(argc, argv, "f:p")) != EOF) + switch (ch) { + case 'f': + if (freopen(optarg, "w", stdout) == NULL) + err(1, "%s", optarg); + break; + case 'p': + pflag = 1; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + + if ((dbp = dbopen(argv[0], O_RDONLY, 0, DB_BTREE, NULL)) == NULL) { + /* Neither access method could open the file: report why instead of exiting silently. */ + if ((dbp = dbopen(argv[0], O_RDONLY, 0, DB_HASH, NULL)) == NULL) + err(1, "%s", argv[0]); + db_185_hash(dbp, pflag); + } else + db_185_btree(dbp, pflag); + + /* + * !!! + * DB 1.85 DBTs are a subset of DB 2.0 DBTs, so we just use the + * new dump/print routines. + */ + if (pflag) + while (!(rval = dbp->seq(dbp, &key, &data, R_NEXT))) { + dbt_print(&key); + dbt_print(&data); + } + else + while (!(rval = dbp->seq(dbp, &key, &data, R_NEXT))) { + dbt_dump(&key); + dbt_dump(&data); + } + + if (rval == -1) + err(1, "seq"); + return (0); +} + +/* + * db_185_hash -- + * Dump out hash header information. + */ +void +db_185_hash(dbp, pflag) + DB *dbp; + int pflag; +{ + HTAB *hashp; + + hashp = dbp->internal; + + printf("format=%s\n", pflag ? "print" : "bytevalue"); + printf("type=hash\n"); + printf("h_ffactor=%lu\n", (u_long)hashp->hdr.ffactor); +#ifdef NOT_AVAILABLE_IN_DB_185 + printf("h_nelem=%lu\n", (u_long)hashp->hdr.nelem); +#endif + if (hashp->hdr.lorder != 0) + printf("db_lorder=%lu\n", (u_long)hashp->hdr.lorder); + printf("db_pagesize=%lu\n", (u_long)hashp->hdr.bsize); + printf("HEADER=END\n"); +} + +/* + * db_185_btree -- + * Dump out btree header information. + */ +void +db_185_btree(dbp, pflag) + DB *dbp; + int pflag; +{ + BTREE *btp; + + btp = dbp->internal; + + printf("format=%s\n", pflag ?
"print" : "bytevalue"); + printf("type=btree\n"); +#ifdef NOT_AVAILABLE_IN_185 + printf("bt_minkey=%lu\n", (u_long)XXX); + printf("bt_maxkey=%lu\n", (u_long)XXX); +#endif + if (btp->bt_lorder != 0) + printf("db_lorder=%lu\n", (u_long)btp->bt_lorder); + printf("db_pagesize=%lu\n", (u_long)btp->bt_psize); + if (!(btp->flags & B_NODUPS)) + printf("duplicates=1\n"); + printf("HEADER=END\n"); +} + +static char hex[] = "0123456789abcdef"; + +/* + * dbt_dump -- + * Write out a key or data item using byte values. + */ +void +dbt_dump(dbtp) + DBT *dbtp; +{ + size_t len; + u_int8_t *p; + + for (len = dbtp->size, p = dbtp->data; len--; ++p) + (void)printf("%c%c", + hex[(*p & 0xf0) >> 4], hex[*p & 0x0f]); + printf("\n"); +} + +/* + * dbt_print -- + * Write out a key or data item using printable characters. + */ +void +dbt_print(dbtp) + DBT *dbtp; +{ + size_t len; + u_int8_t *p; + + for (len = dbtp->size, p = dbtp->data; len--; ++p) + if (isprint(*p)) { + if (*p == '\\') + (void)printf("\\"); + (void)printf("%c", *p); + } else + (void)printf("\\%c%c", + hex[(*p & 0xf0) >> 4], hex[*p & 0x0f]); + printf("\n"); +} + +/* + * usage -- + * Display the usage message (for db_dump185, not db_dump). + */ +void +usage() +{ + (void)fprintf(stderr, "usage: db_dump185 [-p] [-f file] db_file\n"); + exit(1); +} diff --git a/db2/progs/db_load/db_load.c b/db2/progs/db_load/db_load.c new file mode 100644 index 0000000000..cc90e7bd27 --- /dev/null +++ b/db2/progs/db_load/db_load.c @@ -0,0 +1,457 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc.
All rights reserved.\n"; +static const char sccsid[] = "@(#)db_load.c 10.9 (Sleepycat) 8/19/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <getopt.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "clib_ext.h" + +void badnum __P((void)); +void configure __P((DB_INFO *, char **)); +DB_ENV *db_init __P((char *)); +int dbt_rdump __P((DBT *)); +int dbt_rprint __P((DBT *)); +int digitize __P((int)); +void rheader __P((DBTYPE *, int *, DB_INFO *)); +void usage __P((void)); +int main __P((int, char *[])); + +const char *progname = "db_load"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB *dbp; + DBT key, data; + DBTYPE argtype, headertype; + DB_ENV *dbenv; + DB_INFO dbinfo; + db_recno_t recno; + int ch, pflag; + char **clist, **clp, *home; + + /* Allocate enough room for configuration arguments. */ + if ((clp = clist = calloc(argc + 1, sizeof(char *))) == NULL) + err(1, NULL); + + home = NULL; + argtype = DB_UNKNOWN; + while ((ch = getopt(argc, argv, "c:f:h:t:")) != EOF) + switch (ch) { + case 'c': + *clp++ = optarg; + break; + case 'f': + if (freopen(optarg, "r", stdin) == NULL) + err(1, "%s", optarg); + break; + case 'h': + home = optarg; + break; + case 't': + if (strcmp(optarg, "btree") == 0) { + argtype = DB_BTREE; + break; + } + if (strcmp(optarg, "hash") == 0) { + argtype = DB_HASH; + break; + } + usage(); + /* NOTREACHED */ + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + + /* Initialize the environment. */ + dbenv = db_init(home); + memset(&dbinfo, 0, sizeof(DB_INFO)); + + /* Read the header. */ + rheader(&headertype, &pflag, &dbinfo); + + /* Apply command-line configuration changes. */ + configure(&dbinfo, clist); + + /* Conversion to/from recno is prohibited. 
*/ + if (argtype != DB_UNKNOWN) { + if (headertype == DB_RECNO) + errx(1, "databases of type recno may not be converted"); + headertype = argtype; + } + + /* Open the DB file. */ + if ((errno = db_open(argv[0], headertype, DB_CREATE | DB_TRUNCATE, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, + dbenv, &dbinfo, &dbp)) != 0) + err(1, "%s", argv[0]); + + /* Initialize the key/data pair. */ + memset(&key, 0, sizeof(DBT)); + if ((key.data = (void *)malloc(key.ulen = 1024)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + memset(&data, 0, sizeof(DBT)); + if ((data.data = (void *)malloc(data.ulen = 1024)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + + /* Get each key/data pair and add them to the database. */ + if (headertype == DB_RECNO) { + key.data = &recno; + key.size = sizeof(recno); + for (recno = 1;; ++recno) { + if (pflag) { + if (dbt_rprint(&data)) + break; + } else + if (dbt_rdump(&data)) + break; + if ((errno = dbp->put(dbp, NULL, &key, &data, 0)) != 0) + err(1, "%s", argv[0]); + } + } else + for (;;) { + if (pflag) { + if (dbt_rprint(&key)) + break; + if (dbt_rprint(&data)) + goto fmt; + } else { + if (dbt_rdump(&key)) + break; + if (dbt_rdump(&data)) +fmt: err(1, "odd number of key/data pairs"); + } + if ((errno = dbp->put(dbp, NULL, &key, &data, 0)) != 0) + err(1, "%s", argv[0]); + } + + if ((errno = dbp->close(dbp, 0)) != 0) + err(1, "%s", argv[0]); + return (0); +} + +/* + * db_init -- + * Initialize the environment. 
+ */ +DB_ENV * +db_init(home) + char *home; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + + if ((errno = + db_appinit(home, NULL, dbenv, DB_CREATE | DB_USE_ENVIRON)) != 0) + err(1, "db_appinit"); + return (dbenv); +} + +#define FLAG(name, value, keyword, flag) \ + if (strcmp(name, keyword) == 0) { \ + switch (*value) { \ + case '1': \ + dbinfop->flags |= (flag); \ + break; \ + case '0': \ + dbinfop->flags &= ~(flag); \ + break; \ + default: \ + badnum(); \ + /* NOTREACHED */ \ + } \ + continue; \ + } +#define NUMBER(name, value, keyword, field, flag) \ + if (strcmp(name, keyword) == 0) { \ + get_long(value, 1, LONG_MAX, &val); \ + dbinfop->field = val; \ + if (flag != 0) \ + dbinfop->flags |= (flag); \ + continue; \ + } +#define STRING(name, value, keyword, field, flag) \ + if (strcmp(name, keyword) == 0) { \ + dbinfop->field = value[0]; \ + if (flag != 0) \ + dbinfop->flags |= (flag); \ + continue; \ + } + +/* + * configure -- + * Handle command-line configuration options. 
+ */ +void +configure(dbinfop, clp) + DB_INFO *dbinfop; + char **clp; +{ + long val; + char *name, *value; + + for (; (name = *clp) != NULL; ++clp) { + if ((value = strchr(name, '=')) == NULL) + errx(1, + "command-line configuration uses name=value format"); + *value++ = '\0'; + + NUMBER(name, value, "bt_maxkey", bt_maxkey, 0); + NUMBER(name, value, "bt_minkey", bt_minkey, 0); + NUMBER(name, value, "db_lorder", db_lorder, 0); + NUMBER(name, value, "db_pagesize", db_pagesize, 0); + FLAG(name, value, "duplicates", DB_DUP); + NUMBER(name, value, "h_ffactor", h_ffactor, 0); + NUMBER(name, value, "h_nelem", h_nelem, 0); + NUMBER(name, value, "re_len", re_len, DB_FIXEDLEN); + STRING(name, value, "re_pad", re_pad, DB_PAD); + FLAG(name, value, "recnum", DB_RECNUM); + FLAG(name, value, "renumber", DB_RENUMBER); + + errx(1, "unknown command-line configuration keyword"); + } +} + +/* + * rheader -- + * Read the header message. + */ +void +rheader(dbtypep, pflagp, dbinfop) + DBTYPE *dbtypep; + int *pflagp; + DB_INFO *dbinfop; +{ + long lineno, val; + char name[256], value[256]; + + *dbtypep = DB_UNKNOWN; + *pflagp = 0; + + for (lineno = 1;; ++lineno) { + /* Field widths keep fscanf inside name[256] and value[256]. */ + if (fscanf(stdin, "%255[^=]=%255s\n", name, value) != 2) + errx(1, "line %ld: unexpected line", lineno); + if (strcmp(name, "HEADER") == 0) + break; + + if (strcmp(name, "format") == 0) { + if (strcmp(value, "bytevalue") == 0) { + *pflagp = 0; + continue; + } + if (strcmp(value, "print") == 0) { + *pflagp = 1; + continue; + } + errx(1, "line %ld: unknown format", lineno); + } + if (strcmp(name, "type") == 0) { + if (strcmp(value, "btree") == 0) { + *dbtypep = DB_BTREE; + continue; + } + if (strcmp(value, "hash") == 0) { + *dbtypep = DB_HASH; + continue; + } + if (strcmp(value, "recno") == 0) { + *dbtypep = DB_RECNO; + continue; + } + errx(1, "line %ld: unknown type", lineno); + } + NUMBER(name, value, "bt_maxkey", bt_maxkey, 0); + NUMBER(name, value, "bt_minkey", bt_minkey, 0); + NUMBER(name, value, "db_lorder", db_lorder, 0); +
NUMBER(name, value, "db_pagesize", db_pagesize, 0); + FLAG(name, value, "duplicates", DB_DUP); + NUMBER(name, value, "h_ffactor", h_ffactor, 0); + NUMBER(name, value, "h_nelem", h_nelem, 0); + NUMBER(name, value, "re_len", re_len, DB_FIXEDLEN); + STRING(name, value, "re_pad", re_pad, DB_PAD); + FLAG(name, value, "recnum", DB_RECNUM); + FLAG(name, value, "renumber", DB_RENUMBER); + + errx(1, "unknown input-file header configuration keyword"); + } +} + +/* + * dbt_rprint -- + * Read a printable line into a DBT structure. + */ +int +dbt_rprint(dbtp) + DBT *dbtp; +{ + u_int32_t len; + u_int8_t *p; + int c1, c2, escape; + + escape = 0; + for (p = dbtp->data, len = 0; (c1 = getchar()) != '\n';) { + if (c1 == EOF) { + if (len == 0) + return (1); + err(1, "unexpected end of key/data pair"); + } + if (escape) { + if (c1 != '\\') { + if ((c2 = getchar()) == EOF) + err(1, + "unexpected end of key/data pair"); + c1 = digitize(c1) << 4 | digitize(c2); + } + escape = 0; + } else + if (c1 == '\\') { + escape = 1; + continue; + } + if (++len >= dbtp->ulen - 10) { + dbtp->ulen *= 2; + if ((dbtp->data = + (void *)realloc(dbtp->data, dbtp->ulen)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + p = (u_int8_t *)dbtp->data + len; + } + *p++ = c1; + } + dbtp->size = len; + return (0); +} + +/* + * digitize -- + * Convert a character to an integer. + */ +int +digitize(c) + int c; +{ + switch (c) { /* Don't depend on ASCII ordering. */ + case '0': return (0); + case '1': return (1); + case '2': return (2); + case '3': return (3); + case '4': return (4); + case '5': return (5); + case '6': return (6); + case '7': return (7); + case '8': return (8); + case '9': return (9); + case 'a': return (10); + case 'b': return (11); + case 'c': return (12); + case 'd': return (13); + case 'e': return (14); + case 'f': return (15); + } + + err(1, "unexpected hexadecimal value"); + /* NOTREACHED */ + + return (0); +} + +/* + * dbt_rdump -- + * Read a byte dump line into a DBT structure. 
+ */ +int +dbt_rdump(dbtp) + DBT *dbtp; +{ + u_int32_t len; + u_int8_t *p; + int c1, c2; + + for (p = dbtp->data, len = 0; (c1 = getchar()) != '\n';) { + if (c1 == EOF) { + if (len == 0) + return (1); + err(1, "unexpected end of key/data pair"); + } + if ((c2 = getchar()) == EOF) + err(1, "unexpected end of key/data pair"); + if (++len >= dbtp->ulen - 10) { + dbtp->ulen *= 2; + if ((dbtp->data = + (void *)realloc(dbtp->data, dbtp->ulen)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + p = (u_int8_t *)dbtp->data + len; + } + *p++ = digitize(c1) << 4 | digitize(c2); + } + dbtp->size = len; + return (0); +} + +/* + * badnum -- + * Display the bad number message. + */ +void +badnum() +{ + err(1, "boolean name=value pairs require a value of 0 or 1"); +} + +/* + * usage -- + * Display the usage message. + */ +void +usage() +{ + (void)fprintf(stderr, +"usage: db_load [-c name=value] [-f file] [-h home] [-t btree | hash] db_file\n"); + exit(1); +} diff --git a/db2/progs/db_printlog/db_printlog.c b/db2/progs/db_printlog/db_printlog.c new file mode 100644 index 0000000000..12c365524f --- /dev/null +++ b/db2/progs/db_printlog/db_printlog.c @@ -0,0 +1,160 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. 
All rights reserved.\n"; +static const char sccsid[] = "@(#)db_printlog.c 10.8 (Sleepycat) 7/15/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "btree.h" +#include "hash.h" +#include "log.h" +#include "txn.h" +#include "db_am.h" +#include "clib_ext.h" + +DB_ENV *db_init __P((char *)); +void onint __P((int)); +void usage __P((void)); + +int interrupted; +char *progname = "db_printlog"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB_ENV *dbenv; + DBT data; + DB_LSN key; + int ch, eval; + char *home; + + home = NULL; + while ((ch = getopt(argc, argv, "h:")) != EOF) + switch (ch) { + case 'h': + home = optarg; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if ((home != NULL && argc > 0) || argc > 1) + usage(); + + /* XXX: backward compatibility, first argument is home. 
*/ + if (argc == 1) + home = argv[0]; + + dbenv = db_init(home); + + eval = 0; + if ((errno = __bam_init_print(dbenv)) != 0 || + (errno = __db_init_print(dbenv)) != 0 || + (errno = __ham_init_print(dbenv)) != 0 || + (errno = __log_init_print(dbenv)) != 0 || + (errno = __txn_init_print(dbenv)) != 0) { + warn("initialization"); + eval = 1; + (void)db_appexit(dbenv); + } + + (void)signal(SIGINT, onint); + + memset(&data, 0, sizeof(data)); + while (!interrupted) { + if ((errno = + log_get(dbenv->lg_info, &key, &data, DB_NEXT)) != 0) { + if (errno == DB_NOTFOUND) + break; + eval = 1; + warn("log_get"); + break; + } + if ((errno = + __db_dispatch(dbenv->lg_info, &data, &key, 0, NULL)) != 0) { + eval = 1; + warn("dispatch"); + break; + } + } + + (void)db_appexit(dbenv); + + if (interrupted) { + (void)signal(SIGINT, SIG_DFL); + (void)raise(SIGINT); + /* NOTREACHED */ + } + exit (eval); +} + +/* + * db_init -- + * Initialize the environment. + */ +DB_ENV * +db_init(home) + char *home; +{ + DB_ENV *dbenv; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + + if ((errno = + db_appinit(home, NULL, dbenv, DB_CREATE | DB_INIT_LOG)) != 0) + err(1, "db_appinit"); + return (dbenv); +} + +/* + * oninit -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + signo = 1; /* XXX: Shut the compiler up. */ + interrupted = 1; +} + +void +usage() +{ + fprintf(stderr, "usage: db_printlog [-h home]\n"); + exit (1); +} diff --git a/db2/progs/db_recover/db_recover.c b/db2/progs/db_recover/db_recover.c new file mode 100644 index 0000000000..4ac5925f79 --- /dev/null +++ b/db2/progs/db_recover/db_recover.c @@ -0,0 +1,122 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_recover.c 10.12 (Sleepycat) 7/27/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <getopt.h> +#include <stdlib.h> +#include <time.h> +#endif + +#include "db_int.h" +#include "txn.h" +#include "common_ext.h" +#include "clib_ext.h" + +DB_ENV *db_init __P((char *, int, int)); +void usage __P((void)); +int main __P((int, char *[])); + +const char *progname = "db_recover"; /* Program name. */ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB_ENV *dbenv; + time_t now; + int ch, flags, verbose; + char *home; + + home = NULL; + flags = verbose = 0; + while ((ch = getopt(argc, argv, "ch:v")) != EOF) + switch (ch) { + case 'c': + LF_SET(DB_RECOVER_FATAL); + break; + case 'h': + home = optarg; + break; + case 'v': + verbose = 1; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 0) + usage(); + + dbenv = db_init(home, flags, verbose); + if (verbose) { + __db_err(dbenv, "Recovery complete at %s", ctime(&now)); + __db_err(dbenv, "%s %lu %s [%lu][%lu]", + "Maximum transaction id", + (u_long)dbenv->tx_info->region->last_txnid, + "Recovery checkpoint", + (u_long)dbenv->tx_info->region->last_ckp.file, + (u_long)dbenv->tx_info->region->last_ckp.offset); + } + + exit (db_appexit(dbenv)); +} + +DB_ENV * +db_init(home, flags, verbose) + char *home; + int flags, verbose; +{ + DB_ENV *dbenv; + int local_flags; + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = "db_recover"; + dbenv->db_verbose = verbose; + + /* Initialize environment for pathnames only. 
*/ + local_flags = DB_CREATE | DB_INIT_LOG | + DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN | DB_USE_ENVIRON; + + if (LF_ISSET(DB_RECOVER_FATAL)) + local_flags |= DB_RECOVER_FATAL; + else + local_flags |= DB_RECOVER; + + if ((errno = db_appinit(home, NULL, dbenv, local_flags)) != 0) + err(1, "appinit failed"); + + return (dbenv); +} + +void +usage() +{ + (void)fprintf(stderr, "usage: db_recover [-cv] [-h home]\n"); + exit(1); +} diff --git a/db2/progs/db_stat/db_stat.c b/db2/progs/db_stat/db_stat.c new file mode 100644 index 0000000000..5c7044dbce --- /dev/null +++ b/db2/progs/db_stat/db_stat.c @@ -0,0 +1,434 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1997\n\ + Sleepycat Software Inc. All rights reserved.\n"; +static const char sccsid[] = "@(#)db_stat.c 8.17 (Sleepycat) 8/24/97"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "clib_ext.h" + +#define DIVIDER "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + +typedef enum { T_NOTSET, T_DB, T_MPOOL, T_TXN } test_t; + +void bstat __P((DB *)); +DB_ENV *db_init __P((char *, test_t)); +void hstat __P((DB *)); +void mstat __P((DB_ENV *)); +void prflags __P((u_int32_t, const FN *)); +void onint __P((int)); +void tstat __P((DB_ENV *)); +int txn_compare __P((const void *, const void *)); +void usage __P((void)); +int main __P((int, char *[])); + +int interrupted; +const char *progname = "db_stat"; /* Program name. 
*/ + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern char *optarg; + extern int optind; + DB *dbp; + DB_ENV *dbenv; + test_t ttype; + int ch; + char *db, *home; + + ttype = T_NOTSET; + db = home = NULL; + while ((ch = getopt(argc, argv, "d:h:mt")) != EOF) + switch (ch) { + case 'd': + db = optarg; + ttype = T_DB; + break; + case 'h': + home = optarg; + break; + case 'm': + ttype = T_MPOOL; + break; + case 't': + ttype = T_TXN; + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + if (argc != 0 || ttype == T_NOTSET) + usage(); + + dbenv = db_init(home, ttype); + + (void)signal(SIGINT, onint); + + switch (ttype) { + case T_DB: + if ((errno = db_open(db, DB_UNKNOWN, + DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0) + return (1); + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + bstat(dbp); + break; + case DB_HASH: + hstat(dbp); + break; + case DB_UNKNOWN: + abort(); /* Impossible. */ + /* NOTREACHED */ + } + (void)dbp->close(dbp, 0); + break; + case T_MPOOL: + mstat(dbenv); + break; + case T_TXN: + tstat(dbenv); + break; + case T_NOTSET: + abort(); /* Impossible. */ + /* NOTREACHED */ + } + + (void)db_appexit(dbenv); + + if (interrupted) { + (void)signal(SIGINT, SIG_DFL); + (void)raise(SIGINT); + /* NOTREACHED */ + } + return (0); +} + +/* + * bstat -- + * Display btree/recno statistics. + */ +void +bstat(dbp) + DB *dbp; +{ + static const FN fn[] = { + { DB_DUP, "DB_DUP" }, + { DB_FIXEDLEN, "DB_FIXEDLEN" }, + { DB_RECNUM, "DB_RECNUM" }, + { DB_RENUMBER, "DB_RENUMBER" }, + { 0 } + }; + DB_BTREE_STAT *sp; + + if (dbp->stat(dbp, &sp, NULL, 0)) + err(1, "dbp->stat"); + +#define PCT(f, t) \ + (t == 0 ? 
0 : \ + (((double)((t * sp->bt_pagesize) - f) / (t * sp->bt_pagesize)) * 100)) + + prflags(sp->bt_flags, fn); + if (dbp->type == DB_BTREE) { +#ifdef NOT_IMPLEMENTED + printf("%lu\tMaximum keys per-page.\n", (u_long)sp->bt_maxkey); +#endif + printf("%lu\tMinimum keys per-page.\n", (u_long)sp->bt_minkey); + } + if (dbp->type == DB_RECNO) { + printf("%lu\tFixed-length record size.\n", + (u_long)sp->bt_re_len); + if (isprint(sp->bt_re_pad)) + printf("%c\tFixed-length record pad.\n", + (int)sp->bt_re_pad); + else + printf("0x%x\tFixed-length record pad.\n", + (int)sp->bt_re_pad); + } + printf("%lu\tUnderlying tree page size.\n", (u_long)sp->bt_pagesize); + printf("%lu\tNumber of levels in the tree.\n", (u_long)sp->bt_levels); + printf("%lu\tNumber of keys in the tree.\n", (u_long)sp->bt_nrecs); + printf("%lu\tNumber of tree internal pages.\n", (u_long)sp->bt_int_pg); + printf("%lu\tNumber of tree leaf pages.\n", (u_long)sp->bt_leaf_pg); + printf("%lu\tNumber of tree duplicate pages.\n", + (u_long)sp->bt_dup_pg); + printf("%lu\tNumber of tree overflow pages.\n", + (u_long)sp->bt_over_pg); + printf("%lu\tNumber of pages on the free list.\n", + (u_long)sp->bt_free); + printf("%lu\tNumber of pages freed for reuse.\n", + (u_long)sp->bt_freed); + printf("%lu\tNumber of bytes free in tree internal pages (%.0f%% ff)\n", + (u_long)sp->bt_int_pgfree, + PCT(sp->bt_int_pgfree, sp->bt_int_pg)); + printf("%lu\tNumber of bytes free in tree leaf pages (%.0f%% ff).\n", + (u_long)sp->bt_leaf_pgfree, + PCT(sp->bt_leaf_pgfree, sp->bt_leaf_pg)); +printf("%lu\tNumber of bytes free in tree duplicate pages (%.0f%% ff).\n", + (u_long)sp->bt_dup_pgfree, + PCT(sp->bt_dup_pgfree, sp->bt_dup_pg)); +printf("%lu\tNumber of bytes free in tree overflow pages (%.0f%% ff).\n", + (u_long)sp->bt_over_pgfree, + PCT(sp->bt_over_pgfree, sp->bt_over_pg)); + printf("%lu\tNumber of bytes saved by prefix compression.\n", + (u_long)sp->bt_pfxsaved); + printf("%lu\tTotal number of tree page splits.\n", + 
(u_long)sp->bt_split); + printf("%lu\tNumber of root page splits.\n", (u_long)sp->bt_rootsplit); + printf("%lu\tNumber of fast splits.\n", (u_long)sp->bt_fastsplit); + printf("%lu\tNumber of hits in tree fast-insert code.\n", + (u_long)sp->bt_cache_hit); + printf("%lu\tNumber of misses in tree fast-insert code.\n", + (u_long)sp->bt_cache_miss); + printf("%lu\tNumber of keys added.\n", (u_long)sp->bt_added); + printf("%lu\tNumber of keys deleted.\n", (u_long)sp->bt_deleted); +} + +/* + * hstat -- + * Display hash statistics. + */ +void +hstat(dbp) + DB *dbp; +{ + return; +} + +/* + * mstat -- + * Display mpool statistics. + */ +void +mstat(dbenv) + DB_ENV *dbenv; +{ + DB_MPOOL_FSTAT **fsp; + DB_MPOOL_STAT *gsp; + + if (memp_stat(dbenv->mp_info, &gsp, &fsp, NULL)) + err(1, NULL); + + printf("%lu\tCache size (%luK).\n", + (u_long)gsp->st_cachesize, (u_long)gsp->st_cachesize / 1024); + printf("%lu\tRequested pages found in the cache", gsp->st_cache_hit); + if (gsp->st_cache_hit + gsp->st_cache_miss != 0) + printf(" (%.0f%%)", ((double)gsp->st_cache_hit / + (gsp->st_cache_hit + gsp->st_cache_miss)) * 100); + printf(".\n"); + printf("%lu\tRequested pages mapped into the process' address space.\n", + gsp->st_map); + printf("%lu\tRequested pages not found in the cache.\n", + gsp->st_cache_miss); + printf("%lu\tPages created in the cache.\n", gsp->st_page_create); + printf("%lu\tPages read into the cache.\n", gsp->st_page_in); + printf("%lu\tPages written from the cache to the backing file.\n", + gsp->st_page_out); + printf("%lu\tRead-only pages forced from the cache.\n", + gsp->st_ro_evict); + printf("%lu\tRead-write pages forced from the cache.\n", + gsp->st_rw_evict); + printf("%lu\tNumber of hash buckets used for page location.\n", + gsp->st_hash_buckets); + printf("%lu\tTotal number of times hash chains searched for a page.\n", + gsp->st_hash_searches); + printf("%lu\tThe longest hash chain searched for a page.\n", + gsp->st_hash_longest); + printf( + "%lu\tTotal 
number of hash buckets examined for page location.\n", + gsp->st_hash_examined); + + for (; fsp != NULL && *fsp != NULL; ++fsp) { + printf("%s\n", DIVIDER); + printf("%s\n", (*fsp)->file_name); + printf("%lu\tPage size.\n", (u_long)(*fsp)->st_pagesize); + printf("%lu\tRequested pages found in the cache", + (*fsp)->st_cache_hit); + if ((*fsp)->st_cache_hit + (*fsp)->st_cache_miss != 0) + printf(" (%.0f%%)", ((double)(*fsp)->st_cache_hit / + ((*fsp)->st_cache_hit + (*fsp)->st_cache_miss)) * + 100); + printf(".\n"); + printf("%lu\tRequested pages mapped into the process' address space.\n", + (*fsp)->st_map); + printf("%lu\tRequested pages not found in the cache.\n", + (*fsp)->st_cache_miss); + printf("%lu\tPages created in the cache.\n", + (*fsp)->st_page_create); + printf("%lu\tPages read into the cache.\n", (*fsp)->st_page_in); + printf("%lu\tPages written from the cache to the backing file.\n", + (*fsp)->st_page_out); + } +} + +/* + * tstat -- + * Display transaction statistics. + */ +void +tstat(dbenv) + DB_ENV *dbenv; +{ + DB_TXN_STAT *tstat; + unsigned int i; + const char *p; + + if (txn_stat(dbenv->tx_info, &tstat, NULL)) + err(1, NULL); + + p = tstat->st_last_ckp.file == 0 ? + "No checkpoint LSN." : "File/offset for last checkpoint LSN."; + printf("%lu/%lu\t%s\n", (u_long)tstat->st_last_ckp.file, + (u_long)tstat->st_last_ckp.offset, p); + p = tstat->st_pending_ckp.file == 0 ? + "No pending checkpoint LSN." 
: + "File/offset for last pending checkpoint LSN."; + printf("%lu/%lu\t%s.\n", + (u_long)tstat->st_pending_ckp.file, + (u_long)tstat->st_pending_ckp.offset, p); + if (tstat->st_time_ckp == 0) + printf("0\tNo checkpoint timestamp.\n"); + else + printf("%.24s\tCheckpoint timestamp.\n", + ctime(&tstat->st_time_ckp)); + printf("%lx\tLast transaction ID allocated.\n", + (u_long)tstat->st_last_txnid); + printf("%lu\tMaximum number of active transactions.\n", + (u_long)tstat->st_maxtxns); + printf("%lu\tNumber of transactions begun.\n", + (u_long)tstat->st_nbegins); + printf("%lu\tNumber of transactions aborted.\n", + (u_long)tstat->st_naborts); + printf("%lu\tNumber of transactions committed.\n", + (u_long)tstat->st_ncommits); + printf("%lu\tActive transactions.\n", (u_long)tstat->st_nactive); + qsort(tstat->st_txnarray, + tstat->st_nactive, sizeof(tstat->st_txnarray[0]), txn_compare); + for (i = 0; i < tstat->st_nactive; ++i) + printf("\tid: %lx; initial LSN file/offest %lu/%lu\n", + (u_long)tstat->st_txnarray[i].txnid, + (u_long)tstat->st_txnarray[i].lsn.file, + (u_long)tstat->st_txnarray[i].lsn.offset); +} + +int +txn_compare(a1, b1) + const void *a1, *b1; +{ + const DB_TXN_ACTIVE *a, *b; + + a = a1; + b = b1; + + if (a->txnid > b->txnid) + return (1); + if (a->txnid < b->txnid) + return (-1); + return (0); +} + +/* + * prflags -- + * Print out flag values. + */ +void +prflags(flags, fn) + u_int32_t flags; + FN const *fn; +{ + const FN *fnp; + int found; + const char *sep; + + sep = " "; + printf("Flags:"); + for (found = 0, fnp = fn; fnp->mask != 0; ++fnp) + if (fnp->mask & flags) { + printf("%s%s", sep, fnp->name); + sep = ", "; + found = 1; + } + printf("\n"); +} + +/* + * db_init -- + * Initialize the environment. 
+ */ +DB_ENV * +db_init(home, ttype) + char *home; + test_t ttype; +{ + DB_ENV *dbenv; + int flags; + + flags = DB_USE_ENVIRON; + switch (ttype) { + case T_MPOOL: + flags |= DB_INIT_MPOOL; + break; + case T_TXN: + flags |= DB_INIT_TXN; + break; + default: + break; + } + + if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + errno = ENOMEM; + err(1, NULL); + } + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + + if ((errno = db_appinit(home, NULL, dbenv, flags)) != 0) + err(1, "db_appinit"); + return (dbenv); +} + +/* + * oninit -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + signo = 1; /* XXX: Shut the compiler up. */ + interrupted = 1; +} + +void +usage() +{ + fprintf(stderr, "usage: db_stat [-mt] [-d file] [-h home]\n"); + exit (1); +} diff --git a/db2/txn/txn.c b/db2/txn/txn.c new file mode 100644 index 0000000000..b20697be3b --- /dev/null +++ b/db2/txn/txn.c @@ -0,0 +1,809 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)txn.c 10.20 (Sleepycat) 8/24/97"; +#endif /* not lint */ + + +/* + * This file contains the top level routines of the transaction library. + * It assumes that a lock manager and log manager that conform to the db_log(3) + * and db_lock(3) interfaces exist. 
+ */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#endif + +#include "shqueue.h" +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "txn.h" +#include "db_dispatch.h" +#include "lock.h" +#include "log.h" +#include "db_am.h" +#include "common_ext.h" + +static int __txn_check_running __P((const DB_TXN *)); + +static int __txn_create __P((DB_ENV *, const char *, u_int)); +static int __txn_grow_region __P((DB_TXNMGR *)); +static int __txn_validate_region __P((DB_TXNMGR *)); +static int __txn_end __P((DB_TXN *, int)); +static int __txn_undo __P((DB_TXN *)); + +/* + * Create and initialize a transaction region in shared memory. + * 0 means, success. + * +1 means that the db_create failed, so we did not create the region. + * -1 means that we got some sort of system error. + */ +static int +__txn_create(dbenv, path, mode) + DB_ENV *dbenv; + const char *path; + u_int mode; +{ + DB_TXNREGION *txn_region; + TXN_DETAIL *txnp; + time_t now; + int fd, i, maxtxns, ret; + + maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 1000; + (void)time(&now); + + ret = __db_rcreate(dbenv, DB_APP_NONE, path, + DEFAULT_TXN_FILE, mode, TXN_REGION_SIZE(maxtxns), &fd, &txn_region); + + /* Region may have existed. If it didn't, the open will fail. */ + if (ret != 0) + return (ret); + + txn_region->magic = DB_TXNMAGIC; + txn_region->version = DB_TXNVERSION; + txn_region->maxtxns = maxtxns; + txn_region->last_txnid = TXN_MINIMUM; + /* XXX If we ever do more types of locking and logging, this changes. 
*/ + txn_region->logtype = 0; + txn_region->locktype = 0; + txn_region->free_txn = 0; + txn_region->time_ckp = now; + ZERO_LSN(txn_region->last_ckp); + ZERO_LSN(txn_region->pending_ckp); + + for (txnp = &txn_region->table[0], i = 0; i < maxtxns; i++, txnp++) { + ZERO_LSN(txnp->begin_lsn); + txnp->status = TXN_UNALLOC; + txnp->txnid = i + 1; + } + txn_region->table[maxtxns - 1].txnid = TXN_INVALID; + + /* Unlock the region. */ + (void)__db_mutex_unlock(&txn_region->hdr.lock, fd); + + /* Now unmap and close the region. */ + if ((ret = __db_rclose(dbenv, fd, txn_region)) != 0) { + (void)txn_unlink(path, 1 /* force */, dbenv); + return (ret); + } + + return (0); +} + +int +txn_open(path, flags, mode, dbenv, mgrpp) + const char *path; + int flags, mode; + DB_ENV *dbenv; + DB_TXNMGR **mgrpp; +{ + DB_TXNMGR *tmgrp; + DB_TXNREGION *txn_regionp; + int fd, ret, retry_cnt; + + tmgrp = NULL; + txn_regionp = NULL; + fd = -1; + + /* Validate arguments. */ + if (dbenv == NULL) + return (EINVAL); +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_THREAD | DB_TXN_NOSYNC) +#else +#define OKFLAGS (DB_CREATE | DB_TXN_NOSYNC) +#endif + if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0) + return (ret); + + retry_cnt = 0; +retry: if (LF_ISSET(DB_CREATE) && (ret = __txn_create(dbenv, path, mode)) != 0) + if (ret == EAGAIN && ++retry_cnt < 0) { + (void)__db_sleep(1, 0); + goto retry; + } else /* We did not really create the region */ + flags &= ~DB_CREATE; + + retry_cnt = 0; +retry1: if ((ret = __db_ropen(dbenv, DB_APP_NONE, path, DEFAULT_TXN_FILE, + flags & ~(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC), + &fd, &txn_regionp)) != 0) { + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry1; + } + goto out; + } + + + /* Check if valid region. */ + if (txn_regionp->magic != DB_TXNMAGIC) { + __db_err(dbenv, "txn_open: Bad magic number"); + ret = EINVAL; + goto out; + } + + /* Now, create the transaction manager structure and set its fields. 
*/ + if ((tmgrp = (DB_TXNMGR *)malloc(sizeof(DB_TXNMGR))) == NULL) { + __db_err(dbenv, "txn_open: %s", strerror(errno)); + ret = ENOMEM; + goto out; + } + + tmgrp->dbenv = dbenv; + tmgrp->recover = + dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover; + tmgrp->region = txn_regionp; + tmgrp->reg_size = txn_regionp->hdr.size; + tmgrp->fd = fd; + tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD); + TAILQ_INIT(&tmgrp->txn_chain); + if (LF_ISSET(DB_THREAD)) + __db_mutex_init(&tmgrp->mutex, -1); + *mgrpp = tmgrp; + return (0); + +out: if (txn_regionp != NULL) + (void)__db_rclose(dbenv, fd, txn_regionp); + if (flags & DB_CREATE) + (void)txn_unlink(path, 1, dbenv); + if (tmgrp != NULL) + free(tmgrp); + return (ret); +} + +/* + * Internally, we use TXN_DETAIL structures, but we allocate and return + * DB_TXN structures that provide access to the transaction ID and the + * offset in the transaction region of the TXN_DETAIL structure. + */ +int +txn_begin(tmgrp, parent, txnpp) + DB_TXNMGR *tmgrp; + DB_TXN *parent; + DB_TXN **txnpp; +{ + TXN_DETAIL *txnp; + DB_TXN *retp; + int id, index, ret; + + LOCK_TXNREGION(tmgrp); + + if ((ret = __txn_validate_region(tmgrp)) != 0) { + UNLOCK_TXNREGION(tmgrp); + return (ret); + } + + /* Remove element from free list. */ + if (tmgrp->region->free_txn == TXN_INVALID && + (ret = __txn_grow_region(tmgrp)) != 0) { + UNLOCK_TXNREGION(tmgrp); + return (ret); + } + + index = tmgrp->region->free_txn; + txnp = &tmgrp->region->table[index]; + tmgrp->region->free_txn = txnp->txnid; + + if (txnp->status != TXN_UNALLOC) { + UNLOCK_TXNREGION(tmgrp); + return (EINVAL); + } + + /* Make sure that last_txnid is not going to wrap around. 
*/ + if (tmgrp->region->last_txnid == TXN_INVALID) + return (EINVAL); + + if ((retp = (DB_TXN *)malloc(sizeof(DB_TXN))) == NULL) { + __db_err(tmgrp->dbenv, "txn_begin : %s", strerror(ENOMEM)); + UNLOCK_TXNREGION(tmgrp); + return (ENOMEM); + } + + id = ++tmgrp->region->last_txnid; + tmgrp->region->nbegins++; + + txnp->txnid = id; + txnp->last_lock = 0; + txnp->status = TXN_RUNNING; + ZERO_LSN(txnp->last_lsn); + ZERO_LSN(txnp->begin_lsn); + + UNLOCK_TXNREGION(tmgrp); + + ZERO_LSN(retp->last_lsn); + retp->txnid = id; + retp->parent = parent; + retp->off = (u_int8_t *)txnp - (u_int8_t *)tmgrp->region; + retp->mgrp = tmgrp; + + if (tmgrp->dbenv->lg_info != NULL && + (ret = __txn_regop_log(tmgrp->dbenv->lg_info, + retp, &txnp->begin_lsn, 0, TXN_BEGIN)) != 0) { + + /* Deallocate transaction. */ + LOCK_TXNREGION(tmgrp); + txnp->txnid = tmgrp->region->free_txn; + tmgrp->region->free_txn = txnp - &tmgrp->region->table[0]; + UNLOCK_TXNREGION(tmgrp); + free (retp); + return (ret); + } + + LOCK_TXNTHREAD(tmgrp); + TAILQ_INSERT_TAIL(&tmgrp->txn_chain, retp, links); + UNLOCK_TXNTHREAD(tmgrp); + + *txnpp = retp; + return (0); +} + +/* The db_txn(3) man page describes txn_commit. */ +int +txn_commit(txnp) + DB_TXN *txnp; +{ + DB_LOG *logp; + int ret; + + if ((ret = __txn_check_running(txnp)) != 0) + return (ret); + + /* Sync the log. */ + if ((logp = txnp->mgrp->dbenv->lg_info) != NULL && + (ret = __txn_regop_log(logp, + txnp, &txnp->last_lsn, + F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_COMMIT)) + != 0) + return (ret); + + return (__txn_end(txnp, 1)); +} + +/* The db_txn(3) man page describes txn_abort. */ +int +txn_abort(txnp) + DB_TXN *txnp; +{ + int ret; + + if ((ret = __txn_check_running(txnp)) != 0) + return (ret); + + if ((ret = __txn_undo(txnp)) != 0) { + __db_err(txnp->mgrp->dbenv, + "txn_abort: Log undo failed %s", strerror(ret)); + return (ret); + } + return (__txn_end(txnp, 0)); +} + +/* + * Flush the log so a future commit is guaranteed to succeed. 
+ */ +int +txn_prepare(txnp) + DB_TXN *txnp; +{ + int ret; + TXN_DETAIL *tp; + + ret = 0; + if ((ret = __txn_check_running(txnp)) != 0) + return (ret); + + if (txnp->mgrp->dbenv->lg_info) { + ret = log_flush(txnp->mgrp->dbenv->lg_info, &txnp->last_lsn); + if (ret) + __db_err(txnp->mgrp->dbenv, + "txn_prepare: log_flush failed %s\n", + strerror(errno)); + return (ret); + } + + LOCK_TXNTHREAD(txnp->mgrp); + tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off); + tp->status = TXN_PREPARED; + UNLOCK_TXNTHREAD(txnp->mgrp); + return (ret); +} + +/* + * Return the transaction ID associated with a particular transaction + */ +u_int32_t +txn_id(txnp) + DB_TXN *txnp; +{ + return (txnp->txnid); +} + +/* + * The db_txn(3) man page describes txn_close. Currently the caller should + * arrange a checkpoint before calling txn_close. + */ +int +txn_close(tmgrp) + DB_TXNMGR *tmgrp; +{ + DB_TXN *txnp; + int ret, t_ret; + + /* + * This function had better only be called once per process + * (i.e., not per thread), so there should be no synchronization + * required. + */ + for (ret = 0, txnp = TAILQ_FIRST(&tmgrp->txn_chain); + txnp != TAILQ_END(&tmgrp->txn_chain); + txnp = TAILQ_FIRST(&tmgrp->txn_chain)) { + if ((t_ret = txn_abort(txnp)) != 0 && ret == 0) + ret = t_ret; + } + + if (tmgrp->dbenv->lg_info && (t_ret = + log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 && + ret == 0) + ret = t_ret; + + if ((t_ret = __db_rclose(tmgrp->dbenv, tmgrp->fd, tmgrp->region)) != 0 + && ret == 0) + ret = t_ret; + + if (ret == 0) + free (tmgrp); + return (ret); +} + +/* + * The db_txn(3) man page describes txn_unlink. Right now it is up to + * txn_close to write the final checkpoint record. + */ +int +txn_unlink(path, force, dbenv) + const char *path; + int force; + DB_ENV *dbenv; +{ + return (__db_runlink(dbenv, + DB_APP_NONE, path, DEFAULT_TXN_FILE, force)); +} + +/* Internal routines. */ + +/* + * Return 0 if the txnp is reasonable, otherwise returns EINVAL. 
+ */ +static int +__txn_check_running(txnp) + const DB_TXN *txnp; +{ + TXN_DETAIL *tp; + + tp = NULL; + if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) { + tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off); + if (tp->status != TXN_RUNNING) + tp = NULL; + } + + return (tp == NULL ? EINVAL : 0); +} + +static int +__txn_end(txnp, is_commit) + DB_TXN *txnp; + int is_commit; +{ + DB_TXNMGR *mgr; + TXN_DETAIL *tp; + DB_LOCKREQ request; + int ret; + u_int32_t locker; + + mgr = txnp->mgrp; + + LOCK_TXNTHREAD(mgr); + TAILQ_REMOVE(&mgr->txn_chain, txnp, links); + UNLOCK_TXNTHREAD(mgr); + + /* Release the locks. */ + locker = txnp->txnid; + request.op = DB_LOCK_PUT_ALL; + + if (mgr->dbenv->lk_info) { + ret = lock_vec(mgr->dbenv->lk_info, locker, 0, + &request, 1, NULL); + if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) { + __db_err(mgr->dbenv, "%s: release locks failed %s", + is_commit ? "txn_commit" : "txn_abort", + strerror(ret)); + return (ret); + } + } + + /* End the transaction. */ + LOCK_TXNREGION(mgr); + tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off); + tp->status = TXN_UNALLOC; + tp->txnid = mgr->region->free_txn; + mgr->region->free_txn = tp - &mgr->region->table[0]; + if (is_commit) + mgr->region->ncommits++; + else + mgr->region->naborts++; + UNLOCK_TXNREGION(mgr); + + FREE(txnp, sizeof(*txnp)); + + return (0); +} + + +/* + * Undo the transaction with id txnid. Returns 0 on success and sets + * errno and returns -1 on failure. + */ +static int +__txn_undo(txnp) + DB_TXN *txnp; +{ + DB_TXNMGR *mgr; + DB_LOG *logp; + DBT rdbt; + DB_LSN key_lsn; + int ret; + + mgr = txnp->mgrp; + logp = mgr->dbenv->lg_info; + if (logp == NULL) + return (0); + + /* + * This is the simplest way to code this, but if the mallocs during + * recovery turn out to be a performance issue, we can do the + * allocation here and use DB_DBT_USERMEM. 
+ */ + memset(&rdbt, 0, sizeof(rdbt)); + if (F_ISSET(logp, DB_AM_THREAD)) + F_SET(&rdbt, DB_DBT_MALLOC); + + key_lsn = txnp->last_lsn; /* structure assignment */ + for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) { + /* + * The dispatch routine returns the lsn of the record + * before the current one in the key_lsn argument. + */ + if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) { + ret = + mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL); + if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) { + free(rdbt.data); + rdbt.data = NULL; + } + } + if (ret != 0) + return (ret); + } + + return (ret); +} + +/* + * Transaction checkpoint. + * If either kbytes or minutes is non-zero, then we only take the checkpoint + * more than "minutes" minutes have passed since the last checkpoint or if + * more than "kbytes" of log data have been written since the last checkpoint. + * When taking a checkpoint, find the oldest active transaction and figure out + * its first LSN. This is the lowest LSN we can checkpoint, since any record + * written after since that point may be involved in a transaction and may + * therefore need to be undone in the case of an abort. + */ +int +txn_checkpoint(mgr, kbytes, minutes) + const DB_TXNMGR *mgr; + long kbytes, minutes; +{ + TXN_DETAIL *txnp; + DB_LSN ckp_lsn, last_ckp; + DB_LOG *dblp; + u_int32_t bytes_written, i; + time_t last_ckp_time, now; + int ret; + + /* Check usage. */ + if (kbytes < 0 || minutes < 0) + return (EINVAL); + + /* + * Check if we need to run recovery. 
+ */ + ZERO_LSN(ckp_lsn); + if (minutes != 0) { + (void)time(&now); + + LOCK_TXNREGION(mgr); + last_ckp_time = mgr->region->time_ckp; + UNLOCK_TXNREGION(mgr); + + if (now - last_ckp_time >= (time_t)(minutes * 60)) + goto do_ckp; + } + + if (kbytes != 0) { + dblp = mgr->dbenv->lg_info; + LOCK_LOGREGION(dblp); + bytes_written = dblp->lp->written; + ckp_lsn = dblp->lp->lsn; + UNLOCK_LOGREGION(dblp); + if (bytes_written >= (u_int32_t)(kbytes * 1024)) + goto do_ckp; + } + + /* + * If we checked time and data and didn't go to checkpoint, + * we're done. + */ + if (minutes != 0 || kbytes != 0) + return (0); + + if (IS_ZERO_LSN(ckp_lsn)) { + dblp = mgr->dbenv->lg_info; + LOCK_LOGREGION(dblp); + ckp_lsn = dblp->lp->lsn; + UNLOCK_LOGREGION(dblp); + } + + /* + * We have to find an LSN such that all transactions begun + * before that LSN are complete. + */ +do_ckp: + LOCK_TXNREGION(mgr); + + if (!IS_ZERO_LSN(mgr->region->pending_ckp)) + ckp_lsn = mgr->region->pending_ckp; + else + for (txnp = &mgr->region->table[0], i = 0; + i < mgr->region->maxtxns; i++, txnp++) { + + /* + * Look through the transaction table for the LSN of + * the transaction that is in-use (e.g., not + * TXN_UNALLOC) and whose begin lsn is the lowest. 
+ */ + if (txnp->status != TXN_UNALLOC && + !IS_ZERO_LSN(txnp->begin_lsn) && + log_compare(&txnp->begin_lsn, &ckp_lsn) < 0) + ckp_lsn = txnp->begin_lsn; + } + + mgr->region->pending_ckp = ckp_lsn; + UNLOCK_TXNREGION(mgr); + + ret = memp_sync(mgr->dbenv->mp_info, &ckp_lsn); + if (ret > 0) { + __db_err(mgr->dbenv, + "txn_checkpoint: system failure in memp_sync %s\n", + strerror(ret)); + } else if (ret == 0 && mgr->dbenv->lg_info != NULL) { + LOCK_TXNREGION(mgr); + last_ckp = mgr->region->last_ckp; + ZERO_LSN(mgr->region->pending_ckp); + UNLOCK_TXNREGION(mgr); + + if ((ret = __txn_ckp_log(mgr->dbenv->lg_info, + NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) { + __db_err(mgr->dbenv, + "txn_checkpoint: log failed at LSN [%ld %ld] %s\n", + (long)ckp_lsn.file, (long)ckp_lsn.offset, + strerror(ret)); + return (ret); + } + + LOCK_TXNREGION(mgr); + mgr->region->last_ckp = ckp_lsn; + (void)time(&mgr->region->time_ckp); + UNLOCK_TXNREGION(mgr); + } + /* + * ret < 0 means that there are still buffers to flush; the + * checkpoint is not complete. Back off and try again. + */ + return (ret); +} + +/* + * This is called at every interface to verify if the region + * has changed size, and if so, to remap the region in and + * reset the process pointers. + */ +static int +__txn_validate_region(tp) + DB_TXNMGR *tp; +{ + int ret; + + if (tp->reg_size == tp->region->hdr.size) + return (0); + + /* Grow the region. 
*/ + if ((ret = __db_rremap(tp->dbenv, tp->region, + tp->reg_size, tp->region->hdr.size, tp->fd, &tp->region)) != 0) + return (ret); + + tp->reg_size = tp->region->hdr.size; + + return (0); +} + +static int +__txn_grow_region(tp) + DB_TXNMGR *tp; +{ + TXN_DETAIL *tx; + size_t incr; + u_int32_t i, oldmax; + int ret; + + oldmax = tp->region->maxtxns; + incr = oldmax * sizeof(DB_TXN); + + if ((ret = __db_rgrow(tp->dbenv, tp->fd, incr)) != 0) + return (ret); + + if ((ret = __db_rremap(tp->dbenv, tp->region, + tp->reg_size, tp->reg_size + incr, tp->fd, &tp->region)) != 0) + return (ret); + tp->reg_size += incr; + + /* + * Initialize all the new transactions and up the transaction count. + */ + for (i = 0, tx = &tp->region->table[oldmax]; i < oldmax; i++, tx++) { + ZERO_LSN(tx->begin_lsn); + tx->status = TXN_UNALLOC; + tx->txnid = oldmax + i + 1; + } + tp->region->free_txn = oldmax; + tp->region->maxtxns = 2 * oldmax; + tp->region->table[tp->region->maxtxns - 1].txnid = TXN_INVALID; + + return (0); +} + +int +txn_stat(mgr, statp, db_malloc) + DB_TXNMGR *mgr; + DB_TXN_STAT **statp; + void *(*db_malloc) __P((size_t)); +{ + DB_TXN_STAT *stats; + size_t nbytes; + u_int32_t nactive; + unsigned int i, ndx; + + LOCK_TXNREGION(mgr); + nactive = mgr->region->nbegins - + mgr->region->naborts - mgr->region->ncommits; + UNLOCK_TXNREGION(mgr); + + /* + * Allocate a bunch of extra active structures to handle any + * that have been created since we unlocked the region. 
+ */ + nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200); + if (db_malloc == NULL) + stats = (DB_TXN_STAT *)malloc(nbytes); + else + stats = (DB_TXN_STAT *)db_malloc(nbytes); + + if (stats == NULL) + return (ENOMEM); + + LOCK_TXNREGION(mgr); + stats->st_last_txnid = mgr->region->last_txnid; + stats->st_last_ckp = mgr->region->last_ckp; + stats->st_maxtxns = mgr->region->maxtxns; + stats->st_naborts = mgr->region->naborts; + stats->st_nbegins = mgr->region->nbegins; + stats->st_ncommits = mgr->region->ncommits; + stats->st_pending_ckp = mgr->region->pending_ckp; + stats->st_time_ckp = mgr->region->time_ckp; + stats->st_nactive = stats->st_nbegins - + stats->st_naborts - stats->st_ncommits; + if (stats->st_nactive > nactive + 200) + stats->st_nactive = nactive + 200; + stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1]; + + for (ndx = 0, i = 0; i < mgr->region->maxtxns; i++) + if (mgr->region->table[i].status != TXN_UNALLOC) { + stats->st_txnarray[ndx].txnid = + mgr->region->table[i].txnid; + stats->st_txnarray[ndx].lsn = + mgr->region->table[i].begin_lsn; + ndx++; + + if (ndx >= stats->st_nactive) + break; + } + + UNLOCK_TXNREGION(mgr); + *statp = stats; + return (0); +} diff --git a/db2/txn/txn.src b/db2/txn/txn.src new file mode 100644 index 0000000000..40bb63ecb6 --- /dev/null +++ b/db2/txn/txn.src @@ -0,0 +1,31 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + * + * @(#)txn.src 10.1 (Sleepycat) 4/12/97 + * + * This is the source file used to create the logging functions for the + * transaction system. + */ +PREFIX txn + +/* + * Everything except for checkpointing takes the same logging routine. + */ +BEGIN regop +ARG opcode u_int32_t lu +END + +/* + * This is the checkpoint record. It contains the lsn that the checkpoint + * guarantees and a pointer to the last checkpoint so that we can walk + * backwards by checkpoint. 
+ * ckp_lsn: + * last_ckp: + */ +BEGIN ckp +POINTER ckp_lsn DB_LSN * lu +POINTER last_ckp DB_LSN * lu +END diff --git a/db2/txn/txn_auto.c b/db2/txn/txn_auto.c new file mode 100644 index 0000000000..c7f277ed0f --- /dev/null +++ b/db2/txn/txn_auto.c @@ -0,0 +1,308 @@ +/* Do not edit: automatically built by dist/db_gen.sh. */ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "txn.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __txn_regop_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t)); + */ +int __txn_regop_log(logp, txnid, ret_lsnp, flags, + opcode) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_txn_regop; + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __txn_regop_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__txn_regop_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __txn_regop_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __txn_regop_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]txn_regop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __txn_regop_read __P((void *, __txn_regop_args **)); + */ +int +__txn_regop_read(recbuf, argpp) + void *recbuf; + __txn_regop_args **argpp; +{ + __txn_regop_args *argp; + u_int8_t *bp; + + argp = (__txn_regop_args *)malloc(sizeof(__txn_regop_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + 
argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __txn_ckp_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: DB_LSN *, DB_LSN *)); + */ +int __txn_ckp_log(logp, txnid, ret_lsnp, flags, + ckp_lsn, last_ckp) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + DB_LSN * ckp_lsn; + DB_LSN * last_ckp; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_txn_ckp; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(*ckp_lsn) + + sizeof(*last_ckp); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + if (ckp_lsn != NULL) + memcpy(bp, ckp_lsn, sizeof(*ckp_lsn)); + else + memset(bp, 0, sizeof(*ckp_lsn)); + bp += sizeof(*ckp_lsn); + if (last_ckp != NULL) + memcpy(bp, last_ckp, sizeof(*last_ckp)); + else + memset(bp, 0, sizeof(*last_ckp)); + bp += sizeof(*last_ckp); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int 
__txn_ckp_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__txn_ckp_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __txn_ckp_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __txn_ckp_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]txn_ckp: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tckp_lsn: [%lu][%lu]\n", + (u_long)argp->ckp_lsn.file, (u_long)argp->ckp_lsn.offset); + printf("\tlast_ckp: [%lu][%lu]\n", + (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __txn_ckp_read __P((void *, __txn_ckp_args **)); + */ +int +__txn_ckp_read(recbuf, argpp) + void *recbuf; + __txn_ckp_args **argpp; +{ + __txn_ckp_args *argp; + u_int8_t *bp; + + argp = (__txn_ckp_args *)malloc(sizeof(__txn_ckp_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->ckp_lsn, bp, sizeof(argp->ckp_lsn)); + bp += sizeof(argp->ckp_lsn); + memcpy(&argp->last_ckp, bp, sizeof(argp->last_ckp)); + bp += sizeof(argp->last_ckp); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __txn_init_print __P((DB_ENV *)); + */ +int +__txn_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __txn_regop_print, DB_txn_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + 
__txn_ckp_print, DB_txn_ckp)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __txn_init_recover __P((DB_ENV *)); + */ +int +__txn_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __txn_regop_recover, DB_txn_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __txn_ckp_recover, DB_txn_ckp)) != 0) + return (ret); + return (0); +} + diff --git a/db2/txn/txn_rec.c b/db2/txn/txn_rec.c new file mode 100644 index 0000000000..1fe720a1cf --- /dev/null +++ b/db2/txn/txn_rec.c @@ -0,0 +1,131 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)txn_rec.c 10.4 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "shqueue.h" +#include "txn.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __txn_regop_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__txn_regop_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __txn_regop_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_regop_print(logp, dbtp, lsnp, redo, info); +#endif + logp = logp; /* XXX: Shut the compiler up. */ + redo = redo; + + if ((ret = __txn_regop_read(dbtp->data, &argp)) != 0) + return (ret); + + switch (argp->opcode) { + case TXN_COMMIT: + if (__db_txnlist_find(info, + argp->txnid->txnid) == DB_NOTFOUND) + __db_txnlist_add(info, argp->txnid->txnid); + break; + case TXN_PREPARE: /* Nothing to do. */ + case TXN_BEGIN: + /* Call find so that we update the maxid. 
*/ + (void)__db_txnlist_find(info, argp->txnid->txnid); + break; + } + + *lsnp = argp->prev_lsn; + free (argp); + return (0); +} + +/* + * PUBLIC: int __txn_ckp_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__txn_ckp_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __txn_ckp_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + __txn_ckp_print(logp, dbtp, lsnp, redo, info); +#endif + logp = logp; /* XXX: Shut the compiler up. */ + redo = redo; + info = info; + + if ((ret = __txn_ckp_read(dbtp->data, &argp)) != 0) + return (ret); + + *lsnp = argp->last_ckp; + free(argp); + return (1); +} |