diff options
Diffstat (limited to 'db2/mp')
-rw-r--r-- | db2/mp/mp_bh.c | 437 | ||||
-rw-r--r-- | db2/mp/mp_fget.c | 359 | ||||
-rw-r--r-- | db2/mp/mp_fopen.c | 437 | ||||
-rw-r--r-- | db2/mp/mp_fput.c | 140 | ||||
-rw-r--r-- | db2/mp/mp_fset.c | 72 | ||||
-rw-r--r-- | db2/mp/mp_open.c | 176 | ||||
-rw-r--r-- | db2/mp/mp_pr.c | 313 | ||||
-rw-r--r-- | db2/mp/mp_region.c | 340 | ||||
-rw-r--r-- | db2/mp/mp_sync.c | 205 |
9 files changed, 2479 insertions, 0 deletions
diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c new file mode 100644 index 0000000000..e1b68ce450 --- /dev/null +++ b/db2/mp/mp_bh.c @@ -0,0 +1,437 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_bh.c 10.12 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * __memp_bhwrite -- + * Write the page associated with a given bucket header. + * + * PUBLIC: int __memp_bhwrite + * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int *restartp, *wrotep; +{ + DBT dbt; + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + + /* + * Walk the process' DB_MPOOLFILE list and try and find a file + * descriptor for this file. + */ + LOCKHANDLE(dbmp, &dbmp->mutex); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (dbmfp->mfp == mfp) + break; + UNLOCKHANDLE(dbmp, &dbmp->mutex); + if (dbmfp != NULL) + goto found; + + /* + * It's not a page from a file we've opened. If the file requires + * input/output processing, see if this process has ever registered + * information as to how to write this type of file. If not, there's + * nothing we can do. + */ + if (mfp->ftype != 0) { + LOCKHANDLE(dbmp, &dbmp->mutex); + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) + if (mpreg->ftype == mfp->ftype) + break; + UNLOCKHANDLE(dbmp, &dbmp->mutex); + if (mpreg == NULL) + return (0); + } + + /* + * Try and open the file; ignore any error, assume it's a permissions + * problem. + */ + dbt.size = mfp->pgcookie_len; + dbt.data = ADDR(dbmp, mfp->pgcookie_off); + if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off), + mfp->ftype, 0, 0, mfp->stat.st_pagesize, + mfp->lsn_off, &dbt, ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0) + return (0); + +found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep)); +} + +/* + * __memp_pgread -- + * Read a page from a file. + * + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pgread(dbmfp, bhp, can_create) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int can_create; +{ + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + size_t pagesize; + ssize_t nr; + int ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED | BH_TRASH); + LOCKBUFFER(dbmp, bhp); + UNLOCKREGION(dbmp); + + /* + * Temporary files may not yet have been created. + * + * Seek to the page location. + */ + ret = 0; + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->fd == -1 || (ret = + __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) { + if (!can_create) { + if (dbmfp->fd == -1) + ret = EINVAL; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + __db_err(dbmp->dbenv, + "%s: page %lu doesn't exist, create flag not set", + dbmfp->path, (u_long)bhp->pgno); + goto err; + } + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + /* Clear any uninitialized data. */ + memset(bhp->buf, 0, pagesize); + goto pgin; + } + + /* + * Read the page; short reads are treated like creates, although + * any valid data is preserved. + */ + ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr); + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + if (ret != 0) + goto err; + + if (nr == (ssize_t)pagesize) + can_create = 0; + else { + if (!can_create) { + ret = EINVAL; + goto err; + } + + /* Clear any uninitialized data. */ + memset(bhp->buf + nr, 0, pagesize - nr); + } + + /* Call any pgin function. */ +pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); + + /* Reacquire the region lock. */ + LOCKREGION(dbmp); + + /* If the pgin function succeeded, the data is now valid. */ + if (ret == 0) + F_CLR(bhp, BH_TRASH); + + /* Update the statistics. */ + if (can_create) { + ++dbmp->mp->stat.st_page_create; + ++mfp->stat.st_page_create; + } else { + ++dbmp->mp->stat.st_page_in; + ++mfp->stat.st_page_in; + } + + if (0) { +err: LOCKREGION(dbmp); + } + + /* Release the buffer. */ + F_CLR(bhp, BH_LOCKED); + UNLOCKBUFFER(dbmp, bhp); + + return (ret); +} + +/* + * __memp_pgwrite -- + * Write a page to a file. + * + * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_pgwrite(dbmfp, bhp, restartp, wrotep) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_ENV *dbenv; + DB_LOG *lg_info; + DB_LSN lsn; + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t pagesize; + ssize_t nw; + int callpgin, ret; + const char *fail; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + callpgin = 0; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED); + LOCKBUFFER(dbmp, bhp); + UNLOCKREGION(dbmp); + + if (restartp != NULL) + *restartp = 1; + + /* Copy the LSN off the page if we're going to need it. */ + lg_info = dbenv->lg_info; + if (lg_info != NULL || F_ISSET(bhp, BH_WRITE)) + memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); + + /* Ensure the appropriate log records are on disk. */ + if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0) + goto err; + + /* + * Call any pgout function. We set the callpgin flag so that on + * error we flag that the contents of the buffer may be trash. + */ + if (mfp->ftype == 0) + ret = 0; + else { + callpgin = 1; + if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) + goto err; + } + + /* Temporary files may not yet have been created. */ + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, DB_APP_TMP, + NULL, NULL, &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) { + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + __db_err(dbenv, "unable to create temporary backing file"); + goto err; + } + + /* Write the page out. */ + if ((ret = + __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) + fail = "seek"; + else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0) + fail = "write"; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + if (ret != 0) { + /* + * XXX + * Shut the compiler up; it doesn't understand the correlation + * between the failing clauses to __db_lseek and __db_write and + * this ret != 0. + */ + fail = NULL; + goto syserr; + } + + if (nw != (ssize_t)pagesize) { + ret = EIO; + fail = "write"; + goto syserr; + } + + if (wrotep != NULL) + *wrotep = 1; + + /* Reacquire the region lock. */ + LOCKREGION(dbmp); + + /* Clean up the flags based on a successful write. */ + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_DIRTY | BH_LOCKED); + UNLOCKBUFFER(dbmp, bhp); + + /* + * If we wrote a buffer which a checkpoint is waiting for, update + * the count of pending buffers (both in the mpool as a whole and + * for this file). If the count for this file goes to zero, flush + * the writes. + * + * XXX: + * We ignore errors from the sync -- it makes no sense to return an + * error to the calling process, so set a flag causing the sync to + * be retried later. + * + * If the buffer we wrote has a LSN larger than the current largest + * we've written for this checkpoint, update the saved value. + */ + mp = dbmp->mp; + if (F_ISSET(bhp, BH_WRITE)) { + if (log_compare(&lsn, &mp->lsn) > 0) + mp->lsn = lsn; + F_CLR(bhp, BH_WRITE); + + --mp->lsn_cnt; + if (--mfp->lsn_cnt == 0) { + /* + * Don't lock -- there are no atomicity issues for + * fsync(2). + */ + if (__db_fsync(dbmfp->fd) != 0) + F_SET(mp, MP_LSN_RETRY); + } + } + + /* Update I/O statistics. */ + ++mp->stat.st_page_out; + ++mfp->stat.st_page_out; + + return (0); + +syserr: __db_err(dbenv, + "%s: %s failed for page %lu", dbmfp->path, fail, (u_long)bhp->pgno); + +err: UNLOCKBUFFER(dbmp, bhp); + LOCKREGION(dbmp); + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_LOCKED); + return (ret); +} + +/* + * __memp_pg -- + * Call the pgin/pgout routine. + * + * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pg(dbmfp, bhp, is_pgin) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int is_pgin; +{ + DBT dbt, *dbtp; + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + MPOOLFILE *mfp; + int ftype, ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + + LOCKHANDLE(dbmp, &dbmp->mutex); + + ftype = mfp->ftype; + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { + if (ftype != mpreg->ftype) + continue; + if (mfp->pgcookie_len == 0) + dbtp = NULL; + else { + dbt.size = mfp->pgcookie_len; + dbt.data = ADDR(dbmp, mfp->pgcookie_off); + dbtp = &dbt; + } + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + if (is_pgin) { + if (mpreg->pgin != NULL && (ret = + mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + } else + if (mpreg->pgout != NULL && (ret = + mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + break; + } + + if (mpreg == NULL) + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + return (0); + +err: UNLOCKHANDLE(dbmp, &dbmp->mutex); + __db_err(dbmp->dbenv, "%s: %s failed for page %lu", + dbmfp->path, is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); + return (ret); +} + +/* + * __memp_bhfree -- + * Free a bucket header and its referenced data. + * + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int)); + */ +void +__memp_bhfree(dbmp, mfp, bhp, free_mem) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int free_mem; +{ + size_t off; + + /* Delete the buffer header from the MPOOL hash list. */ + off = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno); + SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, mq, __bh); + + /* Delete the buffer header from the LRU chain. */ + SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); + + /* + * If we're not reusing it immediately, free the buffer header + * and data for real. + */ + if (free_mem) + __db_shalloc_free(dbmp->addr, bhp); +} diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c new file mode 100644 index 0000000000..418802a3b9 --- /dev/null +++ b/db2/mp/mp_fget.c @@ -0,0 +1,359 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fget.c 10.22 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +int __sleep_on_every_page_get; /* XXX: thread debugging option. */ + +/* + * memp_fget -- + * Get a page from the file. + */ +int +memp_fget(dbmfp, pgnoaddr, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + u_long flags; + void *addrp; +{ + BH *bhp, *tbhp; + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + db_pgno_t lastpgno; + size_t bucket, mf_offset; + off_t size; + u_long cnt; + int b_incr, b_inserted, readonly_alloc, ret; + void *addr; + + dbmp = dbmfp->dbmp; + + /* + * Validate arguments. + * + * !!! + * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly + * files here, and create non-existent pages in readonly files if the + * flags are set, later. The reason is that the hash access method + * wants to get empty pages that don't really exist in readonly files. + * The only alternative is for hash to write the last "bucket" all the + * time, which we don't want to do because one of our big goals in life + * is to keep database files small. It's sleazy as hell, but we catch + * any attempt to actually write the file in memp_fput(). + */ +#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) + if (flags != 0) { + if ((ret = + __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0) + return (ret); + + switch (flags) { + case DB_MPOOL_CREATE: + case DB_MPOOL_LAST: + case DB_MPOOL_NEW: + case 0: + break; + default: + return (__db_ferr(dbmp->dbenv, "memp_fget", 1)); + } + } + +#ifdef DEBUG + /* + * XXX + * We want to switch threads as often as possible. Sleep every time + * we get a new page to make it more likely. + */ + if (__sleep_on_every_page_get && (dbmp->dbenv == NULL || + dbmp->dbenv->db_yield == NULL || dbmp->dbenv->db_yield() != 0)) + __db_sleep(0, 1); +#endif + + mp = dbmp->mp; + mfp = dbmfp->mfp; + mf_offset = OFFSET(dbmp, mfp); + addr = NULL; + bhp = NULL; + b_incr = b_inserted = readonly_alloc = ret = 0; + + LOCKREGION(dbmp); + + /* + * If mmap'ing the file, just return a pointer. However, if another + * process has opened the file for writing since we mmap'd it, start + * playing the game by their rules, i.e. everything goes through the + * cache. All pages previously returned should be safe, as long as + * a locking protocol was observed. + * + * XXX + * We don't discard the map because we don't know when all of the + * pages will have been discarded from the process' address space. + * It would be possible to do so by reference counting the open + * pages from the mmap, but it's unclear to me that it's worth it. + */ + if (dbmfp->addr != NULL && dbmfp->mfp->can_mmap) { + lastpgno = dbmfp->len == 0 ? + 0 : (dbmfp->len - 1) / mfp->stat.st_pagesize; + if (LF_ISSET(DB_MPOOL_LAST)) + *pgnoaddr = lastpgno; + else { + /* + * !!! + * Allocate a page that can never really exist. See + * the comment above about non-existent pages and the + * hash access method. + */ + if (LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) + readonly_alloc = 1; + else if (*pgnoaddr > lastpgno) { + __db_err(dbmp->dbenv, + "%s: page %lu doesn't exist", + dbmfp->path, (u_long)*pgnoaddr); + ret = EINVAL; + goto err; + } + } + if (!readonly_alloc) { + addr = ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + + ++mp->stat.st_map; + ++mfp->stat.st_map; + + goto mapret; + } + } + + /* + * If requesting the last page or a new page, find the last page. The + * tricky thing is that the user may have created a page already that's + * after any page that exists in the file. + */ + if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) { + /* + * Temporary files may not yet have been created. + * + * Don't lock -- there are no atomicity issues for stat(2). + */ + if (dbmfp->fd == -1) + size = 0; + else if ((ret = __db_stat(dbmp->dbenv, + dbmfp->path, dbmfp->fd, &size, NULL)) != 0) + goto err; + + *pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize; + + /* + * Walk the list of BH's, looking for later pages. Save the + * pointer if a later page is found so that we don't have to + * search the list twice. + * + * If requesting a new page, return the page one after the last + * page -- which we'll have to create. + */ + for (tbhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + tbhp != NULL; tbhp = SH_TAILQ_NEXT(tbhp, q, __bh)) + if (tbhp->pgno >= *pgnoaddr && + tbhp->mf_offset == mf_offset) { + bhp = tbhp; + *pgnoaddr = bhp->pgno; + } + if (LF_ISSET(DB_MPOOL_NEW)) + ++*pgnoaddr; + } + + /* If we already found the right buffer, return it. */ + if (LF_ISSET(DB_MPOOL_LAST) && bhp != NULL) { + addr = bhp->buf; + goto found; + } + + /* If we haven't checked the BH list yet, do the search. */ + if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) { + ++mp->stat.st_hash_searches; + bucket = BUCKET(mp, mf_offset, *pgnoaddr); + for (cnt = 0, + bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) { + ++cnt; + if (bhp->pgno == *pgnoaddr && + bhp->mf_offset == mf_offset) { + addr = bhp->buf; + if (cnt > mp->stat.st_hash_longest) + mp->stat.st_hash_longest = cnt; + mp->stat.st_hash_examined += cnt; + goto found; + } + } + if (cnt > mp->stat.st_hash_longest) + mp->stat.st_hash_longest = cnt; + mp->stat.st_hash_examined += cnt; + } + + /* + * Allocate a new buffer header and data space, and mark the contents + * as useless. + */ + if ((ret = __memp_ralloc(dbmp, sizeof(BH) - + sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0) + goto err; + addr = bhp->buf; +#ifdef DEBUG + if ((ALIGNTYPE)addr & (sizeof(size_t) - 1)) { + __db_err(dbmp->dbenv, + "Internal error: BH data NOT size_t aligned."); + abort(); + } +#endif + memset(bhp, 0, sizeof(BH)); + LOCKINIT(dbmp, &bhp->mutex); + + /* + * Prepend the bucket header to the head of the appropriate MPOOL + * bucket hash list. Append the bucket header to the tail of the + * MPOOL LRU chain. + * + * We have to do this before we read in the page so we can discard + * our region lock without screwing up the world. + */ + bucket = BUCKET(mp, mf_offset, *pgnoaddr); + SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, mq, __bh); + SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q); + b_inserted = 1; + + /* Set the page number, and associated MPOOLFILE. */ + bhp->mf_offset = mf_offset; + bhp->pgno = *pgnoaddr; + + /* + * If we know we created the page, zero it out and continue. + * + * !!! + * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function. + * If DB_MPOOL_CREATE is used, then the application's pgin function + * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, + * it can detect all of its page creates, and not bother. + * + * Otherwise, read the page into memory, optionally creating it if + * DB_MPOOL_CREATE is set. + * + * Increment the reference count for created buffers, but importantly, + * increment the reference count for buffers we're about to read so + * that the buffer can't move. + */ + ++bhp->ref; + b_incr = 1; + + if (LF_ISSET(DB_MPOOL_NEW)) + memset(addr, 0, mfp->stat.st_pagesize); + else { + /* + * It's possible for the read function to fail, which means + * that we fail as well. + */ +reread: if ((ret = __memp_pgread(dbmfp, + bhp, LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))) != 0) + goto err; + + /* + * !!! + * The __memp_pgread call discarded and reacquired the region + * lock. Because the buffer reference count was incremented + * before the region lock was discarded the buffer didn't move. + */ + ++mp->stat.st_cache_miss; + ++mfp->stat.st_cache_miss; + } + + if (0) { +found: /* Increment the reference count. */ + if (bhp->ref == UINT16_T_MAX) { + __db_err(dbmp->dbenv, + "%s: too many references to page %lu", + dbmfp->path, bhp->pgno); + ret = EAGAIN; + goto err; + } + ++bhp->ref; + b_incr = 1; + + /* + * Any found buffer might be trouble. + * + * BH_LOCKED -- + * I/O in progress, wait for it to finish. Because the buffer + * reference count was incremented before the region lock was + * discarded we know the buffer didn't move. + */ + if (F_ISSET(bhp, BH_LOCKED)) { + UNLOCKREGION(dbmp); + LOCKBUFFER(dbmp, bhp); + /* Waiting for I/O to finish... */ + UNLOCKBUFFER(dbmp, bhp); + LOCKREGION(dbmp); + } + + /* + * BH_TRASH -- + * The buffer is garbage. + */ + if (F_ISSET(bhp, BH_TRASH)) + goto reread; + + /* + * BH_CALLPGIN -- + * The buffer was written, and the contents need to be + * converted again. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + goto err; + F_CLR(bhp, BH_CALLPGIN); + } + + ++mp->stat.st_cache_hit; + ++mfp->stat.st_cache_hit; + } + +mapret: LOCKHANDLE(dbmp, &dbmfp->mutex); + ++dbmfp->pinref; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + if (0) { +err: /* + * If no other process is already waiting on a created buffer, + * go ahead and discard it, it's not useful. + */ + if (b_incr) + --bhp->ref; + if (b_inserted && bhp->ref == 0) + __memp_bhfree(dbmp, mfp, bhp, 1); + } + + UNLOCKREGION(dbmp); + + *(void **)addrp = addr; + return (ret); +} diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c new file mode 100644 index 0000000000..7703847b73 --- /dev/null +++ b/db2/mp/mp_fopen.c @@ -0,0 +1,437 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fopen.c 10.24 (Sleepycat) 8/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *)); +static int __memp_mf_open __P((DB_MPOOL *, DB_MPOOLFILE *, + int, int, size_t, int, DBT *, u_int8_t *, int, MPOOLFILE **)); + +/* + * memp_fopen -- + * Open a backing file for the memory pool. + */ +int +memp_fopen(dbmp, path, ftype, + flags, mode, pagesize, lsn_offset, pgcookie, fileid, retp) + DB_MPOOL *dbmp; + const char *path; + int ftype, flags, mode, lsn_offset; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + DB_MPOOLFILE **retp; +{ + int ret; + + /* Validate arguments. */ + if ((ret = __db_fchk(dbmp->dbenv, + "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0) + return (ret); + + return (__memp_fopen(dbmp, path, ftype, + flags, mode, pagesize, lsn_offset, pgcookie, fileid, 1, retp)); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool; internal version. + * + * PUBLIC: int __memp_fopen __P((DB_MPOOL *, const char *, int, int, + * PUBLIC: int, size_t, int, DBT *, u_int8_t *, int, DB_MPOOLFILE **)); + */ +int +__memp_fopen(dbmp, path, + ftype, flags, mode, pagesize, lsn_offset, pgcookie, fileid, needlock, retp) + DB_MPOOL *dbmp; + const char *path; + int ftype, flags, mode, lsn_offset, needlock; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + DB_MPOOLFILE **retp; +{ + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + off_t size; + int ret; + + dbenv = dbmp->dbenv; + ret = 0; + + /* Require a non-zero pagesize. */ + if (pagesize == 0) { + __db_err(dbenv, "memp_fopen: pagesize not specified"); + return (EINVAL); + } + + /* Allocate and initialize the per-process structure. */ + if ((dbmfp = + (DB_MPOOLFILE *)calloc(1, sizeof(DB_MPOOLFILE))) == NULL) { + __db_err(dbenv, "%s: %s", + path == NULL ? TEMPORARY : path, strerror(ENOMEM)); + return (ENOMEM); + } + LOCKINIT(dbmp, &dbmfp->mutex); + dbmfp->dbmp = dbmp; + dbmfp->fd = -1; + if (LF_ISSET(DB_RDONLY)) + F_SET(dbmfp, MP_READONLY); + + if (path == NULL) { + if (LF_ISSET(DB_RDONLY)) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + ret = EINVAL; + goto err; + } + dbmfp->path = (char *) TEMPORARY; + F_SET(dbmfp, MP_PATH_TEMP); + } else { + /* Calculate the real name for this file. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, NULL, path, NULL, &dbmfp->path)) != 0) + goto err; + F_SET(dbmfp, MP_PATH_ALLOC); + + + /* Open the file. */ + if ((ret = __db_fdopen(dbmfp->path, + LF_ISSET(DB_CREATE | DB_RDONLY), DB_CREATE | DB_RDONLY, + mode, &dbmfp->fd)) != 0) { + __db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret)); + goto err; + } + + /* Don't permit files that aren't a multiple of the pagesize. */ + if ((ret = __db_stat(dbenv, + dbmfp->path, dbmfp->fd, &size, NULL)) != 0) + goto err; + if (size % pagesize) { + __db_err(dbenv, + "%s: file size not a multiple of the pagesize", + dbmfp->path); + ret = EINVAL; + goto err; + } + } + + /* Find/allocate the shared file object. */ + if (needlock) + LOCKREGION(dbmp); + ret = __memp_mf_open(dbmp, dbmfp, ftype, + F_ISSET(dbmfp, MP_READONLY), pagesize, + lsn_offset, pgcookie, fileid, F_ISSET(dbmfp, MP_PATH_TEMP), &mfp); + if (needlock) + UNLOCKREGION(dbmp); + if (ret != 0) + goto err; + + dbmfp->mfp = mfp; + + /* + * If a file: + * + * + is read-only + * + doesn't require any pgin/pgout support + * + is less than mp_mmapsize bytes in size. + * + and the DB_NOMMAP flag wasn't set + * + * we can mmap it instead of reading/writing buffers. Don't do error + * checking based on the mmap call failure. We want to do normal I/O + * on the file if the reason we failed was because the file was on an + * NFS mounted partition, and we can fail in buffer I/O just as easily + * as here. + * + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ +#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ + dbmfp->addr = NULL; + mfp->can_mmap = F_ISSET(dbmfp, MP_READONLY) && + ftype == 0 && !LF_ISSET(DB_NOMMAP) && path != NULL && + size <= (dbenv == NULL || dbenv->mp_mmapsize == 0 ? + DB_MAXMMAPSIZE : (off_t)dbenv->mp_mmapsize); + if (mfp->can_mmap) { + dbmfp->len = size; + if (__db_mmap(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) { + mfp->can_mmap = 0; + dbmfp->addr = NULL; + } + } + + LOCKHANDLE(dbmp, &dbmp->mutex); + TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + *retp = dbmfp; + return (0); + +err: if (F_ISSET(dbmfp, MP_PATH_ALLOC)) + FREES(dbmfp->path); + if (dbmfp->fd != -1) + (void)__db_close(dbmfp->fd); + if (dbmfp != NULL) + FREE(dbmfp, sizeof(DB_MPOOLFILE)); + return (ret); +} + +/* + * __memp_mf_open -- + * Open an MPOOLFILE. + */ +static int +__memp_mf_open(dbmp, dbmfp, + ftype, readonly, pagesize, lsn_offset, pgcookie, fileid, istemp, retp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + int ftype, readonly, lsn_offset, istemp; + size_t pagesize; + DBT *pgcookie; + u_int8_t *fileid; + MPOOLFILE **retp; +{ + MPOOLFILE *mfp; + int ret; + u_int8_t idbuf[DB_FILE_ID_LEN]; + void *p; + + /* Temporary files can't match previous files. */ + if (istemp) + goto alloc; + + /* + * Get the file id if we weren't give one. Generated file id's don't + * use timestamps, otherwise there'd be no chance of anyone joining + * the party. + */ + if (fileid == NULL) { + if ((ret = + __db_fileid(dbmp->dbenv, dbmfp->path, 0, idbuf)) != 0) + return (ret); + fileid = idbuf; + } + + /* Walk the list of MPOOLFILE's, looking for a matching file. */ + for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + if (!memcmp(fileid, + ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) { + if (ftype != mfp->ftype || + pagesize != mfp->stat.st_pagesize) { + __db_err(dbmp->dbenv, + "%s: ftype or pagesize changed", + dbmfp->path); + ret = EINVAL; + mfp = NULL; + goto ret1; + } + /* + * Found it: increment the reference count and update + * the mmap-able status. + */ + ++mfp->ref; + if (!readonly) + mfp->can_mmap = 0; + goto ret1; + } + + /* Allocate a new MPOOLFILE. */ +alloc: if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto ret1; + + /* Initialize the structure. */ + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->ref = 1; + mfp->ftype = ftype; + mfp->lsn_off = lsn_offset; + mfp->stat.st_pagesize = pagesize; + + /* Copy the file path into shared memory. */ + if ((ret = __memp_ralloc(dbmp, + strlen(dbmfp->path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, dbmfp->path, strlen(dbmfp->path) + 1); + + /* Copy the file identification string into shared memory. */ + if (istemp) + mfp->fileid_off = 0; + else { + if ((ret = __memp_ralloc(dbmp, + DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, fileid, DB_FILE_ID_LEN); + } + + /* Copy the page cookie into shared memory. */ + if (pgcookie == NULL || pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_ralloc(dbmp, + pgcookie->size, &mfp->pgcookie_off, &p)) != 0) + goto err; + memcpy(p, pgcookie->data, pgcookie->size); + mfp->pgcookie_len = pgcookie->size; + } + + /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ + SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile); + + if (0) { +err: if (mfp->path_off != 0) + __db_shalloc_free(dbmp->addr, + ADDR(dbmp, mfp->path_off)); + if (!istemp) + __db_shalloc_free(dbmp->addr, + ADDR(dbmp, mfp->fileid_off)); + if (mfp != NULL) + __db_shalloc_free(dbmp->addr, mfp); + mfp = NULL; + } + +ret1: *retp = mfp; + return (0); +} + +/* + * memp_fclose -- + * Close a backing file for the memory pool. + */ +int +memp_fclose(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_MPOOL *dbmp; + int ret, t_ret; + + dbmp = dbmfp->dbmp; + ret = 0; + + /* Complain if pinned blocks never returned. */ + if (dbmfp->pinref != 0) + __db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned", + dbmfp->path, (u_long)dbmfp->pinref); + + /* Remove the DB_MPOOLFILE structure from the list. */ + LOCKHANDLE(dbmp, &dbmp->mutex); + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + /* Close the underlying MPOOLFILE. */ + (void)__memp_mf_close(dbmp, dbmfp); + + /* Discard any mmap information. */ + if (dbmfp->addr != NULL && + (ret = __db_munmap(dbmfp->addr, dbmfp->len)) != 0) + __db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(ret)); + + /* Close the file; temporary files may not yet have been created. */ + if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) { + __db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(t_ret)); + if (ret != 0) + t_ret = ret; + } + + /* Potentially allocated path. */ + if (F_ISSET(dbmfp, MP_PATH_ALLOC)) + FREES(dbmfp->path); + + /* Free the DB_MPOOLFILE structure. */ + FREE(dbmfp, sizeof(DB_MPOOLFILE)); + + return (ret); +} + +/* + * __memp_mf_close -- + * Close down an MPOOLFILE. + */ +static int +__memp_mf_close(dbmp, dbmfp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; +{ + BH *bhp, *nbhp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t mf_offset; + + mp = dbmp->mp; + mfp = dbmfp->mfp; + + LOCKREGION(dbmp); + + /* If more than a single reference, simply decrement. */ + if (mfp->ref > 1) { + --mfp->ref; + goto ret1; + } + + /* + * Move any BH's held by the file to the free list. We don't free the + * memory itself because we may be discarding the memory pool, and it's + * fairly expensive to reintegrate the buffers back into the region for + * no purpose. + */ + mf_offset = OFFSET(dbmp, mfp); + for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + +#ifdef DEBUG_NO_DIRTY + /* Complain if we find any blocks that were left dirty. */ + if (F_ISSET(bhp, BH_DIRTY)) + __db_err(dbmp->dbenv, + "%s: close: pgno %lu left dirty; ref %lu", + dbmfp->path, (u_long)bhp->pgno, (u_long)bhp->ref); +#endif + + if (bhp->mf_offset == mf_offset) { + __memp_bhfree(dbmp, mfp, bhp, 0); + SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh); + } + } + + /* Delete from the list of MPOOLFILEs. */ + SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + + /* Free the space. */ + __db_shalloc_free(dbmp->addr, mfp); + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->fileid_off)); + if (mfp->pgcookie_off != 0) + __db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->pgcookie_off)); + +ret1: UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c new file mode 100644 index 0000000000..5fac8ae76b --- /dev/null +++ b/db2/mp/mp_fput.c @@ -0,0 +1,140 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fput.c 10.10 (Sleepycat) 7/20/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_fput -- + * Mpool file put function. + */ +int +memp_fput(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_long flags; +{ + BH *bhp; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + int wrote, ret; + + dbmp = dbmfp->dbmp; + + /* Validate arguments. */ + if (flags) { + if ((ret = __db_fchk(dbmp->dbenv, "memp_fput", flags, + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbmp->dbenv, "memp_fput", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbmp->dbenv, + "%s: dirty flag set for readonly file page", + dbmfp->path); + return (EACCES); + } + } + + /* Decrement the pinned reference count. */ + LOCKHANDLE(dbmp, &dbmfp->mutex); + if (dbmfp->pinref == 0) + __db_err(dbmp->dbenv, + "%s: put: more blocks returned than retrieved", + dbmfp->path); + else + --dbmfp->pinref; + UNLOCKHANDLE(dbmp, &dbmfp->mutex); + + /* + * If we're mapping the file, there's nothing to do. Because we can + * quit mapping at any time, we have to check on each buffer to see + * if it's in the map region. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) + return (0); + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + LOCKREGION(dbmp); + + /* Set/clear the page bits. */ + if (LF_ISSET(DB_MPOOL_CLEAN)) + F_CLR(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DIRTY)) + F_SET(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + /* + * If more than one reference to the page, we're done. Ignore discard + * flags (for now) and leave it at its position in the LRU chain. The + * rest gets done at last reference close. + */ +#ifdef DEBUG + if (bhp->ref == 0) { + __db_err(dbmp->dbenv, + "Internal error: bhp->ref on page %lu went negative.", + (u_long)bhp->pgno); + abort(); + } +#endif + if (--bhp->ref > 0) { + UNLOCKREGION(dbmp); + return (0); + } + + /* Move the buffer to the head/tail of the LRU chain. */ + SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); + if (F_ISSET(bhp, BH_DISCARD)) + SH_TAILQ_INSERT_HEAD(&dbmp->mp->bhq, bhp, q, __bh); + else + SH_TAILQ_INSERT_TAIL(&dbmp->mp->bhq, bhp, q); + + /* + * If this buffer is scheduled for writing because of a checkpoint, + * write it now. If we can't write it, set a flag so that the next + * time the memp_sync function is called we try writing it there, + * as the checkpoint application better be able to write all of the + * files. + */ + if (F_ISSET(bhp, BH_WRITE)) + if (F_ISSET(bhp, BH_DIRTY)) { + if (__memp_bhwrite(dbmp, + dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) + F_SET(dbmp->mp, MP_LSN_RETRY); + } else { + F_CLR(bhp, BH_WRITE); + + mfp = ADDR(dbmp, bhp->mf_offset); + --mfp->lsn_cnt; + + --dbmp->mp->lsn_cnt; + } + + UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c new file mode 100644 index 0000000000..588085a358 --- /dev/null +++ b/db2/mp/mp_fset.c @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_fset.c 10.8 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_fset -- + * Mpool page set-flag routine. + */ +int +memp_fset(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_long flags; +{ + BH *bhp; + DB_MPOOL *dbmp; + int ret; + + dbmp = dbmfp->dbmp; + + /* Validate arguments. */ + if (flags != 0) { + if ((ret = __db_fchk(dbmp->dbenv, "memp_fset", flags, + DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbmp->dbenv, "memp_fset", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbmp->dbenv, + "%s: dirty flag set for readonly file page", + dbmfp->path); + return (EACCES); + } + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + LOCKREGION(dbmp); + + if (LF_ISSET(DB_MPOOL_DIRTY)) + F_SET(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_CLEAN)) + F_CLR(bhp, BH_DIRTY); + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + UNLOCKREGION(dbmp); + return (0); +} diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c new file mode 100644 index 0000000000..257ce1b9e9 --- /dev/null +++ b/db2/mp/mp_open.c @@ -0,0 +1,176 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_open.c 10.12 (Sleepycat) 7/6/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_open -- + * Initialize and/or join a memory pool. + */ +int +memp_open(path, flags, mode, dbenv, retp) + const char *path; + int flags, mode; + DB_ENV *dbenv; + DB_MPOOL **retp; +{ + DB_MPOOL *dbmp; + size_t cachesize; + int ret; + + /* Validate arguments. */ +#ifdef HAVE_SPINLOCKS +#define OKFLAGS (DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD) +#else +#define OKFLAGS (DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP) +#endif + if ((ret = __db_fchk(dbenv, "memp_open", flags, OKFLAGS)) != 0) + return (ret); + + /* Extract fields from DB_ENV structure. */ + cachesize = dbenv == NULL ? 0 : dbenv->mp_size; + + /* Create and initialize the DB_MPOOL structure. */ + if ((dbmp = (DB_MPOOL *)calloc(1, sizeof(DB_MPOOL))) == NULL) + return (ENOMEM); + LOCKINIT(dbmp, &dbmp->mutex); + LIST_INIT(&dbmp->dbregq); + TAILQ_INIT(&dbmp->dbmfq); + + dbmp->dbenv = dbenv; + + /* Decide if it's possible for anyone else to access the pool. */ + if ((dbenv == NULL && path == NULL) || + (dbenv != NULL && F_ISSET(dbenv, DB_MPOOL_PRIVATE))) + F_SET(dbmp, MP_ISPRIVATE); + + /* + * Map in the region. We do locking regardless, as portions of it are + * implemented in common code (if we put the region in a file, that is). + */ + F_SET(dbmp, MP_LOCKREGION); + if ((ret = __memp_ropen(dbmp, path, cachesize, mode, flags)) != 0) + goto err; + F_CLR(dbmp, MP_LOCKREGION); + + /* + * If there's concurrent access, then we have to lock the region. + * If it's threaded, then we have to lock both the handles and the + * region. + */ + if (!F_ISSET(dbmp, MP_ISPRIVATE)) + F_SET(dbmp, MP_LOCKREGION); + if (LF_ISSET(DB_THREAD)) + F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION); + + *retp = dbmp; + return (0); + +err: if (dbmp != NULL) + FREE(dbmp, sizeof(DB_MPOOL)); + return (ret); +} + +/* + * memp_close -- + * Close a memory pool. + */ +int +memp_close(dbmp) + DB_MPOOL *dbmp; +{ + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + int ret, t_ret; + + ret = 0; + + /* Discard DB_MPREGs. */ + while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { + LIST_REMOVE(mpreg, q); + FREE(mpreg, sizeof(DB_MPREG)); + } + + /* Discard DB_MPOOLFILEs. */ + while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) + if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0) + ret = t_ret; + + /* Close the region. */ + if ((t_ret = __memp_rclose(dbmp)) && ret == 0) + ret = t_ret; + + /* Free the structure. */ + FREE(dbmp, sizeof(DB_MPOOL)); + + return (ret); +} + +/* + * memp_unlink -- + * Exit a memory pool. + */ +int +memp_unlink(path, force, dbenv) + const char *path; + int force; + DB_ENV *dbenv; +{ + return (__db_runlink(dbenv, + DB_APP_NONE, path, DB_DEFAULT_MPOOL_FILE, force)); +} + +/* + * memp_register -- + * Register a file type's pgin, pgout routines. + */ +int +memp_register(dbmp, ftype, pgin, pgout) + DB_MPOOL *dbmp; + int ftype; + int (*pgin) __P((db_pgno_t, void *, DBT *)); + int (*pgout) __P((db_pgno_t, void *, DBT *)); +{ + DB_MPREG *mpr; + + if ((mpr = (DB_MPREG *)malloc(sizeof(DB_MPREG))) == NULL) + return (ENOMEM); + + mpr->ftype = ftype; + mpr->pgin = pgin; + mpr->pgout = pgout; + + /* + * Insert at the head. Because we do a linear walk, we'll find + * the most recent registry in the case of multiple entries, so + * we don't have to check for multiple registries. + */ + LOCKHANDLE(dbmp, &dbmp->mutex); + LIST_INSERT_HEAD(&dbmp->dbregq, mpr, q); + UNLOCKHANDLE(dbmp, &dbmp->mutex); + + return (0); +} diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c new file mode 100644 index 0000000000..94eabf5947 --- /dev/null +++ b/db2/mp/mp_pr.c @@ -0,0 +1,313 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_pr.c 10.12 (Sleepycat) 7/29/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" + +void __memp_debug __P((DB_MPOOL *, FILE *, int)); + +static void __memp_pbh __P((FILE *, DB_MPOOL *, BH *, int)); +static void __memp_pdbmf __P((FILE *, DB_MPOOLFILE *, int)); +static void __memp_pmf __P((FILE *, MPOOLFILE *, int)); +static void __memp_pmp __P((FILE *, DB_MPOOL *, MPOOL *, int)); + +/* + * memp_stat -- + * Display MPOOL statistics. + */ +int +memp_stat(dbmp, gspp, fspp, db_malloc) + DB_MPOOL *dbmp; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + void *(*db_malloc) __P((size_t)); +{ + DB_MPOOL_FSTAT **tfsp; + MPOOLFILE *mfp; + size_t len, nlen; + char *name; + + /* Allocate space for the global statistics. */ + if (gspp != NULL) { + *gspp = NULL; + + if ((*gspp = db_malloc == NULL ? + (DB_MPOOL_STAT *)malloc(sizeof(**gspp)) : + (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL) + return (ENOMEM); + + LOCKREGION(dbmp); + + /* Copy out the global statistics. */ + **gspp = dbmp->mp->stat; + (*gspp)->st_hash_buckets = dbmp->mp->htab_buckets; + + UNLOCKREGION(dbmp); + } + + if (fspp != NULL) { + *fspp = NULL; + + LOCKREGION(dbmp); + + /* Count the MPOOLFILE structures. */ + for (len = 0, + mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; + ++len, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)); + + UNLOCKREGION(dbmp); + + if (len == 0) + return (0); + + /* Allocate space for the pointers. */ + len = (len + 1) * sizeof(DB_MPOOL_FSTAT *); + if ((*fspp = db_malloc == NULL ? + (DB_MPOOL_FSTAT **)malloc(len) : + (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL) + return (ENOMEM); + + LOCKREGION(dbmp); + + /* Build each individual entry. */ + for (tfsp = *fspp, + mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; + ++tfsp, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + name = ADDR(dbmp, mfp->path_off); + nlen = strlen(name); + len = sizeof(DB_MPOOL_FSTAT) + nlen + 1; + if ((*tfsp = db_malloc == NULL ? + (DB_MPOOL_FSTAT *)malloc(len) : + (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL) + return (ENOMEM); + **tfsp = mfp->stat; + (*tfsp)->file_name = (char *) + (u_int8_t *)*tfsp + sizeof(DB_MPOOL_FSTAT); + memcpy((*tfsp)->file_name, name, nlen + 1); + } + *tfsp = NULL; + + UNLOCKREGION(dbmp); + } + return (0); +} + +/* + * __memp_debug -- + * Display MPOOL structures. + * + * PUBLIC: void __memp_debug __P((DB_MPOOL *, FILE *, int)); + */ +void +__memp_debug(dbmp, fp, data) + DB_MPOOL *dbmp; + FILE *fp; + int data; +{ + DB_MPOOLFILE *dbmfp; + u_long cnt; + + /* Make it easy to call from the debugger. */ + if (fp == NULL) + fp = stderr; + + /* Welcome message. */ + (void)fprintf(fp, "%s\nMpool per-process (%lu) statistics\n", + DB_LINE, (u_long)getpid()); + + if (data) + (void)fprintf(fp, " fd: %d; addr %lx; maddr %lx\n", + dbmp->fd, (u_long)dbmp->addr, (u_long)dbmp->maddr); + + /* Display the DB_MPOOLFILE structures. */ + for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; ++cnt, dbmfp = TAILQ_NEXT(dbmfp, q)); + (void)fprintf(fp, "%lu process-local files\n", cnt); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) { + (void)fprintf(fp, "%s\n", dbmfp->path); + __memp_pdbmf(fp, dbmfp, data); + } + + /* Switch to global statistics. */ + (void)fprintf(fp, "\n%s\nMpool statistics\n", DB_LINE); + + /* Display the MPOOL structure. */ + __memp_pmp(fp, dbmp, dbmp->mp, data); + + /* Flush in case we're debugging. */ + (void)fflush(fp); +} + +/* + * __memp_pdbmf -- + * Display a DB_MPOOLFILE structure. + */ +static void +__memp_pdbmf(fp, dbmfp, data) + FILE *fp; + DB_MPOOLFILE *dbmfp; + int data; +{ + if (!data) + return; + + (void)fprintf(fp, " fd: %d; %s\n", + dbmfp->fd, F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write"); +} + +/* + * __memp_pmp -- + * Display the MPOOL structure. + */ +static void +__memp_pmp(fp, dbmp, mp, data) + FILE *fp; + DB_MPOOL *dbmp; + MPOOL *mp; + int data; +{ + BH *bhp; + MPOOLFILE *mfp; + DB_HASHTAB *htabp; + size_t bucket; + int cnt; + const char *sep; + + (void)fprintf(fp, "references: %lu; cachesize: %lu\n", + (u_long)mp->rlayout.refcnt, (u_long)mp->stat.st_cachesize); + (void)fprintf(fp, + " %lu pages created\n", mp->stat.st_page_create); + (void)fprintf(fp, + " %lu mmap pages returned\n", mp->stat.st_map); + (void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n", + mp->stat.st_page_in + mp->stat.st_page_out, + mp->stat.st_page_in, mp->stat.st_page_out); + if (mp->stat.st_cache_hit + mp->stat.st_cache_miss != 0) + (void)fprintf(fp, + " %.0f%% cache hit rate (%lu hit, %lu miss)\n", + ((double)mp->stat.st_cache_hit / + (mp->stat.st_cache_hit + mp->stat.st_cache_miss)) * 100, + mp->stat.st_cache_hit, mp->stat.st_cache_miss); + + /* Display the MPOOLFILE structures. */ + for (cnt = 0, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)); + (void)fprintf(fp, "%d total files\n", cnt); + for (cnt = 1, mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; ++cnt, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + (void)fprintf(fp, "file %d\n", cnt); + __memp_pmf(fp, mfp, data); + } + + if (!data) + return; + + /* Display the hash table list of BH's. */ + (void)fprintf(fp, "%s\nHASH table of BH's (%lu buckets):\n", + DB_LINE, (u_long)mp->htab_buckets); + (void)fprintf(fp, + "longest chain searched %lu\n", mp->stat.st_hash_longest); + (void)fprintf(fp, "average chain searched %lu (total/calls: %lu/%lu)\n", + mp->stat.st_hash_examined / + (mp->stat.st_hash_searches ? mp->stat.st_hash_searches : 1), + mp->stat.st_hash_examined, mp->stat.st_hash_searches); + for (htabp = dbmp->htab, + bucket = 0; bucket < mp->htab_buckets; ++htabp, ++bucket) { + if (SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh) != NULL) + (void)fprintf(fp, "%lu:\n", (u_long)bucket); + for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) + __memp_pbh(fp, dbmp, bhp, data); + } + + /* Display the LRU list of BH's. */ + (void)fprintf(fp, "LRU list of BH's (pgno/offset):"); + for (sep = "\n ", bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh); + bhp != NULL; sep = ", ", bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + (void)fprintf(fp, "%s%lu/%lu", sep, + (u_long)bhp->pgno, (u_long)OFFSET(dbmp, bhp)); + (void)fprintf(fp, "\n"); +} + +/* + * __memp_pmf -- + * Display an MPOOLFILE structure. + */ +static void +__memp_pmf(fp, mfp, data) + FILE *fp; + MPOOLFILE *mfp; + int data; +{ + (void)fprintf(fp, " %lu pages created\n", mfp->stat.st_page_create); + (void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n", + mfp->stat.st_page_in + mfp->stat.st_page_out, + mfp->stat.st_page_in, mfp->stat.st_page_out); + if (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss != 0) + (void)fprintf(fp, + " %.0f%% cache hit rate (%lu hit, %lu miss)\n", + ((double)mfp->stat.st_cache_hit / + (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss)) * 100, + mfp->stat.st_cache_hit, mfp->stat.st_cache_miss); + if (!data) + return; + + (void)fprintf(fp, " %d references; %s; pagesize: %lu\n", mfp->ref, + mfp->can_mmap ? "mmap" : "read/write", + (u_long)mfp->stat.st_pagesize); +} + +/* + * __memp_pbh -- + * Display a BH structure. + */ +static void +__memp_pbh(fp, dbmp, bhp, data) + FILE *fp; + DB_MPOOL *dbmp; + BH *bhp; + int data; +{ + const char *sep; + + if (!data) + return; + + (void)fprintf(fp, " BH @ %lu (mf: %lu): page %lu; ref %lu", + (u_long)OFFSET(dbmp, bhp), + (u_long)bhp->mf_offset, (u_long)bhp->pgno, (u_long)bhp->ref); + sep = "; "; + if (F_ISSET(bhp, BH_DIRTY)) { + (void)fprintf(fp, "%sdirty", sep); + sep = ", "; + } + if (F_ISSET(bhp, BH_WRITE)) { + (void)fprintf(fp, "%schk_write", sep); + sep = ", "; + } + (void)fprintf(fp, "\n"); +} diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c new file mode 100644 index 0000000000..a5c52123b9 --- /dev/null +++ b/db2/mp/mp_region.c @@ -0,0 +1,340 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_region.c 10.11 (Sleepycat) 8/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * __memp_ralloc -- + * Allocate some space in the mpool region. + * + * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *)); + */ +int +__memp_ralloc(dbmp, len, offsetp, retp) + DB_MPOOL *dbmp; + size_t len, *offsetp; + void *retp; +{ + BH *bhp, *nbhp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t fsize, total; + int nomore, restart, ret, wrote; + void *p; + + mp = dbmp->mp; + + nomore = 0; +alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) { + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, p); + *(void **)retp = p; + return (0); + } + if (nomore) { + __db_err(dbmp->dbenv, "%s", strerror(ret)); + return (ret); + } + + /* Look for a buffer on the free list that's the right size. */ + for (bhp = + SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + if (__db_shsizeof(bhp) == len) { + SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh); + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, bhp); + *(void **)retp = bhp; + return (0); + } + } + + /* Discard from the free list until we've freed enough memory. */ + total = 0; + for (bhp = + SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh); + __db_shalloc_free(dbmp->addr, bhp); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce of memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + total += fsize = __db_shsizeof(bhp); + if (fsize >= len || total >= 3 * len) + goto alloc; + } + +retry: /* Find a buffer we can flush; pure LRU. */ + total = 0; + for (bhp = + SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + /* Ignore pinned or locked (I/O in progress) buffers. */ + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + /* Find the associated MPOOLFILE. */ + mfp = ADDR(dbmp, bhp->mf_offset); + + /* + * Write the page if it's dirty. + * + * If we wrote the page, fall through and free the buffer. We + * don't have to rewalk the list to acquire the buffer because + * it was never available for any other process to modify it. + * If we didn't write the page, but we discarded and reacquired + * the region lock, restart the buffer list walk. If we neither + * wrote the buffer nor discarded the region lock, continue down + * the buffer list. + */ + if (F_ISSET(bhp, BH_DIRTY)) { + if ((ret = __memp_bhwrite(dbmp, + mfp, bhp, &restart, &wrote)) != 0) + return (ret); + + /* + * It's possible that another process wants this buffer + * and incremented the ref count while we were writing + * it. + */ + if (bhp->ref != 0) + goto retry; + + if (wrote) + ++mp->stat.st_rw_evict; + else { + if (restart) + goto retry; + else + continue; + } + } else + ++mp->stat.st_ro_evict; + + /* + * Check to see if the buffer is the size we're looking for. + * If it is, simply reuse it. + */ + total += fsize = __db_shsizeof(bhp); + if (fsize == len) { + __memp_bhfree(dbmp, mfp, bhp, 0); + + if (offsetp != NULL) + *offsetp = OFFSET(dbmp, bhp); + *(void **)retp = bhp; + return (0); + } + + /* Free the buffer. */ + __memp_bhfree(dbmp, mfp, bhp, 1); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce of memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + if (fsize >= len || total >= 3 * len) + goto alloc; + + /* Restart the walk if we discarded the region lock. */ + if (restart) + goto retry; + } + nomore = 1; + goto alloc; +} + +/* + * __memp_ropen -- + * Attach to, and optionally create, the mpool region. + * + * PUBLIC: int __memp_ropen + * PUBLIC: __P((DB_MPOOL *, const char *, size_t, int, int)); + */ +int +__memp_ropen(dbmp, path, cachesize, mode, flags) + DB_MPOOL *dbmp; + const char *path; + size_t cachesize; + int mode, flags; +{ + MPOOL *mp; + size_t rlen; + int fd, newregion, ret, retry_cnt; + + /* + * Unlike other DB subsystems, mpool can't simply grow the region + * because it returns pointers into the region to its clients. To + * "grow" the region, we'd have to allocate a new region and then + * store a region number in the structures that reference regional + * objects. It's reasonable that we fail regardless, as clients + * shouldn't have every page in the region pinned, so the only + * "failure" mode should be a performance penalty because we don't + * find a page in the cache that we'd like to have found. + * + * Up the user's cachesize by 25% to account for our overhead. + */ + if (cachesize < DB_CACHESIZE_MIN) + if (cachesize == 0) + cachesize = DB_CACHESIZE_DEF; + else + cachesize = DB_CACHESIZE_MIN; + rlen = cachesize + cachesize / 4; + + /* Map in the region. */ + retry_cnt = newregion = 0; +retry: if (LF_ISSET(DB_CREATE)) { + /* + * If it's a private mpool, use malloc, it's a lot faster than + * instantiating a region. + * + * XXX + * If we're doing locking and don't have spinlocks for this + * architecture, we'd have to instantiate the file, we need + * the file descriptor for locking. However, it should not + * be possible for DB_THREAD to be set if HAVE_SPINLOCKS aren't + * defined. + */ + if (F_ISSET(dbmp, MP_ISPRIVATE)) + ret = (dbmp->maddr = malloc(rlen)) == NULL ? ENOMEM : 0; + else + ret = __db_rcreate(dbmp->dbenv, DB_APP_NONE, path, + DB_DEFAULT_MPOOL_FILE, mode, rlen, &fd, + &dbmp->maddr); + if (ret == 0) { + /* Put the MPOOL structure first in the region. */ + mp = dbmp->maddr; + + SH_TAILQ_INIT(&mp->bhq); + SH_TAILQ_INIT(&mp->bhfq); + SH_TAILQ_INIT(&mp->mpfq); + + /* Initialize the rest of the region as free space. */ + dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL); + __db_shalloc_init(dbmp->addr, rlen - sizeof(MPOOL)); + + /* + * + * Pretend that the cache will be broken up into 4K + * pages, and that we want to keep it under, say, 10 + * pages on each chain. This means a 256MB cache will + * allocate ~6500 offset pairs. + */ + mp->htab_buckets = + __db_tablesize((cachesize / (4 * 1024)) / 10); + + /* Allocate hash table space and initialize it. */ + if ((ret = __db_shalloc(dbmp->addr, + mp->htab_buckets * sizeof(DB_HASHTAB), + 0, &dbmp->htab)) != 0) + goto err; + __db_hashinit(dbmp->htab, mp->htab_buckets); + mp->htab = OFFSET(dbmp, dbmp->htab); + + memset(&mp->stat, 0, sizeof(mp->stat)); + mp->stat.st_cachesize = cachesize; + + mp->flags = 0; + + newregion = 1; + } else if (ret != EEXIST) + return (ret); + } + + /* If we didn't or couldn't create the region, try and join it. */ + if (!newregion && + (ret = __db_ropen(dbmp->dbenv, DB_APP_NONE, + path, DB_DEFAULT_MPOOL_FILE, 0, &fd, &dbmp->maddr)) != 0) { + /* + * If we failed because the file wasn't available, wait a + * second and try again. + */ + if (ret == EAGAIN && ++retry_cnt < 3) { + (void)__db_sleep(1, 0); + goto retry; + } + return (ret); + } + + /* Set up the common pointers. */ + dbmp->mp = dbmp->maddr; + dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL); + + /* + * If not already locked, lock the region -- if it's a new region, + * then either __db_rcreate() locked it for us or we malloc'd it + * instead of creating a region, neither of which requires locking + * here. + */ + if (!newregion) + LOCKREGION(dbmp); + + /* + * Get the hash table address; it's on the shared page, so we have + * to lock first. + */ + dbmp->htab = ADDR(dbmp, dbmp->mp->htab); + + dbmp->fd = fd; + + /* If we locked the region, release it now. */ + if (!F_ISSET(dbmp, MP_ISPRIVATE)) + UNLOCKREGION(dbmp); + return (0); + +err: if (fd != -1) { + dbmp->fd = fd; + (void)__memp_rclose(dbmp); + } + + if (newregion) + (void)memp_unlink(path, 1, dbmp->dbenv); + return (ret); +} + +/* + * __memp_rclose -- + * Close the mpool region. + * + * PUBLIC: int __memp_rclose __P((DB_MPOOL *)); + */ +int +__memp_rclose(dbmp) + DB_MPOOL *dbmp; +{ + if (F_ISSET(dbmp, MP_ISPRIVATE)) { + free(dbmp->maddr); + return (0); + } + return (__db_rclose(dbmp->dbenv, dbmp->fd, dbmp->maddr)); +} diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c new file mode 100644 index 0000000000..4f1205661a --- /dev/null +++ b/db2/mp/mp_sync.c @@ -0,0 +1,205 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)mp_sync.c 10.8 (Sleepycat) 7/2/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "common_ext.h" + +/* + * memp_sync -- + * Mpool sync function. + */ +int +memp_sync(dbmp, lsnp) + DB_MPOOL *dbmp; + DB_LSN *lsnp; +{ + BH *bhp; + DB_ENV *dbenv; + MPOOL *mp; + MPOOLFILE *mfp; + int can_write, wrote, lsn_cnt, restart, ret; + + dbenv = dbmp->dbenv; + + if (dbmp->dbenv->lg_info == NULL) { + __db_err(dbenv, "memp_sync requires logging"); + return (EINVAL); + } + + LOCKREGION(dbmp); + + /* + * If the application is asking about a previous call, and we haven't + * found any buffers that the application holding the pin couldn't + * write, return yes or no based on the current count. Note, if the + * application is asking about a LSN *smaller* than one we've already + * handled, then we return based on the count for that LSN. + */ + mp = dbmp->mp; + if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { + if (mp->lsn_cnt == 0) { + *lsnp = mp->lsn; + ret = 0; + } else + ret = DB_INCOMPLETE; + + UNLOCKREGION(dbmp); + return (ret); + } + + /* Else, it's a new checkpoint. */ + F_CLR(mp, MP_LSN_RETRY); + + /* + * Save the LSN. We know that it's a new LSN or larger than the one + * for which we were already doing a checkpoint. (BTW, I don't expect + * to see multiple LSN's from the same or multiple processes, but You + * Just Never Know. Responding as if they all called with the largest + * of the LSNs specified makes everything work. + * + * We don't currently use the LSN we save. We could potentially save + * the last-written LSN in each buffer header and use it to determine + * what buffers need to be written. The problem with this is that it's + * sizeof(LSN) more bytes of buffer header. We currently write all the + * dirty buffers instead. + * + * Walk the list of shared memory segments clearing the count of + * buffers waiting to be written. + */ + mp->lsn = *lsnp; + mp->lsn_cnt = 0; + for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + mfp->lsn_cnt = 0; + + /* + * Walk the list of buffers and mark all dirty buffers to be written + * and all pinned buffers to be potentially written. We do this in + * single fell swoop while holding the region locked so that processes + * can't make new buffers dirty, causing us to never finish. Since + * the application may have restarted the sync, clear any BH_WRITE + * flags that appear to be left over. + */ + can_write = lsn_cnt = 0; + for (lsn_cnt = 0, bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { + F_SET(bhp, BH_WRITE); + + if (bhp->ref == 0) + can_write = 1; + + mfp = ADDR(dbmp, bhp->mf_offset); + ++mfp->lsn_cnt; + + ++lsn_cnt; + } else + F_CLR(bhp, BH_WRITE); + + mp->lsn_cnt = lsn_cnt; + + /* If there no buffers we can write, we're done. */ + if (!can_write) { + UNLOCKREGION(dbmp); + return (mp->lsn_cnt ? DB_INCOMPLETE : 0); + } + + /* + * Write any buffers that we can. Restart the walk after each write, + * __memp_pgwrite() discards and reacquires the region lock during I/O. + */ +retry: for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + /* Ignore pinned or locked buffers. */ + if (!F_ISSET(bhp, BH_WRITE) || + bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + mfp = ADDR(dbmp, bhp->mf_offset); + if ((ret = + __memp_bhwrite(dbmp, mfp, bhp, &restart, &wrote)) != 0) + goto err; + if (wrote) { + if (restart) + goto retry; + continue; + } + __db_err(dbenv, "%s: unable to flush page: %lu", + ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno); + ret = EPERM; + goto err; + } + ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; + +err: UNLOCKREGION(dbmp); + return (ret); +} + +/* + * memp_fsync -- + * Mpool file sync function. + */ +int +memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + BH *bhp; + DB_MPOOL *dbmp; + size_t mf_offset; + int pincnt, restart, ret, wrote; + + /* We don't sync temporary files -- what's the use? */ + if (F_ISSET(dbmfp, MP_PATH_TEMP)) + return (0); + + dbmp = dbmfp->dbmp; + ret = 0; + + mf_offset = OFFSET(dbmp, dbmfp->mfp); + + LOCKREGION(dbmp); + + /* + * Walk the list of buffer headers for the MPOOLFILE, and write out any + * dirty buffers that we can. + */ +retry: pincnt = 0; + for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + if (F_ISSET(bhp, BH_DIRTY) && bhp->mf_offset == mf_offset) { + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { + ++pincnt; + continue; + } + if ((ret = + __memp_pgwrite(dbmfp, bhp, &restart, &wrote)) != 0) + goto err; + if (!wrote) + ++pincnt; + if (restart) + goto retry; + } + + UNLOCKREGION(dbmp); + +err: return (ret == 0 ? (pincnt ? DB_INCOMPLETE : 0) : ret); +} |