about summary refs log tree commit diff
path: root/rt/aio_misc.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>1997-12-08 03:06:47 +0000
committerUlrich Drepper <drepper@redhat.com>1997-12-08 03:06:47 +0000
commitd71b808a96f0fd1b6cd82c41698b518bf345692b (patch)
tree7b86cfd6b465c72888d7e85a61c08cafabf5747b /rt/aio_misc.c
parent6195235142bd246d972cf1d88b4e208071a3e318 (diff)
downloadglibc-d71b808a96f0fd1b6cd82c41698b518bf345692b.tar.gz
glibc-d71b808a96f0fd1b6cd82c41698b518bf345692b.tar.xz
glibc-d71b808a96f0fd1b6cd82c41698b518bf345692b.zip
1997-12-08 03:24  Ulrich Drepper  <drepper@cygnus.com>

	* Makeconfig: Define $(rt) to rt if linuxthreads is available.
	* Makefile (subdirs): Replace rt by $(rt).

	* shlib-versions: Use version .2 for NSS modules.

	* grp/fgetgrent_r.c (LINE_PARSER): Call parse_line with extra
	argument.
	* pwd/fgetpwent_r.c: Likewise.
	* spwd/fgetspent_r.c: Likewise.
	* spwd/sgetspent_r.c: Likewise.
	* hesiod/nss_hesiod/hesiod-grp.c (lookup): Add new argument.
	Store error number in *ERRNOP.  Little optimization.
	* hesiod/nss_hesiod/hesiod-pwd.c: Likewise.
	* hesiod/nss_hesiod/hesiod-service.c: Likewise.
	* inet/getnetgrent_r.c (__internal_setnetgrent_reuse): Likewise.
	(__internal_getnetgrent_r): New argument.
	Call __internal_setnetgrent_reuse with new argument.
	(__getnetgrent_r): Call __internal_getnetgrent_r with new argument.
	* inet/netgroup.h: Add argument to prototypes.
	* nis/nisplus-parser.h: Likewise.
	* nis/nss_compat/compat-grp.c: Change everything to store error
	code through provided pointer and not use errno.
	* nis/nss_compat/compat-pwd.c: Likewise.
	* nis/nss_compat/compat-spwd.c: Likewise.
	* nis/nss_nis/nis-alias.c: Likewise.
	* nis/nss_nis/nis-ethers.c: Likewise.
	* nis/nss_nis/nis-grp.c: Likewise.
	* nis/nss_nis/nis-hosts.c: Likewise.
	* nis/nss_nis/nis-netgrp.c: Likewise.
	* nis/nss_nis/nis-network.c: Likewise.
	* nis/nss_nis/nis-proto.c: Likewise.
	* nis/nss_nis/nis-publickey.c: Likewise.
	* nis/nss_nis/nis-pwd.c: Likewise.
	* nis/nss_nis/nis-rpc.c: Likewise.
	* nis/nss_nis/nis-service.c: Likewise.
	* nis/nss_nis/nis-spwd.c: Likewise.
	* nis/nss_nisplus/nisplus-alias.c: Likewise.
	* nis/nss_nisplus/nisplus-ethers.c: Likewise.
	* nis/nss_nisplus/nisplus-grp.c: Likewise.
	* nis/nss_nisplus/nisplus-hosts.c: Likewise.
	* nis/nss_nisplus/nisplus-netgrp.c: Likewise.
	* nis/nss_nisplus/nisplus-network.c: Likewise.
	* nis/nss_nisplus/nisplus-parser.c: Likewise.
	* nis/nss_nisplus/nisplus-proto.c: Likewise.
	* nis/nss_nisplus/nisplus-publickey.c: Likewise.
	* nis/nss_nisplus/nisplus-pwd.c: Likewise.
	* nis/nss_nisplus/nisplus-rpc.c: Likewise.
	* nis/nss_nisplus/nisplus-service.c: Likewise.
	* nis/nss_nisplus/nisplus-spwd.c: Likewise.
	* nss/nss_db/db-XXX.c: Likewise.
	* nss/nss_db/db-alias.c: Likewise.
	* nss/nss_db/db-netgrp.c: Likewise.
	* nss/nss_files/files-XXX.c: Likewise
	* nss/nss_files/files-alias.c: Likewise
	* nss/nss_files/files-netgrp.c: Likewise
	* nss/nss_files/files-parse.c: Likewise
	* nss/getXXbyYY_r.c: Call with __errno_location() as value for new
	parameter of get functions.
	* nss/getXXent_r.c: Likewise.

	* localedata/tst-fmon.c: Print debug info if test fails.

	* rt/Makefile (routines): Add aio_notify.
	* rt/aio.h: Remove non-public definitions.  Add aioinit stuff.
	* rt/aio_cancel.c: Rewrite to support lio_listio and aio_suspend.
	* rt/aio_fsync.c: Likewise.
	* rt/aio_misc.c: Likewise.
	* rt/aio_misc.h: Likewise.
	* rt/aio_read.c: Likewise.
	* rt/aio_read64.c: Likewise.
	* rt/aio_suspend.c: Likewise.
	* rt/aio_write.c: Likewise.
	* rt/aio_write64.c: Likewise.
	* rt/lio_listio.c: Likewise.
	* rt/lio_listio64.c: Likewise.
	* rt/aio_notify.c: New file.

	* sysdeps/generic/bits/sigset.h: Pretty print.

	* sysdeps/posix/pwrite.c: Define weak alias only if function is
	really defined as __pwrite.

	* sysdeps/unix/sysv/linux/pread.c: Rewrite to use syscall directly
	instead of pread64.
	* sysdeps/unix/sysv/linux/pwrite.c: Likewise.

	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=signal]
	(sysdep_routines): Add rt_sigpending and rt_sigreturn.
	* sysdeps/unix/sysv/linux/sigaction.c: Use correct value for sigset_t
	size in syscall.
	* sysdeps/unix/sysv/linux/sigpending.c: Likewise.
	* sysdeps/unix/sysv/linux/sigprocmask.c: Likewise.
	* sysdeps/unix/sysv/linux/sigreturn.c: Likewise.
	* sysdeps/unix/sysv/linux/sigsuspend.c: Likewise.
	* sysdeps/unix/sysv/linux/sigtimedwait.c: Likewise.
	* sysdeps/unix/sysv/linux/sigwaitinfo.c: Likewise.
	* sysdeps/unix/sysv/linux/i386/sigaction.c: Likewise.  Take care
	for ACT being NULL.
	* sysdeps/unix/sysv/linux/syscalls.list: Don't mention user for
	RT signals.
	* sysdeps/unix/sysv/linux/i386/s_pread64.s: Return ENOSYS if
	__NR_pread is not defined.
	* sysdeps/unix/sysv/linux/i386/s_pwrite64.s: Return ENOSYS if
	__NR_pwrite is not defined.

1997-12-07 11:15  Thorsten Kukuk  <kukuk@vt.uni-paderborn.de>

	* nis/nss_compat/compat-grp.c: If buffer is too small, set the file
	handle for /etc/group back in some more cases.
	* nis/nss_compat/compat-pwd.c: If buffer is too small, set the file
	handle for /etc/passwd back in some more cases.
	* nis/nss_compat/compat-spwd.c: If buffer is too small, set the file
	handle for /etc/shadow back in some more cases.

1997-12-06 17:00  H.J. Lu  <hjl@gnu.org>

	* sysdeps/posix/sigblock.c (__sigblock): Don't check int mask
	beyond its size.
	* sysdeps/posix/sigsetmask.c (__sigsetmask): Ditto.
	* sysdeps/posix/sigvec.c (convert_mask, __sigvec): Ditto.

1997-12-06 04:38  Ulrich Drepper  <drepper@cygnus.com>

	* sysdeps/wordsize-32/inttypes.h: Rewrite {str,wcs}to{i,u}max
	handling.  Don't use macros, we need functions.  Use inline functions
	for optimization.
	* sysdeps/wordsize-64/inttypes.h: Likewise.
	* sysdeps/wordsize-32/Makefile: Add strtoimax, strtoumax, wcstoimax,
	and wcstoumax.
	* sysdeps/wordsize-64/Makefile: Likewise.
	* sysdeps/wordsize-32/Dist: Likewise.
	* sysdeps/wordsize-64/Dist: Likewise.
	* sysdeps/wordsize-32/strtoimax.c: New file.
	* sysdeps/wordsize-32/strtoumax.c: New file.
	* sysdeps/wordsize-32/wcstoimax.c: New file.
	* sysdeps/wordsize-32/wcstoumax.c: New file.
	* sysdeps/wordsize-64/strtoimax.c: New file.
	* sysdeps/wordsize-64/strtoumax.c: New file.
	* sysdeps/wordsize-64/wcstoimax.c: New file.
	* sysdeps/wordsize-64/wcstoumax.c: New file.

1997-12-04 10:40  Philip Blundell  <pb@nexus.co.uk>

	* sysdeps/generic/wait3.c (__wait3): Use ANSI-style definition so
	that transparent union works.
	* sysdeps/generic/wait4.c (__wait4): Likewise.
	* sysdeps/generic/wait.c (__wait): Likewise.

	* sysdeps/generic/getpeername.c (getpeername): Use socklen_t not
	size_t.

	* sysdeps/generic/syscall.c (syscall): Return value and arg are
	long, to match prototype.

	* malloc/malloc.c (HAVE_MREMAP): Set to 0 for ARM, as we don't
	have mremap() yet.

	* sysdeps/generic/usleep.c (usleep): Returns void, to match prototype.

	* sysdeps/unix/sysv/linux/arm/syscalls.list: New file.

	* sysdeps/generic/bits/time.h: Use __time_t not time_t.

	* sysdeps/generic/bits/resource.h: Include <bits/types.h> for
	__rlim_t.

	* sysdeps/standalone/arm/bits/errno.h (ENOTTY): Added.

	* sysdeps/generic/bits/dirent.h (struct dirent64): Added.

	* io/sys/stat.h (S_ISLNK): Always false if we don't have
	__S_IFLNK.

	* sysdeps/standalone/close.c (_STDIO_H): Define before including
	<bits/stdio_lim.h>.
	* sysdeps/standalone/filedesc.h (_STDIO_H): Likewise.

1997-12-06 01:09  Ulrich Drepper  <drepper@cygnus.com>

	* posix/sys/wait.h: Don't use transparent unions in C++.

1997-12-05  Andreas Jaeger  <aj@arthur.rhein-neckar.de>

	* time/time.h: Add strfxtime.

	* libc.map: Add tcgetsid, strcasestr, wmemrtombs, wmemrtowcs,
	sysv_signal, strfxtime, pread64, pwrite64.

1997-11-30 21:57  Richard Henderson  <rth@cygnus.com>

	Alpha changes for EGCS:
	* config.h.in (ASM_ALPHA_NG_SYMBOL_PREFIX): New entry.
	* configure.in (libc_cv_gcc_alpha_ng_prefix): New check.
	* sysdeps/alpha/dl-machine.h (_dl_runtime_resolve): Care for prefix.
	(_start): Likewise.
	* sysdeps/unix/sysv/linux/alpha/init-first.h: Likewise.
Diffstat (limited to 'rt/aio_misc.c')
-rw-r--r--rt/aio_misc.c588
1 files changed, 414 insertions, 174 deletions
diff --git a/rt/aio_misc.c b/rt/aio_misc.c
index e4bb12c500..6ea30c2158 100644
--- a/rt/aio_misc.c
+++ b/rt/aio_misc.c
@@ -21,7 +21,6 @@
 #include <aio.h>
 #include <errno.h>
 #include <pthread.h>
-#include <semaphore.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/stat.h>
@@ -29,40 +28,199 @@
 #include "aio_misc.h"
 
 
-/* We need a list of pending operations.  This is sorted according to
-   the priority given in the aio_reqprio member.  */
-aiocb_union *__aio_requests;
+/* Pool of request list entries.  */
+static struct requestlist **pool;
 
-/* Since the list is global we need a semaphore protecting it.  */
-sem_t __aio_requests_sema;
+/* Number of total and allocated pool entries.  */
+static size_t pool_tab_size;
+static size_t pool_size;
 
+/* We implement a two dimensional array but allocate each row separately.
+   The macro below determines how many entries should be used per row.
+   It should better be a power of two.  */
+#define ENTRIES_PER_ROW	16
 
-/* The initialization function.  It gets automatically called if any
-   aio_* function is used in the program.  */
-static void
-__attribute__ ((unused))
-aio_initialize (void)
+/* The row table is incremented in units of this.  */
+#define ROW_STEP	8
+
+/* List of available entries.  */
+static struct requestlist *freelist;
+
+/* List of request waiting to be processed.  */
+static struct requestlist *runlist;
+
+/* Structure list of all currently processed requests.  */
+static struct requestlist *requests;
+
+/* Number of threads currently running.  */
+static int nthreads;
+
+
+/* These are the values used to optimize the use of AIO.  The user can
+   overwrite them by using the `aio_init' function.  */
+static struct aioinit optim =
+{
+  20,	/* int aio_threads;	Maximal number of threads.  */
+  256,	/* int aio_num;		Number of expected simultanious requests. */
+  0,
+  0,
+  0,
+  0,
+  { 0, }
+};
+
+
+/* Since the list is global we need a mutex protecting it.  */
+pthread_mutex_t __aio_requests_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+
+/* Functions to handle request list pool.  */
+static struct requestlist *
+get_elem (void)
 {
-  /* Initialize the semaphore.  We allow exactly one user at a time.  */
-  sem_init (&__aio_requests_sema, 0, 1);
+  struct requestlist *result;
+
+  if (freelist == NULL)
+    {
+      struct requestlist *new_row;
+      size_t new_size;
+
+      /* Compute new size.  */
+      new_size = pool_size ? pool_size + ENTRIES_PER_ROW : optim.aio_num;
+
+      if ((new_size / ENTRIES_PER_ROW) >= pool_tab_size)
+	{
+	  size_t new_tab_size = new_size / ENTRIES_PER_ROW;
+	  struct requestlist **new_tab;
+
+	  new_tab = (struct requestlist **)
+	    realloc (pool, (new_tab_size * sizeof (struct requestlist *)));
+
+	  if (new_tab == NULL)
+	    return NULL;
+
+	  pool_tab_size = new_tab_size;
+	  pool = new_tab;
+	}
+
+      if (pool_size == 0)
+	{
+	  size_t cnt;
+
+	  new_row = (struct requestlist *)
+	    calloc (new_size, sizeof (struct requestlist));
+
+	  if (new_row == NULL)
+	    return NULL;
+
+	  for (cnt = 0; cnt < new_size / ENTRIES_PER_ROW; ++cnt)
+	    pool[cnt] = &new_row[cnt * ENTRIES_PER_ROW];
+	}
+      else
+	{
+	  /* Allocate one new row.  */
+	  new_row = (struct requestlist *)
+	    calloc (ENTRIES_PER_ROW, sizeof (struct requestlist));
+	  if (new_row == NULL)
+	    return NULL;
+
+	  pool[new_size / ENTRIES_PER_ROW] = new_row;
+	}
+
+      /* Put all the new entries in the freelist.  */
+      do
+	{
+	  new_row->next_prio = freelist;
+	  freelist = new_row++;
+	}
+      while (++pool_size < new_size);
+    }
+
+  result = freelist;
+  freelist = freelist->next_prio;
+
+  return result;
 }
 
-text_set_element (__libc_subinit, aio_initialize);
+
+void
+__aio_free_req (struct requestlist *elem)
+{
+  elem->running = no;
+  elem->next_prio = freelist;
+  freelist = elem;
+}
+
+
+struct requestlist *
+__aio_find_req (aiocb_union *elem)
+{
+  struct requestlist *runp = requests;
+  int fildes = elem->aiocb.aio_fildes;
+
+  while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
+    runp = runp->next_fd;
+
+  if (runp != NULL)
+    if (runp->aiocbp->aiocb.aio_fildes != fildes)
+      runp = NULL;
+    else
+      while (runp != NULL && runp->aiocbp != elem)
+	runp = runp->next_prio;
+
+  return runp;
+}
+
+
+struct requestlist *
+__aio_find_req_fd (int fildes)
+{
+  struct requestlist *runp = requests;
+
+  while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
+    runp = runp->next_fd;
+
+  return (runp != NULL && runp->aiocbp->aiocb.aio_fildes == fildes
+	  ? runp : NULL);
+}
 
 
 /* The thread handler.  */
 static void *handle_fildes_io (void *arg);
 
 
+/* User optimization.  */
+void
+__aio_init (const struct aioinit *init)
+{
+  /* Get the mutex.  */
+  pthread_mutex_lock (&__aio_requests_mutex);
+
+  /* Only allow writing new values if the table is not yet allocated.  */
+  if (pool == NULL)
+    {
+      optim.aio_threads = init->aio_threads < 1 ? 1 : init->aio_threads;
+      optim.aio_num = (init->aio_num < ENTRIES_PER_ROW
+		       ? ENTRIES_PER_ROW
+		       : init->aio_num & ~ENTRIES_PER_ROW);
+    }
+
+  /* Release the mutex.  */
+  pthread_mutex_unlock (&__aio_requests_mutex);
+}
+weak_alias (__aio_init, aio_init)
+
+
 /* The main function of the async I/O handling.  It enqueues requests
    and if necessary starts and handles threads.  */
-int
-__aio_enqueue_request (aiocb_union *aiocbp, int operation, int require_lock)
+struct requestlist *
+__aio_enqueue_request (aiocb_union *aiocbp, int operation)
 {
-  int result;
+  int result = 0;
   int policy, prio;
   struct sched_param param;
-  aiocb_union *runp;
+  struct requestlist *last, *runp, *newp;
+  int running = no;
 
   if (aiocbp->aiocb.aio_reqprio < 0
       || aiocbp->aiocb.aio_reqprio > AIO_PRIO_DELTA_MAX)
@@ -71,94 +229,160 @@ __aio_enqueue_request (aiocb_union *aiocbp, int operation, int require_lock)
       __set_errno (EINVAL);
       aiocbp->aiocb.__error_code = EINVAL;
       aiocbp->aiocb.__return_value = -1;
-      return -1;
-    }
-
-  if (pthread_getschedparam (pthread_self (), &policy, &param) < 0)
-    {
-      /* Something went wrong.  */
-      aiocbp->aiocb.__error_code = errno;
-      aiocbp->aiocb.__return_value = -1;
-      return -1;
+      return NULL;
     }
 
   /* Compute priority for this request.  */
+  pthread_getschedparam (pthread_self (), &policy, &param);
   prio = param.sched_priority - aiocbp->aiocb.aio_reqprio;
 
+  /* Get the mutex.  */
+  pthread_mutex_lock (&__aio_requests_mutex);
 
-  /* Get the semaphore.  */
-  if (require_lock)
-    sem_wait (&__aio_requests_sema);
-
-  runp = __aio_requests;
+  last = NULL;
+  runp = requests;
   /* First look whether the current file descriptor is currently
      worked with.  */
-  while (runp != NULL && runp->aiocb.aio_fildes < aiocbp->aiocb.aio_fildes)
-    runp = (aiocb_union *) runp->aiocb.__next_fd;
+  while (runp != NULL
+	 && runp->aiocbp->aiocb.aio_fildes < aiocbp->aiocb.aio_fildes)
+    {
+      last = runp;
+      runp = runp->next_fd;
+    }
 
-  if (runp != NULL)
+  /* Get a new element for the waiting list.  */
+  newp = get_elem ();
+  if (newp == NULL)
+    {
+      __set_errno (EAGAIN);
+      pthread_mutex_unlock (&__aio_requests_mutex);
+      return NULL;
+    }
+  newp->aiocbp = aiocbp;
+  newp->waiting = NULL;
+
+  aiocbp->aiocb.__abs_prio = prio;
+  aiocbp->aiocb.__policy = policy;
+  aiocbp->aiocb.aio_lio_opcode = operation;
+  aiocbp->aiocb.__error_code = EINPROGRESS;
+  aiocbp->aiocb.__return_value = 0;
+
+  if (runp != NULL
+      && runp->aiocbp->aiocb.aio_fildes == aiocbp->aiocb.aio_fildes)
     {
       /* The current file descriptor is worked on.  It makes no sense
-	 to start another thread since this new thread would have to
-	 wait for the previous one to terminate.  Simply enqueue it
-	 after the running one according to the priority.  */
-      while (runp->aiocb.__next_prio != NULL
-	     && runp->aiocb.__next_prio->__abs_prio >= prio)
-	runp = (aiocb_union *) runp->aiocb.__next_prio;
-
-      aiocbp->aiocb.__next_prio = runp->aiocb.__next_prio;
-      aiocbp->aiocb.__abs_prio = prio;
-      aiocbp->aiocb.__policy = policy;
-      aiocbp->aiocb.aio_lio_opcode = operation;
-      aiocbp->aiocb.__error_code = EINPROGRESS;
-      aiocbp->aiocb.__return_value = 0;
-      runp->aiocb.__next_prio = (struct aiocb *) aiocbp;
-
-      result = 0;
+	 to start another thread since this new thread would fight
+	 with the running thread for the resources.  But we also cannot
+	 say that the thread processing this descriptor shall immediately
+	 after finishing the current job process this request if there
+	 are other threads in the running queue which have a higher
+	 priority.  */
+
+      /* Simply enqueue it after the running one according to the
+	 priority.  */
+      while (runp->next_prio != NULL
+	     && runp->next_prio->aiocbp->aiocb.__abs_prio >= prio)
+	runp = runp->next_prio;
+
+      newp->next_prio = runp->next_prio;
+      runp->next_prio = newp;
+
+      running = queued;
     }
   else
     {
-      /* We create a new thread for this file descriptor.  The
+      /* Enqueue this request for a new descriptor.  */
+      if (last == NULL)
+	{
+	  newp->last_fd = NULL;
+	  newp->next_fd = requests;
+	  if (requests != NULL)
+	    requests->last_fd = newp;
+	  requests = newp;
+	}
+      else
+	{
+	  newp->next_fd = last->next_fd;
+	  newp->last_fd = last;
+	  last->next_fd = newp;
+	  if (newp->next_fd != NULL)
+	    newp->next_fd->last_fd = newp;
+	}
+
+      newp->next_prio = NULL;
+    }
+
+  if (running == no)
+    {
+      /* We try to create a new thread for this file descriptor.  The
 	 function which gets called will handle all available requests
 	 for this descriptor and when all are processed it will
-	 terminate.  */
-      pthread_t thid;
-      pthread_attr_t attr;
-
-      /* First enqueue the request (the list is empty).  */
-      aiocbp->aiocb.__next_fd = NULL;
-      aiocbp->aiocb.__last_fd = NULL;
-
-      aiocbp->aiocb.__next_prio = NULL;
-      aiocbp->aiocb.__abs_prio = prio;
-      aiocbp->aiocb.__policy = policy;
-      aiocbp->aiocb.aio_lio_opcode = operation;
-      aiocbp->aiocb.__error_code = EINPROGRESS;
-      aiocbp->aiocb.__return_value = 0;
-
-      /* Make sure the thread is created detached.  */
-      pthread_attr_init (&attr);
-      pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
-
-      /* Now try to start a thread.  */
-      if (pthread_create (&thid, &attr, handle_fildes_io, aiocbp) < 0)
+	 terminate.
+
+	 If no new thread can be created or if the specified limit of
+	 threads for AIO is reached we queue the request.  */
+
+      /* See if we can create a thread.  */
+      if (nthreads < optim.aio_threads)
 	{
-	  result = -1;
-	  aiocbp->aiocb.__error_code = errno;
-	  aiocbp->aiocb.__return_value = -1;
+	  pthread_t thid;
+	  pthread_attr_t attr;
+
+	  /* Make sure the thread is created detached.  */
+	  pthread_attr_init (&attr);
+	  pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+
+	  /* Now try to start a thread.  */
+	  if (pthread_create (&thid, &attr, handle_fildes_io, newp) == 0)
+	    {
+	      /* We managed to enqueue the request.  All errors which can
+		 happen now can be recognized by calls to `aio_return' and
+		 `aio_error'.  */
+	      running = allocated;
+	      ++nthreads;
+	    }
+	  else if (nthreads == 0)
+	    /* We cannot create a thread in the moment and there is
+	       also no thread running.  This is a problem.  `errno' is
+	       set to EAGAIN if this is only a temporary problem.  */
+	    result = -1;
+	}
+    }
+
+  /* Enqueue the request in the run queue if it is not yet running.  */
+  if (running < yes && result == 0)
+    {
+      if (runlist == NULL || runlist->aiocbp->aiocb.__abs_prio < prio)
+	{
+	  newp->next_run = runlist;
+	  runlist = newp;
 	}
       else
-	/* We managed to enqueue the request.  All errors which can
-	   happen now can be recognized by calls to `aio_return' and
-	   `aio_error'.  */
-	  result = 0;
+	{
+	  runp = runlist;
+
+	  while (runp->next_run != NULL
+		 && runp->next_run->aiocbp->aiocb.__abs_prio >= prio)
+	    runp = runp->next_run;
+
+	  newp->next_run = runp->next_run;
+	  runp->next_run = newp;
+	}
     }
 
-  /* Release the semaphore.  */
-  if (require_lock)
-    sem_post (&__aio_requests_sema);
+  if (result == 0)
+    newp->running = running;
+  else
+    {
+      /* Something went wrong.  */
+      __aio_free_req (newp);
+      newp = NULL;
+    }
 
-  return result;
+  /* Release the mutex.  */
+  pthread_mutex_unlock (&__aio_requests_mutex);
+
+  return newp;
 }
 
 
@@ -167,140 +391,156 @@ handle_fildes_io (void *arg)
 {
   pthread_t self = pthread_self ();
   struct sched_param param;
-  aiocb_union *runp = (aiocb_union *) arg;
+  struct requestlist *runp = (struct requestlist *) arg;
+  aiocb_union *aiocbp = runp->aiocbp;
   int policy;
-  int fildes = runp->aiocb.aio_fildes;	/* This is always the same.  */
+  int fildes = runp->aiocbp->aiocb.aio_fildes;
 
   pthread_getschedparam (self, &policy, &param);
 
   do
     {
       /* Change the priority to the requested value (if necessary).  */
-      if (runp->aiocb.__abs_prio != param.sched_priority
-	  || runp->aiocb.__policy != policy)
+      if (aiocbp->aiocb.__abs_prio != param.sched_priority
+	  || aiocbp->aiocb.__policy != policy)
 	{
-	  param.sched_priority = runp->aiocb.__abs_prio;
-	  policy = runp->aiocb.__policy;
+	  param.sched_priority = aiocbp->aiocb.__abs_prio;
+	  policy = aiocbp->aiocb.__policy;
 	  pthread_setschedparam (self, policy, &param);
 	}
 
       /* Process request pointed to by RUNP.  We must not be disturbed
 	 by signals.  */
-      if ((runp->aiocb.aio_lio_opcode & 127) == LIO_READ)
+      if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_READ)
 	{
-	  if (runp->aiocb.aio_lio_opcode & 128)
-	    runp->aiocb.__return_value =
+	  if (aiocbp->aiocb.aio_lio_opcode & 128)
+	    aiocbp->aiocb.__return_value =
 	      TEMP_FAILURE_RETRY (__pread64 (fildes,
-					     (void *) runp->aiocb64.aio_buf,
-					     runp->aiocb64.aio_nbytes,
-					     runp->aiocb64.aio_offset));
+					     (void *) aiocbp->aiocb64.aio_buf,
+					     aiocbp->aiocb64.aio_nbytes,
+					     aiocbp->aiocb64.aio_offset));
 	  else
-	    runp->aiocb.__return_value =
+	    aiocbp->aiocb.__return_value =
 	      TEMP_FAILURE_RETRY (__pread (fildes,
-					   (void *) runp->aiocb.aio_buf,
-					   runp->aiocb.aio_nbytes,
-					   runp->aiocb.aio_offset));
+					   (void *) aiocbp->aiocb.aio_buf,
+					   aiocbp->aiocb.aio_nbytes,
+					   aiocbp->aiocb.aio_offset));
 	}
-      else if ((runp->aiocb.aio_lio_opcode & 127) == LIO_WRITE)
+      else if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_WRITE)
 	{
-	  if (runp->aiocb.aio_lio_opcode & 128)
-	    runp->aiocb.__return_value =
+	  if (aiocbp->aiocb.aio_lio_opcode & 128)
+	    aiocbp->aiocb.__return_value =
 	      TEMP_FAILURE_RETRY (__pwrite64 (fildes,
-					      (const void *) runp->aiocb64.aio_buf,
-					      runp->aiocb64.aio_nbytes,
-					      runp->aiocb64.aio_offset));
+					      (const void *) aiocbp->aiocb64.aio_buf,
+					      aiocbp->aiocb64.aio_nbytes,
+					      aiocbp->aiocb64.aio_offset));
 	  else
-	    runp->aiocb.__return_value =
+	    aiocbp->aiocb.__return_value =
 	      TEMP_FAILURE_RETRY (__pwrite (fildes,
-					    (const void *) runp->aiocb.aio_buf,
-					    runp->aiocb.aio_nbytes,
-					    runp->aiocb.aio_offset));
+					    (const void *) aiocbp->aiocb.aio_buf,
+					    aiocbp->aiocb.aio_nbytes,
+					    aiocbp->aiocb.aio_offset));
 	}
-      else if (runp->aiocb.aio_lio_opcode == __LIO_DSYNC)
-	runp->aiocb.__return_value = TEMP_FAILURE_RETRY (fdatasync (fildes));
-      else if (runp->aiocb.aio_lio_opcode == __LIO_SYNC)
-	runp->aiocb.__return_value = TEMP_FAILURE_RETRY (fsync (fildes));
+      else if (aiocbp->aiocb.aio_lio_opcode == LIO_DSYNC)
+	aiocbp->aiocb.__return_value = TEMP_FAILURE_RETRY (fdatasync (fildes));
+      else if (aiocbp->aiocb.aio_lio_opcode == LIO_SYNC)
+	aiocbp->aiocb.__return_value = TEMP_FAILURE_RETRY (fsync (fildes));
       else
 	{
 	  /* This is an invalid opcode.  */
-	  runp->aiocb.__return_value = -1;
+	  aiocbp->aiocb.__return_value = -1;
 	  __set_errno (EINVAL);
 	}
 
-      if (runp->aiocb.__return_value == -1)
-	runp->aiocb.__error_code = errno;
+      /* Get the mutex.  */
+      pthread_mutex_lock (&__aio_requests_mutex);
+
+      if (aiocbp->aiocb.__return_value == -1)
+	aiocbp->aiocb.__error_code = errno;
       else
-	runp->aiocb.__error_code = 0;
+	aiocbp->aiocb.__error_code = 0;
 
       /* Send the signal to notify about finished processing of the
 	 request.  */
-      if (runp->aiocb.aio_sigevent.sigev_notify == SIGEV_THREAD)
+      __aio_notify (runp);
+
+      /* Now dequeue the current request.  */
+      if (runp->next_prio == NULL)
 	{
-	  /* We have to start a thread.  */
-	  pthread_t tid;
-	  pthread_attr_t attr, *pattr;
+	  /* No outstanding request for this descriptor.  Process the
+	     runlist if necessary.  */
+	  if (runp->next_fd != NULL)
+	    runp->next_fd->last_fd = runp->last_fd;
+	  if (runp->last_fd != NULL)
+	    runp->last_fd->next_fd = runp->next_fd;
+	}
+      else
+	{
+	  runp->next_prio->last_fd = runp->last_fd;
+	  runp->next_prio->next_fd = runp->next_fd;
+	  runp->next_prio->running = yes;
+	  if (runp->next_fd != NULL)
+	    runp->next_fd->last_fd = runp->next_prio;
+	  if (runp->last_fd != NULL)
+	    runp->last_fd->next_fd = runp->next_prio;
+	}
+
+      /* Free the old element.  */
+      __aio_free_req (runp);
 
-	  pattr = (pthread_attr_t *)
-	    runp->aiocb.aio_sigevent.sigev_notify_attributes;
-	  if (pattr == NULL)
+      runp = freelist;
+      if (runp != NULL)
+	{
+	  /* We must not run requests which are not marked `running'.  */
+	  if (runp->running == yes)
 	    {
-	      pthread_attr_init (&attr);
-	      pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
-	      pattr = &attr;
+	      freelist = runp->next_run;
+	      runp->running = allocated;
 	    }
-
-	  if (pthread_create (&tid,
-			      (pthread_attr_t *)
-			      runp->aiocb.aio_sigevent.sigev_notify_attributes,
-			      (void *(*) (void *))
-			      runp->aiocb.aio_sigevent.sigev_notify_function,
-			      runp->aiocb.aio_sigevent.sigev_value.sival_ptr)
-	      < 0)
+	  else
 	    {
-	      /* XXX What shall we do if already an error is set by
-		 read/write/fsync?  */
-	      runp->aiocb.__error_code = errno;
-	      runp->aiocb.__return_value = -1;
+	      struct requestlist *old;
+
+	      do
+		{
+		  old = runp;
+		  runp = runp->next_run;
+		}
+	      while (runp != NULL && runp->running != yes);
+
+	      if (runp != NULL)
+		old->next_run = runp->next_run;
 	    }
 	}
-      else if (runp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
-	/* We have to send a signal.  */
-	if (__aio_sigqueue (runp->aiocb.aio_sigevent.sigev_signo,
-			    runp->aiocb.aio_sigevent.sigev_value) < 0)
-	  {
-	    /* XXX What shall we do if already an error is set by
-	       read/write/fsync?  */
-	    runp->aiocb.__error_code = errno;
-	    runp->aiocb.__return_value = -1;
-	  }
-
-      /* Get the semaphore.  */
-      sem_wait (&__aio_requests_sema);
 
-      /* Now dequeue the current request.  */
-      if (runp->aiocb.__next_prio == NULL)
-	{
-	  if (runp->aiocb.__next_fd != NULL)
-	    runp->aiocb.__next_fd->__last_fd = runp->aiocb.__last_fd;
-	  if (runp->aiocb.__last_fd != NULL)
-	    runp->aiocb.__last_fd->__next_fd = runp->aiocb.__next_fd;
-	  runp = NULL;
-	}
-      else
-	{
-	  runp->aiocb.__next_prio->__last_fd = runp->aiocb.__last_fd;
-	  runp->aiocb.__next_prio->__next_fd = runp->aiocb.__next_fd;
-	  if (runp->aiocb.__next_fd != NULL)
-	    runp->aiocb.__next_fd->__last_fd = runp->aiocb.__next_prio;
-	  if (runp->aiocb.__last_fd != NULL)
-	    runp->aiocb.__last_fd->__next_fd = runp->aiocb.__next_prio;
-	  runp = (aiocb_union *) runp->aiocb.__next_prio;
-	}
+      /* If no request to work on we will stop the thread.  */
+      if (runp == NULL)
+	--nthreads;
 
-      /* Release the semaphore.  */
-      sem_post (&__aio_requests_sema);
+      /* Release the mutex.  */
+      pthread_mutex_unlock (&__aio_requests_mutex);
     }
   while (runp != NULL);
 
   pthread_exit (NULL);
 }
+
+
+/* Free allocated resources.  */
+static void
+__attribute__ ((unused))
+free_res (void)
+{
+  size_t row;
+
+  /* The first block of rows as specified in OPTIM is allocated in
+     one chunk.  */
+  free (pool[0]);
+
+  for (row = optim.aio_num / ENTRIES_PER_ROW; row < pool_tab_size; ++row)
+    free (pool[row]);
+
+  free (pool);
+}
+
+text_set_element (__libc_subfreeres, free_res);