1 files changed, 708 insertions, 0 deletions
diff --git a/sysdeps/pthread/aio_misc.c b/sysdeps/pthread/aio_misc.c
new file mode 100644
index 0000000000..1026e061c8
--- /dev/null
+++ b/sysdeps/pthread/aio_misc.c
@@ -0,0 +1,708 @@
+/* Handle general operations.
+   Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+#include <aio.h>
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#include "aio_misc.h"
+
+static void add_request_to_runlist (struct requestlist *newrequest);
+
+/* Pool of request list entries.  */
+static struct requestlist **pool;
+
+/* Number of total and allocated pool entries.  */
+static size_t pool_max_size;
+static size_t pool_size;
+
+/* We implement a two dimensional array but allocate each row separately.
+   The macro below determines how many entries should be used per row.
+   It should better be a power of two.  */
+#define ENTRIES_PER_ROW	32
+
+/* How many rows we allocate at once.  */
+#define ROWS_STEP	8
+
+/* List of available entries.  */
+static struct requestlist *freelist;
+
+/* List of request waiting to be processed.  */
+static struct requestlist *runlist;
+
+/* Structure list of all currently processed requests.  */
+static struct requestlist *requests;
+
+/* Number of threads currently running.  */
+static int nthreads;
+
+/* Number of threads waiting for work to arrive. */
+static int idle_thread_count;
+
+
+/* These are the values used to optimize the use of AIO.  The user can
+   overwrite them by using the `aio_init' function.  */
+static struct aioinit optim =
+{
+  20,	/* int aio_threads;	Maximal number of threads.  */
+  64,	/* int aio_num;		Number of expected simultanious requests. */
+  0,
+  0,
+  0,
+  0,
+  1,
+  0
+};
+
+
+/* Since the list is global we need a mutex protecting it.  */
+pthread_mutex_t __aio_requests_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+/* When you add a request to the list and there are idle threads present,
+   you signal this condition variable. When a thread finishes work, it waits
+   on this condition variable for a time before it actually exits. */
+pthread_cond_t __aio_new_request_notification = PTHREAD_COND_INITIALIZER;
+
+
+/* Functions to handle request list pool.  */
+static struct requestlist *
+get_elem (void)
+{
+  struct requestlist *result;
+
+  if (freelist == NULL)
+    {
+      struct requestlist *new_row;
+      int cnt;
+
+      assert (sizeof (struct aiocb) == sizeof (struct aiocb64));
+
+      if (pool_size + 1 >= pool_max_size)
+	{
+	  size_t new_max_size = pool_max_size + ROWS_STEP;
+	  struct requestlist **new_tab;
+
+	  new_tab = (struct requestlist **)
+	    realloc (pool, new_max_size * sizeof (struct requestlist *));
+
+	  if (new_tab == NULL)
+	    return NULL;
+
+	  pool_max_size = new_max_size;
+	  pool = new_tab;
+	}
+
+      /* Allocate the new row.  */
+      cnt = pool_size == 0 ? optim.aio_num : ENTRIES_PER_ROW;
+      new_row = (struct requestlist *) calloc (cnt,
+					       sizeof (struct requestlist));
+      if (new_row == NULL)
+	return NULL;
+
+      pool[pool_size++] = new_row;
+
+      /* Put all the new entries in the freelist.  */
+      do
+	{
+	  new_row->next_prio = freelist;
+	  freelist = new_row++;
+	}
+      while (--cnt > 0);
+    }
+
+  result = freelist;
+  freelist = freelist->next_prio;
+
+  return result;
+}
+
+
+void
+internal_function
+__aio_free_request (struct requestlist *elem)
+{
+  elem->running = no;
+  elem->next_prio = freelist;
+  freelist = elem;
+}
+
+
+struct requestlist *
+internal_function
+__aio_find_req (aiocb_union *elem)
+{
+  struct requestlist *runp = requests;
+  int fildes = elem->aiocb.aio_fildes;
+
+  while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
+    runp = runp->next_fd;
+
+  if (runp != NULL)
+    {
+      if (runp->aiocbp->aiocb.aio_fildes != fildes)
+	runp = NULL;
+      else
+	while (runp != NULL && runp->aiocbp != elem)
+	  runp = runp->next_prio;
+    }
+
+  return runp;
+}
+
+
+struct requestlist *
+internal_function
+__aio_find_req_fd (int fildes)
+{
+  struct requestlist *runp = requests;
+
+  while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
+    runp = runp->next_fd;
+
+  return (runp != NULL && runp->aiocbp->aiocb.aio_fildes == fildes
+	  ? runp : NULL);
+}
+
+
+void
+internal_function
+__aio_remove_request (struct requestlist *last, struct requestlist *req,
+		      int all)
+{
+  assert (req->running == yes || req->running == queued
+	  || req->running == done);
+
+  if (last != NULL)
+    last->next_prio = all ? NULL : req->next_prio;
+  else
+    {
+      if (all || req->next_prio == NULL)
+	{
+	  if (req->last_fd != NULL)
+	    req->last_fd->next_fd = req->next_fd;
+	  else
+	    requests = req->next_fd;
+	  if (req->next_fd != NULL)
+	    req->next_fd->last_fd = req->last_fd;
+	}
+      else
+	{
+	  if (req->last_fd != NULL)
+	    req->last_fd->next_fd = req->next_prio;
+	  else
+	    requests = req->next_prio;
+
+	  if (req->next_fd != NULL)
+	    req->next_fd->last_fd = req->next_prio;
+
+	  req->next_prio->last_fd = req->last_fd;
+	  req->next_prio->next_fd = req->next_fd;
+
+	  /* Mark this entry as runnable.  */
+	  req->next_prio->running = yes;
+	}
+
+      if (req->running == yes)
+	{
+	  struct requestlist *runp = runlist;
+
+	  last = NULL;
+	  while (runp != NULL)
+	    {
+	      if (runp == req)
+		{
+		  if (last == NULL)
+		    runlist = runp->next_run;
+		  else
+		    last->next_run = runp->next_run;
+		  break;
+		}
+	      last = runp;
+	      runp = runp->next_run;
+	    }
+	}
+    }
+}
+
+
+/* The thread handler.  */
+static void *handle_fildes_io (void *arg);
+
+
+/* User optimization.  */
+void
+__aio_init (const struct aioinit *init)
+{
+  /* Get the mutex.  */
+  pthread_mutex_lock (&__aio_requests_mutex);
+
+  /* Only allow writing new values if the table is not yet allocated.  */
+  if (pool == NULL)
+    {
+      optim.aio_threads = init->aio_threads < 1 ? 1 : init->aio_threads;
+      optim.aio_num = (init->aio_num < ENTRIES_PER_ROW
+		       ? ENTRIES_PER_ROW
+		       : init->aio_num & ~ENTRIES_PER_ROW);
+    }
+
+  if (init->aio_idle_time != 0)
+    optim.aio_idle_time = init->aio_idle_time;
+
+  /* Release the mutex.  */
+  pthread_mutex_unlock (&__aio_requests_mutex);
+}
+weak_alias (__aio_init, aio_init)
+
+
+/* The main function of the async I/O handling.  It enqueues requests
+   and if necessary starts and handles threads.  */
+struct requestlist *
+internal_function
+__aio_enqueue_request (aiocb_union *aiocbp, int operation)
+{
+  int result = 0;
+  int policy, prio;
+  struct sched_param param;
+  struct requestlist *last, *runp, *newp;
+  int running = no;
+
+  if (operation == LIO_SYNC || operation == LIO_DSYNC)
+    aiocbp->aiocb.aio_reqprio = 0;
+  else if (aiocbp->aiocb.aio_reqprio < 0
+	   || aiocbp->aiocb.aio_reqprio > AIO_PRIO_DELTA_MAX)
+    {
+      /* Invalid priority value.  */
+      __set_errno (EINVAL);
+      aiocbp->aiocb.__error_code = EINVAL;
+      aiocbp->aiocb.__return_value = -1;
+      return NULL;
+    }
+
+  /* Compute priority for this request.  */
+  pthread_getschedparam (pthread_self (), &policy, &param);
+  prio = param.sched_priority - aiocbp->aiocb.aio_reqprio;
+
+  /* Get the mutex.  */
+  pthread_mutex_lock (&__aio_requests_mutex);
+
+  last = NULL;
+  runp = requests;
+  /* First look whether the current file descriptor is currently
+     worked with.  */
+  while (runp != NULL
+	 && runp->aiocbp->aiocb.aio_fildes < aiocbp->aiocb.aio_fildes)
+    {
+      last = runp;
+      runp = runp->next_fd;
+    }
+
+  /* Get a new element for the waiting list.  */
+  newp = get_elem ();
+  if (newp == NULL)
+    {
+      pthread_mutex_unlock (&__aio_requests_mutex);
+      __set_errno (EAGAIN);
+      return NULL;
+    }
+  newp->aiocbp = aiocbp;
+  newp->caller_pid = (aiocbp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL
+		      ? getpid () : 0);
+  newp->waiting = NULL;
+
+  aiocbp->aiocb.__abs_prio = prio;
+  aiocbp->aiocb.__policy = policy;
+  aiocbp->aiocb.aio_lio_opcode = operation;
+  aiocbp->aiocb.__error_code = EINPROGRESS;
+  aiocbp->aiocb.__return_value = 0;
+
+  if (runp != NULL
+      && runp->aiocbp->aiocb.aio_fildes == aiocbp->aiocb.aio_fildes)
+    {
+      /* The current file descriptor is worked on.  It makes no sense
+	 to start another thread since this new thread would fight
+	 with the running thread for the resources.  But we also cannot
+	 say that the thread processing this desriptor shall immediately
+	 after finishing the current job process this request if there
+	 are other threads in the running queue which have a higher
+	 priority.  */
+
+      /* Simply enqueue it after the running one according to the
+	 priority.  */
+      while (runp->next_prio != NULL
+	     && runp->next_prio->aiocbp->aiocb.__abs_prio >= prio)
+	runp = runp->next_prio;
+
+      newp->next_prio = runp->next_prio;
+      runp->next_prio = newp;
+
+      running = queued;
+    }
+  else
+    {
+      running = yes;
+      /* Enqueue this request for a new descriptor.  */
+      if (last == NULL)
+	{
+	  newp->last_fd = NULL;
+	  newp->next_fd = requests;
+	  if (requests != NULL)
+	    requests->last_fd = newp;
+	  requests = newp;
+	}
+      else
+	{
+	  newp->next_fd = last->next_fd;
+	  newp->last_fd = last;
+	  last->next_fd = newp;
+	  if (newp->next_fd != NULL)
+	    newp->next_fd->last_fd = newp;
+	}
+
+      newp->next_prio = NULL;
+    }
+
+  if (running == yes)
+    {
+      /* We try to create a new thread for this file descriptor.  The
+	 function which gets called will handle all available requests
+	 for this descriptor and when all are processed it will
+	 terminate.
+
+	 If no new thread can be created or if the specified limit of
+	 threads for AIO is reached we queue the request.  */
+
+      /* See if we need to and are able to create a thread.  */
+      if (nthreads < optim.aio_threads && idle_thread_count == 0)
+	{
+	  pthread_t thid;
+	  pthread_attr_t attr;
+
+	  /* Make sure the thread is created detached.  */
+	  pthread_attr_init (&attr);
+	  pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+
+	  running = newp->running = allocated;
+
+	  /* Now try to start a thread.  */
+	  if (pthread_create (&thid, &attr, handle_fildes_io, newp) == 0)
+	    /* We managed to enqueue the request.  All errors which can
+	       happen now can be recognized by calls to `aio_return' and
+	       `aio_error'.  */
+	    ++nthreads;
+	  else
+	    {
+	      /* Reset the running flag.  The new request is not running.  */
+	      running = newp->running = yes;
+
+	      if (nthreads == 0)
+		/* We cannot create a thread in the moment and there is
+		   also no thread running.  This is a problem.  `errno' is
+		   set to EAGAIN if this is only a temporary problem.  */
+		result = -1;
+	    }
+	}
+    }
+
+  /* Enqueue the request in the run queue if it is not yet running.  */
+  if (running == yes && result == 0)
+    {
+      add_request_to_runlist (newp);
+
+      /* If there is a thread waiting for work, then let it know that we
+	 have just given it something to do. */
+      if (idle_thread_count > 0)
+	pthread_cond_signal (&__aio_new_request_notification);
+    }
+
+  if (result == 0)
+    newp->running = running;
+  else
+    {
+      /* Something went wrong.  */
+      __aio_free_request (newp);
+      newp = NULL;
+    }
+
+  /* Release the mutex.  */
+  pthread_mutex_unlock (&__aio_requests_mutex);
+
+  return newp;
+}
+
+
+static void *
+__attribute__ ((noreturn))
+handle_fildes_io (void *arg)
+{
+  pthread_t self = pthread_self ();
+  struct sched_param param;
+  struct requestlist *runp = (struct requestlist *) arg;
+  aiocb_union *aiocbp;
+  int policy;
+  int fildes;
+
+  pthread_getschedparam (self, &policy, &param);
+
+  do
+    {
+      /* If runp is NULL, then we were created to service the work queue
+	 in general, not to handle any particular request. In that case we
+	 skip the "do work" stuff on the first pass, and go directly to the
+	 "get work off the work queue" part of this loop, which is near the
+	 end. */
+      if (runp == NULL)
+	pthread_mutex_lock (&__aio_requests_mutex);
+      else
+	{
+	  /* Hopefully this request is marked as running.  */
+	  assert (runp->running == allocated);
+
+	  /* Update our variables.  */
+	  aiocbp = runp->aiocbp;
+	  fildes = aiocbp->aiocb.aio_fildes;
+
+	  /* Change the priority to the requested value (if necessary).  */
+	  if (aiocbp->aiocb.__abs_prio != param.sched_priority
+	      || aiocbp->aiocb.__policy != policy)
+	    {
+	      param.sched_priority = aiocbp->aiocb.__abs_prio;
+	      policy = aiocbp->aiocb.__policy;
+	      pthread_setschedparam (self, policy, &param);
+	    }
+
+	  /* Process request pointed to by RUNP.  We must not be disturbed
+	     by signals.  */
+	  if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_READ)
+	    {
+	      if (aiocbp->aiocb.aio_lio_opcode & 128)
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (__pread64 (fildes, (void *)
+						 aiocbp->aiocb64.aio_buf,
+						 aiocbp->aiocb64.aio_nbytes,
+						 aiocbp->aiocb64.aio_offset));
+	      else
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (pread (fildes,
+					     (void *) aiocbp->aiocb.aio_buf,
+					     aiocbp->aiocb.aio_nbytes,
+					     aiocbp->aiocb.aio_offset));
+
+	      if (aiocbp->aiocb.__return_value == -1 && errno == ESPIPE)
+		/* The Linux kernel is different from others.  It returns
+		   ESPIPE if using pread on a socket.  Other platforms
+		   simply ignore the offset parameter and behave like
+		   read.  */
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (read (fildes,
+					    (void *) aiocbp->aiocb64.aio_buf,
+					    aiocbp->aiocb64.aio_nbytes));
+	    }
+	  else if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_WRITE)
+	    {
+	      if (aiocbp->aiocb.aio_lio_opcode & 128)
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (__pwrite64 (fildes, (const void *)
+						  aiocbp->aiocb64.aio_buf,
+						  aiocbp->aiocb64.aio_nbytes,
+						  aiocbp->aiocb64.aio_offset));
+	      else
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (pwrite (fildes, (const void *)
+					      aiocbp->aiocb.aio_buf,
+					      aiocbp->aiocb.aio_nbytes,
+					      aiocbp->aiocb.aio_offset));
+
+	      if (aiocbp->aiocb.__return_value == -1 && errno == ESPIPE)
+		/* The Linux kernel is different from others.  It returns
+		   ESPIPE if using pwrite on a socket.  Other platforms
+		   simply ignore the offset parameter and behave like
+		   write.  */
+		aiocbp->aiocb.__return_value =
+		  TEMP_FAILURE_RETRY (write (fildes,
+					     (void *) aiocbp->aiocb64.aio_buf,
+					     aiocbp->aiocb64.aio_nbytes));
+	    }
+	  else if (aiocbp->aiocb.aio_lio_opcode == LIO_DSYNC)
+	    aiocbp->aiocb.__return_value =
+	      TEMP_FAILURE_RETRY (fdatasync (fildes));
+	  else if (aiocbp->aiocb.aio_lio_opcode == LIO_SYNC)
+	    aiocbp->aiocb.__return_value =
+	      TEMP_FAILURE_RETRY (fsync (fildes));
+	  else
+	    {
+	      /* This is an invalid opcode.  */
+	      aiocbp->aiocb.__return_value = -1;
+	      __set_errno (EINVAL);
+	    }
+
+	  /* Get the mutex.  */
+	  pthread_mutex_lock (&__aio_requests_mutex);
+
+	  /* In theory we would need here a write memory barrier since the
+	     callers test using aio_error() whether the request finished
+	     and once this value != EINPROGRESS the field __return_value
+	     must be committed to memory.
+
+	     But since the pthread_mutex_lock call involves write memory
+	     barriers as well it is not necessary.  */
+
+	  if (aiocbp->aiocb.__return_value == -1)
+	    aiocbp->aiocb.__error_code = errno;
+	  else
+	    aiocbp->aiocb.__error_code = 0;
+
+	  /* Send the signal to notify about finished processing of the
+	     request.  */
+	  __aio_notify (runp);
+
+	  /* For debugging purposes we reset the running flag of the
+	     finished request.  */
+	  assert (runp->running == allocated);
+	  runp->running = done;
+
+	  /* Now dequeue the current request.  */
+	  __aio_remove_request (NULL, runp, 0);
+	  if (runp->next_prio != NULL)
+	    add_request_to_runlist (runp->next_prio);
+
+	  /* Free the old element.  */
+	  __aio_free_request (runp);
+	}
+
+      runp = runlist;
+
+      /* If the runlist is empty, then we sleep for a while, waiting for
+	 something to arrive in it. */
+      if (runp == NULL && optim.aio_idle_time >= 0)
+	{
+	  struct timeval now;
+	  struct timespec wakeup_time;
+
+	  ++idle_thread_count;
+	  gettimeofday (&now, NULL);
+	  wakeup_time.tv_sec = now.tv_sec + optim.aio_idle_time;
+	  wakeup_time.tv_nsec = now.tv_usec * 1000;
+	  if (wakeup_time.tv_nsec > 1000000000)
+	    {
+	      wakeup_time.tv_nsec -= 1000000000;
+	      ++wakeup_time.tv_sec;
+	    }
+	  pthread_cond_timedwait (&__aio_new_request_notification,
+				  &__aio_requests_mutex,
+				  &wakeup_time);
+	  --idle_thread_count;
+	  runp = runlist;
+	}
+
+      if (runp == NULL)
+	--nthreads;
+      else
+	{
+	  assert (runp->running == yes);
+	  runp->running = allocated;
+	  runlist = runp->next_run;
+
+	  /* If we have a request to process, and there's still another in
+	     the run list, then we need to either wake up or create a new
+	     thread to service the request that is still in the run list. */
+	  if (runlist != NULL)
+	    {
+	      /* There are at least two items in the work queue to work on.
+		 If there are other idle threads, then we should wake them
+		 up for these other work elements; otherwise, we should try
+		 to create a new thread. */
+	      if (idle_thread_count > 0)
+		pthread_cond_signal (&__aio_new_request_notification);
+	      else if (nthreads < optim.aio_threads)
+		{
+		  pthread_t thid;
+		  pthread_attr_t attr;
+
+		  /* Make sure the thread is created detached.  */
+		  pthread_attr_init (&attr);
+		  pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+
+		  /* Now try to start a thread. If we fail, no big deal,
+		     because we know that there is at least one thread (us)
+		     that is working on AIO operations. */
+		  if (pthread_create (&thid, &attr, handle_fildes_io, NULL)
+		      == 0)
+		    ++nthreads;
+		}
+	    }
+	}
+
+      /* Release the mutex.  */
+      pthread_mutex_unlock (&__aio_requests_mutex);
+    }
+  while (runp != NULL);
+
+  pthread_exit (NULL);
+}
+
+
+/* Free allocated resources.  */
+static void
+__attribute__ ((unused))
+free_res (void)
+{
+  size_t row;
+
+  for (row = 0; row < pool_max_size; ++row)
+    free (pool[row]);
+
+  free (pool);
+}
+text_set_element (__libc_subfreeres, free_res);
+
+
+/* Add newrequest to the runlist. The __abs_prio flag of newrequest must
+   be correctly set to do this. Also, you had better set newrequest's
+   "running" flag to "yes" before you release your lock or you'll throw an
+   assertion. */
+static void
+add_request_to_runlist (struct requestlist *newrequest)
+{
+  int prio = newrequest->aiocbp->aiocb.__abs_prio;
+  struct requestlist *runp;
+
+  if (runlist == NULL || runlist->aiocbp->aiocb.__abs_prio < prio)
+    {
+      newrequest->next_run = runlist;
+      runlist = newrequest;
+    }
+  else
+    {
+      runp = runlist;
+
+      while (runp->next_run != NULL
+	     && runp->next_run->aiocbp->aiocb.__abs_prio >= prio)
+	runp = runp->next_run;
+
+      newrequest->next_run = runp->next_run;
+      runp->next_run = newrequest;
+    }
+}