about summary refs log tree commit diff
diff options
context:
space:
mode:
authorLeah Neukirchen <leah@vuxu.org>2021-02-27 20:20:55 +0100
committerLeah Neukirchen <leah@vuxu.org>2021-03-13 16:27:55 +0100
commit9942e645b33c7d8772f7419e4b3fc8f50aa7eda8 (patch)
tree7414645645c9e5d0a7786aa137ebdaaf6ae53ca5
parent6c81764641c7fe4c6ac190a12b9308f4de54cb4e (diff)
downloadnq-9942e645b33c7d8772f7419e4b3fc8f50aa7eda8.tar.gz
nq-9942e645b33c7d8772f7419e4b3fc8f50aa7eda8.tar.xz
nq-9942e645b33c7d8772f7419e4b3fc8f50aa7eda8.zip
nq: revamp waiting loop to avoid busy loops in contested cases
As reported in #37, the current rewind-loop has the issue of waiting
for a lock on the first file that is locked, but it could be detected
as being locked by another process that also just wants to check this
file.  As both processes rewind the dir and retry in this case, this
can yield a busy loop.

Instead, find the newest locked file and wait for that.  This should
result in most nq processes waiting for different locks, so this kind
of synchonization cannot appear.

In theory, removing the rewinddir would be enough, as later nq runs
should not result in enqueuing earlier jobs, but it's better to be
safe and check there are no more locked files before we launch
our job.
-rw-r--r--nq.c45
1 files changed, 27 insertions, 18 deletions
diff --git a/nq.c b/nq.c
index 5c23b15..178626b 100644
--- a/nq.c
+++ b/nq.c
@@ -110,7 +110,7 @@ main(int argc, char *argv[])
 	int dirfd = 0, lockfd = 0;
 	int opt = 0, cflag = 0, qflag = 0, tflag = 0, wflag = 0;
 	int pipefd[2];
-	char lockfile[64];
+	char lockfile[64], newestlocked[64];
 	pid_t child;
 	struct timeval started;
 	struct dirent *ent;
@@ -303,31 +303,40 @@ wait:
 		}
 
 again:
+		*newestlocked = 0;
+
 		while ((ent = readdir(dir))) {
-			/* wait for all ,* files.  */
+			/* wait for all older ,* files than ours.  */
 
-			if (ent->d_name[0] == ',' &&
-			    strcmp(ent->d_name, lockfile+1) < 0) {
-				int fd;
+			if (!(ent->d_name[0] == ',' &&
+			    strcmp(ent->d_name, lockfile+1) < 0))
+				continue;
 
-				fd = openat(dirfd, ent->d_name, O_RDWR);
-				if (fd < 0)
-					continue;
+			int fd = openat(dirfd, ent->d_name, O_RDWR);
+			if (fd < 0)
+				continue;
 
-				if (flock(fd, LOCK_EX | LOCK_NB) == -1 &&
-				    errno == EWOULDBLOCK) {
-					if (tflag)
-						exit(1);
-					flock(fd, LOCK_EX);   /* sit it out.  */
+			if (flock(fd, LOCK_EX | LOCK_NB) == -1 &&
+			    errno == EWOULDBLOCK) {
+				if (tflag)
+					exit(1);
+				if (strcmp(ent->d_name, newestlocked) > 0)
+					strcpy(newestlocked, ent->d_name);
+			} else {
+				fchmod(fd, 0600);
+			}
 
-					close(fd);
-					rewinddir(dir);
-					goto again;
-				}
+			close(fd);
+		}
 
-				fchmod(fd, 0600);
+		if (*newestlocked) {
+			int fd = openat(dirfd, newestlocked, O_RDWR);
+			if (fd >= 0) {
+				flock(fd, LOCK_EX);   /* sit it out.  */
 				close(fd);
 			}
+			rewinddir(dir);
+			goto again;
 		}
 
 		closedir(dir);          /* closes dirfd too.  */