From 998765e5588b197737d457e16f72832d8036190f Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 30 Sep 2005 11:58:54 -0700
Subject: [PATCH] aio: lock around kiocbTryKick()

Only one of the run or kick path is supposed to put an iocb on the run
list.  If both of them do it than one of them can end up referencing a
freed iocb.  The kick patch could set the Kicked bit before acquiring the
ctx_lock and putting the iocb on the run list.  The run path, while holding
the ctx_lock, could see this partial kick and mistake it for a kick that
was deferred while it was doing work with the run_list NULLed out.  It
would then race with the kick thread to add the iocb to the run list.

This patch moves the kick setting under the ctx_lock so that only one of
the kick or run path queues the iocb on the run list, as intended.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 0e11e31dbb77..b8f296999c04 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -899,16 +899,24 @@ static void aio_kick_handler(void *data)
  * and if required activate the aio work queue to process
  * it
  */
-static void queue_kicked_iocb(struct kiocb *iocb)
+static void try_queue_kicked_iocb(struct kiocb *iocb)
 {
  	struct kioctx	*ctx = iocb->ki_ctx;
 	unsigned long flags;
 	int run = 0;
 
-	WARN_ON((!list_empty(&iocb->ki_wait.task_list)));
+	/* We're supposed to be the only path putting the iocb back on the run
+	 * list.  If we find that the iocb is *back* on a wait queue already
+	 * than retry has happened before we could queue the iocb.  This also
+	 * means that the retry could have completed and freed our iocb, no
+	 * good. */
+	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
 
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
-	run = __queue_kicked_iocb(iocb);
+	/* set this inside the lock so that we can't race with aio_run_iocb()
+	 * testing it and putting the iocb on the run list under the lock */
+	if (!kiocbTryKick(iocb))
+		run = __queue_kicked_iocb(iocb);
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	if (run)
 		aio_queue_work(ctx);
@@ -931,10 +939,7 @@ void fastcall kick_iocb(struct kiocb *iocb)
 		return;
 	}
 
-	/* If its already kicked we shouldn't queue it again */
-	if (!kiocbTryKick(iocb)) {
-		queue_kicked_iocb(iocb);
-	}
+	try_queue_kicked_iocb(iocb);
 }
 EXPORT_SYMBOL(kick_iocb);
 
-- 
cgit v1.2.2


From 897f15fb587fd2772b9e7ff6ec0265057f3c3975 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 30 Sep 2005 11:58:55 -0700
Subject: [PATCH] aio: remove unlocked task_list test and resulting race

Only one of the run or kick path is supposed to put an iocb on the run
list.  If both of them do it than one of them can end up referencing a
freed iocb.  The kick path could delete the task_list item from the wait
queue before getting the ctx_lock and putting the iocb on the run list.
The run path was testing the task_list item outside the lock so that it
could catch ki_retry methods that return -EIOCBRETRY *without* putting the
iocb on a wait queue and promising to call kick_iocb.  This unlocked check
could then race with the kick path to cause both to try and put the iocb on
the run list.

The patch stops the run path from testing task_list by requring that any
ki_retry that returns -EIOCBRETRY *must* guarantee that kick_iocb() will be
called in the future.  aio_p{read,write}, the only in-tree -EIOCBRETRY
users, are updated.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 79 +++++++++++++++++++++++++++-------------------------------------
 1 file changed, 33 insertions(+), 46 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index b8f296999c04..9edc0e4a1219 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -741,19 +741,9 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	ret = retry(iocb);
 	current->io_wait = NULL;
 
-	if (-EIOCBRETRY != ret) {
- 		if (-EIOCBQUEUED != ret) {
-			BUG_ON(!list_empty(&iocb->ki_wait.task_list));
-			aio_complete(iocb, ret, 0);
-			/* must not access the iocb after this */
-		}
-	} else {
-		/*
-		 * Issue an additional retry to avoid waiting forever if
-		 * no waits were queued (e.g. in case of a short read).
-		 */
-		if (list_empty(&iocb->ki_wait.task_list))
-			kiocbSetKicked(iocb);
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+		aio_complete(iocb, ret, 0);
 	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
@@ -1327,8 +1317,11 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
 }
 
 /*
- * Default retry method for aio_read (also used for first time submit)
- * Responsible for updating iocb state as retries progress
+ * aio_p{read,write} are the default  ki_retry methods for
+ * IO_CMD_P{READ,WRITE}.  They maintains kiocb retry state around potentially
+ * multiple calls to f_op->aio_read().  They loop around partial progress
+ * instead of returning -EIOCBRETRY because they don't have the means to call
+ * kick_iocb().
  */
 static ssize_t aio_pread(struct kiocb *iocb)
 {
@@ -1337,25 +1330,25 @@ static ssize_t aio_pread(struct kiocb *iocb)
 	struct inode *inode = mapping->host;
 	ssize_t ret = 0;
 
-	ret = file->f_op->aio_read(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
+	do {
+		ret = file->f_op->aio_read(iocb, iocb->ki_buf,
+			iocb->ki_left, iocb->ki_pos);
+		/*
+		 * Can't just depend on iocb->ki_left to determine
+		 * whether we are done. This may have been a short read.
+		 */
+		if (ret > 0) {
+			iocb->ki_buf += ret;
+			iocb->ki_left -= ret;
+		}
 
-	/*
-	 * Can't just depend on iocb->ki_left to determine
-	 * whether we are done. This may have been a short read.
-	 */
-	if (ret > 0) {
-		iocb->ki_buf += ret;
-		iocb->ki_left -= ret;
 		/*
-		 * For pipes and sockets we return once we have
-		 * some data; for regular files we retry till we
-		 * complete the entire read or find that we can't
-		 * read any more data (e.g short reads).
+		 * For pipes and sockets we return once we have some data; for
+		 * regular files we retry till we complete the entire read or
+		 * find that we can't read any more data (e.g short reads).
 		 */
-		if (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))
-			ret = -EIOCBRETRY;
-	}
+	} while (ret > 0 &&
+		 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
 
 	/* This means we must have transferred all that we could */
 	/* No need to retry anymore */
@@ -1365,27 +1358,21 @@ static ssize_t aio_pread(struct kiocb *iocb)
 	return ret;
 }
 
-/*
- * Default retry method for aio_write (also used for first time submit)
- * Responsible for updating iocb state as retries progress
- */
+/* see aio_pread() */
 static ssize_t aio_pwrite(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t ret = 0;
 
-	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
-
-	if (ret > 0) {
-		iocb->ki_buf += ret;
-		iocb->ki_left -= ret;
-
-		ret = -EIOCBRETRY;
-	}
+	do {
+		ret = file->f_op->aio_write(iocb, iocb->ki_buf,
+			iocb->ki_left, iocb->ki_pos);
+		if (ret > 0) {
+			iocb->ki_buf += ret;
+			iocb->ki_left -= ret;
+		}
+	} while (ret > 0);
 
-	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
 	if ((ret == 0) || (iocb->ki_left == 0))
 		ret = iocb->ki_nbytes - iocb->ki_left;
 
-- 
cgit v1.2.2


From 353fb07e2043d2df12dddf4e2c39552d0ab9b026 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Fri, 30 Sep 2005 11:58:56 -0700
Subject: [PATCH] aio: avoid extra aio_{read,write} call when ki_left == 0

Recently aio_p{read,write} changed to perform retries internally rather
than returning -EIOCBRETRY.  This inadvertantly resulted in always calling
aio_{read,write} with ki_left at 0 which would in turn immediately return
0.  Harmless, but we can avoid this call by checking in the caller.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 9edc0e4a1219..d6b1551342b7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1347,7 +1347,7 @@ static ssize_t aio_pread(struct kiocb *iocb)
 		 * regular files we retry till we complete the entire read or
 		 * find that we can't read any more data (e.g short reads).
 		 */
-	} while (ret > 0 &&
+	} while (ret > 0 && iocb->ki_left > 0 &&
 		 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
 
 	/* This means we must have transferred all that we could */
@@ -1371,7 +1371,7 @@ static ssize_t aio_pwrite(struct kiocb *iocb)
 			iocb->ki_buf += ret;
 			iocb->ki_left -= ret;
 		}
-	} while (ret > 0);
+	} while (ret > 0 && iocb->ki_left > 0);
 
 	if ((ret == 0) || (iocb->ki_left == 0))
 		ret = iocb->ki_nbytes - iocb->ki_left;
-- 
cgit v1.2.2