From 897f15fb587fd2772b9e7ff6ec0265057f3c3975 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 30 Sep 2005 11:58:55 -0700
Subject: [PATCH] aio: remove unlocked task_list test and resulting race

Only one of the run or kick path is supposed to put an iocb on the run
list.  If both of them do it then one of them can end up referencing a
freed iocb.  The kick path could delete the task_list item from the wait
queue before getting the ctx_lock and putting the iocb on the run list.
The run path was testing the task_list item outside the lock so that it
could catch ki_retry methods that return -EIOCBRETRY *without* putting the
iocb on a wait queue and promising to call kick_iocb.  This unlocked check
could then race with the kick path and cause both to try to put the iocb
on the run list.

The patch stops the run path from testing task_list by requiring that any
ki_retry that returns -EIOCBRETRY *must* guarantee that kick_iocb() will be
called in the future.  aio_p{read,write}, the only in-tree -EIOCBRETRY
users, are updated.

Signed-off-by: Zach Brown
Signed-off-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index a4d5af907f90..60def658b246 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -43,6 +43,40 @@ struct kioctx;
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
 
+/* is there a better place to document function pointer methods? */
+/**
+ * ki_retry	-	iocb forward progress callback
+ * @kiocb:	The kiocb struct to advance by performing an operation.
+ *
+ * This callback is called when the AIO core wants a given AIO operation
+ * to make forward progress.  The kiocb argument describes the operation
+ * that is to be performed.  As the operation proceeds, perhaps partially,
+ * ki_retry is expected to update the kiocb with progress made.  Typically
+ * ki_retry is set in the AIO core and it itself calls file_operations
+ * helpers.
+ *
+ * ki_retry's return value determines when the AIO operation is completed
+ * and an event is generated in the AIO event ring.  Except for the special
+ * return values described below, the value that is returned from ki_retry
+ * is transferred directly into the completion ring as the operation's
+ * resulting status.  Once this has happened ki_retry *MUST NOT* reference
+ * the kiocb pointer again.
+ *
+ * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
+ * will be called on the kiocb pointer in the future.  The AIO core will
+ * not ask the method again -- ki_retry must ensure forward progress.
+ * aio_complete() must be called once and only once in the future; multiple
+ * calls may result in undefined behaviour.
+ *
+ * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
+ * will be called on the kiocb pointer in the future.  This may happen
+ * through generic helpers that associate kiocb->ki_wait with a wait
+ * queue head that ki_retry uses via current->io_wait.  It can also happen
+ * with custom tracking and manual calls to kick_iocb(), though that is
+ * discouraged.  In either case, kick_iocb() must be called once and only
+ * once.  ki_retry must ensure forward progress; the AIO core will wait
+ * indefinitely for kick_iocb() to be called.
+ */
 struct kiocb {
	struct list_head	ki_run_list;
	long			ki_flags;
--
cgit v1.2.2
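To make the -EIOCBQUEUED / -EIOCBRETRY contract above concrete, a ki_retry-style method might be sketched roughly as follows.  This is not part of the patch: the my_dev_* names are hypothetical, and the ssize_t (*ki_retry)(struct kiocb *) signature and the ki_nbytes field are assumed from this kernel generation; only kick_iocb() and aio_complete() are real AIO core entry points.

/*
 * Minimal, hypothetical sketch of a ki_retry method honouring the
 * contract documented in the patch above.  my_dev_data_ready(),
 * my_dev_start_io() and my_dev_irq_done() are made-up names.
 */
#include <linux/aio.h>
#include <linux/errno.h>

static int my_dev_data_ready(struct kiocb *iocb);	/* hypothetical */
static void my_dev_start_io(struct kiocb *iocb);	/* hypothetical */

/* Would be installed as iocb->ki_retry by the submission path. */
static ssize_t my_dev_retry(struct kiocb *iocb)
{
	if (my_dev_data_ready(iocb)) {
		/*
		 * Done: the returned byte count goes straight into the
		 * completion ring; we must not touch iocb after this.
		 */
		return iocb->ki_nbytes;
	}

	/*
	 * Not ready yet.  Arrange for kick_iocb(iocb) to be called when
	 * the work completes -- only then may we return -EIOCBRETRY,
	 * because the AIO core will never poll us again on its own.
	 */
	my_dev_start_io(iocb);
	return -EIOCBRETRY;
}

/* Called from the (hypothetical) completion path, exactly once. */
static void my_dev_irq_done(struct kiocb *iocb)
{
	kick_iocb(iocb);	/* puts the iocb back on the run list */
}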
From 4faa5285283fad081443e3612ca426a311bb6c7e Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 17 Oct 2005 16:43:33 -0700
Subject: [PATCH] aio: revert lock_kiocb()

lock_kiocb() was introduced to serialize retrying and cancellation.  In the
process of doing so it tried to sleep waiting for KIF_LOCKED while holding
the ctx_lock spinlock.

Recent fixes have ensured that multiple concurrent retries won't be
attempted for a given iocb.  Cancel has other problems and has no
significant in-tree users that have been complaining about it.  So for the
immediate future we'll revert sleeping with the lock held and will address
proper cancellation and retry serialization in the future.

Signed-off-by: Zach Brown
Acked-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 60def658b246..0decf66117c1 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -24,7 +24,12 @@ struct kioctx;
 #define KIOCB_SYNC_KEY		(~0U)
 
 /* ki_flags bits */
-#define KIF_LOCKED		0
+/*
+ * This may be used for cancel/retry serialization in the future, but
+ * for now it's unused and we probably don't want modules to even
+ * think they can use it.
+ */
+/* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
--
cgit v1.2.2

From d55b5fdaf40846221d543937b786956e27837fda Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 7 Nov 2005 00:59:31 -0800
Subject: [PATCH] aio: remove aio_max_nr accounting race

AIO was adding a new context's max requests to the global total before
testing if that resulting total was over the global limit.  This let
innocent tasks get their new limit tested along with a racing guilty task
that was crossing the limit.  This serializes the _nr accounting with a
spinlock.

It also switches to using unsigned long for the global totals.  Individual
contexts are still limited to an unsigned int's worth of requests by the
syscall interface.

The problem and fix were verified with a simple program that spun creating
and destroying a context while holding on to another long lived context.
Before the patch a task creating a tiny context could get a spurious EAGAIN
if it raced with a task creating a very large context that overran the
limit.

Signed-off-by: Zach Brown
Cc: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 0decf66117c1..403d71dcb7c8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -183,6 +183,7 @@ struct kioctx {
	struct list_head	active_reqs;	/* used for cancellation */
	struct list_head	run_list;	/* used for kicked reqs */
 
+	/* sys_io_setup currently limits this to an unsigned int */
	unsigned		max_reqs;
 
	struct aio_ring_info	ring_info;
@@ -234,7 +235,7 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
 }
 
 /* for sysctl: */
-extern atomic_t aio_nr;
-extern unsigned aio_max_nr;
+extern unsigned long aio_nr;
+extern unsigned long aio_max_nr;
 
 #endif /* __LINUX__AIO_H */
--
cgit v1.2.2
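The fs/aio.c side of this fix is outside this header-only view, but the accounting it describes can be sketched as below.  aio_nr and aio_max_nr are the real totals declared in this header; the lock name and the helper function are assumptions made for the example, not the actual change.

/*
 * Illustrative sketch of "_nr accounting with a spinlock": test against
 * the limit before charging, all under one lock, so a guilty over-limit
 * caller can never make an innocent caller fail.
 */
#include <linux/aio.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

static DEFINE_SPINLOCK(aio_nr_lock);		/* illustrative name */

/* Charge nr_events requests against the global limit, or fail with EAGAIN. */
static int aio_charge_reqs(unsigned nr_events)
{
	int ret = -EAGAIN;

	spin_lock(&aio_nr_lock);
	/*
	 * Test before adding, and guard against the unsigned long sum
	 * wrapping, so the global total never silently overruns aio_max_nr.
	 */
	if (aio_nr + nr_events <= aio_max_nr &&
	    aio_nr + nr_events >= aio_nr) {
		aio_nr += nr_events;
		ret = 0;
	}
	spin_unlock(&aio_nr_lock);
	return ret;
}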
From 20dcae32439384b6863c626bb3b2a09bed65b33e Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Sun, 13 Nov 2005 16:07:33 -0800
Subject: [PATCH] aio: remove kioctx from mm_struct

Sync iocbs have a life cycle that doesn't need a kioctx.  Their retrying,
if any, is done in the context of their owner who has allocated them on
the stack.

The sole user of a sync iocb's ctx reference was aio_complete() checking
for an elevated iocb ref count that could never happen.  No path which
grabs an iocb ref has access to sync iocbs.  If we were to implement sync
iocb cancelation it would be done by the owner of the iocb using its
on-stack reference.

Removing this chunk from aio_complete allows us to remove the entire
kioctx instance from mm_struct, reducing its size by a third.  On an i386
testing box the slab size went from 768 to 504 bytes and from 5 to 8
objects per page.

Signed-off-by: Zach Brown
Acked-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 403d71dcb7c8..9e0ae8711276 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -124,7 +124,7 @@ struct kiocb {
	(x)->ki_users = 1;				\
	(x)->ki_key = KIOCB_SYNC_KEY;			\
	(x)->ki_filp = (filp);				\
-	(x)->ki_ctx = &tsk->active_mm->default_kioctx;	\
+	(x)->ki_ctx = NULL;				\
	(x)->ki_cancel = NULL;				\
	(x)->ki_dtor = NULL;				\
	(x)->ki_obj.tsk = tsk;				\
--
cgit v1.2.2
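The reason a sync iocb never needs ki_ctx is visible in how a synchronous read drove the aio path around this time: the kiocb lives on the caller's stack and the caller itself waits on it.  The sketch below is a simplified illustration, not a verbatim copy of fs/read_write.c; it assumes the pre-iovec aio_read signature and wait_on_sync_kiocb() from this kernel generation.

/*
 * Simplified sketch of a do_sync_read()-style caller.  The on-stack
 * kiocb is owned and waited on by this function, so with this patch
 * its ki_ctx is simply NULL.
 */
#include <linux/aio.h>
#include <linux/fs.h>

static ssize_t sketch_sync_read(struct file *filp, char __user *buf,
				size_t len, loff_t *ppos)
{
	struct kiocb kiocb;		/* on-stack sync iocb, ki_ctx == NULL */
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;

	ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);	/* owner waits directly */

	*ppos = kiocb.ki_pos;
	return ret;
}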
From 5ef1c49f8f9f0d6b5b8d57bb4b66c605a3d65876 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Sun, 13 Nov 2005 16:07:35 -0800
Subject: [PATCH] aio: don't ref kioctx after decref in put_ioctx

put_ioctx's refcount debugging was doing an atomic_read after dropping its
reference when it wasn't the last ref, leaving a tiny race for another
freeing thread to sneak into.  This shifts the debugging before the ops,
uses BUG_ON, and reformats the defines a little.

Sadly, moving to inlines increased the code size, but this change decreases
the code size by a whole 9 bytes :)

Signed-off-by: Zach Brown
Cc: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9e0ae8711276..49fd37629ee4 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -210,8 +210,15 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id);
 int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			   struct iocb *iocb));
 
-#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
-#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)
+#define get_ioctx(kioctx) do {						\
+	BUG_ON(unlikely(atomic_read(&(kioctx)->users) <= 0));		\
+	atomic_inc(&(kioctx)->users);					\
+} while (0)
+#define put_ioctx(kioctx) do {						\
+	BUG_ON(unlikely(atomic_read(&(kioctx)->users) <= 0));		\
+	if (unlikely(atomic_dec_and_test(&(kioctx)->users)))		\
+		__put_ioctx(kioctx);					\
+} while (0)
 
 #define in_aio() !is_sync_wait(current->io_wait)	/* may be used for debugging */
--
cgit v1.2.2
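The ordering bug this patch closes is generic to refcount debugging, and can be shown in isolation with the sketch below.  The struct and helper names are made up; only the atomic_* calls and BUG_ON are the real kernel API, and the fixed variant mirrors the put_ioctx macro in the diff above.

/*
 * Why reading the counter after dropping our reference is racy: once the
 * decrement lands, another thread may drop the last reference and free
 * the object before our debug atomic_read runs.
 */
#include <linux/kernel.h>	/* BUG_ON */
#include <asm/atomic.h>

struct foo {
	atomic_t users;
};

extern void __free_foo(struct foo *f);		/* hypothetical destructor */

/* Racy variant: touches f->users again after our reference is gone. */
static inline void foo_put_racy(struct foo *f)
{
	if (atomic_dec_and_test(&f->users))
		__free_foo(f);
	else if (atomic_read(&f->users) < 0)	/* f may already be freed here */
		BUG();
}

/* Fixed ordering: sanity check while we still hold a reference. */
static inline void foo_put(struct foo *f)
{
	BUG_ON(atomic_read(&f->users) <= 0);
	if (atomic_dec_and_test(&f->users))
		__free_foo(f);
}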