From 897f15fb587fd2772b9e7ff6ec0265057f3c3975 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 30 Sep 2005 11:58:55 -0700
Subject: [PATCH] aio: remove unlocked task_list test and resulting race

Only one of the run or kick path is supposed to put an iocb on the run
list.  If both of them do it then one of them can end up referencing a
freed iocb.  The kick path could delete the task_list item from the wait
queue before getting the ctx_lock and putting the iocb on the run list.
The run path was testing the task_list item outside the lock so that it
could catch ki_retry methods that return -EIOCBRETRY *without* putting the
iocb on a wait queue and promising to call kick_iocb.  This unlocked check
could then race with the kick path and cause both to try to put the iocb
on the run list.

The patch stops the run path from testing task_list by requiring that any
ki_retry that returns -EIOCBRETRY *must* guarantee that kick_iocb() will be
called in the future.  aio_p{read,write}, the only in-tree -EIOCBRETRY
users, are updated.

Signed-off-by: Zach Brown
Signed-off-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index a4d5af907f90..60def658b246 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -43,6 +43,40 @@ struct kioctx;
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
 
+/* is there a better place to document function pointer methods? */
+/**
+ * ki_retry	-	iocb forward progress callback
+ * @kiocb:	The kiocb struct to advance by performing an operation.
+ *
+ * This callback is called when the AIO core wants a given AIO operation
+ * to make forward progress.  The kiocb argument describes the operation
+ * that is to be performed.  As the operation proceeds, perhaps partially,
+ * ki_retry is expected to update the kiocb with progress made.  Typically
+ * ki_retry is set in the AIO core and it itself calls file_operations
+ * helpers.
+ *
+ * ki_retry's return value determines when the AIO operation is completed
+ * and an event is generated in the AIO event ring.  Except for the special
+ * return values described below, the value that is returned from ki_retry
+ * is transferred directly into the completion ring as the operation's
+ * resulting status.  Once this has happened ki_retry *MUST NOT* reference
+ * the kiocb pointer again.
+ *
+ * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
+ * will be called on the kiocb pointer in the future.  The AIO core will
+ * not ask the method again -- ki_retry must ensure forward progress.
+ * aio_complete() must be called once and only once in the future; multiple
+ * calls may result in undefined behaviour.
+ *
+ * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
+ * will be called on the kiocb pointer in the future.  This may happen
+ * through generic helpers that associate kiocb->ki_wait with a wait
+ * queue head that ki_retry uses via current->io_wait.  It can also happen
+ * with custom tracking and manual calls to kick_iocb(), though that is
+ * discouraged.  In either case, kick_iocb() must be called once and only
+ * once.  ki_retry must ensure forward progress; the AIO core will wait
+ * indefinitely for kick_iocb() to be called.
+ */
 struct kiocb {
	struct list_head	ki_run_list;
	long			ki_flags;
--
cgit v1.2.2
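To make the -EIOCBQUEUED / -EIOCBRETRY contract above concrete, a ki_retry-style method might be sketched roughly as follows.  This is not part of the patch: the my_dev_* names are hypothetical, and the ssize_t (*ki_retry)(struct kiocb *) signature and the ki_nbytes field are assumed from this kernel generation; only kick_iocb() and aio_complete() are real AIO core entry points.

/*
 * Minimal, hypothetical sketch of a ki_retry method honouring the
 * contract documented in the patch above.  my_dev_data_ready(),
 * my_dev_start_io() and my_dev_irq_done() are made-up names.
 */
#include <linux/aio.h>
#include <linux/errno.h>

static int my_dev_data_ready(struct kiocb *iocb);	/* hypothetical */
static void my_dev_start_io(struct kiocb *iocb);	/* hypothetical */

/* Would be installed as iocb->ki_retry by the submission path. */
static ssize_t my_dev_retry(struct kiocb *iocb)
{
	if (my_dev_data_ready(iocb)) {
		/*
		 * Done: the returned byte count goes straight into the
		 * completion ring; we must not touch iocb after this.
		 */
		return iocb->ki_nbytes;
	}

	/*
	 * Not ready yet.  Arrange for kick_iocb(iocb) to be called when
	 * the work completes -- only then may we return -EIOCBRETRY,
	 * because the AIO core will never poll us again on its own.
	 */
	my_dev_start_io(iocb);
	return -EIOCBRETRY;
}

/* Called from the (hypothetical) completion path, exactly once. */
static void my_dev_irq_done(struct kiocb *iocb)
{
	kick_iocb(iocb);	/* puts the iocb back on the run list */
}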
From 4faa5285283fad081443e3612ca426a311bb6c7e Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 17 Oct 2005 16:43:33 -0700
Subject: [PATCH] aio: revert lock_kiocb()

lock_kiocb() was introduced to serialize retrying and cancellation.  In the
process of doing so it tried to sleep waiting for KIF_LOCKED while holding
the ctx_lock spinlock.

Recent fixes have ensured that multiple concurrent retries won't be
attempted for a given iocb.  Cancel has other problems and has no
significant in-tree users that have been complaining about it.  So for the
immediate future we'll revert sleeping with the lock held and will address
proper cancellation and retry serialization in the future.

Signed-off-by: Zach Brown
Acked-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 60def658b246..0decf66117c1 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -24,7 +24,12 @@ struct kioctx;
 #define KIOCB_SYNC_KEY		(~0U)
 
 /* ki_flags bits */
-#define KIF_LOCKED		0
+/*
+ * This may be used for cancel/retry serialization in the future, but
+ * for now it's unused and we probably don't want modules to even
+ * think they can use it.
+ */
+/* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
--
cgit v1.2.2

From d55b5fdaf40846221d543937b786956e27837fda Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 7 Nov 2005 00:59:31 -0800
Subject: [PATCH] aio: remove aio_max_nr accounting race

AIO was adding a new context's max requests to the global total before
testing if that resulting total was over the global limit.  This let
innocent tasks get their new limit tested along with a racing guilty task
that was crossing the limit.  This serializes the _nr accounting with a
spinlock.

It also switches to using unsigned long for the global totals.  Individual
contexts are still limited to an unsigned int's worth of requests by the
syscall interface.

The problem and fix were verified with a simple program that spun creating
and destroying a context while holding on to another long lived context.
Before the patch a task creating a tiny context could get a spurious EAGAIN
if it raced with a task creating a very large context that overran the
limit.

Signed-off-by: Zach Brown
Cc: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 0decf66117c1..403d71dcb7c8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -183,6 +183,7 @@ struct kioctx {
	struct list_head	active_reqs;	/* used for cancellation */
	struct list_head	run_list;	/* used for kicked reqs */
 
+	/* sys_io_setup currently limits this to an unsigned int */
	unsigned		max_reqs;
 
	struct aio_ring_info	ring_info;
@@ -234,7 +235,7 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
 }
 
 /* for sysctl: */
-extern atomic_t aio_nr;
-extern unsigned aio_max_nr;
+extern unsigned long aio_nr;
+extern unsigned long aio_max_nr;
 
 #endif /* __LINUX__AIO_H */
--
cgit v1.2.2
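The fs/aio.c side of this fix is outside this header-only view, but the accounting it describes can be sketched as below.  aio_nr and aio_max_nr are the real totals declared in this header; the lock name and the helper function are assumptions made for the example, not the actual change.

/*
 * Illustrative sketch of "_nr accounting with a spinlock": test against
 * the limit before charging, all under one lock, so a guilty over-limit
 * caller can never make an innocent caller fail.
 */
#include <linux/aio.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

static DEFINE_SPINLOCK(aio_nr_lock);		/* illustrative name */

/* Charge nr_events requests against the global limit, or fail with EAGAIN. */
static int aio_charge_reqs(unsigned nr_events)
{
	int ret = -EAGAIN;

	spin_lock(&aio_nr_lock);
	/*
	 * Test before adding, and guard against the unsigned long sum
	 * wrapping, so the global total never silently overruns aio_max_nr.
	 */
	if (aio_nr + nr_events <= aio_max_nr &&
	    aio_nr + nr_events >= aio_nr) {
		aio_nr += nr_events;
		ret = 0;
	}
	spin_unlock(&aio_nr_lock);
	return ret;
}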
From 20dcae32439384b6863c626bb3b2a09bed65b33e Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Sun, 13 Nov 2005 16:07:33 -0800
Subject: [PATCH] aio: remove kioctx from mm_struct

Sync iocbs have a life cycle that doesn't need a kioctx.  Their retrying,
if any, is done in the context of their owner who has allocated them on
the stack.

The sole user of a sync iocb's ctx reference was aio_complete() checking
for an elevated iocb ref count that could never happen.  No path which
grabs an iocb ref has access to sync iocbs.  If we were to implement sync
iocb cancelation it would be done by the owner of the iocb using its
on-stack reference.

Removing this chunk from aio_complete allows us to remove the entire
kioctx instance from mm_struct, reducing its size by a third.  On an i386
testing box the slab size went from 768 to 504 bytes and from 5 to 8
objects per page.

Signed-off-by: Zach Brown
Acked-by: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 403d71dcb7c8..9e0ae8711276 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -124,7 +124,7 @@ struct kiocb {
	(x)->ki_users = 1;				\
	(x)->ki_key = KIOCB_SYNC_KEY;			\
	(x)->ki_filp = (filp);				\
-	(x)->ki_ctx = &tsk->active_mm->default_kioctx;	\
+	(x)->ki_ctx = NULL;				\
	(x)->ki_cancel = NULL;				\
	(x)->ki_dtor = NULL;				\
	(x)->ki_obj.tsk = tsk;				\
--
cgit v1.2.2
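The reason a sync iocb never needs ki_ctx is visible in how a synchronous read drove the aio path around this time: the kiocb lives on the caller's stack and the caller itself waits on it.  The sketch below is a simplified illustration, not a verbatim copy of fs/read_write.c; it assumes the pre-iovec aio_read signature and wait_on_sync_kiocb() from this kernel generation.

/*
 * Simplified sketch of a do_sync_read()-style caller.  The on-stack
 * kiocb is owned and waited on by this function, so with this patch
 * its ki_ctx is simply NULL.
 */
#include <linux/aio.h>
#include <linux/fs.h>

static ssize_t sketch_sync_read(struct file *filp, char __user *buf,
				size_t len, loff_t *ppos)
{
	struct kiocb kiocb;		/* on-stack sync iocb, ki_ctx == NULL */
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;

	ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);	/* owner waits directly */

	*ppos = kiocb.ki_pos;
	return ret;
}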
From 5ef1c49f8f9f0d6b5b8d57bb4b66c605a3d65876 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Sun, 13 Nov 2005 16:07:35 -0800
Subject: [PATCH] aio: don't ref kioctx after decref in put_ioctx

put_ioctx's refcount debugging was doing an atomic_read after dropping its
reference when it wasn't the last ref, leaving a tiny race for another
freeing thread to sneak into.  This shifts the debugging before the ops,
uses BUG_ON, and reformats the defines a little.

Sadly, moving to inlines increased the code size, but this change decreases
the code size by a whole 9 bytes :)

Signed-off-by: Zach Brown
Cc: Benjamin LaHaise
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/aio.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux/aio.h')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9e0ae8711276..49fd37629ee4 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -210,8 +210,15 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id);
 int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			   struct iocb *iocb));
 
-#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
-#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)
+#define get_ioctx(kioctx) do {						\
+	BUG_ON(unlikely(atomic_read(&(kioctx)->users) <= 0));		\
+	atomic_inc(&(kioctx)->users);					\
+} while (0)
+#define put_ioctx(kioctx) do {						\
+	BUG_ON(unlikely(atomic_read(&(kioctx)->users) <= 0));		\
+	if (unlikely(atomic_dec_and_test(&(kioctx)->users)))		\
+		__put_ioctx(kioctx);					\
+} while (0)
 
 #define in_aio() !is_sync_wait(current->io_wait)	/* may be used for debugging */
--
cgit v1.2.2
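The ordering bug this patch closes is generic to refcount debugging, and can be shown in isolation with the sketch below.  The struct and helper names are made up; only the atomic_* calls and BUG_ON are the real kernel API, and the fixed variant mirrors the put_ioctx macro in the diff above.

/*
 * Why reading the counter after dropping our reference is racy: once the
 * decrement lands, another thread may drop the last reference and free
 * the object before our debug atomic_read runs.
 */
#include <linux/kernel.h>	/* BUG_ON */
#include <asm/atomic.h>

struct foo {
	atomic_t users;
};

extern void __free_foo(struct foo *f);		/* hypothetical destructor */

/* Racy variant: touches f->users again after our reference is gone. */
static inline void foo_put_racy(struct foo *f)
{
	if (atomic_dec_and_test(&f->users))
		__free_foo(f);
	else if (atomic_read(&f->users) < 0)	/* f may already be freed here */
		BUG();
}

/* Fixed ordering: sanity check while we still hold a reference. */
static inline void foo_put(struct foo *f)
{
	BUG_ON(atomic_read(&f->users) <= 0);
	if (atomic_dec_and_test(&f->users))
		__free_foo(f);
}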