author    Benjamin LaHaise <bcrl@kvack.org>    2014-08-24 13:14:05 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-08-24 18:47:27 -0400
commit    d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 (patch)
tree      885097a564287bad1cc32039d89501e5a2e8b867
parent    451fd72219dd6f3355e2d036c598544c760ee532 (diff)
aio: fix reqs_available handling
As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request leak when events are reaped by userspace") introduces a regression when user code attempts to perform io_submit() with more events than are available in the ring buffer. Reverting that commit would reintroduce a regression when user space event reaping is used.

Fixing this bug is a bit more involved than the previous attempts to fix this regression. Since we do not have a single point at which we can count events as being reaped by user space and io_getevents(), we have to track event completion by looking at the number of events left in the event ring. So long as there are as many events in the ring buffer as there have been completion events generated, we cannot call put_reqs_available(). The code to check for this is now placed in refill_reqs_available().

A test program from Dan, modified by me for verifying this bug, is available at http://www.kvack.org/~bcrl/20140824-aio_bug.c .

Reported-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: Dan Aloni <dan@kernelim.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Mateusz Guzik <mguzik@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: stable@vger.kernel.org    # v3.16 and anything that f8567a3845ac was backported to
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
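For context, the failing scenario can be exercised with a small libaio program along the lines of the sketch below. This is only a hedged illustration, not the 20140824-aio_bug.c test program linked above; the file path, ring size (RING_EVENTS) and submission count (SUBMIT_TOTAL) are made up, and it assumes the libaio headers and -laio are available. On an affected kernel, over-submitting like this without promptly reaping events can leave io_submit() returning short counts or -EAGAIN even though completed slots should be reusable.

/* Hedged reproducer sketch -- NOT the test program referenced in the
 * commit message.  Submits more iocbs than the completion ring was
 * sized for, the io_submit() overload case the patch addresses. */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define RING_EVENTS     32      /* ring size requested from io_setup() (arbitrary) */
#define SUBMIT_TOTAL    128     /* deliberately larger than RING_EVENTS */

int main(void)
{
        io_context_t ctx = 0;
        struct iocb cbs[SUBMIT_TOTAL];
        struct iocb *cbp[SUBMIT_TOTAL];
        char buf[512];
        int fd, i, ret;

        fd = open("/etc/hostname", O_RDONLY);   /* any readable file will do */
        if (fd < 0 || io_setup(RING_EVENTS, &ctx) != 0)
                return 1;

        for (i = 0; i < SUBMIT_TOTAL; i++) {
                io_prep_pread(&cbs[i], fd, buf, sizeof(buf), 0);
                cbp[i] = &cbs[i];
        }

        /* With the bug present, slots freed by completed requests are not
         * reliably returned to reqs_available, so submission can fail even
         * though the ring has room. */
        ret = io_submit(ctx, SUBMIT_TOTAL, cbp);
        printf("io_submit returned %d (asked for %d)\n", ret, SUBMIT_TOTAL);

        io_destroy(ctx);
        close(fd);
        return ret < 0 ? 1 : 0;
}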
-rw-r--r--    fs/aio.c    77
1 file changed, 73 insertions, 4 deletions
diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..97bc62cbe2da 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
         struct {
                 unsigned        tail;
+                unsigned        completed_events;
                 spinlock_t      completion_lock;
         } ____cacheline_aligned_in_smp;
 
@@ -857,6 +858,68 @@ out:
         return ret;
 }
 
+/* refill_reqs_available
+ *      Updates the reqs_available reference counts used for tracking the
+ *      number of free slots in the completion ring.  This can be called
+ *      from aio_complete() (to optimistically update reqs_available) or
+ *      from aio_get_req() (the we're out of events case).  It must be
+ *      called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+                                  unsigned tail)
+{
+        unsigned events_in_ring, completed;
+
+        /* Clamp head since userland can write to it. */
+        head %= ctx->nr_events;
+        if (head <= tail)
+                events_in_ring = tail - head;
+        else
+                events_in_ring = ctx->nr_events - (head - tail);
+
+        completed = ctx->completed_events;
+        if (events_in_ring < completed)
+                completed -= events_in_ring;
+        else
+                completed = 0;
+
+        if (!completed)
+                return;
+
+        ctx->completed_events -= completed;
+        put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *      Called to refill reqs_available when aio_get_req() encounters an
+ *      out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+        spin_lock_irq(&ctx->completion_lock);
+        if (ctx->completed_events) {
+                struct aio_ring *ring;
+                unsigned head;
+
+                /* Access of ring->head may race with aio_read_events_ring()
+                 * here, but that's okay since whether we read the old version
+                 * or the new version, and either will be valid.  The important
+                 * part is that head cannot pass tail since we prevent
+                 * aio_complete() from updating tail by holding
+                 * ctx->completion_lock.  Even if head is invalid, the check
+                 * against ctx->completed_events below will make sure we do the
+                 * safe/right thing.
+                 */
+                ring = kmap_atomic(ctx->ring_pages[0]);
+                head = ring->head;
+                kunmap_atomic(ring);
+
+                refill_reqs_available(ctx, head, ctx->tail);
+        }
+
+        spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *      Allocate a slot for an aio request.
  *      Returns NULL if no requests are free.
@@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
         struct kiocb *req;
 
-        if (!get_reqs_available(ctx))
-                return NULL;
+        if (!get_reqs_available(ctx)) {
+                user_refill_reqs_available(ctx);
+                if (!get_reqs_available(ctx))
+                        return NULL;
+        }
 
         req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
         if (unlikely(!req))
@@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
         struct kioctx   *ctx = iocb->ki_ctx;
         struct aio_ring *ring;
         struct io_event *ev_page, *event;
+        unsigned tail, pos, head;
         unsigned long   flags;
-        unsigned tail, pos;
 
         /*
          * Special case handling for sync iocbs:
@@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
         ctx->tail = tail;
 
         ring = kmap_atomic(ctx->ring_pages[0]);
+        head = ring->head;
         ring->tail = tail;
         kunmap_atomic(ring);
         flush_dcache_page(ctx->ring_pages[0]);
 
+        ctx->completed_events++;
+        if (ctx->completed_events > 1)
+                refill_reqs_available(ctx, head, tail);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
         pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
         /* everything turned out well, dispose of the aiocb. */
         kiocb_free(iocb);
-        put_reqs_available(ctx, 1);
 
         /*
          * We have to order our ring_info tail store above and test
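As a standalone illustration (plain user-space C, not kernel code) of the accounting that refill_reqs_available() performs in the diff above, the helper below mirrors the same arithmetic: clamp head, compute how many events still sit in the ring across a possible wrap-around, and treat only the completions that have already left the ring as reclaimable. The function name and the numbers in main() are invented for the example.

#include <stdio.h>

/* Mirror of the refill_reqs_available() arithmetic from the patch:
 * only completions no longer resident in the ring may be handed
 * back to reqs_available. */
static unsigned reclaimable(unsigned nr_events, unsigned head,
                            unsigned tail, unsigned completed_events)
{
        unsigned events_in_ring;

        head %= nr_events;              /* clamp: userland can write head */
        if (head <= tail)
                events_in_ring = tail - head;
        else
                events_in_ring = nr_events - (head - tail);

        return completed_events > events_in_ring ?
               completed_events - events_in_ring : 0;
}

int main(void)
{
        /* 128-slot ring where tail has wrapped: 8 events remain unread,
         * 20 completions have been recorded, so 12 slots can be handed
         * back to reqs_available. */
        printf("%u\n", reclaimable(128, 124, 4, 20));   /* prints 12 */
        return 0;
}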