Diffstat (limited to 'fs/aio.c')
-rw-r--r--  fs/aio.c  94
1 file changed, 87 insertions, 7 deletions
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -660,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
@@ -792,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -803,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
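The exit_aio() hunks above block teardown on an on-stack completion until every request for the context has finished. A rough userspace analogue of that pattern, with C11 threads standing in for the kernel's struct completion (all names here are illustrative, not from fs/aio.c):

/* Minimal sketch: the teardown path hands a "completion" to the kill
 * path and blocks until the last in-flight request signals it. */
#include <stdio.h>
#include <threads.h>

struct completion {
	mtx_t lock;
	cnd_t wait;
	int done;
};

static void init_completion(struct completion *c)
{
	mtx_init(&c->lock, mtx_plain);
	cnd_init(&c->wait);
	c->done = 0;
}

static void complete(struct completion *c)
{
	mtx_lock(&c->lock);
	c->done = 1;
	cnd_signal(&c->wait);
	mtx_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	mtx_lock(&c->lock);
	while (!c->done)
		cnd_wait(&c->wait, &c->lock);
	mtx_unlock(&c->lock);
}

/* Stand-in for the request teardown that eventually finishes all I/O. */
static int kill_requests(void *arg)
{
	complete(arg);	/* last request done: wake the waiter */
	return 0;
}

int main(void)
{
	struct completion requests_done;
	thrd_t t;

	init_completion(&requests_done);
	thrd_create(&t, kill_requests, &requests_done);
	wait_for_completion(&requests_done);	/* exit_aio()-style wait */
	thrd_join(t, NULL);
	puts("all requests done");
	return 0;
}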
@@ -857,6 +863,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
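The core of refill_reqs_available() above is ring-occupancy arithmetic: events still sitting between head and tail are visible to userspace and must stay accounted for, so only completions beyond that count can be handed back to reqs_available. A standalone sketch of that calculation (plain userspace C; reclaimable() is a made-up helper name, not kernel code):

#include <assert.h>
#include <stdio.h>

/* Mirror of the ring occupancy math in refill_reqs_available(): returns
 * how many completed events are no longer in the ring and can therefore
 * be returned to the available-slots pool. */
static unsigned reclaimable(unsigned head, unsigned tail,
			    unsigned nr_events, unsigned completed)
{
	unsigned events_in_ring;

	head %= nr_events;	/* clamp: userland can scribble on head */
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = nr_events - (head - tail);

	return events_in_ring < completed ? completed - events_in_ring : 0;
}

int main(void)
{
	/* 128-slot ring, 10 events still unread, 25 completions recorded:
	 * 15 slots can be refilled now, the other 10 once userspace
	 * consumes the events between head and tail. */
	assert(reclaimable(100, 110, 128, 25) == 15);
	/* Wrapped case: 20 events in the ring, 20 completions recorded,
	 * so nothing extra can be reclaimed yet. */
	assert(reclaimable(120, 12, 128, 20) == 0);
	printf("ring accounting checks passed\n");
	return 0;
}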
@@ -865,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
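The aio_get_req() change above turns a hard failure into try/reclaim/retry: if the fast path finds no free slots, reclaim whatever has already completed and try exactly once more before giving up. A toy model of that shape with C11 atomics (pool, try_get and refill are hypothetical names, not the kernel's API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Shared pool of request slots plus a side counter of completions that
 * have not yet been folded back into the pool. */
struct pool {
	_Atomic int available;	/* reqs_available analogue */
	_Atomic int completed;	/* completed_events analogue */
};

static struct pool pool = { .available = 0, .completed = 3 };

static bool try_get(struct pool *p)
{
	int old = atomic_load(&p->available);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&p->available, &old, old - 1))
			return true;
	}
	return false;
}

static void refill(struct pool *p)
{
	/* Move whatever has completed back into the available pool. */
	int done = atomic_exchange(&p->completed, 0);

	atomic_fetch_add(&p->available, done);
}

static bool get_slot(struct pool *p)
{
	if (try_get(p))
		return true;
	refill(p);		/* user_refill_reqs_available() analogue */
	return try_get(p);	/* one retry, then fail as before */
}

int main(void)
{
	/* Pool starts empty, but three completions are pending: the
	 * reclaim-and-retry path makes the allocation succeed. */
	printf("first attempt: %s\n", get_slot(&pool) ? "ok" : "busy");
	return 0;
}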
@@ -925,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1042,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
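The smp_rmb() added above orders the load of ring->tail against the subsequent reads of the events themselves, pairing with the completion side, which stores each event before publishing the new tail. A small userspace model of that producer/consumer ordering using C11 acquire/release atomics (an analogue of the barrier pairing, not the kernel's implementation):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

#define RING_SIZE 8

/* Toy event ring: the producer plays the role of aio_complete(), the
 * consumer the role of aio_read_events_ring().  Only the tail
 * publication ordering is modelled; all names are illustrative. */
static long events[RING_SIZE];
static _Atomic unsigned tail;	/* next free slot, published by producer */

static int producer(void *arg)
{
	unsigned t = atomic_load_explicit(&tail, memory_order_relaxed);

	(void)arg;
	events[t % RING_SIZE] = 42;	/* store the event payload first... */
	/* ...then publish the new tail; the release store pairs with the
	 * consumer's acquire load, much like the wmb/rmb pairing above. */
	atomic_store_explicit(&tail, t + 1, memory_order_release);
	return 0;
}

int main(void)
{
	thrd_t t;
	unsigned head = 0, cur;

	thrd_create(&t, producer, NULL);

	/* Consumer: wait for tail to move, acquire-load it, and only then
	 * read the event slots up to it. */
	do {
		cur = atomic_load_explicit(&tail, memory_order_acquire);
	} while (cur == head);

	printf("event[%u] = %ld\n", head, events[head % RING_SIZE]);
	thrd_join(t, NULL);
	return 0;
}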