author		Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
committer	Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
commit		d06efebf0c37d438fcf07057be00dd40fcfce08d (patch)
tree		31a0786d132aadf4cbb9725f3f444ef6e1052128 /fs/aio.c
parent		bb2e226b3bef596dd56be97df655d857b4603923 (diff)
parent		0a30288da1aec914e158c2d7a3482a85f632750f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into for-3.18
This is to receive 0a30288da1ae ("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"), which implements __percpu_ref_kill_expedited() to work around the SCSI blk-mq stall. That commit will be reverted and patches implementing a proper fix will be added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/aio.c')
-rw-r--r--	fs/aio.c	174
1 file changed, 114 insertions(+), 60 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 93fbcc0f5696..8d217ed04e6e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -192,7 +193,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
@@ -202,7 +202,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
@@ -556,8 +556,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -565,7 +564,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				if (!table->table[i]) {
 					ctx->id = i;
 					table->table[i] = ctx;
-					rcu_read_unlock();
 					spin_unlock(&mm->ioctx_lock);
 
 					/* While kioctx setup is in progress,
@@ -579,8 +577,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -591,8 +587,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -739,12 +734,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
+	table = rcu_dereference_raw(mm->ioctx_table);
 	WARN_ON(ctx != table->table[ctx->id]);
 	table->table[ctx->id] = NULL;
-	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
 	/* percpu_ref_kill() will do the necessary call_rcu() */
@@ -793,40 +785,35 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
+		kill_ioctx(mm, ctx, &requests_done);
 
-		kill_ioctx(mm, ctx, NULL);
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -834,10 +821,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -846,7 +831,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -855,10 +839,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -878,10 +860,71 @@ static bool get_reqs_available(struct kioctx *ctx)
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
-	preempt_enable();
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring. This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case). It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid. The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock. Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
 *	Allocate a slot for an aio request.
 * Returns NULL if no requests are free.
@@ -890,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -950,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx *ctx = iocb->ki_ctx;
 	struct aio_ring *ring;
 	struct io_event *ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -1012,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1030,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1047,7 +1096,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_events
+/* aio_read_events_ring
 *	Pull an event off of the ioctx's event ring.  Returns the number of
 *	events fetched
 */
@@ -1067,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
@@ -1270,12 +1325,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -1299,9 +1354,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 }
 
 /*
- * aio_setup_iocb:
- *	Performs the initial checks and aio retry method
- *	setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
 */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
@@ -1313,7 +1367,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
-	struct iovec inline_vec, *iovec = &inline_vec;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 
 	switch (opcode) {
@@ -1348,7 +1402,7 @@ rw_common:
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
-			if (iovec != &inline_vec)
+			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
@@ -1395,7 +1449,7 @@ rw_common:
 		return -EINVAL;
 	}
 
-	if (iovec != &inline_vec)
+	if (iovec != inline_vecs)
 		kfree(iovec);
 
 	if (ret != -EIOCBQUEUED) {
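
Note on the new reqs_available accounting: the standalone C sketch below (not part of the patch) illustrates the ring-occupancy arithmetic that refill_reqs_available() performs in the hunks above — clamp head, measure how many events still sit between head and tail, and hand back only the completed slots that are no longer visible in the ring. The helper names events_in_ring() and ring_slots_completed() and the sample numbers are made up for illustration; only the head/tail math mirrors the patch.

/*
 * Illustrative userspace sketch, not kernel code: the circular-ring
 * occupancy math behind refill_reqs_available(). Helper names are
 * hypothetical.
 */
#include <stdio.h>

/* Events currently occupying a ring of nr_events slots. */
static unsigned events_in_ring(unsigned head, unsigned tail, unsigned nr_events)
{
	head %= nr_events;	/* clamp head: userland can write it */
	if (head <= tail)
		return tail - head;
	return nr_events - (head - tail);
}

/* Of 'completed' finished requests, how many slots can be released now. */
static unsigned ring_slots_completed(unsigned head, unsigned tail,
				     unsigned nr_events, unsigned completed)
{
	unsigned in_ring = events_in_ring(head, tail, nr_events);

	return (in_ring < completed) ? completed - in_ring : 0;
}

int main(void)
{
	/* 128-slot ring, tail has wrapped past head: 18 events in the ring,
	 * 30 completions recorded, so 12 slots can be handed back. */
	printf("%u\n", ring_slots_completed(120, 10, 128, 30));	/* 12 */
	/* Nothing reaped yet: every completion still occupies a ring slot. */
	printf("%u\n", ring_slots_completed(0, 30, 128, 30));	/* 0 */
	return 0;
}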