author      Tejun Heo <tj@kernel.org>    2014-09-24 13:00:21 -0400
committer   Tejun Heo <tj@kernel.org>    2014-09-24 13:00:21 -0400
commit      d06efebf0c37d438fcf07057be00dd40fcfce08d
tree        31a0786d132aadf4cbb9725f3f444ef6e1052128 /fs/aio.c
parent      bb2e226b3bef596dd56be97df655d857b4603923
parent      0a30288da1aec914e158c2d7a3482a85f632750f
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into for-3.18
This is to receive 0a30288da1ae ("blk-mq, percpu_ref: implement a
kludge for SCSI blk-mq stall during probe") which implements
__percpu_ref_kill_expedited() to work around SCSI blk-mq stall. The
commit will be reverted and patches implementing a proper fix will be added.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/aio.c')
-rw-r--r--  fs/aio.c  174
1 file changed, 114 insertions(+), 60 deletions(-)
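Before the diff itself: the fs/aio.c changes below replace the per-completion put_reqs_available() call with batched accounting. aio_complete() now counts completed events, and free completion-ring slots are reclaimed from the distance between the ring's head and tail (the new refill_reqs_available() helper). As a rough, standalone illustration of that wraparound arithmetic only — a userspace sketch, not kernel code, with a made-up ring size and example values — the computation looks like this:

#include <assert.h>
#include <stdio.h>

/*
 * Illustrative model of the head/tail accounting added to fs/aio.c in
 * refill_reqs_available().  Hypothetical values; the real code operates
 * on struct kioctx while holding ctx->completion_lock.
 */
static unsigned events_in_ring(unsigned head, unsigned tail, unsigned nr_events)
{
	head %= nr_events;			/* clamp: userland can scribble on head */
	if (head <= tail)
		return tail - head;		/* no wraparound */
	return nr_events - (head - tail);	/* tail has wrapped past the end */
}

int main(void)
{
	unsigned nr_events = 128;	/* hypothetical ring size */
	unsigned completed_events = 10;	/* completions not yet turned into free slots */

	/* Example: head has nearly caught up with tail across the wrap point. */
	unsigned in_ring = events_in_ring(120, 4, nr_events);	/* = 12 */

	/* Only completions that are no longer sitting in the ring can be reclaimed. */
	unsigned reclaimable = completed_events > in_ring ?
			       completed_events - in_ring : 0;

	printf("events in ring: %u, reclaimable slots: %u\n", in_ring, reclaimable);
	assert(in_ring == 12);
	return 0;
}

In the patch proper, the reclaimable count is handed back to put_reqs_available() under ctx->completion_lock, either opportunistically from aio_complete() or from user_refill_reqs_available() when aio_get_req() finds no free slots.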
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -192,7 +193,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
@@ -202,7 +202,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
@@ -556,8 +556,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -565,7 +564,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				if (!table->table[i]) {
 					ctx->id = i;
 					table->table[i] = ctx;
-					rcu_read_unlock();
 					spin_unlock(&mm->ioctx_lock);
 
 					/* While kioctx setup is in progress,
@@ -579,8 +577,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -591,8 +587,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -739,12 +734,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
+	table = rcu_dereference_raw(mm->ioctx_table);
 	WARN_ON(ctx != table->table[ctx->id]);
 	table->table[ctx->id] = NULL;
-	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
 	/* percpu_ref_kill() will do the necessary call_rcu() */
@@ -793,40 +785,35 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
		 */
 		ctx->mmap_size = 0;
+		kill_ioctx(mm, ctx, &requests_done);
 
-		kill_ioctx(mm, ctx, NULL);
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -834,10 +821,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -846,7 +831,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -855,10 +839,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -878,10 +860,71 @@ static bool get_reqs_available(struct kioctx *ctx)
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
-	preempt_enable();
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
 *	Allocate a slot for an aio request.
 * Returns NULL if no requests are free.
@@ -890,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -950,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx *ctx = iocb->ki_ctx;
 	struct aio_ring *ring;
 	struct io_event *ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -1012,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1030,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1047,7 +1096,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_events
+/* aio_read_events_ring
 *	Pull an event off of the ioctx's event ring.  Returns the number of
 *	events fetched
 */
@@ -1067,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
@@ -1270,12 +1325,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -1299,9 +1354,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 }
 
 /*
- * aio_setup_iocb:
- *	Performs the initial checks and aio retry method
- *	setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
 */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
@@ -1313,7 +1367,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
-	struct iovec inline_vec, *iovec = &inline_vec;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 
 	switch (opcode) {
@@ -1348,7 +1402,7 @@ rw_common:
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
-			if (iovec != &inline_vec)
+			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
@@ -1395,7 +1449,7 @@ rw_common:
 		return -EINVAL;
 	}
 
-	if (iovec != &inline_vec)
+	if (iovec != inline_vecs)
 		kfree(iovec);
 
 	if (ret != -EIOCBQUEUED) {