author		Kent Overstreet <koverstreet@google.com>	2013-05-07 19:18:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-07 21:38:28 -0400
commit		a31ad380bed817aa25f8830ad23e1a0480fef797 (patch)
tree		8cec21e08a224700e0af469aa303458f4fdfd595 /fs/aio.c
parent		774a08b3548f3f4f36c1a4f2a29a1b3710a2c939 (diff)
aio: make aio_read_evt() more efficient, convert to hrtimers
Previously, aio_read_evt() pulled a single completion off the
ringbuffer at a time, locking and unlocking each time. Change it to
pull off as many events as it can at a time, and copy them directly to
userspace.
This also fixes a bug where if copying the event to userspace failed,
we'd lose the event.
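
To illustrate the shape of the new read loop, here is a simplified userspace mock (not the kernel code; struct ring, copy_out() and read_events_batch() are invented for this sketch). Events are copied out in batches, and the head index is only advanced once the copy has succeeded, so a failed copy no longer discards an event:

/*
 * Userspace mock of the batched read: only advance the ring head after
 * the copy out of the ring has succeeded.
 */
#include <stdio.h>
#include <string.h>

struct event { unsigned long data; };

struct ring {
	struct event ev[8];
	unsigned head, tail, nr;
};

/* Stand-in for copy_to_user(); returns 0 on success. */
static int copy_out(struct event *dst, const struct event *src, unsigned n)
{
	memcpy(dst, src, n * sizeof(*src));
	return 0;
}

static long read_events_batch(struct ring *r, struct event *out, long nr)
{
	long ret = 0;

	while (ret < nr && r->head != r->tail) {
		/* contiguous events available before the ring wraps */
		unsigned avail = (r->head <= r->tail ? r->tail : r->nr) - r->head;

		if (avail > nr - ret)
			avail = nr - ret;

		if (copy_out(out + ret, &r->ev[r->head], avail))
			return ret ? ret : -1;	/* head untouched: nothing lost */

		ret += avail;			/* copy succeeded: consume events */
		r->head = (r->head + avail) % r->nr;
	}
	return ret;
}

int main(void)
{
	struct ring r = { .nr = 8, .head = 6, .tail = 3 };
	struct event out[8];

	for (unsigned i = 0; i < 8; i++)
		r.ev[i].data = i;

	/* Wraps around: copies events 6, 7, then 0, 1, 2 in two batches. */
	printf("got %ld events\n", read_events_batch(&r, out, 8));
	return 0;
}

The real kernel loop additionally clamps each batch to a single ring page before calling copy_to_user().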
Also convert it to wait_event_interruptible_hrtimeout(), which
simplifies it quite a bit.
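
For context, this path backs io_getevents(2). A minimal caller, sketched with libaio (link with -laio; the file name, buffer size, and timeout below are arbitrary), exercises the min_nr/nr batching and the relative timeout that is now armed as an hrtimer:

#define _GNU_SOURCE
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event events[8];
	struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };
	void *buf;
	int fd, ret;

	/* O_DIRECT wants an aligned buffer; 4096 covers common block sizes. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	fd = open("testfile", O_RDONLY | O_DIRECT);
	if (fd < 0 || io_setup(8, &ctx) != 0)
		return 1;

	io_prep_pread(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/* Block for at least 1 and at most 8 completions, or 1 second. */
	ret = io_getevents(ctx, 1, 8, events, &timeout);
	printf("io_getevents returned %d\n", ret);

	io_destroy(ctx);
	return 0;
}

In the new code the user's timespec is converted with timespec_to_ktime() rather than timespec_to_jiffies(), so the wait is no longer rounded to jiffies granularity.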
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/aio.c')
-rw-r--r--	fs/aio.c	240
1 file changed, 90 insertions(+), 150 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -63,7 +63,7 @@ struct aio_ring_info {
 	unsigned long		mmap_size;
 
 	struct page		**ring_pages;
-	spinlock_t		ring_lock;
+	struct mutex		ring_lock;
 	long			nr_pages;
 
 	unsigned		nr, tail;
@@ -344,7 +344,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	atomic_set(&ctx->users, 2);
 	atomic_set(&ctx->dead, 0);
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
+	mutex_init(&ctx->ring_info.ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
@@ -748,187 +748,127 @@ put_rq:
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_evt
+/* aio_read_events
  *	Pull an event off of the ioctx's event ring.  Returns the number of
- *	events fetched (0 or 1 ;-)
- *	FIXME: make this use cmpxchg.
- *	TODO: make the ringbuffer user mmap()able (requires FIXME).
+ *	events fetched
  */
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+				 struct io_event __user *event, long nr)
 {
-	struct aio_ring_info *info = &ioctx->ring_info;
+	struct aio_ring_info *info = &ctx->ring_info;
 	struct aio_ring *ring;
-	unsigned long head;
-	int ret = 0;
+	unsigned head, pos;
+	long ret = 0;
+	int copy_ret;
+
+	mutex_lock(&info->ring_lock);
 
 	ring = kmap_atomic(info->ring_pages[0]);
-	pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr);
+	head = ring->head;
+	kunmap_atomic(ring);
+
+	pr_debug("h%u t%u m%u\n", head, info->tail, info->nr);
 
-	if (ring->head == ring->tail)
+	if (head == info->tail)
 		goto out;
 
-	spin_lock(&info->ring_lock);
-
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
-		put_aio_ring_event(evp);
+	while (ret < nr) {
+		long avail;
+		struct io_event *ev;
+		struct page *page;
+
+		avail = (head <= info->tail ? info->tail : info->nr) - head;
+		if (head == info->tail)
+			break;
+
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+			      ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+		pos = head + AIO_EVENTS_OFFSET;
+		page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+		pos %= AIO_EVENTS_PER_PAGE;
+
+		ev = kmap(page);
+		copy_ret = copy_to_user(event + ret, ev + pos,
+					sizeof(*ev) * avail);
+		kunmap(page);
+
+		if (unlikely(copy_ret)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret += avail;
+		head += avail;
+		head %= info->nr;
 	}
-	spin_unlock(&info->ring_lock);
 
-out:
+	ring = kmap_atomic(info->ring_pages[0]);
+	ring->head = head;
 	kunmap_atomic(ring);
-	pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail);
+
+	pr_debug("%li h%u t%u\n", ret, head, info->tail);
+out:
+	mutex_unlock(&info->ring_lock);
+
 	return ret;
 }
 
-struct aio_timeout {
-	struct timer_list	timer;
-	int			timed_out;
-	struct task_struct	*p;
-};
-
-static void timeout_func(unsigned long data)
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event, long *i)
 {
-	struct aio_timeout *to = (struct aio_timeout *)data;
+	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
 
-	to->timed_out = 1;
-	wake_up_process(to->p);
-}
+	if (ret > 0)
+		*i += ret;
 
-static inline void init_timeout(struct aio_timeout *to)
-{
-	setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
-	to->timed_out = 0;
-	to->p = current;
-}
+	if (unlikely(atomic_read(&ctx->dead)))
+		ret = -EINVAL;
 
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
-			       const struct timespec *ts)
-{
-	to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
-	if (time_after(to->timer.expires, jiffies))
-		add_timer(&to->timer);
-	else
-		to->timed_out = 1;
-}
+	if (!*i)
+		*i = ret;
 
-static inline void clear_timeout(struct aio_timeout *to)
-{
-	del_singleshot_timer_sync(&to->timer);
+	return ret < 0 || *i >= min_nr;
 }
 
-static int read_events(struct kioctx *ctx,
-			long min_nr, long nr,
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
 {
-	long			start_jiffies = jiffies;
-	struct task_struct	*tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	int			ret;
-	int			i = 0;
-	struct io_event		ent;
-	struct aio_timeout	to;
-
-	/* needed to zero any padding within an entry (there shouldn't be
-	 * any, but C is fun!
-	 */
-	memset(&ent, 0, sizeof(ent));
-	ret = 0;
-	while (likely(i < nr)) {
-		ret = aio_read_evt(ctx, &ent);
-		if (unlikely(ret <= 0))
-			break;
-
-		pr_debug("%Lx %Lx %Lx %Lx\n",
-			 ent.data, ent.obj, ent.res, ent.res2);
-
-		/* Could we split the check in two? */
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
-		ret = 0;
-
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
-
-	if (min_nr <= i)
-		return i;
-	if (ret)
-		return ret;
-
-	/* End fast path */
+	ktime_t until = { .tv64 = KTIME_MAX };
+	long ret = 0;
 
-	init_timeout(&to);
 	if (timeout) {
 		struct timespec	ts;
-		ret = -EFAULT;
+
 		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-			goto out;
+			return -EFAULT;
 
-		set_timeout(start_jiffies, &to, &ts);
+		until = timespec_to_ktime(ts);
 	}
 
-	while (likely(i < nr)) {
-		add_wait_queue_exclusive(&ctx->wait, &wait);
-		do {
-			set_task_state(tsk, TASK_INTERRUPTIBLE);
-			ret = aio_read_evt(ctx, &ent);
-			if (ret)
-				break;
-			if (min_nr <= i)
-				break;
-			if (unlikely(atomic_read(&ctx->dead))) {
-				ret = -EINVAL;
-				break;
-			}
-			if (to.timed_out)	/* Only check after read evt */
-				break;
-			/* Try to only show up in io wait if there are ops
-			 * in flight */
-			if (atomic_read(&ctx->reqs_active))
-				io_schedule();
-			else
-				schedule();
-			if (signal_pending(tsk)) {
-				ret = -EINTR;
-				break;
-			}
-			/*ret = aio_read_evt(ctx, &ent);*/
-		} while (1) ;
-
-		set_task_state(tsk, TASK_RUNNING);
-		remove_wait_queue(&ctx->wait, &wait);
-
-		if (unlikely(ret <= 0))
-			break;
-
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
+	/*
+	 * Note that aio_read_events() is being called as the conditional - i.e.
+	 * we're calling it after prepare_to_wait() has set task state to
+	 * TASK_INTERRUPTIBLE.
+	 *
+	 * But aio_read_events() can block, and if it blocks it's going to flip
+	 * the task state back to TASK_RUNNING.
+	 *
+	 * This should be ok, provided it doesn't flip the state back to
+	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
+	 * will only happen if the mutex_lock() call blocks, and we then find
+	 * the ringbuffer empty. So in practice we should be ok, but it's
+	 * something to be aware of when touching this code.
+	 */
+	wait_event_interruptible_hrtimeout(ctx->wait,
+		aio_read_events(ctx, min_nr, nr, event, &ret), until);
 
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
 
-	if (timeout)
-		clear_timeout(&to);
-out:
-	destroy_timer_on_stack(&to.timer);
-	return i ? i : ret;
+	return ret;
 }
 
 /* sys_io_setup: