diff options
Diffstat (limited to 'fs/aio.c')
-rw-r--r-- | fs/aio.c | 155 |
1 files changed, 74 insertions, 81 deletions
@@ -58,18 +58,6 @@ struct aio_ring { | |||
58 | }; /* 128 bytes + ring size */ | 58 | }; /* 128 bytes + ring size */ |
59 | 59 | ||
60 | #define AIO_RING_PAGES 8 | 60 | #define AIO_RING_PAGES 8 |
61 | struct aio_ring_info { | ||
62 | unsigned long mmap_base; | ||
63 | unsigned long mmap_size; | ||
64 | |||
65 | struct page **ring_pages; | ||
66 | struct mutex ring_lock; | ||
67 | long nr_pages; | ||
68 | |||
69 | unsigned nr, tail; | ||
70 | |||
71 | struct page *internal_pages[AIO_RING_PAGES]; | ||
72 | }; | ||
73 | 61 | ||
74 | struct kioctx { | 62 | struct kioctx { |
75 | atomic_t users; | 63 | atomic_t users; |
@@ -90,14 +78,30 @@ struct kioctx { | |||
90 | * This is what userspace passed to io_setup(), it's not used for | 78 | * This is what userspace passed to io_setup(), it's not used for |
91 | * anything but counting against the global max_reqs quota. | 79 | * anything but counting against the global max_reqs quota. |
92 | * | 80 | * |
93 | * The real limit is ring->nr - 1, which will be larger (see | 81 | * The real limit is nr_events - 1, which will be larger (see |
94 | * aio_setup_ring()) | 82 | * aio_setup_ring()) |
95 | */ | 83 | */ |
96 | unsigned max_reqs; | 84 | unsigned max_reqs; |
97 | 85 | ||
98 | struct aio_ring_info ring_info; | 86 | /* Size of ringbuffer, in units of struct io_event */ |
87 | unsigned nr_events; | ||
99 | 88 | ||
100 | spinlock_t completion_lock; | 89 | unsigned long mmap_base; |
90 | unsigned long mmap_size; | ||
91 | |||
92 | struct page **ring_pages; | ||
93 | long nr_pages; | ||
94 | |||
95 | struct { | ||
96 | struct mutex ring_lock; | ||
97 | } ____cacheline_aligned; | ||
98 | |||
99 | struct { | ||
100 | unsigned tail; | ||
101 | spinlock_t completion_lock; | ||
102 | } ____cacheline_aligned; | ||
103 | |||
104 | struct page *internal_pages[AIO_RING_PAGES]; | ||
101 | 105 | ||
102 | struct rcu_head rcu_head; | 106 | struct rcu_head rcu_head; |
103 | struct work_struct rcu_work; | 107 | struct work_struct rcu_work; |
@@ -129,26 +133,21 @@ __initcall(aio_setup); | |||
129 | 133 | ||
130 | static void aio_free_ring(struct kioctx *ctx) | 134 | static void aio_free_ring(struct kioctx *ctx) |
131 | { | 135 | { |
132 | struct aio_ring_info *info = &ctx->ring_info; | ||
133 | long i; | 136 | long i; |
134 | 137 | ||
135 | for (i=0; i<info->nr_pages; i++) | 138 | for (i = 0; i < ctx->nr_pages; i++) |
136 | put_page(info->ring_pages[i]); | 139 | put_page(ctx->ring_pages[i]); |
137 | 140 | ||
138 | if (info->mmap_size) { | 141 | if (ctx->mmap_size) |
139 | vm_munmap(info->mmap_base, info->mmap_size); | 142 | vm_munmap(ctx->mmap_base, ctx->mmap_size); |
140 | } | ||
141 | 143 | ||
142 | if (info->ring_pages && info->ring_pages != info->internal_pages) | 144 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) |
143 | kfree(info->ring_pages); | 145 | kfree(ctx->ring_pages); |
144 | info->ring_pages = NULL; | ||
145 | info->nr = 0; | ||
146 | } | 146 | } |
147 | 147 | ||
148 | static int aio_setup_ring(struct kioctx *ctx) | 148 | static int aio_setup_ring(struct kioctx *ctx) |
149 | { | 149 | { |
150 | struct aio_ring *ring; | 150 | struct aio_ring *ring; |
151 | struct aio_ring_info *info = &ctx->ring_info; | ||
152 | unsigned nr_events = ctx->max_reqs; | 151 | unsigned nr_events = ctx->max_reqs; |
153 | struct mm_struct *mm = current->mm; | 152 | struct mm_struct *mm = current->mm; |
154 | unsigned long size, populate; | 153 | unsigned long size, populate; |
@@ -166,45 +165,44 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
166 | 165 | ||
167 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 166 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); |
168 | 167 | ||
169 | info->nr = 0; | 168 | ctx->nr_events = 0; |
170 | info->ring_pages = info->internal_pages; | 169 | ctx->ring_pages = ctx->internal_pages; |
171 | if (nr_pages > AIO_RING_PAGES) { | 170 | if (nr_pages > AIO_RING_PAGES) { |
172 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 171 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), |
173 | if (!info->ring_pages) | 172 | GFP_KERNEL); |
173 | if (!ctx->ring_pages) | ||
174 | return -ENOMEM; | 174 | return -ENOMEM; |
175 | } | 175 | } |
176 | 176 | ||
177 | info->mmap_size = nr_pages * PAGE_SIZE; | 177 | ctx->mmap_size = nr_pages * PAGE_SIZE; |
178 | pr_debug("attempting mmap of %lu bytes\n", info->mmap_size); | 178 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); |
179 | down_write(&mm->mmap_sem); | 179 | down_write(&mm->mmap_sem); |
180 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, | 180 | ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, |
181 | PROT_READ|PROT_WRITE, | 181 | PROT_READ|PROT_WRITE, |
182 | MAP_ANONYMOUS|MAP_PRIVATE, 0, | 182 | MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); |
183 | &populate); | 183 | if (IS_ERR((void *)ctx->mmap_base)) { |
184 | if (IS_ERR((void *)info->mmap_base)) { | ||
185 | up_write(&mm->mmap_sem); | 184 | up_write(&mm->mmap_sem); |
186 | info->mmap_size = 0; | 185 | ctx->mmap_size = 0; |
187 | aio_free_ring(ctx); | 186 | aio_free_ring(ctx); |
188 | return -EAGAIN; | 187 | return -EAGAIN; |
189 | } | 188 | } |
190 | 189 | ||
191 | pr_debug("mmap address: 0x%08lx\n", info->mmap_base); | 190 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); |
192 | info->nr_pages = get_user_pages(current, mm, info->mmap_base, nr_pages, | 191 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, |
193 | 1, 0, info->ring_pages, NULL); | 192 | 1, 0, ctx->ring_pages, NULL); |
194 | up_write(&mm->mmap_sem); | 193 | up_write(&mm->mmap_sem); |
195 | 194 | ||
196 | if (unlikely(info->nr_pages != nr_pages)) { | 195 | if (unlikely(ctx->nr_pages != nr_pages)) { |
197 | aio_free_ring(ctx); | 196 | aio_free_ring(ctx); |
198 | return -EAGAIN; | 197 | return -EAGAIN; |
199 | } | 198 | } |
200 | if (populate) | 199 | if (populate) |
201 | mm_populate(info->mmap_base, populate); | 200 | mm_populate(ctx->mmap_base, populate); |
202 | 201 | ||
203 | ctx->user_id = info->mmap_base; | 202 | ctx->user_id = ctx->mmap_base; |
203 | ctx->nr_events = nr_events; /* trusted copy */ | ||
204 | 204 | ||
205 | info->nr = nr_events; /* trusted copy */ | 205 | ring = kmap_atomic(ctx->ring_pages[0]); |
206 | |||
207 | ring = kmap_atomic(info->ring_pages[0]); | ||
208 | ring->nr = nr_events; /* user copy */ | 206 | ring->nr = nr_events; /* user copy */ |
209 | ring->id = ctx->user_id; | 207 | ring->id = ctx->user_id; |
210 | ring->head = ring->tail = 0; | 208 | ring->head = ring->tail = 0; |
@@ -213,7 +211,7 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
213 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; | 211 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; |
214 | ring->header_length = sizeof(struct aio_ring); | 212 | ring->header_length = sizeof(struct aio_ring); |
215 | kunmap_atomic(ring); | 213 | kunmap_atomic(ring); |
216 | flush_dcache_page(info->ring_pages[0]); | 214 | flush_dcache_page(ctx->ring_pages[0]); |
217 | 215 | ||
218 | return 0; | 216 | return 0; |
219 | } | 217 | } |
@@ -284,7 +282,6 @@ static void free_ioctx_rcu(struct rcu_head *head) | |||
284 | */ | 282 | */ |
285 | static void free_ioctx(struct kioctx *ctx) | 283 | static void free_ioctx(struct kioctx *ctx) |
286 | { | 284 | { |
287 | struct aio_ring_info *info = &ctx->ring_info; | ||
288 | struct aio_ring *ring; | 285 | struct aio_ring *ring; |
289 | struct io_event res; | 286 | struct io_event res; |
290 | struct kiocb *req; | 287 | struct kiocb *req; |
@@ -302,18 +299,18 @@ static void free_ioctx(struct kioctx *ctx) | |||
302 | 299 | ||
303 | spin_unlock_irq(&ctx->ctx_lock); | 300 | spin_unlock_irq(&ctx->ctx_lock); |
304 | 301 | ||
305 | ring = kmap_atomic(info->ring_pages[0]); | 302 | ring = kmap_atomic(ctx->ring_pages[0]); |
306 | head = ring->head; | 303 | head = ring->head; |
307 | kunmap_atomic(ring); | 304 | kunmap_atomic(ring); |
308 | 305 | ||
309 | while (atomic_read(&ctx->reqs_active) > 0) { | 306 | while (atomic_read(&ctx->reqs_active) > 0) { |
310 | wait_event(ctx->wait, head != info->tail); | 307 | wait_event(ctx->wait, head != ctx->tail); |
311 | 308 | ||
312 | avail = (head <= info->tail ? info->tail : info->nr) - head; | 309 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; |
313 | 310 | ||
314 | atomic_sub(avail, &ctx->reqs_active); | 311 | atomic_sub(avail, &ctx->reqs_active); |
315 | head += avail; | 312 | head += avail; |
316 | head %= info->nr; | 313 | head %= ctx->nr_events; |
317 | } | 314 | } |
318 | 315 | ||
319 | WARN_ON(atomic_read(&ctx->reqs_active) < 0); | 316 | WARN_ON(atomic_read(&ctx->reqs_active) < 0); |
@@ -372,7 +369,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
372 | atomic_set(&ctx->dead, 0); | 369 | atomic_set(&ctx->dead, 0); |
373 | spin_lock_init(&ctx->ctx_lock); | 370 | spin_lock_init(&ctx->ctx_lock); |
374 | spin_lock_init(&ctx->completion_lock); | 371 | spin_lock_init(&ctx->completion_lock); |
375 | mutex_init(&ctx->ring_info.ring_lock); | 372 | mutex_init(&ctx->ring_lock); |
376 | init_waitqueue_head(&ctx->wait); | 373 | init_waitqueue_head(&ctx->wait); |
377 | 374 | ||
378 | INIT_LIST_HEAD(&ctx->active_reqs); | 375 | INIT_LIST_HEAD(&ctx->active_reqs); |
@@ -396,7 +393,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
396 | spin_unlock(&mm->ioctx_lock); | 393 | spin_unlock(&mm->ioctx_lock); |
397 | 394 | ||
398 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 395 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
399 | ctx, ctx->user_id, mm, ctx->ring_info.nr); | 396 | ctx, ctx->user_id, mm, ctx->nr_events); |
400 | return ctx; | 397 | return ctx; |
401 | 398 | ||
402 | out_cleanup: | 399 | out_cleanup: |
@@ -491,7 +488,7 @@ void exit_aio(struct mm_struct *mm) | |||
491 | * just set it to 0; aio_free_ring() is the only | 488 | * just set it to 0; aio_free_ring() is the only |
492 | * place that uses ->mmap_size, so it's safe. | 489 | * place that uses ->mmap_size, so it's safe. |
493 | */ | 490 | */ |
494 | ctx->ring_info.mmap_size = 0; | 491 | ctx->mmap_size = 0; |
495 | 492 | ||
496 | if (!atomic_xchg(&ctx->dead, 1)) { | 493 | if (!atomic_xchg(&ctx->dead, 1)) { |
497 | hlist_del_rcu(&ctx->list); | 494 | hlist_del_rcu(&ctx->list); |
@@ -514,10 +511,10 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) | |||
514 | { | 511 | { |
515 | struct kiocb *req; | 512 | struct kiocb *req; |
516 | 513 | ||
517 | if (atomic_read(&ctx->reqs_active) >= ctx->ring_info.nr) | 514 | if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) |
518 | return NULL; | 515 | return NULL; |
519 | 516 | ||
520 | if (atomic_inc_return(&ctx->reqs_active) > ctx->ring_info.nr - 1) | 517 | if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) |
521 | goto out_put; | 518 | goto out_put; |
522 | 519 | ||
523 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); | 520 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); |
@@ -578,7 +575,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) | |||
578 | void aio_complete(struct kiocb *iocb, long res, long res2) | 575 | void aio_complete(struct kiocb *iocb, long res, long res2) |
579 | { | 576 | { |
580 | struct kioctx *ctx = iocb->ki_ctx; | 577 | struct kioctx *ctx = iocb->ki_ctx; |
581 | struct aio_ring_info *info; | ||
582 | struct aio_ring *ring; | 578 | struct aio_ring *ring; |
583 | struct io_event *ev_page, *event; | 579 | struct io_event *ev_page, *event; |
584 | unsigned long flags; | 580 | unsigned long flags; |
@@ -599,8 +595,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
599 | return; | 595 | return; |
600 | } | 596 | } |
601 | 597 | ||
602 | info = &ctx->ring_info; | ||
603 | |||
604 | /* | 598 | /* |
605 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we | 599 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we |
606 | * need to issue a wakeup after decrementing reqs_active. | 600 | * need to issue a wakeup after decrementing reqs_active. |
@@ -633,13 +627,13 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
633 | */ | 627 | */ |
634 | spin_lock_irqsave(&ctx->completion_lock, flags); | 628 | spin_lock_irqsave(&ctx->completion_lock, flags); |
635 | 629 | ||
636 | tail = info->tail; | 630 | tail = ctx->tail; |
637 | pos = tail + AIO_EVENTS_OFFSET; | 631 | pos = tail + AIO_EVENTS_OFFSET; |
638 | 632 | ||
639 | if (++tail >= info->nr) | 633 | if (++tail >= ctx->nr_events) |
640 | tail = 0; | 634 | tail = 0; |
641 | 635 | ||
642 | ev_page = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); | 636 | ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); |
643 | event = ev_page + pos % AIO_EVENTS_PER_PAGE; | 637 | event = ev_page + pos % AIO_EVENTS_PER_PAGE; |
644 | 638 | ||
645 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; | 639 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; |
@@ -648,7 +642,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
648 | event->res2 = res2; | 642 | event->res2 = res2; |
649 | 643 | ||
650 | kunmap_atomic(ev_page); | 644 | kunmap_atomic(ev_page); |
651 | flush_dcache_page(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); | 645 | flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); |
652 | 646 | ||
653 | pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", | 647 | pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", |
654 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | 648 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, |
@@ -659,12 +653,12 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
659 | */ | 653 | */ |
660 | smp_wmb(); /* make event visible before updating tail */ | 654 | smp_wmb(); /* make event visible before updating tail */ |
661 | 655 | ||
662 | info->tail = tail; | 656 | ctx->tail = tail; |
663 | 657 | ||
664 | ring = kmap_atomic(info->ring_pages[0]); | 658 | ring = kmap_atomic(ctx->ring_pages[0]); |
665 | ring->tail = tail; | 659 | ring->tail = tail; |
666 | kunmap_atomic(ring); | 660 | kunmap_atomic(ring); |
667 | flush_dcache_page(info->ring_pages[0]); | 661 | flush_dcache_page(ctx->ring_pages[0]); |
668 | 662 | ||
669 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | 663 | spin_unlock_irqrestore(&ctx->completion_lock, flags); |
670 | 664 | ||
@@ -704,21 +698,20 @@ EXPORT_SYMBOL(aio_complete); | |||
704 | static long aio_read_events_ring(struct kioctx *ctx, | 698 | static long aio_read_events_ring(struct kioctx *ctx, |
705 | struct io_event __user *event, long nr) | 699 | struct io_event __user *event, long nr) |
706 | { | 700 | { |
707 | struct aio_ring_info *info = &ctx->ring_info; | ||
708 | struct aio_ring *ring; | 701 | struct aio_ring *ring; |
709 | unsigned head, pos; | 702 | unsigned head, pos; |
710 | long ret = 0; | 703 | long ret = 0; |
711 | int copy_ret; | 704 | int copy_ret; |
712 | 705 | ||
713 | mutex_lock(&info->ring_lock); | 706 | mutex_lock(&ctx->ring_lock); |
714 | 707 | ||
715 | ring = kmap_atomic(info->ring_pages[0]); | 708 | ring = kmap_atomic(ctx->ring_pages[0]); |
716 | head = ring->head; | 709 | head = ring->head; |
717 | kunmap_atomic(ring); | 710 | kunmap_atomic(ring); |
718 | 711 | ||
719 | pr_debug("h%u t%u m%u\n", head, info->tail, info->nr); | 712 | pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); |
720 | 713 | ||
721 | if (head == info->tail) | 714 | if (head == ctx->tail) |
722 | goto out; | 715 | goto out; |
723 | 716 | ||
724 | while (ret < nr) { | 717 | while (ret < nr) { |
@@ -726,8 +719,8 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
726 | struct io_event *ev; | 719 | struct io_event *ev; |
727 | struct page *page; | 720 | struct page *page; |
728 | 721 | ||
729 | avail = (head <= info->tail ? info->tail : info->nr) - head; | 722 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; |
730 | if (head == info->tail) | 723 | if (head == ctx->tail) |
731 | break; | 724 | break; |
732 | 725 | ||
733 | avail = min(avail, nr - ret); | 726 | avail = min(avail, nr - ret); |
@@ -735,7 +728,7 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
735 | ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); | 728 | ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); |
736 | 729 | ||
737 | pos = head + AIO_EVENTS_OFFSET; | 730 | pos = head + AIO_EVENTS_OFFSET; |
738 | page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE]; | 731 | page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; |
739 | pos %= AIO_EVENTS_PER_PAGE; | 732 | pos %= AIO_EVENTS_PER_PAGE; |
740 | 733 | ||
741 | ev = kmap(page); | 734 | ev = kmap(page); |
@@ -750,19 +743,19 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
750 | 743 | ||
751 | ret += avail; | 744 | ret += avail; |
752 | head += avail; | 745 | head += avail; |
753 | head %= info->nr; | 746 | head %= ctx->nr_events; |
754 | } | 747 | } |
755 | 748 | ||
756 | ring = kmap_atomic(info->ring_pages[0]); | 749 | ring = kmap_atomic(ctx->ring_pages[0]); |
757 | ring->head = head; | 750 | ring->head = head; |
758 | kunmap_atomic(ring); | 751 | kunmap_atomic(ring); |
759 | flush_dcache_page(info->ring_pages[0]); | 752 | flush_dcache_page(ctx->ring_pages[0]); |
760 | 753 | ||
761 | pr_debug("%li h%u t%u\n", ret, head, info->tail); | 754 | pr_debug("%li h%u t%u\n", ret, head, ctx->tail); |
762 | 755 | ||
763 | atomic_sub(ret, &ctx->reqs_active); | 756 | atomic_sub(ret, &ctx->reqs_active); |
764 | out: | 757 | out: |
765 | mutex_unlock(&info->ring_lock); | 758 | mutex_unlock(&ctx->ring_lock); |
766 | 759 | ||
767 | return ret; | 760 | return ret; |
768 | } | 761 | } |