author    Benjamin LaHaise <bcrl@kvack.org>    2013-07-30 12:54:40 -0400
committer Benjamin LaHaise <bcrl@kvack.org>    2013-07-30 12:56:36 -0400
commit    db446a08c23d5475e6b08c87acca79ebb20f283c (patch)
tree      9410c14312ac57df04cdb6129c0c369de67bcfb4 /fs/aio.c
parent    4cd81c3dfc4a34e4a0b6fa577860077c8e5b13af (diff)
aio: convert the ioctx list to table lookup v3
On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote:
> On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> > When using a large number of threads performing AIO operations the
> > IOCTX list may get a significant number of entries which will cause
> > significant overhead. For example, when running this fio script:
> >
> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=512; thread; loops=100
> >
> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> > 30% CPU time spent by lookup_ioctx:
> >
> >  32.51%  [guest.kernel]  [g]  lookup_ioctx
> >   9.19%  [guest.kernel]  [g]  __lock_acquire.isra.28
> >   4.40%  [guest.kernel]  [g]  lock_release
> >   4.19%  [guest.kernel]  [g]  sched_clock_local
> >   3.86%  [guest.kernel]  [g]  local_clock
> >   3.68%  [guest.kernel]  [g]  native_sched_clock
> >   3.08%  [guest.kernel]  [g]  sched_clock_cpu
> >   2.64%  [guest.kernel]  [g]  lock_release_holdtime.part.11
> >   2.60%  [guest.kernel]  [g]  memcpy
> >   2.33%  [guest.kernel]  [g]  lock_acquired
> >   2.25%  [guest.kernel]  [g]  lock_acquire
> >   1.84%  [guest.kernel]  [g]  do_io_submit
> >
> > This patch converts the ioctx list to a radix tree. For a performance
> > comparison the above fio script was run on a 2-socket, 8-core
> > machine. These are the results (average and %rsd of 10 runs) for the
> > original list-based implementation and for the radix-tree-based
> > implementation:
> >
> > cores          1          2          4          8          16         32
> > list       109376 ms   69119 ms   35682 ms   22671 ms   19724 ms   16408 ms
> > %rsd         0.69%      1.15%      1.17%      1.21%      1.71%      1.43%
> > radix       73651 ms   41748 ms   23028 ms   16766 ms   15232 ms   13787 ms
> > %rsd         1.19%      0.98%      0.69%      1.13%      0.72%      0.75%
> > % of radix
> > relative    66.12%     65.59%     66.63%     72.31%     77.26%     83.66%
> > to list
> >
> > To consider the impact of the patch on the typical case of having
> > only one ctx per process the following fio script was run:
> >
> > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=1; thread; loops=100
> >
> > on the same system and the results are the following:
> >
> > list       58892 ms
> > %rsd        0.91%
> > radix      59404 ms
> > %rsd        0.81%
> > % of radix
> > relative  100.87%
> > to list
>
> So, I was just doing some benchmarking/profiling to get ready to send
> out the aio patches I've got for 3.11 - and it looks like your patch is
> causing a ~1.5% throughput regression in my testing :/
... <snip>

I've got an alternate approach for fixing this wart in lookup_ioctx()...
Instead of using an rbtree, just use the reserved id in the ring buffer
header to index an array pointing at the ioctx. It's not finished yet,
and it needs to be tidied up, but it is most of the way there.

		-ben
--
"Thought is the essence of where you are now."
--
kmo> And, a rework of Ben's code, but this was entirely his idea
kmo> -Kent

bcrl> And fix the code to use the right mm_struct in kill_ioctx(),
bcrl> actually free memory.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
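As a rough illustration of the lookup scheme described above, the following is a minimal
userspace sketch, not the kernel code: the names (ctx, ctx_table, slots, lookup) are invented
for the example, and the RCU protection, locking, and table growth that the real patch needs
are omitted. The context stores its slot index in the handle area the caller gets back, so a
lookup_ioctx-style lookup becomes a bounds-checked array index plus a check that the stored
handle still matches, instead of a walk over a per-mm list.

/*
 * Simplified, hypothetical model of table lookup by reserved id.
 * Standalone C99; compile with e.g. "cc -std=c99 sketch.c".
 */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	unsigned id;            /* slot index, as stored in the ring header */
	unsigned long user_id;  /* opaque handle handed back to the caller */
};

struct ctx_table {
	unsigned nr;            /* number of slots */
	struct ctx *slots[];    /* flexible array of context pointers */
};

/* O(1) lookup: bounds-check the id, then verify the handle still matches. */
static struct ctx *lookup(struct ctx_table *table, unsigned id,
                          unsigned long user_id)
{
	struct ctx *c;

	if (!table || id >= table->nr)
		return NULL;
	c = table->slots[id];
	if (c && c->user_id == user_id)
		return c;
	return NULL;
}

int main(void)
{
	struct ctx a = { .id = 0, .user_id = 0xdeadbeef };
	struct ctx_table *table;

	/* four slots, all initially empty (calloc zeroes the pointers) */
	table = calloc(1, sizeof(*table) + 4 * sizeof(struct ctx *));
	if (!table)
		return 1;
	table->nr = 4;
	table->slots[a.id] = &a;

	printf("hit:  %p\n", (void *)lookup(table, 0, 0xdeadbeef));
	printf("miss: %p\n", (void *)lookup(table, 3, 0xdeadbeef));
	free(table);
	return 0;
}

Run as above, the first call returns the registered context and the second returns a null
pointer, since slot 3 was never filled; the actual patch adds the RCU and locking needed to
make the same idea safe against concurrent io_destroy() and process exit.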
Diffstat (limited to 'fs/aio.c')
-rw-r--r--   fs/aio.c   136
1 file changed, 114 insertions(+), 22 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 945dd0d072f3..52f200ebef07 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -66,6 +66,12 @@ struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
+struct kioctx_table {
+	struct rcu_head	rcu;
+	unsigned	nr;
+	struct kioctx	*table[];
+};
+
 struct kioctx_cpu {
 	unsigned		reqs_available;
 };
@@ -74,9 +80,7 @@ struct kioctx {
 	struct percpu_ref	users;
 	atomic_t		dead;
 
-	/* This needs improving */
 	unsigned long		user_id;
-	struct hlist_node	list;
 
 	struct __percpu kioctx_cpu *cpu;
 
@@ -135,6 +139,8 @@ struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+
+	unsigned		id;
 };
 
 /*------ sysctl variables----*/
@@ -326,7 +332,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -462,6 +468,58 @@ static void free_ioctx_ref(struct percpu_ref *ref)
 	schedule_work(&ctx->free_work);
 }
 
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+	unsigned i, new_nr;
+	struct kioctx_table *table, *old;
+	struct aio_ring *ring;
+
+	spin_lock(&mm->ioctx_lock);
+	table = rcu_dereference(mm->ioctx_table);
+
+	while (1) {
+		if (table)
+			for (i = 0; i < table->nr; i++)
+				if (!table->table[i]) {
+					ctx->id = i;
+					table->table[i] = ctx;
+					spin_unlock(&mm->ioctx_lock);
+
+					ring = kmap_atomic(ctx->ring_pages[0]);
+					ring->id = ctx->id;
+					kunmap_atomic(ring);
+					return 0;
+				}
+
+		new_nr = (table ? table->nr : 1) * 4;
+
+		spin_unlock(&mm->ioctx_lock);
+
+		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+				new_nr, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table->nr = new_nr;
+
+		spin_lock(&mm->ioctx_lock);
+		old = rcu_dereference(mm->ioctx_table);
+
+		if (!old) {
+			rcu_assign_pointer(mm->ioctx_table, table);
+		} else if (table->nr > old->nr) {
+			memcpy(table->table, old->table,
+			       old->nr * sizeof(struct kioctx *));
+
+			rcu_assign_pointer(mm->ioctx_table, table);
+			kfree_rcu(old, rcu);
+		} else {
+			kfree(table);
+			table = old;
+		}
+	}
+}
+
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
  */
@@ -520,6 +578,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
 	BUG_ON(!ctx->req_batch);
 
+	err = ioctx_add_table(ctx, mm);
+	if (err)
+		goto out_cleanup_noerr;
+
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
 	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
@@ -532,17 +594,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
 
-	/* now link into global list. */
-	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
-	spin_unlock(&mm->ioctx_lock);
-
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
 out_cleanup:
 	err = -EAGAIN;
+out_cleanup_noerr:
 	aio_free_ring(ctx);
 out_freepcpu:
 	free_percpu(ctx->cpu);
@@ -561,10 +619,18 @@ out_freectx:
  * when the processes owning a context have all exited to encourage
  * the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
-		hlist_del_rcu(&ctx->list);
+		struct kioctx_table *table;
+
+		spin_lock(&mm->ioctx_lock);
+		table = rcu_dereference(mm->ioctx_table);
+
+		WARN_ON(ctx != table->table[ctx->id]);
+		table->table[ctx->id] = NULL;
+		spin_unlock(&mm->ioctx_lock);
+
 		/* percpu_ref_kill() will do the necessary call_rcu() */
 		wake_up_all(&ctx->wait);
 
@@ -613,10 +679,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
+	struct kioctx_table *table;
 	struct kioctx *ctx;
-	struct hlist_node *n;
+	unsigned i = 0;
+
+	while (1) {
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		do {
+			if (!table || i >= table->nr) {
+				rcu_read_unlock();
+				rcu_assign_pointer(mm->ioctx_table, NULL);
+				if (table)
+					kfree(table);
+				return;
+			}
+
+			ctx = table->table[i++];
+		} while (!ctx);
+
+		rcu_read_unlock();
 
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
 		/*
 		 * We don't need to bother with munmap() here -
 		 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -627,7 +711,7 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(ctx);
+		kill_ioctx(mm, ctx);
 	}
 }
 
@@ -710,19 +794,27 @@ static void kiocb_free(struct kiocb *req)
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	struct kioctx_table *table;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
 
 	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			percpu_ref_get(&ctx->users);
-			ret = ctx;
-			break;
-		}
-	}
+	if (!table || id >= table->nr)
+		goto out;
 
+	ctx = table->table[id];
+	if (ctx->user_id == ctx_id) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
+	}
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -998,7 +1090,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			kill_ioctx(ioctx);
+			kill_ioctx(current->mm, ioctx);
 		percpu_ref_put(&ioctx->users);
 	}
 
@@ -1016,7 +1108,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		kill_ioctx(ioctx);
+		kill_ioctx(current->mm, ioctx);
 		percpu_ref_put(&ioctx->users);
 		return 0;
 	}