author		Rik van Riel <riel@redhat.com>	2010-03-05 16:42:07 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-03-06 14:26:26 -0500
commit		5beb49305251e5669852ed541e8e2f2f7696c53e (patch)
tree		46457450a22f23938b24904aeba5d4ada2f53b20	/mm/mmap.c
parent		648bcc771145172a14bc35eeb849ed08f6aa4f1e (diff)
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking workloads.  Specifically, each anon_vma will be shared between the parent process and all its child processes.

In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes.  However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page.

This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock.  This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands.  Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up.

This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA.  At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated.  The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children.

This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system.  This reduces the average scanning cost in heavily forking workloads from O(N) to 2.

The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations.  This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures.  This in turn means error handling needs to be added to the calling functions.

A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock.  To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag.  This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag.

Some test results:

Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code.

With these changes, AIM7 hits the cross-over point around 29.7k users.  This happens with ~99% IO wait time, and there never seems to be any spike in system time.

The anon_vma lock contention appears to be resolved.

[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
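For orientation while reading the mm/mmap.c hunks below: the new linking scheme hangs off a small connector object, struct anon_vma_chain, which this patch adds in include/linux/rmap.h and which is therefore not visible in this file's diff.  A rough sketch of its shape (field names follow the patch; the comments here are paraphrased, not the original ones):

struct anon_vma_chain {
	struct vm_area_struct *vma;	/* the VMA this link belongs to */
	struct anon_vma *anon_vma;	/* the anon_vma it attaches the VMA to */
	struct list_head same_vma;	/* entry in vma->anon_vma_chain */
	struct list_head same_anon_vma;	/* entry in the anon_vma's list of VMAs */
};

Each VMA now carries a list of these (vma->anon_vma_chain, walked via same_vma, as in the mm_take_all_locks/mm_drop_all_locks hunks), and each anon_vma carries the mirror list, so one VMA can be associated with several anon_vmas and vice versa.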
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--	mm/mmap.c	138
1 file changed, 97 insertions, 41 deletions
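The hunks below also call anon_vma_clone() and anon_vma_merge(), both added to mm/rmap.c by this patch and therefore not shown in this diff.  A condensed sketch of the clone path, to show why vma_adjust(), __split_vma() and copy_vma() can now fail with -ENOMEM (helper and cache names follow the mm/rmap.c side of the patch as I read it; the body is simplified, not the verbatim implementation):

int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;

	/* Copy every anon_vma link of the source VMA onto the destination. */
	list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
		avc = kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
		if (!avc)
			goto enomem_failure;
		/* Hook avc into dst->anon_vma_chain and pavc->anon_vma's list. */
		anon_vma_chain_link(dst, avc, pavc->anon_vma);
	}
	return 0;

 enomem_failure:
	unlink_anon_vmas(dst);	/* undo the links added so far */
	return -ENOMEM;
}

Every link costs an allocation, which is what forces the error-handling changes visible throughout the diff.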
diff --git a/mm/mmap.c b/mm/mmap.c
index 31656147128e..6a0c15db7f60 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	__vma_link_list(mm, vma, prev, rb_parent);
 	__vma_link_rb(mm, vma, rb_link, rb_parent);
-	__anon_vma_link(vma);
 }
 
 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary. The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -542,6 +541,28 @@ again: remove_next = 1 + (end > next->vm_end);
 		}
 	}
 
+	/*
+	 * When changing only vma->vm_end, we don't really need anon_vma lock.
+	 */
+	if (vma->anon_vma && (insert || importer || start != vma->vm_start))
+		anon_vma = vma->anon_vma;
+	if (anon_vma) {
+		/*
+		 * Easily overlooked: when mprotect shifts the boundary,
+		 * make sure the expanding vma has anon_vma set if the
+		 * shrinking vma had, to cover any anon pages imported.
+		 */
+		if (importer && !importer->anon_vma) {
+			/* Block reverse map lookups until things are set up. */
+			importer->vm_flags |= VM_LOCK_RMAP;
+			if (anon_vma_clone(importer, vma)) {
+				importer->vm_flags &= ~VM_LOCK_RMAP;
+				return -ENOMEM;
+			}
+			importer->anon_vma = anon_vma;
+		}
+	}
+
 	if (file) {
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
@@ -567,25 +588,6 @@ again: remove_next = 1 + (end > next->vm_end);
 		}
 	}
 
-	/*
-	 * When changing only vma->vm_end, we don't really need
-	 * anon_vma lock.
-	 */
-	if (vma->anon_vma && (insert || importer || start != vma->vm_start))
-		anon_vma = vma->anon_vma;
-	if (anon_vma) {
-		spin_lock(&anon_vma->lock);
-		/*
-		 * Easily overlooked: when mprotect shifts the boundary,
-		 * make sure the expanding vma has anon_vma set if the
-		 * shrinking vma had, to cover any anon pages imported.
-		 */
-		if (importer && !importer->anon_vma) {
-			importer->anon_vma = anon_vma;
-			__anon_vma_link(importer);
-		}
-	}
-
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
 		vma_prio_tree_remove(vma, root);
@@ -616,8 +618,11 @@ again: remove_next = 1 + (end > next->vm_end);
 		__vma_unlink(mm, next, vma);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);
-		if (next->anon_vma)
-			__anon_vma_merge(vma, next);
+		/*
+		 * This VMA is now dead, no need for rmap to follow it.
+		 * Call anon_vma_merge below, outside of i_mmap_lock.
+		 */
+		next->vm_flags |= VM_LOCK_RMAP;
 	} else if (insert) {
 		/*
 		 * split_vma has split insert from vma, and needs
@@ -627,17 +632,25 @@ again: remove_next = 1 + (end > next->vm_end);
 		__insert_vm_struct(mm, insert);
 	}
 
-	if (anon_vma)
-		spin_unlock(&anon_vma->lock);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 
+	/*
+	 * The current VMA has been set up. It is now safe for the
+	 * rmap code to get from the pages to the ptes.
+	 */
+	if (anon_vma && importer)
+		importer->vm_flags &= ~VM_LOCK_RMAP;
+
 	if (remove_next) {
 		if (file) {
 			fput(file);
 			if (next->vm_flags & VM_EXECUTABLE)
 				removed_exe_file_vma(mm);
 		}
+		/* Protected by mmap_sem and VM_LOCK_RMAP. */
+		if (next->anon_vma)
+			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +666,8 @@ again: remove_next = 1 + (end > next->vm_end);
 	}
 
 	validate_mm(mm);
+
+	return 0;
 }
 
 /*
@@ -759,6 +774,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
+	int err;
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
@@ -792,11 +808,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			is_mergeable_anon_vma(prev->anon_vma,
 						next->anon_vma)) {
 			/* cases 1, 6 */
-			vma_adjust(prev, prev->vm_start,
+			err = vma_adjust(prev, prev->vm_start,
 				next->vm_end, prev->vm_pgoff, NULL);
 		} else			/* cases 2, 5, 7 */
-			vma_adjust(prev, prev->vm_start,
+			err = vma_adjust(prev, prev->vm_start,
 				end, prev->vm_pgoff, NULL);
+		if (err)
+			return NULL;
 		return prev;
 	}
 
@@ -808,11 +826,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			can_vma_merge_before(next, vm_flags,
 					anon_vma, file, pgoff+pglen)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
-			vma_adjust(prev, prev->vm_start,
+			err = vma_adjust(prev, prev->vm_start,
 					addr, prev->vm_pgoff, NULL);
 		else				/* cases 3, 8 */
-			vma_adjust(area, addr, next->vm_end,
+			err = vma_adjust(area, addr, next->vm_end,
 				next->vm_pgoff - pglen, NULL);
+		if (err)
+			return NULL;
 		return area;
 	}
 
@@ -1205,6 +1225,7 @@ munmap_back:
 	vma->vm_flags = vm_flags;
 	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 	vma->vm_pgoff = pgoff;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	if (file) {
 		error = -EINVAL;
@@ -1865,6 +1886,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 {
 	struct mempolicy *pol;
 	struct vm_area_struct *new;
+	int err = -ENOMEM;
 
 	if (is_vm_hugetlb_page(vma) && (addr &
 					~(huge_page_mask(hstate_vma(vma)))))
@@ -1872,11 +1894,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 
 	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
-		return -ENOMEM;
+		goto out_err;
 
 	/* most fields are the same, copy all, and then fixup */
 	*new = *vma;
 
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+
 	if (new_below)
 		new->vm_end = addr;
 	else {
@@ -1886,11 +1910,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 
 	pol = mpol_dup(vma_policy(vma));
 	if (IS_ERR(pol)) {
-		kmem_cache_free(vm_area_cachep, new);
-		return PTR_ERR(pol);
+		err = PTR_ERR(pol);
+		goto out_free_vma;
 	}
 	vma_set_policy(new, pol);
 
+	if (anon_vma_clone(new, vma))
+		goto out_free_mpol;
+
 	if (new->vm_file) {
 		get_file(new->vm_file);
 		if (vma->vm_flags & VM_EXECUTABLE)
@@ -1901,12 +1928,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 		new->vm_ops->open(new);
 
 	if (new_below)
-		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+		err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 			((addr - new->vm_start) >> PAGE_SHIFT), new);
 	else
-		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
-	return 0;
+	/* Success. */
+	if (!err)
+		return 0;
+
+	/* Clean everything up if vma_adjust failed. */
+	new->vm_ops->close(new);
+	if (new->vm_file) {
+		if (vma->vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(mm);
+		fput(new->vm_file);
+	}
+ out_free_mpol:
+	mpol_put(pol);
+ out_free_vma:
+	kmem_cache_free(vm_area_cachep, new);
+ out_err:
+	return err;
 }
 
 /*
@@ -2116,6 +2159,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 		return -ENOMEM;
 	}
 
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
@@ -2252,10 +2296,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (new_vma) {
 		*new_vma = *vma;
 		pol = mpol_dup(vma_policy(vma));
-		if (IS_ERR(pol)) {
-			kmem_cache_free(vm_area_cachep, new_vma);
-			return NULL;
-		}
+		if (IS_ERR(pol))
+			goto out_free_vma;
+		INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+		if (anon_vma_clone(new_vma, vma))
+			goto out_free_mempol;
 		vma_set_policy(new_vma, pol);
 		new_vma->vm_start = addr;
 		new_vma->vm_end = addr + len;
@@ -2271,6 +2316,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		}
 	}
 	return new_vma;
+
+ out_free_mempol:
+	mpol_put(pol);
+ out_free_vma:
+	kmem_cache_free(vm_area_cachep, new_vma);
+	return NULL;
 }
 
 /*
@@ -2348,6 +2399,7 @@ int install_special_mapping(struct mm_struct *mm,
 	if (unlikely(vma == NULL))
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
@@ -2448,6 +2500,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 int mm_take_all_locks(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
+	struct anon_vma_chain *avc;
 	int ret = -EINTR;
 
 	BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2465,7 +2518,8 @@ int mm_take_all_locks(struct mm_struct *mm)
 		if (signal_pending(current))
 			goto out_unlock;
 		if (vma->anon_vma)
-			vm_lock_anon_vma(mm, vma->anon_vma);
+			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+				vm_lock_anon_vma(mm, avc->anon_vma);
 	}
 
 	ret = 0;
@@ -2520,13 +2574,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
 void mm_drop_all_locks(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
+	struct anon_vma_chain *avc;
 
 	BUG_ON(down_read_trylock(&mm->mmap_sem));
 	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
 
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (vma->anon_vma)
-			vm_unlock_anon_vma(vma->anon_vma);
+			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+				vm_unlock_anon_vma(avc->anon_vma);
 		if (vma->vm_file && vma->vm_file->f_mapping)
 			vm_unlock_mapping(vma->vm_file->f_mapping);
 	}