aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author Rik van Riel <riel@redhat.com> 2010-03-05 16:42:07 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-03-06 14:26:26 -0500
commit 5beb49305251e5669852ed541e8e2f2f7696c53e (patch)
tree 46457450a22f23938b24904aeba5d4ada2f53b20 /mm
parent 648bcc771145172a14bc35eeb849ed08f6aa4f1e (diff)
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach into the tens of thousands. Real workloads are still a factor of 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parent's anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. 
To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time; there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel <riel@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/ksm.c 12
-rw-r--r-- mm/memory-failure.c 5
-rw-r--r-- mm/memory.c 4
-rw-r--r-- mm/mmap.c 138
-rw-r--r-- mm/mremap.c 7
-rw-r--r-- mm/nommu.c 2
-rw-r--r-- mm/rmap.c 156
7 files changed, 248 insertions, 76 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f997..a93f1b7f508 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1563again: 1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma; 1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct anon_vma_chain *vmac;
1566 struct vm_area_struct *vma; 1567 struct vm_area_struct *vma;
1567 1568
1568 spin_lock(&anon_vma->lock); 1569 spin_lock(&anon_vma->lock);
1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma;
1570 if (rmap_item->address < vma->vm_start || 1572 if (rmap_item->address < vma->vm_start ||
1571 rmap_item->address >= vma->vm_end) 1573 rmap_item->address >= vma->vm_end)
1572 continue; 1574 continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1614again: 1616again:
1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1617 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1616 struct anon_vma *anon_vma = rmap_item->anon_vma; 1618 struct anon_vma *anon_vma = rmap_item->anon_vma;
1619 struct anon_vma_chain *vmac;
1617 struct vm_area_struct *vma; 1620 struct vm_area_struct *vma;
1618 1621
1619 spin_lock(&anon_vma->lock); 1622 spin_lock(&anon_vma->lock);
1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma;
1621 if (rmap_item->address < vma->vm_start || 1625 if (rmap_item->address < vma->vm_start ||
1622 rmap_item->address >= vma->vm_end) 1626 rmap_item->address >= vma->vm_end)
1623 continue; 1627 continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1664again: 1668again:
1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1669 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1666 struct anon_vma *anon_vma = rmap_item->anon_vma; 1670 struct anon_vma *anon_vma = rmap_item->anon_vma;
1671 struct anon_vma_chain *vmac;
1667 struct vm_area_struct *vma; 1672 struct vm_area_struct *vma;
1668 1673
1669 spin_lock(&anon_vma->lock); 1674 spin_lock(&anon_vma->lock);
1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma;
1671 if (rmap_item->address < vma->vm_start || 1677 if (rmap_item->address < vma->vm_start ||
1672 rmap_item->address >= vma->vm_end) 1678 rmap_item->address >= vma->vm_end)
1673 continue; 1679 continue;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577..d1f33516297 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
383 if (av == NULL) /* Not actually mapped anymore */ 383 if (av == NULL) /* Not actually mapped anymore */
384 goto out; 384 goto out;
385 for_each_process (tsk) { 385 for_each_process (tsk) {
386 struct anon_vma_chain *vmac;
387
386 if (!task_early_kill(tsk)) 388 if (!task_early_kill(tsk))
387 continue; 389 continue;
388 list_for_each_entry (vma, &av->head, anon_vma_node) { 390 list_for_each_entry(vmac, &av->head, same_anon_vma) {
391 vma = vmac->vma;
389 if (!page_mapped_in_vma(page, vma)) 392 if (!page_mapped_in_vma(page, vma))
390 continue; 393 continue;
391 if (vma->vm_mm == tsk->mm) 394 if (vma->vm_mm == tsk->mm)
diff --git a/mm/memory.c b/mm/memory.c
index 77d9f840936..dc785b438d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -374,7 +374,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
374 * Hide vma from rmap and truncate_pagecache before freeing 374 * Hide vma from rmap and truncate_pagecache before freeing
375 * pgtables 375 * pgtables
376 */ 376 */
377 anon_vma_unlink(vma); 377 unlink_anon_vmas(vma);
378 unlink_file_vma(vma); 378 unlink_file_vma(vma);
379 379
380 if (is_vm_hugetlb_page(vma)) { 380 if (is_vm_hugetlb_page(vma)) {
@@ -388,7 +388,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
388 && !is_vm_hugetlb_page(next)) { 388 && !is_vm_hugetlb_page(next)) {
389 vma = next; 389 vma = next;
390 next = vma->vm_next; 390 next = vma->vm_next;
391 anon_vma_unlink(vma); 391 unlink_anon_vmas(vma);
392 unlink_file_vma(vma); 392 unlink_file_vma(vma);
393 } 393 }
394 free_pgd_range(tlb, addr, vma->vm_end, 394 free_pgd_range(tlb, addr, vma->vm_end,
diff --git a/mm/mmap.c b/mm/mmap.c
index 31656147128..6a0c15db7f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
437{ 437{
438 __vma_link_list(mm, vma, prev, rb_parent); 438 __vma_link_list(mm, vma, prev, rb_parent);
439 __vma_link_rb(mm, vma, rb_link, rb_parent); 439 __vma_link_rb(mm, vma, rb_link, rb_parent);
440 __anon_vma_link(vma);
441} 440}
442 441
443static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 442static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 * are necessary. The "insert" vma (if any) is to be inserted 498 * are necessary. The "insert" vma (if any) is to be inserted
500 * before we drop the necessary locks. 499 * before we drop the necessary locks.
501 */ 500 */
502void vma_adjust(struct vm_area_struct *vma, unsigned long start, 501int vma_adjust(struct vm_area_struct *vma, unsigned long start,
503 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 502 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
504{ 503{
505 struct mm_struct *mm = vma->vm_mm; 504 struct mm_struct *mm = vma->vm_mm;
@@ -542,6 +541,28 @@ again: remove_next = 1 + (end > next->vm_end);
542 } 541 }
543 } 542 }
544 543
544 /*
545 * When changing only vma->vm_end, we don't really need anon_vma lock.
546 */
547 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
548 anon_vma = vma->anon_vma;
549 if (anon_vma) {
550 /*
551 * Easily overlooked: when mprotect shifts the boundary,
552 * make sure the expanding vma has anon_vma set if the
553 * shrinking vma had, to cover any anon pages imported.
554 */
555 if (importer && !importer->anon_vma) {
556 /* Block reverse map lookups until things are set up. */
557 importer->vm_flags |= VM_LOCK_RMAP;
558 if (anon_vma_clone(importer, vma)) {
559 importer->vm_flags &= ~VM_LOCK_RMAP;
560 return -ENOMEM;
561 }
562 importer->anon_vma = anon_vma;
563 }
564 }
565
545 if (file) { 566 if (file) {
546 mapping = file->f_mapping; 567 mapping = file->f_mapping;
547 if (!(vma->vm_flags & VM_NONLINEAR)) 568 if (!(vma->vm_flags & VM_NONLINEAR))
@@ -567,25 +588,6 @@ again: remove_next = 1 + (end > next->vm_end);
567 } 588 }
568 } 589 }
569 590
570 /*
571 * When changing only vma->vm_end, we don't really need
572 * anon_vma lock.
573 */
574 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
575 anon_vma = vma->anon_vma;
576 if (anon_vma) {
577 spin_lock(&anon_vma->lock);
578 /*
579 * Easily overlooked: when mprotect shifts the boundary,
580 * make sure the expanding vma has anon_vma set if the
581 * shrinking vma had, to cover any anon pages imported.
582 */
583 if (importer && !importer->anon_vma) {
584 importer->anon_vma = anon_vma;
585 __anon_vma_link(importer);
586 }
587 }
588
589 if (root) { 591 if (root) {
590 flush_dcache_mmap_lock(mapping); 592 flush_dcache_mmap_lock(mapping);
591 vma_prio_tree_remove(vma, root); 593 vma_prio_tree_remove(vma, root);
@@ -616,8 +618,11 @@ again: remove_next = 1 + (end > next->vm_end);
616 __vma_unlink(mm, next, vma); 618 __vma_unlink(mm, next, vma);
617 if (file) 619 if (file)
618 __remove_shared_vm_struct(next, file, mapping); 620 __remove_shared_vm_struct(next, file, mapping);
619 if (next->anon_vma) 621 /*
620 __anon_vma_merge(vma, next); 622 * This VMA is now dead, no need for rmap to follow it.
623 * Call anon_vma_merge below, outside of i_mmap_lock.
624 */
625 next->vm_flags |= VM_LOCK_RMAP;
621 } else if (insert) { 626 } else if (insert) {
622 /* 627 /*
623 * split_vma has split insert from vma, and needs 628 * split_vma has split insert from vma, and needs
@@ -627,17 +632,25 @@ again: remove_next = 1 + (end > next->vm_end);
627 __insert_vm_struct(mm, insert); 632 __insert_vm_struct(mm, insert);
628 } 633 }
629 634
630 if (anon_vma)
631 spin_unlock(&anon_vma->lock);
632 if (mapping) 635 if (mapping)
633 spin_unlock(&mapping->i_mmap_lock); 636 spin_unlock(&mapping->i_mmap_lock);
634 637
638 /*
639 * The current VMA has been set up. It is now safe for the
640 * rmap code to get from the pages to the ptes.
641 */
642 if (anon_vma && importer)
643 importer->vm_flags &= ~VM_LOCK_RMAP;
644
635 if (remove_next) { 645 if (remove_next) {
636 if (file) { 646 if (file) {
637 fput(file); 647 fput(file);
638 if (next->vm_flags & VM_EXECUTABLE) 648 if (next->vm_flags & VM_EXECUTABLE)
639 removed_exe_file_vma(mm); 649 removed_exe_file_vma(mm);
640 } 650 }
651 /* Protected by mmap_sem and VM_LOCK_RMAP. */
652 if (next->anon_vma)
653 anon_vma_merge(vma, next);
641 mm->map_count--; 654 mm->map_count--;
642 mpol_put(vma_policy(next)); 655 mpol_put(vma_policy(next));
643 kmem_cache_free(vm_area_cachep, next); 656 kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +666,8 @@ again: remove_next = 1 + (end > next->vm_end);
653 } 666 }
654 667
655 validate_mm(mm); 668 validate_mm(mm);
669
670 return 0;
656} 671}
657 672
658/* 673/*
@@ -759,6 +774,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
759{ 774{
760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 775 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
761 struct vm_area_struct *area, *next; 776 struct vm_area_struct *area, *next;
777 int err;
762 778
763 /* 779 /*
764 * We later require that vma->vm_flags == vm_flags, 780 * We later require that vma->vm_flags == vm_flags,
@@ -792,11 +808,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
792 is_mergeable_anon_vma(prev->anon_vma, 808 is_mergeable_anon_vma(prev->anon_vma,
793 next->anon_vma)) { 809 next->anon_vma)) {
794 /* cases 1, 6 */ 810 /* cases 1, 6 */
795 vma_adjust(prev, prev->vm_start, 811 err = vma_adjust(prev, prev->vm_start,
796 next->vm_end, prev->vm_pgoff, NULL); 812 next->vm_end, prev->vm_pgoff, NULL);
797 } else /* cases 2, 5, 7 */ 813 } else /* cases 2, 5, 7 */
798 vma_adjust(prev, prev->vm_start, 814 err = vma_adjust(prev, prev->vm_start,
799 end, prev->vm_pgoff, NULL); 815 end, prev->vm_pgoff, NULL);
816 if (err)
817 return NULL;
800 return prev; 818 return prev;
801 } 819 }
802 820
@@ -808,11 +826,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
808 can_vma_merge_before(next, vm_flags, 826 can_vma_merge_before(next, vm_flags,
809 anon_vma, file, pgoff+pglen)) { 827 anon_vma, file, pgoff+pglen)) {
810 if (prev && addr < prev->vm_end) /* case 4 */ 828 if (prev && addr < prev->vm_end) /* case 4 */
811 vma_adjust(prev, prev->vm_start, 829 err = vma_adjust(prev, prev->vm_start,
812 addr, prev->vm_pgoff, NULL); 830 addr, prev->vm_pgoff, NULL);
813 else /* cases 3, 8 */ 831 else /* cases 3, 8 */
814 vma_adjust(area, addr, next->vm_end, 832 err = vma_adjust(area, addr, next->vm_end,
815 next->vm_pgoff - pglen, NULL); 833 next->vm_pgoff - pglen, NULL);
834 if (err)
835 return NULL;
816 return area; 836 return area;
817 } 837 }
818 838
@@ -1205,6 +1225,7 @@ munmap_back:
1205 vma->vm_flags = vm_flags; 1225 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1226 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff; 1227 vma->vm_pgoff = pgoff;
1228 INIT_LIST_HEAD(&vma->anon_vma_chain);
1208 1229
1209 if (file) { 1230 if (file) {
1210 error = -EINVAL; 1231 error = -EINVAL;
@@ -1865,6 +1886,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1865{ 1886{
1866 struct mempolicy *pol; 1887 struct mempolicy *pol;
1867 struct vm_area_struct *new; 1888 struct vm_area_struct *new;
1889 int err = -ENOMEM;
1868 1890
1869 if (is_vm_hugetlb_page(vma) && (addr & 1891 if (is_vm_hugetlb_page(vma) && (addr &
1870 ~(huge_page_mask(hstate_vma(vma))))) 1892 ~(huge_page_mask(hstate_vma(vma)))))
@@ -1872,11 +1894,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1872 1894
1873 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1895 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1874 if (!new) 1896 if (!new)
1875 return -ENOMEM; 1897 goto out_err;
1876 1898
1877 /* most fields are the same, copy all, and then fixup */ 1899 /* most fields are the same, copy all, and then fixup */
1878 *new = *vma; 1900 *new = *vma;
1879 1901
1902 INIT_LIST_HEAD(&new->anon_vma_chain);
1903
1880 if (new_below) 1904 if (new_below)
1881 new->vm_end = addr; 1905 new->vm_end = addr;
1882 else { 1906 else {
@@ -1886,11 +1910,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1886 1910
1887 pol = mpol_dup(vma_policy(vma)); 1911 pol = mpol_dup(vma_policy(vma));
1888 if (IS_ERR(pol)) { 1912 if (IS_ERR(pol)) {
1889 kmem_cache_free(vm_area_cachep, new); 1913 err = PTR_ERR(pol);
1890 return PTR_ERR(pol); 1914 goto out_free_vma;
1891 } 1915 }
1892 vma_set_policy(new, pol); 1916 vma_set_policy(new, pol);
1893 1917
1918 if (anon_vma_clone(new, vma))
1919 goto out_free_mpol;
1920
1894 if (new->vm_file) { 1921 if (new->vm_file) {
1895 get_file(new->vm_file); 1922 get_file(new->vm_file);
1896 if (vma->vm_flags & VM_EXECUTABLE) 1923 if (vma->vm_flags & VM_EXECUTABLE)
@@ -1901,12 +1928,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1901 new->vm_ops->open(new); 1928 new->vm_ops->open(new);
1902 1929
1903 if (new_below) 1930 if (new_below)
1904 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 1931 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1905 ((addr - new->vm_start) >> PAGE_SHIFT), new); 1932 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1906 else 1933 else
1907 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1934 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1908 1935
1909 return 0; 1936 /* Success. */
1937 if (!err)
1938 return 0;
1939
1940 /* Clean everything up if vma_adjust failed. */
1941 new->vm_ops->close(new);
1942 if (new->vm_file) {
1943 if (vma->vm_flags & VM_EXECUTABLE)
1944 removed_exe_file_vma(mm);
1945 fput(new->vm_file);
1946 }
1947 out_free_mpol:
1948 mpol_put(pol);
1949 out_free_vma:
1950 kmem_cache_free(vm_area_cachep, new);
1951 out_err:
1952 return err;
1910} 1953}
1911 1954
1912/* 1955/*
@@ -2116,6 +2159,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2116 return -ENOMEM; 2159 return -ENOMEM;
2117 } 2160 }
2118 2161
2162 INIT_LIST_HEAD(&vma->anon_vma_chain);
2119 vma->vm_mm = mm; 2163 vma->vm_mm = mm;
2120 vma->vm_start = addr; 2164 vma->vm_start = addr;
2121 vma->vm_end = addr + len; 2165 vma->vm_end = addr + len;
@@ -2252,10 +2296,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2252 if (new_vma) { 2296 if (new_vma) {
2253 *new_vma = *vma; 2297 *new_vma = *vma;
2254 pol = mpol_dup(vma_policy(vma)); 2298 pol = mpol_dup(vma_policy(vma));
2255 if (IS_ERR(pol)) { 2299 if (IS_ERR(pol))
2256 kmem_cache_free(vm_area_cachep, new_vma); 2300 goto out_free_vma;
2257 return NULL; 2301 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2258 } 2302 if (anon_vma_clone(new_vma, vma))
2303 goto out_free_mempol;
2259 vma_set_policy(new_vma, pol); 2304 vma_set_policy(new_vma, pol);
2260 new_vma->vm_start = addr; 2305 new_vma->vm_start = addr;
2261 new_vma->vm_end = addr + len; 2306 new_vma->vm_end = addr + len;
@@ -2271,6 +2316,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2271 } 2316 }
2272 } 2317 }
2273 return new_vma; 2318 return new_vma;
2319
2320 out_free_mempol:
2321 mpol_put(pol);
2322 out_free_vma:
2323 kmem_cache_free(vm_area_cachep, new_vma);
2324 return NULL;
2274} 2325}
2275 2326
2276/* 2327/*
@@ -2348,6 +2399,7 @@ int install_special_mapping(struct mm_struct *mm,
2348 if (unlikely(vma == NULL)) 2399 if (unlikely(vma == NULL))
2349 return -ENOMEM; 2400 return -ENOMEM;
2350 2401
2402 INIT_LIST_HEAD(&vma->anon_vma_chain);
2351 vma->vm_mm = mm; 2403 vma->vm_mm = mm;
2352 vma->vm_start = addr; 2404 vma->vm_start = addr;
2353 vma->vm_end = addr + len; 2405 vma->vm_end = addr + len;
@@ -2448,6 +2500,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2448int mm_take_all_locks(struct mm_struct *mm) 2500int mm_take_all_locks(struct mm_struct *mm)
2449{ 2501{
2450 struct vm_area_struct *vma; 2502 struct vm_area_struct *vma;
2503 struct anon_vma_chain *avc;
2451 int ret = -EINTR; 2504 int ret = -EINTR;
2452 2505
2453 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2506 BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2465,7 +2518,8 @@ int mm_take_all_locks(struct mm_struct *mm)
2465 if (signal_pending(current)) 2518 if (signal_pending(current))
2466 goto out_unlock; 2519 goto out_unlock;
2467 if (vma->anon_vma) 2520 if (vma->anon_vma)
2468 vm_lock_anon_vma(mm, vma->anon_vma); 2521 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2522 vm_lock_anon_vma(mm, avc->anon_vma);
2469 } 2523 }
2470 2524
2471 ret = 0; 2525 ret = 0;
@@ -2520,13 +2574,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
2520void mm_drop_all_locks(struct mm_struct *mm) 2574void mm_drop_all_locks(struct mm_struct *mm)
2521{ 2575{
2522 struct vm_area_struct *vma; 2576 struct vm_area_struct *vma;
2577 struct anon_vma_chain *avc;
2523 2578
2524 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2579 BUG_ON(down_read_trylock(&mm->mmap_sem));
2525 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2580 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2526 2581
2527 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2582 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2528 if (vma->anon_vma) 2583 if (vma->anon_vma)
2529 vm_unlock_anon_vma(vma->anon_vma); 2584 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2585 vm_unlock_anon_vma(avc->anon_vma);
2530 if (vma->vm_file && vma->vm_file->f_mapping) 2586 if (vma->vm_file && vma->vm_file->f_mapping)
2531 vm_unlock_mapping(vma->vm_file->f_mapping); 2587 vm_unlock_mapping(vma->vm_file->f_mapping);
2532 } 2588 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 4c4c803453f..e9c75efce60 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr,
460 if (vma_expandable(vma, new_len - old_len)) { 460 if (vma_expandable(vma, new_len - old_len)) {
461 int pages = (new_len - old_len) >> PAGE_SHIFT; 461 int pages = (new_len - old_len) >> PAGE_SHIFT;
462 462
463 vma_adjust(vma, vma->vm_start, 463 if (vma_adjust(vma, vma->vm_start, addr + new_len,
464 addr + new_len, vma->vm_pgoff, NULL); 464 vma->vm_pgoff, NULL)) {
465 ret = -ENOMEM;
466 goto out;
467 }
465 468
466 mm->total_vm += pages; 469 mm->total_vm += pages;
467 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 470 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf05..55727a74af9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1209 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1210 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1211 1211
1212 INIT_LIST_HEAD(&vma->anon_vma_node); 1212 INIT_LIST_HEAD(&vma->anon_vma_chain);
1213 vma->vm_flags = vm_flags; 1213 vma->vm_flags = vm_flags;
1214 vma->vm_pgoff = pgoff; 1214 vma->vm_pgoff = pgoff;
1215 1215
diff --git a/mm/rmap.c b/mm/rmap.c
index 5cb47111f79..be34094e459 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
73 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
74} 75}
75 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
76/** 87/**
77 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
78 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma)
103int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
104{ 115{
105 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
106 118
107 might_sleep(); 119 might_sleep();
108 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
109 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
110 struct anon_vma *allocated; 122 struct anon_vma *allocated;
111 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
112 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
113 allocated = NULL; 129 allocated = NULL;
114 if (!anon_vma) { 130 if (!anon_vma) {
115 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
116 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
117 return -ENOMEM; 133 goto out_enomem_free_avc;
118 allocated = anon_vma; 134 allocated = anon_vma;
119 } 135 }
120 spin_lock(&anon_vma->lock); 136 spin_lock(&anon_vma->lock);
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma)
123 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
124 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
125 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
126 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
127 allocated = NULL; 146 allocated = NULL;
128 } 147 }
129 spin_unlock(&mm->page_table_lock); 148 spin_unlock(&mm->page_table_lock);
130 149
131 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
132 if (unlikely(allocated)) 151 if (unlikely(allocated)) {
133 anon_vma_free(allocated); 152 anon_vma_free(allocated);
153 anon_vma_chain_free(avc);
154 }
134 } 155 }
135 return 0; 156 return 0;
157
158 out_enomem_free_avc:
159 anon_vma_chain_free(avc);
160 out_enomem:
161 return -ENOMEM;
136} 162}
137 163
138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 164static void anon_vma_chain_link(struct vm_area_struct *vma,
165 struct anon_vma_chain *avc,
166 struct anon_vma *anon_vma)
139{ 167{
140 BUG_ON(vma->anon_vma != next->anon_vma); 168 avc->vma = vma;
141 list_del(&next->anon_vma_node); 169 avc->anon_vma = anon_vma;
170 list_add(&avc->same_vma, &vma->anon_vma_chain);
171
172 spin_lock(&anon_vma->lock);
173 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
174 spin_unlock(&anon_vma->lock);
142} 175}
143 176
144void __anon_vma_link(struct vm_area_struct *vma) 177/*
178 * Attach the anon_vmas from src to dst.
179 * Returns 0 on success, -ENOMEM on failure.
180 */
181int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
145{ 182{
146 struct anon_vma *anon_vma = vma->anon_vma; 183 struct anon_vma_chain *avc, *pavc;
184
185 list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
186 avc = anon_vma_chain_alloc();
187 if (!avc)
188 goto enomem_failure;
189 anon_vma_chain_link(dst, avc, pavc->anon_vma);
190 }
191 return 0;
147 192
148 if (anon_vma) 193 enomem_failure:
149 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 194 unlink_anon_vmas(dst);
195 return -ENOMEM;
150} 196}
151 197
152void anon_vma_link(struct vm_area_struct *vma) 198/*
199 * Attach vma to its own anon_vma, as well as to the anon_vmas that
200 * the corresponding VMA in the parent process is attached to.
201 * Returns 0 on success, non-zero on failure.
202 */
203int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
153{ 204{
154 struct anon_vma *anon_vma = vma->anon_vma; 205 struct anon_vma_chain *avc;
206 struct anon_vma *anon_vma;
155 207
156 if (anon_vma) { 208 /* Don't bother if the parent process has no anon_vma here. */
157 spin_lock(&anon_vma->lock); 209 if (!pvma->anon_vma)
158 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 210 return 0;
159 spin_unlock(&anon_vma->lock); 211
160 } 212 /*
213 * First, attach the new VMA to the parent VMA's anon_vmas,
214 * so rmap can find non-COWed pages in child processes.
215 */
216 if (anon_vma_clone(vma, pvma))
217 return -ENOMEM;
218
219 /* Then add our own anon_vma. */
220 anon_vma = anon_vma_alloc();
221 if (!anon_vma)
222 goto out_error;
223 avc = anon_vma_chain_alloc();
224 if (!avc)
225 goto out_error_free_anon_vma;
226 anon_vma_chain_link(vma, avc, anon_vma);
227 /* Mark this anon_vma as the one where our new (COWed) pages go. */
228 vma->anon_vma = anon_vma;
229
230 return 0;
231
232 out_error_free_anon_vma:
233 anon_vma_free(anon_vma);
234 out_error:
235 return -ENOMEM;
161} 236}
162 237
163void anon_vma_unlink(struct vm_area_struct *vma) 238static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
164{ 239{
165 struct anon_vma *anon_vma = vma->anon_vma; 240 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
166 int empty; 241 int empty;
167 242
243 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
168 if (!anon_vma) 244 if (!anon_vma)
169 return; 245 return;
170 246
171 spin_lock(&anon_vma->lock); 247 spin_lock(&anon_vma->lock);
172 list_del(&vma->anon_vma_node); 248 list_del(&anon_vma_chain->same_anon_vma);
173 249
174 /* We must garbage collect the anon_vma if it's empty */ 250 /* We must garbage collect the anon_vma if it's empty */
175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 251 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
179 anon_vma_free(anon_vma); 255 anon_vma_free(anon_vma);
180} 256}
181 257
258void unlink_anon_vmas(struct vm_area_struct *vma)
259{
260 struct anon_vma_chain *avc, *next;
261
262 /* Unlink each anon_vma chained to the VMA. */
263 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
264 anon_vma_unlink(avc);
265 list_del(&avc->same_vma);
266 anon_vma_chain_free(avc);
267 }
268}
269
182static void anon_vma_ctor(void *data) 270static void anon_vma_ctor(void *data)
183{ 271{
184 struct anon_vma *anon_vma = data; 272 struct anon_vma *anon_vma = data;
@@ -192,6 +280,7 @@ void __init anon_vma_init(void)
192{ 280{
193 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 281 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
194 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 282 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
283 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
195} 284}
196 285
197/* 286/*
@@ -240,6 +329,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
240 /* page should be within @vma mapping range */ 329 /* page should be within @vma mapping range */
241 return -EFAULT; 330 return -EFAULT;
242 } 331 }
332 if (unlikely(vma->vm_flags & VM_LOCK_RMAP)) {
333 /*
334 * This VMA is being unlinked or is not yet linked into the
335 * VMA tree. Do not try to follow this rmap. This race
336 * condition can result in page_referenced() ignoring a
337 * reference or in try_to_unmap() failing to unmap a page.
338 * The VMA cannot be freed under us because we hold the
339 * anon_vma->lock, which the munmap code takes while
340 * unlinking the anon_vmas from the VMA.
341 */
342 return -EFAULT;
343 }
243 return address; 344 return address;
244} 345}
245 346
@@ -396,7 +497,7 @@ static int page_referenced_anon(struct page *page,
396{ 497{
397 unsigned int mapcount; 498 unsigned int mapcount;
398 struct anon_vma *anon_vma; 499 struct anon_vma *anon_vma;
399 struct vm_area_struct *vma; 500 struct anon_vma_chain *avc;
400 int referenced = 0; 501 int referenced = 0;
401 502
402 anon_vma = page_lock_anon_vma(page); 503 anon_vma = page_lock_anon_vma(page);
@@ -404,7 +505,8 @@ static int page_referenced_anon(struct page *page,
404 return referenced; 505 return referenced;
405 506
406 mapcount = page_mapcount(page); 507 mapcount = page_mapcount(page);
407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 508 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
509 struct vm_area_struct *vma = avc->vma;
408 unsigned long address = vma_address(page, vma); 510 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT) 511 if (address == -EFAULT)
410 continue; 512 continue;
@@ -1025,14 +1127,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1025static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1127static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1026{ 1128{
1027 struct anon_vma *anon_vma; 1129 struct anon_vma *anon_vma;
1028 struct vm_area_struct *vma; 1130 struct anon_vma_chain *avc;
1029 int ret = SWAP_AGAIN; 1131 int ret = SWAP_AGAIN;
1030 1132
1031 anon_vma = page_lock_anon_vma(page); 1133 anon_vma = page_lock_anon_vma(page);
1032 if (!anon_vma) 1134 if (!anon_vma)
1033 return ret; 1135 return ret;
1034 1136
1035 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1137 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1138 struct vm_area_struct *vma = avc->vma;
1036 unsigned long address = vma_address(page, vma); 1139 unsigned long address = vma_address(page, vma);
1037 if (address == -EFAULT) 1140 if (address == -EFAULT)
1038 continue; 1141 continue;
@@ -1223,7 +1326,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1223 struct vm_area_struct *, unsigned long, void *), void *arg) 1326 struct vm_area_struct *, unsigned long, void *), void *arg)
1224{ 1327{
1225 struct anon_vma *anon_vma; 1328 struct anon_vma *anon_vma;
1226 struct vm_area_struct *vma; 1329 struct anon_vma_chain *avc;
1227 int ret = SWAP_AGAIN; 1330 int ret = SWAP_AGAIN;
1228 1331
1229 /* 1332 /*
@@ -1238,7 +1341,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1238 if (!anon_vma) 1341 if (!anon_vma)
1239 return ret; 1342 return ret;
1240 spin_lock(&anon_vma->lock); 1343 spin_lock(&anon_vma->lock);
1241 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1344 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1345 struct vm_area_struct *vma = avc->vma;
1242 unsigned long address = vma_address(page, vma); 1346 unsigned long address = vma_address(page, vma);
1243 if (address == -EFAULT) 1347 if (address == -EFAULT)
1244 continue; 1348 continue;