author     Rik van Riel <riel@redhat.com>                      2010-03-05 16:42:07 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>      2010-03-06 14:26:26 -0500
commit     5beb49305251e5669852ed541e8e2f2f7696c53e
tree       46457450a22f23938b24904aeba5d4ada2f53b20 /mm/ksm.c
parent     648bcc771145172a14bc35eeb849ed08f6aa4f1e
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes.

In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page.

This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach into the tens of thousands. Real workloads are still a factor of 10 less process intensive than AIM7, but they are catching up.

This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children.

This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2: O(1) work for almost every page, plus an O(N) walk for at most 1/N of the pages, averages out to roughly two units of work per page.

The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions.

A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bit flag.

Some test results:

Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code.

With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time; there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved.

[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
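For reference, the link object that the mm/ksm.c hunks below iterate over is the patch's new struct anon_vma_chain, one per (VMA, anon_vma) pair. A simplified sketch of its layout as introduced in include/linux/rmap.h (field comments paraphrased here; see the full patch for the exact definition):

	/*
	 * Each anon_vma_chain sits on two lists: the VMA's list of all
	 * anon_vmas it is linked to (same_vma), and the anon_vma's list of
	 * all VMAs that may hold its pages (same_anon_vma).
	 */
	struct anon_vma_chain {
		struct vm_area_struct *vma;
		struct anon_vma *anon_vma;
		struct list_head same_vma;      /* linked into the VMA's chain list */
		struct list_head same_anon_vma; /* linked into anon_vma->head */
	};

The rmap walk in ksm.c therefore no longer sees vm_area_structs directly; it sees chain entries and dereferences vmac->vma.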
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c | 12
1 file changed, 9 insertions(+), 3 deletions(-)
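All three hunks below make the same mechanical change: instead of walking vm_area_structs linked directly into the anon_vma, they walk the anon_vma's list of anon_vma_chain entries and take the VMA from each entry. A minimal sketch of the resulting walk, with the per-VMA work elided; the matching spin_unlock is not visible in the truncated hunks but is assumed to follow the loop:

	struct anon_vma *anon_vma = rmap_item->anon_vma;
	struct anon_vma_chain *vmac;
	struct vm_area_struct *vma;

	spin_lock(&anon_vma->lock);
	list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
		vma = vmac->vma;
		/* Skip VMAs that cannot contain this rmap_item's address. */
		if (rmap_item->address < vma->vm_start ||
		    rmap_item->address >= vma->vm_end)
			continue;
		/* per-VMA work: page_referenced_one(), try_to_unmap_one(),
		 * or the rmap_one() callback, depending on the caller */
	}
	spin_unlock(&anon_vma->lock);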
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f9979..a93f1b7f508c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;