aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2013-10-07 06:29:07 -0400
committerIngo Molnar <mingo@kernel.org>2013-10-09 06:40:35 -0400
commitb795854b1fa70f6aee923ae5df74ff7afeaddcaa (patch)
treefd109d9f3778c7bc934fedb3cda2b5bfb1293375 /mm/memory.c
parent073b5beea735c7e1970686c94ff1f3aaac790a2a (diff)
sched/numa: Set preferred NUMA node based on number of private faults
Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that in general the private accesses are more important for cpu->memory locality in the general case. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are a high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will be compute overloaded and then scheduled away later only to bounce back again. Alternatively the shared tasks would just bounce around nodes because the fault information is effectively noise. Either way accounting for shared faults the same as private faults can result in lower performance overall. The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as a preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page making the fault information unreliable. Signed-off-by: Mel Gorman <mgorman@suse.de> [ Fix complication error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c16
1 files changed, 8 insertions, 8 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 3e3b4b8b6c41..cc7f20691c82 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3536 struct page *page = NULL; 3536 struct page *page = NULL;
3537 spinlock_t *ptl; 3537 spinlock_t *ptl;
3538 int page_nid = -1; 3538 int page_nid = -1;
3539 int last_nid; 3539 int last_nidpid;
3540 int target_nid; 3540 int target_nid;
3541 bool migrated = false; 3541 bool migrated = false;
3542 3542
@@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3567 } 3567 }
3568 BUG_ON(is_zero_pfn(page_to_pfn(page))); 3568 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3569 3569
3570 last_nid = page_nid_last(page); 3570 last_nidpid = page_nidpid_last(page);
3571 page_nid = page_to_nid(page); 3571 page_nid = page_to_nid(page);
3572 target_nid = numa_migrate_prep(page, vma, addr, page_nid); 3572 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3573 pte_unmap_unlock(ptep, ptl); 3573 pte_unmap_unlock(ptep, ptl);
@@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3583 3583
3584out: 3584out:
3585 if (page_nid != -1) 3585 if (page_nid != -1)
3586 task_numa_fault(last_nid, page_nid, 1, migrated); 3586 task_numa_fault(last_nidpid, page_nid, 1, migrated);
3587 return 0; 3587 return 0;
3588} 3588}
3589 3589
@@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3598 unsigned long offset; 3598 unsigned long offset;
3599 spinlock_t *ptl; 3599 spinlock_t *ptl;
3600 bool numa = false; 3600 bool numa = false;
3601 int last_nid; 3601 int last_nidpid;
3602 3602
3603 spin_lock(&mm->page_table_lock); 3603 spin_lock(&mm->page_table_lock);
3604 pmd = *pmdp; 3604 pmd = *pmdp;
@@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3643 if (unlikely(!page)) 3643 if (unlikely(!page))
3644 continue; 3644 continue;
3645 3645
3646 last_nid = page_nid_last(page); 3646 last_nidpid = page_nidpid_last(page);
3647 page_nid = page_to_nid(page); 3647 page_nid = page_to_nid(page);
3648 target_nid = numa_migrate_prep(page, vma, addr, page_nid); 3648 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3649 pte_unmap_unlock(pte, ptl); 3649 pte_unmap_unlock(pte, ptl);
@@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3656 } 3656 }
3657 3657
3658 if (page_nid != -1) 3658 if (page_nid != -1)
3659 task_numa_fault(last_nid, page_nid, 1, migrated); 3659 task_numa_fault(last_nidpid, page_nid, 1, migrated);
3660 3660
3661 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); 3661 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3662 } 3662 }