aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2015-03-25 18:55:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-03-25 19:20:31 -0400
commit074c238177a75f5e79af3b2cb6a84e54823ef950 (patch)
tree59fc27cb666e4691083df324e3016749523566c9
parentb191f9b106ea1a24a711dbebb2925d3313da5852 (diff)
mm: numa: slow PTE scan rate if migration failures occur
Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman <mgorman@suse.de> Reported-by: Dave Chinner <david@fromorbit.com> Tested-by: Dave Chinner <david@fromorbit.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/sched.h9
-rw-r--r--kernel/sched/fair.c8
-rw-r--r--mm/huge_memory.c3
-rw-r--r--mm/memory.c3
4 files changed, 15 insertions, 8 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d77432e14ff..a419b65770d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1625,11 +1625,11 @@ struct task_struct {
1625 1625
1626 /* 1626 /*
1627 * numa_faults_locality tracks if faults recorded during the last 1627 * numa_faults_locality tracks if faults recorded during the last
1628 * scan window were remote/local. The task scan period is adapted 1628 * scan window were remote/local or failed to migrate. The task scan
1629 * based on the locality of the faults with different weights 1629 * period is adapted based on the locality of the faults with different
1630 * depending on whether they were shared or private faults 1630 * weights depending on whether they were shared or private faults
1631 */ 1631 */
1632 unsigned long numa_faults_locality[2]; 1632 unsigned long numa_faults_locality[3];
1633 1633
1634 unsigned long numa_pages_migrated; 1634 unsigned long numa_pages_migrated;
1635#endif /* CONFIG_NUMA_BALANCING */ 1635#endif /* CONFIG_NUMA_BALANCING */
@@ -1719,6 +1719,7 @@ struct task_struct {
1719#define TNF_NO_GROUP 0x02 1719#define TNF_NO_GROUP 0x02
1720#define TNF_SHARED 0x04 1720#define TNF_SHARED 0x04
1721#define TNF_FAULT_LOCAL 0x08 1721#define TNF_FAULT_LOCAL 0x08
1722#define TNF_MIGRATE_FAIL 0x10
1722 1723
1723#ifdef CONFIG_NUMA_BALANCING 1724#ifdef CONFIG_NUMA_BALANCING
1724extern void task_numa_fault(int last_node, int node, int pages, int flags); 1725extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..bcfe32088b37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
1609 /* 1609 /*
1610 * If there were no record hinting faults then either the task is 1610 * If there were no record hinting faults then either the task is
1611 * completely idle or all activity is areas that are not of interest 1611 * completely idle or all activity is areas that are not of interest
1612 * to automatic numa balancing. Scan slower 1612 * to automatic numa balancing. Related to that, if there were failed
1613 * migration then it implies we are migrating too quickly or the local
1614 * node is overloaded. In either case, scan slower
1613 */ 1615 */
1614 if (local + shared == 0) { 1616 if (local + shared == 0 || p->numa_faults_locality[2]) {
1615 p->numa_scan_period = min(p->numa_scan_period_max, 1617 p->numa_scan_period = min(p->numa_scan_period_max,
1616 p->numa_scan_period << 1); 1618 p->numa_scan_period << 1);
1617 1619
@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2080 2082
2081 if (migrated) 2083 if (migrated)
2082 p->numa_pages_migrated += pages; 2084 p->numa_pages_migrated += pages;
2085 if (flags & TNF_MIGRATE_FAIL)
2086 p->numa_faults_locality[2] += pages;
2083 2087
2084 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 2088 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2085 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 2089 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0a42d1521aa4..51b3e7c64622 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1350 if (migrated) { 1350 if (migrated) {
1351 flags |= TNF_MIGRATED; 1351 flags |= TNF_MIGRATED;
1352 page_nid = target_nid; 1352 page_nid = target_nid;
1353 } 1353 } else
1354 flags |= TNF_MIGRATE_FAIL;
1354 1355
1355 goto out; 1356 goto out;
1356clear_pmdnuma: 1357clear_pmdnuma:
diff --git a/mm/memory.c b/mm/memory.c
index d20e12da3a3c..97839f5c8c30 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3103 if (migrated) { 3103 if (migrated) {
3104 page_nid = target_nid; 3104 page_nid = target_nid;
3105 flags |= TNF_MIGRATED; 3105 flags |= TNF_MIGRATED;
3106 } 3106 } else
3107 flags |= TNF_MIGRATE_FAIL;
3107 3108
3108out: 3109out:
3109 if (page_nid != -1) 3110 if (page_nid != -1)