author		Mel Gorman <mgorman@suse.de>	2013-10-07 06:28:53 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 06:40:17 -0400
commit		b726b7dfb400c937546fa91cf8523dcb1aa2fc6e (patch)
tree		b4a6f741b630bb22568536860f3f46e94c4e8904
parent		9e645ab6d089f5822479a833c6977c785bcfffe3 (diff)
Revert "mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node"
PTE scanning and NUMA hinting fault handling is expensive, so commit
5bca2303 ("mm: sched: numa: Delay PTE scanning until a task is scheduled
on a new node") deferred the PTE scan until a task had been scheduled on
another node. The problem is that in the purely shared memory case this
may never happen and no NUMA hinting fault information will be captured.
We are not ruling out the possibility that something better can be done
here, but for now this patch needs to be reverted and depend entirely on
the scan_delay to avoid punishing short-lived processes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-16-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
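[Editor's note: the scan_delay gating that remains after this revert can be
illustrated with a small standalone sketch. This is not the kernel code: it
is a userspace model, and the mm_sim struct, mm_init_numa() and should_scan()
helpers, and the 1000 ms constant (mirroring the default
numa_balancing_scan_delay_ms) are illustrative assumptions. Only the field
name numa_next_scan is taken from the real mm_struct.]

#include <stdbool.h>
#include <stdio.h>

/* Assumed default of numa_balancing_scan_delay_ms for this sketch. */
#define SCAN_DELAY_MS 1000UL

/* Stand-in for the one mm_struct field this sketch needs. */
struct mm_sim {
	unsigned long numa_next_scan;	/* earliest time, in ms, a scan may run */
};

/* Arm the delay once per address space, as happens when a new mm is set up. */
static void mm_init_numa(struct mm_sim *mm, unsigned long now_ms)
{
	mm->numa_next_scan = now_ms + SCAN_DELAY_MS;
}

/*
 * task_numa_work()-style gate after the revert: the scan is throttled
 * purely by time. There is no first_nid / "have we run on a new node
 * yet?" test any more, so shared-memory tasks that never migrate are
 * still scanned once the delay expires.
 */
static bool should_scan(struct mm_sim *mm, unsigned long now_ms)
{
	return now_ms >= mm->numa_next_scan;
}

int main(void)
{
	struct mm_sim mm;

	mm_init_numa(&mm, 0);

	/* A short-lived process exits before the delay and is never scanned. */
	printf("t= 500ms: scan? %d\n", should_scan(&mm, 500));

	/* A long-lived process is scanned once scan_delay has passed. */
	printf("t=1500ms: scan? %d\n", should_scan(&mm, 1500));
	return 0;
}

This is exactly the trade-off the changelog describes: short-lived tasks are
protected by the delay alone, while shared-memory tasks that never leave
their first node still generate NUMA hinting fault information.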
-rw-r--r--	include/linux/mm_types.h	10
-rw-r--r--	kernel/fork.c	3
-rw-r--r--	kernel/sched/fair.c	18
-rw-r--r--	kernel/sched/features.h	4
4 files changed, 1 insertion(+), 34 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851eeb6e1d..b7adf1d4310c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -428,20 +428,10 @@ struct mm_struct {
 
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
-
-	/*
-	 * The first node a task was scheduled on. If a task runs on
-	 * a different node than Make PTE Scan Go Now.
-	 */
-	int first_nid;
 #endif
 	struct uprobes_state uprobes_state;
 };
 
-/* first nid will either be a valid NID or one of these values */
-#define NUMA_PTE_SCAN_INIT	-1
-#define NUMA_PTE_SCAN_ACTIVE	-2
-
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..7192d91b5415 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
-#ifdef CONFIG_NUMA_BALANCING
-	mm->first_nid = NUMA_PTE_SCAN_INIT;
-#endif
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 464207fc9eef..49b11faa2961 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -901,24 +901,6 @@ void task_numa_work(struct callback_head *work)
 		return;
 
 	/*
-	 * We do not care about task placement until a task runs on a node
-	 * other than the first one used by the address space. This is
-	 * largely because migrations are driven by what CPU the task
-	 * is running on. If it's never scheduled on another node, it'll
-	 * not migrate so why bother trapping the fault.
-	 */
-	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
-		mm->first_nid = numa_node_id();
-	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
-		/* Are we running on a new node yet? */
-		if (numa_node_id() == mm->first_nid &&
-		    !sched_feat_numa(NUMA_FORCE))
-			return;
-
-		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
-	}
-
-	/*
 	 * Reset the scan period if enough time has gone by. Objective is that
 	 * scanning will be reduced if pages are properly placed. As tasks
 	 * can enter different phases this needs to be re-examined. Lacking
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..cba5c616a157 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,8 @@ SCHED_FEAT(LB_MIN, false)
 /*
  * Apply the automatic NUMA scheduling policy. Enabled automatically
  * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=. Allow PTE scanning to be forced on UMA machines
- * for debugging the core machinery.
+ * numa_balancing=
  */
 #ifdef CONFIG_NUMA_BALANCING
 SCHED_FEAT(NUMA, false)
-SCHED_FEAT(NUMA_FORCE, false)
 #endif