aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-11-22 09:40:03 -0500
committerMel Gorman <mgorman@suse.de>2012-12-11 09:42:56 -0500
commit5bca23035391928c4c7301835accca3551b96cc2 (patch)
tree2feb63abf318e6edfded8bb97b43ca29c3c5b312
parent3105b86a9fee7d2c2e76edb53bbbc4027599628f (diff)
mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
Due to the fact that migrations are driven by the CPU a task is running on there is no point tracking NUMA faults until one task runs on a new node. This patch tracks the first node used by an address space. Until it changes, PTE scanning is disabled and no NUMA hinting faults are trapped. This should help workloads that are short-lived, do not care about NUMA placement or have bound themselves to a single node. This takes advantage of the logic in "mm: sched: numa: Implement slow start for working set sampling" to delay when the checks are made. This will take advantage of processes that set their CPU and node bindings early in their lifetime. It will also potentially allow any initial load balancing to take place. Signed-off-by: Mel Gorman <mgorman@suse.de>
-rw-r--r--include/linux/mm_types.h10
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched/fair.c18
-rw-r--r--kernel/sched/features.h4
4 files changed, 34 insertions, 1 deletion
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e850a23dd6ec..197422a1598c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -418,10 +418,20 @@ struct mm_struct {
418 418
419 /* numa_scan_seq prevents two threads setting pte_numa */ 419 /* numa_scan_seq prevents two threads setting pte_numa */
420 int numa_scan_seq; 420 int numa_scan_seq;
421
422 /*
423 * The first node a task was scheduled on. If a task runs on
424 * a different node than the first one, PTE scanning is enabled
425 */
426 int first_nid;
421#endif 427#endif
422 struct uprobes_state uprobes_state; 428 struct uprobes_state uprobes_state;
423}; 429};
424 430
431/* first nid will either be a valid NID or one of these values */
432#define NUMA_PTE_SCAN_INIT -1
433#define NUMA_PTE_SCAN_ACTIVE -2
434
425static inline void mm_init_cpumask(struct mm_struct *mm) 435static inline void mm_init_cpumask(struct mm_struct *mm)
426{ 436{
427#ifdef CONFIG_CPUMASK_OFFSTACK 437#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..296ea308096d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -821,6 +821,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
821#ifdef CONFIG_TRANSPARENT_HUGEPAGE 821#ifdef CONFIG_TRANSPARENT_HUGEPAGE
822 mm->pmd_huge_pte = NULL; 822 mm->pmd_huge_pte = NULL;
823#endif 823#endif
824#ifdef CONFIG_NUMA_BALANCING
825 mm->first_nid = NUMA_PTE_SCAN_INIT;
826#endif
824 if (!mm_init(mm, tsk)) 827 if (!mm_init(mm, tsk))
825 goto fail_nomem; 828 goto fail_nomem;
826 829
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a02a2082e95..3e18f611a5aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -861,6 +861,24 @@ void task_numa_work(struct callback_head *work)
861 return; 861 return;
862 862
863 /* 863 /*
864 * We do not care about task placement until a task runs on a node
865 * other than the first one used by the address space. This is
866 * largely because migrations are driven by what CPU the task
867 * is running on. If it's never scheduled on another node, it'll
868 * not migrate so why bother trapping the fault.
869 */
870 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
871 mm->first_nid = numa_node_id();
872 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
873 /* Are we running on a new node yet? */
874 if (numa_node_id() == mm->first_nid &&
875 !sched_feat_numa(NUMA_FORCE))
876 return;
877
878 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
879 }
880
881 /*
864 * Reset the scan period if enough time has gone by. Objective is that 882 * Reset the scan period if enough time has gone by. Objective is that
865 * scanning will be reduced if pages are properly placed. As tasks 883 * scanning will be reduced if pages are properly placed. As tasks
866 * can enter different phases this needs to be re-examined. Lacking 884 * can enter different phases this needs to be re-examined. Lacking
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d2373a3e3252..e7c25fff1e94 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false)
65/* 65/*
66 * Apply the automatic NUMA scheduling policy. Enabled automatically 66 * Apply the automatic NUMA scheduling policy. Enabled automatically
67 * at runtime if running on a NUMA machine. Can be controlled via 67 * at runtime if running on a NUMA machine. Can be controlled via
68 * numa_balancing= 68 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
69 * for debugging the core machinery.
69 */ 70 */
70#ifdef CONFIG_NUMA_BALANCING 71#ifdef CONFIG_NUMA_BALANCING
71SCHED_FEAT(NUMA, false) 72SCHED_FEAT(NUMA, false)
73SCHED_FEAT(NUMA_FORCE, false)
72#endif 74#endif