author    Rik van Riel <riel@redhat.com>    2013-10-07 06:29:08 -0400
committer Ingo Molnar <mingo@kernel.org>    2013-10-09 06:40:36 -0400
commit    6fe6b2d6dabf392aceb3ad3a5e859b46a04465c6 (patch)
tree      db4493950d94c418edcce093bd698e79ec1dca1a
parent    b795854b1fa70f6aee923ae5df74ff7afeaddcaa (diff)
sched/numa: Do not migrate memory immediately after switching node
The load balancer can move tasks between nodes and does not take NUMA
locality into account. With automatic NUMA balancing this may result in the
task's working set being migrated to the new node. However, as the fault
buffer will still store faults from the old node, the scheduler may decide to
reset the preferred node and migrate the task back, resulting in more
migrations.

The ideal would be that the scheduler did not migrate tasks with a heavy
memory footprint, but this may result in nodes being overloaded. We could
also discard the fault information on task migration, but this would still
cause all of the task's working set to be migrated. This patch simply avoids
migrating the memory for a short time after a task is migrated.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-31-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  kernel/sched/core.c   2
-rw-r--r--  kernel/sched/fair.c  18
-rw-r--r--  mm/mempolicy.c       12
3 files changed, 29 insertions(+), 3 deletions(-)
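For readers skimming the diff below, the following is a minimal user-space sketch of the mechanism the patch introduces: numa_migrate_seq is cleared when the load balancer moves a task away from its preferred node, and the NUMA fault path then declines to migrate pages while that counter is zero. The field names mirror the patch, but the surrounding scaffolding (struct task, move_task(), should_migrate_page(), the main() driver) is hypothetical illustration, not kernel code.

/*
 * Stand-alone sketch of the "settle before migrating memory" idea.
 * Build with: cc -Wall sketch.c -o sketch
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
	int numa_preferred_nid;	/* -1 means no preference established yet */
	int numa_migrate_seq;	/* 0 == just moved, defer memory migration */
};

/* Load balancer moved the task off its preferred node: defer migration. */
static void move_task(struct task *p, int src_nid, int dst_nid)
{
	if (p->numa_preferred_nid != -1 &&
	    src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
		p->numa_migrate_seq = 0;
}

/* NUMA fault path: should a misplaced page be migrated right now? */
static bool should_migrate_page(const struct task *p, int page_nid, int pol_nid)
{
	/* Task was just moved away from its preferred node: leave memory alone. */
	if (pol_nid != p->numa_preferred_nid && p->numa_migrate_seq == 0)
		return false;
	return page_nid != pol_nid;
}

int main(void)
{
	struct task t = { .numa_preferred_nid = 0, .numa_migrate_seq = 1 };

	move_task(&t, 0, 1);	/* balancer pushes the task to node 1 */
	printf("migrate page? %s\n",
	       should_migrate_page(&t, 0, 1) ? "yes" : "no");	/* prints "no" */
	return 0;
}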
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 66b878e94554..9060a7f4e9ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p)
 
 	p->node_stamp = 0ULL;
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-	p->numa_migrate_seq = 0;
+	p->numa_migrate_seq = 1;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_preferred_nid = -1;
 	p->numa_work.next = &p->numa_work;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1de7c55e9f7..61ec0d4765b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -884,7 +884,7 @@ static unsigned int task_scan_max(struct task_struct *p)
  * the preferred node but still allow the scheduler to move the task again if
  * the nodes CPUs are overloaded.
  */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
 static inline int task_faults_idx(int nid, int priv)
 {
@@ -980,7 +980,7 @@ static void task_numa_placement(struct task_struct *p)
 
 		/* Update the preferred nid and migrate task if possible */
 		p->numa_preferred_nid = max_nid;
-		p->numa_migrate_seq = 0;
+		p->numa_migrate_seq = 1;
 		migrate_task_to(p, preferred_cpu);
 	}
 }
@@ -4121,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env)
 	set_task_cpu(p, env->dst_cpu);
 	activate_task(env->dst_rq, p, 0);
 	check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->numa_preferred_nid != -1) {
+		int src_nid = cpu_to_node(env->src_cpu);
+		int dst_nid = cpu_to_node(env->dst_cpu);
+
+		/*
+		 * If the load balancer has moved the task then limit
+		 * migrations from taking place in the short term in
+		 * case this is a short-lived migration.
+		 */
+		if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+			p->numa_migrate_seq = 0;
+	}
+#endif
 }
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aff1f1ed3dc5..196d8da2b657 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2378,6 +2378,18 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		last_nidpid = page_nidpid_xchg_last(page, this_nidpid);
 		if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid)
 			goto out;
+
+#ifdef CONFIG_NUMA_BALANCING
+		/*
+		 * If the scheduler has just moved us away from our
+		 * preferred node, do not bother migrating pages yet.
+		 * This way a short and temporary process migration will
+		 * not cause excessive memory migration.
+		 */
+		if (polnid != current->numa_preferred_nid &&
+				!current->numa_migrate_seq)
+			goto out;
+#endif
 	}
 
 	if (curnid != polnid)