aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2012-10-25 08:16:47 -0400
committer: Mel Gorman <mgorman@suse.de> 2012-12-11 09:42:47 -0500
commit: 4b96a29ba891dd59734cb7be80a900fe93aa2d9f (patch)
tree: 5162223c4ceae37f6ccf0ef1b84993c2556e60cf /kernel/sched
parent: 9f40604cdab935e80db57b309c48659de349d4e6 (diff)
mm: sched: numa: Implement slow start for working set sampling
Add a 1 second delay before starting to scan the working set of a task and starting to balance it amongst nodes.

[ note that before the constant per task WSS sampling rate patch the initial scan would happen much later still, in effect that patch caused this regression. ]

The theory is that short-run tasks benefit very little from NUMA placement: they come and go, and they better stick to the node they were started on. As tasks mature and rebalance to other CPUs and nodes, so does their NUMA placement have to change and so does it start to matter more and more.

In practice this change fixes an observable kbuild regression:

    # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ]

    !NUMA:
        45.291088843 seconds time elapsed ( +- 0.40% )
        45.154231752 seconds time elapsed ( +- 0.36% )

    +NUMA, no slow start:
        46.172308123 seconds time elapsed ( +- 0.30% )
        46.343168745 seconds time elapsed ( +- 0.25% )

    +NUMA, 1 sec slow start:
        45.224189155 seconds time elapsed ( +- 0.25% )
        45.160866532 seconds time elapsed ( +- 0.17% )

and it also fixes an observable perf bench (hackbench) regression:

    # perf stat --null --repeat 10 perf bench sched messaging

    -NUMA:

    -NUMA:                0.246225691 seconds time elapsed ( +- 1.31% )
    +NUMA no slow start:  0.252620063 seconds time elapsed ( +- 1.13% )
    +NUMA 1sec delay:     0.248076230 seconds time elapsed ( +- 1.35% )

The implementation is simple and straightforward, most of the patch deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable knob.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ Wrote the changelog, ran measurements, tuned the default. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/core.c 2
-rw-r--r-- kernel/sched/fair.c 5
2 files changed, 6 insertions, 1 deletion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cad0d092ce3b..fbfc4843063f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,7 +1543,7 @@ static void __sched_fork(struct task_struct *p)
1543 p->node_stamp = 0ULL; 1543 p->node_stamp = 0ULL;
1544 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1544 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1545 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; 1545 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1546 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1546 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1547 p->numa_work.next = &p->numa_work; 1547 p->numa_work.next = &p->numa_work;
1548#endif /* CONFIG_NUMA_BALANCING */ 1548#endif /* CONFIG_NUMA_BALANCING */
1549} 1549}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6e1f25ed2bd..7727b0161579 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -788,6 +788,9 @@ unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
788/* Portion of address space to scan in MB */ 788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256; 789unsigned int sysctl_numa_balancing_scan_size = 256;
790 790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
791static void task_numa_placement(struct task_struct *p) 794static void task_numa_placement(struct task_struct *p)
792{ 795{
793 int seq = ACCESS_ONCE(p->mm->numa_scan_seq); 796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -929,6 +932,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
929 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; 932 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
930 933
931 if (now - curr->node_stamp > period) { 934 if (now - curr->node_stamp > period) {
935 if (!curr->node_stamp)
936 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
932 curr->node_stamp = now; 937 curr->node_stamp = now;
933 938
934 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 939 if (!time_before(jiffies, curr->mm->numa_next_scan)) {