diff options
author | Mel Gorman <mgorman@suse.de> | 2012-11-20 20:18:23 -0500 |
---|---|---|
committer | Mel Gorman <mgorman@suse.de> | 2012-12-11 09:42:55 -0500 |
commit | b8593bfda1652755136333cdd362de125b283a9c (patch) | |
tree | c0395d9cf775fd9225e81b055fc8f5540a14333a /kernel/sched | |
parent | e42c8ff2999de1239a57d434bfbd8e9f2a56e814 (diff) |
mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
The PTE scanning rate and fault rates are two of the biggest sources of
system CPU overhead with automatic NUMA placement. Ideally a proper policy
would detect if a workload was properly placed, schedule and adjust the
PTE scanning rate accordingly. We do not track the necessary information
to do that but we at least know if we migrated or not.
This patch scans slower if a page was not migrated as the result of a
NUMA hinting fault, up to sysctl_numa_balancing_scan_period_max, which is
now higher than the previous default. Once every minute it will reset
the scan period to its minimum in case of phase changes.
This is hilariously crude and the numbers are arbitrary. Workloads will
converge quite slowly in comparison to what a proper policy should be able
to do. On the plus side, we will chew up less CPU for workloads that have
no need for automatic balancing.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/fair.c | 29 |
2 files changed, 22 insertions, 8 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbfc4843063f..9d255bc0e278 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p) | |||
1537 | #ifdef CONFIG_NUMA_BALANCING | 1537 | #ifdef CONFIG_NUMA_BALANCING |
1538 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | 1538 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { |
1539 | p->mm->numa_next_scan = jiffies; | 1539 | p->mm->numa_next_scan = jiffies; |
1540 | p->mm->numa_next_reset = jiffies; | ||
1540 | p->mm->numa_scan_seq = 0; | 1541 | p->mm->numa_scan_seq = 0; |
1541 | } | 1542 | } |
1542 | 1543 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd18087fd369..4b577863933f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
784 | * numa task sample period in ms | 784 | * numa task sample period in ms |
785 | */ | 785 | */ |
786 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 786 | unsigned int sysctl_numa_balancing_scan_period_min = 100; |
787 | unsigned int sysctl_numa_balancing_scan_period_max = 100*16; | 787 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; |
788 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
788 | 789 | ||
789 | /* Portion of address space to scan in MB */ | 790 | /* Portion of address space to scan in MB */ |
790 | unsigned int sysctl_numa_balancing_scan_size = 256; | 791 | unsigned int sysctl_numa_balancing_scan_size = 256; |
@@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p) | |||
806 | /* | 807 | /* |
807 | * Got a PROT_NONE fault for a page on @node. | 808 | * Got a PROT_NONE fault for a page on @node. |
808 | */ | 809 | */ |
809 | void task_numa_fault(int node, int pages) | 810 | void task_numa_fault(int node, int pages, bool migrated) |
810 | { | 811 | { |
811 | struct task_struct *p = current; | 812 | struct task_struct *p = current; |
812 | 813 | ||
813 | /* FIXME: Allocate task-specific structure for placement policy here */ | 814 | /* FIXME: Allocate task-specific structure for placement policy here */ |
814 | 815 | ||
815 | /* | 816 | /* |
816 | * Assume that as faults occur that pages are getting properly placed | 817 | * If pages are properly placed (did not migrate) then scan slower. |
817 | * and fewer NUMA hints are required. Note that this is a big | 818 | * This is reset periodically in case of phase changes |
818 | * assumption, it assumes processes reach a steady steady with no | ||
819 | * further phase changes. | ||
820 | */ | 819 | */ |
821 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 820 | if (!migrated) |
822 | p->numa_scan_period + jiffies_to_msecs(2)); | 821 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, |
822 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
823 | 823 | ||
824 | task_numa_placement(p); | 824 | task_numa_placement(p); |
825 | } | 825 | } |
@@ -858,6 +858,19 @@ void task_numa_work(struct callback_head *work) | |||
858 | return; | 858 | return; |
859 | 859 | ||
860 | /* | 860 | /* |
861 | * Reset the scan period if enough time has gone by. Objective is that | ||
862 | * scanning will be reduced if pages are properly placed. As tasks | ||
863 | * can enter different phases this needs to be re-examined. Lacking | ||
864 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
865 | */ | ||
866 | migrate = mm->numa_next_reset; | ||
867 | if (time_after(now, migrate)) { | ||
868 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
869 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
870 | xchg(&mm->numa_next_reset, next_scan); | ||
871 | } | ||
872 | |||
873 | /* | ||
861 | * Enforce maximal scan/migration frequency.. | 874 | * Enforce maximal scan/migration frequency.. |
862 | */ | 875 | */ |
863 | migrate = mm->numa_next_scan; | 876 | migrate = mm->numa_next_scan; |