author    Mel Gorman <mgorman@suse.de>    2013-10-07 06:28:55 -0400
committer Ingo Molnar <mingo@kernel.org>  2013-10-09 06:40:20 -0400
commit    598f0ec0bc996e90a806ee9564af919ea5aad401 (patch)
tree      9df97675a01340285b792be1909a41a02dbe905f
parent    7e8d16b6cbccb2f5da579f5085479fb82ba851b8 (diff)
sched/numa: Set the scan rate proportional to the memory usage of the task being scanned
The NUMA PTE scan rate is controlled with a combination of the
numa_balancing_scan_period_min, numa_balancing_scan_period_max and
numa_balancing_scan_size tunables. This scan rate is independent of the size
of the task, and as an aside it is further complicated by the fact that
numa_balancing_scan_size controls how many pages are marked pte_numa and not
how much virtual memory is scanned. In combination, it is almost impossible
to meaningfully tune the min and max scan periods, and reasoning about
performance is complex when the time to complete a full scan is partially a
function of the task's memory size.

This patch alters the semantics of the min and max tunables: they now tune
the length of time it takes to complete a scan of a task's occupied virtual
address space. Conceptually this is a lot easier to understand. There is a
"sanity" check to ensure the scan rate is never extremely fast, based on the
amount of virtual memory that should be scanned in a second. The default of
2.5GB/sec seems arbitrary, but it was chosen so that the maximum scan rate
after the patch roughly matches the maximum scan rate before the patch was
applied.

On a similar note, numa_scan_period is in milliseconds and not jiffies.
Properly placed pages slow the scanning rate, but adding 10 jiffies to
numa_scan_period means that the rate at which scanning slows depends on HZ,
which is confusing. Get rid of the jiffies_to_msecs() conversion and treat
the value as milliseconds.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-18-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	Documentation/sysctl/kernel.txt	11
-rw-r--r--	include/linux/sched.h	1
-rw-r--r--	kernel/sched/fair.c	88
3 files changed, 83 insertions, 17 deletions
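As a worked illustration of the new semantics, the standalone userspace
sketch below mirrors the arithmetic of task_nr_scan_windows(),
task_scan_min() and task_scan_max() from the patch. It is not part of the
patch itself; the 4GB RSS figure is a made-up example, and PAGE_SHIFT is
assumed to be 12 (4K pages):

#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4K pages */
#define MAX_SCAN_WINDOW 2560		/* sanity cap, in MB/sec */

static unsigned int scan_size = 256;	/* sysctl_numa_balancing_scan_size (MB) */
static unsigned int period_min = 1000;	/* sysctl_numa_balancing_scan_period_min (ms) */
static unsigned int period_max = 60000;	/* sysctl_numa_balancing_scan_period_max (ms) */

/* Number of scan_size-MB windows needed to cover the task's resident pages */
static unsigned int nr_scan_windows(unsigned long rss)
{
	unsigned long nr_scan_pages = (unsigned long)scan_size << (20 - PAGE_SHIFT);

	if (!rss)
		rss = nr_scan_pages;
	return (rss + nr_scan_pages - 1) / nr_scan_pages; /* round up, then divide */
}

int main(void)
{
	/* Hypothetical task with 4GB resident: 4096MB / 256MB = 16 windows */
	unsigned long rss = 4UL << (30 - PAGE_SHIFT);
	unsigned int windows = nr_scan_windows(rss);

	/* Floor keeps the effective scan rate at or below 2.5GB/sec */
	unsigned int floor = 1000 / (MAX_SCAN_WINDOW / scan_size);	/* 100ms */
	unsigned int smin = period_min / windows;	/* 62ms, clamped to floor */
	unsigned int smax = period_max / windows;	/* 3750ms */

	if (smin < floor)
		smin = floor;
	if (smax < smin)
		smax = smin;

	/* A 4GB task thus scans one 256MB window every 100ms..3750ms */
	printf("windows=%u scan_period_min=%ums scan_period_max=%ums\n",
	       windows, smin, smax);
	return 0;
}

Note how a larger task gets more windows and therefore a shorter per-window
period, so the time to cover the whole address space stays roughly constant.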
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1428c6659254..8cd7e5fc79da 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -403,15 +403,16 @@ workload pattern changes and minimises performance impact due to remote
 memory accesses. These sysctls control the thresholds for scan delays and
 the number of pages scanned.
 
-numa_balancing_scan_period_min_ms is the minimum delay in milliseconds
-between scans. It effectively controls the maximum scanning rate for
-each task.
+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the maximum scanning
+rate for each task.
 
 numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
 when it initially forks.
 
-numa_balancing_scan_period_max_ms is the maximum delay between scans. It
-effectively controls the minimum scanning rate for each task.
+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the minimum scanning
+rate for each task.
 
 numa_balancing_scan_size_mb is how many megabytes worth of pages are
 scanned for a given scan.
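A hedged sketch of how one might inspect these tunables from userspace,
assuming a kernel built with CONFIG_NUMA_BALANCING; the paths follow the
kernel.* sysctl procfs convention documented in
Documentation/sysctl/kernel.txt:

#include <stdio.h>

/* Print one kernel.* sysctl from procfs; returns 0 on success */
static int show_sysctl(const char *name)
{
	char path[256], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", name, buf);
	fclose(f);
	return 0;
}

int main(void)
{
	show_sysctl("numa_balancing_scan_period_min_ms");
	show_sysctl("numa_balancing_scan_period_max_ms");
	show_sysctl("numa_balancing_scan_size_mb");
	return 0;
}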
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2ac5285db434..fdcb4c855072 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1339,6 +1339,7 @@ struct task_struct {
 	int numa_scan_seq;
 	int numa_migrate_seq;
 	unsigned int numa_scan_period;
+	unsigned int numa_scan_period_max;
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
 #endif /* CONFIG_NUMA_BALANCING */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0966f0c16f1b..e08d757720de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,6 +832,51 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+	unsigned long rss = 0;
+	unsigned long nr_scan_pages;
+
+	/*
+	 * Calculations based on RSS as non-present and empty pages are skipped
+	 * by the PTE scanner and NUMA hinting faults should be trapped based
+	 * on resident pages
+	 */
+	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+	rss = get_mm_rss(p->mm);
+	if (!rss)
+		rss = nr_scan_pages;
+
+	rss = round_up(rss, nr_scan_pages);
+	return rss / nr_scan_pages;
+}
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+	unsigned int scan, floor;
+	unsigned int windows = 1;
+
+	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+	floor = 1000 / windows;
+
+	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+	return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+	unsigned int smin = task_scan_min(p);
+	unsigned int smax;
+
+	/* Watch for min being lower than max due to floor calculations */
+	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+	return max(smin, smax);
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq;
@@ -840,6 +887,7 @@ static void task_numa_placement(struct task_struct *p)
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
+	p->numa_scan_period_max = task_scan_max(p);
 
 	/* FIXME: Scheduling placement policy hints go here */
 }
@@ -860,9 +908,14 @@ void task_numa_fault(int node, int pages, bool migrated)
 	 * If pages are properly placed (did not migrate) then scan slower.
 	 * This is reset periodically in case of phase changes
 	 */
-	if (!migrated)
-		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-			p->numa_scan_period + jiffies_to_msecs(10));
+	if (!migrated) {
+		/* Initialise if necessary */
+		if (!p->numa_scan_period_max)
+			p->numa_scan_period_max = task_scan_max(p);
+
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period + 10);
+	}
 
 	task_numa_placement(p);
 }
@@ -884,6 +937,7 @@ void task_numa_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
+	unsigned long nr_pte_updates = 0;
 	long pages;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -915,7 +969,7 @@ void task_numa_work(struct callback_head *work)
 	 */
 	migrate = mm->numa_next_reset;
 	if (time_after(now, migrate)) {
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		p->numa_scan_period = task_scan_min(p);
 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
 		xchg(&mm->numa_next_reset, next_scan);
 	}
@@ -927,8 +981,10 @@ void task_numa_work(struct callback_head *work)
 	if (time_before(now, migrate))
 		return;
 
-	if (p->numa_scan_period == 0)
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	if (p->numa_scan_period == 0) {
+		p->numa_scan_period_max = task_scan_max(p);
+		p->numa_scan_period = task_scan_min(p);
+	}
 
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
@@ -965,7 +1021,15 @@ void task_numa_work(struct callback_head *work)
 		start = max(start, vma->vm_start);
 		end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 		end = min(end, vma->vm_end);
-		pages -= change_prot_numa(vma, start, end);
+		nr_pte_updates += change_prot_numa(vma, start, end);
+
+		/*
+		 * Scan sysctl_numa_balancing_scan_size but ensure that
+		 * at least one PTE is updated so that unused virtual
+		 * address space is quickly skipped.
+		 */
+		if (nr_pte_updates)
+			pages -= (end - start) >> PAGE_SHIFT;
 
 		start = end;
 		if (pages <= 0)
@@ -1012,7 +1076,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
 	if (now - curr->node_stamp > period) {
 		if (!curr->node_stamp)
-			curr->numa_scan_period = task_scan_min(curr);
+			curr->numa_scan_period = task_scan_min(curr);
 		curr->node_stamp += period;
 
 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
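For completeness, a rough standalone sketch of the back-off behaviour
introduced in the task_numa_fault() hunk above, reusing the hypothetical
4GB task's min/max values from the earlier example: each NUMA hinting fault
on a properly placed page now adds a flat 10ms, independent of HZ, until
the per-task maximum is reached.

#include <stdio.h>

int main(void)
{
	/* Values task_scan_min()/task_scan_max() would give a 4GB task */
	unsigned int scan_period = 100;		/* ms */
	unsigned int scan_period_max = 3750;	/* ms */
	unsigned int faults_to_saturate = 0;

	/* Each properly placed (non-migrated) fault slows scanning by 10ms */
	while (scan_period < scan_period_max) {
		scan_period += 10;
		if (scan_period > scan_period_max)
			scan_period = scan_period_max;
		faults_to_saturate++;
	}

	/* (3750 - 100) / 10 = 365 faults to reach the per-task maximum */
	printf("saturated at %ums after %u faults\n",
	       scan_period, faults_to_saturate);
	return 0;
}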