Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 88 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 76 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0966f0c16f1b..e08d757720de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
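With the new defaults, the _min and _max periods express the approximate time to scan a task's entire address space once; the per-window period actually used by the scanner divides them by the number of scan windows (see the helpers added below). As a rough worked example for a hypothetical task with 1GB resident:

	nr_windows = 1024MB / 256MB = 4 windows of scan_size
	min period = 1000ms / 4    = 250ms per 256MB window
	           => about 1GB of address space scanned per second at the fastest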
@@ -830,6 +832,51 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+	unsigned long rss = 0;
+	unsigned long nr_scan_pages;
+
+	/*
+	 * Calculations based on RSS as non-present and empty pages are skipped
+	 * by the PTE scanner and NUMA hinting faults should be trapped based
+	 * on resident pages
+	 */
+	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+	rss = get_mm_rss(p->mm);
+	if (!rss)
+		rss = nr_scan_pages;
+
+	rss = round_up(rss, nr_scan_pages);
+	return rss / nr_scan_pages;
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+	unsigned int scan, floor;
+	unsigned int windows = 1;
+
+	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+	floor = 1000 / windows;
+
+	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+	return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+	unsigned int smin = task_scan_min(p);
+	unsigned int smax;
+
+	/* Watch for min being lower than max due to floor calculations */
+	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+	return max(smin, smax);
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq;
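To see what these helpers produce, here is a minimal userspace sketch of the same arithmetic. It compiles standalone, replaces the kernel helpers (get_mm_rss(), round_up(), max_t()) with local stand-ins, and assumes 4K pages and a made-up 4GB RSS; it is an illustration of the math, not kernel code:

	#include <stdio.h>

	#define PAGE_SHIFT	12	/* assume 4K pages */
	#define SCAN_SIZE	256	/* sysctl_numa_balancing_scan_size, MB */
	#define PERIOD_MIN	1000	/* sysctl_numa_balancing_scan_period_min, ms */
	#define PERIOD_MAX	60000	/* sysctl_numa_balancing_scan_period_max, ms */
	#define MAX_SCAN_WINDOW	2560	/* MB/sec */

	/* stand-in for the kernel's round_up() */
	static unsigned long round_up(unsigned long x, unsigned long y)
	{
		return ((x + y - 1) / y) * y;
	}

	int main(void)
	{
		unsigned long rss = 1UL << 20;	/* hypothetical 4GB resident, in 4K pages */
		unsigned long nr_scan_pages = (unsigned long)SCAN_SIZE << (20 - PAGE_SHIFT);
		unsigned int nr_windows, windows, floor, scan, smin, smax;

		/* task_nr_scan_windows(): 256MB windows needed to cover RSS */
		nr_windows = round_up(rss, nr_scan_pages) / nr_scan_pages;

		/* task_scan_min(): the floor caps scanning at 2560MB/sec */
		windows = MAX_SCAN_WINDOW / SCAN_SIZE;	/* 10 windows/sec max */
		floor = 1000 / windows;			/* 100ms */
		scan = PERIOD_MIN / nr_windows;		/* 1000/16 = 62ms */
		smin = scan > floor ? scan : floor;	/* clamped to 100ms */

		/* task_scan_max(): never allowed below the floored minimum */
		smax = PERIOD_MAX / nr_windows;		/* 60000/16 = 3750ms */
		if (smax < smin)
			smax = smin;

		printf("windows=%u scan_min=%ums scan_max=%ums\n",
		       nr_windows, smin, smax);
		return 0;
	}

So a 4GB task here scans one 256MB window no faster than every 100ms (the 2560MB/sec cap) and no slower than every 3750ms, i.e. a full pass of the task takes between 1.6 and 60 seconds.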
@@ -840,6 +887,7 @@ static void task_numa_placement(struct task_struct *p)
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
+	p->numa_scan_period_max = task_scan_max(p);
 
 	/* FIXME: Scheduling placement policy hints go here */
 }
@@ -860,9 +908,14 @@ void task_numa_fault(int node, int pages, bool migrated)
 	 * If pages are properly placed (did not migrate) then scan slower.
 	 * This is reset periodically in case of phase changes
 	 */
-	if (!migrated)
-		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-			p->numa_scan_period + jiffies_to_msecs(10));
+	if (!migrated) {
+		/* Initialise if necessary */
+		if (!p->numa_scan_period_max)
+			p->numa_scan_period_max = task_scan_max(p);
+
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period + 10);
+	}
 
 	task_numa_placement(p);
 }
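Two things change here: the growth step is a flat 10ms instead of jiffies_to_msecs(10) (which is 10ms only when HZ=1000, e.g. 40ms at HZ=250), and the ceiling is the per-task task_scan_max() rather than the global sysctl. A small sketch of the resulting backoff, using the illustrative smin/smax values from the 4GB example above:

	#include <stdio.h>

	int main(void)
	{
		unsigned int period = 100;	/* task_scan_min(), illustrative */
		unsigned int period_max = 3750;	/* task_scan_max(), illustrative */
		int batch;

		/* each fault batch with no migrations adds a flat 10ms */
		for (batch = 0; batch < 5; batch++) {
			period = (period + 10 < period_max) ? period + 10
							    : period_max;
			printf("after batch %d: period=%ums\n", batch + 1, period);
		}
		return 0;	/* period is now 150ms; it saturates at period_max */
	}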
@@ -884,6 +937,7 @@ void task_numa_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
+	unsigned long nr_pte_updates = 0;
 	long pages;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -915,7 +969,7 @@ void task_numa_work(struct callback_head *work)
 	 */
 	migrate = mm->numa_next_reset;
 	if (time_after(now, migrate)) {
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		p->numa_scan_period = task_scan_min(p);
 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
 		xchg(&mm->numa_next_reset, next_scan);
 	}
@@ -927,8 +981,10 @@ void task_numa_work(struct callback_head *work)
 	if (time_before(now, migrate))
 		return;
 
-	if (p->numa_scan_period == 0)
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	if (p->numa_scan_period == 0) {
+		p->numa_scan_period_max = task_scan_max(p);
+		p->numa_scan_period = task_scan_min(p);
+	}
 
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
@@ -965,7 +1021,15 @@ void task_numa_work(struct callback_head *work)
 		start = max(start, vma->vm_start);
 		end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 		end = min(end, vma->vm_end);
-		pages -= change_prot_numa(vma, start, end);
+		nr_pte_updates += change_prot_numa(vma, start, end);
+
+		/*
+		 * Scan sysctl_numa_balancing_scan_size but ensure that
+		 * at least one PTE is updated so that unused virtual
+		 * address space is quickly skipped.
+		 */
+		if (nr_pte_updates)
+			pages -= (end - start) >> PAGE_SHIFT;
 
 		start = end;
 		if (pages <= 0)
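The effect is easiest to see on a sparse address space: change_prot_numa() returns how many PTEs it actually updated, and until nr_pte_updates becomes non-zero the scanned range is not charged against the pages budget, so empty virtual address space is crossed essentially for free. A toy model of that budget logic (the window layout and fake_change_prot_numa() are invented for illustration):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define WINDOW_PAGES	((2UL << 20) >> PAGE_SHIFT)	/* 2MB windows */

	/* pretend only windows 16..23 of the space have resident pages */
	static long fake_change_prot_numa(int window)
	{
		return (window >= 16 && window < 24) ? WINDOW_PAGES : 0;
	}

	int main(void)
	{
		long pages = 4096;		/* scan budget, in 4K pages */
		unsigned long nr_pte_updates = 0;
		int window;

		for (window = 0; window < 64 && pages > 0; window++) {
			nr_pte_updates += fake_change_prot_numa(window);
			/* budget is only consumed once something was updated */
			if (nr_pte_updates)
				pages -= WINDOW_PAGES;
		}
		/* windows 0..15 cost nothing; the budget runs out at window 23 */
		printf("stopped after window %d, budget left %ld\n",
		       window - 1, pages);
		return 0;
	}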
@@ -1012,7 +1076,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
 	if (now - curr->node_stamp > period) {
 		if (!curr->node_stamp)
-			curr->numa_scan_period = task_scan_min(curr);
+			curr->numa_scan_period = task_scan_min(curr);
 		curr->node_stamp += period;
 
 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
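Taken together, the scan rate is now bounded on both sides independent of task size: the floor in task_scan_min() caps PTE scanning at MAX_SCAN_WINDOW (10 windows of 256MB per second, i.e. 2560MB/sec), while dividing scan_period_max by the window count means even a large task completes a full pass in roughly sysctl_numa_balancing_scan_period_max. For the hypothetical 4GB task above:

	fastest full pass: 16 windows x 100ms  = 1.6s  (the 2560MB/sec cap)
	slowest full pass: 16 windows x 3750ms = 60s   (= scan_period_max)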