diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 76 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0966f0c16f1b..e08d757720de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
818 | 818 | ||
819 | #ifdef CONFIG_NUMA_BALANCING | 819 | #ifdef CONFIG_NUMA_BALANCING |
820 | /* | 820 | /* |
821 | * numa task sample period in ms | 821 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
822 | * calculated based on the tasks virtual memory size and | ||
823 | * numa_balancing_scan_size. | ||
822 | */ | 824 | */ |
823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 825 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 826 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | 827 | unsigned int sysctl_numa_balancing_scan_period_reset = 60000; |
826 | 828 | ||
827 | /* Portion of address space to scan in MB */ | 829 | /* Portion of address space to scan in MB */ |
828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 830 | unsigned int sysctl_numa_balancing_scan_size = 256; |
@@ -830,6 +832,51 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 832 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 833 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
832 | 834 | ||
835 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
836 | { | ||
837 | unsigned long rss = 0; | ||
838 | unsigned long nr_scan_pages; | ||
839 | |||
840 | /* | ||
841 | * Calculations based on RSS as non-present and empty pages are skipped | ||
842 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
843 | * on resident pages | ||
844 | */ | ||
845 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
846 | rss = get_mm_rss(p->mm); | ||
847 | if (!rss) | ||
848 | rss = nr_scan_pages; | ||
849 | |||
850 | rss = round_up(rss, nr_scan_pages); | ||
851 | return rss / nr_scan_pages; | ||
852 | } | ||
853 | |||
854 | /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
855 | #define MAX_SCAN_WINDOW 2560 | ||
856 | |||
857 | static unsigned int task_scan_min(struct task_struct *p) | ||
858 | { | ||
859 | unsigned int scan, floor; | ||
860 | unsigned int windows = 1; | ||
861 | |||
862 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
863 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
864 | floor = 1000 / windows; | ||
865 | |||
866 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
867 | return max_t(unsigned int, floor, scan); | ||
868 | } | ||
869 | |||
870 | static unsigned int task_scan_max(struct task_struct *p) | ||
871 | { | ||
872 | unsigned int smin = task_scan_min(p); | ||
873 | unsigned int smax; | ||
874 | |||
875 | /* Watch for min being lower than max due to floor calculations */ | ||
876 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
877 | return max(smin, smax); | ||
878 | } | ||
879 | |||
833 | static void task_numa_placement(struct task_struct *p) | 880 | static void task_numa_placement(struct task_struct *p) |
834 | { | 881 | { |
835 | int seq; | 882 | int seq; |
@@ -840,6 +887,7 @@ static void task_numa_placement(struct task_struct *p) | |||
840 | if (p->numa_scan_seq == seq) | 887 | if (p->numa_scan_seq == seq) |
841 | return; | 888 | return; |
842 | p->numa_scan_seq = seq; | 889 | p->numa_scan_seq = seq; |
890 | p->numa_scan_period_max = task_scan_max(p); | ||
843 | 891 | ||
844 | /* FIXME: Scheduling placement policy hints go here */ | 892 | /* FIXME: Scheduling placement policy hints go here */ |
845 | } | 893 | } |
@@ -860,9 +908,14 @@ void task_numa_fault(int node, int pages, bool migrated) | |||
860 | * If pages are properly placed (did not migrate) then scan slower. | 908 | * If pages are properly placed (did not migrate) then scan slower. |
861 | * This is reset periodically in case of phase changes | 909 | * This is reset periodically in case of phase changes |
862 | */ | 910 | */ |
863 | if (!migrated) | 911 | if (!migrated) { |
864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 912 | /* Initialise if necessary */ |
865 | p->numa_scan_period + jiffies_to_msecs(10)); | 913 | if (!p->numa_scan_period_max) |
914 | p->numa_scan_period_max = task_scan_max(p); | ||
915 | |||
916 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
917 | p->numa_scan_period + 10); | ||
918 | } | ||
866 | 919 | ||
867 | task_numa_placement(p); | 920 | task_numa_placement(p); |
868 | } | 921 | } |
@@ -884,6 +937,7 @@ void task_numa_work(struct callback_head *work) | |||
884 | struct mm_struct *mm = p->mm; | 937 | struct mm_struct *mm = p->mm; |
885 | struct vm_area_struct *vma; | 938 | struct vm_area_struct *vma; |
886 | unsigned long start, end; | 939 | unsigned long start, end; |
940 | unsigned long nr_pte_updates = 0; | ||
887 | long pages; | 941 | long pages; |
888 | 942 | ||
889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 943 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
@@ -915,7 +969,7 @@ void task_numa_work(struct callback_head *work) | |||
915 | */ | 969 | */ |
916 | migrate = mm->numa_next_reset; | 970 | migrate = mm->numa_next_reset; |
917 | if (time_after(now, migrate)) { | 971 | if (time_after(now, migrate)) { |
918 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 972 | p->numa_scan_period = task_scan_min(p); |
919 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | 973 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); |
920 | xchg(&mm->numa_next_reset, next_scan); | 974 | xchg(&mm->numa_next_reset, next_scan); |
921 | } | 975 | } |
@@ -927,8 +981,10 @@ void task_numa_work(struct callback_head *work) | |||
927 | if (time_before(now, migrate)) | 981 | if (time_before(now, migrate)) |
928 | return; | 982 | return; |
929 | 983 | ||
930 | if (p->numa_scan_period == 0) | 984 | if (p->numa_scan_period == 0) { |
931 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 985 | p->numa_scan_period_max = task_scan_max(p); |
986 | p->numa_scan_period = task_scan_min(p); | ||
987 | } | ||
932 | 988 | ||
933 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 989 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
934 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 990 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
@@ -965,7 +1021,15 @@ void task_numa_work(struct callback_head *work) | |||
965 | start = max(start, vma->vm_start); | 1021 | start = max(start, vma->vm_start); |
966 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1022 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
967 | end = min(end, vma->vm_end); | 1023 | end = min(end, vma->vm_end); |
968 | pages -= change_prot_numa(vma, start, end); | 1024 | nr_pte_updates += change_prot_numa(vma, start, end); |
1025 | |||
1026 | /* | ||
1027 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
1028 | * at least one PTE is updated so that unused virtual | ||
1029 | * address space is quickly skipped. | ||
1030 | */ | ||
1031 | if (nr_pte_updates) | ||
1032 | pages -= (end - start) >> PAGE_SHIFT; | ||
969 | 1033 | ||
970 | start = end; | 1034 | start = end; |
971 | if (pages <= 0) | 1035 | if (pages <= 0) |
@@ -1012,7 +1076,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1012 | 1076 | ||
1013 | if (now - curr->node_stamp > period) { | 1077 | if (now - curr->node_stamp > period) { |
1014 | if (!curr->node_stamp) | 1078 | if (!curr->node_stamp) |
1015 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1079 | curr->numa_scan_period = task_scan_min(curr); |
1016 | curr->node_stamp += period; | 1080 | curr->node_stamp += period; |
1017 | 1081 | ||
1018 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1082 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |