aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-12-16 17:33:25 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-12-16 18:18:08 -0500
commit 3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
tree b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /kernel/sched
parent 11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman: "There are three implementations for NUMA balancing, this tree (balancenuma), numacore which has been developed in tip/master and autonuma which is in aa.git. In almost all respects balancenuma is the dumbest of the three because its main impact is on the VM side with no attempt to be smart about scheduling. In the interest of getting the ball rolling, it would be desirable to see this much merged for 3.8 with the view to building scheduler smarts on top and adapting the VM where required for 3.9. The most recent set of comparisons available from different people are mel: https://lkml.org/lkml/2012/12/9/108 mingo: https://lkml.org/lkml/2012/12/7/331 tglx: https://lkml.org/lkml/2012/12/10/437 srikar: https://lkml.org/lkml/2012/12/10/397 The results are a mixed bag. In my own tests, balancenuma does reasonably well. It's dumb as rocks and does not regress against mainline. On the other hand, Ingo's tests show that balancenuma is incapable of converging for these workloads driven by perf which is bad but is potentially explained by the lack of scheduler smarts. Thomas' results show balancenuma improves on mainline but falls far short of numacore or autonuma. Srikar's results indicate we all suffer on a large machine with imbalanced node sizes. My own testing showed that recent numacore results have improved dramatically, particularly in the last week but not universally. We've butted heads heavily on system CPU usage and high levels of migration even when it shows that overall performance is better. There are also cases where it regresses. Of interest is that for specjbb in some configurations it will regress for lower numbers of warehouses and show gains for higher numbers which is not reported by the tool by default and sometimes missed in reports. Recently I reported for numacore that the JVM was crashing with NullPointerExceptions but currently it's unclear what the source of this problem is.
Initially I thought it was in how numacore batch handles PTEs but I no longer think this is the case. It's possible numacore is just able to trigger it due to higher rates of migration. These reports were quite late in the cycle so I/we would like to start with this tree as it contains much of the code we can agree on and has not changed significantly over the last 2-3 weeks." * tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits) mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable mm/rmap: Convert the struct anon_vma::mutex to an rwsem mm: migrate: Account a transhuge page properly when rate limiting mm: numa: Account for failed allocations and isolations as migration failures mm: numa: Add THP migration for the NUMA working set scanning fault case build fix mm: numa: Add THP migration for the NUMA working set scanning fault case. mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG mm: sched: numa: Control enabling and disabling of NUMA balancing mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships mm: numa: migrate: Set last_nid on newly allocated page mm: numa: split_huge_page: Transfer last_nid on tail page mm: numa: Introduce last_nid to the page frame sched: numa: Slowly increase the scanning period as NUMA faults are handled mm: numa: Rate limit setting of pte_numa if node is saturated mm: numa: Rate limit the amount of memory that is migrated between nodes mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting mm: numa: Migrate pages handled during a pmd_numa hinting fault mm: numa: Migrate on reference policy ...
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/core.c 71
-rw-r--r-- kernel/sched/fair.c 227
-rw-r--r-- kernel/sched/features.h 11
-rw-r--r-- kernel/sched/sched.h 12
4 files changed, 306 insertions, 15 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533496b6228..c1fb82104bfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
194#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
195 195
196static ssize_t 196static int sched_feat_set(char *cmp)
197sched_feat_write(struct file *filp, const char __user *ubuf,
198 size_t cnt, loff_t *ppos)
199{ 197{
200 char buf[64];
201 char *cmp;
202 int neg = 0;
203 int i; 198 int i;
204 199 int neg = 0;
205 if (cnt > 63)
206 cnt = 63;
207
208 if (copy_from_user(&buf, ubuf, cnt))
209 return -EFAULT;
210
211 buf[cnt] = 0;
212 cmp = strstrip(buf);
213 200
214 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
215 neg = 1; 202 neg = 1;
@@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
229 } 216 }
230 } 217 }
231 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
232 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
233 return -EINVAL; 241 return -EINVAL;
234 242
@@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p)
1560#ifdef CONFIG_PREEMPT_NOTIFIERS 1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1561 INIT_HLIST_HEAD(&p->preempt_notifiers); 1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1562#endif 1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1585}
1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1563} 1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1564 1605
1565/* 1606/*
1566 * fork()/clone()-time setup: 1607 * fork()/clone()-time setup:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
774 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
775 */ 778 */
776 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
797
798 if (p->numa_scan_seq == seq)
799 return;
800 p->numa_scan_seq = seq;
801
802 /* FIXME: Scheduling placement policy hints go here */
803}
804
805/*
806 * Got a PROT_NONE fault for a page on @node.
807 */
808void task_numa_fault(int node, int pages, bool migrated)
809{
810 struct task_struct *p = current;
811
812 if (!sched_feat_numa(NUMA))
813 return;
814
815 /* FIXME: Allocate task-specific structure for placement policy here */
816
817 /*
818 * If pages are properly placed (did not migrate) then scan slower.
819 * This is reset periodically in case of phase changes
820 */
821 if (!migrated)
822 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
823 p->numa_scan_period + jiffies_to_msecs(10));
824
825 task_numa_placement(p);
826}
827
828static void reset_ptenuma_scan(struct task_struct *p)
829{
830 ACCESS_ONCE(p->mm->numa_scan_seq)++;
831 p->mm->numa_scan_offset = 0;
832}
833
834/*
835 * The expensive part of numa migration is done from task_work context.
836 * Triggered from task_tick_numa().
837 */
838void task_numa_work(struct callback_head *work)
839{
840 unsigned long migrate, next_scan, now = jiffies;
841 struct task_struct *p = current;
842 struct mm_struct *mm = p->mm;
843 struct vm_area_struct *vma;
844 unsigned long start, end;
845 long pages;
846
847 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
848
849 work->next = work; /* protect against double add */
850 /*
851 * Who cares about NUMA placement when they're dying.
852 *
853 * NOTE: make sure not to dereference p->mm before this check,
854 * exit_task_work() happens _after_ exit_mm() so we could be called
855 * without p->mm even though we still had it when we enqueued this
856 * work.
857 */
858 if (p->flags & PF_EXITING)
859 return;
860
861 /*
862 * We do not care about task placement until a task runs on a node
863 * other than the first one used by the address space. This is
864 * largely because migrations are driven by what CPU the task
865 * is running on. If it's never scheduled on another node, it'll
866 * not migrate so why bother trapping the fault.
867 */
868 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
869 mm->first_nid = numa_node_id();
870 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
871 /* Are we running on a new node yet? */
872 if (numa_node_id() == mm->first_nid &&
873 !sched_feat_numa(NUMA_FORCE))
874 return;
875
876 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
877 }
878
879 /*
880 * Reset the scan period if enough time has gone by. Objective is that
881 * scanning will be reduced if pages are properly placed. As tasks
882 * can enter different phases this needs to be re-examined. Lacking
883 * proper tracking of reference behaviour, this blunt hammer is used.
884 */
885 migrate = mm->numa_next_reset;
886 if (time_after(now, migrate)) {
887 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
888 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
889 xchg(&mm->numa_next_reset, next_scan);
890 }
891
892 /*
893 * Enforce maximal scan/migration frequency..
894 */
895 migrate = mm->numa_next_scan;
896 if (time_before(now, migrate))
897 return;
898
899 if (p->numa_scan_period == 0)
900 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
901
902 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
903 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
904 return;
905
906 /*
907 * Do not set pte_numa if the current running node is rate-limited.
908 * This loses statistics on the fault but if we are unwilling to
909 * migrate to this node, it is less likely we can do useful work
910 */
911 if (migrate_ratelimited(numa_node_id()))
912 return;
913
914 start = mm->numa_scan_offset;
915 pages = sysctl_numa_balancing_scan_size;
916 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
917 if (!pages)
918 return;
919
920 down_read(&mm->mmap_sem);
921 vma = find_vma(mm, start);
922 if (!vma) {
923 reset_ptenuma_scan(p);
924 start = 0;
925 vma = mm->mmap;
926 }
927 for (; vma; vma = vma->vm_next) {
928 if (!vma_migratable(vma))
929 continue;
930
931 /* Skip small VMAs. They are not likely to be of relevance */
932 if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
933 continue;
934
935 do {
936 start = max(start, vma->vm_start);
937 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
938 end = min(end, vma->vm_end);
939 pages -= change_prot_numa(vma, start, end);
940
941 start = end;
942 if (pages <= 0)
943 goto out;
944 } while (end != vma->vm_end);
945 }
946
947out:
948 /*
949 * It is possible to reach the end of the VMA list but the last few VMAs are
950 * not guaranteed to be vma_migratable. If they are not, we would find the
951 * !migratable VMA on the next scan but not reset the scanner to the start
952 * so check it now.
953 */
954 if (vma)
955 mm->numa_scan_offset = start;
956 else
957 reset_ptenuma_scan(p);
958 up_read(&mm->mmap_sem);
959}
960
961/*
962 * Drive the periodic memory faults..
963 */
964void task_tick_numa(struct rq *rq, struct task_struct *curr)
965{
966 struct callback_head *work = &curr->numa_work;
967 u64 period, now;
968
969 /*
970 * We don't care about NUMA placement if we don't have memory.
971 */
972 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
973 return;
974
975 /*
976 * Using runtime rather than walltime has the dual advantage that
977 * we (mostly) drive the selection from busy threads and that the
978 * task needs to have done some actual work before we bother with
979 * NUMA placement.
980 */
981 now = curr->se.sum_exec_runtime;
982 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
983
984 if (now - curr->node_stamp > period) {
985 if (!curr->node_stamp)
986 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
987 curr->node_stamp = now;
988
989 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
990 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
991 task_work_add(curr, work, true);
992 }
993 }
994}
995#else
996static void task_tick_numa(struct rq *rq, struct task_struct *curr)
997{
998}
999#endif /* CONFIG_NUMA_BALANCING */
1000
777static void 1001static void
778account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1002account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
779{ 1003{
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5501 entity_tick(cfs_rq, se, queued); 5725 entity_tick(cfs_rq, se, queued);
5502 } 5726 }
5503 5727
5728 if (sched_feat_numa(NUMA))
5729 task_tick_numa(rq, curr);
5730
5504 update_rq_runnable_avg(rq, 1); 5731 update_rq_runnable_avg(rq, 1);
5505} 5732}
5506 5733
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e68e69ab917d..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
66SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
67SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
68SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5eca173b563f..fc886441436a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
665 665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
666static inline u64 global_rt_period(void) 678static inline u64 global_rt_period(void)
667{ 679{
668 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;