author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2012-10-25 08:16:43 -0400
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:45 -0500
commit	cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 (patch)
tree	d4cfbcfa3e89742216cd792d4aa914356406b532
parent	a720094ded8cbb303111035be91858011d2eac71 (diff)
mm: numa: Add fault driven placement and migration
NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and
migration policy" but as it throws away all the policy to just leave a basic
foundation I had to drop the signed-offs-by.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler that when faulted later will be faulted onto the
node the CPU is running on.  In itself this does nothing useful but any
placement policy will fundamentally depend on receiving hints on placement
from fault context and doing something intelligent about it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
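For orientation, the scan-then-fault cycle the patch establishes can be
sketched as a tiny user-space model: a periodic "scan" rate-limits itself and
marks the address space (standing in for setting PTEs pte_numa), and a later
access produces a hinting fault that reports which node the task touched the
page from.  This is only an illustrative sketch with stand-in types and
invented helper names (struct mm, struct task, scan_address_space,
numa_hinting_fault); the real code is in kernel/sched/fair.c and mm/memory.c
in the diff below.

/* Illustrative user-space model of the scan -> fault -> hint flow.
 * Not kernel code; all types and helpers here are stand-ins. */
#include <stdio.h>

#define SCAN_PERIOD_MIN_MS 5000	/* mirrors sysctl_numa_balancing_scan_period_min */

struct mm { unsigned long numa_next_scan; int numa_scan_seq; };
struct task { struct mm *mm; int numa_scan_seq; unsigned int scan_period_ms; };

/* Loosely models task_numa_work(): rate-limited scan that marks the
 * address space so that later accesses fault. */
static void scan_address_space(struct task *p, unsigned long now_ms)
{
	if (now_ms < p->mm->numa_next_scan)
		return;				/* enforce maximal scan frequency */
	p->mm->numa_next_scan = now_ms + 2 * p->scan_period_ms;
	p->mm->numa_scan_seq++;
	printf("mark VMAs pte_numa (scan seq %d)\n", p->mm->numa_scan_seq);
}

/* Loosely models task_numa_fault()/task_numa_placement(): the fault path
 * reports the node and the placement hook runs once per scan pass. */
static void numa_hinting_fault(struct task *p, int node, int pages)
{
	if (p->numa_scan_seq == p->mm->numa_scan_seq)
		return;				/* this scan pass already sampled */
	p->numa_scan_seq = p->mm->numa_scan_seq;
	printf("placement hint: %d page(s) touched on node %d\n", pages, node);
}

int main(void)
{
	struct mm mm = { 0, 0 };
	struct task t = { &mm, 0, SCAN_PERIOD_MIN_MS };

	scan_address_space(&t, 0);	/* tick: mark PTEs */
	numa_hinting_fault(&t, 1, 1);	/* later access: fault reports node */
	return 0;
}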
-rw-r--r--	arch/sh/mm/Kconfig	1
-rw-r--r--	arch/x86/Kconfig	2
-rw-r--r--	include/linux/mm_types.h	11
-rw-r--r--	include/linux/sched.h	20
-rw-r--r--	kernel/sched/core.c	13
-rw-r--r--	kernel/sched/fair.c	125
-rw-r--r--	kernel/sched/features.h	7
-rw-r--r--	kernel/sched/sched.h	6
-rw-r--r--	kernel/sysctl.c	24
-rw-r--r--	mm/huge_memory.c	5
-rw-r--r--	mm/memory.c	14
11 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
 	bool "Non Uniform Memory Access (NUMA) Support"
 	depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+	select ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	default n
 	help
 	  Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced2..1137028fc6d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..ed8638c29b3e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -398,6 +398,17 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will be marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..844af5b12cb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f..ee8133794a56 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	split_huge_page(page);
 	put_page(page);
+
 	return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	if (page)
+	if (page) {
 		put_page(page);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+	}
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e10..8012c1907895 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	int current_nid, target_nid;
+	int current_nid = -1;
+	int target_nid;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	current_nid = target_nid;
 
 out:
+	task_numa_fault(current_nid, 1);
 	return 0;
 }
 
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
+		int curr_nid;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page))
 			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+		pte_unmap_unlock(pte, ptl);
+
+		curr_nid = page_to_nid(page);
+		task_numa_fault(curr_nid, 1);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
 	pte_unmap_unlock(orig_pte, ptl);
 