 arch/sh/mm/Kconfig        |   1 +
 arch/x86/Kconfig          |   2 +
 include/linux/mm_types.h  |  11 +
 include/linux/sched.h     |  20 +
 kernel/sched/core.c       |  13 +
 kernel/sched/fair.c       | 125 +
 kernel/sched/features.h   |   7 +
 kernel/sched/sched.h      |   6 +
 kernel/sysctl.c           |  24 +-
 mm/huge_memory.c          |   5 +-
 mm/memory.c               |  14 +-
 11 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
 	bool "Non Uniform Memory Access (NUMA) Support"
 	depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+	select ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	default n
 	help
 	  Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced2..1137028fc6d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..ed8638c29b3e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -398,6 +398,17 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will be marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
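These two fields pair with the scanner added to kernel/sched/fair.c further down: numa_next_scan is advanced with cmpxchg() so that only one thread of an mm wins the right to start the next scan pass, and numa_scan_seq lets each task notice that a new pass has happened. A stand-alone model of that single-winner handshake, using GCC/Clang __atomic builtins in place of the kernel's cmpxchg() (names here are illustrative, not kernel APIs):

#include <stdio.h>

/* Model of mm->numa_next_scan: only one caller wins the right to scan. */
static unsigned long numa_next_scan;	/* "jiffies" of the next allowed scan */

static int try_start_scan(unsigned long now, unsigned long period)
{
	unsigned long migrate = __atomic_load_n(&numa_next_scan, __ATOMIC_RELAXED);

	if (now < migrate)		/* the kernel uses time_before() here */
		return 0;
	/* Race several threads here: only the one whose CAS succeeds scans. */
	return __atomic_compare_exchange_n(&numa_next_scan, &migrate,
					   now + period, 0,
					   __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}

int main(void)
{
	printf("%d\n", try_start_scan(100, 50));	/* 1: wins, window moves to 150 */
	printf("%d\n", try_start_scan(100, 50));	/* 0: still inside the window */
	return 0;
}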
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..844af5b12cb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
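The fork-time seeding above copies mm->numa_scan_seq into the task so that task_numa_placement() stays a no-op until task_numa_work() has actually advanced the mm-wide sequence; numa_migrate_seq is set one generation behind but is not consumed yet in this patch. A stand-alone model of that sequence gate (hypothetical struct and function names, not kernel code):

#include <stdio.h>

struct mm_model   { int numa_scan_seq; };
struct task_model { int numa_scan_seq; int numa_migrate_seq; };

/* Mirrors the seq check in task_numa_placement(). */
static void model_placement(struct task_model *p, struct mm_model *mm)
{
	int seq = mm->numa_scan_seq;

	if (p->numa_scan_seq == seq)
		return;				/* nothing new since the last pass */
	p->numa_scan_seq = seq;
	printf("placement pass for seq %d\n", seq);
}

int main(void)
{
	struct mm_model mm = { .numa_scan_seq = 3 };
	/* mirrors __sched_fork(): scan_seq equals the mm's, migrate_seq one behind */
	struct task_model p = {
		.numa_scan_seq    = mm.numa_scan_seq,
		.numa_migrate_seq = mm.numa_scan_seq - 1,
	};

	model_placement(&p, &mm);	/* no-op: sequence unchanged since fork */
	mm.numa_scan_seq++;		/* task_numa_work() finished a scan pass */
	model_placement(&p, &mm);	/* runs: a new scan generation is visible */
	return 0;
}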
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
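task_tick_numa() throttles on task CPU time rather than wall-clock time: node_stamp records sum_exec_runtime (nanoseconds) at the last trigger and numa_scan_period is in milliseconds, hence the NSEC_PER_MSEC conversion; the wall-clock gate against mm->numa_next_scan is checked separately. A stand-alone sketch of the runtime arithmetic, with plain integers standing in for the scheduler state (function and variable names are illustrative):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

/*
 * Model of the check in task_tick_numa(): trigger a scan once the task has
 * accumulated numa_scan_period milliseconds of *runtime* since the last one.
 */
static bool scan_due(uint64_t sum_exec_runtime_ns, uint64_t *node_stamp_ns,
		     unsigned int scan_period_ms)
{
	uint64_t period_ns = (uint64_t)scan_period_ms * NSEC_PER_MSEC;

	if (sum_exec_runtime_ns - *node_stamp_ns > period_ns) {
		*node_stamp_ns = sum_exec_runtime_ns;	/* start a new window */
		return true;
	}
	return false;
}

int main(void)
{
	uint64_t stamp = 0;

	/* 5000 ms default period: 3 s of accumulated runtime is not enough ... */
	printf("%d\n", scan_due(3000ULL * NSEC_PER_MSEC, &stamp, 5000));
	/* ... but 6 s of accumulated runtime is. */
	printf("%d\n", scan_due(6000ULL * NSEC_PER_MSEC, &stamp, 5000));
	return 0;
}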
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
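On kernels built with CONFIG_SCHED_DEBUG the new bit can be flipped at run time through the sched_features debugfs file, conventionally /sys/kernel/debug/sched_features when debugfs is mounted there (the path is an assumption about the running system, not part of this patch). A userspace sketch:

#include <stdio.h>

/* Assumed path: debugfs mounted at /sys/kernel/debug. */
#define SCHED_FEATURES "/sys/kernel/debug/sched_features"

static int write_feat(const char *tok)
{
	FILE *f = fopen(SCHED_FEATURES, "w");

	if (!f) {
		perror(SCHED_FEATURES);
		return -1;
	}
	fputs(tok, f);		/* "NO_NUMA" clears the bit, "NUMA" sets it */
	return fclose(f);
}

int main(void)
{
	write_feat("NO_NUMA");	/* sched_feat_numa(NUMA) now evaluates to false */
	write_feat("NUMA");	/* restore the default from features.h */
	return 0;
}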
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
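The (0) fallback matters because sched_feat_numa() sits in the scheduler tick path: without CONFIG_NUMA_BALANCING the test is a compile-time constant and the task_tick_numa() call is dropped by the compiler. A minimal stand-alone illustration of that constant-folding pattern (not kernel code):

#include <stdio.h>

/* Stand-in for the !CONFIG_NUMA_BALANCING case: the feature test is a constant. */
#define sched_feat_numa(x) (0)

static void task_tick_numa_model(void)
{
	puts("numa tick work");		/* never reached; typically compiled out */
}

int main(void)
{
	if (sched_feat_numa(NUMA))	/* expands to if (0) */
		task_tick_numa_model();
	return 0;
}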
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
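Because the new table entries sit inside the existing CONFIG_SCHED_DEBUG block, the knobs appear as /proc/sys/kernel/numa_balancing_scan_period_min_ms and numa_balancing_scan_period_max_ms only on kernels built with that option. A small userspace sketch, not part of the patch, that reads the minimum period and doubles it (writing needs root):

#include <stdio.h>

#define SCAN_MIN "/proc/sys/kernel/numa_balancing_scan_period_min_ms"

int main(void)
{
	unsigned int ms;
	FILE *f = fopen(SCAN_MIN, "r");

	if (!f || fscanf(f, "%u", &ms) != 1) {
		perror(SCAN_MIN);
		return 1;
	}
	fclose(f);
	printf("scan_period_min = %u ms\n", ms);

	f = fopen(SCAN_MIN, "w");	/* needs root / CAP_SYS_ADMIN */
	if (!f) {
		perror(SCAN_MIN);
		return 1;
	}
	fprintf(f, "%u\n", ms * 2);	/* scan at most half as often */
	fclose(f);
	return 0;
}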
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f..ee8133794a56 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	split_huge_page(page);
 	put_page(page);
+
 	return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	if (page)
+	if (page) {
 		put_page(page);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+	}
 	return 0;
 }
 
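The huge-page path accounts one hinting fault as HPAGE_PMD_NR base pages, since a single PMD maps that many PTE-sized pages. A quick arithmetic check under common x86-64 assumptions (4 KiB base pages, 2 MiB PMD-sized transparent huge pages), which yields 512:

#include <stdio.h>

int main(void)
{
	/* Assumes x86-64 defaults: 4 KiB base pages, 2 MiB transparent huge pages. */
	unsigned long page_size = 4096;
	unsigned long hpage_pmd_size = 2UL << 20;

	/* HPAGE_PMD_NR: how many base-page faults one THP fault is worth. */
	printf("HPAGE_PMD_NR = %lu\n", hpage_pmd_size / page_size);
	return 0;
}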
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e10..8012c1907895 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	int current_nid, target_nid;
+	int current_nid = -1;
+	int target_nid;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	current_nid = target_nid;
 
 out:
+	task_numa_fault(current_nid, 1);
 	return 0;
 }
 
3506 3508
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
+		int curr_nid;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page))
 			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+		pte_unmap_unlock(pte, ptl);
+
+		curr_nid = page_to_nid(page);
+		task_numa_fault(curr_nid, 1);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
 	pte_unmap_unlock(orig_pte, ptl);
 