 arch/sh/mm/Kconfig        |   1 +
 arch/x86/Kconfig          |   2 +
 include/linux/mm_types.h  |  11 +
 include/linux/sched.h     |  20 +
 kernel/sched/core.c       |  13 +
 kernel/sched/fair.c       | 125 +
 kernel/sched/features.h   |   7 +
 kernel/sched/sched.h      |   6 +
 kernel/sysctl.c           |  24 +-
 mm/huge_memory.c          |   5 +-
 mm/memory.c               |  14 +-
 11 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
 	bool "Non Uniform Memory Access (NUMA) Support"
 	depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+	select ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	default n
 	help
 	  Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced2..1137028fc6d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..ed8638c29b3e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -398,6 +398,17 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will be marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
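These two fields pair with the scanner added to kernel/sched/fair.c further down: numa_next_scan is advanced with cmpxchg() so that only one thread of an mm wins the right to start the next scan pass, and numa_scan_seq lets each task notice that a new pass has happened. A stand-alone model of that single-winner handshake, using GCC/Clang __atomic builtins in place of the kernel's cmpxchg() (names here are illustrative, not kernel APIs):

#include <stdio.h>

/* Model of mm->numa_next_scan: only one caller wins the right to scan. */
static unsigned long numa_next_scan;	/* "jiffies" of the next allowed scan */

static int try_start_scan(unsigned long now, unsigned long period)
{
	unsigned long migrate = __atomic_load_n(&numa_next_scan, __ATOMIC_RELAXED);

	if (now < migrate)		/* the kernel uses time_before() here */
		return 0;
	/* Race several threads here: only the one whose CAS succeeds scans. */
	return __atomic_compare_exchange_n(&numa_next_scan, &migrate,
					   now + period, 0,
					   __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}

int main(void)
{
	printf("%d\n", try_start_scan(100, 50));	/* 1: wins, window moves to 150 */
	printf("%d\n", try_start_scan(100, 50));	/* 0: still inside the window */
	return 0;
}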
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..844af5b12cb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
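The fork-time seeding above copies mm->numa_scan_seq into the task so that task_numa_placement() stays a no-op until task_numa_work() has actually advanced the mm-wide sequence; numa_migrate_seq is set one generation behind but is not consumed yet in this patch. A stand-alone model of that sequence gate (hypothetical struct and function names, not kernel code):

#include <stdio.h>

struct mm_model   { int numa_scan_seq; };
struct task_model { int numa_scan_seq; int numa_migrate_seq; };

/* Mirrors the seq check in task_numa_placement(). */
static void model_placement(struct task_model *p, struct mm_model *mm)
{
	int seq = mm->numa_scan_seq;

	if (p->numa_scan_seq == seq)
		return;				/* nothing new since the last pass */
	p->numa_scan_seq = seq;
	printf("placement pass for seq %d\n", seq);
}

int main(void)
{
	struct mm_model mm = { .numa_scan_seq = 3 };
	/* mirrors __sched_fork(): scan_seq equals the mm's, migrate_seq one behind */
	struct task_model p = {
		.numa_scan_seq    = mm.numa_scan_seq,
		.numa_migrate_seq = mm.numa_scan_seq - 1,
	};

	model_placement(&p, &mm);	/* no-op: sequence unchanged since fork */
	mm.numa_scan_seq++;		/* task_numa_work() finished a scan pass */
	model_placement(&p, &mm);	/* runs: a new scan generation is visible */
	return 0;
}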
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
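task_tick_numa() throttles on task CPU time rather than wall-clock time: node_stamp records sum_exec_runtime (nanoseconds) at the last trigger and numa_scan_period is in milliseconds, hence the NSEC_PER_MSEC conversion; the wall-clock gate against mm->numa_next_scan is checked separately. A stand-alone sketch of the runtime arithmetic, with plain integers standing in for the scheduler state (function and variable names are illustrative):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

/*
 * Model of the check in task_tick_numa(): trigger a scan once the task has
 * accumulated numa_scan_period milliseconds of *runtime* since the last one.
 */
static bool scan_due(uint64_t sum_exec_runtime_ns, uint64_t *node_stamp_ns,
		     unsigned int scan_period_ms)
{
	uint64_t period_ns = (uint64_t)scan_period_ms * NSEC_PER_MSEC;

	if (sum_exec_runtime_ns - *node_stamp_ns > period_ns) {
		*node_stamp_ns = sum_exec_runtime_ns;	/* start a new window */
		return true;
	}
	return false;
}

int main(void)
{
	uint64_t stamp = 0;

	/* 5000 ms default period: 3 s of accumulated runtime is not enough ... */
	printf("%d\n", scan_due(3000ULL * NSEC_PER_MSEC, &stamp, 5000));
	/* ... but 6 s of accumulated runtime is. */
	printf("%d\n", scan_due(6000ULL * NSEC_PER_MSEC, &stamp, 5000));
	return 0;
}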
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
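On kernels built with CONFIG_SCHED_DEBUG the new bit can be flipped at run time through the sched_features debugfs file, conventionally /sys/kernel/debug/sched_features when debugfs is mounted there (the path is an assumption about the running system, not part of this patch). A userspace sketch:

#include <stdio.h>

/* Assumed path: debugfs mounted at /sys/kernel/debug. */
#define SCHED_FEATURES "/sys/kernel/debug/sched_features"

static int write_feat(const char *tok)
{
	FILE *f = fopen(SCHED_FEATURES, "w");

	if (!f) {
		perror(SCHED_FEATURES);
		return -1;
	}
	fputs(tok, f);		/* "NO_NUMA" clears the bit, "NUMA" sets it */
	return fclose(f);
}

int main(void)
{
	write_feat("NO_NUMA");	/* sched_feat_numa(NUMA) now evaluates to false */
	write_feat("NUMA");	/* restore the default from features.h */
	return 0;
}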
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
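The (0) fallback matters because sched_feat_numa() sits in the scheduler tick path: without CONFIG_NUMA_BALANCING the test is a compile-time constant and the task_tick_numa() call is dropped by the compiler. A minimal stand-alone illustration of that constant-folding pattern (not kernel code):

#include <stdio.h>

/* Stand-in for the !CONFIG_NUMA_BALANCING case: the feature test is a constant. */
#define sched_feat_numa(x) (0)

static void task_tick_numa_model(void)
{
	puts("numa tick work");		/* never reached; typically compiled out */
}

int main(void)
{
	if (sched_feat_numa(NUMA))	/* expands to if (0) */
		task_tick_numa_model();
	return 0;
}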
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
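Because the new table entries sit inside the existing CONFIG_SCHED_DEBUG block, the knobs appear as /proc/sys/kernel/numa_balancing_scan_period_min_ms and numa_balancing_scan_period_max_ms only on kernels built with that option. A small userspace sketch, not part of the patch, that reads the minimum period and doubles it (writing needs root):

#include <stdio.h>

#define SCAN_MIN "/proc/sys/kernel/numa_balancing_scan_period_min_ms"

int main(void)
{
	unsigned int ms;
	FILE *f = fopen(SCAN_MIN, "r");

	if (!f || fscanf(f, "%u", &ms) != 1) {
		perror(SCAN_MIN);
		return 1;
	}
	fclose(f);
	printf("scan_period_min = %u ms\n", ms);

	f = fopen(SCAN_MIN, "w");	/* needs root / CAP_SYS_ADMIN */
	if (!f) {
		perror(SCAN_MIN);
		return 1;
	}
	fprintf(f, "%u\n", ms * 2);	/* scan at most half as often */
	fclose(f);
	return 0;
}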
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f..ee8133794a56 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	split_huge_page(page);
 	put_page(page);
+
 	return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	if (page)
+	if (page) {
 		put_page(page);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+	}
 	return 0;
 }
 
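The huge-page path accounts one hinting fault as HPAGE_PMD_NR base pages, since a single PMD maps that many PTE-sized pages. A quick arithmetic check under common x86-64 assumptions (4 KiB base pages, 2 MiB PMD-sized transparent huge pages), which yields 512:

#include <stdio.h>

int main(void)
{
	/* Assumes x86-64 defaults: 4 KiB base pages, 2 MiB transparent huge pages. */
	unsigned long page_size = 4096;
	unsigned long hpage_pmd_size = 2UL << 20;

	/* HPAGE_PMD_NR: how many base-page faults one THP fault is worth. */
	printf("HPAGE_PMD_NR = %lu\n", hpage_pmd_size / page_size);
	return 0;
}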
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e10..8012c1907895 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	int current_nid, target_nid;
+	int current_nid = -1;
+	int target_nid;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	current_nid = target_nid;
 
 out:
+	task_numa_fault(current_nid, 1);
 	return 0;
 }
 
3506 3508
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
+		int curr_nid;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page))
 			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+		pte_unmap_unlock(pte, ptl);
+
+		curr_nid = page_to_nid(page);
+		task_numa_fault(curr_nid, 1);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
 	pte_unmap_unlock(orig_pte, ptl);
 