author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2012-10-25 08:16:43 -0400
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:45 -0500
commit	cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 (patch)
tree	d4cfbcfa3e89742216cd792d4aa914356406b532
parent	a720094ded8cbb303111035be91858011d2eac71 (diff)
mm: numa: Add fault driven placement and migration
NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and
migration policy" but as it throws away all the policy to just leave a basic
foundation I had to drop the signed-offs-by.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler that when faulted later will be faulted onto the
node the CPU is running on.  In itself this does nothing useful but any
placement policy will fundamentally depend on receiving hints on placement
from fault context and doing something intelligent about it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
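For orientation, the scan-then-fault cycle the patch establishes can be
sketched as a tiny user-space model: a periodic "scan" rate-limits itself and
marks the address space (standing in for setting PTEs pte_numa), and a later
access produces a hinting fault that reports which node the task touched the
page from.  This is only an illustrative sketch with stand-in types and
invented helper names (struct mm, struct task, scan_address_space,
numa_hinting_fault); the real code is in kernel/sched/fair.c and mm/memory.c
in the diff below.

/* Illustrative user-space model of the scan -> fault -> hint flow.
 * Not kernel code; all types and helpers here are stand-ins. */
#include <stdio.h>

#define SCAN_PERIOD_MIN_MS 5000	/* mirrors sysctl_numa_balancing_scan_period_min */

struct mm { unsigned long numa_next_scan; int numa_scan_seq; };
struct task { struct mm *mm; int numa_scan_seq; unsigned int scan_period_ms; };

/* Loosely models task_numa_work(): rate-limited scan that marks the
 * address space so that later accesses fault. */
static void scan_address_space(struct task *p, unsigned long now_ms)
{
	if (now_ms < p->mm->numa_next_scan)
		return;				/* enforce maximal scan frequency */
	p->mm->numa_next_scan = now_ms + 2 * p->scan_period_ms;
	p->mm->numa_scan_seq++;
	printf("mark VMAs pte_numa (scan seq %d)\n", p->mm->numa_scan_seq);
}

/* Loosely models task_numa_fault()/task_numa_placement(): the fault path
 * reports the node and the placement hook runs once per scan pass. */
static void numa_hinting_fault(struct task *p, int node, int pages)
{
	if (p->numa_scan_seq == p->mm->numa_scan_seq)
		return;				/* this scan pass already sampled */
	p->numa_scan_seq = p->mm->numa_scan_seq;
	printf("placement hint: %d page(s) touched on node %d\n", pages, node);
}

int main(void)
{
	struct mm mm = { 0, 0 };
	struct task t = { &mm, 0, SCAN_PERIOD_MIN_MS };

	scan_address_space(&t, 0);	/* tick: mark PTEs */
	numa_hinting_fault(&t, 1, 1);	/* later access: fault reports node */
	return 0;
}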
-rw-r--r--	arch/sh/mm/Kconfig	1
-rw-r--r--	arch/x86/Kconfig	2
-rw-r--r--	include/linux/mm_types.h	11
-rw-r--r--	include/linux/sched.h	20
-rw-r--r--	kernel/sched/core.c	13
-rw-r--r--	kernel/sched/fair.c	125
-rw-r--r--	kernel/sched/features.h	7
-rw-r--r--	kernel/sched/sched.h	6
-rw-r--r--	kernel/sysctl.c	24
-rw-r--r--	mm/huge_memory.c	5
-rw-r--r--	mm/memory.c	14
11 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
 	bool "Non Uniform Memory Access (NUMA) Support"
 	depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+	select ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	default n
 	help
 	  Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced2..1137028fc6d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..ed8638c29b3e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -398,6 +398,17 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will be marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..844af5b12cb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f..ee8133794a56 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	split_huge_page(page);
 	put_page(page);
+
 	return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	if (page)
+	if (page) {
 		put_page(page);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+	}
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e10..8012c1907895 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	int current_nid, target_nid;
+	int current_nid = -1;
+	int target_nid;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	current_nid = target_nid;
 
 out:
+	task_numa_fault(current_nid, 1);
 	return 0;
 }
 
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		pte_t pteval = *pte;
 		struct page *page;
+		int curr_nid;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page))
 			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+		pte_unmap_unlock(pte, ptl);
+
+		curr_nid = page_to_nid(page);
+		task_numa_fault(curr_nid, 1);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
 	pte_unmap_unlock(orig_pte, ptl);
 