Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c      |  13
-rw-r--r--  kernel/sched/fair.c      | 125
-rw-r--r--  kernel/sched/features.h  |   7
-rw-r--r--  kernel/sched/sched.h     |   6
-rw-r--r--  kernel/sysctl.c          |  24
5 files changed, 173 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
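
Note on the throttling in task_numa_work() above: threads sharing an mm all reach the deadline check, but only the one whose cmpxchg() on mm->numa_next_scan succeeds pushes the window forward and pays for the VMA walk. A minimal userspace sketch of that same "one winner per period" pattern, using C11 atomics; the names (next_scan, scan_period_ms) and the millisecond clock are illustrative only, not part of the patch:

#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Shared deadline, analogous to mm->numa_next_scan (illustrative only). */
static _Atomic unsigned long next_scan;
static const unsigned long scan_period_ms = 5000;	/* cf. scan_period_min */

static unsigned long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

/* Called from every thread; at most one caller per period returns true. */
static bool try_start_scan(void)
{
	unsigned long now = now_ms();
	unsigned long deadline = atomic_load(&next_scan);

	if (now < deadline)
		return false;	/* too early, mirrors the time_before() check */

	/* Only the thread whose CAS succeeds does the expensive work,
	 * mirroring the cmpxchg() on mm->numa_next_scan above. */
	return atomic_compare_exchange_strong(&next_scan, &deadline,
					      now + 2 * scan_period_ms);
}

int main(void)
{
	if (try_start_scan())
		printf("this thread would walk the VMA list now\n");
	return 0;
}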
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
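
The sched_feat_numa() wrapper above combines the runtime feature bit with a compile-time guard, so the call site in task_tick_fair() collapses to nothing when CONFIG_NUMA_BALANCING is off. A small standalone sketch of that feature-bit-plus-guard pattern, with hypothetical names (my_feat, MY_FEAT_NUMA, USE_NUMA) rather than the kernel's:

#include <stdio.h>

enum { MY_FEAT_NUMA, MY_FEAT_NR };

static unsigned long my_features = 1UL << MY_FEAT_NUMA;

/* Runtime check against a feature bitmask, like sched_feat(). */
#define my_feat(x) (my_features & (1UL << MY_FEAT_##x))

#ifdef USE_NUMA
#define my_feat_numa(x) my_feat(x)
#else
#define my_feat_numa(x) (0)	/* call sites become dead code and vanish */
#endif

int main(void)
{
	printf("feature mask: %#lx\n", my_features);
	if (my_feat_numa(NUMA))
		printf("NUMA feature enabled\n");
	else
		printf("NUMA feature compiled out or disabled\n");
	return 0;
}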
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
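
The two ctl_table entries added above export the scan period tunables. Assuming the usual mapping of kern_table under /proc/sys/kernel/, they should surface as numa_balancing_scan_period_min_ms and numa_balancing_scan_period_max_ms (and only when CONFIG_SCHED_DEBUG and CONFIG_NUMA_BALANCING are set). A small userspace sketch that reads them back; the file paths are assumed from that mapping, not stated in the patch:

#include <stdio.h>

static const char *min_path =
	"/proc/sys/kernel/numa_balancing_scan_period_min_ms";
static const char *max_path =
	"/proc/sys/kernel/numa_balancing_scan_period_max_ms";

/* Read a single unsigned integer from a procfs file. */
static int read_uint(const char *path, unsigned int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%u", val) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned int min_ms, max_ms;

	if (read_uint(min_path, &min_ms) == 0 &&
	    read_uint(max_path, &max_ms) == 0)
		printf("numa scan period: min=%u ms max=%u ms\n", min_ms, max_ms);
	else
		perror("reading numa_balancing sysctls");
	return 0;
}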