mm: numa: Add fault driven placement and migration

NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy", but as it throws away all the policy to leave just a basic foundation, I had to drop the signed-offs-by.

This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler; when such a PTE is faulted later, the page will be faulted onto the node the CPU is running on. In itself this does nothing useful, but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2012-10-25 08:16:43 -0400
committer: Mel Gorman <mgorman@suse.de> 2012-12-11 09:42:45 -0500
commit: cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 (patch)
tree: d4cfbcfa3e89742216cd792d4aa914356406b532 /kernel/sched
parent: a720094ded8cbb303111035be91858011d2eac71 (diff)
4 files changed, 151 insertions, 0 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+                p->mm->numa_next_scan = jiffies;
+                p->mm->numa_scan_seq = 0;
+        }
+        p->node_stamp = 0ULL;
+        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+        p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+        p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+        p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 #include <trace/events/sched.h>
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 * Scheduling class queueing methods:
 */
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ *
+ * The 5s above is the minimum period; the maximum is 16 times that
+ * (5000*16 ms = 80s). Both values are in milliseconds.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+static void task_numa_placement(struct task_struct *p)
+{
+        int seq = ACCESS_ONCE(p->mm->numa_scan_seq); /* snapshot of the mm's scan sequence number */
+        if (p->numa_scan_seq == seq) /* task has already seen this sequence number: nothing to do */
+                return;
+        p->numa_scan_seq = seq; /* record it so the code below runs once per sequence change */
+        /* FIXME: Scheduling placement policy hints go here */
+}
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ *
+ * Neither @node nor @pages is used yet: for now this only calls
+ * task_numa_placement() on the current task.
+ */
+void task_numa_fault(int node, int pages)
+{
+        struct task_struct *p = current;
+        /* FIXME: Allocate task-specific structure for placement policy here */
+        task_numa_placement(p);
+}
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ *
+ * Unless the task is exiting or mm->numa_next_scan has not been reached,
+ * this advances mm->numa_next_scan, increments mm->numa_scan_seq and calls
+ * change_prot_numa() over the full range of every migratable VMA of the
+ * current task's mm.
+ */
+void task_numa_work(struct callback_head *work)
+{
+        unsigned long migrate, next_scan, now = jiffies;
+        struct task_struct *p = current;
+        struct mm_struct *mm = p->mm; /* only loads the pointer; it is not dereferenced until after the PF_EXITING check */
+        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+        work->next = work; /* protect against double add */
+        /*
+         * Who cares about NUMA placement when they're dying.
+         *
+         * NOTE: make sure not to dereference p->mm before this check,
+         * exit_task_work() happens _after_ exit_mm() so we could be called
+         * without p->mm even though we still had it when we enqueued this
+         * work.
+         */
+        if (p->flags & PF_EXITING)
+                return;
+        /*
+         * Enforce maximal scan/migration frequency..
+         */
+        migrate = mm->numa_next_scan;
+        if (time_before(now, migrate))
+                return;
+        if (p->numa_scan_period == 0) /* fall back to the sysctl minimum if no period is set */
+                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+        next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period); /* next scan no sooner than two scan periods from now */
+        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) /* numa_next_scan changed under us: someone else advanced it */
+                return;
+        ACCESS_ONCE(mm->numa_scan_seq)++; /* task_numa_placement() compares against this sequence number */
+        {
+                struct vm_area_struct *vma;
+                down_read(&mm->mmap_sem); /* hold mmap_sem for read while walking the VMA list */
+                for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                        if (!vma_migratable(vma))
+                                continue;
+                        change_prot_numa(vma, vma->vm_start, vma->vm_end);
+                }
+                up_read(&mm->mmap_sem);
+        }
+}
+/*
+ * Drive the periodic memory faults..
+ *
+ * Once @curr has accumulated more than numa_scan_period milliseconds of
+ * runtime since node_stamp, and mm->numa_next_scan has been reached,
+ * queue task_numa_work() on @curr via task_work_add(). @rq is not used.
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+        struct callback_head *work = &curr->numa_work;
+        u64 period, now;
+        /*
+         * We don't care about NUMA placement if we don't have memory.
+         *
+         * Also bail out if the task is exiting or the work is not idle:
+         * work->next == work is the idle marker that __sched_fork() and
+         * task_numa_work() set.
+         */
+        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+                return;
+        /*
+         * Using runtime rather than walltime has the dual advantage that
+         * we (mostly) drive the selection from busy threads and that the
+         * task needs to have done some actual work before we bother with
+         * NUMA placement.
+         */
+        now = curr->se.sum_exec_runtime;
+        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; /* scan period converted from ms to ns */
+        if (now - curr->node_stamp > period) {
+                curr->node_stamp = now; /* restart the runtime window even if no work is queued below */
+                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                        task_work_add(curr, work, true);
+                }
+        }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{ /* empty stub for builds without CONFIG_NUMA_BALANCING */
+}
+#endif /* CONFIG_NUMA_BALANCING */
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
+        if (sched_feat_numa(NUMA))
+                task_tick_numa(rq, curr);
 }
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA,        true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x) /* defer to the regular feature test */
+#else
+#define sched_feat_numa(x) (0) /* constant false when NUMA balancing is not configured */
+#endif
 static inline u64 global_rt_period(void)
 {
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2012-10-25 08:16:43 -0400
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:45 -0500
commit	cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 (patch)
tree	d4cfbcfa3e89742216cd792d4aa914356406b532 /kernel/sched
parent	a720094ded8cbb303111035be91858011d2eac71 (diff)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
1533	#ifdef CONFIG_PREEMPT_NOTIFIERS	1533	#ifdef CONFIG_PREEMPT_NOTIFIERS
1534	INIT_HLIST_HEAD(&p->preempt_notifiers);	1534	INIT_HLIST_HEAD(&p->preempt_notifiers);
1535	#endif	1535	#endif
		1536
		1537	#ifdef CONFIG_NUMA_BALANCING
		1538	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		1539	p->mm->numa_next_scan = jiffies;
		1540	p->mm->numa_scan_seq = 0;
		1541	}
		1542
		1543	p->node_stamp = 0ULL;
		1544	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
		1545	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
		1546	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		1547	p->numa_work.next = &p->numa_work;
		1548	#endif /* CONFIG_NUMA_BALANCING */
1536	}	1549	}
1537		1550
1538	/*	1551	/*


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
26	#include <linux/slab.h>	26	#include <linux/slab.h>
27	#include <linux/profile.h>	27	#include <linux/profile.h>
28	#include <linux/interrupt.h>	28	#include <linux/interrupt.h>
		29	#include <linux/mempolicy.h>
		30	#include <linux/task_work.h>
29		31
30	#include <trace/events/sched.h>	32	#include <trace/events/sched.h>
31		33
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776	* Scheduling class queueing methods:	778	* Scheduling class queueing methods:
777	*/	779	*/
778		780
		781	#ifdef CONFIG_NUMA_BALANCING
		782	/*
		783	* numa task sample period in ms: 5s
		784	*/
		785	unsigned int sysctl_numa_balancing_scan_period_min = 5000;
		786	unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
		787
		788	static void task_numa_placement(struct task_struct *p)
		789	{
		790	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
		791
		792	if (p->numa_scan_seq == seq)
		793	return;
		794	p->numa_scan_seq = seq;
		795
		796	/* FIXME: Scheduling placement policy hints go here */
		797	}
		798
		799	/*
		800	* Got a PROT_NONE fault for a page on @node.
		801	*/
		802	void task_numa_fault(int node, int pages)
		803	{
		804	struct task_struct *p = current;
		805
		806	/* FIXME: Allocate task-specific structure for placement policy here */
		807
		808	task_numa_placement(p);
		809	}
		810
		811	/*
		812	* The expensive part of numa migration is done from task_work context.
		813	* Triggered from task_tick_numa().
		814	*/
		815	void task_numa_work(struct callback_head *work)
		816	{
		817	unsigned long migrate, next_scan, now = jiffies;
		818	struct task_struct *p = current;
		819	struct mm_struct *mm = p->mm;
		820
		821	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
		822
		823	work->next = work; /* protect against double add */
		824	/*
		825	* Who cares about NUMA placement when they're dying.
		826	*
		827	* NOTE: make sure not to dereference p->mm before this check,
		828	* exit_task_work() happens _after_ exit_mm() so we could be called
		829	* without p->mm even though we still had it when we enqueued this
		830	* work.
		831	*/
		832	if (p->flags & PF_EXITING)
		833	return;
		834
		835	/*
		836	* Enforce maximal scan/migration frequency..
		837	*/
		838	migrate = mm->numa_next_scan;
		839	if (time_before(now, migrate))
		840	return;
		841
		842	if (p->numa_scan_period == 0)
		843	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		844
		845	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
		846	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		847	return;
		848
		849	ACCESS_ONCE(mm->numa_scan_seq)++;
		850	{
		851	struct vm_area_struct *vma;
		852
		853	down_read(&mm->mmap_sem);
		854	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		855	if (!vma_migratable(vma))
		856	continue;
		857	change_prot_numa(vma, vma->vm_start, vma->vm_end);
		858	}
		859	up_read(&mm->mmap_sem);
		860	}
		861	}
		862
		863	/*
		864	* Drive the periodic memory faults..
		865	*/
		866	void task_tick_numa(struct rq *rq, struct task_struct *curr)
		867	{
		868	struct callback_head *work = &curr->numa_work;
		869	u64 period, now;
		870
		871	/*
		872	* We don't care about NUMA placement if we don't have memory.
		873	*/
		874	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		875	return;
		876
		877	/*
		878	* Using runtime rather than walltime has the dual advantage that
		879	* we (mostly) drive the selection from busy threads and that the
		880	* task needs to have done some actual work before we bother with
		881	* NUMA placement.
		882	*/
		883	now = curr->se.sum_exec_runtime;
		884	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
		885
		886	if (now - curr->node_stamp > period) {
		887	curr->node_stamp = now;
		888
		889	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
		890	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
		891	task_work_add(curr, work, true);
		892	}
		893	}
		894	}
		895	#else
		896	static void task_tick_numa(struct rq *rq, struct task_struct *curr)
		897	{
		898	}
		899	#endif /* CONFIG_NUMA_BALANCING */
		900
779	static void	901	static void
780	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)	902	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781	{	903	{
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954	cfs_rq = cfs_rq_of(se);	5076	cfs_rq = cfs_rq_of(se);
4955	entity_tick(cfs_rq, se, queued);	5077	entity_tick(cfs_rq, se, queued);
4956	}	5078	}
		5079
		5080	if (sched_feat_numa(NUMA))
		5081	task_tick_numa(rq, curr);
4957	}	5082	}
4958		5083
4959	/*	5084	/*


diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
61	SCHED_FEAT(FORCE_SD_OVERLAP, false)	61	SCHED_FEAT(FORCE_SD_OVERLAP, false)
62	SCHED_FEAT(RT_RUNTIME_SHARE, true)	62	SCHED_FEAT(RT_RUNTIME_SHARE, true)
63	SCHED_FEAT(LB_MIN, false)	63	SCHED_FEAT(LB_MIN, false)
		64
		65	/*
		66	* Apply the automatic NUMA scheduling policy
		67	*/
		68	#ifdef CONFIG_NUMA_BALANCING
		69	SCHED_FEAT(NUMA, true)
		70	#endif


diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648	#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))	648	#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649	#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */	649	#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650		650
		651	#ifdef CONFIG_NUMA_BALANCING
		652	#define sched_feat_numa(x) sched_feat(x)
		653	#else
		654	#define sched_feat_numa(x) (0)
		655	#endif
		656
651	static inline u64 global_rt_period(void)	657	static inline u64 global_rt_period(void)
652	{	658	{
653	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;	659	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;