Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c | 227
1 file changed, 227 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+        int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+        if (p->numa_scan_seq == seq)
+                return;
+        p->numa_scan_seq = seq;
+
+        /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+        struct task_struct *p = current;
+
+        if (!sched_feat_numa(NUMA))
+                return;
+
+        /* FIXME: Allocate task-specific structure for placement policy here */
+
+        /*
+         * If pages are properly placed (did not migrate) then scan slower.
+         * This is reset periodically in case of phase changes
+         */
+        if (!migrated)
+                p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+                        p->numa_scan_period + jiffies_to_msecs(10));
+
+        task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+        ACCESS_ONCE(p->mm->numa_scan_seq)++;
+        p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+        unsigned long migrate, next_scan, now = jiffies;
+        struct task_struct *p = current;
+        struct mm_struct *mm = p->mm;
+        struct vm_area_struct *vma;
+        unsigned long start, end;
+        long pages;
+
+        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+        work->next = work; /* protect against double add */
+        /*
+         * Who cares about NUMA placement when they're dying.
+         *
+         * NOTE: make sure not to dereference p->mm before this check,
+         * exit_task_work() happens _after_ exit_mm() so we could be called
+         * without p->mm even though we still had it when we enqueued this
+         * work.
+         */
+        if (p->flags & PF_EXITING)
+                return;
+
+        /*
+         * We do not care about task placement until a task runs on a node
+         * other than the first one used by the address space. This is
+         * largely because migrations are driven by what CPU the task
+         * is running on. If it's never scheduled on another node, it'll
+         * not migrate so why bother trapping the fault.
+         */
+        if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+                mm->first_nid = numa_node_id();
+        if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+                /* Are we running on a new node yet? */
+                if (numa_node_id() == mm->first_nid &&
+                    !sched_feat_numa(NUMA_FORCE))
+                        return;
+
+                mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+        }
+
+        /*
+         * Reset the scan period if enough time has gone by. Objective is that
+         * scanning will be reduced if pages are properly placed. As tasks
+         * can enter different phases this needs to be re-examined. Lacking
+         * proper tracking of reference behaviour, this blunt hammer is used.
+         */
+        migrate = mm->numa_next_reset;
+        if (time_after(now, migrate)) {
+                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+                next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+                xchg(&mm->numa_next_reset, next_scan);
+        }
+
+        /*
+         * Enforce maximal scan/migration frequency..
+         */
+        migrate = mm->numa_next_scan;
+        if (time_before(now, migrate))
+                return;
+
+        if (p->numa_scan_period == 0)
+                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+                return;
+
+        /*
+         * Do not set pte_numa if the current running node is rate-limited.
+         * This loses statistics on the fault but if we are unwilling to
+         * migrate to this node, it is less likely we can do useful work
+         */
+        if (migrate_ratelimited(numa_node_id()))
+                return;
+
+        start = mm->numa_scan_offset;
+        pages = sysctl_numa_balancing_scan_size;
+        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+        if (!pages)
+                return;
+
+        down_read(&mm->mmap_sem);
+        vma = find_vma(mm, start);
+        if (!vma) {
+                reset_ptenuma_scan(p);
+                start = 0;
+                vma = mm->mmap;
+        }
+        for (; vma; vma = vma->vm_next) {
+                if (!vma_migratable(vma))
+                        continue;
+
+                /* Skip small VMAs. They are not likely to be of relevance */
+                if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+                        continue;
+
+                do {
+                        start = max(start, vma->vm_start);
+                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+                        end = min(end, vma->vm_end);
+                        pages -= change_prot_numa(vma, start, end);
+
+                        start = end;
+                        if (pages <= 0)
+                                goto out;
+                } while (end != vma->vm_end);
+        }
+
+out:
+        /*
+         * It is possible to reach the end of the VMA list but the last few VMAs are
+         * not guaranteed to the vma_migratable. If they are not, we would find the
+         * !migratable VMA on the next scan but not reset the scanner to the start
+         * so check it now.
+         */
+        if (vma)
+                mm->numa_scan_offset = start;
+        else
+                reset_ptenuma_scan(p);
+        up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+        struct callback_head *work = &curr->numa_work;
+        u64 period, now;
+
+        /*
+         * We don't care about NUMA placement if we don't have memory.
+         */
+        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+                return;
+
+        /*
+         * Using runtime rather than walltime has the dual advantage that
+         * we (mostly) drive the selection from busy threads and that the
+         * task needs to have done some actual work before we bother with
+         * NUMA placement.
+         */
+        now = curr->se.sum_exec_runtime;
+        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+        if (now - curr->node_stamp > period) {
+                if (!curr->node_stamp)
+                        curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+                curr->node_stamp = now;
+
+                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                        task_work_add(curr, work, true);
+                }
+        }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
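The scan loop in task_numa_work() above converts sysctl_numa_balancing_scan_size from MB into base pages (pages <<= 20 - PAGE_SHIFT) and then walks each VMA in windows whose end is rounded up to HPAGE_SIZE and clamped to the VMA boundary, deducting what was marked from the page budget. The following is a minimal standalone userspace sketch of that window arithmetic, not kernel code: PAGE_SHIFT, HPAGE_SIZE, the VMA addresses and the assumption that every page in a window is counted against the budget are illustrative choices (the kernel only subtracts what change_prot_numa() actually updated).

/*
 * Userspace model of the scan-window arithmetic in task_numa_work().
 * Assumes 4 KiB base pages and 2 MiB huge pages; the VMA below is invented.
 */
#include <stdio.h>

#define PAGE_SHIFT      12UL                    /* assumed: 4 KiB base pages */
#define HPAGE_SIZE      (2UL << 20)             /* assumed: 2 MiB huge pages */
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long scan_size_mb = 256;       /* sysctl_numa_balancing_scan_size */
        long pages = scan_size_mb << (20 - PAGE_SHIFT);         /* MB -> base pages */

        /* One illustrative VMA; the addresses are made up. */
        unsigned long vma_start = 0x700000000000UL;
        unsigned long vma_end   = vma_start + (512UL << 20);    /* 512 MiB VMA */
        unsigned long start = vma_start, end;

        printf("scan budget: %ld pages (%lu MB)\n", pages, scan_size_mb);

        do {
                start = MAX(start, vma_start);
                end = ALIGN(start + ((unsigned long)pages << PAGE_SHIFT), HPAGE_SIZE);
                end = MIN(end, vma_end);
                printf("window: %#lx-%#lx (%lu pages)\n",
                       start, end, (end - start) >> PAGE_SHIFT);
                /* Model: count the whole window against the budget. */
                pages -= (long)((end - start) >> PAGE_SHIFT);
                start = end;
        } while (pages > 0 && end != vma_end);

        return 0;
}

With a 256 MB budget and 4 KiB pages the budget is 65536 pages, so the 512 MiB VMA is covered by a single 256 MiB window and the remainder is left for the next pass, which resumes from mm->numa_scan_offset.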
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                entity_tick(cfs_rq, se, queued);
        }
 
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
+
        update_rq_runnable_avg(rq, 1);
 }
 
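The scan rate itself is adaptive: each fault on a page that did not need migrating bumps p->numa_scan_period by jiffies_to_msecs(10), capped at sysctl_numa_balancing_scan_period_max, and task_numa_work() knocks the period back to the minimum once sysctl_numa_balancing_scan_period_reset has elapsed so that phase changes are picked up again. Below is a rough standalone model of that behaviour, assuming HZ=1000 (so the bump is 10 ms) and an invented rate of five well-placed faults per scan; the accumulated scan periods stand in crudely for wall-clock time between scans.

/*
 * Userspace sketch of how p->numa_scan_period evolves. The fault rate and
 * the time model are invented purely to show the linear back-off toward
 * scan_period_max and the periodic reset back to scan_period_min.
 */
#include <stdio.h>

#define SCAN_PERIOD_MIN         100U            /* ms */
#define SCAN_PERIOD_MAX         (100U * 50)     /* 5000 ms */
#define SCAN_PERIOD_RESET       (100U * 600)    /* 60000 ms */

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned int scan_period = SCAN_PERIOD_MIN;
        unsigned long now_ms = 0, next_reset_ms = SCAN_PERIOD_RESET;
        int scan, fault;

        for (scan = 0; scan < 200; scan++) {
                /* Blunt-hammer reset, as in task_numa_work(). */
                if (now_ms >= next_reset_ms) {
                        scan_period = SCAN_PERIOD_MIN;
                        next_reset_ms = now_ms + SCAN_PERIOD_RESET;
                        printf("t=%6lums scan %3d: reset to %u ms\n",
                               now_ms, scan, scan_period);
                }

                /* Each well-placed (non-migrated) fault backs the scan rate off. */
                for (fault = 0; fault < 5; fault++)
                        scan_period = min_u(SCAN_PERIOD_MAX, scan_period + 10);

                if (scan % 20 == 0)
                        printf("t=%6lums scan %3d: scan_period = %u ms\n",
                               now_ms, scan, scan_period);

                now_ms += scan_period;  /* next scan is one period away */
        }
        return 0;
}

The exact numbers are not meaningful; the point is only that the period climbs toward the 5000 ms cap while placement looks good and periodically snaps back to 100 ms so a new execution phase gets re-sampled.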