 Documentation/sysctl/kernel.txt | 10
 include/linux/sched.h           |  5
 kernel/sched/fair.c             |  8
 kernel/sysctl.c                 |  7
 mm/mempolicy.c                  | 48
 5 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 84f17800f8b5..4273b2d71a27 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
+numa_balancing_migrate_deferred.
 
 ==============================================================
 
@@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This
 gives the scheduler a chance to place the task on an alternative node if the
 preferred node is overloaded.
 
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is shared
+with other tasks. This reduces page migration overhead, and determines
+how much stronger the "move task near its memory" policy scheduler becomes,
+versus the "move memory near its task" memory management policy, for workloads
+with shared memory.
+
 ==============================================================
 
 osrelease, ostype & version:
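(Illustration, not part of the patch: like the other NUMA balancing knobs described in this document, the new value is exposed through procfs, so a kernel built with CONFIG_NUMA_BALANCING and carrying this change would show it as /proc/sys/kernel/numa_balancing_migrate_deferred. A minimal C sketch of reading the defer count and raising it at runtime; error handling is kept to the bare minimum and writing requires root.)

/* Sketch only: read the numa_balancing_migrate_deferred sysctl and double it,
 * biasing the balancer further towards "move task near its memory". */
#include <stdio.h>

#define DEFER_SYSCTL "/proc/sys/kernel/numa_balancing_migrate_deferred"

int main(void)
{
	unsigned int val;
	FILE *f = fopen(DEFER_SYSCTL, "r");

	if (!f || fscanf(f, "%u", &val) != 1) {
		perror(DEFER_SYSCTL);
		return 1;
	}
	fclose(f);
	printf("current defer count: %u\n", val);

	f = fopen(DEFER_SYSCTL, "w");	/* needs root */
	if (!f || fprintf(f, "%u\n", val * 2) < 0) {
		perror(DEFER_SYSCTL);
		return 1;
	}
	fclose(f);
	return 0;
}
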
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d24f70ffddee..833eed55cf43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1342,6 +1342,8 @@ struct task_struct {
 	int numa_scan_seq;
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
+	int numa_preferred_nid;
+	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
@@ -1372,7 +1374,6 @@ struct task_struct {
 	 */
 	unsigned long numa_faults_locality[2];
 
-	int numa_preferred_nid;
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8454c38b1b12..e7884dc3416d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e509b90a8002..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "numa_balancing_migrate_deferred",
+		.data		= &sysctl_numa_balancing_migrate_deferred,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2929c24c22b7..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	/* Never defer a private fault */
+	if (cpupid_match_pid(p, last_cpupid))
+		return false;
+
+	if (p->numa_migrate_deferred) {
+		p->numa_migrate_deferred--;
+		return true;
+	}
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * relation.
 		 */
 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+			/* See sysctl_numa_balancing_migrate_deferred comment */
+			if (!cpupid_match_pid(current, last_cpupid))
+				defer_numa_migrate(current);
+
+			goto out;
+		}
+
+		/*
+		 * The quadratic filter above reduces extraneous migration
+		 * of shared pages somewhat. This code reduces it even more,
+		 * reducing the overhead of page migrations of shared pages.
+		 * This makes workloads with shared pages rely more on
+		 * "move task near its memory", and less on "move memory
+		 * towards its task", which is exactly what we want.
+		 */
+		if (numa_migrate_deferred(current, last_cpupid))
 			goto out;
 	}
 
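(Illustration, not part of the patch: a small user-space model of how the new defer counter interacts with the existing two-stage cpupid filter in mpol_misplaced(). The kernel helpers are replaced with local stand-ins and the fault pattern is made up, so the numbers only show the shape of the behaviour: each time the two-stage filter skips a shared fault, the next sysctl_numa_balancing_migrate_deferred shared faults are skipped as well.)

/*
 * User-space model of the deferral logic added to mpol_misplaced().
 * "shared" stands in for !cpupid_match_pid() (the last fault came from
 * another task), "stale_node" for a last_cpupid that points at a
 * different node than the current one.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int sysctl_numa_balancing_migrate_deferred = 16;

struct task {
	int numa_migrate_deferred;	/* mirrors the new task_struct field */
};

/* Arm the skip counter, as defer_numa_migrate() does in the patch. */
static void defer_numa_migrate(struct task *p)
{
	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

/* Consume one deferred skip; private faults are never deferred. */
static bool numa_migrate_deferred(struct task *p, bool shared)
{
	if (!shared)
		return false;
	if (p->numa_migrate_deferred) {
		p->numa_migrate_deferred--;
		return true;
	}
	return false;
}

/* Returns true if this NUMA hinting fault would try to migrate the page. */
static bool would_migrate(struct task *p, bool shared, bool stale_node)
{
	if (stale_node) {		/* two-stage filter fires */
		if (shared)
			defer_numa_migrate(p);
		return false;
	}
	return !numa_migrate_deferred(p, shared);
}

int main(void)
{
	unsigned int settings[] = { 16, 0 };	/* default vs. disabled */

	for (int s = 0; s < 2; s++) {
		struct task t = { 0 };
		int migrations = 0;

		sysctl_numa_balancing_migrate_deferred = settings[s];

		/* 100 shared faults; every 40th one sees a stale node. */
		for (int i = 0; i < 100; i++)
			if (would_migrate(&t, true, (i % 40) == 0))
				migrations++;

		printf("defer=%2u: %d of 100 shared faults attempt migration\n",
		       settings[s], migrations);
	}
	return 0;
}

With the default of 16, roughly half of the modelled faults are absorbed by the counter, while with the knob set to 0 only the two-stage filter itself suppresses migrations; that difference is the "rely more on moving the task, less on moving the memory" effect the documentation hunk describes.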