 Documentation/sysctl/kernel.txt | 10
 include/linux/sched.h           |  5
 kernel/sched/fair.c             |  8
 kernel/sysctl.c                 |  7
 mm/mempolicy.c                  | 48
 5 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 84f17800f8b5..4273b2d71a27 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
+numa_balancing_migrate_deferred.
 
 ==============================================================
 
@@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This
 gives the scheduler a chance to place the task on an alternative node if the
 preferred node is overloaded.
 
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is shared
+with other tasks. This reduces page migration overhead, and determines
+how much stronger the "move task near its memory" policy scheduler becomes,
+versus the "move memory near its task" memory management policy, for workloads
+with shared memory.
+
 ==============================================================
 
 osrelease, ostype & version:
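(Illustration, not part of the patch: like the other NUMA balancing knobs described in this document, the new value is exposed through procfs, so a kernel built with CONFIG_NUMA_BALANCING and carrying this change would show it as /proc/sys/kernel/numa_balancing_migrate_deferred. A minimal C sketch of reading the defer count and raising it at runtime; error handling is kept to the bare minimum and writing requires root.)

/* Sketch only: read the numa_balancing_migrate_deferred sysctl and double it,
 * biasing the balancer further towards "move task near its memory". */
#include <stdio.h>

#define DEFER_SYSCTL "/proc/sys/kernel/numa_balancing_migrate_deferred"

int main(void)
{
	unsigned int val;
	FILE *f = fopen(DEFER_SYSCTL, "r");

	if (!f || fscanf(f, "%u", &val) != 1) {
		perror(DEFER_SYSCTL);
		return 1;
	}
	fclose(f);
	printf("current defer count: %u\n", val);

	f = fopen(DEFER_SYSCTL, "w");	/* needs root */
	if (!f || fprintf(f, "%u\n", val * 2) < 0) {
		perror(DEFER_SYSCTL);
		return 1;
	}
	fclose(f);
	return 0;
}
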
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d24f70ffddee..833eed55cf43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1342,6 +1342,8 @@ struct task_struct {
 	int numa_scan_seq;
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
+	int numa_preferred_nid;
+	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
@@ -1372,7 +1374,6 @@ struct task_struct {
 	 */
 	unsigned long numa_faults_locality[2];
 
-	int numa_preferred_nid;
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8454c38b1b12..e7884dc3416d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e509b90a8002..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "numa_balancing_migrate_deferred",
+		.data		= &sysctl_numa_balancing_migrate_deferred,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2929c24c22b7..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	/* Never defer a private fault */
+	if (cpupid_match_pid(p, last_cpupid))
+		return false;
+
+	if (p->numa_migrate_deferred) {
+		p->numa_migrate_deferred--;
+		return true;
+	}
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * relation.
 		 */
 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+			/* See sysctl_numa_balancing_migrate_deferred comment */
+			if (!cpupid_match_pid(current, last_cpupid))
+				defer_numa_migrate(current);
+
+			goto out;
+		}
+
+		/*
+		 * The quadratic filter above reduces extraneous migration
+		 * of shared pages somewhat. This code reduces it even more,
+		 * reducing the overhead of page migrations of shared pages.
+		 * This makes workloads with shared pages rely more on
+		 * "move task near its memory", and less on "move memory
+		 * towards its task", which is exactly what we want.
+		 */
+		if (numa_migrate_deferred(current, last_cpupid))
 			goto out;
 	}
 
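(Illustration, not part of the patch: a small user-space model of how the new defer counter interacts with the existing two-stage cpupid filter in mpol_misplaced(). The kernel helpers are replaced with local stand-ins and the fault pattern is made up, so the numbers only show the shape of the behaviour: each time the two-stage filter skips a shared fault, the next sysctl_numa_balancing_migrate_deferred shared faults are skipped as well.)

/*
 * User-space model of the deferral logic added to mpol_misplaced().
 * "shared" stands in for !cpupid_match_pid() (the last fault came from
 * another task), "stale_node" for a last_cpupid that points at a
 * different node than the current one.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int sysctl_numa_balancing_migrate_deferred = 16;

struct task {
	int numa_migrate_deferred;	/* mirrors the new task_struct field */
};

/* Arm the skip counter, as defer_numa_migrate() does in the patch. */
static void defer_numa_migrate(struct task *p)
{
	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

/* Consume one deferred skip; private faults are never deferred. */
static bool numa_migrate_deferred(struct task *p, bool shared)
{
	if (!shared)
		return false;
	if (p->numa_migrate_deferred) {
		p->numa_migrate_deferred--;
		return true;
	}
	return false;
}

/* Returns true if this NUMA hinting fault would try to migrate the page. */
static bool would_migrate(struct task *p, bool shared, bool stale_node)
{
	if (stale_node) {		/* two-stage filter fires */
		if (shared)
			defer_numa_migrate(p);
		return false;
	}
	return !numa_migrate_deferred(p, shared);
}

int main(void)
{
	unsigned int settings[] = { 16, 0 };	/* default vs. disabled */

	for (int s = 0; s < 2; s++) {
		struct task t = { 0 };
		int migrations = 0;

		sysctl_numa_balancing_migrate_deferred = settings[s];

		/* 100 shared faults; every 40th one sees a stale node. */
		for (int i = 0; i < 100; i++)
			if (would_migrate(&t, true, (i % 40) == 0))
				migrations++;

		printf("defer=%2u: %d of 100 shared faults attempt migration\n",
		       settings[s], migrations);
	}
	return 0;
}

With the default of 16, roughly half of the modelled faults are absorbed by the counter, while with the knob set to 0 only the two-stage filter itself suppresses migrations; that difference is the "rely more on moving the task, less on moving the memory" effect the documentation hunk describes.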