-rw-r--r--  Documentation/sysctl/kernel.txt | 10
-rw-r--r--  include/linux/sched.h           |  5
-rw-r--r--  kernel/sched/fair.c             |  8
-rw-r--r--  kernel/sysctl.c                 |  7
-rw-r--r--  mm/mempolicy.c                  | 48
5 files changed, 75 insertions, 3 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 84f17800f8b5..4273b2d71a27 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
+numa_balancing_migrate_deferred.
 
 ==============================================================
 
@@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This
 gives the scheduler a chance to place the task on an alternative node if the
 preferred node is overloaded.
 
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is shared
+with other tasks. This reduces page migration overhead, and determines
+how much stronger the "move task near its memory" policy scheduler becomes,
+versus the "move memory near its task" memory management policy, for workloads
+with shared memory.
+
 ==============================================================
 
 osrelease, ostype & version:
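
Because the new entry is registered in kern_table (see the kernel/sysctl.c hunk below), the knob is exposed as /proc/sys/kernel/numa_balancing_migrate_deferred. As a rough illustration of how an administrator or a test harness might raise it, here is a minimal user-space sketch; the value 64 is an arbitrary example, not a recommended setting.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Path follows from the kern_table entry added in kernel/sysctl.c. */
	FILE *f = fopen("/proc/sys/kernel/numa_balancing_migrate_deferred", "w");

	if (!f) {
		perror("numa_balancing_migrate_deferred");
		return EXIT_FAILURE;
	}

	/* 64 is only an example value; larger values skip more migrations. */
	fprintf(f, "%d\n", 64);

	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}

Writing 0 disables the extra deferral entirely and falls back to the plain two-stage filter behaviour.
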
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d24f70ffddee..833eed55cf43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1342,6 +1342,8 @@ struct task_struct {
 	int numa_scan_seq;
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
+	int numa_preferred_nid;
+	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
@@ -1372,7 +1374,6 @@ struct task_struct {
 	 */
 	unsigned long numa_faults_locality[2];
 
-	int numa_preferred_nid;
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8454c38b1b12..e7884dc3416d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
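
To make the effect of the default of 16 concrete, the sketch below models the counter from user space under two simplifying assumptions: every fault in the stream is a shared fault, and the two-stage cpupid filter skips (and therefore re-arms the counter on) the first remote fault after each migration. The names are stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of p->numa_migrate_deferred; not kernel code. */
static unsigned int simulate(unsigned int defer_max, unsigned int faults)
{
	unsigned int deferred = 0, migrations = 0;
	bool filter_skips = true;	/* first remote fault trips the filter */

	for (unsigned int i = 0; i < faults; i++) {
		if (filter_skips) {
			deferred = defer_max;	/* defer_numa_migrate() */
			filter_skips = false;
		} else if (deferred) {
			deferred--;		/* numa_migrate_deferred() says skip */
		} else {
			migrations++;		/* page migration would go ahead */
			filter_skips = true;	/* next remote fault re-arms the filter */
		}
	}
	return migrations;
}

int main(void)
{
	/* defer=0 keeps only the two-stage filter: about 1 migration per 2 faults. */
	printf("defer=0:  %u/1000\n", simulate(0, 1000));
	/* The default of 16 cuts that to about 1 migration per 18 shared faults. */
	printf("defer=16: %u/1000\n", simulate(16, 1000));
	return 0;
}
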
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e509b90a8002..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "numa_balancing_migrate_deferred",
+		.data		= &sysctl_numa_balancing_migrate_deferred,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2929c24c22b7..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	/* Never defer a private fault */
+	if (cpupid_match_pid(p, last_cpupid))
+		return false;
+
+	if (p->numa_migrate_deferred) {
+		p->numa_migrate_deferred--;
+		return true;
+	}
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * relation.
 		 */
 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+			/* See sysctl_numa_balancing_migrate_deferred comment */
+			if (!cpupid_match_pid(current, last_cpupid))
+				defer_numa_migrate(current);
+
+			goto out;
+		}
+
+		/*
+		 * The quadratic filter above reduces extraneous migration
+		 * of shared pages somewhat. This code reduces it even more,
+		 * reducing the overhead of page migrations of shared pages.
+		 * This makes workloads with shared pages rely more on
+		 * "move task near its memory", and less on "move memory
+		 * towards its task", which is exactly what we want.
+		 */
+		if (numa_migrate_deferred(current, last_cpupid))
 			goto out;
 	}
 
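
Putting the two mm/mempolicy.c hunks together, the shared-fault path in mpol_misplaced() now distinguishes three outcomes: skipped by the two-stage cpupid filter (which also arms the counter for shared faults), skipped because a deferred slot is still pending, or treated as misplaced and migrated. The sketch below mirrors that ordering from user space; classify_fault(), DEFER_MAX and the booleans are illustrative stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

#define DEFER_MAX 16	/* stands in for sysctl_numa_balancing_migrate_deferred */

enum outcome { MIGRATE, SKIP_FILTER, SKIP_DEFERRED };

/*
 * private_fault: last_cpupid matches the current task (cpupid_match_pid()).
 * filter_trips:  last_cpupid is set and points at a different node, so the
 *                two-stage filter would already skip this fault.
 * deferred:      stands in for p->numa_migrate_deferred.
 */
static enum outcome classify_fault(bool private_fault, bool filter_trips,
				   int *deferred)
{
	if (filter_trips) {
		if (!private_fault)
			*deferred = DEFER_MAX;	/* defer_numa_migrate() */
		return SKIP_FILTER;
	}
	if (!private_fault && *deferred) {
		(*deferred)--;			/* numa_migrate_deferred() */
		return SKIP_DEFERRED;
	}
	return MIGRATE;				/* mpol_misplaced() reports the page as misplaced */
}

int main(void)
{
	int deferred = 0;

	/* A shared fault caught by the filter arms 16 unconditional skips... */
	classify_fault(false, true, &deferred);

	/* ...so the next shared fault is skipped even though the filter passes... */
	printf("shared:  %s\n",
	       classify_fault(false, false, &deferred) == SKIP_DEFERRED ? "skipped" : "migrated");

	/* ...while a private fault is never deferred. */
	printf("private: %s\n",
	       classify_fault(true, false, &deferred) == MIGRATE ? "migrated" : "skipped");
	return 0;
}

Because private faults bypass both defer_numa_migrate() and numa_migrate_deferred(), the knob only throttles migrations of pages that are actively shared between tasks; purely private access patterns are unaffected.
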