diff options
author | Eric B Munson <emunson@akamai.com> | 2015-04-15 19:13:20 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-15 19:35:17 -0400 |
commit | 5bbe3547aa3ba5242366a322a28996872301b703 (patch) | |
tree | fca743e8713bc9b5d807083df50fdc120314fdc4 | |
parent | a4bb3ecdc12a78dc4d0e690d40ec10887b640786 (diff) |
mm: allow compaction of unevictable pages
Currently, pages which are marked as unevictable are protected from
compaction, but not from other types of migration. The POSIX real time
extension explicitly states that mlock() will prevent a major page
fault, but the spirit of this is that mlock() should give a process the
ability to control sources of latency, including minor page faults.
However, the mlock manpage only explicitly says that a locked page will
not be written to swap, and this can cause some confusion. Today, the
compaction code gives a developer who wants to avoid swap but also wants
large contiguous areas available no way to achieve this state. This
patch introduces a sysctl for controlling compaction
behavior with respect to the unevictable LRU. Users who demand no page
faults after a page is present can set compact_unevictable_allowed to 0,
and users who need large contiguous areas can keep compaction of
locked memory enabled by leaving the default value of 1.
To illustrate this problem I wrote a quick test program that mmaps a
large number of 1MB files filled with random data. These maps are
created locked and read-only. Then every other mapping is unmapped and I
attempt to allocate huge pages to the static huge page pool. When the
compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages
after fragmenting memory. When the value is set to 1, allocations
succeed.
Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/sysctl/vm.txt | 11 | ||||
-rw-r--r-- | include/linux/compaction.h | 1 | ||||
-rw-r--r-- | kernel/sysctl.c | 9 | ||||
-rw-r--r-- | mm/compaction.c | 7 |
4 files changed, 28 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 902b4574acfb..9832ec52f859 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm: | |||
21 | - admin_reserve_kbytes | 21 | - admin_reserve_kbytes |
22 | - block_dump | 22 | - block_dump |
23 | - compact_memory | 23 | - compact_memory |
24 | - compact_unevictable_allowed | ||
24 | - dirty_background_bytes | 25 | - dirty_background_bytes |
25 | - dirty_background_ratio | 26 | - dirty_background_ratio |
26 | - dirty_bytes | 27 | - dirty_bytes |
@@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required. | |||
106 | 107 | ||
107 | ============================================================== | 108 | ============================================================== |
108 | 109 | ||
110 | compact_unevictable_allowed | ||
111 | |||
112 | Available only when CONFIG_COMPACTION is set. When set to 1, compaction is | ||
113 | allowed to examine the unevictable lru (mlocked pages) for pages to compact. | ||
114 | This should be used on systems where stalls for minor page faults are an | ||
115 | acceptable trade for large contiguous free memory. Set to 0 to prevent | ||
116 | compaction from moving pages that are unevictable. Default value is 1. | ||
117 | |||
118 | ============================================================== | ||
119 | |||
109 | dirty_background_bytes | 120 | dirty_background_bytes |
110 | 121 | ||
111 | Contains the amount of dirty memory at which the background kernel | 122 | Contains the amount of dirty memory at which the background kernel |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a014559e4a49..aa8f61cf3a19 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write, | |||
34 | extern int sysctl_extfrag_threshold; | 34 | extern int sysctl_extfrag_threshold; |
35 | extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | 35 | extern int sysctl_extfrag_handler(struct ctl_table *table, int write, |
36 | void __user *buffer, size_t *length, loff_t *ppos); | 36 | void __user *buffer, size_t *length, loff_t *ppos); |
37 | extern int sysctl_compact_unevictable_allowed; | ||
37 | 38 | ||
38 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 39 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
39 | extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | 40 | extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c0eabd41886..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = { | |||
1335 | .extra1 = &min_extfrag_threshold, | 1335 | .extra1 = &min_extfrag_threshold, |
1336 | .extra2 = &max_extfrag_threshold, | 1336 | .extra2 = &max_extfrag_threshold, |
1337 | }, | 1337 | }, |
1338 | { | ||
1339 | .procname = "compact_unevictable_allowed", | ||
1340 | .data = &sysctl_compact_unevictable_allowed, | ||
1341 | .maxlen = sizeof(int), | ||
1342 | .mode = 0644, | ||
1343 | .proc_handler = proc_dointvec, | ||
1344 | .extra1 = &zero, | ||
1345 | .extra2 = &one, | ||
1346 | }, | ||
1338 | 1347 | ||
1339 | #endif /* CONFIG_COMPACTION */ | 1348 | #endif /* CONFIG_COMPACTION */ |
1340 | { | 1349 | { |
diff --git a/mm/compaction.c b/mm/compaction.c index a18201a8124e..570426edcadf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -1047,6 +1047,12 @@ typedef enum { | |||
1047 | } isolate_migrate_t; | 1047 | } isolate_migrate_t; |
1048 | 1048 | ||
1049 | /* | 1049 | /* |
1050 | * Allow userspace to control policy on scanning the unevictable LRU for | ||
1051 | * compactable pages. | ||
1052 | */ | ||
1053 | int sysctl_compact_unevictable_allowed __read_mostly = 1; | ||
1054 | |||
1055 | /* | ||
1050 | * Isolate all pages that can be migrated from the first suitable block, | 1056 | * Isolate all pages that can be migrated from the first suitable block, |
1051 | * starting at the block pointed to by the migrate scanner pfn within | 1057 | * starting at the block pointed to by the migrate scanner pfn within |
1052 | * compact_control. | 1058 | * compact_control. |
@@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1057 | unsigned long low_pfn, end_pfn; | 1063 | unsigned long low_pfn, end_pfn; |
1058 | struct page *page; | 1064 | struct page *page; |
1059 | const isolate_mode_t isolate_mode = | 1065 | const isolate_mode_t isolate_mode = |
1066 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | | ||
1060 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | 1067 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); |
1061 | 1068 | ||
1062 | /* | 1069 | /* |