aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric B Munson <emunson@akamai.com> 2015-04-15 19:13:20 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-04-15 19:35:17 -0400
commit: 5bbe3547aa3ba5242366a322a28996872301b703 (patch)
tree: fca743e8713bc9b5d807083df50fdc120314fdc4
parent: a4bb3ecdc12a78dc4d0e690d40ec10887b640786 (diff)
mm: allow compaction of unevictable pages
Currently, pages which are marked as unevictable are protected from compaction, but not from other types of migration. The POSIX real time extension explicitly states that mlock() will prevent a major page fault, but the spirit of this is that mlock() should give a process the ability to control sources of latency, including minor page faults. However, the mlock manpage only explicitly says that a locked page will not be written to swap and this can cause some confusion. The compaction code today does not give a developer who wants to avoid swap but wants to have large contiguous areas available any method to achieve this state. This patch introduces a sysctl for controlling compaction behavior with respect to the unevictable lru. Users who demand no page faults after a page is present can set compact_unevictable_allowed to 0 and users who need the large contiguous areas can enable compaction on locked memory by leaving the default value of 1.

To illustrate this problem I wrote a quick test program that mmaps a large number of 1MB files filled with random data. These maps are created locked and read only. Then every other mmap is unmapped and I attempt to allocate huge pages to the static huge page pool. When the compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages after fragmenting memory. When the value is set to 1, allocations succeed.
Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- Documentation/sysctl/vm.txt 11
-rw-r--r-- include/linux/compaction.h 1
-rw-r--r-- kernel/sysctl.c 9
-rw-r--r-- mm/compaction.c 7
4 files changed, 28 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 902b4574acfb..9832ec52f859 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm:
21- admin_reserve_kbytes 21- admin_reserve_kbytes
22- block_dump 22- block_dump
23- compact_memory 23- compact_memory
24- compact_unevictable_allowed
24- dirty_background_bytes 25- dirty_background_bytes
25- dirty_background_ratio 26- dirty_background_ratio
26- dirty_bytes 27- dirty_bytes
@@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required.
106 107
107============================================================== 108==============================================================
108 109
110compact_unevictable_allowed
111
112Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
113allowed to examine the unevictable lru (mlocked pages) for pages to compact.
114This should be used on systems where stalls for minor page faults are an
115acceptable trade for large contiguous free memory. Set to 0 to prevent
116compaction from moving pages that are unevictable. Default value is 1.
117
118==============================================================
119
109dirty_background_bytes 120dirty_background_bytes
110 121
111Contains the amount of dirty memory at which the background kernel 122Contains the amount of dirty memory at which the background kernel
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a014559e4a49..aa8f61cf3a19 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write,
34extern int sysctl_extfrag_threshold; 34extern int sysctl_extfrag_threshold;
35extern int sysctl_extfrag_handler(struct ctl_table *table, int write, 35extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
36 void __user *buffer, size_t *length, loff_t *ppos); 36 void __user *buffer, size_t *length, loff_t *ppos);
37extern int sysctl_compact_unevictable_allowed;
37 38
38extern int fragmentation_index(struct zone *zone, unsigned int order); 39extern int fragmentation_index(struct zone *zone, unsigned int order);
39extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, 40extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c0eabd41886..42b7fc2860c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = {
1335 .extra1 = &min_extfrag_threshold, 1335 .extra1 = &min_extfrag_threshold,
1336 .extra2 = &max_extfrag_threshold, 1336 .extra2 = &max_extfrag_threshold,
1337 }, 1337 },
1338 {
1339 .procname = "compact_unevictable_allowed",
1340 .data = &sysctl_compact_unevictable_allowed,
1341 .maxlen = sizeof(int),
1342 .mode = 0644,
1343 .proc_handler = proc_dointvec,
1344 .extra1 = &zero,
1345 .extra2 = &one,
1346 },
1338 1347
1339#endif /* CONFIG_COMPACTION */ 1348#endif /* CONFIG_COMPACTION */
1340 { 1349 {
diff --git a/mm/compaction.c b/mm/compaction.c
index a18201a8124e..570426edcadf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1047,6 +1047,12 @@ typedef enum {
1047} isolate_migrate_t; 1047} isolate_migrate_t;
1048 1048
1049/* 1049/*
1050 * Allow userspace to control policy on scanning the unevictable LRU for
1051 * compactable pages.
1052 */
1053int sysctl_compact_unevictable_allowed __read_mostly = 1;
1054
1055/*
1050 * Isolate all pages that can be migrated from the first suitable block, 1056 * Isolate all pages that can be migrated from the first suitable block,
1051 * starting at the block pointed to by the migrate scanner pfn within 1057 * starting at the block pointed to by the migrate scanner pfn within
1052 * compact_control. 1058 * compact_control.
@@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1057 unsigned long low_pfn, end_pfn; 1063 unsigned long low_pfn, end_pfn;
1058 struct page *page; 1064 struct page *page;
1059 const isolate_mode_t isolate_mode = 1065 const isolate_mode_t isolate_mode =
1066 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1060 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1067 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1061 1068
1062 /* 1069 /*