diff options
author | Andrew Shewmaker <agshew@gmail.com> | 2013-04-29 18:08:11 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-04-29 18:54:36 -0400 |
commit | 4eeab4f5580d11bffedc697684b91b0bca0d5009 (patch) | |
tree | 4e9a0c010d34e786df52225039a17aa38e9adf17 | |
parent | c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd (diff) |
mm: replace hardcoded 3% with admin_reserve_kbytes knob
Add an admin_reserve_kbytes knob to allow admins to change the hardcoded
memory reserve to something other than 3%, which may be multiple
gigabytes on large memory systems. Only about 8MB is necessary to
enable recovery in the default mode, and only a few hundred MB are
required even when overcommit is disabled.
This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER.
admin_reserve_kbytes is initialized to min(3% of free pages, 8MB).
I arrived at 8MB by summing the RSS of sshd or login, bash, and top.
Please see first patch in this series for full background, motivation,
testing, and full changelog.
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make init_admin_reserve() static]
Signed-off-by: Andrew Shewmaker <agshew@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/sysctl/vm.txt | 30 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | mm/mmap.c | 30 | ||||
-rw-r--r-- | mm/nommu.c | 30 |
5 files changed, 90 insertions, 8 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f69895738357..dcc75a9ed919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -18,6 +18,7 @@ files can be found in mm/swap.c. | |||
18 | 18 | ||
19 | Currently, these files are in /proc/sys/vm: | 19 | Currently, these files are in /proc/sys/vm: |
20 | 20 | ||
21 | - admin_reserve_kbytes | ||
21 | - block_dump | 22 | - block_dump |
22 | - compact_memory | 23 | - compact_memory |
23 | - dirty_background_bytes | 24 | - dirty_background_bytes |
@@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm: | |||
59 | 60 | ||
60 | ============================================================== | 61 | ============================================================== |
61 | 62 | ||
63 | admin_reserve_kbytes | ||
64 | |||
65 | The amount of free memory in the system that should be reserved for users | ||
66 | with the capability cap_sys_admin. | ||
67 | |||
68 | admin_reserve_kbytes defaults to min(3% of free pages, 8MB) | ||
69 | |||
70 | That should provide enough for the admin to log in and kill a process, | ||
71 | if necessary, under the default overcommit 'guess' mode. | ||
72 | |||
73 | Systems running under overcommit 'never' should increase this to account | ||
74 | for the full Virtual Memory Size of programs used to recover. Otherwise, | ||
75 | root may not be able to log in to recover the system. | ||
76 | |||
77 | How do you calculate a minimum useful reserve? | ||
78 | |||
79 | sshd or login + bash (or some other shell) + top (or ps, kill, etc.) | ||
80 | |||
81 | For overcommit 'guess', we can sum resident set sizes (RSS). | ||
82 | On x86_64 this is about 8MB. | ||
83 | |||
84 | For overcommit 'never', we can take the max of their virtual sizes (VSZ) | ||
85 | and add the sum of their RSS. | ||
86 | On x86_64 this is about 128MB. | ||
87 | |||
88 | Changing this takes effect whenever an application requests memory. | ||
89 | |||
90 | ============================================================== | ||
91 | |||
62 | block_dump | 92 | block_dump |
63 | 93 | ||
64 | block_dump enables block I/O debugging when set to a nonzero value. More | 94 | block_dump enables block I/O debugging when set to a nonzero value. More |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde40..c05d7cfbb6b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; | |||
45 | #include <asm/processor.h> | 45 | #include <asm/processor.h> |
46 | 46 | ||
47 | extern unsigned long sysctl_user_reserve_kbytes; | 47 | extern unsigned long sysctl_user_reserve_kbytes; |
48 | extern unsigned long sysctl_admin_reserve_kbytes; | ||
48 | 49 | ||
49 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) | 50 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) |
50 | 51 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6daabb72bdb5..9edcf456e0fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = { | |||
1436 | .mode = 0644, | 1436 | .mode = 0644, |
1437 | .proc_handler = proc_doulongvec_minmax, | 1437 | .proc_handler = proc_doulongvec_minmax, |
1438 | }, | 1438 | }, |
1439 | { | ||
1440 | .procname = "admin_reserve_kbytes", | ||
1441 | .data = &sysctl_admin_reserve_kbytes, | ||
1442 | .maxlen = sizeof(sysctl_admin_reserve_kbytes), | ||
1443 | .mode = 0644, | ||
1444 | .proc_handler = proc_doulongvec_minmax, | ||
1445 | }, | ||
1439 | { } | 1446 | { } |
1440 | }; | 1447 | }; |
1441 | 1448 | ||
@@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove | |||
85 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 85 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
86 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 86 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
87 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 87 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
88 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | ||
88 | /* | 89 | /* |
89 | * Make sure vm_committed_as in one cacheline and not cacheline shared with | 90 | * Make sure vm_committed_as in one cacheline and not cacheline shared with |
90 | * other variables. It can be updated by several CPUs frequently. | 91 | * other variables. It can be updated by several CPUs frequently. |
@@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
164 | free -= totalreserve_pages; | 165 | free -= totalreserve_pages; |
165 | 166 | ||
166 | /* | 167 | /* |
167 | * Leave the last 3% for root | 168 | * Reserve some for root |
168 | */ | 169 | */ |
169 | if (!cap_sys_admin) | 170 | if (!cap_sys_admin) |
170 | free -= free / 32; | 171 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
171 | 172 | ||
172 | if (free > pages) | 173 | if (free > pages) |
173 | return 0; | 174 | return 0; |
@@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
178 | allowed = (totalram_pages - hugetlb_total_pages()) | 179 | allowed = (totalram_pages - hugetlb_total_pages()) |
179 | * sysctl_overcommit_ratio / 100; | 180 | * sysctl_overcommit_ratio / 100; |
180 | /* | 181 | /* |
181 | * Leave the last 3% for root | 182 | * Reserve some for root |
182 | */ | 183 | */ |
183 | if (!cap_sys_admin) | 184 | if (!cap_sys_admin) |
184 | allowed -= allowed / 32; | 185 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
185 | allowed += total_swap_pages; | 186 | allowed += total_swap_pages; |
186 | 187 | ||
187 | /* | 188 | /* |
@@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) | |||
3119 | return 0; | 3120 | return 0; |
3120 | } | 3121 | } |
3121 | module_init(init_user_reserve) | 3122 | module_init(init_user_reserve) |
3123 | |||
3124 | /* | ||
3125 | * Initialise sysctl_admin_reserve_kbytes. | ||
3126 | * | ||
3127 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | ||
3128 | * to log in and kill a memory hogging process. | ||
3129 | * | ||
3130 | * Systems with more than 256MB free will reserve 8MB, enough to recover | ||
3131 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | ||
3132 | * only reserve 3% of free pages by default. | ||
3133 | */ | ||
3134 | static int __meminit init_admin_reserve(void) | ||
3135 | { | ||
3136 | unsigned long free_kbytes; | ||
3137 | |||
3138 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
3139 | |||
3140 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | ||
3141 | return 0; | ||
3142 | } | ||
3143 | module_init(init_admin_reserve) | ||
diff --git a/mm/nommu.c b/mm/nommu.c index 58e4a0a5125f..fbe3e2f317eb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ | |||
64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
67 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | ||
67 | int heap_stack_gap = 0; | 68 | int heap_stack_gap = 0; |
68 | 69 | ||
69 | atomic_long_t mmap_pages_allocated; | 70 | atomic_long_t mmap_pages_allocated; |
@@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1939 | free -= totalreserve_pages; | 1940 | free -= totalreserve_pages; |
1940 | 1941 | ||
1941 | /* | 1942 | /* |
1942 | * Leave the last 3% for root | 1943 | * Reserve some for root |
1943 | */ | 1944 | */ |
1944 | if (!cap_sys_admin) | 1945 | if (!cap_sys_admin) |
1945 | free -= free / 32; | 1946 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1946 | 1947 | ||
1947 | if (free > pages) | 1948 | if (free > pages) |
1948 | return 0; | 1949 | return 0; |
@@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1952 | 1953 | ||
1953 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; | 1954 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; |
1954 | /* | 1955 | /* |
1955 | * Leave the last 3% for root | 1956 | * Reserve some for root |
1956 | */ | 1957 | */ |
1957 | if (!cap_sys_admin) | 1958 | if (!cap_sys_admin) |
1958 | allowed -= allowed / 32; | 1959 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1959 | allowed += total_swap_pages; | 1960 | allowed += total_swap_pages; |
1960 | 1961 | ||
1961 | /* | 1962 | /* |
@@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) | |||
2147 | return 0; | 2148 | return 0; |
2148 | } | 2149 | } |
2149 | module_init(init_user_reserve) | 2150 | module_init(init_user_reserve) |
2151 | |||
2152 | /* | ||
2153 | * Initialise sysctl_admin_reserve_kbytes. | ||
2154 | * | ||
2155 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | ||
2156 | * to log in and kill a memory hogging process. | ||
2157 | * | ||
2158 | * Systems with more than 256MB free will reserve 8MB, enough to recover | ||
2159 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | ||
2160 | * only reserve 3% of free pages by default. | ||
2161 | */ | ||
2162 | static int __meminit init_admin_reserve(void) | ||
2163 | { | ||
2164 | unsigned long free_kbytes; | ||
2165 | |||
2166 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
2167 | |||
2168 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | ||
2169 | return 0; | ||
2170 | } | ||
2171 | module_init(init_admin_reserve) | ||