aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Shewmaker <agshew@gmail.com>2013-04-29 18:08:11 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-04-29 18:54:36 -0400
commit4eeab4f5580d11bffedc697684b91b0bca0d5009 (patch)
tree4e9a0c010d34e786df52225039a17aa38e9adf17
parentc9b1d0981fcce3d9976d7b7a56e4e0503bc610dd (diff)
mm: replace hardcoded 3% with admin_reserve_kbytes knob
Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% of free pages, 8MB). I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see the first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker <agshew@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/sysctl/vm.txt30
-rw-r--r--include/linux/mm.h1
-rw-r--r--kernel/sysctl.c7
-rw-r--r--mm/mmap.c30
-rw-r--r--mm/nommu.c30
5 files changed, 90 insertions, 8 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index f69895738357..dcc75a9ed919 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -18,6 +18,7 @@ files can be found in mm/swap.c.
18 18
19Currently, these files are in /proc/sys/vm: 19Currently, these files are in /proc/sys/vm:
20 20
21- admin_reserve_kbytes
21- block_dump 22- block_dump
22- compact_memory 23- compact_memory
23- dirty_background_bytes 24- dirty_background_bytes
@@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm:
59 60
60============================================================== 61==============================================================
61 62
63admin_reserve_kbytes
64
65The amount of free memory in the system that should be reserved for users
66with the capability cap_sys_admin.
67
68admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
69
70That should provide enough for the admin to log in and kill a process,
71if necessary, under the default overcommit 'guess' mode.
72
73Systems running under overcommit 'never' should increase this to account
74for the full Virtual Memory Size of programs used to recover. Otherwise,
75root may not be able to log in to recover the system.
76
77How do you calculate a minimum useful reserve?
78
79sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
80
81For overcommit 'guess', we can sum resident set sizes (RSS).
82On x86_64 this is about 8MB.
83
84For overcommit 'never', we can take the max of their virtual sizes (VSZ)
85and add the sum of their RSS.
86On x86_64 this is about 128MB.
87
88Changing this takes effect whenever an application requests memory.
89
90==============================================================
91
62block_dump 92block_dump
63 93
64block_dump enables block I/O debugging when set to a nonzero value. More 94block_dump enables block I/O debugging when set to a nonzero value. More
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43cfaabbde40..c05d7cfbb6b9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout;
45#include <asm/processor.h> 45#include <asm/processor.h>
46 46
47extern unsigned long sysctl_user_reserve_kbytes; 47extern unsigned long sysctl_user_reserve_kbytes;
48extern unsigned long sysctl_admin_reserve_kbytes;
48 49
49#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) 50#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
50 51
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6daabb72bdb5..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = {
1436 .mode = 0644, 1436 .mode = 0644,
1437 .proc_handler = proc_doulongvec_minmax, 1437 .proc_handler = proc_doulongvec_minmax,
1438 }, 1438 },
1439 {
1440 .procname = "admin_reserve_kbytes",
1441 .data = &sysctl_admin_reserve_kbytes,
1442 .maxlen = sizeof(sysctl_admin_reserve_kbytes),
1443 .mode = 0644,
1444 .proc_handler = proc_doulongvec_minmax,
1445 },
1439 { } 1446 { }
1440}; 1447};
1441 1448
diff --git a/mm/mmap.c b/mm/mmap.c
index 80a965f35251..5485f18e6631 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove
85int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 85int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
86int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 86int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
87unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 87unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
88unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
88/* 89/*
89 * Make sure vm_committed_as in one cacheline and not cacheline shared with 90 * Make sure vm_committed_as in one cacheline and not cacheline shared with
90 * other variables. It can be updated by several CPUs frequently. 91 * other variables. It can be updated by several CPUs frequently.
@@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
164 free -= totalreserve_pages; 165 free -= totalreserve_pages;
165 166
166 /* 167 /*
167 * Leave the last 3% for root 168 * Reserve some for root
168 */ 169 */
169 if (!cap_sys_admin) 170 if (!cap_sys_admin)
170 free -= free / 32; 171 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
171 172
172 if (free > pages) 173 if (free > pages)
173 return 0; 174 return 0;
@@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
178 allowed = (totalram_pages - hugetlb_total_pages()) 179 allowed = (totalram_pages - hugetlb_total_pages())
179 * sysctl_overcommit_ratio / 100; 180 * sysctl_overcommit_ratio / 100;
180 /* 181 /*
181 * Leave the last 3% for root 182 * Reserve some for root
182 */ 183 */
183 if (!cap_sys_admin) 184 if (!cap_sys_admin)
184 allowed -= allowed / 32; 185 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
185 allowed += total_swap_pages; 186 allowed += total_swap_pages;
186 187
187 /* 188 /*
@@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void)
3119 return 0; 3120 return 0;
3120} 3121}
3121module_init(init_user_reserve) 3122module_init(init_user_reserve)
3123
3124/*
3125 * Initialise sysctl_admin_reserve_kbytes.
3126 *
3127 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3128 * to log in and kill a memory hogging process.
3129 *
3130 * Systems with more than 256MB free at init will reserve 8MB, enough to
3131 * recover with sshd, bash, and top in OVERCOMMIT_GUESS. Systems with less
3132 * free memory will only reserve 3% (1/32) of free pages by default.
3133 */
3134static int __meminit init_admin_reserve(void)
3135{
3136 unsigned long free_kbytes;
3137
3138 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3139
3140 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3141 return 0;
3142}
3143module_init(init_admin_reserve)
diff --git a/mm/nommu.c b/mm/nommu.c
index 58e4a0a5125f..fbe3e2f317eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
67unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
67int heap_stack_gap = 0; 68int heap_stack_gap = 0;
68 69
69atomic_long_t mmap_pages_allocated; 70atomic_long_t mmap_pages_allocated;
@@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1939 free -= totalreserve_pages; 1940 free -= totalreserve_pages;
1940 1941
1941 /* 1942 /*
1942 * Leave the last 3% for root 1943 * Reserve some for root
1943 */ 1944 */
1944 if (!cap_sys_admin) 1945 if (!cap_sys_admin)
1945 free -= free / 32; 1946 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1946 1947
1947 if (free > pages) 1948 if (free > pages)
1948 return 0; 1949 return 0;
@@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1952 1953
1953 allowed = totalram_pages * sysctl_overcommit_ratio / 100; 1954 allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1954 /* 1955 /*
1955 * Leave the last 3% for root 1956 * Reserve some for root
1956 */ 1957 */
1957 if (!cap_sys_admin) 1958 if (!cap_sys_admin)
1958 allowed -= allowed / 32; 1959 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1959 allowed += total_swap_pages; 1960 allowed += total_swap_pages;
1960 1961
1961 /* 1962 /*
@@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void)
2147 return 0; 2148 return 0;
2148} 2149}
2149module_init(init_user_reserve) 2150module_init(init_user_reserve)
2151
2152/*
2153 * Initialise sysctl_admin_reserve_kbytes.
2154 *
2155 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
2156 * to log in and kill a memory hogging process.
2157 *
2158 * Systems with more than 256MB free at init will reserve 8MB, enough to
2159 * recover with sshd, bash, and top in OVERCOMMIT_GUESS. Systems with less
2160 * free memory will only reserve 3% (1/32) of free pages by default.
2161 */
2162static int __meminit init_admin_reserve(void)
2163{
2164 unsigned long free_kbytes;
2165
2166 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2167
2168 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
2169 return 0;
2170}
2171module_init(init_admin_reserve)