author    David Rientjes <rientjes@google.com>  2014-06-23 16:22:04 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-23 19:47:43 -0400
commit    7cd2b0a34ab8e4db971920eef8982f985441adfb (patch)
tree      0c8732b3ff3712bbad8b58696ad98c6f7fb17c05
parent    df2e1ef68c51ddccfdb6f34f92ee9f93541de802 (diff)
mm, pcp: allow restoring percpu_pagelist_fraction default
Oleg reports a division by zero error on zero-length write() to the
percpu_pagelist_fraction sysctl:

    divide error: 0000 [#1] SMP DEBUG_PAGEALLOC
    CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19
    Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
    task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000
    RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120
    RSP: 0018:ffff8800d87a3e78  EFLAGS: 00010246
    RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000
    RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010
    RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50
    R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060
    R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800
    FS:  00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0
    Call Trace:
      proc_sys_call_handler+0xb3/0xc0
      proc_sys_write+0x14/0x20
      vfs_write+0xba/0x1e0
      SyS_write+0x46/0xb0
      tracesys+0xe1/0xe6

However, if the percpu_pagelist_fraction sysctl is set by the user, it
is also impossible to restore it to the kernel default since the user
cannot write 0 to the sysctl.

This patch allows the user to write 0 to restore the default behavior.
It still requires a fraction equal to or larger than 8, however, as
stated by the documentation for sanity.  If a value in the range [1, 7]
is written, the sysctl will return EINVAL.

This successfully solves the divide by zero issue at the same time.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Oleg Drokin <green@linuxhacker.ru>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
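As context for the diff below, here is a minimal userspace sketch of the
reported failure mode. This is a hypothetical reproducer, not part of the
patch: it assumes root privileges and an unpatched kernel with the sysctl
still at its default value of 0.

/*
 * Hypothetical reproducer (not from the patch): on an unpatched kernel
 * with percpu_pagelist_fraction at its default of 0, a zero-length
 * write() makes proc_dointvec_minmax() succeed without parsing a value,
 * and the old handler then divides zone->managed_pages by zero.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/percpu_pagelist_fraction", O_WRONLY);

	if (fd < 0) {
		perror("open");		/* requires root */
		return 1;
	}
	if (write(fd, "", 0) < 0)	/* zero-length write: no value parsed */
		perror("write");
	close(fd);
	return 0;
}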
-rw-r--r--  Documentation/sysctl/vm.txt |  3 ++-
-rw-r--r--  kernel/sysctl.c             |  3 +--
-rw-r--r--  mm/page_alloc.c             | 40 ++++++++++++++++++++++++++++------------
3 files changed, 31 insertions(+), 15 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bd4b34c03738..4415aa915681 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
 set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
 
 The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
 
 ==============================================================
 
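To make the documented sizing concrete, here is a back-of-the-envelope
sketch with hypothetical numbers (assuming 4 KiB pages, i.e. PAGE_SHIFT
== 12; this is illustrative arithmetic, not kernel code):

/*
 * Worked example of the pcp sizing rules quoted above, with made-up
 * numbers: high = managed_pages / fraction, and batch = high / 4,
 * capped at PAGE_SHIFT * 8 per the documentation.
 */
#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 KiB pages */

int main(void)
{
	unsigned long managed_pages = 262144;	/* hypothetical 1 GiB zone */
	unsigned long fraction = 8;		/* the documented minimum */
	unsigned long high = managed_pages / fraction;	/* 32768 pages */
	unsigned long batch = high / 4;		/* 8192, before the cap */
	unsigned long cap = PAGE_SHIFT * 8;	/* documented upper limit: 96 */

	if (batch > cap)
		batch = cap;
	printf("pcp->high = %lu pages, batch = %lu pages\n", high, batch);
	return 0;
}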
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7de6555cfea0..075d1903138f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen = sizeof(percpu_pagelist_fraction),
 		.mode = 0644,
 		.proc_handler = percpu_pagelist_fraction_sysctl_handler,
-		.extra1 = &min_percpu_pagelist_fract,
+		.extra1 = &zero,
 	},
 #ifdef CONFIG_MMU
 	{
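The switch of .extra1 from &min_percpu_pagelist_fract to &zero is what
lets a write of 0 through: proc_dointvec_minmax() rejects values outside
[*extra1, *extra2], so the old floor of 8 made 0 unwritable. A
simplified sketch of that bounds check follows (the shape of the
behavior, not the kernel source):

#include <errno.h>

/*
 * Simplified sketch of the range check proc_dointvec_minmax() applies
 * via .extra1/.extra2 (illustrative, not the kernel's code).  With
 * .extra1 now pointing at zero, a write of 0 passes here, and the
 * handler itself enforces MIN_PERCPU_PAGELIST_FRACTION for nonzero
 * values.
 */
static int minmax_check(int val, const int *min, const int *max)
{
	if (min && val < *min)
		return -EINVAL;
	if (max && val > *max)
		return -EINVAL;
	return 0;
}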
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa29eda8..20d17f8266fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
 	pageset_update(&p->pcp, high, batch);
 }
 
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
+static void pageset_set_high_and_batch(struct zone *zone,
 		struct per_cpu_pageset *pcp)
 {
 	if (percpu_pagelist_fraction)
 		pageset_set_high(pcp,
@@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
-	unsigned int cpu;
+	int old_percpu_pagelist_fraction;
 	int ret;
 
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (!write || (ret < 0))
-		return ret;
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_pagelist_fraction &&
+	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+		goto out;
 
-	mutex_lock(&pcp_batch_high_lock);
 	for_each_populated_zone(zone) {
-		unsigned long high;
-		high = zone->managed_pages / percpu_pagelist_fraction;
+		unsigned int cpu;
+
 		for_each_possible_cpu(cpu)
-			pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-					 high);
+			pageset_set_high_and_batch(zone,
+					per_cpu_ptr(zone->pageset, cpu));
 	}
+out:
 	mutex_unlock(&pcp_batch_high_lock);
-	return 0;
+	return ret;
 }
 
 int hashdist = HASHDIST_DEFAULT;
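Taken together, the new handler semantics at the sysctl interface can be
exercised with a small hypothetical test program (illustrative only;
assumes root on a kernel carrying this patch):

/*
 * Hypothetical demonstration of the patched behavior: 0 restores the
 * boot-time defaults, values in [1, 7] fail with EINVAL, and values
 * >= 8 (MIN_PERCPU_PAGELIST_FRACTION) are accepted.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void try_write(int fd, const char *val)
{
	if (write(fd, val, strlen(val)) < 0)
		printf("write '%s' -> %s\n", val, strerror(errno));
	else
		printf("write '%s' -> ok\n", val);
}

int main(void)
{
	int fd = open("/proc/sys/vm/percpu_pagelist_fraction", O_WRONLY);

	if (fd < 0) {
		perror("open");		/* requires root */
		return 1;
	}
	try_write(fd, "8");	/* accepted: >= the documented minimum */
	try_write(fd, "5");	/* rejected with EINVAL: in [1, 7] */
	try_write(fd, "0");	/* accepted: restores default pcp sizing */
	close(fd);
	return 0;
}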