mm: tune vm_committed_as percpu_counter batching size

Currently the per cpu counter's batch size for memory accounting is configured as twice the number of cpus in the system. However, for systems with very large memory, it is more appropriate to make it proportional to the memory size per cpu in the system. For example, for an x86_64 system with 64 cpus and 128 GB of memory, the batch size is only 2*64 pages (0.5 MB). So any memory accounting change of more than 0.5MB will overflow the per cpu counter into the global counter. Instead, for the new scheme, the batch size is configured to be 0.4% of the memory/cpu = 8MB (128 GB/64/256), which is more in line with the memory size. I've done a repeated brk test of 800KB (from the will-it-scale test suite) with 80 concurrent processes on a 4 socket Westmere machine with a total of 40 cores. Without the patch, about 80% of cpu is spent on spin-lock contention within the vm_committed_as counter. With the patch, there's a 73x speedup on the benchmark and the lock contention drops off almost entirely. [akpm@linux-foundation.org: fix section mismatch] Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Cc: Tejun Heo <tj@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Tim Chen <tim.c.chen@linux.intel.com> 2013-07-03 18:02:44 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-07-03 19:07:32 -0400
commit: 917d9290af749fac9c4d90bacf18699c9d8ba28d (patch)
tree: 506f3a6f90d318a612161da2adf8aae8f81aaab3
parent: 2415cf12e04d415b16d9c2f2a705bcd6cd9a0474 (diff)
2 files changed, 54 insertions, 1 deletions
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863da287f..92dc257251e4 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,11 +11,17 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern struct percpu_counter vm_committed_as;
+#ifdef CONFIG_SMP
+extern s32 vm_committed_as_batch;
+#else
+#define vm_committed_as_batch 0
+#endif
 unsigned long vm_memory_committed(void);
 static inline void vm_acct_memory(long pages)
 {
-        percpu_counter_add(&vm_committed_as, pages);
+        __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
 }
 static inline void vm_unacct_memory(long pages)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
 #include <linux/init.h>
 #include <linux/kobject.h>
 #include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
 #include "internal.h"
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
 struct kobject *mm_kobj;
 EXPORT_SYMBOL_GPL(mm_kobj);
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+static void __meminit mm_compute_batch(void)
+{
+        u64 memsized_batch;
+        s32 nr = num_present_cpus();
+        s32 batch = max_t(s32, nr*2, 32);
+        /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+        memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+        vm_committed_as_batch = max_t(s32, memsized_batch, batch);
+}
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+                                        unsigned long action, void *arg)
+{
+        switch (action) {
+        case MEM_ONLINE:
+        case MEM_OFFLINE:
+                mm_compute_batch();
+        default:
+                break;
+        }
+        return NOTIFY_OK;
+}
+static struct notifier_block compute_batch_nb __meminitdata = {
+        .notifier_call = mm_compute_batch_notifier,
+        .priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+static int __init mm_compute_batch_init(void)
+{
+        mm_compute_batch();
+        register_hotmemory_notifier(&compute_batch_nb);
+        return 0;
+}
+__initcall(mm_compute_batch_init);
+#endif
 static int __init mm_sysfs_init(void)
 {
        mm_kobj = kobject_create_and_add("mm", kernel_kobj);
author	Tim Chen <tim.c.chen@linux.intel.com>	2013-07-03 18:02:44 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-03 19:07:32 -0400
commit	917d9290af749fac9c4d90bacf18699c9d8ba28d (patch)
tree	506f3a6f90d318a612161da2adf8aae8f81aaab3
parent	2415cf12e04d415b16d9c2f2a705bcd6cd9a0474 (diff)

diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..92dc257251e4 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h
@@ -11,11 +11,17 @@ extern int sysctl_overcommit_memory;
11	extern int sysctl_overcommit_ratio;	11	extern int sysctl_overcommit_ratio;
12	extern struct percpu_counter vm_committed_as;	12	extern struct percpu_counter vm_committed_as;
13		13
		14	#ifdef CONFIG_SMP
		15	extern s32 vm_committed_as_batch;
		16	#else
		17	#define vm_committed_as_batch 0
		18	#endif
		19
14	unsigned long vm_memory_committed(void);	20	unsigned long vm_memory_committed(void);
15		21
16	static inline void vm_acct_memory(long pages)	22	static inline void vm_acct_memory(long pages)
17	{	23	{
18	percpu_counter_add(&vm_committed_as, pages);	24	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
19	}	25	}
20		26
21	static inline void vm_unacct_memory(long pages)	27	static inline void vm_unacct_memory(long pages)


diff --git a/mm/mm_init.c b/mm/mm_init.c index c280a02ea11e..633c08863fd8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9	#include <linux/init.h>	9	#include <linux/init.h>
10	#include <linux/kobject.h>	10	#include <linux/kobject.h>
11	#include <linux/export.h>	11	#include <linux/export.h>
		12	#include <linux/memory.h>
		13	#include <linux/notifier.h>
12	#include "internal.h"	14	#include "internal.h"
13		15
14	#ifdef CONFIG_DEBUG_MEMORY_INIT	16	#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147	struct kobject *mm_kobj;	149	struct kobject *mm_kobj;
148	EXPORT_SYMBOL_GPL(mm_kobj);	150	EXPORT_SYMBOL_GPL(mm_kobj);
149		151
		152	#ifdef CONFIG_SMP
		153	s32 vm_committed_as_batch = 32;
		154
		155	static void __meminit mm_compute_batch(void)
		156	{
		157	u64 memsized_batch;
		158	s32 nr = num_present_cpus();
		159	s32 batch = max_t(s32, nr*2, 32);
		160
		161	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
		162	memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
		163
		164	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
		165	}
		166
		167	static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
		168	unsigned long action, void *arg)
		169	{
		170	switch (action) {
		171	case MEM_ONLINE:
		172	case MEM_OFFLINE:
		173	mm_compute_batch();
		174	default:
		175	break;
		176	}
		177	return NOTIFY_OK;
		178	}
		179
		180	static struct notifier_block compute_batch_nb __meminitdata = {
		181	.notifier_call = mm_compute_batch_notifier,
		182	.priority = IPC_CALLBACK_PRI, /* use lowest priority */
		183	};
		184
		185	static int __init mm_compute_batch_init(void)
		186	{
		187	mm_compute_batch();
		188	register_hotmemory_notifier(&compute_batch_nb);
		189
		190	return 0;
		191	}
		192
		193	__initcall(mm_compute_batch_init);
		194
		195	#endif
		196
150	static int __init mm_sysfs_init(void)	197	static int __init mm_sysfs_init(void)
151	{	198	{
152	mm_kobj = kobject_create_and_add("mm", kernel_kobj);	199	mm_kobj = kobject_create_and_add("mm", kernel_kobj);