mm: per-thread vma caching

This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparisons with other approaches were needed. There are two things to consider when dealing with this: the cache hit rate and the latency of find_vma(). Improving the hit-rate does not necessarily translate into finding the vma any faster, as the overhead of any fancy caching schemes can be too high to consider.

We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%.

The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question.

Concretely, the following results are seen on an 80 core, 8 socket x86-64 box:

1) System bootup: Most programs are single threaded, so the per-thread scheme improves on the ~50% baseline hit rate by just adding a few more slots to the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 50.61%   | 19.90            |
| patched        | 73.45%   | 13.58            |
+----------------+----------+------------------+

2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 75.28%   | 11.03            |
| patched        | 88.09%   | 9.31             |
+----------------+----------+------------------+

3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 70.66%   | 17.14            |
| patched        | 91.15%   | 12.57            |
+----------------+----------+------------------+

4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while the baseline hit rate is just about non-existent. The number of cycles can fluctuate anywhere from ~60 to ~116 billion for the baseline scheme, but this approach reduces it considerably.
For instance, with 80 threads:

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 1.06%    | 91.54            |
| patched        | 99.97%   | 14.18            |
+----------------+----------+------------------+

[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]

Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Davidlohr Bueso <davidlohr@hp.com> 2014-04-07 18:37:25 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-07 19:35:53 -0400
commit: 615d6e8756c87149f2d4c1b93d471bca002bd849 (patch)
tree: 45b039ccafb606a30e53c1012775efe848e789ed /kernel
parent: d7c1755179b82d954f593ca5285b9360f2f62e9c (diff)
2 files changed, 17 insertions, 4 deletions
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 99982a70ddad..2956c8da1605 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/rcupdate.h>
 #include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
        if (!CACHE_FLUSH_IS_SAFE)
                return;
-        if (current->mm && current->mm->mmap_cache) {
+        if (current->mm) {
-                flush_cache_range(current->mm->mmap_cache,
+                int i;
-                                  addr, addr + BREAK_INSTR_SIZE);
+                for (i = 0; i < VMACACHE_SIZE; i++) {
+                        if (!current->vmacache[i])
+                                continue;
+                        flush_cache_range(current->vmacache[i],
+                                          addr, addr + BREAK_INSTR_SIZE);
+                }
        }
        /* Force flush instruction cache if it was outside the mm */
        flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index e40c0a01d5a6..bc0e96b78dfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
@@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        mm->locked_vm = 0;
        mm->mmap = NULL;
-        mm->mmap_cache = NULL;
+        mm->vmacache_seqnum = 0;
        mm->map_count = 0;
        cpumask_clear(mm_cpumask(mm));
        mm->mm_rb = RB_ROOT;
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        if (!oldmm)
                return 0;
+        /* initialize the new vmacache entries */
+        vmacache_flush(tsk);
        if (clone_flags & CLONE_VM) {
                atomic_inc(&oldmm->mm_users);
                mm = oldmm;
author	Davidlohr Bueso <davidlohr@hp.com>	2014-04-07 18:37:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-07 19:35:53 -0400
commit	615d6e8756c87149f2d4c1b93d471bca002bd849 (patch)
tree	45b039ccafb606a30e53c1012775efe848e789ed /kernel
parent	d7c1755179b82d954f593ca5285b9360f2f62e9c (diff)

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a70ddad..2956c8da1605 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
49	#include <linux/pid.h>	49	#include <linux/pid.h>
50	#include <linux/smp.h>	50	#include <linux/smp.h>
51	#include <linux/mm.h>	51	#include <linux/mm.h>
		52	#include <linux/vmacache.h>
52	#include <linux/rcupdate.h>	53	#include <linux/rcupdate.h>
53		54
54	#include <asm/cacheflush.h>	55	#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
224	if (!CACHE_FLUSH_IS_SAFE)	225	if (!CACHE_FLUSH_IS_SAFE)
225	return;	226	return;
226		227
227	if (current->mm && current->mm->mmap_cache) {	228	if (current->mm) {
228	flush_cache_range(current->mm->mmap_cache,	229	int i;
229	addr, addr + BREAK_INSTR_SIZE);	230
		231	for (i = 0; i < VMACACHE_SIZE; i++) {
		232	if (!current->vmacache[i])
		233	continue;
		234	flush_cache_range(current->vmacache[i],
		235	addr, addr + BREAK_INSTR_SIZE);
		236	}
230	}	237	}
		238
231	/* Force flush instruction cache if it was outside the mm */	239	/* Force flush instruction cache if it was outside the mm */
232	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);	240	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
233	}	241	}


diff --git a/kernel/fork.c b/kernel/fork.c index e40c0a01d5a6..bc0e96b78dfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c
@@ -28,6 +28,8 @@
28	#include <linux/mman.h>	28	#include <linux/mman.h>
29	#include <linux/mmu_notifier.h>	29	#include <linux/mmu_notifier.h>
30	#include <linux/fs.h>	30	#include <linux/fs.h>
		31	#include <linux/mm.h>
		32	#include <linux/vmacache.h>
31	#include <linux/nsproxy.h>	33	#include <linux/nsproxy.h>
32	#include <linux/capability.h>	34	#include <linux/capability.h>
33	#include <linux/cpu.h>	35	#include <linux/cpu.h>
@@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
364		366
365	mm->locked_vm = 0;	367	mm->locked_vm = 0;
366	mm->mmap = NULL;	368	mm->mmap = NULL;
367	mm->mmap_cache = NULL;	369	mm->vmacache_seqnum = 0;
368	mm->map_count = 0;	370	mm->map_count = 0;
369	cpumask_clear(mm_cpumask(mm));	371	cpumask_clear(mm_cpumask(mm));
370	mm->mm_rb = RB_ROOT;	372	mm->mm_rb = RB_ROOT;
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
882	if (!oldmm)	884	if (!oldmm)
883	return 0;	885	return 0;
884		886
		887	/* initialize the new vmacache entries */
		888	vmacache_flush(tsk);
		889
885	if (clone_flags & CLONE_VM) {	890	if (clone_flags & CLONE_VM) {
886	atomic_inc(&oldmm->mm_users);	891	atomic_inc(&oldmm->mm_users);
887	mm = oldmm;	892	mm = oldmm;