author	Davidlohr Bueso <davidlohr@hp.com>	2014-04-07 18:37:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-07 19:35:53 -0400
commit	615d6e8756c87149f2d4c1b93d471bca002bd849
tree	45b039ccafb606a30e53c1012775efe848e789ed /include/linux
parent	d7c1755179b82d954f593ca5285b9360f2f62e9c
mm: per-thread vma caching
This patch is a continuation of efforts to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, so further comparison with other approaches was needed.

There are two things to consider when dealing with this: the cache hit rate and the latency of find_vma(). Improving the hit rate does not necessarily translate into finding the vma any faster, as the overhead of any fancy caching scheme can be too high to be worthwhile.

We currently cache the last used vma for the whole address space, which provides a nice optimization for workloads with good locality, reducing the total cycles spent in find_vma() by up to 2.5x. On the other hand, this simple scheme is pretty much useless for workloads with poor locality: analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%.

The proposed approach replaces this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is the rare case of a sequence number overflow, where all caches that share the same address space are flushed. Upon a miss, the replacement policy hashes on the number of the page that contains the virtual address in question.

Concretely, the following results are seen on an 80 core, 8 socket x86-64 box:

1) System bootup: Most programs are single threaded, so the per-thread scheme improves on the ~50% baseline hit rate by just adding a few more slots to the cache.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 50.61%   | 19.90            |
| patched        | 73.45%   | 13.58            |
+----------------+----------+------------------+

2) Kernel build: This one is already pretty good with the current approach, as we're dealing with good locality.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 75.28%   | 11.03            |
| patched        | 88.09%   |  9.31            |
+----------------+----------+------------------+

3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 70.66%   | 17.14            |
| patched        | 91.15%   | 12.57            |
+----------------+----------+------------------+

4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while the baseline hit rate is just about non-existent. Baseline cycle counts fluctuate anywhere from ~60 to ~116 billion across runs; this approach reduces them considerably. For instance, with 80 threads:

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       |  1.06%   | 91.54            |
| patched        | 99.97%   | 14.18            |
+----------------+----------+------------------+

[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
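For orientation, the lookup and replacement paths described above might look like the following condensed sketch of the mm/vmacache.c side. The control flow here is an assumption reconstructed from the declarations this patch adds to include/linux/vmacache.h, not the verbatim implementation:

/*
 * Sketch: a lookup first validates the per-thread cache against the
 * mm's sequence number, then linearly scans the four slots.
 */
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
	int i;

	/* A stale seqnum means the vma set changed under us. */
	if (mm != current->mm ||
	    mm->vmacache_seqnum != current->vmacache_seqnum) {
		/* Resync the seqnum and restart with an empty cache. */
		current->vmacache_seqnum = mm->vmacache_seqnum;
		vmacache_flush(current);
		return NULL;
	}

	for (i = 0; i < VMACACHE_SIZE; i++) {
		struct vm_area_struct *vma = current->vmacache[i];

		if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
			BUG_ON(vma->vm_mm != mm);
			return vma;
		}
	}
	return NULL;
}

/*
 * Sketch: on a miss, the new vma evicts whatever occupies the slot
 * selected by hashing the page number of the faulting address.
 */
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
	if (current->mm == newvma->vm_mm &&
	    current->vmacache_seqnum == newvma->vm_mm->vmacache_seqnum)
		current->vmacache[VMACACHE_HASH(addr)] = newvma;
}

The design point is that a miss costs almost nothing: there is no LRU bookkeeping, just a direct-mapped overwrite of one of the four slots, which is what keeps the maintenance cost low.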
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/mm_types.h  |  4
-rw-r--r--  include/linux/sched.h     |  7
-rw-r--r--  include/linux/vmacache.h  | 38
3 files changed, 47 insertions, 2 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 290901a8c1de..2b58d192ea24 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,9 +342,9 @@ struct mm_rss_stat {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	u32 vmacache_seqnum;                   /* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7cb07fd26680..642477dd814a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -132,6 +132,10 @@ struct perf_event_context;
 struct blk_plug;
 struct filename;
 
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -1235,6 +1239,9 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
 	unsigned brk_randomized:1;
 #endif
+	/* per-thread vma caching */
+	u32 vmacache_seqnum;
+	struct vm_area_struct *vmacache[VMACACHE_SIZE];
 #if defined(SPLIT_RSS_COUNTING)
 	struct task_rss_stat rss_stat;
 #endif
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
new file mode 100644
index 000000000000..c3fa0fd43949
--- /dev/null
+++ b/include/linux/vmacache.h
@@ -0,0 +1,38 @@
+#ifndef __LINUX_VMACACHE_H
+#define __LINUX_VMACACHE_H
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+/*
+ * Hash based on the page number. Provides a good hit rate for
+ * workloads with good locality and those with random accesses as well.
+ */
+#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
+
+static inline void vmacache_flush(struct task_struct *tsk)
+{
+	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
+}
+
+extern void vmacache_flush_all(struct mm_struct *mm);
+extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
+extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
+					    unsigned long addr);
+
+#ifndef CONFIG_MMU
+extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end);
+#endif
+
+static inline void vmacache_invalidate(struct mm_struct *mm)
+{
+	mm->vmacache_seqnum++;
+
+	/* deal with overflows */
+	if (unlikely(mm->vmacache_seqnum == 0))
+		vmacache_flush_all(mm);
+}
+
+#endif /* __LINUX_VMACACHE_H */
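The one declared function without a body here, vmacache_flush_all(), backs the overflow branch in vmacache_invalidate() above. A plausible sketch, assuming the kernel's RCU-protected task iteration helpers, could be:

/*
 * Sketch of the overflow path: when vmacache_invalidate() wraps the
 * 32-bit seqnum back to zero, stale per-thread entries could otherwise
 * look valid again once the counter reaches their old value, so every
 * thread sharing this mm gets its cache wiped.
 */
void vmacache_flush_all(struct mm_struct *mm)
{
	struct task_struct *g, *p;

	rcu_read_lock();
	for_each_process_thread(g, p) {
		/*
		 * Only the cache slots need clearing; the caller
		 * (vmacache_invalidate) already bumped the mm seqnum.
		 */
		if (p->mm == mm)
			vmacache_flush(p);
	}
	rcu_read_unlock();
}

Since the overflow fires at most once per 2^32 invalidations, the cost of walking the full task list is negligible when amortized, which is why the common invalidation path can stay as cheap as a single increment.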