author     Davidlohr Bueso <davidlohr@hp.com>              2014-04-07 18:37:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-04-07 19:35:53 -0400
commit     615d6e8756c87149f2d4c1b93d471bca002bd849 (patch)
tree       45b039ccafb606a30e53c1012775efe848e789ed
parent     d7c1755179b82d954f593ca5285b9360f2f62e9c (diff)
mm: per-thread vma caching
This patch is a continuation of efforts to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, so further comparison with other approaches was needed.

There are two things to consider when dealing with this: the cache hit rate and the latency of find_vma(). Improving the hit rate does not necessarily translate into finding the vma any faster, as the overhead of any fancy caching scheme can be too high to be worthwhile.

We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by a factor of up to 2.5 for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%.

The proposed approach replaces this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping a 32-bit sequence number. The only expensive operation is the rare case of a sequence number overflow, where all caches that share the same address space are flushed. Upon a miss, the replacement policy chooses the slot based on the page number that contains the virtual address in question.

Concretely, the following results are seen on an 80-core, 8-socket x86-64 box:

1) System bootup: most programs are single threaded, so the per-thread scheme improves on the ~50% baseline hit rate simply by adding a few more slots to the cache.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 50.61%   | 19.90            |
| patched        | 73.45%   | 13.58            |
+----------------+----------+------------------+

2) Kernel build: this one is already pretty good with the current approach, as we're dealing with good locality.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 75.28%   | 11.03            |
| patched        | 88.09%   |  9.31            |
+----------------+----------+------------------+

3) Oracle 11g Data Mining (4k pages): similar to the kernel build workload.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 70.66%   | 17.14            |
| patched        | 91.15%   | 12.57            |
+----------------+----------+------------------+

4) Ebizzy: there is a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while the baseline's is just about non-existent. Total cycles can fluctuate anywhere from ~60 to ~116 billion under the baseline scheme; this approach reduces that considerably.
For instance, with 80 threads:

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 1.06%    | 91.54            |
| patched        | 99.97%   | 14.18            |
+----------------+----------+------------------+

[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
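For reference, the scheme described above condenses into a short userspace sketch. This is not kernel code: the types and helper names below (mm, thread, cache_find and friends) are hypothetical stand-ins for mm_struct, task_struct and the vmacache_* functions this patch introduces, assuming 4 KiB pages and four cache slots.

/* Hedged userspace sketch of the per-thread vma cache; all names are
 * illustrative stand-ins, not the kernel API. Build: cc -o sketch sketch.c */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12                      /* assumes 4 KiB pages */
#define VMACACHE_SIZE   4
#define VMACACHE_HASH(addr) (((addr) >> PAGE_SHIFT) & (VMACACHE_SIZE - 1))

struct vma { unsigned long vm_start, vm_end; }; /* stand-in for vm_area_struct */
struct mm { uint32_t seqnum; };                 /* stand-in for mm_struct */
struct thread {                                 /* stand-in for task_struct */
        uint32_t seqnum;
        struct vma *cache[VMACACHE_SIZE];
};

/* Lookup: a seqnum mismatch means another thread changed the vma set,
 * so the stale slots are dropped before searching them. */
static struct vma *cache_find(struct thread *t, struct mm *mm, unsigned long addr)
{
        int i;

        if (t->seqnum != mm->seqnum) {
                t->seqnum = mm->seqnum;
                memset(t->cache, 0, sizeof(t->cache));
                return NULL;
        }
        for (i = 0; i < VMACACHE_SIZE; i++) {
                struct vma *v = t->cache[i];

                if (v && v->vm_start <= addr && v->vm_end > addr)
                        return v;
        }
        return NULL;
}

/* Update on a miss: the slot is chosen by the page number of the address. */
static void cache_update(struct thread *t, unsigned long addr, struct vma *v)
{
        t->cache[VMACACHE_HASH(addr)] = v;
}

/* Invalidation is just a seqnum bump; a wrap to 0 would additionally need a
 * full flush of every thread sharing the mm (elided here; the patch does
 * this in vmacache_flush_all() below). */
static void cache_invalidate(struct mm *mm)
{
        mm->seqnum++;
}

int main(void)
{
        struct mm mm = { 0 };
        struct thread t = { 0 };
        struct vma v = { 0x1000, 0x3000 };

        cache_update(&t, 0x1000, &v);
        printf("hit: %d\n", cache_find(&t, &mm, 0x2000) != NULL);  /* 1 */
        cache_invalidate(&mm);
        printf("hit: %d\n", cache_find(&t, &mm, 0x2000) != NULL);  /* 0 */
        return 0;
}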
-rw-r--r--  arch/unicore32/include/asm/mmu_context.h |   4
-rw-r--r--  fs/exec.c                                |   5
-rw-r--r--  fs/proc/task_mmu.c                       |   3
-rw-r--r--  include/linux/mm_types.h                 |   4
-rw-r--r--  include/linux/sched.h                    |   7
-rw-r--r--  include/linux/vmacache.h                 |  38
-rw-r--r--  kernel/debug/debug_core.c                |  14
-rw-r--r--  kernel/fork.c                            |   7
-rw-r--r--  mm/Makefile                              |   2
-rw-r--r--  mm/mmap.c                                |  55
-rw-r--r--  mm/nommu.c                               |  24
-rw-r--r--  mm/vmacache.c                            | 112
12 files changed, 231 insertions, 44 deletions
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c658f7a..ef470a7a3d0f 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -14,6 +14,8 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/io.h>
 
 #include <asm/cacheflush.h>
@@ -73,7 +75,7 @@ do { \
 		else \
 			mm->mmap = NULL; \
 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
+		vmacache_invalidate(mm); \
 		mm->map_count--; \
 		remove_vma(high_vma); \
 	} \
diff --git a/fs/exec.c b/fs/exec.c
index 25dfeba6d55f..b60ccf969a8b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
 	task_unlock(tsk);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fb52b548080d..442177b1119a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	/*
 	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
+	 * vmacache most of the time. We have zero last_addr at
 	 * the beginning and also after lseek. We will have -1 last_addr
 	 * after the end of the vmas.
 	 */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 290901a8c1de..2b58d192ea24 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,9 +342,9 @@ struct mm_rss_stat {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7cb07fd26680..642477dd814a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -132,6 +132,10 @@ struct perf_event_context;
 struct blk_plug;
 struct filename;
 
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -1235,6 +1239,9 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
 	unsigned brk_randomized:1;
 #endif
+	/* per-thread vma caching */
+	u32 vmacache_seqnum;
+	struct vm_area_struct *vmacache[VMACACHE_SIZE];
 #if defined(SPLIT_RSS_COUNTING)
 	struct task_rss_stat rss_stat;
 #endif
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
new file mode 100644
index 000000000000..c3fa0fd43949
--- /dev/null
+++ b/include/linux/vmacache.h
@@ -0,0 +1,38 @@
+#ifndef __LINUX_VMACACHE_H
+#define __LINUX_VMACACHE_H
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+/*
+ * Hash based on the page number. Provides a good hit rate for
+ * workloads with good locality and those with random accesses as well.
+ */
+#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
+
+static inline void vmacache_flush(struct task_struct *tsk)
+{
+	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
+}
+
+extern void vmacache_flush_all(struct mm_struct *mm);
+extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
+extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
+					    unsigned long addr);
+
+#ifndef CONFIG_MMU
+extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end);
+#endif
+
+static inline void vmacache_invalidate(struct mm_struct *mm)
+{
+	mm->vmacache_seqnum++;
+
+	/* deal with overflows */
+	if (unlikely(mm->vmacache_seqnum == 0))
+		vmacache_flush_all(mm);
+}
+
+#endif /* __LINUX_VMACACHE_H */
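A quick sanity check of the hash above, assuming PAGE_SHIFT is 12 (4 KiB pages): consecutive pages map to consecutive slots, so both sequential scans and scattered accesses spread their vmas over all four entries rather than fighting over one. A minimal standalone check (the macros are copied out of the header; the surrounding program is illustrative only):

/* Standalone check of VMACACHE_HASH; assumes PAGE_SHIFT == 12. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define VMACACHE_BITS	2
#define VMACACHE_SIZE	(1U << VMACACHE_BITS)
#define VMACACHE_MASK	(VMACACHE_SIZE - 1)
#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)

int main(void)
{
	unsigned long base = 0x7f0000000000UL;
	int i;

	/* Five consecutive pages: slots 0, 1, 2, 3, then 0 again. */
	for (i = 0; i < 5; i++)
		printf("addr %#lx -> slot %lu\n",
		       base + i * 4096UL,
		       VMACACHE_HASH(base + i * 4096UL));
	return 0;
}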
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 99982a70ddad..2956c8da1605 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/rcupdate.h>
 
 #include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 	if (!CACHE_FLUSH_IS_SAFE)
 		return;
 
-	if (current->mm && current->mm->mmap_cache) {
-		flush_cache_range(current->mm->mmap_cache,
-				  addr, addr + BREAK_INSTR_SIZE);
+	if (current->mm) {
+		int i;
+
+		for (i = 0; i < VMACACHE_SIZE; i++) {
+			if (!current->vmacache[i])
+				continue;
+			flush_cache_range(current->vmacache[i],
+					  addr, addr + BREAK_INSTR_SIZE);
+		}
 	}
+
 	/* Force flush instruction cache if it was outside the mm */
 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index e40c0a01d5a6..bc0e96b78dfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
@@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
+	mm->vmacache_seqnum = 0;
 	mm->map_count = 0;
 	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldmm)
 		return 0;
 
+	/* initialize the new vmacache entries */
+	vmacache_flush(tsk);
+
 	if (clone_flags & CLONE_VM) {
 		atomic_inc(&oldmm->mm_users);
 		mm = oldmm;
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..23a6f7e23019 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o \
+			   compaction.o balloon_compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o $(mmu-y)
 
 obj-y += init-mm.o
diff --git a/mm/mmap.c b/mm/mmap.c
index 46433e137abc..b1202cf81f4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
-	struct vm_area_struct *vma = NULL;
+	struct rb_node *rb_node;
+	struct vm_area_struct *vma;
 
 	/* Check the cache first. */
-	/* (Cache hit rate is typically around 35%.) */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-		struct rb_node *rb_node;
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
+		return vma;
 
-		rb_node = mm->mm_rb.rb_node;
-		vma = NULL;
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
 
-		while (rb_node) {
-			struct vm_area_struct *vma_tmp;
+	while (rb_node) {
+		struct vm_area_struct *tmp;
 
-			vma_tmp = rb_entry(rb_node,
-					   struct vm_area_struct, vm_rb);
+		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 
-			if (vma_tmp->vm_end > addr) {
-				vma = vma_tmp;
-				if (vma_tmp->vm_start <= addr)
-					break;
-				rb_node = rb_node->rb_left;
-			} else
-				rb_node = rb_node->rb_right;
-		}
-		if (vma)
-			mm->mmap_cache = vma;
+		if (tmp->vm_end > addr) {
+			vma = tmp;
+			if (tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
 	}
+
+	if (vma)
+		vmacache_update(addr, vma);
 	return vma;
 }
 
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
diff --git a/mm/nommu.c b/mm/nommu.c
index e19482533ce3..5d3f3524bbdc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
 
 #include <linux/export.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/file.h>
@@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
  */
 static void delete_vma_from_mm(struct vm_area_struct *vma)
 {
+	int i;
 	struct address_space *mapping;
 	struct mm_struct *mm = vma->vm_mm;
+	struct task_struct *curr = current;
 
 	kenter("%p", vma);
 
 	protect_vma(vma, 0);
 
 	mm->map_count--;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = NULL;
+	for (i = 0; i < VMACACHE_SIZE; i++) {
+		/* if the vma is cached, invalidate the entire cache */
+		if (curr->vmacache[i] == vma) {
+			vmacache_invalidate(curr->mm);
+			break;
+		}
+	}
 
 	/* remove the VMA from the mapping */
 	if (vma->vm_file) {
@@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	/* check the cache first */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
 		return vma;
 
 	/* trawl the list (there may be multiple mappings in which addr
@@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 		if (vma->vm_start > addr)
 			return NULL;
 		if (vma->vm_end > addr) {
-			mm->mmap_cache = vma;
+			vmacache_update(addr, vma);
 			return vma;
 		}
 	}
@@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
 	unsigned long end = addr + len;
 
 	/* check the cache first */
-	vma = mm->mmap_cache;
-	if (vma && vma->vm_start == addr && vma->vm_end == end)
+	vma = vmacache_find_exact(mm, addr, end);
+	if (vma)
 		return vma;
 
 	/* trawl the list (there may be multiple mappings in which addr
@@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
 		if (vma->vm_start > addr)
 			return NULL;
 		if (vma->vm_end == end) {
-			mm->mmap_cache = vma;
+			vmacache_update(addr, vma);
 			return vma;
 		}
 	}
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..d4224b397c0e
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush vma caches for threads that share a given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+	struct task_struct *g, *p;
+
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		/*
+		 * Only flush the vmacache pointers as the
+		 * mm seqnum is already set and curr's will
+		 * be set upon invalidation when the next
+		 * lookup is done.
+		 */
+		if (mm == p->mm)
+			vmacache_flush(p);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via use_mm().
+ * That kernel thread's vmacache is not applicable to this mm.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+	return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+	if (vmacache_valid_mm(newvma->vm_mm))
+		current->vmacache[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+	struct task_struct *curr;
+
+	if (!vmacache_valid_mm(mm))
+		return false;
+
+	curr = current;
+	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+		/*
+		 * First attempt will always be invalid, initialize
+		 * the new cache for this task here.
+		 */
+		curr->vmacache_seqnum = mm->vmacache_seqnum;
+		vmacache_flush(curr);
+		return false;
+	}
+	return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+	int i;
+
+	if (!vmacache_valid(mm))
+		return NULL;
+
+	for (i = 0; i < VMACACHE_SIZE; i++) {
+		struct vm_area_struct *vma = current->vmacache[i];
+
+		if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
+			BUG_ON(vma->vm_mm != mm);
+			return vma;
+		}
+	}
+
+	return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	int i;
+
+	if (!vmacache_valid(mm))
+		return NULL;
+
+	for (i = 0; i < VMACACHE_SIZE; i++) {
+		struct vm_area_struct *vma = current->vmacache[i];
+
+		if (vma && vma->vm_start == start && vma->vm_end == end)
+			return vma;
+	}
+
+	return NULL;
+}
+#endif
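The subtle part of vmacache_valid() above is the overflow case: if the 32-bit seqnum wrapped back to a value some idle thread still holds, that thread's stale entries would look valid again. That is why vmacache_invalidate() flushes every sharer when the counter hits zero. A hedged userspace illustration of the hazard (stand-in types, not kernel code):

/* Why the wrap-to-zero flush matters; types are illustrative stand-ins. */
#include <assert.h>
#include <stdint.h>

struct mm { uint32_t seqnum; };
struct thread { uint32_t seqnum; int cache_valid; };

static void invalidate(struct mm *mm, struct thread *threads, int n)
{
	if (++mm->seqnum == 0) {	/* 2^32 invalidations: seqnums alias */
		int i;

		for (i = 0; i < n; i++)	/* cf. vmacache_flush_all() */
			threads[i].cache_valid = 0;
	}
}

int main(void)
{
	struct mm mm = { UINT32_MAX };	/* one bump away from wrapping */
	struct thread t = { 0, 1 };	/* stale entries kept since seqnum 0 */

	invalidate(&mm, &t, 1);		/* seqnum wraps to 0 */

	/* Without the flush, t.seqnum == mm.seqnum would revalidate the
	 * stale cache; the flush closes that window. */
	assert(mm.seqnum == 0 && t.seqnum == 0 && !t.cache_valid);
	return 0;
}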