sh: Flush only the needed range when unmapping a VMA.

This follows the ARM change from Aaro Koskinen: When unmapping N pages (e.g. shared memory) the amount of TLB flushes done can be (N*PAGE_SIZE/ZAP_BLOCK_SIZE)*N although it should be N at maximum. With PREEMPT kernel ZAP_BLOCK_SIZE is 8 pages, so there is a noticeable performance penalty when unmapping a large VMA and the system is spending its time in flush_tlb_range(). The problem is that tlb_end_vma() is always flushing the full VMA range. The subrange that needs to be flushed can be calculated by tlb_remove_tlb_entry(). This approach was suggested by Hugh Dickins, and is also used by other arches. The speed increase is roughly 3x for 8M mappings and for larger mappings even more. Bits and pieces are taken from the ARM patch as well as the existing arch/um implementation that is quite similar. The end result is a significant reduction in both partial and full TLB flushes initiated through flush_tlb_range(). At the same time, the nommu implementation was broken, had a superfluous cache flush, and subsequently would have triggered a BUG_ON() if a code-path had triggered it. Tidy this up for correctness and provide a nopped-out implementation there. More background on the initial discussion can be found at: http://marc.info/?t=123609820900002&r=1&w=2 http://marc.info/?t=123660375800003&r=1&w=2 Signed-off-by: Paul Mundt <lethal@linux-sh.org>
author: Paul Mundt <lethal@linux-sh.org> 2009-03-17 08:19:49 -0400
committer: Paul Mundt <lethal@linux-sh.org> 2009-03-17 08:19:49 -0400
commit: c20351846efcb755ba849d9fb701fbd9a1ffb7c2 (patch)
tree: 8e8c0cdce372160fb2081d4e491550d1e8a8c99c /arch
parent: 3a3b311ca375a37b29bb78b030f96bf97dee97f5 (diff)
1 files changed, 92 insertions, 8 deletions
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 88ff1ae8a6b8..9c16f737074a 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -6,22 +6,106 @@
 #endif
 #ifndef __ASSEMBLY__
+#include <linux/pagemap.h>
+#ifdef CONFIG_MMU
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+/*
+ * TLB handling.  This allows us to remove pages from the page
+ * tables, and efficiently handle the TLB issues.
+ */
+struct mmu_gather {
+        struct mm_struct        *mm;
+        unsigned int            fullmm;
+        unsigned long           start, end;
+};
-#define tlb_start_vma(tlb, vma) \
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-        flush_cache_range(vma, vma->vm_start, vma->vm_end)
-#define tlb_end_vma(tlb, vma)   \
+static inline void init_tlb_gather(struct mmu_gather *tlb)
-        flush_tlb_range(vma, vma->vm_start, vma->vm_end)
+{
+        tlb->start = TASK_SIZE;
+        tlb->end = 0;
-#define __tlb_remove_tlb_entry(tlb, pte, address)       do { } while (0)
+        if (tlb->fullmm) {
+                tlb->start = 0;
+                tlb->end = TASK_SIZE;
+        }
+}
+static inline struct mmu_gather *
+tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+{
+        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+        tlb->mm = mm;
+        tlb->fullmm = full_mm_flush;
+        init_tlb_gather(tlb);
+        return tlb;
+}
+static inline void
+tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+        if (tlb->fullmm)
+                flush_tlb_mm(tlb->mm);
+        /* keep the page table cache within bounds */
+        check_pgt_cache();
+        put_cpu_var(mmu_gathers);
+}
+static inline void
+tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
+{
+        if (tlb->start > address)
+                tlb->start = address;
+        if (tlb->end < address + PAGE_SIZE)
+                tlb->end = address + PAGE_SIZE;
+}
 /*
- * Flush whole TLBs for MM
+ * In the case of tlb vma handling, we can optimise these away in the
+ * case where we're doing a full MM flush.  When we're doing a munmap,
+ * the vmas are adjusted to only cover the region to be torn down.
 */
-#define tlb_flush(tlb)                          flush_tlb_mm((tlb)->mm)
+static inline void
+tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+        if (!tlb->fullmm)
+                flush_cache_range(vma, vma->vm_start, vma->vm_end);
+}
+static inline void
+tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+        if (!tlb->fullmm && tlb->end) {
+                flush_tlb_range(vma, tlb->start, tlb->end);
+                init_tlb_gather(tlb);
+        }
+}
+#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
+#define pte_free_tlb(tlb, ptep)         pte_free((tlb)->mm, ptep)
+#define pmd_free_tlb(tlb, pmdp)         pmd_free((tlb)->mm, pmdp)
+#define pud_free_tlb(tlb, pudp)         pud_free((tlb)->mm, pudp)
+#define tlb_migrate_finish(mm)          do { } while (0)
+#else /* CONFIG_MMU */
+#define tlb_start_vma(tlb, vma)                         do { } while (0)
+#define tlb_end_vma(tlb, vma)                           do { } while (0)
+#define __tlb_remove_tlb_entry(tlb, pte, address)       do { } while (0)
+#define tlb_flush(tlb)                                  do { } while (0)
-#include <linux/pagemap.h>
 #include <asm-generic/tlb.h>
+#endif /* CONFIG_MMU */
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_SH_TLB_H */
author	Paul Mundt <lethal@linux-sh.org>	2009-03-17 08:19:49 -0400
committer	Paul Mundt <lethal@linux-sh.org>	2009-03-17 08:19:49 -0400
commit	c20351846efcb755ba849d9fb701fbd9a1ffb7c2 (patch)
tree	8e8c0cdce372160fb2081d4e491550d1e8a8c99c /arch
parent	3a3b311ca375a37b29bb78b030f96bf97dee97f5 (diff)

diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 88ff1ae8a6b8..9c16f737074a 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h
@@ -6,22 +6,106 @@
6	#endif	6	#endif
7		7
8	#ifndef __ASSEMBLY__	8	#ifndef __ASSEMBLY__
		9	#include <linux/pagemap.h>
		10
		11	#ifdef CONFIG_MMU
		12	#include <asm/pgalloc.h>
		13	#include <asm/tlbflush.h>
		14
		15	/*
		16	* TLB handling. This allows us to remove pages from the page
		17	* tables, and efficiently handle the TLB issues.
		18	*/
		19	struct mmu_gather {
		20	struct mm_struct *mm;
		21	unsigned int fullmm;
		22	unsigned long start, end;
		23	};
9		24
10	#define tlb_start_vma(tlb, vma) \	25	DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
11	flush_cache_range(vma, vma->vm_start, vma->vm_end)
12		26
13	#define tlb_end_vma(tlb, vma) \	27	static inline void init_tlb_gather(struct mmu_gather *tlb)
14	flush_tlb_range(vma, vma->vm_start, vma->vm_end)	28	{
		29	tlb->start = TASK_SIZE;
		30	tlb->end = 0;
15		31
16	#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0)	32	if (tlb->fullmm) {
		33	tlb->start = 0;
		34	tlb->end = TASK_SIZE;
		35	}
		36	}
		37
		38	static inline struct mmu_gather *
		39	tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
		40	{
		41	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
		42
		43	tlb->mm = mm;
		44	tlb->fullmm = full_mm_flush;
		45
		46	init_tlb_gather(tlb);
		47
		48	return tlb;
		49	}
		50
		51	static inline void
		52	tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
		53	{
		54	if (tlb->fullmm)
		55	flush_tlb_mm(tlb->mm);
		56
		57	/* keep the page table cache within bounds */
		58	check_pgt_cache();
		59
		60	put_cpu_var(mmu_gathers);
		61	}
		62
		63	static inline void
		64	tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
		65	{
		66	if (tlb->start > address)
		67	tlb->start = address;
		68	if (tlb->end < address + PAGE_SIZE)
		69	tlb->end = address + PAGE_SIZE;
		70	}
17		71
18	/*	72	/*
19	* Flush whole TLBs for MM	73	* In the case of tlb vma handling, we can optimise these away in the
		74	* case where we're doing a full MM flush. When we're doing a munmap,
		75	* the vmas are adjusted to only cover the region to be torn down.
20	*/	76	*/
21	#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)	77	static inline void
		78	tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
		79	{
		80	if (!tlb->fullmm)
		81	flush_cache_range(vma, vma->vm_start, vma->vm_end);
		82	}
		83
		84	static inline void
		85	tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
		86	{
		87	if (!tlb->fullmm && tlb->end) {
		88	flush_tlb_range(vma, tlb->start, tlb->end);
		89	init_tlb_gather(tlb);
		90	}
		91	}
		92
		93	#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page)
		94	#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep)
		95	#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp)
		96	#define pud_free_tlb(tlb, pudp) pud_free((tlb)->mm, pudp)
		97
		98	#define tlb_migrate_finish(mm) do { } while (0)
		99
		100	#else /* CONFIG_MMU */
		101
		102	#define tlb_start_vma(tlb, vma) do { } while (0)
		103	#define tlb_end_vma(tlb, vma) do { } while (0)
		104	#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0)
		105	#define tlb_flush(tlb) do { } while (0)
22		106
23	#include <linux/pagemap.h>
24	#include <asm-generic/tlb.h>	107	#include <asm-generic/tlb.h>
25		108
		109	#endif /* CONFIG_MMU */
26	#endif /* __ASSEMBLY__ */	110	#endif /* __ASSEMBLY__ */
27	#endif /* __ASM_SH_TLB_H */	111	#endif /* __ASM_SH_TLB_H */