author | Aaro Koskinen <aaro.koskinen@nokia.com> | 2009-04-14 08:07:35 -0400
committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2009-04-15 05:01:02 -0400
commit | 7fccfc00c003c855936970facdbb667bae9dbe9a
tree | 73c486c5db1c042dba8d507655c5176742bbb9ca /arch/arm/include
parent | 41609ff43005de11dadfb0ccadb344f0e2966829
[ARM] 5450/1: Flush only the needed range when unmapping a VMA
When unmapping N pages (e.g. shared memory) the number of TLB flushes
performed can be (N*PAGE_SIZE/ZAP_BLOCK_SIZE)*N, although it should be at
most N. With a PREEMPT kernel ZAP_BLOCK_SIZE is 8 pages, so there is a
noticeable performance penalty when unmapping a large VMA: the system
spends its time in flush_tlb_range().
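To make the arithmetic concrete, here is a minimal stand-alone sketch (not part of the patch) that compares the worst-case flush counts for the 8 MB case mentioned below, assuming 4 KiB pages and the 8-page PREEMPT ZAP_BLOCK_SIZE:

/* Illustrative only: compares worst-case per-page flush counts. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* 4 KiB pages (assumption) */
	unsigned long zap_block = 8 * page_size;	/* PREEMPT ZAP_BLOCK_SIZE: 8 pages */
	unsigned long vma_size  = 8UL << 20;		/* 8 MB mapping from the example */
	unsigned long npages    = vma_size / page_size;	/* N = 2048 */
	unsigned long nblocks   = vma_size / zap_block;	/* 256 zap blocks */

	/* Old behaviour: every zap block flushes the whole VMA again. */
	printf("before: %lu page flushes\n", nblocks * npages);	/* 524288 */
	/* New behaviour: each page is flushed at most once. */
	printf("after:  %lu page flushes\n", npages);		/* 2048 */
	return 0;
}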
The problem is that tlb_end_vma() always flushes the full VMA range. The
subrange that actually needs flushing can be tracked by
tlb_remove_tlb_entry(); see the stand-alone sketch after the diff below.
This approach was suggested by Hugh Dickins, and is also used by other
architectures.
The speed increase is roughly 3x for 8 MB mappings, and even greater for
larger mappings.
Signed-off-by: Aaro Koskinen <Aaro.Koskinen@nokia.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Diffstat (limited to 'arch/arm/include')
-rw-r--r-- | arch/arm/include/asm/tlb.h | 25
1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 857f1dfac794..321c83e43a1e 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -36,6 +36,8 @@
 struct mmu_gather {
 	struct mm_struct	*mm;
 	unsigned int		fullmm;
+	unsigned long		range_start;
+	unsigned long		range_end;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -63,7 +65,19 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-#define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
+/*
+ * Memorize the range for the TLB flush.
+ */
+static inline void
+tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
+{
+	if (!tlb->fullmm) {
+		if (addr < tlb->range_start)
+			tlb->range_start = addr;
+		if (addr + PAGE_SIZE > tlb->range_end)
+			tlb->range_end = addr + PAGE_SIZE;
+	}
+}
 
 /*
  * In the case of tlb vma handling, we can optimise these away in the
@@ -73,15 +87,18 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 static inline void
 tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (!tlb->fullmm)
+	if (!tlb->fullmm) {
 		flush_cache_range(vma, vma->vm_start, vma->vm_end);
+		tlb->range_start = TASK_SIZE;
+		tlb->range_end = 0;
+	}
 }
 
 static inline void
 tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (!tlb->fullmm)
-		flush_tlb_range(vma, vma->vm_start, vma->vm_end);
+	if (!tlb->fullmm && tlb->range_end > 0)
+		flush_tlb_range(vma, tlb->range_start, tlb->range_end);
 }
 
 #define tlb_remove_page(tlb,page)	free_page_and_swap_cache(page)
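To see the new bookkeeping in isolation, the following stand-alone user-space sketch mimics the tlb_start_vma() / tlb_remove_tlb_entry() / tlb_end_vma() sequence from the hunks above. It is a simulation under simplifying assumptions (a fake VMA base address, a hard-coded 4 KiB page size, the fullmm case omitted, and printf() standing in for flush_tlb_range()); it is not kernel code and the mock_* names are invented for illustration:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define TASK_SIZE	(~0UL)		/* stand-in for the real per-arch constant */

struct mock_gather {			/* mirrors the new mmu_gather fields */
	unsigned long range_start;
	unsigned long range_end;
};

static void mock_start_vma(struct mock_gather *tlb)
{
	/* As in tlb_start_vma(): reset the accumulated range. */
	tlb->range_start = TASK_SIZE;
	tlb->range_end = 0;
}

static void mock_remove_tlb_entry(struct mock_gather *tlb, unsigned long addr)
{
	/* As in tlb_remove_tlb_entry(): grow the range to cover this page. */
	if (addr < tlb->range_start)
		tlb->range_start = addr;
	if (addr + PAGE_SIZE > tlb->range_end)
		tlb->range_end = addr + PAGE_SIZE;
}

static void mock_end_vma(struct mock_gather *tlb)
{
	/* As in tlb_end_vma(): flush only if something was actually unmapped. */
	if (tlb->range_end > 0)
		printf("flush_tlb_range(0x%lx, 0x%lx)\n",
		       tlb->range_start, tlb->range_end);
}

int main(void)
{
	struct mock_gather tlb;
	unsigned long vma_start = 0x40000000UL;	/* arbitrary fake VMA base */

	mock_start_vma(&tlb);
	/* Pretend only pages 3..5 of the VMA had present PTEs. */
	mock_remove_tlb_entry(&tlb, vma_start + 3 * PAGE_SIZE);
	mock_remove_tlb_entry(&tlb, vma_start + 4 * PAGE_SIZE);
	mock_remove_tlb_entry(&tlb, vma_start + 5 * PAGE_SIZE);
	mock_end_vma(&tlb);	/* flushes those 3 pages, not the whole VMA */
	return 0;
}

Running the sketch prints a single flush covering just the three touched pages, which is the behaviour the patch gives tlb_end_vma(); before the patch the equivalent call would have covered vma->vm_start to vma->vm_end.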