author	Zachary Amsden <zach@vmware.com>	2006-10-01 02:29:33 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-01 03:39:33 -0400
commit	6606c3e0da5360799e07ae24b05080cc85c68e72 (patch)
tree	5072acfc3b36e48ec84fe28805d160cbc9b28900
parent	9888a1cae3f859db38b9604e3df1c02177161bb0 (diff)
[PATCH] paravirt: lazy mmu mode hooks
Implement lazy MMU update hooks which are SMP safe for both direct and
shadow page tables.  The idea is that PTE updates and page invalidations
while in lazy mode can be batched into a single hypercall.  We use this
in VMI for shadow page table synchronization, and it is a win.  It also
can be used by PPC and for direct page tables on Xen.

For SMP, the enter / leave must happen under protection of the page table
locks for page tables which are being modified.  This is because otherwise,
you end up with stale state in the batched hypercall, which other CPUs can
race ahead of.  Doing this under the protection of the locks guarantees the
synchronization is correct, and also means that spurious faults which are
generated during this window by remote CPUs are properly handled, as the
page fault handler must re-check the PTE under protection of the same lock.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
 include/asm-generic/pgtable.h | 20 ++++++++++++++++++++
 mm/memory.c                   |  8 ++++++++
 mm/mprotect.c                 |  2 ++
 mm/mremap.c                   |  2 ++
 4 files changed, 32 insertions(+), 0 deletions(-)
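For illustration, here is a minimal sketch (not part of the patch) of the caller-side
pattern the message describes.  It mirrors the remap_pte_range() hunk below: the lazy
window is opened and closed while the page table lock is held, so any batched PTE
writes are flushed before the lock is released and another CPU can act on them.  The
function name example_fill_pte_range() is made up for this sketch.

/*
 * Sketch only: batch a run of PTE writes under one page table lock.
 * On architectures that override the hooks, set_pte_at() calls made
 * inside the window may be queued and flushed by the leave call.
 */
static int example_fill_pte_range(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long addr, unsigned long end,
				  unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();	/* open the batching window */
	do {
		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();	/* flush while ptl is still held */
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}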
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 78740716c9e7..56627fa453a6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -171,6 +171,26 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 #endif
 
 /*
+ * A facility to provide lazy MMU batching. This allows PTE updates and
+ * page invalidations to be delayed until a call to leave lazy MMU mode
+ * is issued. Some architectures may benefit from doing this, and it is
+ * beneficial for both shadow and direct mode hypervisors, which may batch
+ * the PTE updates which happen during this window. Note that using this
+ * interface requires that read hazards be removed from the code. A read
+ * hazard could result in the direct mode hypervisor case, since the actual
+ * write to the page tables may not yet have taken place, so reads through
+ * a raw PTE pointer after it has been modified are not guaranteed to be
+ * up to date. This mode can only be entered and left under the protection of
+ * the page table locks for all page tables which may be modified. In the UP
+ * case, this is required so that preemption is disabled, and in the SMP case,
+ * it must synchronize the delayed page table writes properly on other CPUs.
+ */
+#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+#define arch_enter_lazy_mmu_mode()	do {} while (0)
+#define arch_leave_lazy_mmu_mode()	do {} while (0)
+#endif
+
+/*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier. Although no
  * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
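The default definitions above are no-ops; an architecture opts in by defining
__HAVE_ARCH_ENTER_LAZY_MMU_MODE and supplying its own hooks.  What follows is only a
rough sketch of what such an override might look like, assuming a hypervisor interface
that can accept a batch of queued PTE updates; it is not the VMI or Xen code, and every
example_* name and the per-CPU flag are invented for illustration.

/* Hypothetical <asm/pgtable.h> override -- sketch only. */
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE

DECLARE_PER_CPU(int, example_lazy_mmu);		/* set while batching */
void example_flush_lazy_mmu(void);		/* e.g. one batched hypercall */

static inline void arch_enter_lazy_mmu_mode(void)
{
	/* Callers hold the page table lock, so the window cannot migrate CPUs. */
	__get_cpu_var(example_lazy_mmu) = 1;
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	__get_cpu_var(example_lazy_mmu) = 0;
	example_flush_lazy_mmu();	/* push the queued updates now */
}

The arch's set_pte_at() (or its paravirtualized equivalent) would then check the
per-CPU flag and queue the write instead of issuing an immediate hypercall while the
flag is set.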
diff --git a/mm/memory.c b/mm/memory.c
index 2e754621d333..9cf3f341a28a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -506,6 +506,7 @@ again:
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
 
 	do {
 		/*
@@ -527,6 +528,7 @@ again:
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int anon_rss = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
 		if (pte_none(ptent)) {
@@ -694,6 +697,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
 	add_mm_rss(mm, file_rss, anon_rss);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
 	return addr;
@@ -1109,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
+	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1118,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
@@ -1275,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
+	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 955f9d0e38aa..3b8f3c0c63f3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -34,6 +34,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	spinlock_t *ptl;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
 		if (pte_present(oldpte)) {
@@ -70,6 +71,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		}
 
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 }
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373ad..9c769fa29f32 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
 
+	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);