author		Zachary Amsden <zach@vmware.com>	2007-02-13 07:26:21 -0500
committer	Andi Kleen <andi@basil.nowhere.org>	2007-02-13 07:26:21 -0500
commit		c119ecce894120790903ef535dac3e105f3d6cde (patch)
tree		b9a60fe46b03d396ba396912c237e6ee2e9ef873
parent		90611fe923aa3ac7ffb9e5df45c83860b0f00227 (diff)
[PATCH] MM: page allocation hooks for VMI backend
The VMI backend uses explicit page type notification to track shadow page
tables.  The allocation of page table roots is especially tricky.  We need
to clone the root for non-PAE mode while it is protected under the pgd lock
to correctly copy the shadow.

We don't need to allocate pgds in PAE mode (PDPs in Intel terminology), as
they only have 4 entries and are cached entirely by the processor, which
makes shadowing them rather simple.

For base page table level allocation, pmd_populate provides the exact hook
point we need.  Also, we need to allocate pages when splitting a large
page, and we must release pages before returning the page to any free pool.

Despite being required with these slightly odd semantics for VMI, Xen also
uses these hooks to determine the exact moment when page tables are created
or released.

AK: All nops for other architectures

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
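To illustrate the notification protocol these hooks establish, here is a
minimal user-space sketch (not part of the patch; every name below is
invented for the example).  It mocks a backend that shadows pages typed as
page tables and checks the two ordering rules the patch enforces: notify
before the page is installed as a page table, and release before the page
goes back to any free pool.

/*
 * Hypothetical mock of the page-type notification protocol.
 * All names here are invented for illustration; compile with
 * a plain C compiler, no kernel headers needed.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_PFNS 1024

static bool is_shadowed[MAX_PFNS];	/* mock "shadow page table" state */

/* Backend hook: page at pfn is about to be used as a page table. */
static void mock_alloc_pt(uint32_t pfn)
{
	assert(!is_shadowed[pfn]);	/* must not already be tracked */
	is_shadowed[pfn] = true;
	printf("shadowing pfn %u\n", pfn);
}

/* Backend hook: page at pfn is no longer a page table. */
static void mock_release_pt(uint32_t pfn)
{
	assert(is_shadowed[pfn]);
	is_shadowed[pfn] = false;
	printf("unshadowing pfn %u\n", pfn);
}

/* Mock free pool: the type must have been released before this. */
static void mock_free_page(uint32_t pfn)
{
	assert(!is_shadowed[pfn]);	/* release must come first */
}

int main(void)
{
	uint32_t pfn = 42;

	mock_alloc_pt(pfn);	/* analogous to paravirt_alloc_pt() */
	/* ... page is installed with set_pmd() and used ... */
	mock_release_pt(pfn);	/* analogous to paravirt_release_pt() */
	mock_free_page(pfn);	/* only now may the page be freed */
	return 0;
}

The same ordering shows up in the diff below: paravirt_alloc_pt() is
called before set_pmd() installs the page, and paravirt_release_pt() runs
before tlb_remove_page() returns the page to the free pool.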
-rw-r--r--	arch/i386/kernel/paravirt.c	 6
-rw-r--r--	arch/i386/mm/init.c		 4
-rw-r--r--	arch/i386/mm/pageattr.c		 2
-rw-r--r--	arch/i386/mm/pgtable.c		24
-rw-r--r--	include/asm-i386/paravirt.h	14
-rw-r--r--	include/asm-i386/pgalloc.h	30
6 files changed, 72 insertions(+), 8 deletions(-)
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f5..7329ec9fcc99 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -550,6 +550,12 @@ struct paravirt_ops paravirt_ops = {
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
 
+	.alloc_pt = (void *)native_nop,
+	.alloc_pd = (void *)native_nop,
+	.alloc_pd_clone = (void *)native_nop,
+	.release_pt = (void *)native_nop,
+	.release_pd = (void *)native_nop,
+
 	.set_pte = native_set_pte,
 	.set_pte_at = native_set_pte_at,
 	.set_pmd = native_set_pmd,
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index c5c5ea700cc7..ae436882af7a 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
 	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+	paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 	pud = pud_offset(pgd, 0);
 	if (pmd_table != pmd_offset(pud, 0))
@@ -82,6 +83,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
 	if (pmd_none(*pmd)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		if (page_table != pte_offset_kernel(pmd, 0))
 			BUG();
@@ -345,6 +347,8 @@ static void __init pagetable_init (void)
 	/* Init entries of the first-level page table to the zero page */
 	for (i = 0; i < PTRS_PER_PGD; i++)
 		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#else
+	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
 
 	/* Enable PSE if available */
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index e223b1d4981c..412ebbd8adb0 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,6 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
+	paravirt_alloc_pt(page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
 					   addr == address ? prot : ref_prot));
@@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 	if (!PageReserved(kpte_page)) {
 		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
 			ClearPagePrivate(kpte_page);
+			paravirt_release_pt(page_to_pfn(kpte_page));
 			list_add(&kpte_page->lru, &df_list);
 			revert_page(kpte_page, address);
 		}
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index f349eaf450b0..b5f538f52272 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -248,9 +248,15 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			KERNEL_PGD_PTRS);
+
 	if (PTRS_PER_PMD > 1)
 		return;
 
+	/* must happen under lock */
+	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+			__pa(swapper_pg_dir) >> PAGE_SHIFT,
+			USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
+
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
@@ -260,6 +266,7 @@ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
+	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -277,13 +284,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
 		if (!pmd)
 			goto out_oom;
+		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}
 	return pgd;
 
 out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	for (i--; i >= 0; i--) {
+		pgd_t pgdent = pgd[i];
+		void* pmd = (void *)__va(pgd_val(pgdent)-1);
+		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+		kmem_cache_free(pmd_cache, pmd);
+	}
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
 }
@@ -294,8 +306,12 @@ void pgd_free(pgd_t *pgd)
 
 	/* in the PAE case user pgd entries are overwritten before usage */
 	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+			pgd_t pgdent = pgd[i];
+			void* pmd = (void *)__va(pgd_val(pgdent)-1);
+			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+			kmem_cache_free(pmd_cache, pmd);
+		}
 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
 }
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 9f06265065f4..53da276a2ec2 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -127,6 +127,12 @@ struct paravirt_ops
 	void (fastcall *flush_tlb_kernel)(void);
 	void (fastcall *flush_tlb_single)(u32 addr);
 
+	void (fastcall *alloc_pt)(u32 pfn);
+	void (fastcall *alloc_pd)(u32 pfn);
+	void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+	void (fastcall *release_pt)(u32 pfn);
+	void (fastcall *release_pd)(u32 pfn);
+
 	void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
 	void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
 	void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
@@ -320,6 +326,14 @@ static inline unsigned long apic_read(unsigned long reg)
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
 #define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
 
+#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn)
+#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn)
+
+#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) \
+	paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count)
+#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn)
+
 static inline void set_pte(pte_t *ptep, pte_t pteval)
 {
 	paravirt_ops.set_pte(ptep, pteval);
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index 4b1e61359f89..c8dc2d0141a7 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -5,13 +5,31 @@
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
 
-#define pmd_populate_kernel(mm, pmd, pte) \
-	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pt(pfn) do { } while (0)
+#define paravirt_release_pd(pfn) do { } while (0)
+#endif
+
+#define pmd_populate_kernel(mm, pmd, pte) \
+do { \
+	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \
+	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
+} while (0)
 
 #define pmd_populate(mm, pmd, pte) \
+do { \
+	paravirt_alloc_pt(page_to_pfn(pte)); \
 	set_pmd(pmd, __pmd(_PAGE_TABLE + \
 		((unsigned long long)page_to_pfn(pte) << \
-			(unsigned long long) PAGE_SHIFT)))
+			(unsigned long long) PAGE_SHIFT))); \
+} while (0)
+
 /*
  * Allocate and free page tables.
  */
@@ -32,7 +50,11 @@ static inline void pte_free(struct page *pte)
 }
 
 
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb,pte) \
+do { \
+	paravirt_release_pt(page_to_pfn(pte)); \
+	tlb_remove_page((tlb),(pte)); \
+} while (0)
 
 #ifdef CONFIG_X86_PAE
 /*