author		Jeremy Fitzhardinge <jeremy@goop.org>	2007-05-02 13:27:13 -0400
committer	Andi Kleen <andi@basil.nowhere.org>	2007-05-02 13:27:13 -0400
commit		5311ab62cdc7788784971ed816ce85e926f3e994 (patch)
tree		08ceda3c1bbdc6c403107f5329d775c772b752ce
parent		90caccb9758e88db68a69553689baee38254287b (diff)
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space, which can be shared among all processes (since they all need the same kernel mappings).

Xen, however, does not allow guests to have the kernel pmd shared between page tables, so parameterize pgtable.c to allow both modes of operation.

There are several side-effects of this. One is that vmalloc will update the kernel address space mappings, and those updates need to be propagated into all processes if the kernel mappings are not intrinsically shared. In the non-PAE case, this is done by maintaining a pgd_list of all processes; this list is used when all process pagetables must be updated. pgd_list is threaded via otherwise unused entries in the page structure for the pgd, which means that the pgd must be page-sized for this to work.

Normally the PAE pgd is only four 64-bit entries (32 bytes) large, but Xen requires the PAE pgd to be page-aligned anyway, so this patch forces the pgd to be page-aligned and page-sized when the kernel pmd is unshared, to accommodate both these requirements.

Also, since there may be several distinct kernel pmds (if the user/kernel split is below 3G), there's no point in allocating them from a slab cache; they're just allocated with get_free_page and initialized appropriately. (Of course they could be cached if there is just a single kernel pmd - which is the default with a 3G user/kernel split - but it doesn't seem worthwhile to add yet another case into this code.)

[ Many thanks to wli for review comments. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
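To make the page-size constraint concrete, this is roughly how the pgd_list threading works in arch/i386/mm/pgtable.c at this point in the tree (a sketch for orientation, not part of the patch): the list links live in the otherwise-unused page->index and page->private fields of the pgd's struct page, so virt_to_page() must resolve each pgd to its own distinct page.

	static struct page *pgd_list;	/* file-scope list head, protected by pgd_lock */

	static inline void pgd_list_add(pgd_t *pgd)
	{
		struct page *page = virt_to_page(pgd);

		/* forward link in page->index, back link via page->private */
		page->index = (unsigned long)pgd_list;
		if (pgd_list)
			set_page_private(pgd_list, (unsigned long)&page->index);
		pgd_list = page;
		set_page_private(page, (unsigned long)&pgd_list);
	}

If two 32-byte PAE pgds shared a single page, they would also share a single struct page and the list links would collide; forcing the pgd to be page-aligned and page-sized keeps the pgd-to-page mapping one-to-one.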
-rw-r--r--  arch/i386/kernel/paravirt.c              1
-rw-r--r--  arch/i386/mm/fault.c                     5
-rw-r--r--  arch/i386/mm/init.c                      18
-rw-r--r--  arch/i386/mm/pageattr.c                  2
-rw-r--r--  arch/i386/mm/pgtable.c                   88
-rw-r--r--  include/asm-i386/paravirt.h              1
-rw-r--r--  include/asm-i386/pgtable-2level-defs.h   2
-rw-r--r--  include/asm-i386/pgtable-2level.h        2
-rw-r--r--  include/asm-i386/pgtable-3level-defs.h   6
-rw-r--r--  include/asm-i386/pgtable-3level.h        2
-rw-r--r--  include/asm-i386/pgtable.h               2
11 files changed, 101 insertions, 28 deletions
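For orientation before the per-file diffs (illustrative only, not part of this patch): native hardware keeps the default shared_kernel_pmd = 1 set in paravirt.c below, while a backend such as Xen that cannot share the kernel pmd would clear the flag during its own early setup, before any user pagetables exist. A hypothetical hook might look like:

	#include <asm/paravirt.h>

	/* example_guest_init() is a made-up name for a backend's early setup */
	static void __init example_guest_init(void)
	{
		/* each pagetable must carry its own private kernel pmds */
		paravirt_ops.shared_kernel_pmd = 0;
	}

With the flag cleared, SHARED_KERNEL_PMD evaluates to 0 and the unshared-pmd paths added below take effect.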
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 47d075bdfb95..2040a831d5b3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -132,6 +132,7 @@ struct paravirt_ops paravirt_ops = {
132 .name = "bare hardware", 132 .name = "bare hardware",
133 .paravirt_enabled = 0, 133 .paravirt_enabled = 0,
134 .kernel_rpl = 0, 134 .kernel_rpl = 0,
135 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
135 136
136 .patch = native_patch, 137 .patch = native_patch,
137 .banner = default_banner, 138 .banner = default_banner,
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index c6a0a06258e6..f534c29e80b2 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -603,7 +603,6 @@ do_sigbus:
 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
-#ifndef CONFIG_X86_PAE
 void vmalloc_sync_all(void)
 {
 	/*
@@ -616,6 +615,9 @@ void vmalloc_sync_all(void)
 	static unsigned long start = TASK_SIZE;
 	unsigned long address;
 
+	if (SHARED_KERNEL_PMD)
+		return;
+
 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
 		if (!test_bit(pgd_index(address), insync)) {
@@ -638,4 +640,3 @@ void vmalloc_sync_all(void)
 		start = address + PGDIR_SIZE;
 	}
 }
-#endif
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index e8545dcf06c5..dbe16f63a566 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -745,6 +745,8 @@ struct kmem_cache *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
+	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
+
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
 					PTRS_PER_PMD*sizeof(pmd_t),
@@ -754,13 +756,23 @@ void __init pgtable_cache_init(void)
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
+
+		if (!SHARED_KERNEL_PMD) {
+			/* If we're in PAE mode and have a non-shared
+			   kernel pmd, then the pgd size must be a
+			   page size.  This is because the pgd_list
+			   links through the page structure, so there
+			   can only be one pgd per page for this to
+			   work. */
+			pgd_size = PAGE_SIZE;
+		}
 	}
 	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
+				pgd_size,
+				pgd_size,
 				0,
 				pgd_ctor,
-				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+				(!SHARED_KERNEL_PMD) ? pgd_dtor : NULL);
 	if (!pgd_cache)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index ea6b6d4a0a2a..47bd477c8ecc 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	unsigned long flags;
 
 	set_pte_atomic(kpte, pte);	/* change init_mm */
-	if (PTRS_PER_PMD > 1)
+	if (SHARED_KERNEL_PMD)
 		return;
 
 	spin_lock_irqsave(&pgd_lock, flags);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 99c09edc3dbb..9a96c1647428 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -232,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd)
 	set_page_private(next, (unsigned long)pprev);
 }
 
+#if (PTRS_PER_PMD == 1)
+/* Non-PAE pgd constructor */
 void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags;
 
-	if (PTRS_PER_PMD == 1) {
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-		spin_lock_irqsave(&pgd_lock, flags);
-	}
+	/* !PAE, no pagetable sharing */
+	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+	spin_lock_irqsave(&pgd_lock, flags);
 
+	/* must happen under lock */
 	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			KERNEL_PGD_PTRS);
-
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	/* must happen under lock */
 	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
 				__pa(swapper_pg_dir) >> PAGE_SHIFT,
-				USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
-
+				USER_PTRS_PER_PGD,
+				KERNEL_PGD_PTRS);
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
+#else	/* PTRS_PER_PMD > 1 */
+/* PAE pgd constructor */
+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
+{
+	/* PAE, kernel PMD may be shared */
+
+	if (SHARED_KERNEL_PMD) {
+		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+				swapper_pg_dir + USER_PTRS_PER_PGD,
+				KERNEL_PGD_PTRS);
+	} else {
+		unsigned long flags;
+
+		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+		spin_lock_irqsave(&pgd_lock, flags);
+		pgd_list_add(pgd);
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+#endif	/* PTRS_PER_PMD */
 
-/* never called when PTRS_PER_PMD > 1 */
 void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
+	BUG_ON(SHARED_KERNEL_PMD);
+
 	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
+#define UNSHARED_PTRS_PER_PGD				\
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+/* If we allocate a pmd for part of the kernel address space, then
+   make sure its initialized with the appropriate kernel mappings.
+   Otherwise use a cached zeroed pmd. */
+static pmd_t *pmd_cache_alloc(int idx)
+{
+	pmd_t *pmd;
+
+	if (idx >= USER_PTRS_PER_PGD) {
+		pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+
+		if (pmd)
+			memcpy(pmd,
+			       (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+	} else
+		pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+
+	return pmd;
+}
+
+static void pmd_cache_free(pmd_t *pmd, int idx)
+{
+	if (idx >= USER_PTRS_PER_PGD)
+		free_page((unsigned long)pmd);
+	else
+		kmem_cache_free(pmd_cache, pmd);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
@@ -276,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	if (PTRS_PER_PMD == 1 || !pgd)
 		return pgd;
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+	for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+		pmd_t *pmd = pmd_cache_alloc(i);
+
 		if (!pmd)
 			goto out_oom;
+
 		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}
@@ -290,7 +342,7 @@ out_oom:
 		pgd_t pgdent = pgd[i];
 		void* pmd = (void *)__va(pgd_val(pgdent)-1);
 		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-		kmem_cache_free(pmd_cache, pmd);
+		pmd_cache_free(pmd, i);
 	}
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
@@ -302,11 +354,11 @@ void pgd_free(pgd_t *pgd)
 
 	/* in the PAE case user pgd entries are overwritten before usage */
 	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+		for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
 			pgd_t pgdent = pgd[i];
 			void* pmd = (void *)__va(pgd_val(pgdent)-1);
 			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-			kmem_cache_free(pmd_cache, pmd);
+			pmd_cache_free(pmd, i);
 		}
 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index c49b44cdd8ee..f93599dc7756 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -35,6 +35,7 @@ struct desc_struct;
 struct paravirt_ops
 {
 	unsigned int kernel_rpl;
+	int shared_kernel_pmd;
 	int paravirt_enabled;
 	const char *name;
 
diff --git a/include/asm-i386/pgtable-2level-defs.h b/include/asm-i386/pgtable-2level-defs.h
index 02518079f816..0f71c9f13da4 100644
--- a/include/asm-i386/pgtable-2level-defs.h
+++ b/include/asm-i386/pgtable-2level-defs.h
@@ -1,6 +1,8 @@
 #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
 #define _I386_PGTABLE_2LEVEL_DEFS_H
 
+#define SHARED_KERNEL_PMD 0
+
 /*
  * traditional i386 two-level paging structure:
  */
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 043a2bcfa86a..781fe4bcc962 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -82,6 +82,4 @@ static inline int pte_exec_kernel(pte_t pte)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { (pte).pte_low })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-void vmalloc_sync_all(void);
-
 #endif /* _I386_PGTABLE_2LEVEL_H */
diff --git a/include/asm-i386/pgtable-3level-defs.h b/include/asm-i386/pgtable-3level-defs.h
index eb3a1ea88671..c0df89f66e8b 100644
--- a/include/asm-i386/pgtable-3level-defs.h
+++ b/include/asm-i386/pgtable-3level-defs.h
@@ -1,6 +1,12 @@
 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
 #define _I386_PGTABLE_3LEVEL_DEFS_H
 
+#ifdef CONFIG_PARAVIRT
+#define SHARED_KERNEL_PMD	(paravirt_ops.shared_kernel_pmd)
+#else
+#define SHARED_KERNEL_PMD	1
+#endif
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index be6017f37a91..664bfee5a2f2 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -200,6 +200,4 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 
 #define __pmd_free_tlb(tlb, x)		do { } while (0)
 
-#define vmalloc_sync_all() ((void)0)
-
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 0790ad6ed440..5b88a6a1278e 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -243,6 +243,8 @@ static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
 
+extern void vmalloc_sync_all(void);
+
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else