diff options
author | Jeremy Fitzhardinge <jeremy@goop.org> | 2007-05-02 13:27:13 -0400 |
---|---|---|
committer | Andi Kleen <andi@basil.nowhere.org> | 2007-05-02 13:27:13 -0400 |
commit | 5311ab62cdc7788784971ed816ce85e926f3e994 (patch) | |
tree | 08ceda3c1bbdc6c403107f5329d775c772b752ce /arch | |
parent | 90caccb9758e88db68a69553689baee38254287b (diff) |
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/i386/kernel/paravirt.c | 1 | ||||
-rw-r--r-- | arch/i386/mm/fault.c | 5 | ||||
-rw-r--r-- | arch/i386/mm/init.c | 18 | ||||
-rw-r--r-- | arch/i386/mm/pageattr.c | 2 | ||||
-rw-r--r-- | arch/i386/mm/pgtable.c | 88 |
5 files changed, 90 insertions, 24 deletions
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 47d075bdfb95..2040a831d5b3 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c | |||
@@ -132,6 +132,7 @@ struct paravirt_ops paravirt_ops = { | |||
132 | .name = "bare hardware", | 132 | .name = "bare hardware", |
133 | .paravirt_enabled = 0, | 133 | .paravirt_enabled = 0, |
134 | .kernel_rpl = 0, | 134 | .kernel_rpl = 0, |
135 | .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ | ||
135 | 136 | ||
136 | .patch = native_patch, | 137 | .patch = native_patch, |
137 | .banner = default_banner, | 138 | .banner = default_banner, |
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c6a0a06258e6..f534c29e80b2 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c | |||
@@ -603,7 +603,6 @@ do_sigbus: | |||
603 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | 603 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); |
604 | } | 604 | } |
605 | 605 | ||
606 | #ifndef CONFIG_X86_PAE | ||
607 | void vmalloc_sync_all(void) | 606 | void vmalloc_sync_all(void) |
608 | { | 607 | { |
609 | /* | 608 | /* |
@@ -616,6 +615,9 @@ void vmalloc_sync_all(void) | |||
616 | static unsigned long start = TASK_SIZE; | 615 | static unsigned long start = TASK_SIZE; |
617 | unsigned long address; | 616 | unsigned long address; |
618 | 617 | ||
618 | if (SHARED_KERNEL_PMD) | ||
619 | return; | ||
620 | |||
619 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); | 621 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); |
620 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { | 622 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { |
621 | if (!test_bit(pgd_index(address), insync)) { | 623 | if (!test_bit(pgd_index(address), insync)) { |
@@ -638,4 +640,3 @@ void vmalloc_sync_all(void) | |||
638 | start = address + PGDIR_SIZE; | 640 | start = address + PGDIR_SIZE; |
639 | } | 641 | } |
640 | } | 642 | } |
641 | #endif | ||
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index e8545dcf06c5..dbe16f63a566 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c | |||
@@ -745,6 +745,8 @@ struct kmem_cache *pmd_cache; | |||
745 | 745 | ||
746 | void __init pgtable_cache_init(void) | 746 | void __init pgtable_cache_init(void) |
747 | { | 747 | { |
748 | size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); | ||
749 | |||
748 | if (PTRS_PER_PMD > 1) { | 750 | if (PTRS_PER_PMD > 1) { |
749 | pmd_cache = kmem_cache_create("pmd", | 751 | pmd_cache = kmem_cache_create("pmd", |
750 | PTRS_PER_PMD*sizeof(pmd_t), | 752 | PTRS_PER_PMD*sizeof(pmd_t), |
@@ -754,13 +756,23 @@ void __init pgtable_cache_init(void) | |||
754 | NULL); | 756 | NULL); |
755 | if (!pmd_cache) | 757 | if (!pmd_cache) |
756 | panic("pgtable_cache_init(): cannot create pmd cache"); | 758 | panic("pgtable_cache_init(): cannot create pmd cache"); |
759 | |||
760 | if (!SHARED_KERNEL_PMD) { | ||
761 | /* If we're in PAE mode and have a non-shared | ||
762 | kernel pmd, then the pgd size must be a | ||
763 | page size. This is because the pgd_list | ||
764 | links through the page structure, so there | ||
765 | can only be one pgd per page for this to | ||
766 | work. */ | ||
767 | pgd_size = PAGE_SIZE; | ||
768 | } | ||
757 | } | 769 | } |
758 | pgd_cache = kmem_cache_create("pgd", | 770 | pgd_cache = kmem_cache_create("pgd", |
759 | PTRS_PER_PGD*sizeof(pgd_t), | 771 | pgd_size, |
760 | PTRS_PER_PGD*sizeof(pgd_t), | 772 | pgd_size, |
761 | 0, | 773 | 0, |
762 | pgd_ctor, | 774 | pgd_ctor, |
763 | PTRS_PER_PMD == 1 ? pgd_dtor : NULL); | 775 | (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); |
764 | if (!pgd_cache) | 776 | if (!pgd_cache) |
765 | panic("pgtable_cache_init(): Cannot create pgd cache"); | 777 | panic("pgtable_cache_init(): Cannot create pgd cache"); |
766 | } | 778 | } |
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index ea6b6d4a0a2a..47bd477c8ecc 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c | |||
@@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) | |||
91 | unsigned long flags; | 91 | unsigned long flags; |
92 | 92 | ||
93 | set_pte_atomic(kpte, pte); /* change init_mm */ | 93 | set_pte_atomic(kpte, pte); /* change init_mm */ |
94 | if (PTRS_PER_PMD > 1) | 94 | if (SHARED_KERNEL_PMD) |
95 | return; | 95 | return; |
96 | 96 | ||
97 | spin_lock_irqsave(&pgd_lock, flags); | 97 | spin_lock_irqsave(&pgd_lock, flags); |
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 99c09edc3dbb..9a96c1647428 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c | |||
@@ -232,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd) | |||
232 | set_page_private(next, (unsigned long)pprev); | 232 | set_page_private(next, (unsigned long)pprev); |
233 | } | 233 | } |
234 | 234 | ||
235 | #if (PTRS_PER_PMD == 1) | ||
236 | /* Non-PAE pgd constructor */ | ||
235 | void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) | 237 | void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) |
236 | { | 238 | { |
237 | unsigned long flags; | 239 | unsigned long flags; |
238 | 240 | ||
239 | if (PTRS_PER_PMD == 1) { | 241 | /* !PAE, no pagetable sharing */ |
240 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | 242 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); |
241 | spin_lock_irqsave(&pgd_lock, flags); | 243 | |
242 | } | 244 | spin_lock_irqsave(&pgd_lock, flags); |
243 | 245 | ||
246 | /* must happen under lock */ | ||
244 | clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | 247 | clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, |
245 | swapper_pg_dir + USER_PTRS_PER_PGD, | 248 | swapper_pg_dir + USER_PTRS_PER_PGD, |
246 | KERNEL_PGD_PTRS); | 249 | KERNEL_PGD_PTRS); |
247 | |||
248 | if (PTRS_PER_PMD > 1) | ||
249 | return; | ||
250 | |||
251 | /* must happen under lock */ | ||
252 | paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, | 250 | paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, |
253 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | 251 | __pa(swapper_pg_dir) >> PAGE_SHIFT, |
254 | USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); | 252 | USER_PTRS_PER_PGD, |
255 | 253 | KERNEL_PGD_PTRS); | |
256 | pgd_list_add(pgd); | 254 | pgd_list_add(pgd); |
257 | spin_unlock_irqrestore(&pgd_lock, flags); | 255 | spin_unlock_irqrestore(&pgd_lock, flags); |
258 | } | 256 | } |
257 | #else /* PTRS_PER_PMD > 1 */ | ||
258 | /* PAE pgd constructor */ | ||
259 | void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) | ||
260 | { | ||
261 | /* PAE, kernel PMD may be shared */ | ||
262 | |||
263 | if (SHARED_KERNEL_PMD) { | ||
264 | clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | ||
265 | swapper_pg_dir + USER_PTRS_PER_PGD, | ||
266 | KERNEL_PGD_PTRS); | ||
267 | } else { | ||
268 | unsigned long flags; | ||
269 | |||
270 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | ||
271 | spin_lock_irqsave(&pgd_lock, flags); | ||
272 | pgd_list_add(pgd); | ||
273 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
274 | } | ||
275 | } | ||
276 | #endif /* PTRS_PER_PMD */ | ||
259 | 277 | ||
260 | /* never called when PTRS_PER_PMD > 1 */ | ||
261 | void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) | 278 | void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) |
262 | { | 279 | { |
263 | unsigned long flags; /* can be called from interrupt context */ | 280 | unsigned long flags; /* can be called from interrupt context */ |
264 | 281 | ||
282 | BUG_ON(SHARED_KERNEL_PMD); | ||
283 | |||
265 | paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); | 284 | paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); |
266 | spin_lock_irqsave(&pgd_lock, flags); | 285 | spin_lock_irqsave(&pgd_lock, flags); |
267 | pgd_list_del(pgd); | 286 | pgd_list_del(pgd); |
268 | spin_unlock_irqrestore(&pgd_lock, flags); | 287 | spin_unlock_irqrestore(&pgd_lock, flags); |
269 | } | 288 | } |
270 | 289 | ||
290 | #define UNSHARED_PTRS_PER_PGD \ | ||
291 | (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) | ||
292 | |||
293 | /* If we allocate a pmd for part of the kernel address space, then | ||
294 | make sure its initialized with the appropriate kernel mappings. | ||
295 | Otherwise use a cached zeroed pmd. */ | ||
296 | static pmd_t *pmd_cache_alloc(int idx) | ||
297 | { | ||
298 | pmd_t *pmd; | ||
299 | |||
300 | if (idx >= USER_PTRS_PER_PGD) { | ||
301 | pmd = (pmd_t *)__get_free_page(GFP_KERNEL); | ||
302 | |||
303 | if (pmd) | ||
304 | memcpy(pmd, | ||
305 | (void *)pgd_page_vaddr(swapper_pg_dir[idx]), | ||
306 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
307 | } else | ||
308 | pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | ||
309 | |||
310 | return pmd; | ||
311 | } | ||
312 | |||
313 | static void pmd_cache_free(pmd_t *pmd, int idx) | ||
314 | { | ||
315 | if (idx >= USER_PTRS_PER_PGD) | ||
316 | free_page((unsigned long)pmd); | ||
317 | else | ||
318 | kmem_cache_free(pmd_cache, pmd); | ||
319 | } | ||
320 | |||
271 | pgd_t *pgd_alloc(struct mm_struct *mm) | 321 | pgd_t *pgd_alloc(struct mm_struct *mm) |
272 | { | 322 | { |
273 | int i; | 323 | int i; |
@@ -276,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
276 | if (PTRS_PER_PMD == 1 || !pgd) | 326 | if (PTRS_PER_PMD == 1 || !pgd) |
277 | return pgd; | 327 | return pgd; |
278 | 328 | ||
279 | for (i = 0; i < USER_PTRS_PER_PGD; ++i) { | 329 | for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { |
280 | pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | 330 | pmd_t *pmd = pmd_cache_alloc(i); |
331 | |||
281 | if (!pmd) | 332 | if (!pmd) |
282 | goto out_oom; | 333 | goto out_oom; |
334 | |||
283 | paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); | 335 | paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); |
284 | set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); | 336 | set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); |
285 | } | 337 | } |
@@ -290,7 +342,7 @@ out_oom: | |||
290 | pgd_t pgdent = pgd[i]; | 342 | pgd_t pgdent = pgd[i]; |
291 | void* pmd = (void *)__va(pgd_val(pgdent)-1); | 343 | void* pmd = (void *)__va(pgd_val(pgdent)-1); |
292 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | 344 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); |
293 | kmem_cache_free(pmd_cache, pmd); | 345 | pmd_cache_free(pmd, i); |
294 | } | 346 | } |
295 | kmem_cache_free(pgd_cache, pgd); | 347 | kmem_cache_free(pgd_cache, pgd); |
296 | return NULL; | 348 | return NULL; |
@@ -302,11 +354,11 @@ void pgd_free(pgd_t *pgd) | |||
302 | 354 | ||
303 | /* in the PAE case user pgd entries are overwritten before usage */ | 355 | /* in the PAE case user pgd entries are overwritten before usage */ |
304 | if (PTRS_PER_PMD > 1) | 356 | if (PTRS_PER_PMD > 1) |
305 | for (i = 0; i < USER_PTRS_PER_PGD; ++i) { | 357 | for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { |
306 | pgd_t pgdent = pgd[i]; | 358 | pgd_t pgdent = pgd[i]; |
307 | void* pmd = (void *)__va(pgd_val(pgdent)-1); | 359 | void* pmd = (void *)__va(pgd_val(pgdent)-1); |
308 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | 360 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); |
309 | kmem_cache_free(pmd_cache, pmd); | 361 | pmd_cache_free(pmd, i); |
310 | } | 362 | } |
311 | /* in the non-PAE case, free_pgtables() clears user pgd entries */ | 363 | /* in the non-PAE case, free_pgtables() clears user pgd entries */ |
312 | kmem_cache_free(pgd_cache, pgd); | 364 | kmem_cache_free(pgd_cache, pgd); |