author     Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-05-05 17:55:20 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-05-05 17:55:20 -0400
commit     ea62ccd00fd0b6720b033adfc9984f31130ce195 (patch)
tree       9837b797b2466fffcb0af96c388b06eae9c3df18 /arch/i386/mm
parent     886a0768affe9a32f18c45f8e1393bca9ece5392 (diff)
parent     35060b6a9a4e1c89bc6fbea61090e302dbc61847 (diff)
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (231 commits)
  [PATCH] i386: Don't delete cpu_devs data to identify different x86 types in late_initcall
  [PATCH] i386: type may be unused
  [PATCH] i386: Some additional chipset register values validation.
  [PATCH] i386: Add missing !X86_PAE dependincy to the 2G/2G split.
  [PATCH] x86-64: Don't exclude asm-offsets.c in Documentation/dontdiff
  [PATCH] i386: avoid redundant preempt_disable in __unlazy_fpu
  [PATCH] i386: white space fixes in i387.h
  [PATCH] i386: Drop noisy e820 debugging printks
  [PATCH] x86-64: Fix allnoconfig error in genapic_flat.c
  [PATCH] x86-64: Shut up warnings for vfat compat ioctls on other file systems
  [PATCH] x86-64: Share identical video.S between i386 and x86-64
  [PATCH] x86-64: Remove CONFIG_REORDER
  [PATCH] x86-64: Print type and size correctly for unknown compat ioctls
  [PATCH] i386: Remove copy_*_user BUG_ONs for (size < 0)
  [PATCH] i386: Little cleanups in smpboot.c
  [PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA scanning
  [PATCH] x86: Use RDTSCP for synchronous get_cycles if possible
  [PATCH] i386: Add X86_FEATURE_RDTSCP
  [PATCH] i386: Implement X86_FEATURE_SYNC_RDTSC on i386
  [PATCH] i386: Implement alternative_io for i386
  ...

Fix up trivial conflict in include/linux/highmem.h manually.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/i386/mm')
-rw-r--r--  arch/i386/mm/fault.c    |  60
-rw-r--r--  arch/i386/mm/highmem.c  |  10
-rw-r--r--  arch/i386/mm/init.c     | 196
-rw-r--r--  arch/i386/mm/pageattr.c |   6
-rw-r--r--  arch/i386/mm/pgtable.c  |  94
5 files changed, 252 insertions, 114 deletions
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index b8c4e259fc8b..f534c29e80b2 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -20,6 +20,7 @@
 #include <linux/tty.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
+#include <linux/bootmem.h>		/* for max_low_pfn */
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
@@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	unsigned long address;
-	unsigned long page;
 	int write, si_code;
 
 	/* get the address */
@@ -510,7 +510,9 @@ no_context:
 	bust_spinlocks(1);
 
 	if (oops_may_print()) {
- #ifdef CONFIG_X86_PAE
+		__typeof__(pte_val(__pte(0))) page;
+
+#ifdef CONFIG_X86_PAE
 		if (error_code & 16) {
 			pte_t *pte = lookup_address(address);
 
@@ -519,7 +521,7 @@ no_context:
519 "NX-protected page - exploit attempt? " 521 "NX-protected page - exploit attempt? "
520 "(uid: %d)\n", current->uid); 522 "(uid: %d)\n", current->uid);
521 } 523 }
522 #endif 524#endif
523 if (address < PAGE_SIZE) 525 if (address < PAGE_SIZE)
524 printk(KERN_ALERT "BUG: unable to handle kernel NULL " 526 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
525 "pointer dereference"); 527 "pointer dereference");
@@ -529,25 +531,38 @@ no_context:
529 printk(" at virtual address %08lx\n",address); 531 printk(" at virtual address %08lx\n",address);
530 printk(KERN_ALERT " printing eip:\n"); 532 printk(KERN_ALERT " printing eip:\n");
531 printk("%08lx\n", regs->eip); 533 printk("%08lx\n", regs->eip);
532 } 534
533 page = read_cr3(); 535 page = read_cr3();
534 page = ((unsigned long *) __va(page))[address >> 22]; 536 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
535 if (oops_may_print()) 537#ifdef CONFIG_X86_PAE
538 printk(KERN_ALERT "*pdpt = %016Lx\n", page);
539 if ((page >> PAGE_SHIFT) < max_low_pfn
540 && page & _PAGE_PRESENT) {
541 page &= PAGE_MASK;
542 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
543 & (PTRS_PER_PMD - 1)];
544 printk(KERN_ALERT "*pde = %016Lx\n", page);
545 page &= ~_PAGE_NX;
546 }
547#else
536 printk(KERN_ALERT "*pde = %08lx\n", page); 548 printk(KERN_ALERT "*pde = %08lx\n", page);
537 /*
538 * We must not directly access the pte in the highpte
539 * case, the page table might be allocated in highmem.
540 * And lets rather not kmap-atomic the pte, just in case
541 * it's allocated already.
542 */
543#ifndef CONFIG_HIGHPTE
544 if ((page & 1) && oops_may_print()) {
545 page &= PAGE_MASK;
546 address &= 0x003ff000;
547 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
548 printk(KERN_ALERT "*pte = %08lx\n", page);
549 }
550#endif 549#endif
550
551 /*
552 * We must not directly access the pte in the highpte
553 * case if the page table is located in highmem.
554 * And let's rather not kmap-atomic the pte, just in case
555 * it's allocated already.
556 */
557 if ((page >> PAGE_SHIFT) < max_low_pfn
558 && (page & _PAGE_PRESENT)) {
559 page &= PAGE_MASK;
560 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
561 & (PTRS_PER_PTE - 1)];
562 printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
563 }
564 }
565
551 tsk->thread.cr2 = address; 566 tsk->thread.cr2 = address;
552 tsk->thread.trap_no = 14; 567 tsk->thread.trap_no = 14;
553 tsk->thread.error_code = error_code; 568 tsk->thread.error_code = error_code;
@@ -588,7 +603,6 @@ do_sigbus:
 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
-#ifndef CONFIG_X86_PAE
 void vmalloc_sync_all(void)
 {
 	/*
@@ -601,6 +615,9 @@ void vmalloc_sync_all(void)
 	static unsigned long start = TASK_SIZE;
 	unsigned long address;
 
+	if (SHARED_KERNEL_PMD)
+		return;
+
 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
 		if (!test_bit(pgd_index(address), insync)) {
@@ -623,4 +640,3 @@ void vmalloc_sync_all(void)
 			start = address + PGDIR_SIZE;
 	}
 }
-#endif
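
Note (not part of the patch): the rewritten oops dump reads the page-directory base from CR3, indexes it with the faulting address, and, if the entry is present and below max_low_pfn, follows it down to the PTE, printing *pdpt/*pde/*pte along the way. As a minimal illustration of the index arithmetic only, here is a stand-alone user-space sketch for the classic non-PAE, 4 KB-page layout; PAGE_SHIFT, PGDIR_SHIFT and PTRS_PER_PTE are assumed constants mirroring the i386 defaults, and the address is arbitrary.

#include <stdio.h>

/* Assumed i386 non-PAE constants: 4 KB pages, 4 MB per page-directory entry. */
#define PAGE_SHIFT   12
#define PGDIR_SHIFT  22
#define PTRS_PER_PTE 1024

int main(void)
{
	unsigned long address = 0xc0123456UL;	/* example kernel virtual address */

	/* Index into the page directory (what the dump uses to find *pde). */
	unsigned long pde_index = address >> PGDIR_SHIFT;

	/* Index into the page table (what the dump uses to find *pte). */
	unsigned long pte_index = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	printf("address %#lx -> pde[%lu], pte[%lu]\n",
	       address, pde_index, pte_index);
	return 0;
}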
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index ac70d09df7ee..ad8d86cc683e 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -26,7 +26,7 @@ void kunmap(struct page *page)
  * However when holding an atomic kmap is is not legal to sleep, so atomic
  * kmaps are appropriate for short, tight code paths only.
  */
-void *kmap_atomic(struct page *page, enum km_type type)
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
@@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type)
 		return page_address(page);
 
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-	set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+	set_pte(kmap_pte-idx, mk_pte(page, prot));
 	arch_flush_lazy_mmu_mode();
 
 	return (void*) vaddr;
 }
 
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+	return kmap_atomic_prot(page, type, kmap_prot);
+}
+
 void kunmap_atomic(void *kvaddr, enum km_type type)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -67,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
 	}
 
+	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 
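
Note (not part of the patch): kmap_atomic() keeps its old behaviour by passing kmap_prot through, while kmap_atomic_prot() lets a caller choose the protection bits for the temporary fixmap slot. A hypothetical in-kernel caller might look like the sketch below; the helper name and the choice of KM_USER0 and PAGE_KERNEL_RO are purely illustrative assumptions, not something this patch adds.

/* Hypothetical example: read the first byte of a page through a
 * read-only atomic mapping, using the new kmap_atomic_prot(). */
static unsigned char peek_first_byte(struct page *page)
{
	unsigned char *vaddr = kmap_atomic_prot(page, KM_USER0, PAGE_KERNEL_RO);
	unsigned char c = vaddr[0];

	kunmap_atomic(vaddr, KM_USER0);
	return c;
}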
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index ae436882af7a..dbe16f63a566 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/pfn.h>
 #include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
@@ -42,6 +43,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/paravirt.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -61,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 	pmd_t *pmd_table;
 
 #ifdef CONFIG_X86_PAE
-	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-	paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
-	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-	pud = pud_offset(pgd, 0);
-	if (pmd_table != pmd_offset(pud, 0))
-		BUG();
-#else
+	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+		paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+		pud = pud_offset(pgd, 0);
+		if (pmd_table != pmd_offset(pud, 0))
+			BUG();
+	}
+#endif
 	pud = pud_offset(pgd, 0);
 	pmd_table = pmd_offset(pud, 0);
-#endif
-
 	return pmd_table;
 }
 
@@ -81,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
-	if (pmd_none(*pmd)) {
+	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
 		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-		if (page_table != pte_offset_kernel(pmd, 0))
-			BUG();
-
-		return page_table;
+		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
 
 	return pte_offset_kernel(pmd, 0);
@@ -108,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
 {
 	pgd_t *pgd;
-	pud_t *pud;
 	pmd_t *pmd;
 	int pgd_idx, pmd_idx;
 	unsigned long vaddr;
@@ -119,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
 	pgd = pgd_base + pgd_idx;
 
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-		if (pgd_none(*pgd))
-			one_md_table_init(pgd);
-		pud = pud_offset(pgd, vaddr);
-		pmd = pmd_offset(pud, vaddr);
+		pmd = one_md_table_init(pgd);
+		pmd = pmd + pmd_index(vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
-			if (pmd_none(*pmd))
-				one_page_table_init(pmd);
+			one_page_table_init(pmd);
 
 			vaddr += PMD_SIZE;
 		}
@@ -167,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		/* Map with big pages if possible, otherwise create normal page tables. */
 		if (cpu_has_pse) {
 			unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-
 			if (is_kernel_text(address) || is_kernel_text(address2))
 				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
 			else
 				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+
 			pfn += PTRS_PER_PTE;
 		} else {
 			pte = one_page_table_init(pmd);
 
-			for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
-				if (is_kernel_text(address))
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-				else
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+			for (pte_ofs = 0;
+			     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+			     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+				if (is_kernel_text(address))
+					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+				else
+					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
 			}
 		}
 	}
@@ -337,24 +336,78 @@ extern void __init remap_numa_kva(void);
 #define remap_numa_kva() do {} while (0)
 #endif
 
-static void __init pagetable_init (void)
+void __init native_pagetable_setup_start(pgd_t *base)
 {
-	unsigned long vaddr;
-	pgd_t *pgd_base = swapper_pg_dir;
-
 #ifdef CONFIG_X86_PAE
 	int i;
-	/* Init entries of the first-level page table to the zero page */
-	for (i = 0; i < PTRS_PER_PGD; i++)
-		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+	/*
+	 * Init entries of the first-level page table to the
+	 * zero page, if they haven't already been set up.
+	 *
+	 * In a normal native boot, we'll be running on a
+	 * pagetable rooted in swapper_pg_dir, but not in PAE
+	 * mode, so this will end up clobbering the mappings
+	 * for the lower 24Mbytes of the address space,
+	 * without affecting the kernel address space.
+	 */
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		set_pgd(&base[i],
+			__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+	/* Make sure kernel address space is empty so that a pagetable
+	   will be allocated for it. */
+	memset(&base[USER_PTRS_PER_PGD], 0,
+	       KERNEL_PGD_PTRS * sizeof(pgd_t));
 #else
 	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
+}
+
+void __init native_pagetable_setup_done(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+	/*
+	 * Add low memory identity-mappings - SMP needs it when
+	 * starting up on an AP from real-mode. In the non-PAE
+	 * case we already have these mappings through head.S.
+	 * All user-space mappings are explicitly cleared after
+	 * SMP startup.
+	 */
+	set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings.  Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+ * (even if we'll end up running in PAE).  The root of the pagetable
+ * will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir.  In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init (void)
+{
+	unsigned long vaddr, end;
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	paravirt_pagetable_setup_start(pgd_base);
 
 	/* Enable PSE if available */
-	if (cpu_has_pse) {
+	if (cpu_has_pse)
 		set_in_cr4(X86_CR4_PSE);
-	}
 
 	/* Enable PGE if available */
 	if (cpu_has_pge) {
@@ -371,20 +424,12 @@ static void __init pagetable_init (void)
 	 * created - mappings will be set by set_fixmap():
 	 */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-	page_table_range_init(vaddr, 0, pgd_base);
+	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+	page_table_range_init(vaddr, end, pgd_base);
 
 	permanent_kmaps_init(pgd_base);
 
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
-#endif
+	paravirt_pagetable_setup_done(pgd_base);
 }
 
 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
@@ -700,6 +745,8 @@ struct kmem_cache *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
+	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
+
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
 					PTRS_PER_PMD*sizeof(pmd_t),
@@ -709,13 +756,23 @@ void __init pgtable_cache_init(void)
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
+
+		if (!SHARED_KERNEL_PMD) {
+			/* If we're in PAE mode and have a non-shared
+			   kernel pmd, then the pgd size must be a
+			   page size.  This is because the pgd_list
+			   links through the page structure, so there
+			   can only be one pgd per page for this to
+			   work. */
+			pgd_size = PAGE_SIZE;
+		}
 	}
 	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
+				pgd_size,
+				pgd_size,
 				0,
 				pgd_ctor,
-				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+				(!SHARED_KERNEL_PMD) ? pgd_dtor : NULL);
 	if (!pgd_cache)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
@@ -751,13 +808,25 @@ static int noinline do_test_wp_bit(void)
 
 void mark_rodata_ro(void)
 {
-	unsigned long addr = (unsigned long)__start_rodata;
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
 
-	for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
-		change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+#ifdef CONFIG_HOTPLUG_CPU
+	/* It must still be possible to apply SMP alternatives. */
+	if (num_possible_cpus() <= 1)
+#endif
+	{
+		change_page_attr(virt_to_page(start),
+			size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+		printk("Write protecting the kernel text: %luk\n", size >> 10);
+	}
 
-	printk("Write protecting the kernel read-only data: %uk\n",
-			(__end_rodata - __start_rodata) >> 10);
+	start += size;
+	size = (unsigned long)__end_rodata - start;
+	change_page_attr(virt_to_page(start),
+		size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+	printk("Write protecting the kernel read-only data: %luk\n",
+		size >> 10);
 
 	/*
 	 * change_page_attr() requires a global_flush_tlb() call after it.
@@ -774,26 +843,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
+		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+		ClearPageReserved(page);
+		init_page_count(page);
+		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		__free_page(page);
 		totalram_pages++;
 	}
-	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
 }
 
 void free_initmem(void)
 {
 	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+			__pa_symbol(&__init_begin),
+			__pa_symbol(&__init_end));
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_init_pages("initrd memory", __pa(start), __pa(end));
 }
 #endif
 
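
Note (not part of the patch): mark_rodata_ro() now page-aligns the kernel text range with PFN_ALIGN and write-protects whole ranges with a single change_page_attr() call sized in pages, and free_init_pages() now takes physical addresses (its callers pass __pa_symbol()/__pa()) and converts them to struct page with pfn_to_page(). As a minimal user-space sketch of just the alignment and page-count arithmetic, with PAGE_SHIFT assumed to be 12 and the two addresses standing in for the _text/_etext link symbols:

#include <stdio.h>

/* Assumed constants for illustration; PFN_ALIGN rounds up to the next
 * page boundary, like the kernel's <linux/pfn.h> helper. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_ALIGN(x) (((unsigned long)(x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* Hypothetical link-time addresses standing in for _text and _etext. */
	unsigned long text  = 0xc0100000UL;
	unsigned long etext = 0xc03f2345UL;

	unsigned long start = PFN_ALIGN(text);
	unsigned long size  = PFN_ALIGN(etext) - start;

	printf("write-protect %lu pages (%luk) starting at %#lx\n",
	       size >> PAGE_SHIFT, size >> 10, start);
	return 0;
}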
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 412ebbd8adb0..47bd477c8ecc 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	unsigned long flags;
 
 	set_pte_atomic(kpte, pte); 	/* change init_mm */
-	if (PTRS_PER_PMD > 1)
+	if (SHARED_KERNEL_PMD)
 		return;
 
 	spin_lock_irqsave(&pgd_lock, flags);
@@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 		return -EINVAL;
 	kpte_page = virt_to_page(kpte);
 	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
-		if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot));
 		} else {
 			pgprot_t ref_prot;
@@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 			kpte_page = split;
 		}
 		page_private(kpte_page)++;
-	} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+	} else if (!pte_huge(*kpte)) {
 		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
 		BUG_ON(page_private(kpte_page) == 0);
 		page_private(kpte_page)--;
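
Note (not part of the patch): the pte_huge() conversions are readability changes only; the helper tests the same _PAGE_PSE (large page) bit that the open-coded mask did. A self-contained sketch of that equivalence, with the bit position of _PAGE_PSE assumed for illustration:

#include <stdio.h>

/* Assumed value for illustration: _PAGE_PSE is bit 7 in an x86 pte. */
#define _PAGE_PSE (1UL << 7)

/* Simplified stand-in for the kernel's pte_huge(): true when the PSE
 * (large page) bit is set in the pte value. */
static int pte_huge(unsigned long pte_val)
{
	return (pte_val & _PAGE_PSE) != 0;
}

int main(void)
{
	unsigned long small = 0x063;               /* present, normal 4 KB mapping */
	unsigned long large = 0x063 | _PAGE_PSE;   /* present, large-page mapping  */

	printf("small mapping: pte_huge=%d\n", pte_huge(small));
	printf("large mapping: pte_huge=%d\n", pte_huge(large));
	return 0;
}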
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index fa0cfbd551e1..9a96c1647428 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 }
 
 static int fixmaps;
-#ifndef CONFIG_COMPAT_VDSO
 unsigned long __FIXADDR_TOP = 0xfffff000;
 EXPORT_SYMBOL(__FIXADDR_TOP);
-#endif
 
 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 {
@@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve)
 	BUG_ON(fixmaps > 0);
 	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
 	       (int)-reserve);
-#ifdef CONFIG_COMPAT_VDSO
-	BUG_ON(reserve != 0);
-#else
 	__FIXADDR_TOP = -reserve - PAGE_SIZE;
 	__VMALLOC_RESERVE += reserve;
-#endif
 }
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -238,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd)
 	set_page_private(next, (unsigned long)pprev);
 }
 
+#if (PTRS_PER_PMD == 1)
+/* Non-PAE pgd constructor */
 void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags;
 
-	if (PTRS_PER_PMD == 1) {
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-		spin_lock_irqsave(&pgd_lock, flags);
-	}
+	/* !PAE, no pagetable sharing */
+	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+	spin_lock_irqsave(&pgd_lock, flags);
 
+	/* must happen under lock */
 	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			KERNEL_PGD_PTRS);
-
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	/* must happen under lock */
 	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
 			__pa(swapper_pg_dir) >> PAGE_SHIFT,
-			USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
-
+			USER_PTRS_PER_PGD,
+			KERNEL_PGD_PTRS);
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
+#else  /* PTRS_PER_PMD > 1 */
+/* PAE pgd constructor */
+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
+{
+	/* PAE, kernel PMD may be shared */
+
+	if (SHARED_KERNEL_PMD) {
+		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+				swapper_pg_dir + USER_PTRS_PER_PGD,
+				KERNEL_PGD_PTRS);
+	} else {
+		unsigned long flags;
+
+		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+		spin_lock_irqsave(&pgd_lock, flags);
+		pgd_list_add(pgd);
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+#endif	/* PTRS_PER_PMD */
 
-/* never called when PTRS_PER_PMD > 1 */
 void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
+	BUG_ON(SHARED_KERNEL_PMD);
+
 	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
+#define UNSHARED_PTRS_PER_PGD \
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+/* If we allocate a pmd for part of the kernel address space, then
+   make sure its initialized with the appropriate kernel mappings.
+   Otherwise use a cached zeroed pmd.  */
+static pmd_t *pmd_cache_alloc(int idx)
+{
+	pmd_t *pmd;
+
+	if (idx >= USER_PTRS_PER_PGD) {
+		pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+
+		if (pmd)
+			memcpy(pmd,
+			       (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+	} else
+		pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+
+	return pmd;
+}
+
+static void pmd_cache_free(pmd_t *pmd, int idx)
+{
+	if (idx >= USER_PTRS_PER_PGD)
+		free_page((unsigned long)pmd);
+	else
+		kmem_cache_free(pmd_cache, pmd);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	int i;
@@ -282,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	if (PTRS_PER_PMD == 1 || !pgd)
 		return pgd;
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+	for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+		pmd_t *pmd = pmd_cache_alloc(i);
+
 		if (!pmd)
 			goto out_oom;
+
 		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}
@@ -296,7 +342,7 @@ out_oom:
 		pgd_t pgdent = pgd[i];
 		void* pmd = (void *)__va(pgd_val(pgdent)-1);
 		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-		kmem_cache_free(pmd_cache, pmd);
+		pmd_cache_free(pmd, i);
 	}
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
@@ -308,11 +354,11 @@ void pgd_free(pgd_t *pgd)
 
 	/* in the PAE case user pgd entries are overwritten before usage */
 	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+		for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
 			pgd_t pgdent = pgd[i];
 			void* pmd = (void *)__va(pgd_val(pgdent)-1);
 			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-			kmem_cache_free(pmd_cache, pmd);
+			pmd_cache_free(pmd, i);
 		}
 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
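
Note (not part of the patch): UNSHARED_PTRS_PER_PGD controls how many pgd slots pgd_alloc()/pgd_free() manage per process. When the kernel pmd is shared (SHARED_KERNEL_PMD), only the user part needs per-pgd pmds; when it is not shared, every slot gets one, and pmd_cache_alloc() seeds kernel-range pmds from swapper_pg_dir instead of handing out a zeroed pmd. A small user-space model of that selection; the PAE-style constants (4 pgd slots, 3 user / 1 kernel) are assumed for illustration:

#include <stdio.h>

/* Assumed PAE-style constants, for illustration only. */
#define PTRS_PER_PGD      4
#define USER_PTRS_PER_PGD 3	/* 3 GB user / 1 GB kernel split */

/* Mirrors the patch's UNSHARED_PTRS_PER_PGD selection. */
static int unshared_ptrs_per_pgd(int shared_kernel_pmd)
{
	return shared_kernel_pmd ? USER_PTRS_PER_PGD : PTRS_PER_PGD;
}

int main(void)
{
	int shared, i;

	for (shared = 1; shared >= 0; shared--) {
		printf("SHARED_KERNEL_PMD=%d: allocate pmds for pgd slots", shared);
		for (i = 0; i < unshared_ptrs_per_pgd(shared); i++)
			printf(" %d%s", i,
			       i >= USER_PTRS_PER_PGD ?
			       " (kernel, copied from swapper_pg_dir)" : "");
		printf("\n");
	}
	return 0;
}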