Diffstat (limited to 'arch/x86/mm/pgtable.c')
-rw-r--r--   arch/x86/mm/pgtable.c   199
1 file changed, 128 insertions(+), 71 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
+#include <asm/fixmap.h>
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD \
         (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(void *p)
+static void pgd_ctor(pgd_t *pgd)
 {
-        pgd_t *pgd = p;
-        unsigned long flags;
-
-        /* Clear usermode parts of PGD */
-        memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-        spin_lock_irqsave(&pgd_lock, flags);
-
         /* If the pgd points to a shared pagetable level (either the
            ptes in non-PAE, or shared PMD in PAE), then just copy the
            references from swapper_pg_dir. */
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p)
         /* list required to sync kernel mapping updates */
         if (!SHARED_KERNEL_PMD)
                 pgd_list_add(pgd);
-
-        spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-static void pgd_dtor(void *pgd)
+static void pgd_dtor(pgd_t *pgd)
 {
         unsigned long flags; /* can be called from interrupt context */
 
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd)
 
 #ifdef CONFIG_X86_PAE
 /*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+        /* Note: almost everything apart from _PAGE_PRESENT is
+           reserved at the pmd (PDPT) level. */
+        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+        /*
+         * According to Intel App note "TLBs, Paging-Structure Caches,
+         * and Their Invalidation", April 2007, document 317080-001,
+         * section 8.1: in PAE mode we explicitly have to flush the
+         * TLB via cr3 if the top-level pgd is changed...
+         */
+        if (mm == current->active_mm)
+                write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS       0
+
+#endif  /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+        int i;
+
+        for(i = 0; i < PREALLOCATED_PMDS; i++)
+                if (pmds[i])
+                        free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+        int i;
+        bool failed = false;
+
+        for(i = 0; i < PREALLOCATED_PMDS; i++) {
+                pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+                if (pmd == NULL)
+                        failed = true;
+                pmds[i] = pmd;
+        }
+
+        if (failed) {
+                free_pmds(pmds);
+                return -ENOMEM;
+        }
+
+        return 0;
+}
+
+/*
  * Mop up any pmd pages which may still be attached to the pgd.
  * Normally they will be freed by munmap/exit_mmap, but any pmd we
  * preallocate which never got a corresponding vma will need to be
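
The preallocate_pmds()/free_pmds() pair added above is an all-or-nothing batch: every slot is attempted first, and the whole batch is rolled back if any allocation came back NULL, so a caller never sees a partially filled array. A minimal userspace sketch of the same pattern, with N and calloc()/free() as stand-ins for PREALLOCATED_PMDS and get_zeroed_page()/free_page():

#include <stdbool.h>
#include <stdlib.h>

#define N 4     /* stand-in for PREALLOCATED_PMDS */

static void free_all(void *bufs[])
{
        int i;

        for (i = 0; i < N; i++)
                free(bufs[i]);  /* free(NULL) is a no-op */
}

static int preallocate_all(void *bufs[])
{
        bool failed = false;
        int i;

        /* Attempt every slot before deciding, mirroring preallocate_pmds(). */
        for (i = 0; i < N; i++) {
                bufs[i] = calloc(1, 4096);
                if (bufs[i] == NULL)
                        failed = true;
        }

        if (failed) {   /* roll the whole batch back, as free_pmds() does */
                free_all(bufs);
                return -1;
        }

        return 0;
}

int main(void)
{
        void *bufs[N];

        if (preallocate_all(bufs) == 0)
                free_all(bufs);
        return 0;
}
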
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
         int i;
 
-        for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+        for(i = 0; i < PREALLOCATED_PMDS; i++) {
                 pgd_t pgd = pgdp[i];
 
                 if (pgd_val(pgd) != 0) {
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
         }
 }
 
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
         pud_t *pud;
         unsigned long addr;
         int i;
 
+        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+                return;
+
         pud = pud_offset(pgd, 0);
-        for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-             i++, pud++, addr += PUD_SIZE) {
-                pmd_t *pmd = pmd_alloc_one(mm, addr);
 
-                if (!pmd) {
-                        pgd_mop_up_pmds(mm, pgd);
-                        return 0;
-                }
+        for (addr = i = 0; i < PREALLOCATED_PMDS;
+             i++, pud++, addr += PUD_SIZE) {
+                pmd_t *pmd = pmds[i];
 
                 if (i >= KERNEL_PGD_BOUNDARY)
                         memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
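
The KERNEL_PGD_BOUNDARY test above decides which preallocated pmds inherit the kernel mappings from swapper_pg_dir. For the common 32-bit PAE configuration (CONFIG_PAGE_OFFSET=0xC0000000, four 1GB pgd slots), only the topmost slot is kernel territory. A standalone arithmetic check, with the constants hard-coded purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long long page_offset = 0xC0000000ULL; /* 3G/1G split */
        unsigned long long pud_size = 1ULL << 30;       /* each PAE pgd slot maps 1GB */
        int ptrs_per_pgd = 4;
        int kernel_pgd_boundary = page_offset / pud_size; /* pgd_index(PAGE_OFFSET) == 3 */
        int i;

        for (i = 0; i < ptrs_per_pgd; i++)
                printf("pmd %d: %#010llx-%#010llx  %s\n", i,
                       i * pud_size, (i + 1) * pud_size - 1,
                       i >= kernel_pgd_boundary ? "kernel (copied)" : "user");
        return 0;
}
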
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 
                 pud_populate(mm, pud, pmd);
         }
-
-        return 1;
 }
 
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+        pgd_t *pgd;
+        pmd_t *pmds[PREALLOCATED_PMDS];
+        unsigned long flags;
 
-        /* Note: almost everything apart from _PAGE_PRESENT is
-           reserved at the pmd (PDPT) level. */
-        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+        pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 
-        /*
-         * According to Intel App note "TLBs, Paging-Structure Caches,
-         * and Their Invalidation", April 2007, document 317080-001,
-         * section 8.1: in PAE mode we explicitly have to flush the
-         * TLB via cr3 if the top-level pgd is changed...
-         */
-        if (mm == current->active_mm)
-                write_cr3(read_cr3());
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-        return 1;
-}
+        if (pgd == NULL)
+                goto out;
 
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif  /* CONFIG_X86_PAE */
+        mm->pgd = pgd;
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-        pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+        if (preallocate_pmds(pmds) != 0)
+                goto out_free_pgd;
 
-        /* so that alloc_pmd can use it */
-        mm->pgd = pgd;
-        if (pgd)
-                pgd_ctor(pgd);
+        if (paravirt_pgd_alloc(mm) != 0)
+                goto out_free_pmds;
 
-        if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-                pgd_dtor(pgd);
-                free_page((unsigned long)pgd);
-                pgd = NULL;
-        }
+        /*
+         * Make sure that pre-populating the pmds is atomic with
+         * respect to anything walking the pgd_list, so that they
+         * never see a partially populated pgd.
+         */
+        spin_lock_irqsave(&pgd_lock, flags);
+
+        pgd_ctor(pgd);
+        pgd_prepopulate_pmd(mm, pgd, pmds);
+
+        spin_unlock_irqrestore(&pgd_lock, flags);
 
         return pgd;
+
+out_free_pmds:
+        free_pmds(pmds);
+out_free_pgd:
+        free_page((unsigned long)pgd);
+out:
+        return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
         pgd_mop_up_pmds(mm, pgd);
         pgd_dtor(pgd);
+        paravirt_pgd_free(mm, pgd);
         free_page((unsigned long)pgd);
 }
 
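
The rewritten pgd_alloc() pairs each acquisition with a label that undoes everything acquired before it, so a failure at any step unwinds in reverse order, and the ctor plus prepopulation run under pgd_lock so pgd_list walkers never see a half-built pgd. A compilable sketch of just the unwind shape; ctx, a and b are illustrative resources, not the kernel's:

#include <stdlib.h>

struct ctx {
        void *a;
        void *b;
};

/* Illustrative only: the shape, not the content, matches pgd_alloc(). */
struct ctx *ctx_alloc(void)
{
        struct ctx *ctx = calloc(1, sizeof(*ctx));

        if (ctx == NULL)
                goto out;

        ctx->a = malloc(64);
        if (ctx->a == NULL)
                goto out_free_ctx;

        ctx->b = malloc(64);
        if (ctx->b == NULL)
                goto out_free_a;

        return ctx;     /* success: everything stays acquired */

out_free_a:
        free(ctx->a);
out_free_ctx:
        free(ctx);
out:
        return NULL;
}

int main(void)
{
        struct ctx *ctx = ctx_alloc();

        if (ctx) {
                free(ctx->a);
                free(ctx->b);
                free(ctx);
        }
        return 0;
}
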
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 
         if (pte_young(*ptep))
                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-                                         &ptep->pte);
+                                         (unsigned long *) &ptep->pte);
 
         if (ret)
                 pte_update(vma->vm_mm, addr, ptep);
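
The cast added above is needed because the generic bitops operate on unsigned long words, while the pte field is not an unsigned long on every configuration (under PAE it is 64-bit). A standalone analogue of the operation, using C11 atomics in place of the kernel's test_and_clear_bit(); on x86, _PAGE_BIT_ACCESSED is bit 5:

#include <stdatomic.h>
#include <stdio.h>

#define _PAGE_BIT_ACCESSED 5    /* x86 accessed-bit position */

static int test_and_clear_bit_ul(int nr, _Atomic unsigned long *addr)
{
        unsigned long mask = 1UL << nr;

        /* Atomically clear the bit and report whether it had been set. */
        return (atomic_fetch_and(addr, ~mask) & mask) != 0;
}

int main(void)
{
        _Atomic unsigned long pte = (1UL << _PAGE_BIT_ACCESSED) | 0x1;

        printf("young was %d, entry now %#lx\n",
               test_and_clear_bit_ul(_PAGE_BIT_ACCESSED, &pte),
               (unsigned long)pte);
        return 0;
}
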
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 
         return young;
 }
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+        unsigned long address = __fix_to_virt(idx);
+
+        if (idx >= __end_of_fixed_addresses) {
+                BUG();
+                return;
+        }
+        set_pte_vaddr(address, pte);
+        fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
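
The new helpers install a pte at one of the compile-time-constant fixmap slots; __native_set_fixmap() maps the index to its virtual address with __fix_to_virt() and counts installs in fixmaps_set. The address arithmetic can be checked standalone: slots are one page each, growing downward from FIXADDR_TOP (hard-coded below to a typical 32-bit value purely for illustration):

#include <stdio.h>

#define PAGE_SHIFT      12
#define FIXADDR_TOP     0xfffff000UL    /* illustrative 32-bit value only */
#define __fix_to_virt(x)        (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

int main(void)
{
        int idx;

        /* Slot 0 sits at the top; each further index is one page lower. */
        for (idx = 0; idx < 4; idx++)
                printf("fixmap slot %d -> %#lx\n", idx, __fix_to_virt(idx));
        return 0;
}
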