Diffstat (limited to 'arch/x86/mm/pgtable.c')
-rw-r--r--  arch/x86/mm/pgtable.c | 190
1 file changed, 123 insertions(+), 67 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
+#include <asm/fixmap.h>
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
 static void pgd_ctor(void *p)
 {
 	pgd_t *pgd = p;
-	unsigned long flags;
-
-	/* Clear usermode parts of PGD */
-	memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-	spin_lock_irqsave(&pgd_lock, flags);
 
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
 	/* list required to sync kernel mapping updates */
 	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-
-	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
 
 #ifdef CONFIG_X86_PAE
 /*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+	/* Note: almost everything apart from _PAGE_PRESENT is
+	   reserved at the pmd (PDPT) level. */
+	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 */
+	if (mm == current->active_mm)
+		write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i])
+			free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		if (pmd == NULL)
+			failed = true;
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
  * Mop up any pmd pages which may still be attached to the pgd.
  * Normally they will be freed by munmap/exit_mmap, but any pmd we
  * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 	int i;
 
-	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pgd_t pgd = pgdp[i];
 
 		if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 	}
 }
 
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update.  Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
 	unsigned long addr;
 	int i;
 
 	pud = pud_offset(pgd, 0);
-	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-	     i++, pud++, addr += PUD_SIZE) {
-		pmd_t *pmd = pmd_alloc_one(mm, addr);
 
-		if (!pmd) {
-			pgd_mop_up_pmds(mm, pgd);
-			return 0;
-		}
+	for (addr = i = 0; i < PREALLOCATED_PMDS;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 
 		pud_populate(mm, pud, pmd);
 	}
-
-	return 1;
 }
 
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+	unsigned long flags;
 
-	/* Note: almost everything apart from _PAGE_PRESENT is
-	   reserved at the pmd (PDPT) level. */
-	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 
-	/*
-	 * According to Intel App note "TLBs, Paging-Structure Caches,
-	 * and Their Invalidation", April 2007, document 317080-001,
-	 * section 8.1: in PAE mode we explicitly have to flush the
-	 * TLB via cr3 if the top-level pgd is changed...
-	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-	return 1;
-}
+	if (pgd == NULL)
+		goto out;
 
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif	/* CONFIG_X86_PAE */
+	mm->pgd = pgd;
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (preallocate_pmds(pmds) != 0)
+		goto out_free_pgd;
 
-	/* so that alloc_pmd can use it */
-	mm->pgd = pgd;
-	if (pgd)
-		pgd_ctor(pgd);
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
 
-	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-		pgd_dtor(pgd);
-		free_page((unsigned long)pgd);
-		pgd = NULL;
-	}
+	/*
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+
+	pgd_ctor(pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
+
+out_free_pmds:
+	free_pmds(pmds);
+out_free_pgd:
+	free_page((unsigned long)pgd);
+out:
+	return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
+	paravirt_pgd_free(mm, pgd);
 	free_page((unsigned long)pgd);
 }
 
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 
 	if (pte_young(*ptep))
 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-					 &ptep->pte);
+					 (unsigned long *) &ptep->pte);
 
 	if (ret)
 		pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 	return young;
 }
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	set_pte_vaddr(address, pte);
+	fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
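
The comment added in the new pgd_alloc() above explains the ordering this patch introduces: the pmd pages are allocated before pgd_lock is taken (the allocations may sleep), and the pgd is only constructed and prepopulated under the lock, so anything walking pgd_list never observes a partially populated pgd. A rough userspace sketch of that same "allocate first, publish under the lock" pattern is shown below; it is an illustration only, not part of the patch, and all names in it (struct ctx, ctx_list, ctx_alloc, NPARTS) are invented for the example.

/*
 * Illustrative userspace analogy of the pgd_alloc() ordering above.
 * NPARTS plays the role of PREALLOCATED_PMDS, ctx_list the role of
 * pgd_list; none of these names exist in the kernel.
 */
#include <pthread.h>
#include <stdlib.h>

#define NPARTS 4

struct ctx {
	void *part[NPARTS];
	struct ctx *next;
};

static struct ctx *ctx_list;
static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

struct ctx *ctx_alloc(void)
{
	struct ctx *c = calloc(1, sizeof(*c));
	int i;

	if (c == NULL)
		return NULL;

	/* Do every allocation that can fail before taking the lock. */
	for (i = 0; i < NPARTS; i++) {
		c->part[i] = calloc(1, 4096);
		if (c->part[i] == NULL)
			goto out_free;
	}

	/*
	 * Publish atomically: walkers of ctx_list either see no ctx at
	 * all or a fully populated one, never a half-built object.
	 */
	pthread_mutex_lock(&ctx_lock);
	c->next = ctx_list;
	ctx_list = c;
	pthread_mutex_unlock(&ctx_lock);

	return c;

out_free:
	while (i--)
		free(c->part[i]);
	free(c);
	return NULL;
}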