Diffstat (limited to 'arch/x86/mm/pgtable.c')
-rw-r--r--  arch/x86/mm/pgtable.c  199
1 file changed, 128 insertions(+), 71 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
+#include <asm/fixmap.h>
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD \
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(void *p)
+static void pgd_ctor(pgd_t *pgd)
 {
-	pgd_t *pgd = p;
-	unsigned long flags;
-
-	/* Clear usermode parts of PGD */
-	memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-	spin_lock_irqsave(&pgd_lock, flags);
-
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
 	   references from swapper_pg_dir. */
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p)
 	/* list required to sync kernel mapping updates */
 	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-
-	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-static void pgd_dtor(void *pgd)
+static void pgd_dtor(pgd_t *pgd)
 {
 	unsigned long flags; /* can be called from interrupt context */
 
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd)
 
 #ifdef CONFIG_X86_PAE
 /*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+	/* Note: almost everything apart from _PAGE_PRESENT is
+	   reserved at the pmd (PDPT) level. */
+	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 */
+	if (mm == current->active_mm)
+		write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i])
+			free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		if (pmd == NULL)
+			failed = true;
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
  * Mop up any pmd pages which may still be attached to the pgd.
  * Normally they will be freed by munmap/exit_mmap, but any pmd we
  * preallocate which never got a corresponding vma will need to be
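The preallocate_pmds()/free_pmds() pair added above uses an all-or-nothing scheme: attempt every allocation, remember whether any of them failed, and only then release the whole batch. A minimal user-space sketch of the same pattern, with calloc()/free() standing in for get_zeroed_page()/free_page() and all names invented for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NPAGES 4			/* stand-in for PREALLOCATED_PMDS */

/* Release whatever was allocated; free(NULL) is a no-op, so holes are fine. */
static void free_pages_array(void *pages[])
{
	for (int i = 0; i < NPAGES; i++)
		free(pages[i]);
}

/* Try every slot first, then roll the whole batch back if anything failed. */
static int preallocate_pages(void *pages[])
{
	bool failed = false;

	for (int i = 0; i < NPAGES; i++) {
		pages[i] = calloc(1, 4096);	/* zeroed, like get_zeroed_page() */
		if (pages[i] == NULL)
			failed = true;
	}

	if (failed) {
		free_pages_array(pages);
		return -1;			/* the kernel code returns -ENOMEM */
	}
	return 0;
}

int main(void)
{
	void *pages[NPAGES];

	if (preallocate_pages(pages) == 0) {
		puts("all pages preallocated");
		free_pages_array(pages);
	}
	return 0;
}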
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 	int i;
 
-	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pgd_t pgd = pgdp[i];
 
 		if (pgd_val(pgd) != 0) {
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 	}
 }
 
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
 	unsigned long addr;
 	int i;
 
+	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+		return;
+
 	pud = pud_offset(pgd, 0);
-	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-	     i++, pud++, addr += PUD_SIZE) {
-		pmd_t *pmd = pmd_alloc_one(mm, addr);
 
-		if (!pmd) {
-			pgd_mop_up_pmds(mm, pgd);
-			return 0;
-		}
+	for (addr = i = 0; i < PREALLOCATED_PMDS;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 
 		pud_populate(mm, pud, pmd);
 	}
-
-	return 1;
 }
 
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+	unsigned long flags;
 
-	/* Note: almost everything apart from _PAGE_PRESENT is
-	   reserved at the pmd (PDPT) level. */
-	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 
-	/*
-	 * According to Intel App note "TLBs, Paging-Structure Caches,
-	 * and Their Invalidation", April 2007, document 317080-001,
-	 * section 8.1: in PAE mode we explicitly have to flush the
-	 * TLB via cr3 if the top-level pgd is changed...
-	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-	return 1;
-}
+	if (pgd == NULL)
+		goto out;
 
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif	/* CONFIG_X86_PAE */
+	mm->pgd = pgd;
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (preallocate_pmds(pmds) != 0)
+		goto out_free_pgd;
 
-	/* so that alloc_pmd can use it */
-	mm->pgd = pgd;
-	if (pgd)
-		pgd_ctor(pgd);
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
 
-	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-		pgd_dtor(pgd);
-		free_page((unsigned long)pgd);
-		pgd = NULL;
-	}
+	/*
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+
+	pgd_ctor(pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
+
+out_free_pmds:
+	free_pmds(pmds);
+out_free_pgd:
+	free_page((unsigned long)pgd);
+out:
+	return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
+	paravirt_pgd_free(mm, pgd);
 	free_page((unsigned long)pgd);
 }
 
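The rewritten pgd_alloc() above does everything that can fail (the pgd page, the pmd batch, paravirt setup) before taking pgd_lock, then runs pgd_ctor() and pgd_prepopulate_pmd() under the lock so code walking pgd_list never observes a partially populated pgd; the out_free_pmds/out_free_pgd/out labels unwind in reverse order of setup. A rough user-space analogy of that ordering, assuming a pthread mutex and a global registry list in place of pgd_lock and pgd_list (none of these names are kernel APIs):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4			/* stand-in for PREALLOCATED_PMDS */

struct table {
	void *slot[NSLOTS];
	struct table *next;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct table *registry;		/* readers walk this under registry_lock */

static struct table *table_alloc(void)
{
	struct table *t = calloc(1, sizeof(*t));
	void *slots[NSLOTS];

	if (t == NULL)
		goto out;

	/* Do everything that can fail before publishing anything. */
	for (int i = 0; i < NSLOTS; i++) {
		slots[i] = calloc(1, 64);
		if (slots[i] == NULL) {
			while (i--)		/* unwind the partial batch */
				free(slots[i]);
			goto out_free_table;
		}
	}

	/* Publish under the lock: walkers never see a half-filled table. */
	pthread_mutex_lock(&registry_lock);
	for (int i = 0; i < NSLOTS; i++)
		t->slot[i] = slots[i];
	t->next = registry;
	registry = t;
	pthread_mutex_unlock(&registry_lock);

	return t;

out_free_table:
	free(t);
out:
	return NULL;
}

int main(void)
{
	printf("table_alloc() -> %p\n", (void *)table_alloc());
	return 0;
}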
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 
 	if (pte_young(*ptep))
 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-					 &ptep->pte);
+					 (unsigned long *) &ptep->pte);
 
 	if (ret)
 		pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 	return young;
 }
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	set_pte_vaddr(address, pte);
+	fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
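__native_set_fixmap() added above resolves a fixmap slot index to its fixed virtual address with __fix_to_virt() and installs the given pte there; native_set_fixmap() is a thin wrapper that first builds the pte from a physical address with pfn_pte(). A toy sketch of the slot-to-address arithmetic, assuming the usual layout where slots grow downward one page at a time from a fixed top address (the constants below are placeholders, not the real fixmap.h values):

#include <stdio.h>

/*
 * Illustration only: fixmap slots are laid out downward from a fixed top
 * virtual address, one page per slot, so index -> address is pure
 * arithmetic. PAGE_SHIFT and FIXTOP below are placeholder values, not the
 * real constants from <asm/fixmap.h>.
 */
#define PAGE_SHIFT	12
#define FIXTOP		0xfffff000UL		/* stand-in for FIXADDR_TOP */

static unsigned long fix_to_virt(unsigned int idx)
{
	return FIXTOP - ((unsigned long)idx << PAGE_SHIFT);
}

int main(void)
{
	for (unsigned int idx = 0; idx < 4; idx++)
		printf("fixmap slot %u -> %#lx\n", idx, fix_to_virt(idx));
	return 0;
}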