path: root/arch/x86/mm/pgtable.c
author		Jeremy Fitzhardinge <jeremy@goop.org>	2008-06-25 00:19:13 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-08 07:11:02 -0400
commit		d8d5900ef8afc562088f8470feeaf17c4747790f (patch)
tree		3aa3090cd4d9fd41ced2cae3dc5c86c7ac8020a1 /arch/x86/mm/pgtable.c
parent		eba0045ff87bab465d3c80c289f3bf709c1800f5 (diff)
x86: preallocate and prepopulate separately
Jan Beulich points out that vmalloc_sync_all() assumes that the kernel's pmd is always expected to be present in the pgd. The current pgd construction code will add the pgd to the pgd_list before its pmds have been pre-populated, thereby making it visible to vmalloc_sync_all().

However, because pgd_prepopulate_pmd also does the allocation, it may block and so cannot be done under a spinlock.

The solution is to preallocate the pmds out of the spinlock, then populate them while holding the pgd_list lock.

This patch also pulls the pmd preallocation and mop-up functions out to be common, assuming that the compiler will generate no code for them when PREALLOCATED_PMDS is 0. Also, there's no need for pgd_ctor to clear the pgd again, since it's allocated as a zeroed page.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
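To see the ordering change in isolation, here is a minimal, standalone userspace analogue (plain C with pthreads, not kernel code) of the flow that pgd_alloc() switches to in this patch: everything that may block is allocated before the lock is taken, and the new object is populated and linked onto the shared list only while the lock is held, so list walkers never observe a partially populated entry. The names table_alloc(), table_list, table_lock and PREALLOCATED_PARTS are illustrative stand-ins, not kernel symbols.

/*
 * Standalone analogue (userspace C, not kernel code): preallocate anything
 * that may block *before* taking the lock, then populate and publish the
 * object under the lock so list walkers never see it half-initialized.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define PREALLOCATED_PARTS 4			/* stands in for PREALLOCATED_PMDS */

struct table {
	void *parts[PREALLOCATED_PARTS];	/* stands in for the pmd pages */
	struct table *next;
};

static struct table *table_list;		/* analogue of pgd_list */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;	/* analogue of pgd_lock */

static struct table *table_alloc(void)
{
	void *parts[PREALLOCATED_PARTS];
	struct table *t;
	int i;

	t = calloc(1, sizeof(*t));
	if (!t)
		return NULL;

	/* May block: done outside the lock, like preallocate_pmds(). */
	for (i = 0; i < PREALLOCATED_PARTS; i++) {
		parts[i] = calloc(1, 64);
		if (!parts[i])
			goto out_free;
	}

	/*
	 * Populate and make visible atomically with respect to anyone
	 * walking table_list, like pgd_ctor()/pgd_prepopulate_pmd()
	 * under pgd_lock.
	 */
	pthread_mutex_lock(&table_lock);
	memcpy(t->parts, parts, sizeof(parts));
	t->next = table_list;
	table_list = t;
	pthread_mutex_unlock(&table_lock);

	return t;

out_free:
	while (i--)
		free(parts[i]);
	free(t);
	return NULL;
}

int main(void)
{
	return table_alloc() ? 0 : 1;
}

The same reasoning drives the kernel change below: pgd_lock is a spinlock taken with interrupts disabled, so the possibly-blocking pmd allocations must happen before it is acquired, while pgd_ctor() and pgd_prepopulate_pmd() run under it so vmalloc_sync_all() never sees a pgd on pgd_list with missing kernel pmds.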
Diffstat (limited to 'arch/x86/mm/pgtable.c')
-rw-r--r--	arch/x86/mm/pgtable.c	169
1 file changed, 101 insertions(+), 68 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 418c4432fb39..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -66,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
 static void pgd_ctor(void *p)
 {
 	pgd_t *pgd = p;
-	unsigned long flags;
-
-	/* Clear usermode parts of PGD */
-	memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-	spin_lock_irqsave(&pgd_lock, flags);
 
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -91,8 +85,6 @@ static void pgd_ctor(void *p)
 	/* list required to sync kernel mapping updates */
 	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-
-	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 static void pgd_dtor(void *pgd)
@@ -120,6 +112,72 @@ static void pgd_dtor(void *pgd)
 
 #ifdef CONFIG_X86_PAE
 /*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+	/* Note: almost everything apart from _PAGE_PRESENT is
+	   reserved at the pmd (PDPT) level. */
+	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 */
+	if (mm == current->active_mm)
+		write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i])
+			free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		if (pmd == NULL)
+			failed = true;
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
  * Mop up any pmd pages which may still be attached to the pgd.
  * Normally they will be freed by munmap/exit_mmap, but any pmd we
  * preallocate which never got a corresponding vma will need to be
@@ -129,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 	int i;
 
-	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pgd_t pgd = pgdp[i];
 
 		if (pgd_val(pgd) != 0) {
@@ -143,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 	}
 }
 
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
 	unsigned long addr;
 	int i;
 
 	pud = pud_offset(pgd, 0);
-	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-	     i++, pud++, addr += PUD_SIZE) {
-		pmd_t *pmd = pmd_alloc_one(mm, addr);
 
-		if (!pmd) {
-			pgd_mop_up_pmds(mm, pgd);
-			return 0;
-		}
+	for (addr = i = 0; i < PREALLOCATED_PMDS;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -176,57 +219,47 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 
 		pud_populate(mm, pud, pmd);
 	}
-
-	return 1;
 }
 
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+	unsigned long flags;
 
-	/* Note: almost everything apart from _PAGE_PRESENT is
-	   reserved at the pmd (PDPT) level. */
-	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (pgd == NULL)
+		goto out;
+
+	mm->pgd = pgd;
+
+	if (preallocate_pmds(pmds) != 0)
+		goto out_free_pgd;
+
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
 
 	/*
-	 * According to Intel App note "TLBs, Paging-Structure Caches,
-	 * and Their Invalidation", April 2007, document 317080-001,
-	 * section 8.1: in PAE mode we explicitly have to flush the
-	 * TLB via cr3 if the top-level pgd is changed...
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
 	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-	return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif	/* CONFIG_X86_PAE */
+	spin_lock_irqsave(&pgd_lock, flags);
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	pgd_ctor(pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	/* so that alloc_pmd can use it */
-	mm->pgd = pgd;
-	if (pgd) {
-		pgd_ctor(pgd);
-
-		if (paravirt_pgd_alloc(mm) != 0 ||
-		    !pgd_prepopulate_pmd(mm, pgd)) {
-			pgd_dtor(pgd);
-			free_page((unsigned long)pgd);
-			pgd = NULL;
-		}
-	}
+	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
+
+out_free_pmds:
+	free_pmds(pmds);
+out_free_pgd:
+	free_page((unsigned long)pgd);
+out:
+	return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)