author     Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>  2010-08-27 16:28:48 -0400
committer  Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>  2010-10-22 15:57:24 -0400
commit     58e05027b530ff081ecea68e38de8d59db8f87e0
tree       0e9a6649898ea44ee168b6c111c92c8668661e15
parent     bbbf61eff92c7c236f57ee1953ad84055443717e
xen: convert p2m to a 3 level tree
Make the p2m structure a 3 level tree which covers the full possible
physical space.

The p2m structure contains mappings from the domain's pfns to
system-wide mfns.  The structure has 3 levels and two roots.  The
first root is for the domain's own use, and is linked with virtual
addresses.  The second is all mfn references, and is used by Xen on
save/restore to allow it to update the p2m mapping for the domain.

At boot, the domain builder provides a simple flat p2m array for all
the initially present pages.  We construct the two levels above that
using the early brk allocator (extend_brk()).  After early boot time,
set_phys_to_machine() will allocate any missing levels using the
normal kernel allocator (at GFP_KERNEL, so it must be called in a
normal blocking context).

Because the brk API requires us to pre-reserve the maximum amount of
memory we could allocate (RESERVE_BRK()), there is still a
CONFIG_XEN_MAX_DOMAIN_MEMORY config option, but its only negative
side-effect is to increase the kernel's apparent bss size.  However,
since all unused brk memory is returned to the heap, there's no real
downside to making it large.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
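[Editor's note: to make the new layout concrete, here is a minimal standalone sketch, not part of the patch, of the index arithmetic the tree uses. It assumes a 64-bit build (PAGE_SIZE of 4096 and 8-byte entries, so 512 entries per level); the constants mirror the P2M_PER_PAGE and P2M_MID_PER_PAGE macros introduced in mmu.c below, and main()/printf are purely illustrative.]

/* Standalone sketch of the 3-level p2m index split, assuming a
 * 64-bit build: each leaf page holds 512 mfns, each mid/top page
 * holds 512 pointers. */
#include <stdio.h>

#define P2M_PER_PAGE     512UL  /* PAGE_SIZE / sizeof(unsigned long) */
#define P2M_MID_PER_PAGE 512UL  /* PAGE_SIZE / sizeof(unsigned long *) */

int main(void)
{
        unsigned long pfn = 0x12345678UL;

        unsigned topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
        unsigned mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
        unsigned idx    = pfn % P2M_PER_PAGE;

        /* the real lookup is p2m_top[topidx][mididx][idx] */
        printf("pfn %#lx -> top %u, mid %u, leaf %u\n",
               pfn, topidx, mididx, idx);
        return 0;
}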
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/Kconfig |  11
-rw-r--r--  arch/x86/xen/mmu.c   | 318
2 files changed, 246 insertions, 83 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 68128a1b401a..90a7f5ad6916 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -19,15 +19,12 @@ config XEN_PVHVM
        depends on X86_LOCAL_APIC
 
 config XEN_MAX_DOMAIN_MEMORY
-       int "Maximum allowed size of a domain in gigabytes"
-       default 8 if X86_32
-       default 32 if X86_64
+       int
+       default 128
        depends on XEN
        help
-         The pseudo-physical to machine address array is sized
-         according to the maximum possible memory size of a Xen
-         domain. This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
+         This only affects the sizing of some bss arrays, the unused
+         portions of which are freed.
 
 config XEN_SAVE_RESTORE
        bool
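[Editor's note: a quick sanity check of that help text, again a sketch rather than part of the patch. With 4 KiB pages and 8-byte entries, one mid-level page covers 512 * 512 pfns, i.e. 1 GiB, so the RESERVE_BRK(p2m_mid*) reservations in the mmu.c hunk below cost roughly one page of apparent bss per gigabyte of CONFIG_XEN_MAX_DOMAIN_MEMORY; the variable names here are illustrative.]

/* Sketch of the brk reservation arithmetic for a 64-bit build
 * (4096-byte pages, 8-byte entries) and the new default of 128. */
#include <stdio.h>

int main(void)
{
        unsigned long max_domain_mem_gib = 128;  /* CONFIG_XEN_MAX_DOMAIN_MEMORY */
        unsigned long max_domain_pages = max_domain_mem_gib << (30 - 12);
        unsigned long p2m_per_page = 4096 / 8;   /* 512 */
        unsigned long mid_per_page = 4096 / 8;   /* 512 */

        /* matches RESERVE_BRK(p2m_mid, ...) in the mmu.c hunk below */
        unsigned long mid_pages = max_domain_pages / (p2m_per_page * mid_per_page);

        printf("%lu GiB -> %lu mid pages = %lu KiB of brk per tree\n",
               max_domain_mem_gib, mid_pages, mid_pages * 4096 / 1024);
        return 0;
}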
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 00969099b057..d4c7265cf0a0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
  */
 #define USER_LIMIT     ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 
-static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES;
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ *                            Xen
+ *                             |
+ *      p2m_top          p2m_top_mfn
+ *        /  \               /    \
+ * p2m_mid p2m_mid   p2m_mid_mfn p2m_mid_mfn
+ *    / \    / \          /          /
+ *  p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
 
-#define P2M_ENTRIES_PER_PAGE   (PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES(pages)     ((pages) / P2M_ENTRIES_PER_PAGE)
-#define MAX_TOP_ENTRIES        TOP_ENTRIES(MAX_DOMAIN_PAGES)
+static unsigned long max_p2m_pfn __read_mostly;
 
-/* Placeholder for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE);
+#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
 
- /* Array of pointers to pages containing p2m entries */
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES);
+#define MAX_P2M_PFN            (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
 
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES);
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
 
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list,
-                         (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE));
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
-       BUG_ON(pfn >= max_p2m_pfn);
-       return pfn / P2M_ENTRIES_PER_PAGE;
+       BUG_ON(pfn >= MAX_P2M_PFN);
+       return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+       return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
 }
 
 static inline unsigned p2m_index(unsigned long pfn)
 {
-       return pfn % P2M_ENTRIES_PER_PAGE;
+       return pfn % P2M_PER_PAGE;
 }
 
-/* Build the parallel p2m_top_mfn structures */
+static void p2m_top_init(unsigned long ***top)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+               top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+               top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_MID_PER_PAGE; i++)
+               mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_MID_PER_PAGE; i++)
+               mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+       unsigned i;
+
+       for (i = 0; i < P2M_MID_PER_PAGE; i++)
+               p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ *   to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ *   tree should already be completely allocated.
+ */
 void xen_build_mfn_list_list(void)
 {
-       unsigned pfn, idx;
+       unsigned pfn, i;
 
-       for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
-               unsigned topidx = p2m_top_index(pfn);
+       /* Pre-initialize p2m_top_mfn to be completely missing */
+       if (p2m_top_mfn == NULL) {
+               p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+               p2m_mid_mfn_init(p2m_mid_missing_mfn);
 
-               p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+               p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+               p2m_top_mfn_init(p2m_top_mfn);
        }
 
-       for (idx = 0;
-            idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE;
-            idx++) {
-               unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-               p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+       for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) {
+               unsigned topidx = p2m_top_index(pfn);
+               unsigned mididx = p2m_mid_index(pfn);
+               unsigned long **mid;
+               unsigned long mid_mfn;
+               unsigned long *mid_mfn_p;
+
+               mid = p2m_top[topidx];
+
+               /* Don't bother allocating any mfn mid levels if
+                  they're just missing */
+               if (mid[mididx] == p2m_missing)
+                       continue;
+
+               mid_mfn = p2m_top_mfn[topidx];
+               mid_mfn_p = mfn_to_virt(mid_mfn);
+
+               if (mid_mfn_p == p2m_mid_missing_mfn) {
+                       /*
+                        * XXX boot-time only!  We should never find
+                        * missing parts of the mfn tree after
+                        * runtime.  extend_brk() will BUG if we call
+                        * it too late.
+                        */
+                       mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+                       p2m_mid_mfn_init(mid_mfn_p);
+
+                       mid_mfn = virt_to_mfn(mid_mfn_p);
+
+                       p2m_top_mfn[topidx] = mid_mfn;
+               }
+
+               mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
        }
 }
 
@@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void)
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-               virt_to_mfn(p2m_top_mfn_list);
+               virt_to_mfn(p2m_top_mfn);
        HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn;
 }
 
@@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void)
        unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
        unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
        unsigned pfn;
-       unsigned i;
 
        max_p2m_pfn = max_pfn;
 
-       p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE,
-                                PAGE_SIZE);
-       for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-               p2m_missing[i] = ~0UL;
+       p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_init(p2m_missing);
 
-       p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn),
-                            PAGE_SIZE);
-       for (i = 0; i < TOP_ENTRIES(max_pfn); i++)
-               p2m_top[i] = p2m_missing;
+       p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_mid_init(p2m_mid_missing);
 
-       p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn),
-                                PAGE_SIZE);
-       p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) *
-                                     (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE),
-                                     PAGE_SIZE);
+       p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       p2m_top_init(p2m_top);
 
-       for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+       /*
+        * The domain builder gives us a pre-constructed p2m array in
+        * mfn_list for all the pages initially given to us, so we just
+        * need to graft that into our tree structure.
+        */
+       for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
                unsigned topidx = p2m_top_index(pfn);
+               unsigned mididx = p2m_mid_index(pfn);
+
+               if (p2m_top[topidx] == p2m_mid_missing) {
+                       unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+                       p2m_mid_init(mid);
 
-               p2m_top[topidx] = &mfn_list[pfn];
+                       p2m_top[topidx] = mid;
+               }
+
+               p2m_top[topidx][mididx] = &mfn_list[pfn];
        }
 
+       /* Allocate and initialize top and mid mfn levels */
        xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
 {
-       unsigned topidx, idx;
+       unsigned topidx, mididx, idx;
 
-       if (unlikely(pfn >= max_p2m_pfn))
+       if (unlikely(pfn >= MAX_P2M_PFN))
                return INVALID_P2M_ENTRY;
 
        topidx = p2m_top_index(pfn);
+       mididx = p2m_mid_index(pfn);
        idx = p2m_index(pfn);
-       return p2m_top[topidx][idx];
+
+       return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-/* install a new p2m_top page */
-static bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
+static void *alloc_p2m_page(void)
 {
-       unsigned topidx = p2m_top_index(pfn);
-       unsigned long **pfnp, *mfnp;
-       unsigned i;
+       return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
 
-       pfnp = &p2m_top[topidx];
-       mfnp = &p2m_top_mfn[topidx];
+static void free_p2m_page(void *p)
+{
+       free_page((unsigned long)p);
+}
 
-       for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-               p[i] = INVALID_P2M_ENTRY;
+/*
+ * Fully allocate the p2m structure for a given pfn.  We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync.  We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+       unsigned topidx, mididx;
+       unsigned long ***top_p, **mid;
+       unsigned long *top_mfn_p, *mid_mfn;
 
-       if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-               *mfnp = virt_to_mfn(p);
-               return true;
+       topidx = p2m_top_index(pfn);
+       mididx = p2m_mid_index(pfn);
+
+       top_p = &p2m_top[topidx];
+       mid = *top_p;
+
+       if (mid == p2m_mid_missing) {
+               /* Mid level is missing, allocate a new one */
+               mid = alloc_p2m_page();
+               if (!mid)
+                       return false;
+
+               p2m_mid_init(mid);
+
+               if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+                       free_p2m_page(mid);
        }
 
-       return false;
-}
+       top_mfn_p = &p2m_top_mfn[topidx];
+       mid_mfn = mfn_to_virt(*top_mfn_p);
 
-static void alloc_p2m(unsigned long pfn)
-{
-       unsigned long *p;
+       if (mid_mfn == p2m_mid_missing_mfn) {
+               /* Separately check the mid mfn level */
+               unsigned long missing_mfn;
+               unsigned long mid_mfn_mfn;
+
+               mid_mfn = alloc_p2m_page();
+               if (!mid_mfn)
+                       return false;
+
+               p2m_mid_mfn_init(mid_mfn);
+
+               missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+               mid_mfn_mfn = virt_to_mfn(mid_mfn);
+               if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+                       free_p2m_page(mid_mfn);
+       }
 
-       p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-       BUG_ON(p == NULL);
+       if (p2m_top[topidx][mididx] == p2m_missing) {
+               /* p2m leaf page is missing */
+               unsigned long *p2m;
 
-       if (!install_p2mtop_page(pfn, p))
-               free_page((unsigned long)p);
+               p2m = alloc_p2m_page();
+               if (!p2m)
+                       return false;
+
+               p2m_init(p2m);
+
+               if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+                       free_p2m_page(p2m);
+               else
+                       mid_mfn[mididx] = virt_to_mfn(p2m);
+       }
+
+       return true;
 }
 
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-       unsigned topidx, idx;
+       unsigned topidx, mididx, idx;
 
-       if (unlikely(pfn >= max_p2m_pfn)) {
+       if (unlikely(pfn >= MAX_P2M_PFN)) {
                BUG_ON(mfn != INVALID_P2M_ENTRY);
                return true;
        }
 
        topidx = p2m_top_index(pfn);
-       if (p2m_top[topidx] == p2m_missing) {
-               if (mfn == INVALID_P2M_ENTRY)
-                       return true;
-               return false;
-       }
-
+       mididx = p2m_mid_index(pfn);
        idx = p2m_index(pfn);
-       p2m_top[topidx][idx] = mfn;
+
+       if (p2m_top[topidx][mididx] == p2m_missing)
+               return mfn == INVALID_P2M_ENTRY;
+
+       p2m_top[topidx][mididx][idx] = mfn;
 
        return true;
 }
@@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
        }
 
        if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
-               alloc_p2m(pfn);
+               WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn);
 
                if (!__set_phys_to_machine(pfn, mfn))
                        BUG();
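[Editor's note: alloc_p2m() above relies on a lock-free install idiom worth calling out: allocate a page, initialize it, publish it with cmpxchg, and whoever loses the race frees their copy and uses the winner's. Below is a hypothetical userspace sketch of the same pattern, with GCC's __sync_val_compare_and_swap standing in for the kernel's cmpxchg; the names are illustrative, not from the patch. Losing the race costs only a transient allocation, and no lock is ever taken on the lookup path.]

/* Sketch of the allocate-then-cmpxchg-install idiom used by
 * alloc_p2m(); userspace stand-in, not kernel code. */
#include <stdlib.h>

static void *slot;                      /* shared pointer, NULL = missing */

static void *get_or_install(size_t size)
{
        void *cur = slot;

        if (cur == NULL) {
                void *fresh = calloc(1, size);
                if (fresh == NULL)
                        return NULL;

                /* Publish; if another thread won the race, discard ours. */
                cur = __sync_val_compare_and_swap(&slot, NULL, fresh);
                if (cur == NULL)
                        return fresh;   /* we installed it */
                free(fresh);            /* loser: use the winner's page */
        }
        return cur;
}

int main(void)
{
        return get_or_install(4096) ? 0 : 1;
}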