Diffstat (limited to 'arch/x86/xen')
 -rw-r--r--  arch/x86/xen/Kconfig |  11
 -rw-r--r--  arch/x86/xen/mmu.c   | 318
 2 files changed, 246 insertions(+), 83 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 68128a1b401a..90a7f5ad6916 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -19,15 +19,12 @@ config XEN_PVHVM
 	depends on X86_LOCAL_APIC
 
 config XEN_MAX_DOMAIN_MEMORY
-	int "Maximum allowed size of a domain in gigabytes"
-	default 8 if X86_32
-	default 32 if X86_64
+	int
+	default 128
 	depends on XEN
 	help
-	  The pseudo-physical to machine address array is sized
-	  according to the maximum possible memory size of a Xen
-	  domain. This array uses 1 page per gigabyte, so there's no
-	  need to be too stingy here.
+	  This only affects the sizing of some bss arrays, the unused
+	  portions of which are freed.
 
 config XEN_SAVE_RESTORE
 	bool
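
A sanity check on the new fixed default: the arrays this option sizes are reserved via RESERVE_BRK in mmu.c below, and unused portions are freed, so a generous 128 GiB ceiling is cheap. A minimal sketch of the arithmetic, assuming x86-64 with 4 KiB pages; MAX_DOMAIN_PAGES is derived from CONFIG_XEN_MAX_DOMAIN_MEMORY (in arch/x86/xen/xen-ops.h), and the constants here are userspace stand-ins:

    #include <stdio.h>

    #define PAGE_SIZE        4096UL
    #define GB               (1024UL * 1024 * 1024)
    #define MAX_DOMAIN_PAGES (128UL * GB / PAGE_SIZE)   /* the new default */
    #define P2M_PER_PAGE     (PAGE_SIZE / sizeof(unsigned long))
    #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))

    int main(void)
    {
            /* Size of the RESERVE_BRK(p2m_mid, ...) reservation in mmu.c below. */
            unsigned long pages = MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE);

            printf("%lu pages (%lu KiB)\n", pages, pages * PAGE_SIZE / 1024);
            return 0;       /* prints: 128 pages (512 KiB) */
    }
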
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 00969099b057..d4c7265cf0a0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
  */
 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 
-static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES;
-
-#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES(pages)	((pages) / P2M_ENTRIES_PER_PAGE)
-#define MAX_TOP_ENTRIES		TOP_ENTRIES(MAX_DOMAIN_PAGES)
-
-/* Placeholder for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE);
-
- /* Array of pointers to pages containing p2m entries */
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES);
-
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES);
-
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list,
-			 (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE));
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ *                               Xen
+ *                                |
+ *      p2m_top              p2m_top_mfn
+ *        /   \                  /    \
+ * p2m_mid p2m_mid      p2m_mid_mfn p2m_mid_mfn
+ *    / \      / \          /           /
+ *  p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+static unsigned long max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
-	BUG_ON(pfn >= max_p2m_pfn);
-	return pfn / P2M_ENTRIES_PER_PAGE;
+	BUG_ON(pfn >= MAX_P2M_PFN);
+	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
 }
 
 static inline unsigned p2m_index(unsigned long pfn)
 {
-	return pfn % P2M_ENTRIES_PER_PAGE;
+	return pfn % P2M_PER_PAGE;
 }
 
-/* Build the parallel p2m_top_mfn structures */
+static void p2m_top_init(unsigned long ***top)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+		top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+	unsigned i;
+
+	for (i = 0; i < P2M_MID_PER_PAGE; i++)
+		p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ *   to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ *   tree should already be completely allocated.
+ */
 void xen_build_mfn_list_list(void)
 {
-	unsigned pfn, idx;
+	unsigned pfn, i;
 
-	for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
-		unsigned topidx = p2m_top_index(pfn);
+	/* Pre-initialize p2m_top_mfn to be completely missing */
+	if (p2m_top_mfn == NULL) {
+		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_mid_mfn_init(p2m_mid_missing_mfn);
 
-		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_top_mfn_init(p2m_top_mfn);
 	}
 
-	for (idx = 0;
-	     idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE;
-	     idx++) {
-		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+	for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned long **mid;
+		unsigned long mid_mfn;
+		unsigned long *mid_mfn_p;
+
+		mid = p2m_top[topidx];
+
+		/* Don't bother allocating any mfn mid levels if
+		   they're just missing */
+		if (mid[mididx] == p2m_missing)
+			continue;
+
+		mid_mfn = p2m_top_mfn[topidx];
+		mid_mfn_p = mfn_to_virt(mid_mfn);
+
+		if (mid_mfn_p == p2m_mid_missing_mfn) {
+			/*
+			 * XXX boot-time only!  We should never find
+			 * missing parts of the mfn tree after
+			 * runtime.  extend_brk() will BUG if we call
+			 * it too late.
+			 */
+			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+			p2m_mid_mfn_init(mid_mfn_p);
+
+			mid_mfn = virt_to_mfn(mid_mfn_p);
+
+			p2m_top_mfn[topidx] = mid_mfn;
+		}
+
+		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
 	}
 }
 
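
The payoff of the placeholder pages declared above (p2m_missing, p2m_mid_missing) is that a lookup never branches on NULL: holes in the address space resolve to INVALID_P2M_ENTRY through shared pages, and every lookup is exactly three dereferences. A minimal userspace model of that design, using a hypothetical 4-entry level size in place of the kernel's 512 (64-bit) or 1024 (32-bit):

    #include <stdio.h>

    #define LEVEL   4               /* toy size; the kernel uses 512 or 1024 */
    #define INVALID (~0UL)          /* INVALID_P2M_ENTRY */

    /* Shared placeholders, as in the patch: a leaf full of INVALIDs and a
     * mid-level page whose slots all point at that leaf. */
    static unsigned long p2m_missing[LEVEL] = {
            INVALID, INVALID, INVALID, INVALID
    };
    static unsigned long *p2m_mid_missing[LEVEL] = {
            p2m_missing, p2m_missing, p2m_missing, p2m_missing
    };
    static unsigned long **p2m_top[LEVEL] = {
            p2m_mid_missing, p2m_mid_missing, p2m_mid_missing, p2m_mid_missing
    };

    /* Three dereferences; holes fall through the placeholders to INVALID. */
    static unsigned long lookup(unsigned long pfn)
    {
            return p2m_top[pfn / (LEVEL * LEVEL)][(pfn / LEVEL) % LEVEL][pfn % LEVEL];
    }

    int main(void)
    {
            printf("pfn 5 -> %#lx\n", lookup(5));   /* INVALID: a hole */
            return 0;
    }
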
@@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void)
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-		virt_to_mfn(p2m_top_mfn_list);
+		virt_to_mfn(p2m_top_mfn);
 	HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn;
 }
 
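
Note the handoff here: what goes into pfn_to_mfn_frame_list_list is a machine frame number, not a kernel virtual address. Xen and the save/restore tools walk the tree by mfn, which is why the patch maintains a parallel *_mfn mirror of every level (kept in sync by xen_build_mfn_list_list() above) alongside the kernel-virtual p2m_top tree.
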
@@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 	unsigned pfn;
-	unsigned i;
 
 	max_p2m_pfn = max_pfn;
 
-	p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE,
-				 PAGE_SIZE);
-	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-		p2m_missing[i] = ~0UL;
+	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_init(p2m_missing);
 
-	p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn),
-			     PAGE_SIZE);
-	for (i = 0; i < TOP_ENTRIES(max_pfn); i++)
-		p2m_top[i] = p2m_missing;
+	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_mid_init(p2m_mid_missing);
 
-	p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn),
-				 PAGE_SIZE);
-	p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) *
-				      (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE),
-				      PAGE_SIZE);
+	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_top_init(p2m_top);
 
-	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+	/*
+	 * The domain builder gives us a pre-constructed p2m array in
+	 * mfn_list for all the pages initially given to us, so we just
+	 * need to graft that into our tree structure.
+	 */
+	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
 		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+			p2m_mid_init(mid);
 
-		p2m_top[topidx] = &mfn_list[pfn];
+			p2m_top[topidx] = mid;
+		}
+
+		p2m_top[topidx][mididx] = &mfn_list[pfn];
 	}
 
+	/* Allocate and initialize top and mid mfn levels */
 	xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
 {
-	unsigned topidx, idx;
+	unsigned topidx, mididx, idx;
 
-	if (unlikely(pfn >= max_p2m_pfn))
+	if (unlikely(pfn >= MAX_P2M_PFN))
 		return INVALID_P2M_ENTRY;
 
 	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
-	return p2m_top[topidx][idx];
+
+	return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-/* install a new p2m_top page */
-static bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
+static void *alloc_p2m_page(void)
 {
-	unsigned topidx = p2m_top_index(pfn);
-	unsigned long **pfnp, *mfnp;
-	unsigned i;
+	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
 
-	pfnp = &p2m_top[topidx];
-	mfnp = &p2m_top_mfn[topidx];
+static void free_p2m_page(void *p)
+{
+	free_page((unsigned long)p);
+}
 
-	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-		p[i] = INVALID_P2M_ENTRY;
+/*
+ * Fully allocate the p2m structure for a given pfn.  We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync.  We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx, mididx;
+	unsigned long ***top_p, **mid;
+	unsigned long *top_mfn_p, *mid_mfn;
 
-	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-		*mfnp = virt_to_mfn(p);
-		return true;
+	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
+
+	top_p = &p2m_top[topidx];
+	mid = *top_p;
+
+	if (mid == p2m_mid_missing) {
+		/* Mid level is missing, allocate a new one */
+		mid = alloc_p2m_page();
+		if (!mid)
+			return false;
+
+		p2m_mid_init(mid);
+
+		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+			free_p2m_page(mid);
 	}
 
-	return false;
-}
+	top_mfn_p = &p2m_top_mfn[topidx];
+	mid_mfn = mfn_to_virt(*top_mfn_p);
 
-static void alloc_p2m(unsigned long pfn)
-{
-	unsigned long *p;
+	if (mid_mfn == p2m_mid_missing_mfn) {
+		/* Separately check the mid mfn level */
+		unsigned long missing_mfn;
+		unsigned long mid_mfn_mfn;
+
+		mid_mfn = alloc_p2m_page();
+		if (!mid_mfn)
+			return false;
+
+		p2m_mid_mfn_init(mid_mfn);
+
+		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+		mid_mfn_mfn = virt_to_mfn(mid_mfn);
+		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+			free_p2m_page(mid_mfn);
+	}
 
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
+	if (p2m_top[topidx][mididx] == p2m_missing) {
+		/* p2m leaf page is missing */
+		unsigned long *p2m;
 
-	if (!install_p2mtop_page(pfn, p))
-		free_page((unsigned long)p);
+		p2m = alloc_p2m_page();
+		if (!p2m)
+			return false;
+
+		p2m_init(p2m);
+
+		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+			free_p2m_page(p2m);
+		else
+			mid_mfn[mididx] = virt_to_mfn(p2m);
+	}
+
+	return true;
 }
 
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	unsigned topidx, idx;
+	unsigned topidx, mididx, idx;
 
-	if (unlikely(pfn >= max_p2m_pfn)) {
+	if (unlikely(pfn >= MAX_P2M_PFN)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
 		return true;
 	}
 
 	topidx = p2m_top_index(pfn);
-	if (p2m_top[topidx] == p2m_missing) {
-		if (mfn == INVALID_P2M_ENTRY)
-			return true;
-		return false;
-	}
-
+	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
-	p2m_top[topidx][idx] = mfn;
+
+	if (p2m_top[topidx][mididx] == p2m_missing)
+		return mfn == INVALID_P2M_ENTRY;
+
+	p2m_top[topidx][mididx][idx] = mfn;
 
 	return true;
 }
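
Every allocation path in alloc_p2m() above follows the same lock-free idiom: build a page privately, publish it with cmpxchg, and free your copy if another cpu won the race. A standalone C11 sketch of that idiom, using NULL as the empty sentinel where the kernel compares against p2m_missing / p2m_mid_missing:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static _Atomic(long *) slot;    /* NULL plays the "missing" sentinel */

    static void install(void)
    {
            long *fresh = calloc(1, sizeof(*fresh));
            long *expected = NULL;

            if (!fresh)
                    return;
            /* Publish atomically; on losing the race, drop our copy and
             * use whoever won -- the same shape as alloc_p2m(). */
            if (!atomic_compare_exchange_strong(&slot, &expected, fresh))
                    free(fresh);
    }

    int main(void)
    {
            install();
            install();      /* second call loses the "race" and frees its page */
            printf("slot = %p\n", (void *)atomic_load(&slot));
            return 0;
    }
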
@@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	}
 
 	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
-		alloc_p2m(pfn);
+		WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn);
 
 		if (!__set_phys_to_machine(pfn, mfn))
 			BUG();
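
The shape of this last hunk — try the cheap set, populate the tree on failure, retry, and only then BUG() — can be modelled in isolation. A hypothetical userspace sketch, where try_set stands in for __set_phys_to_machine() on a single leaf page:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define INVALID (~0UL)

    static unsigned long *leaf;     /* one p2m leaf page; NULL means missing */

    /* Succeed, or report that the intermediate levels need populating. */
    static bool try_set(unsigned long idx, unsigned long mfn)
    {
            if (!leaf)
                    return mfn == INVALID;  /* a missing leaf absorbs INVALID */
            leaf[idx] = mfn;
            return true;
    }

    int main(void)
    {
            if (!try_set(7, 0x1234)) {
                    leaf = calloc(512, sizeof(*leaf));      /* alloc_p2m() */
                    if (!leaf || !try_set(7, 0x1234))
                            abort();                        /* kernel BUG()s here */
            }
            printf("leaf[7] = %#lx\n", leaf[7]);
            return 0;
    }
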