Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--  arch/x86/xen/mmu.c  750
1 file changed, 744 insertions, 6 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..319bd40a57c2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/mmu_context.h>
+#include <asm/setup.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>
 
@@ -55,6 +56,8 @@
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>
 
 #include "multicalls.h"
 #include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
 
 #endif /* CONFIG_XEN_DEBUG_FS */
 
+
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);          /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
+
+
 /*
  * Just beyond the highest usermode address. STACK_TOP_MAX has a
  * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
 {
         return pte_mfn_to_pfn(pte.pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 
 pgdval_t xen_pgd_val(pgd_t pgd)
 {
         return pte_mfn_to_pfn(pgd.pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 
 pte_t xen_make_pte(pteval_t pte)
 {
         pte = pte_pfn_to_mfn(pte);
         return native_make_pte(pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
         pgd = pte_pfn_to_mfn(pgd);
         return native_make_pgd(pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 
 pmdval_t xen_pmd_val(pmd_t pmd)
 {
         return pte_mfn_to_pfn(pmd.pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 
 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
         pmd = pte_pfn_to_mfn(pmd);
         return native_make_pmd(pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
 #if PAGETABLE_LEVELS == 4
 pudval_t xen_pud_val(pud_t pud)
 {
         return pte_mfn_to_pfn(pud.pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 
 pud_t xen_make_pud(pudval_t pud)
 {
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
 
         return native_make_pud(pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 
 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
@@ -1063,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
         struct mm_struct *mm = info;
         struct mm_struct *active_mm;
 
-#ifdef CONFIG_X86_64
-        active_mm = read_pda(active_mm);
-#else
-        active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+        active_mm = percpu_read(cpu_tlbstate.active_mm);
 
         if (active_mm == mm)
                 leave_mm(smp_processor_id());
 
         /* If this cpu still has a stale cr3 reference, then make sure
            it has been flushed. */
-        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+        if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
                 load_cr3(swapper_pg_dir);
                 arch_flush_lazy_cpu_mode();
         }
@@ -1156,6 +1194,706 @@ void xen_exit_mmap(struct mm_struct *mm)
         spin_unlock(&mm->page_table_lock);
 }
 
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+        xen_setup_shared_info();
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+        percpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+        return percpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+        return percpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_flush_tlb(void)
+{
+        struct mmuext_op *op;
+        struct multicall_space mcs;
+
+        preempt_disable();
+
+        mcs = xen_mc_entry(sizeof(*op));
+
+        op = mcs.args;
+        op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+        preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+        struct mmuext_op *op;
+        struct multicall_space mcs;
+
+        preempt_disable();
+
+        mcs = xen_mc_entry(sizeof(*op));
+        op = mcs.args;
+        op->cmd = MMUEXT_INVLPG_LOCAL;
+        op->arg1.linear_addr = addr & PAGE_MASK;
+        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+        preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+                                 struct mm_struct *mm, unsigned long va)
+{
+        struct {
+                struct mmuext_op op;
+                DECLARE_BITMAP(mask, NR_CPUS);
+        } *args;
+        struct multicall_space mcs;
+
+        BUG_ON(cpumask_empty(cpus));
+        BUG_ON(!mm);
+
+        mcs = xen_mc_entry(sizeof(*args));
+        args = mcs.args;
+        args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+        /* Remove us, and any offline CPUS. */
+        cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+        cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+
+        if (va == TLB_FLUSH_ALL) {
+                args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+        } else {
+                args->op.cmd = MMUEXT_INVLPG_MULTI;
+                args->op.arg1.linear_addr = va;
+        }
+
+        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+        return percpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+        percpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+        struct mmuext_op *op;
+        struct multicall_space mcs;
+        unsigned long mfn;
+
+        if (cr3)
+                mfn = pfn_to_mfn(PFN_DOWN(cr3));
+        else
+                mfn = 0;
+
+        WARN_ON(mfn == 0 && kernel);
+
+        mcs = __xen_mc_entry(sizeof(*op));
+
+        op = mcs.args;
+        op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+        op->arg1.mfn = mfn;
+
+        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+        if (kernel) {
+                percpu_write(xen_cr3, cr3);
+
+                /* Update xen_current_cr3 once the batch has actually
+                   been submitted. */
+                xen_mc_callback(set_current_cr3, (void *)cr3);
+        }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+        BUG_ON(preemptible());
+
+        xen_mc_batch();  /* disables interrupts */
+
+        /* Update while interrupts are disabled, so its atomic with
+           respect to ipis */
+        percpu_write(xen_cr3, cr3);
+
+        __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+        {
+                pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+                if (user_pgd)
+                        __xen_write_cr3(false, __pa(user_pgd));
+                else
+                        __xen_write_cr3(false, 0);
+        }
+#endif
+
+        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+        pgd_t *pgd = mm->pgd;
+        int ret = 0;
+
+        BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+        {
+                struct page *page = virt_to_page(pgd);
+                pgd_t *user_pgd;
+
+                BUG_ON(page->private != 0);
+
+                ret = -ENOMEM;
+
+                user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+                page->private = (unsigned long)user_pgd;
+
+                if (user_pgd != NULL) {
+                        user_pgd[pgd_index(VSYSCALL_START)] =
+                                __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                        ret = 0;
+                }
+
+                BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+        }
+#endif
+
+        return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+        pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+        if (user_pgd)
+                free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+        pgprot_t prot = PAGE_KERNEL;
+
+        if (PagePinned(page))
+                prot = PAGE_KERNEL_RO;
+
+        if (0 && PageHighMem(page))
+                printk("mapping highpte %lx type %d prot %s\n",
+                       page_to_pfn(page), type,
+                       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+        return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+        if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+                pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+                               pte_val_ma(pte));
+
+        return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+        pte = mask_rw_pte(ptep, pte);
+
+        xen_set_pte(ptep, pte);
+}
+#endif
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+        BUG_ON(mem_map);        /* should only be used early */
+#endif
+        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+   only init_mm and anything attached to that is pinned. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+        struct mmuext_op op;
+        op.cmd = cmd;
+        op.arg1.mfn = pfn_to_mfn(pfn);
+        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+                BUG();
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+   attached to a pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+        struct page *page = pfn_to_page(pfn);
+
+        if (PagePinned(virt_to_page(mm->pgd))) {
+                SetPagePinned(page);
+
+                vm_unmap_aliases();
+                if (!PageHighMem(page)) {
+                        make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+                } else {
+                        /* make sure there are no stray mappings of
+                           this page */
+                        kmap_flush_unused();
+                }
+        }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+        struct page *page = pfn_to_page(pfn);
+
+        if (PagePinned(page)) {
+                if (!PageHighMem(page)) {
+                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                                pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+                        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+                }
+                ClearPagePinned(page);
+        }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+        unsigned long top = HYPERVISOR_VIRT_START;
+        struct xen_platform_parameters pp;
+
+        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+                top = pp.virt_start;
+
+        reserve_top_address(-top);
+#endif  /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+        return (void *)(paddr + __START_KERNEL_map);
+#else
+        return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+        phys_addr_t paddr;
+
+        maddr &= PTE_PFN_MASK;
+        paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+        return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+        return __ka(m2p(maddr));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+        unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+        pte_t pte = pfn_pte(pfn, prot);
+
+        if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+                BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+        unsigned pmdidx, pteidx;
+        unsigned ident_pte;
+        unsigned long pfn;
+
+        ident_pte = 0;
+        pfn = 0;
+        for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+                pte_t *pte_page;
+
+                /* Reuse or allocate a page of ptes */
+                if (pmd_present(pmd[pmdidx]))
+                        pte_page = m2v(pmd[pmdidx].pmd);
+                else {
+                        /* Check for free pte pages */
+                        if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+                                break;
+
+                        pte_page = &level1_ident_pgt[ident_pte];
+                        ident_pte += PTRS_PER_PTE;
+
+                        pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+                }
+
+                /* Install mappings */
+                for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+                        pte_t pte;
+
+                        if (pfn > max_pfn_mapped)
+                                max_pfn_mapped = pfn;
+
+                        if (!pte_none(pte_page[pteidx]))
+                                continue;
+
+                        pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+                        pte_page[pteidx] = pte;
+                }
+        }
+
+        for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+                set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+        set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+        pte_t *pte = v;
+        int i;
+
+        /* All levels are converted the same way, so just treat them
+           as ptes. */
+        for (i = 0; i < PTRS_PER_PTE; i++)
+                pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                         unsigned long max_pfn)
+{
+        pud_t *l3;
+        pmd_t *l2;
+
+        /* Zap identity mapping */
+        init_level4_pgt[0] = __pgd(0);
+
+        /* Pre-constructed entries are in pfn, so convert to mfn */
+        convert_pfn_mfn(init_level4_pgt);
+        convert_pfn_mfn(level3_ident_pgt);
+        convert_pfn_mfn(level3_kernel_pgt);
+
+        l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+        l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+        memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+        memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+        l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+        l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+        memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+        /* Set up identity map */
+        xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+        /* Make pagetable pieces RO */
+        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+        /* Pin down new L4 */
+        pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+                          PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+        /* Unpin Xen-provided one */
+        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+        /* Switch over */
+        pgd = init_level4_pgt;
+
+        /*
+         * At this stage there can be no user pgd, and no page
+         * structure to attach it to, so make sure we just set kernel
+         * pgd.
+         */
+        xen_mc_batch();
+        __xen_write_cr3(true, __pa(pgd));
+        xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+        reserve_early(__pa(xen_start_info->pt_base),
+                      __pa(xen_start_info->pt_base +
+                           xen_start_info->nr_pt_frames * PAGE_SIZE),
+                      "XEN PAGETABLES");
+
+        return pgd;
+}
+#else   /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                         unsigned long max_pfn)
+{
+        pmd_t *kernel_pmd;
+
+        init_pg_tables_start = __pa(pgd);
+        init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+        max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+        memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+        xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+        memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+        set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+                        __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+        set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+        xen_write_cr3(__pa(swapper_pg_dir));
+
+        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+        return swapper_pg_dir;
+}
+#endif  /* CONFIG_X86_64 */
+
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+        pte_t pte;
+
+        phys >>= PAGE_SHIFT;
+
+        switch (idx) {
+        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+        case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+        case FIX_WP_TEST:
+        case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+        case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+        case FIX_APIC_BASE:     /* maps dummy local APIC */
+#endif
+                pte = pfn_pte(phys, prot);
+                break;
+
+        default:
+                pte = mfn_pte(phys, prot);
+                break;
+        }
+
+        __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+        /* Replicate changes to map the vsyscall page into the user
+           pagetable vsyscall mapping. */
+        if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+                unsigned long vaddr = __fix_to_virt(idx);
+                set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+        }
+#endif
+}
+
+__init void xen_post_allocator_init(void)
+{
+        pv_mmu_ops.set_pte = xen_set_pte;
+        pv_mmu_ops.set_pmd = xen_set_pmd;
+        pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+        pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+        /* This will work as long as patching hasn't happened yet
+           (which it hasn't) */
+        pv_mmu_ops.alloc_pte = xen_alloc_pte;
+        pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+        pv_mmu_ops.release_pte = xen_release_pte;
+        pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+        pv_mmu_ops.alloc_pud = xen_alloc_pud;
+        pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+        SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+        xen_mark_init_mm_pinned();
+}
+
+
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+        .pagetable_setup_start = xen_pagetable_setup_start,
+        .pagetable_setup_done = xen_pagetable_setup_done,
+
+        .read_cr2 = xen_read_cr2,
+        .write_cr2 = xen_write_cr2,
+
+        .read_cr3 = xen_read_cr3,
+        .write_cr3 = xen_write_cr3,
+
+        .flush_tlb_user = xen_flush_tlb,
+        .flush_tlb_kernel = xen_flush_tlb,
+        .flush_tlb_single = xen_flush_tlb_single,
+        .flush_tlb_others = xen_flush_tlb_others,
+
+        .pte_update = paravirt_nop,
+        .pte_update_defer = paravirt_nop,
+
+        .pgd_alloc = xen_pgd_alloc,
+        .pgd_free = xen_pgd_free,
+
+        .alloc_pte = xen_alloc_pte_init,
+        .release_pte = xen_release_pte_init,
+        .alloc_pmd = xen_alloc_pte_init,
+        .alloc_pmd_clone = paravirt_nop,
+        .release_pmd = xen_release_pte_init,
+
+#ifdef CONFIG_HIGHPTE
+        .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+#ifdef CONFIG_X86_64
+        .set_pte = xen_set_pte,
+#else
+        .set_pte = xen_set_pte_init,
+#endif
+        .set_pte_at = xen_set_pte_at,
+        .set_pmd = xen_set_pmd_hyper,
+
+        .ptep_modify_prot_start = __ptep_modify_prot_start,
+        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+        .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+        .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+        .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+        .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+        .set_pte_atomic = xen_set_pte_atomic,
+        .set_pte_present = xen_set_pte_at,
+        .pte_clear = xen_pte_clear,
+        .pmd_clear = xen_pmd_clear,
+#endif  /* CONFIG_X86_PAE */
+        .set_pud = xen_set_pud_hyper,
+
+        .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+        .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+        .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+        .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+        .set_pgd = xen_set_pgd_hyper,
+
+        .alloc_pud = xen_alloc_pte_init,
+        .release_pud = xen_release_pte_init,
+#endif  /* PAGETABLE_LEVELS == 4 */
+
+        .activate_mm = xen_activate_mm,
+        .dup_mmap = xen_dup_mmap,
+        .exit_mmap = xen_exit_mmap,
+
+        .lazy_mode = {
+                .enter = paravirt_enter_lazy_mmu,
+                .leave = xen_leave_lazy,
+        },
+
+        .set_fixmap = xen_set_fixmap,
+};
+
+
 #ifdef CONFIG_XEN_DEBUG_FS
 
 static struct dentry *d_mmu_debug;