Diffstat (limited to 'arch/x86/kvm/paging_tmpl.h')
-rw-r--r--   arch/x86/kvm/paging_tmpl.h | 249
1 file changed, 168 insertions, 81 deletions
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f2..613ec9aa674a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
 #if PTTYPE == 64
 #define pt_element_t u64
 #define guest_walker guest_walker64
+#define shadow_walker shadow_walker64
 #define FNAME(name) paging##64_##name
 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
 #define PT_LEVEL_BITS PT64_LEVEL_BITS
 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
 #elif PTTYPE == 32
 #define pt_element_t u32
 #define guest_walker guest_walker32
+#define shadow_walker shadow_walker32
 #define FNAME(name) paging##32_##name
 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 #define PT_LEVEL_BITS PT32_LEVEL_BITS
 #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
 	u32 error_code;
 };
 
+struct shadow_walker {
+	struct kvm_shadow_walk walker;
+	struct guest_walker *guest_walker;
+	int user_fault;
+	int write_fault;
+	int largepage;
+	int *ptwrite;
+	pfn_t pfn;
+	u64 *sptep;
+};
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
 	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
 	pt_element_t *table;
 	struct page *page;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(kvm, table_gfn);
-	up_read(&current->mm->mmap_sem);
 
 	table = kmap_atomic(page, KM_USER0);
-
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
 	kunmap_atomic(table, KM_USER0);
 
 	kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+				    struct kvm_vcpu *vcpu, u64 addr,
+				    u64 *sptep, int level)
 {
-	hpa_t shadow_addr;
-	int level;
-	u64 *shadow_ent;
-	unsigned access = walker->pt_access;
-
-	if (!is_present_pte(walker->ptes[walker->level - 1]))
-		return NULL;
-
-	shadow_addr = vcpu->arch.mmu.root_hpa;
-	level = vcpu->arch.mmu.shadow_root_level;
-	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-		shadow_addr &= PT64_BASE_ADDR_MASK;
-		--level;
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
+	struct guest_walker *gw = sw->guest_walker;
+	unsigned access = gw->pt_access;
+	struct kvm_mmu_page *shadow_page;
+	u64 spte;
+	int metaphysical;
+	gfn_t table_gfn;
+	int r;
+	pt_element_t curr_pte;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+			     sw->user_fault, sw->write_fault,
+			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+			     false);
+		sw->sptep = sptep;
+		return 1;
 	}
 
-	for (; ; level--) {
-		u32 index = SHADOW_PT_INDEX(addr, level);
-		struct kvm_mmu_page *shadow_page;
-		u64 shadow_pte;
-		int metaphysical;
-		gfn_t table_gfn;
-
-		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (level == PT_PAGE_TABLE_LEVEL)
-			break;
-
-		if (largepage && level == PT_DIRECTORY_LEVEL)
-			break;
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+		return 0;
 
-		if (is_shadow_present_pte(*shadow_ent)
-		    && !is_large_pte(*shadow_ent)) {
-			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-			continue;
-		}
+	if (is_large_pte(*sptep)) {
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+		rmap_remove(vcpu->kvm, sptep);
+	}
 
-		if (is_large_pte(*shadow_ent))
-			rmap_remove(vcpu->kvm, shadow_ent);
-
-		if (level - 1 == PT_PAGE_TABLE_LEVEL
-		    && walker->level == PT_DIRECTORY_LEVEL) {
-			metaphysical = 1;
-			if (!is_dirty_pte(walker->ptes[level - 1]))
-				access &= ~ACC_WRITE_MASK;
-			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
-		} else {
-			metaphysical = 0;
-			table_gfn = walker->table_gfn[level - 2];
-		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       metaphysical, access,
-					       shadow_ent);
-		if (!metaphysical) {
-			int r;
-			pt_element_t curr_pte;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  walker->pte_gpa[level - 2],
-						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2]) {
-				kvm_release_pfn_clean(pfn);
-				return NULL;
-			}
+	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+		metaphysical = 1;
+		if (!is_dirty_pte(gw->ptes[level - 1]))
+			access &= ~ACC_WRITE_MASK;
+		table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+	} else {
+		metaphysical = 0;
+		table_gfn = gw->table_gfn[level - 2];
+	}
+	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
+				       metaphysical, access, sptep);
+	if (!metaphysical) {
+		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+					  &curr_pte, sizeof(curr_pte));
+		if (r || curr_pte != gw->ptes[level - 2]) {
+			kvm_release_pfn_clean(sw->pfn);
+			sw->sptep = NULL;
+			return 1;
 		}
-		shadow_addr = __pa(shadow_page->spt);
-		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		set_shadow_pte(shadow_ent, shadow_pte);
 	}
 
-	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-		     user_fault, write_fault,
-		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, pfn, false);
+	spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	*sptep = spte;
+	return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+			 struct guest_walker *guest_walker,
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, pfn_t pfn)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_walk_entry), },
+		.guest_walker = guest_walker,
+		.user_fault = user_fault,
+		.write_fault = write_fault,
+		.largepage = largepage,
+		.ptwrite = ptwrite,
+		.pfn = pfn,
+	};
+
+	if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+		return NULL;
+
+	walk_shadow(&walker.walker, vcpu, addr);
 
-	return shadow_ent;
+	return walker.sptep;
 }
 
 /*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
 	if (walker.level == PT_DIRECTORY_LEVEL) {
 		gfn_t large_gfn;
 		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
 	return 0;
 }
 
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+				      struct kvm_vcpu *vcpu, u64 addr,
+				      u64 *sptep, int level)
+{
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		if (is_shadow_present_pte(*sptep))
+			rmap_remove(vcpu->kvm, sptep);
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		return 1;
+	}
+	if (!is_shadow_present_pte(*sptep))
+		return 1;
+	return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+	};
+
+	walk_shadow(&walker.walker, vcpu, gva);
+}
+
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 	}
 }
 
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ *   can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	int i, offset, nr_present;
+
+	offset = nr_present = 0;
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+		unsigned pte_access;
+		pt_element_t gpte;
+		gpa_t pte_gpa;
+		gfn_t gfn = sp->gfns[i];
+
+		if (!is_shadow_present_pte(sp->spt[i]))
+			continue;
+
+		pte_gpa = gfn_to_gpa(sp->gfn);
+		pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+					  sizeof(pt_element_t)))
+			return -EINVAL;
+
+		if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+		    !(gpte & PT_ACCESSED_MASK)) {
+			u64 nonpresent;
+
+			rmap_remove(vcpu->kvm, &sp->spt[i]);
+			if (is_present_pte(gpte))
+				nonpresent = shadow_trap_nonpresent_pte;
+			else
+				nonpresent = shadow_notrap_nonpresent_pte;
+			set_shadow_pte(&sp->spt[i], nonpresent);
+			continue;
+		}
+
+		nr_present++;
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+			 is_dirty_pte(gpte), 0, gfn,
+			 spte_to_pfn(sp->spt[i]), true, false);
+	}
+
+	return !nr_present;
+}
+
 #undef pt_element_t
 #undef guest_walker
+#undef shadow_walker
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
 #undef PT_LEVEL_BITS
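
The new FNAME(shadow_walk_entry) and FNAME(shadow_invlpg_entry) above share one idiom: a per-call context struct embeds the generic struct kvm_shadow_walk, walk_shadow() (defined outside this file) calls the embedded entry callback at each paging level, and the callback recovers its context with container_of(), returning nonzero to stop the walk. Below is a minimal standalone sketch of that idiom only; all names in it are illustrative, not kernel code.

/* Illustrative sketch of the embed-and-container_of walker callback pattern
 * used by struct shadow_walker; not kernel code, names are made up. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Generic walker: a single entry callback, like struct kvm_shadow_walk. */
struct walk {
	int (*entry)(struct walk *walk, int level);	/* nonzero stops the walk */
};

/* Step down the levels, like walk_shadow() descending the shadow hierarchy,
 * until the callback reports it is done. */
static void do_walk(struct walk *walk)
{
	int level;

	for (level = 4; level >= 1; level--)
		if (walk->entry(walk, level))
			return;
}

/* Concrete walker: embeds the generic one plus per-call state, the way
 * struct shadow_walker wraps struct kvm_shadow_walk. */
struct counting_walker {
	struct walk walk;
	int stop_level;
	int visited;
};

static int counting_entry(struct walk *_w, int level)
{
	struct counting_walker *w = container_of(_w, struct counting_walker, walk);

	w->visited++;
	return level == w->stop_level;	/* terminate early at the target level */
}

int main(void)
{
	struct counting_walker w = {
		.walk = { .entry = counting_entry },
		.stop_level = 2,
	};

	do_walk(&w.walk);
	printf("visited %d levels\n", w.visited);	/* prints 3: levels 4, 3, 2 */
	return 0;
}

Returning nonzero from the callback is what lets shadow_walk_entry end the walk once it has installed the leaf spte (or detected a stale guest pte), while returning zero lets walk_shadow descend to the next level.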
