Diffstat (limited to 'arch/arm/kvm/mmu.c')
-rw-r--r--	arch/arm/kvm/mmu.c	| 408
1 file changed, 221 insertions(+), 187 deletions(-)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index be302128c5d7..45c43aecb8f2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -43,11 +43,9 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
 
-#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
-#define kvm_pud_huge(_x)	pud_huge(_x)
-
 #define KVM_S2PTE_FLAG_IS_IOMAP	(1UL << 0)
 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
 
@@ -69,14 +67,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
-	/*
-	 * This function also gets called when dealing with HYP page
-	 * tables. As HYP doesn't have an associated struct kvm (and
-	 * the HYP page tables are fairly static), we don't do
-	 * anything there.
-	 */
-	if (kvm)
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
 /*
@@ -115,7 +106,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
-	if (!kvm_pmd_huge(*pmd))
+	if (!pmd_thp_or_huge(*pmd))
 		return;
 
 	pmd_clear(pmd);
@@ -155,29 +146,29 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 	return p;
 }
 
-static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 {
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
-	pgd_clear(pgd);
+	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+	stage2_pgd_clear(pgd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pud_free(NULL, pud_table);
+	stage2_pud_free(pud_table);
 	put_page(virt_to_page(pgd));
 }
 
-static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
+	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+	VM_BUG_ON(stage2_pud_huge(*pud));
+	stage2_pud_clear(pud);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	stage2_pmd_free(pmd_table);
 	put_page(virt_to_page(pud));
 }
 
-static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(kvm_pmd_huge(*pmd));
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
 	pmd_clear(pmd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
 	pte_free_kernel(NULL, pte_table);
@@ -204,7 +195,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
  * the IO subsystem will never hit in the cache.
  */
-static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t start_addr = addr;
@@ -226,21 +217,21 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	if (kvm_pte_table_empty(kvm, start_pte))
-		clear_pmd_entry(kvm, pmd, start_addr);
+	if (stage2_pte_table_empty(start_pte))
+		clear_stage2_pmd_entry(kvm, pmd, start_addr);
 }
 
-static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pmd_t *pmd, *start_pmd;
 
-	start_pmd = pmd = pmd_offset(pud, addr);
+	start_pmd = pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				pmd_t old_pmd = *pmd;
 
 				pmd_clear(pmd);
@@ -250,57 +241,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud,
 
 				put_page(virt_to_page(pmd));
 			} else {
-				unmap_ptes(kvm, pmd, addr, next);
+				unmap_stage2_ptes(kvm, pmd, addr, next);
 			}
 		}
 	} while (pmd++, addr = next, addr != end);
 
-	if (kvm_pmd_table_empty(kvm, start_pmd))
-		clear_pud_entry(kvm, pud, start_addr);
+	if (stage2_pmd_table_empty(start_pmd))
+		clear_stage2_pud_entry(kvm, pud, start_addr);
 }
 
-static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pud_t *pud, *start_pud;
 
-	start_pud = pud = pud_offset(pgd, addr);
+	start_pud = pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud)) {
 				pud_t old_pud = *pud;
 
-				pud_clear(pud);
+				stage2_pud_clear(pud);
 				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
 				kvm_flush_dcache_pud(old_pud);
-
 				put_page(virt_to_page(pud));
 			} else {
-				unmap_pmds(kvm, pud, addr, next);
+				unmap_stage2_pmds(kvm, pud, addr, next);
 			}
 		}
 	} while (pud++, addr = next, addr != end);
 
-	if (kvm_pud_table_empty(kvm, start_pud))
-		clear_pgd_entry(kvm, pgd, start_addr);
+	if (stage2_pud_table_empty(start_pud))
+		clear_stage2_pgd_entry(kvm, pgd, start_addr);
 }
 
-
-static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
-			phys_addr_t start, u64 size)
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm:   The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size:  The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 {
 	pgd_t *pgd;
 	phys_addr_t addr = start, end = start + size;
 	phys_addr_t next;
 
-	pgd = pgdp + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_puds(kvm, pgd, addr, next);
+		next = stage2_pgd_addr_end(addr, end);
+		if (!stage2_pgd_none(*pgd))
+			unmap_stage2_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -322,11 +320,11 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 	pmd_t *pmd;
 	phys_addr_t next;
 
-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd))
+			if (pmd_thp_or_huge(*pmd))
 				kvm_flush_dcache_pmd(*pmd);
 			else
 				stage2_flush_ptes(kvm, pmd, addr, next);
@@ -340,11 +338,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 	pud_t *pud;
 	phys_addr_t next;
 
-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud))
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud))
 				kvm_flush_dcache_pud(*pud);
 			else
 				stage2_flush_pmds(kvm, pud, addr, next);
@@ -360,9 +358,9 @@ static void stage2_flush_memslot(struct kvm *kvm,
 	phys_addr_t next;
 	pgd_t *pgd;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
+		next = stage2_pgd_addr_end(addr, end);
 		stage2_flush_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -391,6 +389,100 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+	pgd_clear(pgd);
+	pud_free(NULL, pud_table);
+	put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+	VM_BUG_ON(pud_huge(*pud));
+	pud_clear(pud);
+	pmd_free(NULL, pmd_table);
+	put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
+	pmd_clear(pmd);
+	pte_free_kernel(NULL, pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte, *start_pte;
+
+	start_pte = pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			kvm_set_pte(pte, __pte(0));
+			put_page(virt_to_page(pte));
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	if (hyp_pte_table_empty(start_pte))
+		clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pmd_t *pmd, *start_pmd;
+
+	start_pmd = pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* Hyp doesn't use huge pmds */
+		if (!pmd_none(*pmd))
+			unmap_hyp_ptes(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+
+	if (hyp_pmd_table_empty(start_pmd))
+		clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pud_t *pud, *start_pud;
+
+	start_pud = pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* Hyp doesn't use huge puds */
+		if (!pud_none(*pud))
+			unmap_hyp_pmds(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	if (hyp_pud_table_empty(start_pud))
+		clear_hyp_pgd_entry(pgd);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+	pgd_t *pgd;
+	phys_addr_t addr = start, end = start + size;
+	phys_addr_t next;
+
+	/*
+	 * We don't unmap anything from HYP, except at the hyp tear down.
+	 * Hence, we don't have to invalidate the TLBs here.
+	 */
+	pgd = pgdp + pgd_index(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none(*pgd))
+			unmap_hyp_puds(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
 /**
  * free_boot_hyp_pgd - free HYP boot page tables
  *
@@ -401,14 +493,14 @@ void free_boot_hyp_pgd(void)
 	mutex_lock(&kvm_hyp_pgd_mutex);
 
 	if (boot_hyp_pgd) {
-		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 		boot_hyp_pgd = NULL;
 	}
 
 	if (hyp_pgd)
-		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
@@ -433,9 +525,9 @@ void free_hyp_pgds(void)
 
 	if (hyp_pgd) {
 		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 
 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 		hyp_pgd = NULL;
@@ -645,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 				 __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
-/* Free the HW pgd, one page at a time */
-static void kvm_free_hwpgd(void *hwpgd)
-{
-	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
-}
-
-/* Allocate the HW PGD, making sure that each page gets its own refcount */
-static void *kvm_alloc_hwpgd(void)
-{
-	unsigned int size = kvm_get_hwpgd_size();
-
-	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
-}
-
 /**
  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
  * @kvm:	The KVM struct pointer for the VM.
@@ -673,81 +751,22 @@ static void *kvm_alloc_hwpgd(void)
 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 {
 	pgd_t *pgd;
-	void *hwpgd;
 
 	if (kvm->arch.pgd != NULL) {
 		kvm_err("kvm_arch already initialized?\n");
 		return -EINVAL;
 	}
 
-	hwpgd = kvm_alloc_hwpgd();
-	if (!hwpgd)
+	/* Allocate the HW PGD, making sure that each page gets its own refcount */
+	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+	if (!pgd)
 		return -ENOMEM;
 
-	/* When the kernel uses more levels of page tables than the
-	 * guest, we allocate a fake PGD and pre-populate it to point
-	 * to the next-level page table, which will be the real
-	 * initial page table pointed to by the VTTBR.
-	 *
-	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
-	 * the PMD and the kernel will use folded pud.
-	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
-	 * pages.
-	 */
-	if (KVM_PREALLOC_LEVEL > 0) {
-		int i;
-
-		/*
-		 * Allocate fake pgd for the page table manipulation macros to
-		 * work. This is not used by the hardware and we have no
-		 * alignment requirement for this allocation.
-		 */
-		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-				GFP_KERNEL | __GFP_ZERO);
-
-		if (!pgd) {
-			kvm_free_hwpgd(hwpgd);
-			return -ENOMEM;
-		}
-
-		/* Plug the HW PGD into the fake one. */
-		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-			if (KVM_PREALLOC_LEVEL == 1)
-				pgd_populate(NULL, pgd + i,
-					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
-			else if (KVM_PREALLOC_LEVEL == 2)
-				pud_populate(NULL, pud_offset(pgd, 0) + i,
-					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
-		}
-	} else {
-		/*
-		 * Allocate actual first-level Stage-2 page table used by the
-		 * hardware for Stage-2 page table walks.
-		 */
-		pgd = (pgd_t *)hwpgd;
-	}
-
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
 	return 0;
 }
 
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	unmap_range(kvm, kvm->arch.pgd, start, size);
-}
-
 static void stage2_unmap_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -830,10 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 		return;
 
 	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
-	if (KVM_PREALLOC_LEVEL > 0)
-		kfree(kvm->arch.pgd);
-
+	/* Free the HW pgd, one page at a time */
+	free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
 	kvm->arch.pgd = NULL;
 }
 
@@ -843,16 +860,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pgd_t *pgd;
 	pud_t *pud;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
-	if (WARN_ON(pgd_none(*pgd))) {
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+	if (WARN_ON(stage2_pgd_none(*pgd))) {
 		if (!cache)
 			return NULL;
 		pud = mmu_memory_cache_alloc(cache);
-		pgd_populate(NULL, pgd, pud);
+		stage2_pgd_populate(pgd, pud);
 		get_page(virt_to_page(pgd));
 	}
 
-	return pud_offset(pgd, addr);
+	return stage2_pud_offset(pgd, addr);
 }
 
 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -862,15 +879,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pmd_t *pmd;
 
 	pud = stage2_get_pud(kvm, cache, addr);
-	if (pud_none(*pud)) {
+	if (stage2_pud_none(*pud)) {
 		if (!cache)
 			return NULL;
 		pmd = mmu_memory_cache_alloc(cache);
-		pud_populate(NULL, pud, pmd);
+		stage2_pud_populate(pud, pmd);
 		get_page(virt_to_page(pud));
 	}
 
-	return pmd_offset(pud, addr);
+	return stage2_pmd_offset(pud, addr);
 }
 
 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
@@ -893,11 +910,14 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
 
 	old_pmd = *pmd;
-	kvm_set_pmd(pmd, *new_pmd);
-	if (pmd_present(old_pmd))
+	if (pmd_present(old_pmd)) {
+		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	else
+	} else {
 		get_page(virt_to_page(pmd));
+	}
+
+	kvm_set_pmd(pmd, *new_pmd);
 	return 0;
 }
 
@@ -946,15 +966,38 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 
 	/* Create 2nd stage page table mapping - Level 3 */
 	old_pte = *pte;
-	kvm_set_pte(pte, *new_pte);
-	if (pte_present(old_pte))
+	if (pte_present(old_pte)) {
+		kvm_set_pte(pte, __pte(0));
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	else
+	} else {
 		get_page(virt_to_page(pte));
+	}
 
+	kvm_set_pte(pte, *new_pte);
 	return 0;
 }
 
+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	if (pte_young(*pte)) {
+		*pte = pte_mkold(*pte);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
 /**
  * kvm_phys_addr_ioremap - map a device range to guest IPA
  *
@@ -978,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
 
 		if (writable)
-			kvm_set_s2pte_writable(&pte);
+			pte = kvm_s2pte_mkwrite(pte);
 
 		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
 						KVM_NR_MEM_OBJS);
@@ -1078,12 +1121,12 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 	pmd_t *pmd;
 	phys_addr_t next;
 
-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);
 
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
 			} else {
@@ -1106,12 +1149,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 	pud_t *pud;
 	phys_addr_t next;
 
-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
 			/* TODO:PUD not supported, revisit later if supported */
-			BUG_ON(kvm_pud_huge(*pud));
+			BUG_ON(stage2_pud_huge(*pud));
 			stage2_wp_pmds(pud, addr, next);
 		}
 	} while (pud++, addr = next, addr != end);
@@ -1128,7 +1171,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	pgd_t *pgd;
 	phys_addr_t next;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
 		/*
 		 * Release kvm_mmu_lock periodically if the memory region is
@@ -1140,8 +1183,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
 			cond_resched_lock(&kvm->mmu_lock);
 
-		next = kvm_pgd_addr_end(addr, end);
-		if (pgd_present(*pgd))
+		next = stage2_pgd_addr_end(addr, end);
+		if (stage2_pgd_present(*pgd))
 			stage2_wp_puds(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -1320,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
 		new_pmd = pmd_mkhuge(new_pmd);
 		if (writable) {
-			kvm_set_s2pmd_writable(&new_pmd);
+			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
@@ -1329,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pte_t new_pte = pfn_pte(pfn, mem_type);
 
 		if (writable) {
-			kvm_set_s2pte_writable(&new_pte);
+			new_pte = kvm_s2pte_mkwrite(new_pte);
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
@@ -1348,6 +1391,8 @@ out_unlock:
  * Resolve the access fault by making the page young again.
  * Note that because the faulting entry is guaranteed not to be
  * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
  */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
@@ -1364,7 +1409,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		goto out;
 
-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
 		*pmd = pmd_mkyoung(*pmd);
 		pfn = pmd_pfn(*pmd);
 		pfn_valid = true;
@@ -1588,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;
 
-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
-		if (pmd_young(*pmd)) {
-			*pmd = pmd_mkold(*pmd);
-			return 1;
-		}
-
-		return 0;
-	}
+	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
+		return stage2_pmdp_test_and_clear_young(pmd);
 
 	pte = pte_offset_kernel(pmd, gpa);
 	if (pte_none(*pte))
 		return 0;
 
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);	/* Just a page... */
-		return 1;
-	}
-
-	return 0;
+	return stage2_ptep_test_and_clear_young(pte);
 }
 
 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
@@ -1618,7 +1652,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;
 
-	if (kvm_pmd_huge(*pmd))	/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
 		return pmd_young(*pmd);
 
 	pte = pte_offset_kernel(pmd, gpa);