Diffstat (limited to 'arch/x86/kvm/paging_tmpl.h')
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	199
1 file changed, 83 insertions(+), 116 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae76..714e2c01a6fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -63,10 +63,12 @@
  */
 struct guest_walker {
 	int level;
+	unsigned max_level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-				   bool last)
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+					     struct kvm_mmu *mmu,
+					     struct guest_walker *walker,
+					     int write_fault)
 {
-	unsigned access;
-
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	if (last && !is_dirty_gpte(gpte))
-		access &= ~ACC_WRITE_MASK;
-
-#if PTTYPE == 64
-	if (vcpu->arch.mmu.nx)
-		access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
-	return access;
-}
-
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
-				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				pt_element_t gpte)
-{
-	if (walker->level == PT_PAGE_TABLE_LEVEL)
-		return true;
-
-	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
-	    (PTTYPE == 64 || is_pse(vcpu)))
-		return true;
-
-	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
-	    (mmu->root_level == PT64_ROOT_LEVEL))
-		return true;
-
-	return false;
+	unsigned level, index;
+	pt_element_t pte, orig_pte;
+	pt_element_t __user *ptep_user;
+	gfn_t table_gfn;
+	int ret;
+
+	for (level = walker->max_level; level >= walker->level; --level) {
+		pte = orig_pte = walker->ptes[level - 1];
+		table_gfn = walker->table_gfn[level - 1];
+		ptep_user = walker->ptep_user[level - 1];
+		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+		if (!(pte & PT_ACCESSED_MASK)) {
+			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_ACCESSED_MASK;
+		}
+		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_DIRTY_MASK;
+		}
+		if (pte == orig_pte)
+			continue;
+
+		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+		if (ret)
+			return ret;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		walker->ptes[level] = pte;
+	}
+	return 0;
 }
 
 /*
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
+	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, uninitialized_var(pte_access);
+	unsigned index, pt_access, pte_access, accessed_dirty, shift;
 	gpa_t pte_gpa;
-	bool eperm, last_gpte;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;
+	gpa_t real_gpa;
+	gfn_t gfn;
 
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
 
@@ -169,15 +175,21 @@ retry_walk:
 		--walker->level;
 	}
 #endif
+	walker->max_level = walker->level;
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
-	pt_access = ACC_ALL;
+	accessed_dirty = PT_ACCESSED_MASK;
+	pt_access = pte_access = ACC_ALL;
+	++walker->level;
 
-	for (;;) {
+	do {
 		gfn_t real_gfn;
 		unsigned long host_addr;
 
+		pt_access &= pte_access;
+		--walker->level;
+
 		index = PT_INDEX(addr, walker->level);
 
 		table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
 		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
 			goto error;
+		walker->ptep_user[walker->level - 1] = ptep_user;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
@@ -211,92 +224,48 @@ retry_walk:
 			goto error;
 		}
 
-		if (!check_write_user_access(vcpu, write_fault, user_fault,
-					      pte))
-			eperm = true;
-
-#if PTTYPE == 64
-		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-			eperm = true;
-#endif
-
-		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
-		if (last_gpte) {
-			pte_access = pt_access &
-				     FNAME(gpte_access)(vcpu, pte, true);
-			/* check if the kernel is fetching from user page */
-			if (unlikely(pte_access & PT_USER_MASK) &&
-			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				if (fetch_fault && !user_fault)
-					eperm = true;
-		}
-
-		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-			int ret;
-			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-						       sizeof(pte));
-			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0))
-				goto error;
-			else if (ret)
-				goto retry_walk;
-
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			pte |= PT_ACCESSED_MASK;
-		}
-
+		accessed_dirty &= pte;
+		pte_access = pt_access & gpte_access(vcpu, pte);
+
 		walker->ptes[walker->level - 1] = pte;
+	} while (!is_last_gpte(mmu, walker->level, pte));
 
-		if (last_gpte) {
-			int lvl = walker->level;
-			gpa_t real_gpa;
-			gfn_t gfn;
-			u32 ac;
-
-			gfn = gpte_to_gfn_lvl(pte, lvl);
-			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
-			if (PTTYPE == 32 &&
-			    walker->level == PT_DIRECTORY_LEVEL &&
-			    is_cpuid_PSE36())
-				gfn += pse36_gfn_delta(pte);
-
-			ac = write_fault | fetch_fault | user_fault;
-
-			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-						      ac);
-			if (real_gpa == UNMAPPED_GVA)
-				return 0;
-
-			walker->gfn = real_gpa >> PAGE_SHIFT;
-
-			break;
-		}
-
-		pt_access &= FNAME(gpte_access)(vcpu, pte, false);
-		--walker->level;
-	}
-
-	if (unlikely(eperm)) {
-		errcode |= PFERR_PRESENT_MASK;
-		goto error;
-	}
-
-	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
-		int ret;
-
-		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-					  pte, pte|PT_DIRTY_MASK);
-		if (unlikely(ret < 0))
-			goto error;
-		else if (ret)
-			goto retry_walk;
-
-		mark_page_dirty(vcpu->kvm, table_gfn);
-		pte |= PT_DIRTY_MASK;
-		walker->ptes[walker->level - 1] = pte;
+	if (unlikely(permission_fault(mmu, pte_access, access))) {
+		errcode |= PFERR_PRESENT_MASK;
+		goto error;
+	}
+
+	gfn = gpte_to_gfn_lvl(pte, walker->level);
+	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+		gfn += pse36_gfn_delta(pte);
+
+	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+	if (real_gpa == UNMAPPED_GVA)
+		return 0;
+
+	walker->gfn = real_gpa >> PAGE_SHIFT;
+
+	if (!write_fault)
+		protect_clean_gpte(&pte_access, pte);
+
+	/*
+	 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
+	 * place right.
+	 *
+	 * On a read fault, do nothing.
+	 */
+	shift = write_fault >> ilog2(PFERR_WRITE_MASK);
+	shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
+	accessed_dirty &= pte >> shift;
+
+	if (unlikely(!accessed_dirty)) {
+		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
+		if (unlikely(ret < 0))
+			goto error;
+		else if (ret)
+			goto retry_walk;
 	}
 
 	walker->pt_access = pt_access;
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+	pte_access = sp->role.access & gpte_access(vcpu, gpte);
+	protect_clean_gpte(&pte_access, gpte);
 	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (mmu_invalid_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return;
-	}
+	if (mmu_invalid_pfn(pfn))
+		return;
 
 	/*
 	 * we call mmu_set_spte() with host_writable = true because that
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 			continue;
 
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-								  true);
+		pte_access = sp->role.access & gpte_access(vcpu, gpte);
+		protect_clean_gpte(&pte_access, gpte);
 		gfn = gpte_to_gfn(gpte);
 		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 					      pte_access & ACC_WRITE_MASK);
-		if (mmu_invalid_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			break;
-		}
+		if (mmu_invalid_pfn(pfn))
+			break;
 
 		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
 			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
-		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+		pte_access &= gpte_access(vcpu, gpte);
+		protect_clean_gpte(&pte_access, gpte);
 
 		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
 			continue;
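
Note (not part of the patch): the reworked walk accumulates the accessed bit of every gpte in accessed_dirty and, on a write fault, folds the leaf's dirty bit into the same mask via the shift above, so FNAME(update_accessed_dirty_bits) and its per-level cmpxchg only run when some bit is actually clear. The sketch below is a minimal standalone illustration of that shift trick, assuming the usual x86 bit positions (accessed = bit 5, dirty = bit 6) and PFERR_WRITE_MASK = 1 << 1; the helper accessed_dirty_ok() is made up for this example and collapses the per-level accumulation into a single pte.

/*
 * Standalone sketch of the accessed/dirty shift used in
 * FNAME(walk_addr_generic) above -- not kernel code. Constants mirror
 * the x86 definitions: accessed = bit 5, dirty = bit 6, PFERR_WRITE = bit 1.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_SHIFT	5
#define PT_DIRTY_SHIFT		6
#define PT_ACCESSED_MASK	(1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK		(1ULL << PT_DIRTY_SHIFT)
#define PFERR_WRITE_MASK	(1U << 1)

#define ilog2(x)		(31 - __builtin_clz(x))	/* enough for this example */

/*
 * Non-zero when the accessed bit is set and, on a write fault, the dirty
 * bit is set too -- i.e. when the slow update_accessed_dirty_bits() pass
 * could be skipped. (The real walker ANDs accessed_dirty with every pte
 * in the chain; one pte is enough to show the trick.)
 */
static uint64_t accessed_dirty_ok(uint64_t pte, unsigned int write_fault)
{
	uint64_t accessed_dirty = PT_ACCESSED_MASK;
	unsigned int shift;

	accessed_dirty &= pte;				/* keep the accessed bit of the pte */

	shift = write_fault >> ilog2(PFERR_WRITE_MASK);	/* 1 on write fault, 0 on read */
	shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;	/* distance from dirty to accessed */
	accessed_dirty &= pte >> shift;			/* write: dirty bit lands on bit 5 */

	return accessed_dirty;
}

int main(void)
{
	/* read fault: only the accessed bit matters */
	assert(accessed_dirty_ok(PT_ACCESSED_MASK, 0));
	assert(!accessed_dirty_ok(0, 0));

	/* write fault: accessed and dirty must both already be set */
	assert(accessed_dirty_ok(PT_ACCESSED_MASK | PT_DIRTY_MASK, PFERR_WRITE_MASK));
	assert(!accessed_dirty_ok(PT_ACCESSED_MASK, PFERR_WRITE_MASK));

	printf("shift trick behaves as expected\n");
	return 0;
}

On a read fault the computed shift is 0, so only the accessed bits gathered during the walk are tested; on a write fault it is 1, which lines the dirty bit (bit 6) up with the accessed position (bit 5) so a single non-zero test covers both conditions.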