author     Avi Kivity <avi@redhat.com>  2012-09-16 07:18:51 -0400
committer  Avi Kivity <avi@redhat.com>  2012-09-20 06:00:08 -0400
commit     8cbc70696f149e44753b0fe60162b4ff96c2dd2b
tree       79729287462257080b071f584bdba7a7ef9a25ea /arch/x86/kvm
parent     3d34adec7081621ff51c195be045b87d75c0c49d
KVM: MMU: Update accessed and dirty bits after guest pagetable walk
While unspecified, the behaviour of Intel processors is to first
perform the page table walk, then, if the walk was successful, to
atomically update the accessed and dirty bits of the walked paging
elements. While we are not required to follow this exactly, doing so
lets us perform the access permission check once, after the walk is
complete, rather than after each walk step.
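
As a minimal sketch of that two-phase ordering (standalone C with
simplified stand-in types and names, not the KVM API; bits 5 and 6 are
the architectural accessed and dirty bits), the walk first snapshots
every paging element, and only after the walk and the permission check
succeed are the A/D bits written back atomically:

#include <stdbool.h>
#include <stdint.h>

#define MAX_LEVELS       4
#define PT_ACCESSED_MASK (1ull << 5)
#define PT_DIRTY_MASK    (1ull << 6)

struct walk {
	int level;                  /* level the walk terminated at */
	int max_level;              /* level the walk started at */
	uint64_t ptes[MAX_LEVELS];  /* snapshot of each walked entry */
	uint64_t *ptep[MAX_LEVELS]; /* location of each walked entry */
};

/* Phase two: runs only after the whole walk (and the permission check)
 * has succeeded.  Returns 1 if a concurrent guest update was detected
 * and the caller must restart the walk. */
static int update_accessed_dirty(struct walk *w, bool write_fault)
{
	for (int level = w->max_level; level >= w->level; --level) {
		uint64_t orig = w->ptes[level - 1];
		uint64_t pte = orig | PT_ACCESSED_MASK;

		/* The dirty bit is only set on the terminal level,
		 * and only for a write. */
		if (level == w->level && write_fault)
			pte |= PT_DIRTY_MASK;
		if (pte == orig)
			continue;

		/* Atomic write-back; fails if the guest raced with us. */
		if (!__atomic_compare_exchange_n(w->ptep[level - 1], &orig,
						 pte, false,
						 __ATOMIC_SEQ_CST,
						 __ATOMIC_SEQ_CST))
			return 1;
		w->ptes[level - 1] = pte;
	}
	return 0;
}

The compare-and-exchange is what makes deferring safe: if the guest
changed an entry between the snapshot and the write-back, the exchange
fails and the caller retries the walk from scratch, which is exactly
the retry_walk protocol in the patch below.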
The tricky case is SMEP: a zero in the U bit of any paging element
along the walk makes the referenced page a supervisor page, so we
cannot decide to fault on a set U bit at an individual walk step.
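
To make that concrete, here is an illustrative standalone check (bit 2
is the architectural U/S bit; the function name and layout are invented
for the example): a page is a user page only if U=1 at every walked
level, so a supervisor-mode instruction fetch under SMEP can only be
declared faulting once the walk has seen all levels:

#include <stdbool.h>
#include <stdint.h>

#define PT_USER_MASK (1ull << 2)

/* Under SMEP, an instruction fetch in supervisor mode faults if the
 * target page is a *user* page -- and user-ness is the AND of the
 * U/S bit across all walked levels.  Seeing U=1 at one level proves
 * nothing, because a later level may still clear it. */
static bool smep_fetch_faults(const uint64_t *ptes, int nr_levels)
{
	bool user_page = true;

	for (int i = 0; i < nr_levels; i++)
		user_page &= !!(ptes[i] & PT_USER_MASK);

	return user_page;
}

Because the fault decision needs the whole walk, the A/D write-back has
to wait for the walk to complete as well, which is what this patch
implements.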
Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  76
1 file changed, 47 insertions(+), 29 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1cbf576852ca..35a05dd2f69c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -63,10 +63,12 @@
  */
 struct guest_walker {
 	int level;
+	unsigned max_level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -119,6 +121,43 @@ static bool FNAME(is_last_gpte)(struct guest_walker *walker,
 	return false;
 }
 
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+					     struct kvm_mmu *mmu,
+					     struct guest_walker *walker,
+					     int write_fault)
+{
+	unsigned level, index;
+	pt_element_t pte, orig_pte;
+	pt_element_t __user *ptep_user;
+	gfn_t table_gfn;
+	int ret;
+
+	for (level = walker->max_level; level >= walker->level; --level) {
+		pte = orig_pte = walker->ptes[level - 1];
+		table_gfn = walker->table_gfn[level - 1];
+		ptep_user = walker->ptep_user[level - 1];
+		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+		if (!(pte & PT_ACCESSED_MASK)) {
+			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_ACCESSED_MASK;
+		}
+		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+			pte |= PT_DIRTY_MASK;
+		}
+		if (pte == orig_pte)
+			continue;
+
+		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+		if (ret)
+			return ret;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		walker->ptes[level] = pte;
+	}
+	return 0;
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -126,6 +165,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
+	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
@@ -153,6 +193,7 @@ retry_walk:
 		--walker->level;
 	}
 #endif
+	walker->max_level = walker->level;
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
@@ -183,6 +224,7 @@ retry_walk:
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
 		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
 			goto error;
+		walker->ptep_user[walker->level - 1] = ptep_user;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
@@ -214,21 +256,6 @@ retry_walk:
 			eperm = true;
 		}
 
-		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-			int ret;
-			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-						       sizeof(pte));
-			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0))
-				goto error;
-			else if (ret)
-				goto retry_walk;
-
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			pte |= PT_ACCESSED_MASK;
-		}
-
 		walker->ptes[walker->level - 1] = pte;
 
 		if (last_gpte) {
@@ -268,21 +295,12 @@ retry_walk:
 
 	if (!write_fault)
 		protect_clean_gpte(&pte_access, pte);
-	else if (unlikely(!is_dirty_gpte(pte))) {
-		int ret;
 
-		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-					  pte, pte|PT_DIRTY_MASK);
-		if (unlikely(ret < 0))
-			goto error;
-		else if (ret)
-			goto retry_walk;
-
-		mark_page_dirty(vcpu->kvm, table_gfn);
-		pte |= PT_DIRTY_MASK;
-		walker->ptes[walker->level - 1] = pte;
-	}
+	ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
+	if (unlikely(ret < 0))
+		goto error;
+	else if (ret)
+		goto retry_walk;
 
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;