aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dave Hansen <dave@linux.vnet.ibm.com> 2013-01-22 16:24:35 -0500
committer H. Peter Anvin <hpa@linux.intel.com> 2013-01-25 19:34:55 -0500
commit 5dfd486c4750c9278c63fa96e6e85bdd2fb58e9d (patch)
tree 47dfd03b9778ed0e357e7b3b8a20b9ec9462b56a
parent d765653445129b7c476758040e3079480775f80a (diff)
x86, kvm: Fix kvm's use of __pa() on percpu areas
In short, it is illegal to call __pa() on an address holding a percpu variable. This replaces those __pa() calls with slow_virt_to_phys(). All of the cases in this patch are in boot time (or CPU hotplug time at worst) code, so the slow pagetable walking in slow_virt_to_phys() is not expected to have a performance impact. The times when this actually matters are pretty obscure (certain 32-bit NUMA systems), but it _does_ happen. It is important to keep KVM guests working on these systems because the real hardware is getting harder and harder to find. This bug manifested first by me seeing a plain hang at boot after this message: CPU 0 irqstacks, hard=f3018000 soft=f301a000 or, sometimes, it would actually make it out to the console: [ 0.000000] BUG: unable to handle kernel paging request at ffffffff I eventually traced it down to the KVM async pagefault code. This can be worked around by disabling that code either at compile-time, or on the kernel command-line. The kvm async pagefault code was injecting page faults into the guest which the guest misinterpreted because its "reason" was not being properly sent from the host. The guest passes a physical address of a per-cpu async page fault structure via an MSR to the host. Since __pa() is broken on percpu data, the physical address it sent was basically bogus and the host went scribbling on random data. The guest never saw the real reason for the page fault (it was injected by the host), assumed that the kernel had taken a _real_ page fault, and panic()'d. The behavior varied, though, depending on what got corrupted by the bad write. Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com> Link: http://lkml.kernel.org/r/20130122212435.4905663F@kernel.stglabs.ibm.com Acked-by: Rik van Riel <riel@redhat.com> Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--arch/x86/kernel/kvm.c9
-rw-r--r--arch/x86/kernel/kvmclock.c4
2 files changed, 7 insertions, 6 deletions
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9c2bd8bd4b4c..aa7e58b82b39 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -297,9 +297,9 @@ static void kvm_register_steal_time(void)
297 297
298 memset(st, 0, sizeof(*st)); 298 memset(st, 0, sizeof(*st));
299 299
300 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); 300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
301 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", 301 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
302 cpu, __pa(st)); 302 cpu, slow_virt_to_phys(st));
303} 303}
304 304
305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
324 return; 324 return;
325 325
326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
327 u64 pa = __pa(&__get_cpu_var(apf_reason)); 327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
328 328
329#ifdef CONFIG_PREEMPT 329#ifdef CONFIG_PREEMPT
330 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 330 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
340 /* Size alignment is implied but just to make it explicit. */ 340 /* Size alignment is implied but just to make it explicit. */
341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
342 __get_cpu_var(kvm_apic_eoi) = 0; 342 __get_cpu_var(kvm_apic_eoi) = 0;
343 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; 343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
344 | KVM_MSR_ENABLED;
344 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 345 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
345 } 346 }
346 347
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f8..9f966dc0b9e4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -162,8 +162,8 @@ int kvm_register_clock(char *txt)
162 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
164 164
165 low = (int)__pa(src) | 1; 165 low = (int)slow_virt_to_phys(src) | 1;
166 high = ((u64)__pa(src) >> 32); 166 high = ((u64)slow_virt_to_phys(src) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
169 cpu, high, low, txt); 169 cpu, high, low, txt);