diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-27 13:13:52 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-27 13:13:52 -0400 |
commit | 42cadc86008aae0fd9ff31642dc01ed50723cf32 (patch) | |
tree | b05d4c8f0561bad5a0183a89fb23ce4c8ee1653c /arch/x86/kernel | |
parent | fba5c1af5c4fd6645fe62ea84ccde0981282cf66 (diff) | |
parent | 66c0b394f08fd89236515c1c84485ea712a157be (diff) |
Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (147 commits)
KVM: kill file->f_count abuse in kvm
KVM: MMU: kvm_pv_mmu_op should not take mmap_sem
KVM: SVM: remove selective CR0 comment
KVM: SVM: remove now obsolete FIXME comment
KVM: SVM: disable CR8 intercept when tpr is not masking interrupts
KVM: SVM: sync V_TPR with LAPIC.TPR if CR8 write intercept is disabled
KVM: export kvm_lapic_set_tpr() to modules
KVM: SVM: sync TPR value to V_TPR field in the VMCB
KVM: ppc: PowerPC 440 KVM implementation
KVM: Add MAINTAINERS entry for PowerPC KVM
KVM: ppc: Add DCR access information to struct kvm_run
ppc: Export tlb_44x_hwater for KVM
KVM: Rename debugfs_dir to kvm_debugfs_dir
KVM: x86 emulator: fix lea to really get the effective address
KVM: x86 emulator: fix smsw and lmsw with a memory operand
KVM: x86 emulator: initialize src.val and dst.val for register operands
KVM: SVM: force a new asid when initializing the vmcb
KVM: fix kvm_vcpu_kick vs __vcpu_run race
KVM: add ioctls to save/store mpstate
KVM: Rename VCPU_MP_STATE_* to KVM_MP_STATE_*
...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/crash.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 248 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 187 | ||||
-rw-r--r-- | arch/x86/kernel/reboot.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/setup_32.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/setup_64.c | 7 |
7 files changed, 463 insertions, 3 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d0af0c..fa19c3819540 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o | |||
80 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | 80 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o |
81 | 81 | ||
82 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | 82 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o |
83 | obj-$(CONFIG_KVM_GUEST) += kvm.o | ||
84 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | ||
83 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 85 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
84 | 86 | ||
85 | ifdef CONFIG_INPUT_PCSPKR | 87 | ifdef CONFIG_INPUT_PCSPKR |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2251d0ae9570..268553817909 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <asm/hpet.h> | 25 | #include <asm/hpet.h> |
26 | #include <linux/kdebug.h> | 26 | #include <linux/kdebug.h> |
27 | #include <asm/smp.h> | 27 | #include <asm/smp.h> |
28 | #include <asm/reboot.h> | ||
28 | 29 | ||
29 | #include <mach_ipi.h> | 30 | #include <mach_ipi.h> |
30 | 31 | ||
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void) | |||
117 | } | 118 | } |
118 | #endif | 119 | #endif |
119 | 120 | ||
120 | void machine_crash_shutdown(struct pt_regs *regs) | 121 | void native_machine_crash_shutdown(struct pt_regs *regs) |
121 | { | 122 | { |
122 | /* This function is only called after the system | 123 | /* This function is only called after the system |
123 | * has panicked or is otherwise in a critical state. | 124 | * has panicked or is otherwise in a critical state. |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 000000000000..8b7a3cf37d2b --- /dev/null +++ b/arch/x86/kernel/kvm.c | |||
@@ -0,0 +1,248 @@ | |||
1 | /* | ||
2 | * KVM paravirt_ops implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
17 | * | ||
18 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
19 | * Copyright IBM Corporation, 2007 | ||
20 | * Authors: Anthony Liguori <aliguori@us.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/kvm_para.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/hardirq.h> | ||
30 | |||
31 | #define MMU_QUEUE_SIZE 1024 | ||
32 | |||
33 | struct kvm_para_state { | ||
34 | u8 mmu_queue[MMU_QUEUE_SIZE]; | ||
35 | int mmu_queue_len; | ||
36 | enum paravirt_lazy_mode mode; | ||
37 | }; | ||
38 | |||
39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | ||
40 | |||
41 | static struct kvm_para_state *kvm_para_state(void) | ||
42 | { | ||
43 | return &per_cpu(para_state, raw_smp_processor_id()); | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * No need for any "IO delay" on KVM | ||
48 | */ | ||
49 | static void kvm_io_delay(void) | ||
50 | { | ||
51 | } | ||
52 | |||
53 | static void kvm_mmu_op(void *buffer, unsigned len) | ||
54 | { | ||
55 | int r; | ||
56 | unsigned long a1, a2; | ||
57 | |||
58 | do { | ||
59 | a1 = __pa(buffer); | ||
60 | a2 = 0; /* on i386 __pa() always returns <4G */ | ||
61 | r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); | ||
62 | buffer += r; | ||
63 | len -= r; | ||
64 | } while (len); | ||
65 | } | ||
66 | |||
67 | static void mmu_queue_flush(struct kvm_para_state *state) | ||
68 | { | ||
69 | if (state->mmu_queue_len) { | ||
70 | kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); | ||
71 | state->mmu_queue_len = 0; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | static void kvm_deferred_mmu_op(void *buffer, int len) | ||
76 | { | ||
77 | struct kvm_para_state *state = kvm_para_state(); | ||
78 | |||
79 | if (state->mode != PARAVIRT_LAZY_MMU) { | ||
80 | kvm_mmu_op(buffer, len); | ||
81 | return; | ||
82 | } | ||
83 | if (state->mmu_queue_len + len > sizeof state->mmu_queue) | ||
84 | mmu_queue_flush(state); | ||
85 | memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); | ||
86 | state->mmu_queue_len += len; | ||
87 | } | ||
88 | |||
89 | static void kvm_mmu_write(void *dest, u64 val) | ||
90 | { | ||
91 | __u64 pte_phys; | ||
92 | struct kvm_mmu_op_write_pte wpte; | ||
93 | |||
94 | #ifdef CONFIG_HIGHPTE | ||
95 | struct page *page; | ||
96 | unsigned long dst = (unsigned long) dest; | ||
97 | |||
98 | page = kmap_atomic_to_page(dest); | ||
99 | pte_phys = page_to_pfn(page); | ||
100 | pte_phys <<= PAGE_SHIFT; | ||
101 | pte_phys += (dst & ~(PAGE_MASK)); | ||
102 | #else | ||
103 | pte_phys = (unsigned long)__pa(dest); | ||
104 | #endif | ||
105 | wpte.header.op = KVM_MMU_OP_WRITE_PTE; | ||
106 | wpte.pte_val = val; | ||
107 | wpte.pte_phys = pte_phys; | ||
108 | |||
109 | kvm_deferred_mmu_op(&wpte, sizeof wpte); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * We only need to hook operations that are MMU writes. We hook these so that | ||
114 | * we can use lazy MMU mode to batch these operations. We could probably | ||
115 | * improve the performance of the host code if we used some of the information | ||
116 | * here to simplify processing of batched writes. | ||
117 | */ | ||
118 | static void kvm_set_pte(pte_t *ptep, pte_t pte) | ||
119 | { | ||
120 | kvm_mmu_write(ptep, pte_val(pte)); | ||
121 | } | ||
122 | |||
123 | static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
124 | pte_t *ptep, pte_t pte) | ||
125 | { | ||
126 | kvm_mmu_write(ptep, pte_val(pte)); | ||
127 | } | ||
128 | |||
129 | static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) | ||
130 | { | ||
131 | kvm_mmu_write(pmdp, pmd_val(pmd)); | ||
132 | } | ||
133 | |||
134 | #if PAGETABLE_LEVELS >= 3 | ||
135 | #ifdef CONFIG_X86_PAE | ||
136 | static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
137 | { | ||
138 | kvm_mmu_write(ptep, pte_val(pte)); | ||
139 | } | ||
140 | |||
141 | static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr, | ||
142 | pte_t *ptep, pte_t pte) | ||
143 | { | ||
144 | kvm_mmu_write(ptep, pte_val(pte)); | ||
145 | } | ||
146 | |||
147 | static void kvm_pte_clear(struct mm_struct *mm, | ||
148 | unsigned long addr, pte_t *ptep) | ||
149 | { | ||
150 | kvm_mmu_write(ptep, 0); | ||
151 | } | ||
152 | |||
153 | static void kvm_pmd_clear(pmd_t *pmdp) | ||
154 | { | ||
155 | kvm_mmu_write(pmdp, 0); | ||
156 | } | ||
157 | #endif | ||
158 | |||
159 | static void kvm_set_pud(pud_t *pudp, pud_t pud) | ||
160 | { | ||
161 | kvm_mmu_write(pudp, pud_val(pud)); | ||
162 | } | ||
163 | |||
164 | #if PAGETABLE_LEVELS == 4 | ||
165 | static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) | ||
166 | { | ||
167 | kvm_mmu_write(pgdp, pgd_val(pgd)); | ||
168 | } | ||
169 | #endif | ||
170 | #endif /* PAGETABLE_LEVELS >= 3 */ | ||
171 | |||
172 | static void kvm_flush_tlb(void) | ||
173 | { | ||
174 | struct kvm_mmu_op_flush_tlb ftlb = { | ||
175 | .header.op = KVM_MMU_OP_FLUSH_TLB, | ||
176 | }; | ||
177 | |||
178 | kvm_deferred_mmu_op(&ftlb, sizeof ftlb); | ||
179 | } | ||
180 | |||
181 | static void kvm_release_pt(u32 pfn) | ||
182 | { | ||
183 | struct kvm_mmu_op_release_pt rpt = { | ||
184 | .header.op = KVM_MMU_OP_RELEASE_PT, | ||
185 | .pt_phys = (u64)pfn << PAGE_SHIFT, | ||
186 | }; | ||
187 | |||
188 | kvm_mmu_op(&rpt, sizeof rpt); | ||
189 | } | ||
190 | |||
191 | static void kvm_enter_lazy_mmu(void) | ||
192 | { | ||
193 | struct kvm_para_state *state = kvm_para_state(); | ||
194 | |||
195 | paravirt_enter_lazy_mmu(); | ||
196 | state->mode = paravirt_get_lazy_mode(); | ||
197 | } | ||
198 | |||
199 | static void kvm_leave_lazy_mmu(void) | ||
200 | { | ||
201 | struct kvm_para_state *state = kvm_para_state(); | ||
202 | |||
203 | mmu_queue_flush(state); | ||
204 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | ||
205 | state->mode = paravirt_get_lazy_mode(); | ||
206 | } | ||
207 | |||
208 | static void paravirt_ops_setup(void) | ||
209 | { | ||
210 | pv_info.name = "KVM"; | ||
211 | pv_info.paravirt_enabled = 1; | ||
212 | |||
213 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) | ||
214 | pv_cpu_ops.io_delay = kvm_io_delay; | ||
215 | |||
216 | if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { | ||
217 | pv_mmu_ops.set_pte = kvm_set_pte; | ||
218 | pv_mmu_ops.set_pte_at = kvm_set_pte_at; | ||
219 | pv_mmu_ops.set_pmd = kvm_set_pmd; | ||
220 | #if PAGETABLE_LEVELS >= 3 | ||
221 | #ifdef CONFIG_X86_PAE | ||
222 | pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; | ||
223 | pv_mmu_ops.set_pte_present = kvm_set_pte_present; | ||
224 | pv_mmu_ops.pte_clear = kvm_pte_clear; | ||
225 | pv_mmu_ops.pmd_clear = kvm_pmd_clear; | ||
226 | #endif | ||
227 | pv_mmu_ops.set_pud = kvm_set_pud; | ||
228 | #if PAGETABLE_LEVELS == 4 | ||
229 | pv_mmu_ops.set_pgd = kvm_set_pgd; | ||
230 | #endif | ||
231 | #endif | ||
232 | pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; | ||
233 | pv_mmu_ops.release_pte = kvm_release_pt; | ||
234 | pv_mmu_ops.release_pmd = kvm_release_pt; | ||
235 | pv_mmu_ops.release_pud = kvm_release_pt; | ||
236 | |||
237 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; | ||
238 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | void __init kvm_guest_init(void) | ||
243 | { | ||
244 | if (!kvm_para_available()) | ||
245 | return; | ||
246 | |||
247 | paravirt_ops_setup(); | ||
248 | } | ||
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000000..ddee04043aeb --- /dev/null +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* KVM paravirtual clock driver. A clocksource implementation | ||
2 | Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/clocksource.h> | ||
20 | #include <linux/kvm_para.h> | ||
21 | #include <asm/arch_hooks.h> | ||
22 | #include <asm/msr.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <linux/percpu.h> | ||
25 | #include <asm/reboot.h> | ||
26 | |||
27 | #define KVM_SCALE 22 | ||
28 | |||
29 | static int kvmclock = 1; | ||
30 | |||
31 | static int parse_no_kvmclock(char *arg) | ||
32 | { | ||
33 | kvmclock = 0; | ||
34 | return 0; | ||
35 | } | ||
36 | early_param("no-kvmclock", parse_no_kvmclock); | ||
37 | |||
38 | /* The hypervisor will put information about time periodically here */ | ||
39 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); | ||
40 | #define get_clock(cpu, field) per_cpu(hv_clock, cpu).field | ||
41 | |||
42 | static inline u64 kvm_get_delta(u64 last_tsc) | ||
43 | { | ||
44 | int cpu = smp_processor_id(); | ||
45 | u64 delta = native_read_tsc() - last_tsc; | ||
46 | return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; | ||
47 | } | ||
48 | |||
49 | static struct kvm_wall_clock wall_clock; | ||
50 | static cycle_t kvm_clock_read(void); | ||
51 | /* | ||
52 | * The wallclock is the time of day when we booted. Since then, some time may | ||
53 | * have elapsed since the hypervisor wrote the data. So we try to account for | ||
54 | * that with system time | ||
55 | */ | ||
56 | unsigned long kvm_get_wallclock(void) | ||
57 | { | ||
58 | u32 wc_sec, wc_nsec; | ||
59 | u64 delta; | ||
60 | struct timespec ts; | ||
61 | int version, nsec; | ||
62 | int low, high; | ||
63 | |||
64 | low = (int)__pa(&wall_clock); | ||
65 | high = ((u64)__pa(&wall_clock) >> 32); | ||
66 | |||
67 | delta = kvm_clock_read(); | ||
68 | |||
69 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | ||
70 | do { | ||
71 | version = wall_clock.wc_version; | ||
72 | rmb(); | ||
73 | wc_sec = wall_clock.wc_sec; | ||
74 | wc_nsec = wall_clock.wc_nsec; | ||
75 | rmb(); | ||
76 | } while ((wall_clock.wc_version != version) || (version & 1)); | ||
77 | |||
78 | delta = kvm_clock_read() - delta; | ||
79 | delta += wc_nsec; | ||
80 | nsec = do_div(delta, NSEC_PER_SEC); | ||
81 | set_normalized_timespec(&ts, wc_sec + delta, nsec); | ||
82 | /* | ||
83 | * Of all mechanisms of time adjustment I've tested, this one | ||
84 | * was the champion! | ||
85 | */ | ||
86 | return ts.tv_sec + 1; | ||
87 | } | ||
88 | |||
89 | int kvm_set_wallclock(unsigned long now) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This is our read_clock function. The host puts a tsc timestamp each time | ||
96 | * it updates a new time. Without the tsc adjustment, we can have a situation | ||
97 | * in which a vcpu starts to run earlier (smaller system_time), but probes | ||
98 | * time later (compared to another vcpu), leading to backwards time | ||
99 | */ | ||
100 | static cycle_t kvm_clock_read(void) | ||
101 | { | ||
102 | u64 last_tsc, now; | ||
103 | int cpu; | ||
104 | |||
105 | preempt_disable(); | ||
106 | cpu = smp_processor_id(); | ||
107 | |||
108 | last_tsc = get_clock(cpu, tsc_timestamp); | ||
109 | now = get_clock(cpu, system_time); | ||
110 | |||
111 | now += kvm_get_delta(last_tsc); | ||
112 | preempt_enable(); | ||
113 | |||
114 | return now; | ||
115 | } | ||
116 | static struct clocksource kvm_clock = { | ||
117 | .name = "kvm-clock", | ||
118 | .read = kvm_clock_read, | ||
119 | .rating = 400, | ||
120 | .mask = CLOCKSOURCE_MASK(64), | ||
121 | .mult = 1 << KVM_SCALE, | ||
122 | .shift = KVM_SCALE, | ||
123 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
124 | }; | ||
125 | |||
126 | static int kvm_register_clock(void) | ||
127 | { | ||
128 | int cpu = smp_processor_id(); | ||
129 | int low, high; | ||
130 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | ||
131 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | ||
132 | |||
133 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); | ||
134 | } | ||
135 | |||
136 | static void kvm_setup_secondary_clock(void) | ||
137 | { | ||
138 | /* | ||
139 | * Now that the first cpu already had this clocksource initialized, | ||
140 | * we shouldn't fail. | ||
141 | */ | ||
142 | WARN_ON(kvm_register_clock()); | ||
143 | /* ok, done with our trickery, call native */ | ||
144 | setup_secondary_APIC_clock(); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * After the clock is registered, the host will keep writing to the | ||
149 | * registered memory location. If the guest happens to shutdown, this memory | ||
150 | * won't be valid. In cases like kexec, in which you install a new kernel, this | ||
151 | * means a random memory location will keep being written. So before any | ||
152 | * kind of shutdown from our side, we unregister the clock by writing anything | ||
153 | * that does not have the 'enable' bit set in the msr | ||
154 | */ | ||
155 | #ifdef CONFIG_KEXEC | ||
156 | static void kvm_crash_shutdown(struct pt_regs *regs) | ||
157 | { | ||
158 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | ||
159 | native_machine_crash_shutdown(regs); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | static void kvm_shutdown(void) | ||
164 | { | ||
165 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | ||
166 | native_machine_shutdown(); | ||
167 | } | ||
168 | |||
169 | void __init kvmclock_init(void) | ||
170 | { | ||
171 | if (!kvm_para_available()) | ||
172 | return; | ||
173 | |||
174 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | ||
175 | if (kvm_register_clock()) | ||
176 | return; | ||
177 | pv_time_ops.get_wallclock = kvm_get_wallclock; | ||
178 | pv_time_ops.set_wallclock = kvm_set_wallclock; | ||
179 | pv_time_ops.sched_clock = kvm_clock_read; | ||
180 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | ||
181 | machine_ops.shutdown = kvm_shutdown; | ||
182 | #ifdef CONFIG_KEXEC | ||
183 | machine_ops.crash_shutdown = kvm_crash_shutdown; | ||
184 | #endif | ||
185 | clocksource_register(&kvm_clock); | ||
186 | } | ||
187 | } | ||
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1791a751a772..a4a838306b2c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void) | |||
399 | } | 399 | } |
400 | } | 400 | } |
401 | 401 | ||
402 | static void native_machine_shutdown(void) | 402 | void native_machine_shutdown(void) |
403 | { | 403 | { |
404 | /* Stop the cpus and apics */ | 404 | /* Stop the cpus and apics */ |
405 | #ifdef CONFIG_SMP | 405 | #ifdef CONFIG_SMP |
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = { | |||
470 | .shutdown = native_machine_shutdown, | 470 | .shutdown = native_machine_shutdown, |
471 | .emergency_restart = native_machine_emergency_restart, | 471 | .emergency_restart = native_machine_emergency_restart, |
472 | .restart = native_machine_restart, | 472 | .restart = native_machine_restart, |
473 | .halt = native_machine_halt | 473 | .halt = native_machine_halt, |
474 | #ifdef CONFIG_KEXEC | ||
475 | .crash_shutdown = native_machine_crash_shutdown, | ||
476 | #endif | ||
474 | }; | 477 | }; |
475 | 478 | ||
476 | void machine_power_off(void) | 479 | void machine_power_off(void) |
@@ -498,3 +501,9 @@ void machine_halt(void) | |||
498 | machine_ops.halt(); | 501 | machine_ops.halt(); |
499 | } | 502 | } |
500 | 503 | ||
504 | #ifdef CONFIG_KEXEC | ||
505 | void machine_crash_shutdown(struct pt_regs *regs) | ||
506 | { | ||
507 | machine_ops.crash_shutdown(regs); | ||
508 | } | ||
509 | #endif | ||
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 44cc9b933932..2283422af794 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/pfn.h> | 47 | #include <linux/pfn.h> |
48 | #include <linux/pci.h> | 48 | #include <linux/pci.h> |
49 | #include <linux/init_ohci1394_dma.h> | 49 | #include <linux/init_ohci1394_dma.h> |
50 | #include <linux/kvm_para.h> | ||
50 | 51 | ||
51 | #include <video/edid.h> | 52 | #include <video/edid.h> |
52 | 53 | ||
@@ -820,6 +821,10 @@ void __init setup_arch(char **cmdline_p) | |||
820 | 821 | ||
821 | max_low_pfn = setup_memory(); | 822 | max_low_pfn = setup_memory(); |
822 | 823 | ||
824 | #ifdef CONFIG_KVM_CLOCK | ||
825 | kvmclock_init(); | ||
826 | #endif | ||
827 | |||
823 | #ifdef CONFIG_VMI | 828 | #ifdef CONFIG_VMI |
824 | /* | 829 | /* |
825 | * Must be after max_low_pfn is determined, and before kernel | 830 | * Must be after max_low_pfn is determined, and before kernel |
@@ -827,6 +832,7 @@ void __init setup_arch(char **cmdline_p) | |||
827 | */ | 832 | */ |
828 | vmi_init(); | 833 | vmi_init(); |
829 | #endif | 834 | #endif |
835 | kvm_guest_init(); | ||
830 | 836 | ||
831 | /* | 837 | /* |
832 | * NOTE: before this point _nobody_ is allowed to allocate | 838 | * NOTE: before this point _nobody_ is allowed to allocate |
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 60e64c8eee92..a94fb959a87a 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/ctype.h> | 42 | #include <linux/ctype.h> |
43 | #include <linux/uaccess.h> | 43 | #include <linux/uaccess.h> |
44 | #include <linux/init_ohci1394_dma.h> | 44 | #include <linux/init_ohci1394_dma.h> |
45 | #include <linux/kvm_para.h> | ||
45 | 46 | ||
46 | #include <asm/mtrr.h> | 47 | #include <asm/mtrr.h> |
47 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
@@ -384,6 +385,10 @@ void __init setup_arch(char **cmdline_p) | |||
384 | 385 | ||
385 | io_delay_init(); | 386 | io_delay_init(); |
386 | 387 | ||
388 | #ifdef CONFIG_KVM_CLOCK | ||
389 | kvmclock_init(); | ||
390 | #endif | ||
391 | |||
387 | #ifdef CONFIG_SMP | 392 | #ifdef CONFIG_SMP |
388 | /* setup to use the early static init tables during kernel startup */ | 393 | /* setup to use the early static init tables during kernel startup */ |
389 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; | 394 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; |
@@ -488,6 +493,8 @@ void __init setup_arch(char **cmdline_p) | |||
488 | init_apic_mappings(); | 493 | init_apic_mappings(); |
489 | ioapic_init_mappings(); | 494 | ioapic_init_mappings(); |
490 | 495 | ||
496 | kvm_guest_init(); | ||
497 | |||
491 | /* | 498 | /* |
492 | * We trust e820 completely. No explicit ROM probing in memory. | 499 | * We trust e820 completely. No explicit ROM probing in memory. |
493 | */ | 500 | */ |