Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (147 commits) KVM: kill file->f_count abuse in kvm KVM: MMU: kvm_pv_mmu_op should not take mmap_sem KVM: SVM: remove selective CR0 comment KVM: SVM: remove now obsolete FIXME comment KVM: SVM: disable CR8 intercept when tpr is not masking interrupts KVM: SVM: sync V_TPR with LAPIC.TPR if CR8 write intercept is disabled KVM: export kvm_lapic_set_tpr() to modules KVM: SVM: sync TPR value to V_TPR field in the VMCB KVM: ppc: PowerPC 440 KVM implementation KVM: Add MAINTAINERS entry for PowerPC KVM KVM: ppc: Add DCR access information to struct kvm_run ppc: Export tlb_44x_hwater for KVM KVM: Rename debugfs_dir to kvm_debugfs_dir KVM: x86 emulator: fix lea to really get the effective address KVM: x86 emulator: fix smsw and lmsw with a memory operand KVM: x86 emulator: initialize src.val and dst.val for register operands KVM: SVM: force a new asid when initializing the vmcb KVM: fix kvm_vcpu_kick vs __vcpu_run race KVM: add ioctls to save/store mpstate KVM: Rename VCPU_MP_STATE_* to KVM_MP_STATE_* ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-27 13:13:52 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-27 13:13:52 -0400
commit: 42cadc86008aae0fd9ff31642dc01ed50723cf32 (patch)
tree: b05d4c8f0561bad5a0183a89fb23ce4c8ee1653c
parent: fba5c1af5c4fd6645fe62ea84ccde0981282cf66 (diff)
parent: 66c0b394f08fd89236515c1c84485ea712a157be (diff)
119 files changed, 23723 insertions, 638 deletions
diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt
new file mode 100644
index 00000000000..bec9d815da3
--- /dev/null
+++ b/Documentation/ia64/kvm.txt
@@ -0,0 +1,82 @@
+Currently, the kvm module is in the EXPERIMENTAL stage on IA64. This means that
+its interfaces are not yet stable enough to rely on. So, please do not run
+critical applications in a virtual machine. We will try our best to make it
+more robust in future versions!
+                                Guide: How to boot up guests on kvm/ia64
+This guide is to describe how to enable kvm support for IA-64 systems.
+1. Get the kvm source from git.kernel.org.
+        Userspace source:
+                git clone git://git.kernel.org/pub/scm/virt/kvm/kvm-userspace.git
+        Kernel Source:
+                git clone git://git.kernel.org/pub/scm/linux/kernel/git/xiantao/kvm-ia64.git
+2. Compile the source code.
+        2.1 Compile userspace code:
+                (1)cd ./kvm-userspace
+                (2)./configure
+                (3)cd kernel
+                (4)make sync LINUX=$kernel_dir (kernel_dir is the directory of the kernel source.)
+                (5)cd ..
+                (6)make qemu
+                (7)cd qemu; make install
+        2.2 Compile kernel source code:
+                (1) cd ./$kernel_dir
+                (2) make menuconfig
+                (3) Enter into virtualization option, and choose kvm.
+                (4) make
+                (5) Once (4) done, make modules_install
+                (6) Make an initrd, and reboot the host machine with the new kernel.
+                (7) Once (6) done, cd $kernel_dir/arch/ia64/kvm
+                (8) insmod kvm.ko; insmod kvm-intel.ko
+Note: For step 2, please make sure that the host page size == TARGET_PAGE_SIZE of qemu; otherwise, it may fail.
+3. Get Guest Firmware named as Flash.fd, and put it under right place:
+        (1) If you have the guest firmware (binary) released by Intel Corp for Xen, use it directly.
+        (2) If you have no firmware at hand, please download its source from
+                hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg
+            you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries.
+        (3) Rename the firmware you obtained to Flash.fd, and copy it to /usr/local/share/qemu
+4. Boot up Linux or Windows guests:
+        4.1 Create or install an image for guest boot. If you have xen experience, it should be easy.
+        4.2 Boot up guests using the following command.
+                /usr/local/bin/qemu-system-ia64 -smp xx -m 512 -hda $your_image
+                (xx is the number of virtual processors for the guest, now the maximum value is 4)
+5. Known possible issue on some platforms with old Firmware.
+If you meet strange host crash issues, try to solve them in either of the following ways:
+(1): Upgrade your Firmware to the latest one.
+(2): Apply the patch below to the kernel source.
+diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
+index 0b53344..f02b0f7 100644
+--- a/arch/ia64/kernel/pal.S
+++ b/arch/ia64/kernel/pal.S
+@@ -84,7 +84,8 @@ GLOBAL_ENTRY(ia64_pal_call_static)
+        mov ar.pfs = loc1
+        mov rp = loc0
+        ;;
+-       srlz.d                          // seralize restoration of psr.l
++       srlz.i                  // seralize restoration of psr.l
++       ;;
+        br.ret.sptk.many b0
+ END(ia64_pal_call_static)
+6. Bug report:
+        If you find any issues when using kvm/ia64, please post the bug info to the kvm-ia64-devel mailing list.
+        https://lists.sourceforge.net/lists/listinfo/kvm-ia64-devel/
+Thanks for your interest! Let's work together, and make kvm/ia64 stronger and stronger!
+                                                                Xiantao Zhang <xiantao.zhang@intel.com>
+                                                                                        2008.3.10
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index c18363bd8d1..240ce7a56c4 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -183,6 +183,8 @@ Code	Seq#	Include File		Comments
 0xAC    00-1F   linux/raw.h
 0xAD    00      Netfilter device        in development:
                                        <mailto:rusty@rustcorp.com.au>  
+0xAE    all     linux/kvm.h             Kernel-based Virtual Machine
+                                        <mailto:kvm-devel@lists.sourceforge.net>
 0xB0    all     RATIO devices           in development:
                                        <mailto:vgo@ratio.de>
 0xB1    00-1F   PPPoX                   <mailto:mostrows@styx.uwaterloo.ca>
diff --git a/Documentation/powerpc/kvm_440.txt b/Documentation/powerpc/kvm_440.txt
new file mode 100644
index 00000000000..c02a003fa03
--- /dev/null
+++ b/Documentation/powerpc/kvm_440.txt
@@ -0,0 +1,41 @@
+Hollis Blanchard <hollisb@us.ibm.com>
+15 Apr 2008
+Various notes on the implementation of KVM for PowerPC 440:
+To enforce isolation, host userspace, guest kernel, and guest userspace all
+run at user privilege level. Only the host kernel runs in supervisor mode.
+Executing privileged instructions in the guest traps into KVM (in the host
+kernel), where we decode and emulate them. Through this technique, unmodified
+440 Linux kernels can be run (slowly) as guests. Future performance work will
+focus on reducing the overhead and frequency of these traps.
+The usual code flow is started from userspace invoking a "run" ioctl, which
+causes KVM to switch into guest context. We use IVPR to hijack the host
+interrupt vectors while running the guest, which allows us to direct all
+interrupts to kvmppc_handle_interrupt(). At this point, we could either
+- handle the interrupt completely (e.g. emulate "mtspr SPRG0"), or
+- let the host interrupt handler run (e.g. when the decrementer fires), or
+- return to host userspace (e.g. when the guest performs device MMIO)
+Address spaces: We take advantage of the fact that Linux doesn't use the AS=1
+address space (in host or guest), which gives us virtual address space to use
+for guest mappings. While the guest is running, the host kernel remains mapped
+in AS=0, but the guest can only use AS=1 mappings.
+TLB entries: The TLB entries covering the host linear mapping remain
+present while running the guest. This reduces the overhead of lightweight
+exits, which are handled by KVM running in the host kernel. We keep three
+copies of the TLB:
+ - guest TLB: contents of the TLB as the guest sees it
+ - shadow TLB: the TLB that is actually in hardware while guest is running
+ - host TLB: to restore TLB state when context switching guest -> host
+When a TLB miss occurs because a mapping was not present in the shadow TLB,
+but was present in the guest TLB, KVM handles the fault without invoking the
+guest. Large guest pages are backed by multiple 4KB shadow pages through this
+mechanism.
+IO: MMIO and DCR accesses are emulated by userspace. We use virtio for network
+and block IO, so those drivers must be enabled in the guest. It's possible
+that some qemu device emulation (e.g. e1000 or rtl8139) may also work with
+little effort.
diff --git a/Documentation/s390/kvm.txt b/Documentation/s390/kvm.txt
new file mode 100644
index 00000000000..6f5ceb0f09f
--- /dev/null
+++ b/Documentation/s390/kvm.txt
@@ -0,0 +1,125 @@
+*** BIG FAT WARNING ***
+The kvm module is currently in EXPERIMENTAL state for s390. This means that
+the interface to the module is not yet considered to remain stable. Thus, be
+prepared that we keep breaking your userspace application and guest
+compatibility over and over again until we feel happy with the result. Make sure
+your guest kernel, your host kernel, and your userspace launcher are in a
+consistent state.
+This Documentation describes the unique ioctl calls to /dev/kvm, the resulting
+kvm-vm file descriptors, and the kvm-vcpu file descriptors that differ from x86.
+1. ioctl calls to /dev/kvm
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_GET_API_VERSION
+KVM_CREATE_VM           (*) see note
+KVM_CHECK_EXTENSION
+KVM_GET_VCPU_MMAP_SIZE
+Notes:
+* KVM_CREATE_VM may fail on s390, if the calling process has multiple
+threads and has not called KVM_S390_ENABLE_SIE before.
+In addition, on s390 the following architecture specific ioctls are supported:
+ioctl:          KVM_S390_ENABLE_SIE
+args:           none
+see also:       include/linux/kvm.h
+This call causes the kernel to switch on PGSTE in the user page table. This
+operation is needed in order to run a virtual machine, and it requires the
+calling process to be single-threaded. Note that the first call to KVM_CREATE_VM
+will implicitly try to switch on PGSTE if the user process has not called
+KVM_S390_ENABLE_SIE before. User processes that want to launch multiple threads
+before creating a virtual machine have to call KVM_S390_ENABLE_SIE, or will
+observe an error calling KVM_CREATE_VM. Switching on PGSTE is a one-time
+operation, is not reversible, and will persist over the entire lifetime of
+the calling process. It does not have any user-visible effect other than a small
+performance penalty.
+2. ioctl calls to the kvm-vm file descriptor
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_CREATE_VCPU
+KVM_SET_USER_MEMORY_REGION      (*) see note
+KVM_GET_DIRTY_LOG               (**) see note
+Notes:
+*  kvm does only allow exactly one memory slot on s390, which has to start
+   at guest absolute address zero and at a user address that is aligned on any
+   page boundary. This hardware "limitation" allows us to have a few unique
+   optimizations. The memory slot doesn't have to be filled
+   with memory actually, it may contain sparse holes. That said, with different
+   user memory layout this does still allow a large flexibility when
+   doing the guest memory setup.
+** KVM_GET_DIRTY_LOG doesn't work properly yet. The user will receive an empty
+log. This ioctl call is only needed for guest migration, and we intend to
+implement this one in the future.
+In addition, on s390 the following architecture specific ioctls for the kvm-vm
+file descriptor are supported:
+ioctl:          KVM_S390_INTERRUPT
+args:           struct kvm_s390_interrupt *
+see also:       include/linux/kvm.h
+This ioctl is used to submit a floating interrupt for a virtual machine.
+Floating interrupts may be delivered to any virtual cpu in the configuration.
+Only some interrupt types defined in include/linux/kvm.h make sense when
+submitted as floating interrupts. The following interrupts are not considered
+to be useful as floating interrupts, and a call to inject them will result in
+-EINVAL error code: program interrupts and interprocessor signals. Valid
+floating interrupts are:
+KVM_S390_INT_VIRTIO
+KVM_S390_INT_SERVICE
+3. ioctl calls to the kvm-vcpu file descriptor
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_RUN
+KVM_GET_REGS
+KVM_SET_REGS
+KVM_GET_SREGS
+KVM_SET_SREGS
+KVM_GET_FPU
+KVM_SET_FPU
+In addition, on s390 the following architecture specific ioctls for the
+kvm-vcpu file descriptor are supported:
+ioctl:          KVM_S390_INTERRUPT
+args:           struct kvm_s390_interrupt *
+see also:       include/linux/kvm.h
+This ioctl is used to submit an interrupt for a specific virtual cpu.
+Only some interrupt types defined in include/linux/kvm.h make sense when
+submitted for a specific cpu. The following interrupts are not considered
+to be useful, and a call to inject them will result in -EINVAL error code:
+service processor calls and virtio interrupts. Valid interrupt types are:
+KVM_S390_PROGRAM_INT
+KVM_S390_SIGP_STOP
+KVM_S390_RESTART
+KVM_S390_SIGP_SET_PREFIX
+KVM_S390_INT_EMERGENCY
+ioctl:          KVM_S390_STORE_STATUS
+args:           unsigned long
+see also:       include/linux/kvm.h
+This ioctl stores the state of the cpu at the guest real address given as
+argument, unless one of the following values defined in include/linux/kvm.h
+is given as argument:
+KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in
+absolute lowcore as defined by the principles of operation
+KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in
+its prefix page just like the dump tool that comes with zipl. This is useful
+to create a system dump for use with lkcdutils or crash.
+ioctl:          KVM_S390_SET_INITIAL_PSW
+args:           struct kvm_s390_psw *
+see also:       include/linux/kvm.h
+This ioctl can be used to set the processor status word (psw) of a stopped cpu
+prior to running it with KVM_RUN. Note that this call is not required to modify
+the psw during sie intercepts that fall back to userspace because struct kvm_run
+does contain the psw, and this value is evaluated during reentry of KVM_RUN
+after the intercept exit was recognized.
+ioctl:          KVM_S390_INITIAL_RESET
+args:           none
+see also:       include/linux/kvm.h
+This ioctl can be used to perform an initial cpu reset as defined by the
+principles of operation. The target cpu has to be in stopped state.
diff --git a/MAINTAINERS b/MAINTAINERS
index a942f385249..c1dd1ae7b13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2329,6 +2329,13 @@ L:	kvm-devel@lists.sourceforge.net
 W:      kvm.sourceforge.net
 S:      Supported
+KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
+P:      Hollis Blanchard
+M:      hollisb@us.ibm.com
+L:      kvm-ppc-devel@lists.sourceforge.net
+W:      kvm.sourceforge.net
+S:      Supported
 KERNEL VIRTUAL MACHINE For Itanium(KVM/IA64)
 P:      Anthony Xu
 M:      anthony.xu@intel.com
@@ -2338,6 +2345,16 @@ L:	kvm-ia64-devel@lists.sourceforge.net
 W:      kvm.sourceforge.net
 S:      Supported
+KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
+P:      Carsten Otte
+M:      cotte@de.ibm.com
+P:      Christian Borntraeger
+M:      borntraeger@de.ibm.com
+M:      linux390@de.ibm.com
+L:      linux-s390@vger.kernel.org
+W:      http://www.ibm.com/developerworks/linux/linux390/
+S:      Supported
 KEXEC
 P:      Eric Biederman
 M:      ebiederm@xmission.com
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index cd13e138bd0..3aa6c821449 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -19,6 +19,7 @@ config IA64
        select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
+        select HAVE_KVM
        default y
        help
          The Itanium Processor Family is Intel's 64-bit successor to
@@ -589,6 +590,8 @@ config MSPEC
 source "fs/Kconfig"
+source "arch/ia64/kvm/Kconfig"
 source "lib/Kconfig"
 #
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index f1645c4f703..ec4cca477f4 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -57,6 +57,7 @@ core-$(CONFIG_IA64_GENERIC) 	+= arch/ia64/dig/
 core-$(CONFIG_IA64_HP_ZX1)      += arch/ia64/dig/
 core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
 core-$(CONFIG_IA64_SGI_SN2)     += arch/ia64/sn/
+core-$(CONFIG_KVM)              += arch/ia64/kvm/
 drivers-$(CONFIG_PCI)           += arch/ia64/pci/
 drivers-$(CONFIG_IA64_HP_SIM)   += arch/ia64/hp/sim/
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
new file mode 100644
index 00000000000..7914e482850
--- /dev/null
+++ b/arch/ia64/kvm/Kconfig
@@ -0,0 +1,49 @@
+#
+# KVM configuration
+#
+config HAVE_KVM
+        bool
+menuconfig VIRTUALIZATION
+        bool "Virtualization"
+        depends on HAVE_KVM || IA64
+        default y
+        ---help---
+          Say Y here to get to see options for using your Linux host to run other
+          operating systems inside virtual machines (guests).
+          This option alone does not add any kernel code.
+          If you say N, all options in this submenu will be skipped and disabled.
+if VIRTUALIZATION
+config KVM
+        tristate "Kernel-based Virtual Machine (KVM) support"
+        depends on HAVE_KVM && EXPERIMENTAL
+        select PREEMPT_NOTIFIERS
+        select ANON_INODES
+        ---help---
+          Support hosting fully virtualized guest machines using hardware
+          virtualization extensions.  You will need a fairly recent
+          processor equipped with virtualization extensions. You will also
+          need to select one or more of the processor modules below.
+          This module provides access to the hardware capabilities through
+          a character device node named /dev/kvm.
+          To compile this as a module, choose M here: the module
+          will be called kvm.
+          If unsure, say N.
+config KVM_INTEL
+        tristate "KVM for Intel Itanium 2 processors support"
+        depends on KVM && m
+        ---help---
+          Provides support for KVM on Itanium 2 processors equipped with the VT
+          extensions.
+config KVM_TRACE
+       bool
+endif # VIRTUALIZATION
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
new file mode 100644
index 00000000000..41b034ffa73
--- /dev/null
+++ b/arch/ia64/kvm/Makefile
@@ -0,0 +1,61 @@
+#This Make file is to generate asm-offsets.h and build source.
+#
+#Generate asm-offsets.h for vmm module build
+offsets-file := asm-offsets.h
+always  := $(offsets-file)
+targets := $(offsets-file)
+targets += arch/ia64/kvm/asm-offsets.s
+clean-files := $(addprefix $(objtree)/,$(targets) $(obj)/memcpy.S $(obj)/memset.S)
+# Default sed regexp - multiline due to syntax constraints
+define sed-y
+        "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"
+endef
+quiet_cmd_offsets = GEN     $@
+define cmd_offsets
+        (set -e; \
+         echo "#ifndef __ASM_KVM_OFFSETS_H__"; \
+         echo "#define __ASM_KVM_OFFSETS_H__"; \
+         echo "/*"; \
+         echo " * DO NOT MODIFY."; \
+         echo " *"; \
+         echo " * This file was generated by Makefile"; \
+         echo " *"; \
+         echo " */"; \
+         echo ""; \
+         sed -ne $(sed-y) $<; \
+         echo ""; \
+         echo "#endif" ) > $@
+endef
+# We use internal rules to avoid the "is up to date" message from make
+arch/ia64/kvm/asm-offsets.s: arch/ia64/kvm/asm-offsets.c
+        $(call if_changed_dep,cc_s_c)
+$(obj)/$(offsets-file): arch/ia64/kvm/asm-offsets.s
+        $(call cmd,offsets)
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
+$(addprefix $(objtree)/,$(obj)/memcpy.S $(obj)/memset.S):
+        $(shell ln -snf ../lib/memcpy.S $(src)/memcpy.S)
+        $(shell ln -snf ../lib/memset.S $(src)/memset.S)
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
+obj-$(CONFIG_KVM) += kvm.o
+FORCE : $(obj)/$(offsets-file)
+EXTRA_CFLAGS_vcpu.o += -mfixed-range=f2-f5,f12-f127
+kvm-intel-objs = vmm.o vmm_ivt.o trampoline.o vcpu.o optvfault.o mmio.o \
+        vtlb.o process.o
+#Add link memcpy and memset to avoid possible structure assignment error
+kvm-intel-objs += memset.o memcpy.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/ia64/kvm/asm-offsets.c b/arch/ia64/kvm/asm-offsets.c
new file mode 100644
index 00000000000..4e3dc13a619
--- /dev/null
+++ b/arch/ia64/kvm/asm-offsets.c
@@ -0,0 +1,251 @@
+/*
+ * asm-offsets.c Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ *
+ * Anthony Xu    <anthony.xu@intel.com>
+ * Xiantao Zhang <xiantao.zhang@intel.com>
+ * Copyright (c) 2007 Intel Corporation  KVM support.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <linux/autoconf.h>
+#include <linux/kvm_host.h>
+#include "vcpu.h"
+/* NOTE(review): aliases task_struct to kvm_vcpu for the rest of this file; the reason is not visible here -- confirm. */
+#define task_struct kvm_vcpu
+/*
+ * DEFINE(sym, val) plants a "->sym (value) val-expression" marker line in the
+ * compiler's assembly output.  val is passed with the "i" constraint, so it
+ * must be a compile-time constant.  The build post-processes these marker
+ * lines into "#define sym value" entries of the generated offsets header.
+ */
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " (%0) " #val : : "i" (val))
+/* BLANK() emits a bare "->" marker line (no symbol, no value). */
+#define BLANK() asm volatile("\n->" : :)
+/*
+ * OFFSET(sym, type, member) is shorthand for DEFINE(sym, offsetof(type, member)).
+ * The expansion deliberately carries no trailing ';' so that "OFFSET(...);"
+ * is exactly one statement, like DEFINE() and BLANK().
+ */
+#define OFFSET(_sym, _str, _mem) \
+        DEFINE(_sym, offsetof(_str, _mem))
+/*
+ * Carrier for the DEFINE()/BLANK() markers below.  Each DEFINE() plants a
+ * "->SYMBOL value" line in the compiler's assembly output, which the build
+ * post-processes into #define lines of the generated offsets header.
+ * Every symbol name is emitted exactly once so the generated header contains
+ * no repeated definitions.
+ */
+void foo(void)
+{
+        DEFINE(VMM_TASK_SIZE, sizeof(struct kvm_vcpu));
+        DEFINE(VMM_PT_REGS_SIZE, sizeof(struct kvm_pt_regs));
+        BLANK();
+        DEFINE(VMM_VCPU_META_RR0_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.metaphysical_rr0));
+        DEFINE(VMM_VCPU_META_SAVED_RR0_OFFSET,
+                        offsetof(struct kvm_vcpu,
+                                arch.metaphysical_saved_rr0));
+        DEFINE(VMM_VCPU_VRR0_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.vrr[0]));
+        DEFINE(VMM_VPD_IRR0_OFFSET,
+                        offsetof(struct vpd, irr[0]));
+        DEFINE(VMM_VCPU_ITC_CHECK_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.itc_check));
+        DEFINE(VMM_VCPU_IRQ_CHECK_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.irq_check));
+        DEFINE(VMM_VPD_VHPI_OFFSET,
+                        offsetof(struct vpd, vhpi));
+        DEFINE(VMM_VCPU_VSA_BASE_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.vsa_base));
+        DEFINE(VMM_VCPU_VPD_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.vpd));
+        DEFINE(VMM_VCPU_IRQ_CHECK,
+                        offsetof(struct kvm_vcpu, arch.irq_check));
+        DEFINE(VMM_VCPU_TIMER_PENDING,
+                        offsetof(struct kvm_vcpu, arch.timer_pending));
+        DEFINE(VMM_VCPU_MODE_FLAGS_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.mode_flags));
+        DEFINE(VMM_VCPU_ITC_OFS_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.itc_offset));
+        DEFINE(VMM_VCPU_LAST_ITC_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.last_itc));
+        DEFINE(VMM_VCPU_SAVED_GP_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.saved_gp));
+        BLANK();
+        DEFINE(VMM_PT_REGS_B6_OFFSET,
+                                offsetof(struct kvm_pt_regs, b6));
+        DEFINE(VMM_PT_REGS_B7_OFFSET,
+                                offsetof(struct kvm_pt_regs, b7));
+        DEFINE(VMM_PT_REGS_AR_CSD_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_csd));
+        DEFINE(VMM_PT_REGS_AR_SSD_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_ssd));
+        DEFINE(VMM_PT_REGS_R8_OFFSET,
+                                offsetof(struct kvm_pt_regs, r8));
+        DEFINE(VMM_PT_REGS_R9_OFFSET,
+                                offsetof(struct kvm_pt_regs, r9));
+        DEFINE(VMM_PT_REGS_R10_OFFSET,
+                                offsetof(struct kvm_pt_regs, r10));
+        DEFINE(VMM_PT_REGS_R11_OFFSET,
+                                offsetof(struct kvm_pt_regs, r11));
+        DEFINE(VMM_PT_REGS_CR_IPSR_OFFSET,
+                                offsetof(struct kvm_pt_regs, cr_ipsr));
+        DEFINE(VMM_PT_REGS_CR_IIP_OFFSET,
+                                offsetof(struct kvm_pt_regs, cr_iip));
+        DEFINE(VMM_PT_REGS_CR_IFS_OFFSET,
+                                offsetof(struct kvm_pt_regs, cr_ifs));
+        DEFINE(VMM_PT_REGS_AR_UNAT_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_unat));
+        DEFINE(VMM_PT_REGS_AR_PFS_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_pfs));
+        DEFINE(VMM_PT_REGS_AR_RSC_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_rsc));
+        DEFINE(VMM_PT_REGS_AR_RNAT_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_rnat));
+        DEFINE(VMM_PT_REGS_AR_BSPSTORE_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_bspstore));
+        DEFINE(VMM_PT_REGS_PR_OFFSET,
+                                offsetof(struct kvm_pt_regs, pr));
+        DEFINE(VMM_PT_REGS_B0_OFFSET,
+                                offsetof(struct kvm_pt_regs, b0));
+        DEFINE(VMM_PT_REGS_LOADRS_OFFSET,
+                                offsetof(struct kvm_pt_regs, loadrs));
+        DEFINE(VMM_PT_REGS_R1_OFFSET,
+                                offsetof(struct kvm_pt_regs, r1));
+        DEFINE(VMM_PT_REGS_R12_OFFSET,
+                                offsetof(struct kvm_pt_regs, r12));
+        DEFINE(VMM_PT_REGS_R13_OFFSET,
+                                offsetof(struct kvm_pt_regs, r13));
+        DEFINE(VMM_PT_REGS_AR_FPSR_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_fpsr));
+        DEFINE(VMM_PT_REGS_R15_OFFSET,
+                                offsetof(struct kvm_pt_regs, r15));
+        DEFINE(VMM_PT_REGS_R14_OFFSET,
+                                offsetof(struct kvm_pt_regs, r14));
+        DEFINE(VMM_PT_REGS_R2_OFFSET,
+                                offsetof(struct kvm_pt_regs, r2));
+        DEFINE(VMM_PT_REGS_R3_OFFSET,
+                                offsetof(struct kvm_pt_regs, r3));
+        DEFINE(VMM_PT_REGS_R16_OFFSET,
+                                offsetof(struct kvm_pt_regs, r16));
+        DEFINE(VMM_PT_REGS_R17_OFFSET,
+                                offsetof(struct kvm_pt_regs, r17));
+        DEFINE(VMM_PT_REGS_R18_OFFSET,
+                                offsetof(struct kvm_pt_regs, r18));
+        DEFINE(VMM_PT_REGS_R19_OFFSET,
+                                offsetof(struct kvm_pt_regs, r19));
+        DEFINE(VMM_PT_REGS_R20_OFFSET,
+                                offsetof(struct kvm_pt_regs, r20));
+        DEFINE(VMM_PT_REGS_R21_OFFSET,
+                                offsetof(struct kvm_pt_regs, r21));
+        DEFINE(VMM_PT_REGS_R22_OFFSET,
+                                offsetof(struct kvm_pt_regs, r22));
+        DEFINE(VMM_PT_REGS_R23_OFFSET,
+                                offsetof(struct kvm_pt_regs, r23));
+        DEFINE(VMM_PT_REGS_R24_OFFSET,
+                                offsetof(struct kvm_pt_regs, r24));
+        DEFINE(VMM_PT_REGS_R25_OFFSET,
+                                offsetof(struct kvm_pt_regs, r25));
+        DEFINE(VMM_PT_REGS_R26_OFFSET,
+                                offsetof(struct kvm_pt_regs, r26));
+        DEFINE(VMM_PT_REGS_R27_OFFSET,
+                                offsetof(struct kvm_pt_regs, r27));
+        DEFINE(VMM_PT_REGS_R28_OFFSET,
+                                offsetof(struct kvm_pt_regs, r28));
+        DEFINE(VMM_PT_REGS_R29_OFFSET,
+                                offsetof(struct kvm_pt_regs, r29));
+        DEFINE(VMM_PT_REGS_R30_OFFSET,
+                                offsetof(struct kvm_pt_regs, r30));
+        DEFINE(VMM_PT_REGS_R31_OFFSET,
+                                offsetof(struct kvm_pt_regs, r31));
+        DEFINE(VMM_PT_REGS_AR_CCV_OFFSET,
+                                offsetof(struct kvm_pt_regs, ar_ccv));
+        DEFINE(VMM_PT_REGS_F6_OFFSET,
+                                offsetof(struct kvm_pt_regs, f6));
+        DEFINE(VMM_PT_REGS_F7_OFFSET,
+                                offsetof(struct kvm_pt_regs, f7));
+        DEFINE(VMM_PT_REGS_F8_OFFSET,
+                                offsetof(struct kvm_pt_regs, f8));
+        DEFINE(VMM_PT_REGS_F9_OFFSET,
+                                offsetof(struct kvm_pt_regs, f9));
+        DEFINE(VMM_PT_REGS_F10_OFFSET,
+                                offsetof(struct kvm_pt_regs, f10));
+        DEFINE(VMM_PT_REGS_F11_OFFSET,
+                                offsetof(struct kvm_pt_regs, f11));
+        DEFINE(VMM_PT_REGS_R4_OFFSET,
+                                offsetof(struct kvm_pt_regs, r4));
+        DEFINE(VMM_PT_REGS_R5_OFFSET,
+                                offsetof(struct kvm_pt_regs, r5));
+        DEFINE(VMM_PT_REGS_R6_OFFSET,
+                                offsetof(struct kvm_pt_regs, r6));
+        DEFINE(VMM_PT_REGS_R7_OFFSET,
+                                offsetof(struct kvm_pt_regs, r7));
+        DEFINE(VMM_PT_REGS_EML_UNAT_OFFSET,
+                                offsetof(struct kvm_pt_regs, eml_unat));
+        DEFINE(VMM_VCPU_IIPA_OFFSET,
+                                offsetof(struct kvm_vcpu, arch.cr_iipa));
+        DEFINE(VMM_VCPU_OPCODE_OFFSET,
+                                offsetof(struct kvm_vcpu, arch.opcode));
+        DEFINE(VMM_VCPU_CAUSE_OFFSET, offsetof(struct kvm_vcpu, arch.cause));
+        DEFINE(VMM_VCPU_ISR_OFFSET,
+                                offsetof(struct kvm_vcpu, arch.cr_isr));
+        /* Slot index derived from r16's distance to the end of kvm_pt_regs, in 8-byte units, masked to 6 bits. */
+        DEFINE(VMM_PT_REGS_R16_SLOT,
+                                (((offsetof(struct kvm_pt_regs, r16)
+                                - sizeof(struct kvm_pt_regs)) >> 3) & 0x3f));
+        DEFINE(VMM_VCPU_GP_OFFSET, offsetof(struct kvm_vcpu, arch.__gp));
+        BLANK();
+        DEFINE(VMM_VPD_BASE_OFFSET, offsetof(struct kvm_vcpu, arch.vpd));
+        DEFINE(VMM_VPD_VIFS_OFFSET, offsetof(struct vpd, ifs));
+        DEFINE(VMM_VLSAPIC_INSVC_BASE_OFFSET,
+                        offsetof(struct kvm_vcpu, arch.insvc[0]));
+        DEFINE(VMM_VPD_VPTA_OFFSET, offsetof(struct vpd, pta));
+        DEFINE(VMM_VPD_VPSR_OFFSET, offsetof(struct vpd, vpsr));
+        DEFINE(VMM_CTX_R4_OFFSET, offsetof(union context, gr[4]));
+        DEFINE(VMM_CTX_R5_OFFSET, offsetof(union context, gr[5]));
+        DEFINE(VMM_CTX_R12_OFFSET, offsetof(union context, gr[12]));
+        DEFINE(VMM_CTX_R13_OFFSET, offsetof(union context, gr[13]));
+        DEFINE(VMM_CTX_KR0_OFFSET, offsetof(union context, ar[0]));
+        DEFINE(VMM_CTX_KR1_OFFSET, offsetof(union context, ar[1]));
+        DEFINE(VMM_CTX_B0_OFFSET, offsetof(union context, br[0]));
+        DEFINE(VMM_CTX_B1_OFFSET, offsetof(union context, br[1]));
+        DEFINE(VMM_CTX_B2_OFFSET, offsetof(union context, br[2]));
+        DEFINE(VMM_CTX_RR0_OFFSET, offsetof(union context, rr[0]));
+        DEFINE(VMM_CTX_RSC_OFFSET, offsetof(union context, ar[16]));
+        DEFINE(VMM_CTX_BSPSTORE_OFFSET, offsetof(union context, ar[18]));
+        DEFINE(VMM_CTX_RNAT_OFFSET, offsetof(union context, ar[19]));
+        DEFINE(VMM_CTX_FCR_OFFSET, offsetof(union context, ar[21]));
+        DEFINE(VMM_CTX_EFLAG_OFFSET, offsetof(union context, ar[24]));
+        DEFINE(VMM_CTX_CFLG_OFFSET, offsetof(union context, ar[27]));
+        DEFINE(VMM_CTX_FSR_OFFSET, offsetof(union context, ar[28]));
+        DEFINE(VMM_CTX_FIR_OFFSET, offsetof(union context, ar[29]));
+        DEFINE(VMM_CTX_FDR_OFFSET, offsetof(union context, ar[30]));
+        DEFINE(VMM_CTX_UNAT_OFFSET, offsetof(union context, ar[36]));
+        DEFINE(VMM_CTX_FPSR_OFFSET, offsetof(union context, ar[40]));
+        DEFINE(VMM_CTX_PFS_OFFSET, offsetof(union context, ar[64]));
+        DEFINE(VMM_CTX_LC_OFFSET, offsetof(union context, ar[65]));
+        DEFINE(VMM_CTX_DCR_OFFSET, offsetof(union context, cr[0]));
+        DEFINE(VMM_CTX_IVA_OFFSET, offsetof(union context, cr[2]));
+        DEFINE(VMM_CTX_PTA_OFFSET, offsetof(union context, cr[8]));
+        DEFINE(VMM_CTX_IBR0_OFFSET, offsetof(union context, ibr[0]));
+        DEFINE(VMM_CTX_DBR0_OFFSET, offsetof(union context, dbr[0]));
+        DEFINE(VMM_CTX_F2_OFFSET, offsetof(union context, fr[2]));
+        DEFINE(VMM_CTX_F3_OFFSET, offsetof(union context, fr[3]));
+        DEFINE(VMM_CTX_F32_OFFSET, offsetof(union context, fr[32]));
+        DEFINE(VMM_CTX_F33_OFFSET, offsetof(union context, fr[33]));
+        DEFINE(VMM_CTX_PKR0_OFFSET, offsetof(union context, pkr[0]));
+        DEFINE(VMM_CTX_PSR_OFFSET, offsetof(union context, psr));
+        BLANK();
+}
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
new file mode 100644
index 00000000000..6df07324013
--- /dev/null
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -0,0 +1,1806 @@
+/*
+ * kvm-ia64.c: Basic KVM support on Itanium series processors
+ *
+ *
+ *      Copyright (C) 2007, Intel Corporation.
+ *      Xiantao Zhang  (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/bitops.h>
+#include <linux/hrtimer.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/gcc_intrin.h>
+#include <asm/pal.h>
+#include <asm/cacheflush.h>
+#include <asm/div64.h>
+#include <asm/tlb.h>
+#include "misc.h"
+#include "vti.h"
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+/* Kernel address of the VMM area allocated by kvm_alloc_vmm_area(). */
+static unsigned long kvm_vmm_base;
+/* Base reported by the first successful ia64_pal_vp_init_env() call. */
+static unsigned long kvm_vsa_base;
+/* Lives VMM_SIZE bytes past kvm_vmm_base; its physical address goes to PAL. */
+static unsigned long kvm_vm_buffer;
+/* Compared with KVM_VM_BUFFER_SIZE before the VMM area is allocated. */
+static unsigned long kvm_vm_buffer_size;
+unsigned long kvm_vmm_gp;
+static long vp_env_info;
+/* Describes the VMM; its tramp_entry and vmm_ivt members are used below. */
+static struct kvm_vmm_info *kvm_vmm_info;
+/* Per cpu: the vcpu recorded by kvm_vcpu_pre_transition() on that cpu. */
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+/* The table holds a single NULL entry, i.e. no statistics are listed. */
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+        { NULL }
+};
+/*
+ * An (ip, gp) pair.
+ * NOTE(review): presumably an ia64 function descriptor (entry address plus
+ * global pointer); it is not used in this part of the file - confirm.
+ */
+struct fdesc{
+    unsigned long ip;
+    unsigned long gp;
+};
+/*
+ * Issue ia64_fc() at every 32-byte step of [start, start + len + 32),
+ * followed by ia64_sync_i() and ia64_srlz_i().
+ *
+ * @start: first address to flush
+ * @len:   length of the range in bytes
+ */
+static void kvm_flush_icache(unsigned long start, unsigned long len)
+{
+        /* Same type as len: a signed int counter overflows for huge ranges. */
+        unsigned long l;
+        for (l = 0; l < (len + 32); l += 32)
+                ia64_fc(start + l);
+        ia64_sync_i();
+        ia64_srlz_i();
+}
+/*
+ * Walk the ptce_base/ptce_count/ptce_stride grid published in local_cpu_data
+ * and issue ia64_ptce() on every address of it. Interrupts are disabled for
+ * the duration of the walk and the result is serialized with ia64_srlz_i().
+ */
+static void kvm_flush_tlb_all(void)
+{
+        unsigned long i, j, count0, count1, stride0, stride1, addr;
+        /* local_irq_save() expects an unsigned long to store the flags in. */
+        unsigned long flags;
+        addr    = local_cpu_data->ptce_base;
+        count0  = local_cpu_data->ptce_count[0];
+        count1  = local_cpu_data->ptce_count[1];
+        stride0 = local_cpu_data->ptce_stride[0];
+        stride1 = local_cpu_data->ptce_stride[1];
+        local_irq_save(flags);
+        for (i = 0; i < count0; ++i) {
+                for (j = 0; j < count1; ++j) {
+                        ia64_ptce(addr);
+                        addr += stride1;
+                }
+                addr += stride0;
+        }
+        local_irq_restore(flags);
+        ia64_srlz_i();                  /* srlz.i implies srlz.d */
+}
+/*
+ * Invoke the PAL_VP_CREATE procedure through PAL_CALL_STK.
+ * @vpd, @host_iva and @opt_handler are handed to PAL as 64-bit integers.
+ * Returns the status member of the PAL return value.
+ */
+long ia64_pal_vp_create(u64 *vpd, u64 *host_iva, u64 *opt_handler)
+{
+        struct ia64_pal_retval iprv;
+        PAL_CALL_STK(iprv, PAL_VP_CREATE, (u64)vpd, (u64)host_iva,
+                        (u64)opt_handler);
+        return iprv.status;
+}
+/* Held around ia64_pal_vp_init_env() and the one-time kvm_vsa_base update. */
+static  DEFINE_SPINLOCK(vp_lock);
+/*
+ * Temporarily map the VMM area with a translation register, call
+ * ia64_pal_vp_init_env() and remove the mapping again. The first successful
+ * call is made with VP_INIT_ENV_INITALIZE and records the returned base in
+ * kvm_vsa_base; once that is set, VP_INIT_ENV is used instead.
+ *
+ * Every exit path restores the interrupt state, releases vp_lock and purges
+ * the translation register it inserted.
+ */
+void kvm_arch_hardware_enable(void *garbage)
+{
+        long  status;
+        long  tmp_base;
+        unsigned long pte;
+        unsigned long saved_psr;
+        int slot;
+        pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base),
+                                PAGE_KERNEL));
+        local_irq_save(saved_psr);
+        slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
+        /* Restore first, so a failed insert cannot leave interrupts off. */
+        local_irq_restore(saved_psr);
+        if (slot < 0)
+                return;
+        spin_lock(&vp_lock);
+        status = ia64_pal_vp_init_env(kvm_vsa_base ?
+                                VP_INIT_ENV : VP_INIT_ENV_INITALIZE,
+                        __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
+        if (status != 0) {
+                printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
+                goto out_unlock;
+        }
+        if (!kvm_vsa_base) {
+                kvm_vsa_base = tmp_base;
+                printk(KERN_INFO"kvm: kvm_vsa_base:0x%lx\n", kvm_vsa_base);
+        }
+out_unlock:
+        spin_unlock(&vp_lock);
+        ia64_ptr_entry(0x3, slot);
+}
+/*
+ * Temporarily map the VMM area with a translation register, call
+ * ia64_pal_vp_exit_env() with the current CR.IVA value and remove the
+ * mapping again. A PAL failure is only logged.
+ */
+void kvm_arch_hardware_disable(void *garbage)
+{
+        long status;
+        int slot;
+        unsigned long pte;
+        unsigned long saved_psr;
+        unsigned long host_iva = ia64_getreg(_IA64_REG_CR_IVA);
+        pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base),
+                                PAGE_KERNEL));
+        local_irq_save(saved_psr);
+        slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
+        /* Restore first, so a failed insert cannot leave interrupts off. */
+        local_irq_restore(saved_psr);
+        if (slot < 0)
+                return;
+        status = ia64_pal_vp_exit_env(host_iva);
+        if (status)
+                printk(KERN_DEBUG"kvm: Failed to disable VT support! :%ld\n",
+                                status);
+        ia64_ptr_entry(0x3, slot);
+}
+/* Store 0 into the int that @rtn points to. */
+void kvm_arch_check_processor_compat(void *rtn)
+{
+        int *result = rtn;
+        *result = 0;
+}
+/*
+ * Returns 1 for KVM_CAP_IRQCHIP and KVM_CAP_USER_MEMORY, 0 for every other
+ * extension number.
+ */
+int kvm_dev_ioctl_check_extension(long ext)
+{
+        switch (ext) {
+        case KVM_CAP_IRQCHIP:
+        case KVM_CAP_USER_MEMORY:
+                return 1;
+        default:
+                return 0;
+        }
+}
+/* Look @addr up on the mmio bus of the VM that owns @vcpu. */
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+                                        gpa_t addr)
+{
+        return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+/*
+ * Fill @kvm_run with KVM_EXIT_UNKNOWN and hardware exit reason 1.
+ * Returns 0, which makes __vcpu_run() stop re-entering the guest.
+ */
+static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        kvm_run->hw.hardware_exit_reason = 1;
+        kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+        return 0;
+}
+/*
+ * MMIO exit. An access whose page equals IOAPIC_DEFAULT_BASE_ADDRESS is
+ * served by the device found on the mmio bus and the function returns 1.
+ * Any other access is described in @kvm_run (KVM_EXIT_MMIO) and mirrored in
+ * the vcpu's mmio_* fields, and the function returns 0.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        struct kvm_mmio_req *req = kvm_get_vcpu_ioreq(vcpu);
+        struct kvm_io_device *dev;
+        if ((req->addr & PAGE_MASK) != IOAPIC_DEFAULT_BASE_ADDRESS) {
+                /* Not the IOAPIC page: record the request in kvm_run. */
+                vcpu->mmio_needed = 1;
+                vcpu->mmio_phys_addr = kvm_run->mmio.phys_addr = req->addr;
+                vcpu->mmio_size = kvm_run->mmio.len = req->size;
+                vcpu->mmio_is_write = kvm_run->mmio.is_write = !req->dir;
+                if (vcpu->mmio_is_write)
+                        memcpy(vcpu->mmio_data, &req->data, req->size);
+                memcpy(kvm_run->mmio.data, &req->data, req->size);
+                kvm_run->exit_reason = KVM_EXIT_MMIO;
+                return 0;
+        }
+        dev = vcpu_find_mmio_dev(vcpu, req->addr);
+        if (!dev)
+                printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", req->addr);
+        else if (req->dir)
+                kvm_iodevice_read(dev, req->addr, req->size, &req->data);
+        else
+                kvm_iodevice_write(dev, req->addr, req->size, &req->data);
+        /* Marked ready whether or not a device was found. */
+        req->state = STATE_IORESP_READY;
+        return 1;
+}
+/*
+ * PAL call exit: hand over to kvm_pal_emul() and return its result. If the
+ * recorded exit reason is not EXIT_REASON_PAL_CALL, report KVM_EXIT_UNKNOWN
+ * with hardware exit reason 2 and return 0.
+ */
+static int handle_pal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        struct exit_ctl_data *ctl = kvm_get_exit_data(vcpu);
+        if (ctl->exit_reason != EXIT_REASON_PAL_CALL) {
+                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+                kvm_run->hw.hardware_exit_reason = 2;
+                return 0;
+        }
+        return kvm_pal_emul(vcpu, kvm_run);
+}
+/*
+ * SAL call exit: run kvm_sal_emul() and return 1. If the recorded exit
+ * reason is not EXIT_REASON_SAL_CALL, report KVM_EXIT_UNKNOWN with hardware
+ * exit reason 3 and return 0.
+ */
+static int handle_sal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        struct exit_ctl_data *ctl = kvm_get_exit_data(vcpu);
+        if (ctl->exit_reason != EXIT_REASON_SAL_CALL) {
+                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+                kvm_run->hw.hardware_exit_reason = 3;
+                return 0;
+        }
+        kvm_sal_emul(vcpu);
+        return 1;
+}
+/*
+ * Inject an IPI into @vcpu.
+ *  dm:     delivery mode (SAPIC_*).
+ *  vector: vector to raise for SAPIC_FIXED; SAPIC_NMI always raises 2 and
+ *          SAPIC_EXTINT always raises 0.
+ * Every other delivery mode is only logged.
+ */
+static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm,
+                                uint64_t vector)
+{
+        switch (dm) {
+        case SAPIC_FIXED:
+                break;
+        case SAPIC_NMI:
+                vector = 2;
+                break;
+        case SAPIC_EXTINT:
+                vector = 0;
+                break;
+        case SAPIC_INIT:
+        case SAPIC_PMI:
+        default:
+                printk(KERN_ERR"kvm: Unimplemented Deliver reserved IPI!\n");
+                return;
+        }
+        kvm_apic_set_irq(vcpu, vector, 0);
+}
+/*
+ * Return the first vcpu of @kvm whose LID carries the given @id and @eid,
+ * or NULL when none matches.
+ */
+static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
+                        unsigned long eid)
+{
+        union ia64_lid lid;
+        int slot;
+        for (slot = 0; slot < KVM_MAX_VCPUS; slot++) {
+                if (!kvm->vcpus[slot])
+                        continue;
+                lid.val = VCPU_LID(kvm->vcpus[slot]);
+                if (lid.id != id || lid.eid != eid)
+                        continue;
+                return kvm->vcpus[slot];
+        }
+        return NULL;
+}
+/*
+ * IPI exit. The target vcpu is looked up by the id/eid of the IPI address;
+ * an unknown target is treated as a VM error. A target that has not been
+ * launched yet gets the boot ip/gp from rdv_sal_data, is made runnable and
+ * woken. A launched target gets the interrupt delivered and, unless it is
+ * the sender itself, is kicked. Returns 1 in both of those cases.
+ */
+static int handle_ipi(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        struct exit_ctl_data *ctl = kvm_get_exit_data(vcpu);
+        union ia64_ipi_a addr = ctl->u.ipi_data.addr;
+        union ia64_ipi_d data = ctl->u.ipi_data.data;
+        struct kvm_vcpu *target_vcpu;
+        struct kvm_pt_regs *regs;
+        target_vcpu = lid_to_vcpu(vcpu->kvm, addr.id, addr.eid);
+        if (!target_vcpu)
+                return handle_vm_error(vcpu, kvm_run);
+        if (target_vcpu->arch.launched) {
+                vcpu_deliver_ipi(target_vcpu, data.dm, data.vector);
+                if (target_vcpu != vcpu)
+                        kvm_vcpu_kick(target_vcpu);
+                return 1;
+        }
+        /* First IPI for this vcpu: start it at the registered boot entry. */
+        regs = vcpu_regs(target_vcpu);
+        regs->cr_iip = vcpu->kvm->arch.rdv_sal_data.boot_ip;
+        regs->r1 = vcpu->kvm->arch.rdv_sal_data.boot_gp;
+        target_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+        if (waitqueue_active(&target_vcpu->wq))
+                wake_up_interruptible(&target_vcpu->wq);
+        return 1;
+}
+/*
+ * Argument block handed to vcpu_global_purge() via
+ * smp_call_function_single(): the purge request and the vcpu it is for.
+ */
+struct call_data {
+        struct kvm_ptc_g ptc_g_data;
+        struct kvm_vcpu *vcpu;
+};
+/*
+ * Cross-call handler used by handle_global_purge(); @info is a
+ * struct call_data. Queues the purge request on the target vcpu and raises
+ * KVM_REQ_PTC_G. When MAX_PTC_G_NUM requests are already queued, the queue
+ * is reset and KVM_REQ_TLB_FLUSH is requested instead. Does nothing when
+ * KVM_REQ_TLB_FLUSH is already pending.
+ */
+static void vcpu_global_purge(void *info)
+{
+        struct call_data *p = (struct call_data *)info;
+        struct kvm_vcpu *vcpu = p->vcpu;
+        if (test_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                return;
+        set_bit(KVM_REQ_PTC_G, &vcpu->requests);
+        if (vcpu->arch.ptc_g_count < MAX_PTC_G_NUM) {
+                vcpu->arch.ptc_g_data[vcpu->arch.ptc_g_count++] =
+                                                        p->ptc_g_data;
+        } else {
+                /* Queue is full: fall back to a full flush request. */
+                clear_bit(KVM_REQ_PTC_G, &vcpu->requests);
+                vcpu->arch.ptc_g_count = 0;
+                set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+        }
+}
+/*
+ * Global purge exit: forward the purge request to every other vcpu of the
+ * VM that is past KVM_MP_STATE_UNINITIALIZED. Each such vcpu is woken if it
+ * sleeps on its wait queue, and vcpu_global_purge() is run for it on the
+ * cpu recorded in its ->cpu field. Always returns 1.
+ */
+static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        struct exit_ctl_data *ctl = kvm_get_exit_data(vcpu);
+        struct kvm *kvm = vcpu->kvm;
+        struct call_data call_data;
+        int slot;
+        call_data.ptc_g_data = ctl->u.ptc_g_data;
+        for (slot = 0; slot < KVM_MAX_VCPUS; slot++) {
+                if (!kvm->vcpus[slot])
+                        continue;
+                if (kvm->vcpus[slot]->arch.mp_state ==
+                                                KVM_MP_STATE_UNINITIALIZED)
+                        continue;
+                if (vcpu == kvm->vcpus[slot])
+                        continue;
+                if (waitqueue_active(&kvm->vcpus[slot]->wq))
+                        wake_up_interruptible(&kvm->vcpus[slot]->wq);
+                if (kvm->vcpus[slot]->cpu == -1) {
+                        printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n");
+                        continue;
+                }
+                call_data.vcpu = kvm->vcpus[slot];
+                smp_call_function_single(kvm->vcpus[slot]->cpu,
+                                vcpu_global_purge, &call_data, 0, 1);
+        }
+        return 1;
+}
+/* No work is done for this exit; returning 1 lets __vcpu_run() re-enter. */
+static int handle_switch_rr6(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        return 1;
+}
+/*
+ * Emulate a guest halt.
+ *
+ * If the guest ITC (host ITC plus itc_offset) is already past vpd->itm the
+ * function only sets timer_check and returns 1. Halting needs the in-kernel
+ * irqchip; without it the function logs an error and returns 0 before any
+ * timer is armed. Otherwise the halt timer is started for the remaining
+ * interval, the vcpu blocks in kvm_vcpu_block(), and on wake-up the timer
+ * is cancelled.
+ *
+ * Returns 1 when the vcpu is runnable again, -EINTR when it woke up in any
+ * other mp_state, 0 for the unsupported userspace-irqchip case.
+ */
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+        ktime_t kt;
+        long itc_diff;
+        unsigned long vcpu_now_itc;
+        unsigned long expires;
+        struct hrtimer *p_ht = &vcpu->arch.hlt_timer;
+        unsigned long cyc_per_usec = local_cpu_data->cyc_per_usec;
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+        vcpu_now_itc = ia64_getreg(_IA64_REG_AR_ITC) + vcpu->arch.itc_offset;
+        if (time_after(vcpu_now_itc, vpd->itm)) {
+                vcpu->arch.timer_check = 1;
+                return 1;
+        }
+        /* Bail out before arming the timer, so nothing is left pending. */
+        if (!irqchip_in_kernel(vcpu->kvm)) {
+                printk(KERN_ERR"kvm: Unsupported userspace halt!\n");
+                return 0;
+        }
+        itc_diff = vpd->itm - vcpu_now_itc;
+        if (itc_diff < 0)
+                itc_diff = -itc_diff;
+        /* ITC cycles -> microseconds -> nanoseconds for ktime_set(). */
+        expires = div64_64(itc_diff, cyc_per_usec);
+        kt = ktime_set(0, 1000 * expires);
+        vcpu->arch.ht_active = 1;
+        /*
+         * NOTE(review): kt is an interval, yet the timer is started with
+         * HRTIMER_MODE_ABS - confirm against the hrtimer_init() call site.
+         */
+        hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
+        vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+        kvm_vcpu_block(vcpu);
+        hrtimer_cancel(p_ht);
+        vcpu->arch.ht_active = 0;
+        if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+                return -EINTR;
+        return 1;
+}
+/* Report KVM_EXIT_SHUTDOWN in @kvm_run; returns 0 so the run loop stops. */
+static int handle_vm_shutdown(struct kvm_vcpu *vcpu,
+                struct kvm_run *kvm_run)
+{
+        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+        return 0;
+}
+/* No work is done for this exit; returning 1 lets __vcpu_run() re-enter. */
+static int handle_external_interrupt(struct kvm_vcpu *vcpu,
+                struct kvm_run *kvm_run)
+{
+        return 1;
+}
+/*
+ * Exit handlers indexed by EXIT_REASON_*. kvm_handle_exit() reports reasons
+ * without an entry (NULL slot or index beyond the table) as
+ * KVM_EXIT_UNKNOWN.
+ */
+static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
+                struct kvm_run *kvm_run) = {
+        [EXIT_REASON_VM_PANIC]              = handle_vm_error,
+        [EXIT_REASON_MMIO_INSTRUCTION]      = handle_mmio,
+        [EXIT_REASON_PAL_CALL]              = handle_pal_call,
+        [EXIT_REASON_SAL_CALL]              = handle_sal_call,
+        [EXIT_REASON_SWITCH_RR6]            = handle_switch_rr6,
+        [EXIT_REASON_VM_DESTROY]            = handle_vm_shutdown,
+        [EXIT_REASON_EXTERNAL_INTERRUPT]    = handle_external_interrupt,
+        [EXIT_REASON_IPI]                   = handle_ipi,
+        [EXIT_REASON_PTC_G]                 = handle_global_purge,
+};
+/* Number of slots in kvm_vti_exit_handlers[]. */
+static const int kvm_vti_max_exit_handlers =
+                sizeof(kvm_vti_exit_handlers)/sizeof(*kvm_vti_exit_handlers);
+/* Called by __vcpu_run() before interrupts are disabled; currently empty. */
+static void kvm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+/* Return the exit_reason field of the vcpu's exit control data. */
+static uint32_t kvm_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+        return kvm_get_exit_data(vcpu)->exit_reason;
+}
+/*
+ * The guest has exited. Remember the reason in arch.last_exit and dispatch
+ * to the matching kvm_vti_exit_handlers[] entry, returning its result. A
+ * reason without a handler is passed on to userspace as KVM_EXIT_UNKNOWN
+ * (hardware_exit_reason holds the raw value) and 0 is returned.
+ */
+static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+        u32 exit_reason = kvm_get_exit_reason(vcpu);
+        vcpu->arch.last_exit = exit_reason;
+        if (exit_reason >= kvm_vti_max_exit_handlers
+                        || !kvm_vti_exit_handlers[exit_reason]) {
+                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+                kvm_run->hw.hardware_exit_reason = exit_reason;
+                return 0;
+        }
+        return kvm_vti_exit_handlers[exit_reason](vcpu, kvm_run);
+}
+/* Write @rr6 into region register 6 and serialize with ia64_srlz_i(). */
+static inline void vti_set_rr6(unsigned long rr6)
+{
+        ia64_set_rr(RR6, rr6);
+        ia64_srlz_i();
+}
+/*
+ * Insert the two translation register entries used while running the guest:
+ * one mapping the VMM area at KVM_VMM_BASE and one mapping this VM's data
+ * area at KVM_VM_DATA_BASE. The slots are remembered in vcpu->arch for
+ * kvm_purge_vmm_mapping().
+ *
+ * Returns 0 on success or the negative value from ia64_itr_entry(); on
+ * failure no translation register inserted here is left behind.
+ */
+static int kvm_insert_vmm_mapping(struct kvm_vcpu *vcpu)
+{
+        unsigned long pte;
+        struct kvm *kvm = vcpu->kvm;
+        int r;
+        /* Insert a pair of TRs to map the VMM. */
+        pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base), PAGE_KERNEL));
+        r = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
+        if (r < 0)
+                goto out;
+        vcpu->arch.vmm_tr_slot = r;
+        /* Insert a pair of TRs to map the data of this VM. */
+        pte = pte_val(mk_pte_phys(__pa(kvm->arch.vm_base), PAGE_KERNEL));
+        r = ia64_itr_entry(0x3, KVM_VM_DATA_BASE,
+                                        pte, KVM_VM_DATA_SHIFT);
+        if (r < 0) {
+                /* Do not leak the VMM mapping inserted above. */
+                ia64_ptr_entry(0x3, vcpu->arch.vmm_tr_slot);
+                goto out;
+        }
+        vcpu->arch.vm_tr_slot = r;
+        r = 0;
+out:
+        return r;
+}
+/* Purge the two TR slots recorded by kvm_insert_vmm_mapping(). */
+static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu)
+{
+        ia64_ptr_entry(0x3, vcpu->arch.vmm_tr_slot);
+        ia64_ptr_entry(0x3, vcpu->arch.vm_tr_slot);
+}
+/*
+ * Prepare the current cpu for entering the guest:
+ *  - flush the local TLB if this vcpu last ran elsewhere, or another vcpu
+ *    ran on this cpu since;
+ *  - save the host rr6 value and load the VMM's;
+ *  - insert the VMM and VM data mappings.
+ *
+ * Returns 0 on success or a negative value from kvm_insert_vmm_mapping(),
+ * in which case the host rr6 value has been put back.
+ */
+static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
+{
+        int cpu = smp_processor_id();
+        int r;
+        if (vcpu->arch.last_run_cpu != cpu ||
+                        per_cpu(last_vcpu, cpu) != vcpu) {
+                per_cpu(last_vcpu, cpu) = vcpu;
+                vcpu->arch.last_run_cpu = cpu;
+                kvm_flush_tlb_all();
+        }
+        vcpu->arch.host_rr6 = ia64_get_rr(RR6);
+        vti_set_rr6(vcpu->arch.vmm_rr);
+        r = kvm_insert_vmm_mapping(vcpu);
+        /* The post-transition step is skipped on failure: undo rr6 here. */
+        if (r < 0)
+                vti_set_rr6(vcpu->arch.host_rr6);
+        return r;
+}
+/*
+ * Counterpart of kvm_vcpu_pre_transition(): purge the VMM and VM data
+ * mappings, then write the saved host value back into rr6.
+ */
+static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
+{
+        kvm_purge_vmm_mapping(vcpu);
+        vti_set_rr6(vcpu->arch.host_rr6);
+}
+/*
+ * One guest entry: set up the transition, jump through the VMM trampoline
+ * with the host and guest contexts, and tear the transition down again.
+ * Returns 0, or the negative value from kvm_vcpu_pre_transition() when the
+ * guest could not be entered.
+ */
+static int  vti_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        /* Host and guest context, addressed in the guest address space. */
+        union context *host_ctx = kvm_get_host_context(vcpu);
+        union context *guest_ctx = kvm_get_guest_context(vcpu);
+        int r = kvm_vcpu_pre_transition(vcpu);
+        if (r < 0)
+                return r;
+        kvm_vmm_info->tramp_entry(host_ctx, guest_ctx);
+        kvm_vcpu_post_transition(vcpu);
+        return 0;
+}
+/*
+ * Inner run loop: enter the guest, handle the exit, and repeat for as long
+ * as the exit handler returns a positive value.
+ *
+ * Returns 0 when userspace has to look at @kvm_run, -EINTR (exit_reason
+ * KVM_EXIT_INTR) when a signal is pending before entry, or the negative
+ * value from vti_vcpu_run() (exit_reason KVM_EXIT_FAIL_ENTRY) when the
+ * guest could not be entered. Negative handler results are passed through.
+ */
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        int r;
+again:
+        preempt_disable();
+        kvm_prepare_guest_switch(vcpu);
+        local_irq_disable();
+        if (signal_pending(current)) {
+                local_irq_enable();
+                preempt_enable();
+                r = -EINTR;
+                kvm_run->exit_reason = KVM_EXIT_INTR;
+                goto out;
+        }
+        vcpu->guest_mode = 1;
+        kvm_guest_enter();
+        r = vti_vcpu_run(vcpu, kvm_run);
+        if (r < 0) {
+                /*
+                 * The guest was never entered: leave guest mode and balance
+                 * kvm_guest_enter() before reporting the failure.
+                 */
+                vcpu->guest_mode = 0;
+                kvm_guest_exit();
+                local_irq_enable();
+                preempt_enable();
+                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+                goto out;
+        }
+        vcpu->arch.launched = 1;
+        vcpu->guest_mode = 0;
+        local_irq_enable();
+        /*
+         * Interrupts are enabled before kvm_guest_exit(); the barrier()
+         * keeps the compiler from reordering the two.
+         */
+        barrier();
+        kvm_guest_exit();
+        preempt_enable();
+        r = kvm_handle_exit(kvm_run, vcpu);
+        if (r > 0) {
+                if (!need_resched())
+                        goto again;
+        }
+out:
+        /* Handled in the kernel but a reschedule is due: yield, then loop. */
+        if (r > 0) {
+                kvm_resched(vcpu);
+                goto again;
+        }
+        return r;
+}
+/*
+ * Complete the vcpu's pending MMIO request: for a read, copy 8 bytes from
+ * vcpu->mmio_data into the request; in either case mark it
+ * STATE_IORESP_READY.
+ */
+static void kvm_set_mmio_data(struct kvm_vcpu *vcpu)
+{
+        struct kvm_mmio_req *req = kvm_get_vcpu_ioreq(vcpu);
+        int is_read = !vcpu->mmio_is_write;
+        if (is_read)
+                memcpy(&req->data, vcpu->mmio_data, 8);
+        req->state = STATE_IORESP_READY;
+}
+/*
+ * Run @vcpu until __vcpu_run() returns.
+ *
+ * A vcpu still in KVM_MP_STATE_UNINITIALIZED blocks in kvm_vcpu_block() and
+ * then returns -EAGAIN. Otherwise the vcpu's signal mask is installed when
+ * one is active, a pending userspace MMIO is completed from
+ * kvm_run->mmio.data, and the guest is entered. Returns the result of
+ * __vcpu_run().
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        int r;
+        sigset_t sigsaved;
+        vcpu_load(vcpu);
+        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+                kvm_vcpu_block(vcpu);
+                vcpu_put(vcpu);
+                return -EAGAIN;
+        }
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+        if (vcpu->mmio_needed) {
+                /* Take over the data userspace left in kvm_run (8 bytes). */
+                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+                kvm_set_mmio_data(vcpu);
+                vcpu->mmio_read_completed = 1;
+                vcpu->mmio_needed = 0;
+        }
+        r = __vcpu_run(vcpu, kvm_run);
+        /* Put back the signal mask that was saved above. */
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+        vcpu_put(vcpu);
+        return r;
+}
+/*
+ * Allocate and zero the per-VM data area (KVM_VM_DATA_SIZE bytes, 16M per
+ * VM; its memory map is defined in kvm_host.h). struct kvm itself lives at
+ * offset KVM_VM_OFS inside that area and records the area's base.
+ * Returns the new struct kvm, or ERR_PTR(-ENOMEM) if the pages cannot be
+ * allocated.
+ */
+static struct kvm *kvm_alloc_kvm(void)
+{
+        uint64_t area = __get_free_pages(GFP_KERNEL,
+                                        get_order(KVM_VM_DATA_SIZE));
+        struct kvm *vm;
+        if (!area)
+                return ERR_PTR(-ENOMEM);
+        printk(KERN_DEBUG"kvm: VM data's base Address:0x%lx\n", area);
+        /* Zero all pages before use! */
+        memset((void *)area, 0, KVM_VM_DATA_SIZE);
+        vm = (struct kvm *)(area + KVM_VM_OFS);
+        vm->arch.vm_base = area;
+        return vm;
+}
+/* One range: start address, size in bytes, and the GPFN_* type to tag. */
+struct kvm_io_range {
+        unsigned long start;
+        unsigned long size;
+        unsigned long type;
+};
+/* Ranges that kvm_build_io_pmt() marks page by page for every new VM. */
+static const struct kvm_io_range io_ranges[] = {
+        {VGA_IO_START, VGA_IO_SIZE, GPFN_FRAME_BUFFER},
+        {MMIO_START, MMIO_SIZE, GPFN_LOW_MMIO},
+        {LEGACY_IO_START, LEGACY_IO_SIZE, GPFN_LEGACY_IO},
+        {IO_SAPIC_START, IO_SAPIC_SIZE, GPFN_IOSAPIC},
+        {PIB_START, PIB_SIZE, GPFN_PIB},
+};
+/*
+ * Mark every page of every io_ranges[] entry in the VM's pmt with that
+ * entry's type.
+ */
+static void kvm_build_io_pmt(struct kvm *kvm)
+{
+        const unsigned long nr_ranges =
+                        sizeof(io_ranges) / sizeof(struct kvm_io_range);
+        unsigned long idx, addr;
+        for (idx = 0; idx < nr_ranges; idx++) {
+                const struct kvm_io_range *range = &io_ranges[idx];
+                addr = range->start;
+                while (addr < range->start + range->size) {
+                        kvm_set_pmt_entry(kvm, addr >> PAGE_SHIFT,
+                                        range->type, 0);
+                        addr += PAGE_SIZE;
+                }
+        }
+}
+/*
+ * Use unused rids to virtualize guest rid.
+ * kvm_init_vm() stores these three values into kvm->arch.
+ */
+#define GUEST_PHYSICAL_RR0      0x1739
+#define GUEST_PHYSICAL_RR4      0x2739
+#define VMM_INIT_RR             0x1660
+/*
+ * Fill in the architecture part of a freshly allocated VM: the VHPT, VTLB
+ * and VPD bases (offsets into the VM data area, when vm_base is set), the
+ * fixed region register values, and the pmt entries of the I/O ranges.
+ */
+static void kvm_init_vm(struct kvm *kvm)
+{
+        long base;
+        BUG_ON(!kvm);
+        base = kvm->arch.vm_base;
+        if (base) {
+                kvm->arch.vhpt_base = base + KVM_VHPT_OFS;
+                kvm->arch.vtlb_base = base + KVM_VTLB_OFS;
+                kvm->arch.vpd_base  = base + KVM_VPD_OFS;
+        }
+        kvm->arch.vmm_init_rr = VMM_INIT_RR;
+        kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
+        kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
+        /* Fill P2M entries for MMIO/IO ranges. */
+        kvm_build_io_pmt(kvm);
+}
+/*
+ * Allocate and initialise a VM. Returns the new struct kvm, or
+ * ERR_PTR(-ENOMEM) when the allocation failed.
+ */
+struct  kvm *kvm_arch_create_vm(void)
+{
+        struct kvm *vm = kvm_alloc_kvm();
+        if (!IS_ERR(vm)) {
+                kvm_init_vm(vm);
+                return vm;
+        }
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * Copy the in-kernel IOAPIC state into @chip. Only KVM_IRQCHIP_IOAPIC is
+ * accepted as chip_id; anything else yields -EINVAL. Returns 0 on success.
+ */
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
+                                        struct kvm_irqchip *chip)
+{
+        if (chip->chip_id != KVM_IRQCHIP_IOAPIC)
+                return -EINVAL;
+        memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm),
+                        sizeof(struct kvm_ioapic_state));
+        return 0;
+}
+/*
+ * Overwrite the in-kernel IOAPIC state with the one in @chip. Only
+ * KVM_IRQCHIP_IOAPIC is accepted as chip_id; anything else yields -EINVAL.
+ * Returns 0 on success.
+ */
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+        if (chip->chip_id != KVM_IRQCHIP_IOAPIC)
+                return -EINVAL;
+        memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic,
+                        sizeof(struct kvm_ioapic_state));
+        return 0;
+}
+/* Copy one same-named member from the user-supplied regs into vcpu->arch. */
+#define RESTORE_REGS(_x) vcpu->arch._x = regs->_x
+/*
+ * Load a complete register image into @vcpu: the VPD registers, the saved
+ * guest context and stack (fetched from the user pointers carried in
+ * @regs), the TR/region state and the interrupt/timer bookkeeping. The ITC
+ * offset is recomputed against the current host ITC and KVM_REQ_RESUME is
+ * raised.
+ *
+ * Returns 0 on success or -EFAULT when one of the user buffers cannot be
+ * read. vcpu_put() is called on every path. Members written before a
+ * failing copy are not rolled back.
+ */
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        int i;
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+        int r;
+        vcpu_load(vcpu);
+        for (i = 0; i < 16; i++) {
+                vpd->vgr[i] = regs->vpd.vgr[i];
+                vpd->vbgr[i] = regs->vpd.vbgr[i];
+        }
+        for (i = 0; i < 128; i++)
+                vpd->vcr[i] = regs->vpd.vcr[i];
+        vpd->vhpi = regs->vpd.vhpi;
+        vpd->vnat = regs->vpd.vnat;
+        vpd->vbnat = regs->vpd.vbnat;
+        vpd->vpsr = regs->vpd.vpsr;
+        vpd->vpr = regs->vpd.vpr;
+        /* copy_from_user() returns a byte count, so map failure to -EFAULT. */
+        r = -EFAULT;
+        if (copy_from_user(&vcpu->arch.guest, regs->saved_guest,
+                                                sizeof(union context)))
+                goto out;
+        if (copy_from_user(vcpu + 1, regs->saved_stack +
+                        sizeof(struct kvm_vcpu),
+                        IA64_STK_OFFSET - sizeof(struct kvm_vcpu)))
+                goto out;
+        /* saved_stack is a user address: fetch exit_data through uaccess. */
+        if (copy_from_user(&vcpu->arch.exit_data,
+                &((struct kvm_vcpu *)(regs->saved_stack))->arch.exit_data,
+                        sizeof(vcpu->arch.exit_data)))
+                goto out;
+        RESTORE_REGS(mp_state);
+        RESTORE_REGS(vmm_rr);
+        memcpy(vcpu->arch.itrs, regs->itrs, sizeof(struct thash_data) * NITRS);
+        memcpy(vcpu->arch.dtrs, regs->dtrs, sizeof(struct thash_data) * NDTRS);
+        RESTORE_REGS(itr_regions);
+        RESTORE_REGS(dtr_regions);
+        RESTORE_REGS(tc_regions);
+        RESTORE_REGS(irq_check);
+        RESTORE_REGS(itc_check);
+        RESTORE_REGS(timer_check);
+        RESTORE_REGS(timer_pending);
+        RESTORE_REGS(last_itc);
+        for (i = 0; i < 8; i++) {
+                vcpu->arch.vrr[i] = regs->vrr[i];
+                vcpu->arch.ibr[i] = regs->ibr[i];
+                vcpu->arch.dbr[i] = regs->dbr[i];
+        }
+        for (i = 0; i < 4; i++)
+                vcpu->arch.insvc[i] = regs->insvc[i];
+        RESTORE_REGS(xtp);
+        RESTORE_REGS(metaphysical_rr0);
+        RESTORE_REGS(metaphysical_rr4);
+        RESTORE_REGS(metaphysical_saved_rr0);
+        RESTORE_REGS(metaphysical_saved_rr4);
+        RESTORE_REGS(fp_psr);
+        RESTORE_REGS(saved_gp);
+        vcpu->arch.irq_new_pending = 1;
+        vcpu->arch.itc_offset = regs->saved_itc - ia64_getreg(_IA64_REG_AR_ITC);
+        set_bit(KVM_REQ_RESUME, &vcpu->requests);
+        r = 0;
+out:
+        /* Balance vcpu_load() on the error paths as well. */
+        vcpu_put(vcpu);
+        return r;
+}
+/*
+ * Architecture part of the VM ioctl handler. Handles
+ * KVM_SET_MEMORY_REGION, KVM_CREATE_IRQCHIP, KVM_IRQ_LINE, KVM_GET_IRQCHIP
+ * and KVM_SET_IRQCHIP; every other request returns -EINVAL.
+ *
+ * @filp:  file whose private_data is the struct kvm
+ * @ioctl: request number
+ * @arg:   user pointer to the request's argument structure
+ *
+ * Returns 0 on success, -EFAULT when the argument cannot be copied,
+ * -ENXIO for GET/SET_IRQCHIP without an in-kernel irqchip, or the error of
+ * the helper that was called.
+ */
+long kvm_arch_vm_ioctl(struct file *filp,
+                unsigned int ioctl, unsigned long arg)
+{
+        struct kvm *kvm = filp->private_data;
+        void __user *argp = (void __user *)arg;
+        int r = -EINVAL;
+        switch (ioctl) {
+        case KVM_SET_MEMORY_REGION: {
+                /* Repack the request as a kvm_userspace_memory_region. */
+                struct kvm_memory_region kvm_mem;
+                struct kvm_userspace_memory_region kvm_userspace_mem;
+                r = -EFAULT;
+                if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+                        goto out;
+                kvm_userspace_mem.slot = kvm_mem.slot;
+                kvm_userspace_mem.flags = kvm_mem.flags;
+                kvm_userspace_mem.guest_phys_addr =
+                                        kvm_mem.guest_phys_addr;
+                kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+                r = kvm_vm_ioctl_set_memory_region(kvm,
+                                        &kvm_userspace_mem, 0);
+                if (r)
+                        goto out;
+                break;
+                }
+        case KVM_CREATE_IRQCHIP:
+                /* NOTE(review): this -EFAULT is overwritten on the next line. */
+                r = -EFAULT;
+                r = kvm_ioapic_init(kvm);
+                if (r)
+                        goto out;
+                break;
+        case KVM_IRQ_LINE: {
+                struct kvm_irq_level irq_event;
+                r = -EFAULT;
+                if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                        goto out;
+                /*
+                 * NOTE(review): without an in-kernel irqchip r is still
+                 * -EFAULT when the switch is left.
+                 */
+                if (irqchip_in_kernel(kvm)) {
+                        mutex_lock(&kvm->lock);
+                        kvm_ioapic_set_irq(kvm->arch.vioapic,
+                                                irq_event.irq,
+                                                irq_event.level);
+                        mutex_unlock(&kvm->lock);
+                        r = 0;
+                }
+                break;
+                }
+        case KVM_GET_IRQCHIP: {
+                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+                struct kvm_irqchip chip;
+                r = -EFAULT;
+                if (copy_from_user(&chip, argp, sizeof chip))
+                                goto out;
+                r = -ENXIO;
+                if (!irqchip_in_kernel(kvm))
+                        goto out;
+                r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+                if (r)
+                        goto out;
+                /* Hand the filled-in structure back to userspace. */
+                r = -EFAULT;
+                if (copy_to_user(argp, &chip, sizeof chip))
+                                goto out;
+                r = 0;
+                break;
+                }
+        case KVM_SET_IRQCHIP: {
+                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+                struct kvm_irqchip chip;
+                r = -EFAULT;
+                if (copy_from_user(&chip, argp, sizeof chip))
+                                goto out;
+                r = -ENXIO;
+                if (!irqchip_in_kernel(kvm))
+                        goto out;
+                r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+                if (r)
+                        goto out;
+                r = 0;
+                break;
+                }
+        default:
+                ;
+        }
+out:
+        return r;
+}
+/* No implementation here: always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                struct kvm_sregs *sregs)
+{
+        return -EINVAL;
+}
+/* No implementation here: always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                struct kvm_sregs *sregs)
+{
+        return -EINVAL;
+}
+/* No implementation here: always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                struct kvm_translation *tr)
+{
+        return -EINVAL;
+}
+/*
+ * Allocate and zero the VMM area (KVM_VMM_SIZE bytes) unless it already
+ * exists or kvm_vm_buffer_size has reached KVM_VM_BUFFER_SIZE. The VM
+ * buffer starts VMM_SIZE bytes into the area.
+ * Returns 0 on success (including "nothing to do") or -ENOMEM.
+ */
+static int kvm_alloc_vmm_area(void)
+{
+        if (kvm_vmm_base || kvm_vm_buffer_size >= KVM_VM_BUFFER_SIZE)
+                return 0;
+        kvm_vmm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VMM_SIZE));
+        if (!kvm_vmm_base)
+                return -ENOMEM;
+        memset((void *)kvm_vmm_base, 0, KVM_VMM_SIZE);
+        kvm_vm_buffer = kvm_vmm_base + VMM_SIZE;
+        printk(KERN_DEBUG"kvm:VMM's Base Addr:0x%lx, vm_buffer:0x%lx\n",
+                        kvm_vmm_base, kvm_vm_buffer);
+        return 0;
+}
+/*
+ * Release the VMM area, if there is one, and reset kvm_vmm_base,
+ * kvm_vm_buffer and kvm_vsa_base to 0.
+ */
+static void kvm_free_vmm_area(void)
+{
+        if (!kvm_vmm_base)
+                return;
+        /* Zero the area before freeing it so that no contents leak. */
+        memset((void *)kvm_vmm_base, 0, KVM_VMM_SIZE);
+        free_pages(kvm_vmm_base, get_order(KVM_VMM_SIZE));
+        kvm_vmm_base  = 0;
+        kvm_vm_buffer = 0;
+        kvm_vsa_base = 0;
+}
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it. Intentionally left empty on IA64.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+}
+/* Currently empty: nothing is done with @vcpu or @cpu. */
+static void vti_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+/*
+ * Initialise the vcpu's VPD: copy CPUID registers 0-4 from the host (with
+ * the number field of CPUID[3] set to 4), set the vac/vdc control bits and
+ * point virt_env_vaddr at KVM_VM_BUFFER_BASE.
+ * Returns 0, or the encoded error when to_host() yields an error pointer.
+ */
+static int vti_init_vpd(struct kvm_vcpu *vcpu)
+{
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+        union cpuid3_t id3;
+        int reg;
+        if (IS_ERR(vpd))
+                return PTR_ERR(vpd);
+        /* CPUID init */
+        for (reg = 0; reg < 5; reg++)
+                vpd->vcpuid[reg] = ia64_get_cpuid(reg);
+        /* Limit the CPUID number to 5, i.e. store 5 - 1. */
+        id3.value = vpd->vcpuid[3];
+        id3.number = 4;
+        vpd->vcpuid[3] = id3.value;
+        /* Virtual buffer */
+        vpd->virt_env_vaddr = KVM_VM_BUFFER_BASE;
+        /* vdc field */
+        vpd->vdc.d_vmsw = 1;
+        /* vac fields */
+        vpd->vac.a_int = 1;
+        vpd->vac.a_bsw = 1;
+        vpd->vac.a_cover = 1;
+        vpd->vac.a_from_cpuid = 1;
+        vpd->vac.a_from_psr = 1;
+        vpd->vac.a_to_int_cr = 1;
+        vpd->vac.a_from_int_cr = 1;
+        return 0;
+}
+/*
+ * Ask PAL to create the virtual processor for this vcpu, passing the
+ * vcpu's VPD and the VMM interruption vector table address.
+ *
+ * Returns 0 on success and -EINVAL if the PAL call reports a failure.
+ */
+static int vti_create_vp(struct kvm_vcpu *vcpu)
+{
+        unsigned long ivt = kvm_vmm_info->vmm_ivt;
+        long status;
+
+        printk(KERN_DEBUG "kvm: vcpu:%p,ivt: 0x%lx\n", vcpu, ivt);
+        status = ia64_pal_vp_create((u64 *)vcpu->arch.vpd, (u64 *)ivt, 0);
+        if (!status)
+                return 0;
+
+        printk(KERN_ERR"kvm: ia64_pal_vp_create failed!\n");
+        return -EINVAL;
+}
+/*
+ * Cache the values reported by ia64_get_ptce() (base plus the two
+ * count/stride pairs) in the vcpu's arch state.
+ */
+static void init_ptce_info(struct kvm_vcpu *vcpu)
+{
+        ia64_ptce_info_t info = {0};
+        int lvl;
+
+        ia64_get_ptce(&info);
+        vcpu->arch.ptce_base = info.base;
+        for (lvl = 0; lvl < 2; lvl++) {
+                vcpu->arch.ptce_count[lvl] = info.count[lvl];
+                vcpu->arch.ptce_stride[lvl] = info.stride[lvl];
+        }
+}
+/*
+ * Re-arm the vcpu's halt timer: if cancelling it reports that it was
+ * active, start it again with the same absolute expiry time.
+ * Called from kvm_arch_vcpu_load() when the vcpu changes cpu.
+ */
+static void kvm_migrate_hlt_timer(struct kvm_vcpu *vcpu)
+{
+        struct hrtimer *p_ht = &vcpu->arch.hlt_timer;
+        if (hrtimer_cancel(p_ht))
+                hrtimer_start(p_ht, p_ht->expires, HRTIMER_MODE_ABS);
+}
+/*
+ * Halt-timer callback. If the owning vcpu is still halted and someone is
+ * waiting on its wait queue, mark it runnable and wake the waiter.
+ * timer_check is set in every case and the timer is never restarted.
+ */
+static enum hrtimer_restart hlt_timer_fn(struct hrtimer *data)
+{
+        struct kvm_vcpu *vcpu = container_of(data, struct kvm_vcpu,
+                                                arch.hlt_timer);
+
+        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+                        waitqueue_active(&vcpu->wq)) {
+                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                wake_up_interruptible(&vcpu->wq);
+        }
+
+        vcpu->arch.timer_check = 1;
+        return HRTIMER_NORESTART;
+}
+/* Value loaded into cr_iip of vcpu 0 as its entry address for first run. */
+#define PALE_RESET_ENTRY    0x80000000ffffffb0UL
+/*
+ * Architecture-specific vcpu initialisation.
+ *
+ * Sets the initial mp_state (vcpu 0 is runnable, all others start
+ * uninitialized), allocates the per-vcpu kvm_lapic, fills in the saved
+ * context in vcpu->arch.guest used for the first run, and initialises the
+ * halt timer and the remaining arch fields.
+ *
+ * Returns 0 on success, -ENOMEM if the lapic allocation fails, or the
+ * error encoded in the pointer returned by to_guest().
+ */
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+        struct kvm_vcpu *v;
+        int r;
+        int i;
+        long itc_offset;
+        struct kvm *kvm = vcpu->kvm;
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        union context *p_ctx = &vcpu->arch.guest;
+        struct kvm_vcpu *vmm_vcpu = to_guest(vcpu->kvm, vcpu);
+        /* Init vcpu context for first run. */
+        if (IS_ERR(vmm_vcpu))
+                return PTR_ERR(vmm_vcpu);
+        if (vcpu->vcpu_id == 0) {
+                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                /* Set entry address for first run. */
+                regs->cr_iip = PALE_RESET_ENTRY;
+                /*
+                 * Initialize itc offset for vcpus: the same offset is
+                 * written into MAX_VCPU_NUM vcpu slots laid out VCPU_SIZE
+                 * bytes apart, starting at this vcpu.
+                 */
+                itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
+                for (i = 0; i < MAX_VCPU_NUM; i++) {
+                        v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i);
+                        v->arch.itc_offset = itc_offset;
+                        v->arch.last_itc = 0;
+                }
+        } else
+                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+        r = -ENOMEM;
+        vcpu->arch.apic = kzalloc(sizeof(struct kvm_lapic), GFP_KERNEL);
+        if (!vcpu->arch.apic)
+                goto out;
+        vcpu->arch.apic->vcpu = vcpu;
+        /* gr12/gr13 are derived from the guest-side address of this vcpu */
+        p_ctx->gr[1] = 0;
+        p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + IA64_STK_OFFSET);
+        p_ctx->gr[13] = (unsigned long)vmm_vcpu;
+        p_ctx->psr = 0x1008522000UL;
+        p_ctx->ar[40] = FPSR_DEFAULT; /*fpsr*/
+        p_ctx->caller_unat = 0;
+        p_ctx->pr = 0x0;
+        p_ctx->ar[36] = 0x0; /*unat*/
+        p_ctx->ar[19] = 0x0; /*rnat*/
+        /* ar[18]: first 16-byte aligned address past struct kvm_vcpu */
+        p_ctx->ar[18] = (unsigned long)vmm_vcpu +
+                                ((sizeof(struct kvm_vcpu)+15) & ~15);
+        p_ctx->ar[64] = 0x0; /*pfs*/
+        p_ctx->cr[0] = 0x7e04UL;
+        p_ctx->cr[2] = (unsigned long)kvm_vmm_info->vmm_ivt;
+        p_ctx->cr[8] = 0x3c;
+        /* Initialize region registers (rr[6] is not written here) */
+        p_ctx->rr[0] = 0x30;
+        p_ctx->rr[1] = 0x30;
+        p_ctx->rr[2] = 0x30;
+        p_ctx->rr[3] = 0x30;
+        p_ctx->rr[4] = 0x30;
+        p_ctx->rr[5] = 0x30;
+        p_ctx->rr[7] = 0x30;
+        /* Initialize branch register 0 from the first word at vmm_entry */
+        p_ctx->br[0] = *(unsigned long *)kvm_vmm_info->vmm_entry;
+        vcpu->arch.vmm_rr = kvm->arch.vmm_init_rr;
+        vcpu->arch.metaphysical_rr0 = kvm->arch.metaphysical_rr0;
+        vcpu->arch.metaphysical_rr4 = kvm->arch.metaphysical_rr4;
+        hrtimer_init(&vcpu->arch.hlt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+        vcpu->arch.hlt_timer.function = hlt_timer_fn;
+        vcpu->arch.last_run_cpu = -1;
+        vcpu->arch.vpd = (struct vpd *)VPD_ADDR(vcpu->vcpu_id);
+        vcpu->arch.vsa_base = kvm_vsa_base;
+        vcpu->arch.__gp = kvm_vmm_gp;
+        vcpu->arch.dirty_log_lock_pa = __pa(&kvm->arch.dirty_log_lock);
+        vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_ADDR(vcpu->vcpu_id);
+        vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_ADDR(vcpu->vcpu_id);
+        init_ptce_info(vcpu);
+        r = 0;
+out:
+        return r;
+}
+/*
+ * Bring up a freshly created vcpu: insert the VMM mapping, run the generic
+ * vcpu initialisation, set up its VPD and create the virtual processor.
+ * On success the VMM mapping is purged again before returning.
+ *
+ * Interrupts are disabled for the duration of the sequence and restored on
+ * every exit path, including failures.
+ *
+ * Returns 0 on success or the error from the failing step.
+ */
+static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id)
+{
+        unsigned long psr;
+        int r;
+
+        local_irq_save(psr);
+        r = kvm_insert_vmm_mapping(vcpu);
+        if (r)
+                goto fail;
+        r = kvm_vcpu_init(vcpu, vcpu->kvm, id);
+        if (r)
+                goto fail;
+        r = vti_init_vpd(vcpu);
+        if (r) {
+                printk(KERN_DEBUG"kvm: vpd init error!!\n");
+                goto uninit;
+        }
+        r = vti_create_vp(vcpu);
+        if (r)
+                goto uninit;
+        kvm_purge_vmm_mapping(vcpu);
+        local_irq_restore(psr);
+        return 0;
+uninit:
+        kvm_vcpu_uninit(vcpu);
+fail:
+        /*
+         * Do not leak the disabled-interrupt state to the caller on error.
+         * NOTE(review): the VMM mapping inserted above is not purged on
+         * these paths - confirm whether it should be.
+         */
+        local_irq_restore(psr);
+        return r;
+}
+/*
+ * Create vcpu number @id for @kvm.
+ *
+ * The vcpu structure is not allocated separately: it lives inside the VM
+ * data area at vm_base + KVM_VCPU_OFS + VCPU_SIZE * id. Setup runs between
+ * get_cpu() and put_cpu().
+ *
+ * Returns the vcpu, ERR_PTR(-ENOMEM) when the VM has no data area, or
+ * ERR_PTR() of the setup error.
+ */
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+                unsigned int id)
+{
+        unsigned long vm_base = kvm->arch.vm_base;
+        struct kvm_vcpu *vcpu;
+        int cpu;
+        int r;
+
+        if (!vm_base) {
+                printk(KERN_ERR"kvm: Create vcpu[%d] error!\n", id);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        vcpu = (struct kvm_vcpu *)(vm_base + KVM_VCPU_OFS + VCPU_SIZE * id);
+        vcpu->kvm = kvm;
+
+        cpu = get_cpu();
+        vti_vcpu_load(vcpu, cpu);
+        r = vti_vcpu_setup(vcpu, id);
+        put_cpu();
+
+        if (!r)
+                return vcpu;
+
+        printk(KERN_DEBUG"kvm: vcpu_setup error!!\n");
+        return ERR_PTR(r);
+}
+/* Post-creation vcpu setup hook; nothing to do here, always returns 0. */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+        return 0;
+}
+/* FPU state read-out is not implemented here; always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        return -EINVAL;
+}
+/* FPU state load is not implemented here; always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        return -EINVAL;
+}
+/* Guest debugging is not implemented here; always returns -EINVAL. */
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                struct kvm_debug_guest *dbg)
+{
+        return -EINVAL;
+}
+/*
+ * Zero and free the VM data area recorded in kvm->arch.vm_base, if any.
+ */
+static void free_kvm(struct kvm *kvm)
+{
+        unsigned long base = kvm->arch.vm_base;
+
+        if (!base)
+                return;
+
+        memset((void *)base, 0, KVM_VM_DATA_SIZE);
+        free_pages(base, get_order(KVM_VM_DATA_SIZE));
+}
+/*
+ * Drop the page references recorded in every memory slot.
+ *
+ * Each non-zero rmap entry holds a struct page pointer stored as an
+ * unsigned long (see kvm_arch_set_memory_region()); put_page() is called
+ * on each of them.
+ */
+static void kvm_release_vm_pages(struct kvm *kvm)
+{
+        struct kvm_memory_slot *memslot;
+        int i, j;
+
+        for (i = 0; i < kvm->nmemslots; i++) {
+                memslot = &kvm->memslots[i];
+                for (j = 0; j < memslot->npages; j++) {
+                        if (memslot->rmap[j])
+                                put_page((struct page *)memslot->rmap[j]);
+                }
+        }
+}
+/*
+ * Tear down the architecture-specific parts of a VM: free the virtual
+ * IOAPIC, drop the guest page references, free the physical memory slots
+ * and finally release the VM data area.
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+        kfree(kvm->arch.vioapic);
+        kvm_release_vm_pages(kvm);
+        kvm_free_physmem(kvm);
+        free_kvm(kvm);
+}
+/* Counterpart of kvm_arch_vcpu_load(); nothing to undo, so it is empty. */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+/*
+ * Record the cpu the vcpu is being loaded on. When the cpu changed and the
+ * halt timer is flagged active (ht_active), the timer is re-armed through
+ * kvm_migrate_hlt_timer().
+ */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+        if (cpu == vcpu->cpu)
+                return;
+
+        vcpu->cpu = cpu;
+        if (vcpu->arch.ht_active)
+                kvm_migrate_hlt_timer(vcpu);
+}
+/* Copy the arch field _x of the vcpu into the same-named field of regs. */
+#define SAVE_REGS(_x)   regs->_x = vcpu->arch._x
+/*
+ * Export the vcpu's register state into @regs.
+ *
+ * The VPD registers and the vcpu->arch bookkeeping fields are copied into
+ * @regs directly; the saved guest context and the vcpu stack area are
+ * copied to the user buffers regs->saved_guest and regs->saved_stack.
+ *
+ * Returns 0 on success or -EFAULT when either user copy fails. The vcpu is
+ * released with vcpu_put() on every path.
+ */
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        int i;
+        int r;
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+
+        vcpu_load(vcpu);
+        for (i = 0; i < 16; i++) {
+                regs->vpd.vgr[i] = vpd->vgr[i];
+                regs->vpd.vbgr[i] = vpd->vbgr[i];
+        }
+        for (i = 0; i < 128; i++)
+                regs->vpd.vcr[i] = vpd->vcr[i];
+        regs->vpd.vhpi = vpd->vhpi;
+        regs->vpd.vnat = vpd->vnat;
+        regs->vpd.vbnat = vpd->vbnat;
+        regs->vpd.vpsr = vpd->vpsr;
+        regs->vpd.vpr = vpd->vpr;
+
+        /*
+         * copy_to_user() returns the number of bytes left uncopied, not an
+         * errno, so translate any failure into -EFAULT.
+         */
+        r = -EFAULT;
+        if (copy_to_user(regs->saved_guest, &vcpu->arch.guest,
+                                        sizeof(union context)))
+                goto out;
+        if (copy_to_user(regs->saved_stack, (void *)vcpu, IA64_STK_OFFSET))
+                goto out;
+
+        SAVE_REGS(mp_state);
+        SAVE_REGS(vmm_rr);
+        memcpy(regs->itrs, vcpu->arch.itrs, sizeof(struct thash_data) * NITRS);
+        memcpy(regs->dtrs, vcpu->arch.dtrs, sizeof(struct thash_data) * NDTRS);
+        SAVE_REGS(itr_regions);
+        SAVE_REGS(dtr_regions);
+        SAVE_REGS(tc_regions);
+        SAVE_REGS(irq_check);
+        SAVE_REGS(itc_check);
+        SAVE_REGS(timer_check);
+        SAVE_REGS(timer_pending);
+        SAVE_REGS(last_itc);
+        for (i = 0; i < 8; i++) {
+                regs->vrr[i] = vcpu->arch.vrr[i];
+                regs->ibr[i] = vcpu->arch.ibr[i];
+                regs->dbr[i] = vcpu->arch.dbr[i];
+        }
+        for (i = 0; i < 4; i++)
+                regs->insvc[i] = vcpu->arch.insvc[i];
+        /* Guest-visible ITC: host ITC plus the per-vcpu offset */
+        regs->saved_itc = vcpu->arch.itc_offset + ia64_getreg(_IA64_REG_AR_ITC);
+        SAVE_REGS(xtp);
+        SAVE_REGS(metaphysical_rr0);
+        SAVE_REGS(metaphysical_rr4);
+        SAVE_REGS(metaphysical_saved_rr0);
+        SAVE_REGS(metaphysical_saved_rr4);
+        SAVE_REGS(fp_psr);
+        SAVE_REGS(saved_gp);
+        r = 0;
+out:
+        /* Balance vcpu_load() on the error paths as well. */
+        vcpu_put(vcpu);
+        return r;
+}
+/*
+ * Undo kvm_arch_vcpu_init(): cancel the halt timer and free the lapic
+ * structure allocated there.
+ */
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+        hrtimer_cancel(&vcpu->arch.hlt_timer);
+        kfree(vcpu->arch.apic);
+}
+/* No architecture-specific vcpu ioctls are handled; always -EINVAL. */
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                unsigned int ioctl, unsigned long arg)
+{
+        return -EINVAL;
+}
+/*
+ * Populate the mapping for a memory slot.
+ *
+ * For every page of the region the backing page is looked up with
+ * gfn_to_page(), entered into the PMT with RWX/write-back attributes, and
+ * remembered in the slot's rmap array so that kvm_release_vm_pages() can
+ * drop the reference later. @old and @user_alloc are not used.
+ *
+ * Always returns 0.
+ */
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                struct kvm_userspace_memory_region *mem,
+                struct kvm_memory_slot old,
+                int user_alloc)
+{
+        unsigned long i;
+        struct page *page;
+        /*
+         * Keep the page count in the same unsigned type as the loop index
+         * so the 64-bit memory_size is neither truncated nor sign-converted.
+         */
+        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+        unsigned long base_gfn = memslot->base_gfn;
+
+        for (i = 0; i < npages; i++) {
+                page = gfn_to_page(kvm, base_gfn + i);
+                kvm_set_pmt_entry(kvm, base_gfn + i,
+                                page_to_pfn(page) << PAGE_SHIFT,
+                                _PAGE_AR_RWX|_PAGE_MA_WB);
+                memslot->rmap[i] = (unsigned long)page;
+        }
+        return 0;
+}
+/* No architecture-specific device ioctls are handled; always -EINVAL. */
+long kvm_arch_dev_ioctl(struct file *filp,
+                unsigned int ioctl, unsigned long arg)
+{
+        return -EINVAL;
+}
+/* Destroy a vcpu by running the generic kvm_vcpu_uninit() on it. */
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+        kvm_vcpu_uninit(vcpu);
+}
+/*
+ * Probe the processor for virtualization support.
+ *
+ * Queries the PAL processor features and requires PAL_PROC_VM_BIT to be
+ * available, then fetches the VM buffer size and environment info into
+ * kvm_vm_buffer_size and vp_env_info. A missing VP_OPCODE capability only
+ * produces a warning.
+ *
+ * Returns 1 when supported, 0 otherwise.
+ */
+static int vti_cpu_has_kvm_support(void)
+{
+        long  avail = 1, status = 1, control = 1;
+
+        if (ia64_pal_proc_get_features(&avail, &status, &control, 0))
+                return 0;
+        if (!(avail & PAL_PROC_VM_BIT))
+                return 0;
+        printk(KERN_DEBUG"kvm: Hardware Supports VT\n");
+
+        if (ia64_pal_vp_env_info(&kvm_vm_buffer_size, &vp_env_info))
+                return 0;
+        printk(KERN_DEBUG"kvm: VM Buffer Size:0x%lx\n", kvm_vm_buffer_size);
+
+        if (!(vp_env_info & VP_OPCODE)) {
+                printk(KERN_WARNING"kvm: No opcode ability on hardware, "
+                                "vm_env_info:0x%lx\n", vp_env_info);
+        }
+        return 1;
+}
+/*
+ * Copy the VMM module's core image into the area at kvm_vmm_base and
+ * rewrite the entry information so that it refers to addresses based at
+ * KVM_VMM_BASE instead of the module's load address.
+ *
+ * @vmm_info: entry information supplied by the VMM module (IVT address and
+ *            the vmm/trampoline entry function descriptors).
+ * @module:   the VMM module; must not be NULL.
+ *
+ * The relocated values are stored in the global kvm_vmm_info, the function
+ * descriptors inside the copied image are patched in place, and kvm_vmm_gp
+ * is taken from the relocated trampoline descriptor.
+ *
+ * Returns 0 on success, -EFAULT if the VMM area has not been allocated or
+ * the module core is larger than KVM_VMM_SIZE.
+ */
+static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
+                                                struct module *module)
+{
+        unsigned long module_base;
+        unsigned long vmm_size;
+        unsigned long vmm_offset, func_offset, fdesc_offset;
+        struct fdesc *p_fdesc;
+        BUG_ON(!module);
+        if (!kvm_vmm_base) {
+                printk("kvm: kvm area hasn't been initilized yet!!\n");
+                return -EFAULT;
+        }
+        /* Calculate new position of relocated vmm module. */
+        module_base = (unsigned long)module->module_core;
+        vmm_size = module->core_size;
+        if (unlikely(vmm_size > KVM_VMM_SIZE))
+                return -EFAULT;
+        memcpy((void *)kvm_vmm_base, (void *)module_base, vmm_size);
+        kvm_flush_icache(kvm_vmm_base, vmm_size);
+        /* Recalculate kvm_vmm_info based on new VMM */
+        vmm_offset = vmm_info->vmm_ivt - module_base;
+        kvm_vmm_info->vmm_ivt = KVM_VMM_BASE + vmm_offset;
+        printk(KERN_DEBUG"kvm: Relocated VMM's IVT Base Addr:%lx\n",
+                        kvm_vmm_info->vmm_ivt);
+        /*
+         * VMM entry: relocate the descriptor address, then patch the ip and
+         * gp stored in the copy of the descriptor. The function offset is
+         * read from the first word of the descriptor in the original module.
+         */
+        fdesc_offset = (unsigned long)vmm_info->vmm_entry - module_base;
+        kvm_vmm_info->vmm_entry = (kvm_vmm_entry *)(KVM_VMM_BASE +
+                                                        fdesc_offset);
+        func_offset = *(unsigned long *)vmm_info->vmm_entry - module_base;
+        p_fdesc = (struct fdesc *)(kvm_vmm_base + fdesc_offset);
+        p_fdesc->ip = KVM_VMM_BASE + func_offset;
+        p_fdesc->gp = KVM_VMM_BASE+(p_fdesc->gp - module_base);
+        printk(KERN_DEBUG"kvm: Relocated VMM's Init Entry Addr:%lx\n",
+                        KVM_VMM_BASE+func_offset);
+        /* Trampoline entry: same treatment as the VMM entry above. */
+        fdesc_offset = (unsigned long)vmm_info->tramp_entry - module_base;
+        kvm_vmm_info->tramp_entry = (kvm_tramp_entry *)(KVM_VMM_BASE +
+                        fdesc_offset);
+        func_offset = *(unsigned long *)vmm_info->tramp_entry - module_base;
+        p_fdesc = (struct fdesc *)(kvm_vmm_base + fdesc_offset);
+        p_fdesc->ip = KVM_VMM_BASE + func_offset;
+        p_fdesc->gp = KVM_VMM_BASE + (p_fdesc->gp - module_base);
+        /* The relocated gp is what each vcpu later receives as __gp. */
+        kvm_vmm_gp = p_fdesc->gp;
+        printk(KERN_DEBUG"kvm: Relocated VMM's Entry IP:%p\n",
+                                                kvm_vmm_info->vmm_entry);
+        printk(KERN_DEBUG"kvm: Relocated VMM's Trampoline Entry IP:0x%lx\n",
+                                                KVM_VMM_BASE + func_offset);
+        return 0;
+}
+/*
+ * Architecture initialisation, called with the VMM module's
+ * struct kvm_vmm_info as @opaque.
+ *
+ * Verifies hardware virtualization support, allocates the global
+ * kvm_vmm_info and the VMM area, and relocates the VMM image into it.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without hardware support, -EEXIST if a
+ * VMM is already registered, -ENOMEM on allocation failure, or the error
+ * from kvm_relocate_vmm(). On failure kvm_vmm_info is left NULL so that a
+ * later attempt is not mistaken for an already loaded VMM and
+ * kvm_arch_exit() cannot free it a second time.
+ */
+int kvm_arch_init(void *opaque)
+{
+        int r;
+        struct kvm_vmm_info *vmm_info = (struct kvm_vmm_info *)opaque;
+
+        if (!vti_cpu_has_kvm_support()) {
+                printk(KERN_ERR "kvm: No Hardware Virtualization Support!\n");
+                r = -EOPNOTSUPP;
+                goto out;
+        }
+        if (kvm_vmm_info) {
+                printk(KERN_ERR "kvm: Already loaded VMM module!\n");
+                r = -EEXIST;
+                goto out;
+        }
+        r = -ENOMEM;
+        kvm_vmm_info = kzalloc(sizeof(struct kvm_vmm_info), GFP_KERNEL);
+        if (!kvm_vmm_info)
+                goto out;
+        if (kvm_alloc_vmm_area())
+                goto out_free0;
+        r = kvm_relocate_vmm(vmm_info, vmm_info->module);
+        if (r)
+                goto out_free1;
+        return 0;
+out_free1:
+        kvm_free_vmm_area();
+out_free0:
+        kfree(kvm_vmm_info);
+        /* Do not leave a dangling pointer behind in the global. */
+        kvm_vmm_info = NULL;
+out:
+        return r;
+}
+/*
+ * Undo kvm_arch_init(): release the VMM area and the global kvm_vmm_info,
+ * clearing the pointer so a later init starts from a clean state.
+ */
+void kvm_arch_exit(void)
+{
+        kvm_free_vmm_area();
+        kfree(kvm_vmm_info);
+        kvm_vmm_info = NULL;
+}
+/*
+ * Move the dirty bits for one memory slot out of the VM-wide dirty log
+ * (located at KVM_MEM_DIRTY_LOG_OFS within the VM data area) into the
+ * slot's own dirty_bitmap, clearing the VM-wide words as they are copied.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range slot number and
+ * -ENOENT when the slot has no dirty bitmap.
+ */
+static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
+                struct kvm_dirty_log *log)
+{
+        unsigned long *dirty_bitmap = (unsigned long *)((void *)kvm - KVM_VM_OFS
+                                        + KVM_MEM_DIRTY_LOG_OFS);
+        struct kvm_memory_slot *memslot;
+        long n, base;
+        int i;
+
+        if (log->slot >= KVM_MEMORY_SLOTS)
+                return -EINVAL;
+
+        memslot = &kvm->memslots[log->slot];
+        if (!memslot->dirty_bitmap)
+                return -ENOENT;
+
+        /* n is the bitmap size in bytes, base the first word of this slot */
+        n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+        base = memslot->base_gfn / BITS_PER_LONG;
+        for (i = 0; i < n/sizeof(long); ++i) {
+                memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
+                dirty_bitmap[base + i] = 0;
+        }
+        return 0;
+}
+/*
+ * Dirty-log ioctl: sync the VM-wide dirty log into the slot's bitmap, hand
+ * it to the generic kvm_get_dirty_log(), and if anything was dirty flush
+ * the remote TLBs and clear the slot's bitmap.
+ *
+ * The whole sequence runs under kvm->arch.dirty_log_lock. Returns 0 on
+ * success or the error from either helper.
+ *
+ * NOTE(review): kvm_get_dirty_log() is called with a spinlock held; if it
+ * copies to user space it may sleep - confirm this is safe.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                struct kvm_dirty_log *log)
+{
+        int r;
+        int n;
+        struct kvm_memory_slot *memslot;
+        int is_dirty = 0;
+        spin_lock(&kvm->arch.dirty_log_lock);
+        r = kvm_ia64_sync_dirty_log(kvm, log);
+        if (r)
+                goto out;
+        r = kvm_get_dirty_log(kvm, log, &is_dirty);
+        if (r)
+                goto out;
+        /* If nothing is dirty, don't bother messing with page tables. */
+        if (is_dirty) {
+                kvm_flush_remote_tlbs(kvm);
+                memslot = &kvm->memslots[log->slot];
+                n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+                memset(memslot->dirty_bitmap, 0, n);
+        }
+        r = 0;
+out:
+        spin_unlock(&kvm->arch.dirty_log_lock);
+        return r;
+}
+/* Hardware setup hook; nothing to do here, always returns 0. */
+int kvm_arch_hardware_setup(void)
+{
+        return 0;
+}
+/* Counterpart of kvm_arch_hardware_setup(); intentionally empty. */
+void kvm_arch_hardware_unsetup(void)
+{
+}
+/*
+ * IPI handler used by kvm_vcpu_kick(). It does no work of its own; with
+ * DEBUG defined it only logs the vcpu pointer passed as @info.
+ */
+static void vcpu_kick_intr(void *info)
+{
+#ifdef DEBUG
+        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+        printk(KERN_DEBUG"vcpu_kick_intr %p \n", vcpu);
+#endif
+}
+/*
+ * Kick a vcpu: wake any waiter on its wait queue and, if the vcpu is
+ * flagged as being in guest mode, send an IPI (running vcpu_kick_intr) to
+ * the cpu recorded in vcpu->cpu.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+        /* Sample the target cpu once, before the wake-up below. */
+        int ipi_pcpu = vcpu->cpu;
+        if (waitqueue_active(&vcpu->wq))
+                wake_up_interruptible(&vcpu->wq);
+        if (vcpu->guest_mode)
+                smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
+/*
+ * Mark interrupt vector @vec pending in the vcpu's VPD irr bitmap.
+ *
+ * If the vector was not pending before, irq_new_pending is set and the
+ * vcpu is kicked (when runnable) or made runnable and woken (when halted).
+ * @trig is not used.
+ *
+ * Returns 1 if the vector was newly set, 0 if it was already pending.
+ */
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+{
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+
+        if (test_and_set_bit(vec, &vpd->irr[0]))
+                return 0;
+
+        vcpu->arch.irq_new_pending = 1;
+        if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) {
+                kvm_vcpu_kick(vcpu);
+        } else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
+                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                if (waitqueue_active(&vcpu->wq))
+                        wake_up_interruptible(&vcpu->wq);
+        }
+        return 1;
+}
+/* Physical destination match: true when @dest equals the vcpu's id. */
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+{
+        return apic->vcpu->vcpu_id == dest;
+}
+/* Logical destination matching is not implemented; never matches. */
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+{
+        return 0;
+}
+/*
+ * Pick the vcpu with the smallest arch.xtp value among the vcpus that
+ * exist in @kvm; on ties the lowest-numbered vcpu wins. @vector and
+ * @bitmap are not used.
+ *
+ * Empty slots are skipped, including slot 0, so a missing vcpu 0 is not
+ * dereferenced. Returns NULL when the VM has no vcpus at all.
+ */
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+                                       unsigned long bitmap)
+{
+        struct kvm_vcpu *lvcpu = NULL;
+        int i;
+
+        for (i = 0; i < KVM_MAX_VCPUS; i++) {
+                struct kvm_vcpu *v = kvm->vcpus[i];
+
+                if (!v)
+                        continue;
+                if (!lvcpu || lvcpu->arch.xtp > v->arch.xtp)
+                        lvcpu = v;
+        }
+        return lvcpu;
+}
+/*
+ * Return the index of the highest set bit in the 256-bit bitmap stored as
+ * eight 32-bit words at @dat, or -1 when no bit is set.
+ */
+static int find_highest_bits(int *dat)
+{
+        int word = 8;
+
+        /* Scan from the most significant word downwards. */
+        while (word-- > 0) {
+                u32 val = dat[word];
+
+                if (val)
+                        return word * 32 + fls(val) - 1;
+        }
+        return -1;
+}
+/*
+ * Return the pending interrupt vector to service next, or -1 if none.
+ * NMI_VECTOR and ExtINT_VECTOR, when set in irr[0], are returned ahead of
+ * the highest-numbered pending vector.
+ */
+int kvm_highest_pending_irq(struct kvm_vcpu *vcpu)
+{
+        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+
+        if (vpd->irr[0] & (1UL << NMI_VECTOR))
+                return NMI_VECTOR;
+        if (vpd->irr[0] & (1UL << ExtINT_VECTOR))
+                return ExtINT_VECTOR;
+
+        return find_highest_bits((int *)&vpd->irr[0]);
+}
+/* Returns 1 when the vcpu has any interrupt pending, 0 otherwise. */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+{
+        return kvm_highest_pending_irq(vcpu) != -1;
+}
+/* Pending-timer query; this implementation always reports none (0). */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        return 0;
+}
+/* No gfn aliasing is applied here: the gfn is returned unchanged. */
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+        return gfn;
+}
+/* A vcpu is runnable exactly when its mp_state is KVM_MP_STATE_RUNNABLE. */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE;
+}
+/* Reading the mp state via ioctl is not implemented; always -EINVAL. */
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        return -EINVAL;
+}
+/* Setting the mp state via ioctl is not implemented; always -EINVAL. */
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        return -EINVAL;
+}
diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
new file mode 100644
index 00000000000..091f936c448
--- /dev/null
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -0,0 +1,500 @@
+/*
+ * PAL/SAL call delegation
+ *
+ * Copyright (c) 2004 Li Susie <susie.li@intel.com>
+ * Copyright (c) 2005 Yu Ke <ke.yu@intel.com>
+ * Copyright (c) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+#include <linux/kvm_host.h>
+#include <linux/smp.h>
+#include "vti.h"
+#include "misc.h"
+#include <asm/pal.h>
+#include <asm/sal.h>
+#include <asm/tlb.h>
+/*
+ * Handy macros to make sure that the PAL return values start out
+ * as something meaningful.
+ *
+ * INIT_PAL_STATUS_UNIMPLEMENTED(x) sets x.status to
+ * PAL_STATUS_UNIMPLEMENTED and clears v0..v2. It is wrapped in
+ * do { } while (0) so that it behaves as a single statement (safe in an
+ * unbraced if/else), and the argument is parenthesised so that any lvalue
+ * expression can be passed.
+ */
+#define INIT_PAL_STATUS_UNIMPLEMENTED(x)                \
+        do {                                            \
+                (x).status = PAL_STATUS_UNIMPLEMENTED;  \
+                (x).v0 = 0;                             \
+                (x).v1 = 0;                             \
+                (x).v2 = 0;                             \
+        } while (0)
+/*
+ * INIT_PAL_STATUS_SUCCESS(x) sets x.status to PAL_STATUS_SUCCESS and
+ * clears v0..v2. Wrapped in do { } while (0) so it is a single statement,
+ * with the argument parenthesised so any lvalue expression can be passed.
+ */
+#define INIT_PAL_STATUS_SUCCESS(x)                      \
+        do {                                            \
+                (x).status = PAL_STATUS_SUCCESS;        \
+                (x).v0 = 0;                             \
+                (x).v1 = 0;                             \
+                (x).v2 = 0;                             \
+        } while (0)
+/*
+ * Fetch the four PAL call arguments (gr28..gr31) saved in the vcpu's exit
+ * data. The outputs are written only when @vcpu is non-NULL and its exit
+ * reason is EXIT_REASON_PAL_CALL; otherwise they are left untouched and a
+ * debug message is logged.
+ */
+static void kvm_get_pal_call_data(struct kvm_vcpu *vcpu,
+                u64 *gr28, u64 *gr29, u64 *gr30, u64 *gr31) {
+        struct exit_ctl_data *p = vcpu ? &vcpu->arch.exit_data : NULL;
+
+        if (!p || p->exit_reason != EXIT_REASON_PAL_CALL) {
+                printk(KERN_DEBUG"Failed to get vcpu pal data!!!\n");
+                return;
+        }
+
+        *gr28 = p->u.pal_data.gr28;
+        *gr29 = p->u.pal_data.gr29;
+        *gr30 = p->u.pal_data.gr30;
+        *gr31 = p->u.pal_data.gr31;
+}
+/*
+ * Store the result of an emulated PAL call in the vcpu's exit data.
+ *
+ * When the exit reason is EXIT_REASON_PAL_CALL, @result is stored as is;
+ * for any other exit reason the stored result is set to "unimplemented".
+ * If no exit data is available nothing is written and a warning is
+ * logged, rather than dereferencing a NULL pointer.
+ */
+static void set_pal_result(struct kvm_vcpu *vcpu,
+                struct ia64_pal_retval result) {
+        struct exit_ctl_data *p;
+
+        p = kvm_get_exit_data(vcpu);
+        if (!p) {
+                printk(KERN_WARNING"Failed to set pal result!!\n");
+                return ;
+        }
+        if (p->exit_reason == EXIT_REASON_PAL_CALL) {
+                p->u.pal_data.ret = result;
+                return ;
+        }
+        INIT_PAL_STATUS_UNIMPLEMENTED(p->u.pal_data.ret);
+}
+/*
+ * Store the result of an emulated SAL call in the vcpu's exit data.
+ * Nothing is stored (and a warning is logged) unless exit data exists and
+ * its exit reason is EXIT_REASON_SAL_CALL.
+ */
+static void set_sal_result(struct kvm_vcpu *vcpu,
+                struct sal_ret_values result) {
+        struct exit_ctl_data *exit_data = kvm_get_exit_data(vcpu);
+
+        if (!exit_data || exit_data->exit_reason != EXIT_REASON_SAL_CALL) {
+                printk(KERN_WARNING"Failed to set sal result!!\n");
+                return;
+        }
+        exit_data->u.sal_data.ret = result;
+}
+/*
+ * Argument block handed to remote_pal_cache_flush() on each remote cpu.
+ */
+struct cache_flush_args {
+        u64 cache_type;         /* first argument to ia64_pal_cache_flush() */
+        u64 operation;          /* second argument to ia64_pal_cache_flush() */
+        u64 progress;           /* initial value of the progress indicator */
+        long status;            /* written by a cpu whose flush returned non-zero */
+};
+/* NOTE(review): only referenced from code under "#if 0" in pal_cache_flush(). */
+cpumask_t cpu_cache_coherent_map;
+/*
+ * Cross-cpu callback: run the PAL cache flush described by @data (a
+ * struct cache_flush_args) on the current cpu. A non-zero PAL status is
+ * recorded in args->status; a zero status leaves the field untouched.
+ */
+static void remote_pal_cache_flush(void *data)
+{
+        struct cache_flush_args *args = data;
+        u64 progress = args->progress;
+        long rc;
+
+        rc = ia64_pal_cache_flush(args->cache_type, args->operation,
+                                        &progress, NULL);
+        if (rc != 0)
+                args->status = rc;
+}
+/*
+ * Emulate PAL_CACHE_FLUSH for the guest.
+ *
+ * The cache type (gr29) and operation (gr30) are taken from the guest's
+ * PAL call data. The flush is first run on all other cpus through
+ * smp_call_function() and then on the local cpu with interrupts disabled.
+ * Failures are only logged; the local PAL status and outputs are returned
+ * to the guest.
+ */
+static struct ia64_pal_retval pal_cache_flush(struct kvm_vcpu *vcpu)
+{
+        u64 gr28, gr29, gr30, gr31;
+        struct ia64_pal_retval result = {0, 0, 0, 0};
+        struct cache_flush_args args = {0, 0, 0, 0};
+        /* IRQ flags must be unsigned long for local_irq_save/restore. */
+        unsigned long psr;
+
+        gr28 = gr29 = gr30 = gr31 = 0;
+        kvm_get_pal_call_data(vcpu, &gr28, &gr29, &gr30, &gr31);
+        if (gr31 != 0)
+                printk(KERN_ERR"vcpu:%p called cache_flush error!\n", vcpu);
+        /* Always call Host Pal in int=1 */
+        gr30 &= ~PAL_CACHE_FLUSH_CHK_INTRS;
+        args.cache_type = gr29;
+        args.operation = gr30;
+        smp_call_function(remote_pal_cache_flush,
+                                (void *)&args, 1, 1);
+        if (args.status != 0)
+                printk(KERN_ERR"pal_cache_flush error!,"
+                                "status:0x%lx\n", args.status);
+        /*
+         * Call Host PAL cache flush
+         * Clear psr.ic when call PAL_CACHE_FLUSH
+         */
+        local_irq_save(psr);
+        result.status = ia64_pal_cache_flush(gr29, gr30, &result.v1,
+                                                &result.v0);
+        local_irq_restore(psr);
+        if (result.status != 0)
+                printk(KERN_ERR"vcpu:%p crashed due to cache_flush err:%ld"
+                                "in1:%lx,in2:%lx\n",
+                                vcpu, result.status, gr29, gr30);
+#if 0
+        if (gr29 == PAL_CACHE_TYPE_COHERENT) {
+                cpus_setall(vcpu->arch.cache_coherent_map);
+                cpu_clear(vcpu->cpu, vcpu->arch.cache_coherent_map);
+                cpus_setall(cpu_cache_coherent_map);
+                cpu_clear(vcpu->cpu, cpu_cache_coherent_map);
+        }
+#endif
+        return result;
+}
+/* Emulate PAL_CACHE_SUMMARY by forwarding the call to the host PAL. */
+struct ia64_pal_retval pal_cache_summary(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        PAL_CALL(result, PAL_CACHE_SUMMARY, 0, 0, 0);
+        return result;
+}
+/*
+ * Emulate PAL_FREQ_BASE via the host PAL. If PAL reports a zero value in
+ * v0, the platform base frequency is obtained from SAL instead and v2 is
+ * cleared.
+ */
+static struct ia64_pal_retval pal_freq_base(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        PAL_CALL(result, PAL_FREQ_BASE, 0, 0, 0);
+        /*
+         * PAL_FREQ_BASE may not be implemented in some platforms,
+         * call SAL instead.
+         */
+        if (result.v0 == 0) {
+                result.status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
+                                                        &result.v0,
+                                                        &result.v1);
+                result.v2 = 0;
+        }
+        return result;
+}
+/* Emulate PAL_FREQ_RATIOS by forwarding the call to the host PAL. */
+static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        PAL_CALL(result, PAL_FREQ_RATIOS, 0, 0, 0);
+        return result;
+}
+/* PAL_LOGICAL_TO_PHYSICAL is reported to the guest as unimplemented. */
+static struct ia64_pal_retval pal_logical_to_physica(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        INIT_PAL_STATUS_UNIMPLEMENTED(result);
+        return result;
+}
+/* PAL_PLATFORM_ADDR: report success with all return values zero. */
+static struct ia64_pal_retval pal_platform_addr(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        INIT_PAL_STATUS_SUCCESS(result);
+        return result;
+}
+/*
+ * Emulate PAL_PROC_GET_FEATURES: forward the query to the host PAL using
+ * the guest's third call argument (gr30) as the selector.
+ *
+ * The argument variables are u64, matching kvm_get_pal_call_data(), and
+ * start at zero so that a failed lookup never passes an indeterminate
+ * value to PAL.
+ */
+static struct ia64_pal_retval pal_proc_get_features(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result = {0, 0, 0, 0};
+        u64 in0 = 0, in1 = 0, in2 = 0, in3 = 0;
+
+        kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+        result.status = ia64_pal_proc_get_features(&result.v0, &result.v1,
+                        &result.v2, in2);
+        return result;
+}
+/*
+ * Emulate PAL_CACHE_INFO: query the host cache configuration using the
+ * guest's gr29/gr30 call arguments and return the two info words in
+ * v0/v1.
+ *
+ * The call arguments and the config structure are zeroed up front so that
+ * no indeterminate stack contents reach PAL or the guest if the argument
+ * lookup or the PAL call does not fill them in.
+ */
+static struct ia64_pal_retval pal_cache_info(struct kvm_vcpu *vcpu)
+{
+        pal_cache_config_info_t ci;
+        long status;
+        unsigned long in0 = 0, in1 = 0, in2 = 0, in3 = 0, r9, r10;
+
+        memset(&ci, 0, sizeof(ci));
+        kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+        status = ia64_pal_cache_config_info(in1, in2, &ci);
+        r9 = ci.pcci_info_1.pcci1_data;
+        r10 = ci.pcci_info_2.pcci2_data;
+        return ((struct ia64_pal_retval){status, r9, r10, 0});
+}
+/* Values reported to the guest in place of the host's VM summary fields. */
+#define GUEST_IMPL_VA_MSB       59
+#define GUEST_RID_BITS          18
+/*
+ * Emulate PAL_VM_SUMMARY: start from the host's answer and override the
+ * fields the guest must see differently (8 ITR/DTR entries, the guest
+ * implemented-VA msb and RID size).
+ *
+ * Both info words are seeded from the host result before being patched,
+ * so fields that are not overridden keep the host's values instead of
+ * indeterminate stack contents.
+ */
+static struct ia64_pal_retval pal_vm_summary(struct kvm_vcpu *vcpu)
+{
+        pal_vm_info_1_u_t vminfo1;
+        pal_vm_info_2_u_t vminfo2;
+        struct ia64_pal_retval result;
+
+        PAL_CALL(result, PAL_VM_SUMMARY, 0, 0, 0);
+        if (!result.status) {
+                vminfo1.pvi1_val = result.v0;
+                vminfo1.pal_vm_info_1_s.max_itr_entry = 8;
+                vminfo1.pal_vm_info_1_s.max_dtr_entry = 8;
+                result.v0 = vminfo1.pvi1_val;
+                vminfo2.pvi2_val = result.v1;
+                vminfo2.pal_vm_info_2_s.impl_va_msb = GUEST_IMPL_VA_MSB;
+                vminfo2.pal_vm_info_2_s.rid_size = GUEST_RID_BITS;
+                result.v1 = vminfo2.pvi2_val;
+        }
+        return result;
+}
+/* PAL_VM_INFO is reported to the guest as unimplemented. */
+static struct ia64_pal_retval pal_vm_info(struct kvm_vcpu *vcpu)
+{
+        struct ia64_pal_retval result;
+        INIT_PAL_STATUS_UNIMPLEMENTED(result);
+        return result;
+}
+/*
+ * Return the PAL procedure index (gr28) of the pending PAL call, or 0
+ * when there is no exit data or the exit was not a PAL call.
+ */
+static  u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *exit_data = kvm_get_exit_data(vcpu);
+
+        if (!exit_data || exit_data->exit_reason != EXIT_REASON_PAL_CALL)
+                return 0;
+        return exit_data->u.pal_data.gr28;
+}
+/*
+ * Emulate the PAL call that caused the current exit.
+ *
+ * The procedure index is taken from the vcpu's exit data and dispatched to
+ * the matching helper or host PAL wrapper; unknown indexes are answered
+ * with "unimplemented" and logged. The outcome is stored back into the
+ * exit data through set_pal_result().
+ *
+ * The result starts out fully zeroed because several cases fill in only
+ * some of its fields and the whole structure is handed to the guest.
+ *
+ * Returns 1, except for PAL_HALT_LIGHT with no interrupt pending, where
+ * the value of kvm_emulate_halt() is returned.
+ */
+int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+        u64 gr28;
+        struct ia64_pal_retval result = {0, 0, 0, 0};
+        int ret = 1;
+
+        gr28 = kvm_get_pal_call_index(vcpu);
+        switch (gr28) {
+        case PAL_CACHE_FLUSH:
+                result = pal_cache_flush(vcpu);
+                break;
+        case PAL_CACHE_SUMMARY:
+                result = pal_cache_summary(vcpu);
+                break;
+        case PAL_HALT_LIGHT:
+                vcpu->arch.timer_pending = 1;
+                INIT_PAL_STATUS_SUCCESS(result);
+                /* Only really halt when nothing is waiting to be delivered */
+                if (kvm_highest_pending_irq(vcpu) == -1)
+                        ret = kvm_emulate_halt(vcpu);
+                break;
+        case PAL_FREQ_RATIOS:
+                result = pal_freq_ratios(vcpu);
+                break;
+        case PAL_FREQ_BASE:
+                result = pal_freq_base(vcpu);
+                break;
+        case PAL_LOGICAL_TO_PHYSICAL :
+                result = pal_logical_to_physica(vcpu);
+                break;
+        case PAL_VM_SUMMARY :
+                result = pal_vm_summary(vcpu);
+                break;
+        case PAL_VM_INFO :
+                result = pal_vm_info(vcpu);
+                break;
+        case PAL_PLATFORM_ADDR :
+                result = pal_platform_addr(vcpu);
+                break;
+        case PAL_CACHE_INFO:
+                result = pal_cache_info(vcpu);
+                break;
+        case PAL_PTCE_INFO:
+                INIT_PAL_STATUS_SUCCESS(result);
+                result.v1 = (1L << 32) | 1L;
+                break;
+        case PAL_VM_PAGE_SIZE:
+                result.status = ia64_pal_vm_page_size(&result.v0,
+                                                        &result.v1);
+                break;
+        case PAL_RSE_INFO:
+                result.status = ia64_pal_rse_info(&result.v0,
+                                        (pal_hints_u_t *)&result.v1);
+                break;
+        case PAL_PROC_GET_FEATURES:
+                result = pal_proc_get_features(vcpu);
+                break;
+        case PAL_DEBUG_INFO:
+                result.status = ia64_pal_debug_info(&result.v0,
+                                                        &result.v1);
+                break;
+        case PAL_VERSION:
+                result.status = ia64_pal_version(
+                                (pal_version_u_t *)&result.v0,
+                                (pal_version_u_t *)&result.v1);
+                break;
+        case PAL_FIXED_ADDR:
+                result.status = PAL_STATUS_SUCCESS;
+                result.v0 = vcpu->vcpu_id;
+                break;
+        default:
+                INIT_PAL_STATUS_UNIMPLEMENTED(result);
+                printk(KERN_WARNING"kvm: Unsupported pal call,"
+                                        " index:0x%lx\n", gr28);
+        }
+        set_pal_result(vcpu, result);
+        return ret;
+}
+/*
+ * Emulate one SAL procedure call on behalf of a guest.
+ *
+ * @kvm:      VM the call belongs to (only written for SAL_SET_VECTORS).
+ * @index:    SAL function index.
+ * @in1..in7: remaining SAL call arguments.
+ *
+ * Returns a sal_ret_values built from {status, r9, r10, r11}.  status stays
+ * 0 unless a case below changes it; unknown indexes yield status -1.
+ */
+static struct sal_ret_values sal_emulator(struct kvm *kvm,
+                                long index, unsigned long in1,
+                                unsigned long in2, unsigned long in3,
+                                unsigned long in4, unsigned long in5,
+                                unsigned long in6, unsigned long in7)
+{
+        long status = 0;
+        unsigned long r9 = 0, r10 = 0;
+        long r11 = 0;
+        switch (index) {
+        case SAL_FREQ_BASE:
+                status = ia64_sal_freq_base(in1, &r9, &r10);
+                break;
+        case SAL_PCI_CONFIG_READ:
+                printk(KERN_WARNING"kvm: Not allowed to call here!"
+                        " SAL_PCI_CONFIG_READ\n");
+                break;
+        case SAL_PCI_CONFIG_WRITE:
+                printk(KERN_WARNING"kvm: Not allowed to call here!"
+                        " SAL_PCI_CONFIG_WRITE\n");
+                break;
+        case SAL_SET_VECTORS:
+                if (in1 != SAL_VECTOR_OS_BOOT_RENDEZ) {
+                        printk(KERN_WARNING"kvm: CALLED SAL_SET_VECTORS %lu."
+                                                        "ignored...\n", in1);
+                        break;
+                }
+                /* in4..in7 must all be zero for the vector to be recorded */
+                if (in4 == 0 && in5 == 0 && in6 == 0 && in7 == 0) {
+                        kvm->arch.rdv_sal_data.boot_ip = in2;
+                        kvm->arch.rdv_sal_data.boot_gp = in3;
+                } else {
+                        status = -2;
+                }
+                printk("Rendvous called! iip:%lx\n\n", in2);
+                break;
+        case SAL_GET_STATE_INFO:
+                /* Nothing to report: r9 stays 0. */
+                status = -5;
+                break;
+        case SAL_GET_STATE_INFO_SIZE:
+                /* Report a dummy size with status 0. */
+                r9 = 128;
+                break;
+        case SAL_CLEAR_STATE_INFO:
+                /* Nothing to do. */
+                break;
+        case SAL_MC_RENDEZ:
+                printk(KERN_WARNING
+                        "kvm: called SAL_MC_RENDEZ. ignored...\n");
+                break;
+        case SAL_MC_SET_PARAMS:
+                printk(KERN_WARNING
+                        "kvm: called  SAL_MC_SET_PARAMS.ignored!\n");
+                break;
+        case SAL_CACHE_FLUSH:
+                /*
+                 * Flush through the host SAL.  This is fast, but it also
+                 * affects other vcpus running on this cpu; a variant
+                 * without that side effect may still need implementing.
+                 */
+                status = ia64_sal_cache_flush(in1);
+                break;
+        case SAL_CACHE_INIT:
+                printk(KERN_WARNING
+                        "kvm: called SAL_CACHE_INIT.  ignored...\n");
+                break;
+        case SAL_UPDATE_PAL:
+                printk(KERN_WARNING
+                        "kvm: CALLED SAL_UPDATE_PAL.  ignored...\n");
+                break;
+        default:
+                printk(KERN_WARNING"kvm: called SAL_CALL with unknown index."
+                                                " index:%ld\n", index);
+                status = -1;
+                break;
+        }
+        return ((struct sal_ret_values) {status, r9, r10, r11});
+}
+/*
+ * Copy the eight SAL call arguments out of the vcpu's exit data.
+ *
+ * @vcpu:     vcpu whose exit data is read.
+ * @in0..in7: outputs; in0 receives the SAL function index.
+ *
+ * When the exit reason is not EXIT_REASON_SAL_CALL every output is set to
+ * 0, so the caller never passes uninitialised values on (previously only
+ * *in0 was written on that path).
+ */
+static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1,
+                u64 *in2, u64 *in3, u64 *in4, u64 *in5, u64 *in6, u64 *in7)
+{
+        struct exit_ctl_data *p;
+        p = kvm_get_exit_data(vcpu);
+        if (p && p->exit_reason == EXIT_REASON_SAL_CALL) {
+                *in0 = p->u.sal_data.in0;
+                *in1 = p->u.sal_data.in1;
+                *in2 = p->u.sal_data.in2;
+                *in3 = p->u.sal_data.in3;
+                *in4 = p->u.sal_data.in4;
+                *in5 = p->u.sal_data.in5;
+                *in6 = p->u.sal_data.in6;
+                *in7 = p->u.sal_data.in7;
+                return;
+        }
+        /* Not a SAL call exit: hand back a fully defined, all-zero request. */
+        *in0 = 0;
+        *in1 = 0;
+        *in2 = 0;
+        *in3 = 0;
+        *in4 = 0;
+        *in5 = 0;
+        *in6 = 0;
+        *in7 = 0;
+}
+/*
+ * Emulate the SAL call recorded in the vcpu's exit data: fetch the
+ * arguments, run them through sal_emulator() and store the outcome with
+ * set_sal_result().
+ */
+void kvm_sal_emul(struct kvm_vcpu *vcpu)
+{
+        /* arg[0] is the SAL function index, arg[1]..arg[7] are in1..in7 */
+        u64 arg[8];
+        struct sal_ret_values ret;
+        kvm_get_sal_call_data(vcpu, &arg[0], &arg[1], &arg[2], &arg[3],
+                        &arg[4], &arg[5], &arg[6], &arg[7]);
+        ret = sal_emulator(vcpu->kvm, arg[0], arg[1], arg[2], arg[3],
+                                        arg[4], arg[5], arg[6], arg[7]);
+        set_sal_result(vcpu, ret);
+}
diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
new file mode 100644
index 00000000000..13980d9b8bc
--- /dev/null
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -0,0 +1,273 @@
+/*
+ *  kvm_minstate.h: min save macros
+ *  Copyright (c) 2007, Intel Corporation.
+ *
+ *  Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
+ *  Xiantao Zhang (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <asm/asmmacro.h>
+#include <asm/types.h>
+#include <asm/kregs.h>
+#include "asm-offsets.h"
+/*
+ * KVM_MINSTATE_START_SAVE_MIN: switch the register backing store.
+ *
+ * Expects r1 to hold the base address that VMM_RBS_OFFSET and
+ * IA64_STK_OFFSET are relative to.  On exit:
+ *   r28 = ar.rnat, r23 = previous ar.bspstore,
+ *   r22 = r1(entry) + VMM_RBS_OFFSET, now installed in ar.bspstore,
+ *   r1  = r1(entry) + IA64_STK_OFFSET - VMM_PT_REGS_SIZE,
+ *   r18 = ar.bsp read after the switch,
+ *   ar.rsc = 0x3.
+ */
+#define KVM_MINSTATE_START_SAVE_MIN                                             \
+        mov ar.rsc = 0;/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */\
+        ;;                                                                      \
+        mov.m r28 = ar.rnat;                                                    \
+        addl r22 = VMM_RBS_OFFSET,r1;            /* compute base of RBS */      \
+        ;;                                                                      \
+        lfetch.fault.excl.nt1 [r22];                                            \
+        addl r1 = IA64_STK_OFFSET-VMM_PT_REGS_SIZE,r1;  /* compute base of memory stack */  \
+        mov r23 = ar.bspstore;                  /* save ar.bspstore */          \
+        ;;                                                                      \
+        mov ar.bspstore = r22;                          /* switch to kernel RBS */\
+        ;;                                                                      \
+        mov r18 = ar.bsp;                                                       \
+        mov ar.rsc = 0x3;     /* set eager mode, pl 0, little-endian, loadrs=0 */
+/*
+ * KVM_MINSTATE_END_SAVE_MIN: final step of KVM_DO_SAVE_MIN; issues bsw.1
+ * to switch back to register bank 1, followed by a stop.
+ */
+#define KVM_MINSTATE_END_SAVE_MIN                                               \
+        bsw.1;          /* switch back to bank 1 (must be last in insn group) */\
+        ;;
+/*
+ * PAL_VSA_SYNC_READ: branch to the PAL VPS sync_read service.
+ *
+ * Uses r21 as the base for VMM_VPD_BASE_OFFSET and
+ * VMM_VCPU_VSA_BASE_OFFSET.  Loads the vpd base into r25 and the VSA base
+ * into r20, adds PAL_VPS_SYNC_READ to form the entry point, and branches
+ * to it through b0.  r24 is set to ip + 0x20, i.e. the address just past
+ * the branch bundle - presumably the return address the service jumps back
+ * to (TODO confirm against the PAL VPS calling convention).
+ * Clobbers r20, r24, r25 and b0.
+ */
+#define PAL_VSA_SYNC_READ                                               \
+        /* begin to call pal vps sync_read */                           \
+        add r25 = VMM_VPD_BASE_OFFSET, r21;                             \
+        adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21;  /* entry point */    \
+        ;;                                                              \
+        ld8 r25 = [r25];      /* read vpd base */                       \
+        ld8 r20 = [r20];                                                \
+        ;;                                                              \
+        add r20 = PAL_VPS_SYNC_READ,r20;                                \
+        ;;                                                              \
+{ .mii;                                                                 \
+        nop 0x0;                                                        \
+        mov r24 = ip;                                                   \
+        mov b0 = r20;                                                   \
+        ;;                                                              \
+};                                                                      \
+{ .mmb;                                                                 \
+        add r24 = 0x20, r24;                                            \
+        nop 0x0;                                                        \
+        br.cond.sptk b0;        /*  call the service */                 \
+        ;;                                                              \
+};
+/*
+ * KVM_MINSTATE_GET_CURRENT(reg): copy r21 into reg.  The macros in this
+ * file apply VMM_* / VMM_VCPU_* offsets to r21, so it appears to hold the
+ * base of the current vcpu's VMM data (NOTE(review): confirm).
+ */
+#define KVM_MINSTATE_GET_CURRENT(reg)   mov reg=r21
+/*
+ * KVM_DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
+ * the minimum state necessary that allows us to turn psr.ic back
+ * on.
+ *
+ * Assumed state upon entry:
+ *  psr.ic: off
+ *  r31:        contains saved predicates (pr)
+ *
+ * Upon exit, the state is as follows:
+ *  psr.ic: off
+ *   r2 = points to &pt_regs.r16
+ *   r8 = contents of ar.ccv
+ *   r9 = contents of ar.csd
+ *  r10 = contents of ar.ssd
+ *  r11 = FPSR_DEFAULT
+ *  r12 = kernel sp (kernel virtual address)
+ *  r13 = points to current task_struct (kernel virtual address)
+ *  p15 = TRUE if psr.i is set in cr.ipsr
+ *  predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
+ *        preserved
+ *
+ * Note that psr.ic is NOT turned on by this macro.  This is so that
+ * we can pass interruption state as arguments to a handler.
+ */
+/* PT(f): expands to VMM_PT_REGS_<f>_OFFSET, the offset constant for field f. */
+#define PT(f) (VMM_PT_REGS_##f##_OFFSET)
+/*
+ * KVM_DO_SAVE_MIN(COVER, SAVE_IFS, EXTRA)
+ *
+ * The three parameters are instruction fragments pasted into the sequence:
+ *   COVER    - placed after the initial register reads; the users in this
+ *              file pass either "cover" or nothing.
+ *   SAVE_IFS - must leave in r30 the value to be stored as cr.ifs; the
+ *              users pass "mov r30 = cr.ifs" or "mov r30 = r0".
+ *   EXTRA    - optional instruction executed after the minimal state has
+ *              been stored (e.g. "mov r15 = r19").
+ *
+ * Besides filling pt_regs, the macro stores cr.iipa and cr.isr at
+ * VMM_VCPU_IIPA_OFFSET / VMM_VCPU_ISR_OFFSET relative to r13, loads r1
+ * from VMM_VCPU_GP_OFFSET(r13), and finishes with PAL_VSA_SYNC_READ and
+ * KVM_MINSTATE_END_SAVE_MIN.
+ * NOTE(review): r13 is loaded from r21 and used with VMM_VCPU_* offsets,
+ * so here it looks like the vcpu base rather than a task_struct - confirm.
+ */
+#define KVM_DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                   \
+        KVM_MINSTATE_GET_CURRENT(r16);  /* M (or M;;I) */       \
+        mov r27 = ar.rsc;         /* M */                       \
+        mov r20 = r1;         /* A */                           \
+        mov r25 = ar.unat;        /* M */                       \
+        mov r29 = cr.ipsr;        /* M */                       \
+        mov r26 = ar.pfs;         /* I */                       \
+        mov r18 = cr.isr;                                       \
+        COVER;              /* B;; (or nothing) */              \
+        ;;                                                      \
+        tbit.z p0,p15 = r29,IA64_PSR_I_BIT;                     \
+        mov r1 = r16;                                           \
+/*      mov r21=r16;    */                                      \
+        /* switch from user to kernel RBS: */                   \
+        ;;                                                      \
+        invala;             /* M */                             \
+        SAVE_IFS;                                               \
+        ;;                                                      \
+        KVM_MINSTATE_START_SAVE_MIN                             \
+        adds r17 = 2*L1_CACHE_BYTES,r1;/* cache-line size */    \
+        adds r16 = PT(CR_IPSR),r1;                              \
+        ;;                                                      \
+        lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;             \
+        st8 [r16] = r29;      /* save cr.ipsr */                \
+        ;;                                                      \
+        lfetch.fault.excl.nt1 [r17];                            \
+        tbit.nz p15,p0 = r29,IA64_PSR_I_BIT;                    \
+        mov r29 = b0                                            \
+        ;;                                                      \
+        adds r16 = PT(R8),r1; /* initialize first base pointer */\
+        adds r17 = PT(R9),r1; /* initialize second base pointer */\
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r8,16;                       \
+.mem.offset 8,0; st8.spill [r17] = r9,16;                       \
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r10,24;                      \
+.mem.offset 8,0; st8.spill [r17] = r11,24;                      \
+        ;;                                                      \
+        mov r9 = cr.iip;         /* M */                        \
+        mov r10 = ar.fpsr;        /* M */                       \
+        ;;                                                      \
+        st8 [r16] = r9,16;    /* save cr.iip */                 \
+        st8 [r17] = r30,16;   /* save cr.ifs */                 \
+        sub r18 = r18,r22;    /* r18=RSE.ndirty*8 */            \
+        ;;                                                      \
+        st8 [r16] = r25,16;   /* save ar.unat */                \
+        st8 [r17] = r26,16;    /* save ar.pfs */                \
+        shl r18 = r18,16;     /* calu ar.rsc used for "loadrs" */\
+        ;;                                                      \
+        st8 [r16] = r27,16;   /* save ar.rsc */                 \
+        st8 [r17] = r28,16;   /* save ar.rnat */                \
+        ;;          /* avoid RAW on r16 & r17 */                \
+        st8 [r16] = r23,16;   /* save ar.bspstore */            \
+        st8 [r17] = r31,16;   /* save predicates */             \
+        ;;                                                      \
+        st8 [r16] = r29,16;   /* save b0 */                     \
+        st8 [r17] = r18,16;   /* save ar.rsc value for "loadrs" */\
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r20,16;/* save original r1 */  \
+.mem.offset 8,0; st8.spill [r17] = r12,16;                      \
+        adds r12 = -16,r1;    /* switch to kernel memory stack */  \
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r13,16;                      \
+.mem.offset 8,0; st8.spill [r17] = r10,16;      /* save ar.fpsr */\
+        mov r13 = r21;   /* establish `current' */              \
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r15,16;                      \
+.mem.offset 8,0; st8.spill [r17] = r14,16;                      \
+        ;;                                                      \
+.mem.offset 0,0; st8.spill [r16] = r2,16;                       \
+.mem.offset 8,0; st8.spill [r17] = r3,16;                       \
+        adds r2 = VMM_PT_REGS_R16_OFFSET,r1;                    \
+         ;;                                                     \
+        adds r16 = VMM_VCPU_IIPA_OFFSET,r13;                    \
+        adds r17 = VMM_VCPU_ISR_OFFSET,r13;                     \
+        mov r26 = cr.iipa;                                      \
+        mov r27 = cr.isr;                                       \
+        ;;                                                      \
+        st8 [r16] = r26;                                        \
+        st8 [r17] = r27;                                        \
+        ;;                                                      \
+        EXTRA;                                                  \
+        mov r8 = ar.ccv;                                        \
+        mov r9 = ar.csd;                                        \
+        mov r10 = ar.ssd;                                       \
+        movl r11 = FPSR_DEFAULT;   /* L-unit */                 \
+        adds r17 = VMM_VCPU_GP_OFFSET,r13;                      \
+        ;;                                                      \
+        ld8 r1 = [r17];/* establish kernel global pointer */    \
+        ;;                                                      \
+        PAL_VSA_SYNC_READ                                       \
+        KVM_MINSTATE_END_SAVE_MIN
+/*
+ * KVM_SAVE_REST saves the remainder of pt_regs (with psr.ic on).
+ *
+ * Assumed state upon entry:
+ *  psr.ic: on
+ *  r2: points to &pt_regs.r16 (as left by KVM_DO_SAVE_MIN)
+ *  r3: points to &pt_regs.r17
+ *  r8: contents of ar.ccv
+ *  r9: contents of ar.csd
+ *  r10:        contents of ar.ssd
+ *  r11:        FPSR_DEFAULT
+ *
+ * r16-r31 are spilled through r2/r3 first; after those spills and the
+ * ar.ccv store the offset arithmetic below treats r2/r3 as &pt_regs.f6 and
+ * &pt_regs.f7.  b6, b7, ar.csd, ar.ssd and ar.unat (as eml_unat) are saved
+ * as well, and ar.fpsr is loaded from r11.  On exit r2/r3 have been moved
+ * on by PT(R4)-PT(F6) and PT(R5)-PT(F7).
+ *
+ * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
+ *
+ * The last line of the body deliberately carries no line continuation, so
+ * the macro cannot swallow whatever follows it in the file.
+ */
+#define KVM_SAVE_REST                           \
+.mem.offset 0,0; st8.spill [r2] = r16,16;       \
+.mem.offset 8,0; st8.spill [r3] = r17,16;       \
+        ;;                              \
+.mem.offset 0,0; st8.spill [r2] = r18,16;       \
+.mem.offset 8,0; st8.spill [r3] = r19,16;       \
+        ;;                              \
+.mem.offset 0,0; st8.spill [r2] = r20,16;       \
+.mem.offset 8,0; st8.spill [r3] = r21,16;       \
+        mov r18=b6;                     \
+        ;;                              \
+.mem.offset 0,0; st8.spill [r2] = r22,16;       \
+.mem.offset 8,0; st8.spill [r3] = r23,16;       \
+        mov r19 = b7;                           \
+        ;;                                      \
+.mem.offset 0,0; st8.spill [r2] = r24,16;       \
+.mem.offset 8,0; st8.spill [r3] = r25,16;       \
+        ;;                                      \
+.mem.offset 0,0; st8.spill [r2] = r26,16;       \
+.mem.offset 8,0; st8.spill [r3] = r27,16;       \
+        ;;                                      \
+.mem.offset 0,0; st8.spill [r2] = r28,16;       \
+.mem.offset 8,0; st8.spill [r3] = r29,16;       \
+        ;;                                      \
+.mem.offset 0,0; st8.spill [r2] = r30,16;       \
+.mem.offset 8,0; st8.spill [r3] = r31,32;       \
+        ;;                                      \
+        mov ar.fpsr = r11;                      \
+        st8 [r2] = r8,8;                        \
+        adds r24 = PT(B6)-PT(F7),r3;            \
+        adds r25 = PT(B7)-PT(F7),r3;            \
+        ;;                                      \
+        st8 [r24] = r18,16;       /* b6 */      \
+        st8 [r25] = r19,16;       /* b7 */      \
+        adds r2 = PT(R4)-PT(F6),r2;             \
+        adds r3 = PT(R5)-PT(F7),r3;             \
+        ;;                                      \
+        st8 [r24] = r9; /* ar.csd */            \
+        st8 [r25] = r10;        /* ar.ssd */    \
+        ;;                                      \
+        mov r18 = ar.unat;                      \
+        adds r19 = PT(EML_UNAT)-PT(R4),r2;      \
+        ;;                                      \
+        st8 [r19] = r18; /* eml_unat */
+/*
+ * KVM_SAVE_EXTRA: spill r4-r7 through r2/r3 (r4 and r6 via r2, r5 and r7
+ * via r3), then store ar.unat at the slot r2 ends up pointing to
+ * (eml_unat).  Clobbers r26 and advances r2/r3.
+ *
+ * The last line of the body deliberately carries no line continuation, so
+ * the macro cannot swallow whatever follows it in the file.
+ */
+#define KVM_SAVE_EXTRA                          \
+.mem.offset 0,0; st8.spill [r2] = r4,16;        \
+.mem.offset 8,0; st8.spill [r3] = r5,16;        \
+        ;;                                      \
+.mem.offset 0,0; st8.spill [r2] = r6,16;        \
+.mem.offset 8,0; st8.spill [r3] = r7;           \
+        ;;                                      \
+        mov r26 = ar.unat;                      \
+        ;;                                      \
+        st8 [r2] = r26;/* eml_unat */
+/*
+ * Ready-made KVM_DO_SAVE_MIN variants:
+ *   KVM_SAVE_MIN_WITH_COVER     - issues "cover" and saves cr.ifs in r30.
+ *   KVM_SAVE_MIN_WITH_COVER_R19 - same, and additionally copies r19 to r15.
+ *   KVM_SAVE_MIN                - no "cover"; r30 (the saved cr.ifs value)
+ *                                 is set to r0.
+ */
+#define KVM_SAVE_MIN_WITH_COVER         KVM_DO_SAVE_MIN(cover, mov r30 = cr.ifs,)
+#define KVM_SAVE_MIN_WITH_COVER_R19     KVM_DO_SAVE_MIN(cover, mov r30 = cr.ifs, mov r15 = r19)
+#define KVM_SAVE_MIN                    KVM_DO_SAVE_MIN(     , mov r30 = r0, )
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
new file mode 100644
index 00000000000..6d6cbcb1489
--- /dev/null
+++ b/arch/ia64/kvm/lapic.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_IA64_LAPIC_H
+#define __KVM_IA64_LAPIC_H
+#include <linux/kvm_host.h>
+/*
+ * vlsapic: per-vcpu virtual local SAPIC state.
+ */
+struct kvm_lapic{
+        struct kvm_vcpu *vcpu;          /* back-pointer to the owning vcpu */
+        uint64_t insvc[4];              /* 4 x 64 bits; presumably the in-service bitmap, one bit per vector - TODO confirm */
+        uint64_t vhpi;                  /* presumably the virtual highest pending interrupt - TODO confirm */
+        uint8_t xtp;                    /* XTP register value */
+        uint8_t pal_init_pending;       /* NOTE(review): looks like a flag for a pending PAL INIT - confirm */
+        uint8_t pad[2];                 /* explicit padding */
+};
+/*
+ * Virtual LSAPIC interface; the implementations are not in this header.
+ * NOTE(review): the descriptions below are inferred from the names only -
+ * confirm against the definitions.
+ */
+/* Presumably allocates/initialises the vcpu's kvm_lapic; returns an int status. */
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+/* Presumably releases what kvm_create_lapic() set up. */
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+/* Presumably tests whether 'dest' addresses this apic in physical mode. */
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+/* Presumably tests whether 'mda' addresses this apic in logical mode. */
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+/* Presumably delivers vector 'vec' with trigger mode 'trig' to the vcpu. */
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+#endif
diff --git a/arch/ia64/kvm/misc.h b/arch/ia64/kvm/misc.h
new file mode 100644
index 00000000000..e585c460734
--- /dev/null
+++ b/arch/ia64/kvm/misc.h
@@ -0,0 +1,93 @@
+#ifndef __KVM_IA64_MISC_H
+#define __KVM_IA64_MISC_H
+#include <linux/kvm_host.h>
+/*
+ * misc.h
+ *      Copyright (C) 2007, Intel Corporation.
+ *      Xiantao Zhang  (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+/*
+ * Return the host-side address of the p2m table, which sits KVM_P2M_OFS
+ * bytes above the VM's base address.
+ */
+static inline uint64_t *kvm_host_get_pmt(struct kvm *kvm)
+{
+        uint64_t *p2m = (uint64_t *)(kvm->arch.vm_base + KVM_P2M_OFS);
+        return p2m;
+}
+/*
+ * Write the p2m entry for guest frame 'gfn': PAGE_ALIGN(paddr) combined
+ * with 'mem_flags'.
+ * NOTE(review): PAGE_ALIGN() conventionally rounds up, so this looks like
+ * it relies on callers passing an already page-aligned paddr - confirm.
+ */
+static inline void kvm_set_pmt_entry(struct kvm *kvm, gfn_t gfn,
+                u64 paddr, u64 mem_flags)
+{
+        unsigned long entry = PAGE_ALIGN(paddr) | mem_flags;
+        uint64_t *slot = kvm_host_get_pmt(kvm) + gfn;
+        *slot = entry;
+}
+/*
+ * Translate a host address inside the VM data area to the matching guest
+ * address: rebase it from kvm->arch.vm_base onto KVM_VM_DATA_BASE.
+ */
+static inline void *to_guest(struct kvm *kvm, void *addr)
+{
+        unsigned long host_addr = (unsigned long)(addr);
+        return (void *)(host_addr - kvm->arch.vm_base + KVM_VM_DATA_BASE);
+}
+/*
+ * Translate a guest address inside the VM data area to the matching host
+ * address: rebase it from KVM_VM_DATA_BASE onto kvm->arch.vm_base.
+ */
+static inline void *to_host(struct kvm *kvm, void *addr)
+{
+        unsigned long guest_addr = (unsigned long)addr;
+        return (void *)(guest_addr - KVM_VM_DATA_BASE + kvm->arch.vm_base);
+}
+/* Return the guest-side address of the vcpu's saved host context. */
+static inline union context *kvm_get_host_context(struct kvm_vcpu *vcpu)
+{
+        return to_guest(vcpu->kvm, &vcpu->arch.host);
+}
+/* Return the guest-side address of the vcpu's saved guest context. */
+static inline union context *kvm_get_guest_context(struct kvm_vcpu *vcpu)
+{
+        return to_guest(vcpu->kvm, &vcpu->arch.guest);
+}
+/* Return the exit control data embedded in the vcpu (never NULL). */
+static inline struct exit_ctl_data *kvm_get_exit_data(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *exit_data = &vcpu->arch.exit_data;
+        return exit_data;
+}
+/*
+ * Return the vcpu's pending MMIO request, or NULL when 'vcpu' is NULL or
+ * the last exit was not caused by an MMIO instruction.
+ */
+static inline struct kvm_mmio_req *kvm_get_vcpu_ioreq(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *exit_data;
+        if (!vcpu)
+                return NULL;
+        exit_data = kvm_get_exit_data(vcpu);
+        if (exit_data->exit_reason != EXIT_REASON_MMIO_INSTRUCTION)
+                return NULL;
+        return &exit_data->u.ioreq;
+}
+#endif
diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
new file mode 100644
index 00000000000..351bf70da46
--- /dev/null
+++ b/arch/ia64/kvm/mmio.c
@@ -0,0 +1,341 @@
+/*
+ * mmio.c: MMIO emulation components.
+ * Copyright (c) 2004, Intel Corporation.
+ *  Yaozu Dong (Eddie Dong) (Eddie.dong@intel.com)
+ *  Kun Tian (Kevin Tian) (Kevin.tian@intel.com)
+ *
+ * Copyright (c) 2007 Intel Corporation  KVM support.
+ * Xuefei Xu (Anthony Xu) (anthony.xu@intel.com)
+ * Xiantao Zhang  (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <linux/kvm_host.h>
+#include "vcpu.h"
+/* Store 'val' as the new XTP value of vcpu 'v'. */
+static void vlsapic_write_xtp(struct kvm_vcpu *v, uint8_t val)
+{
+        VLSAPIC_XTP(v) = val;
+}
+/*
+ * LSAPIC offsets inside the processor interrupt block (PIB).
+ *
+ * PIB_LOW_HALF(ofst) is non-zero when bit 20 of 'ofst' is clear, i.e. the
+ * offset lies in the lower half of the block.  The argument and the whole
+ * expansion are parenthesised so the macro can be used with arbitrary
+ * expressions and inside larger expressions.
+ */
+#define PIB_LOW_HALF(ofst)     (!((ofst) & (1 << 20)))
+#define PIB_OFST_INTA          0x1E0000
+#define PIB_OFST_XTP           0x1E0008
+/*
+ * Execute a write-IPI operation.
+ *
+ * Records an EXIT_REASON_IPI request (address and data) in the exit data
+ * of current_vcpu and calls vmm_transition(), with local interrupts
+ * disabled around the whole sequence.
+ *
+ * NOTE(review): the 'vcpu' argument is not used; everything goes through
+ * current_vcpu.  Presumably the caller always passes the current vcpu -
+ * confirm.
+ */
+static void vlsapic_write_ipi(struct kvm_vcpu *vcpu,
+                                        uint64_t addr, uint64_t data)
+{
+        struct exit_ctl_data *p = &current_vcpu->arch.exit_data;
+        unsigned long psr;
+        local_irq_save(psr);
+        p->exit_reason = EXIT_REASON_IPI;
+        p->u.ipi_data.addr.val = addr;
+        p->u.ipi_data.data.val = data;
+        vmm_transition(current_vcpu);
+        local_irq_restore(psr);
+}
+/*
+ * Handle a guest store of 'length' bytes to the processor interrupt block.
+ *
+ * The address is reduced to an offset within the block.  Accepted stores
+ * are a 1-byte write to XTP and an 8-byte write to the lower half (which
+ * sends an IPI); every other access ends in panic_vm().
+ */
+void lsapic_write(struct kvm_vcpu *v, unsigned long addr,
+                        unsigned long length, unsigned long val)
+{
+        unsigned long ofst = addr & (PIB_SIZE - 1);
+        if (ofst == PIB_OFST_INTA) {
+                /* Writes to INTA are undefined. */
+                panic_vm(v);
+        } else if (ofst == PIB_OFST_XTP) {
+                /* XTP only accepts single-byte stores. */
+                if (length != 1)
+                        panic_vm(v);
+                else
+                        vlsapic_write_xtp(v, val);
+        } else if (PIB_LOW_HALF(ofst) && length == 8) {
+                /* An 8-byte store to the lower half is an IPI request. */
+                vlsapic_write_ipi(v, ofst, val);
+        } else {
+                /* Wrong-sized lower-half store, or any upper-half store. */
+                panic_vm(v);
+        }
+}
+/*
+ * Handle a guest load of 'length' bytes from the processor interrupt block.
+ *
+ * A 1-byte read of XTP returns the vcpu's XTP value.  A 1-byte read of INTA
+ * returns 0 (there is no i8259, hence no INTA access).  Every other access
+ * ends in panic_vm(), and 0 is returned if that call comes back.
+ */
+unsigned long lsapic_read(struct kvm_vcpu *v, unsigned long addr,
+                unsigned long length)
+{
+        unsigned long ofst = addr & (PIB_SIZE - 1);
+        uint64_t value = 0;
+        if (ofst == PIB_OFST_XTP && length == 1)
+                value = VLSAPIC_XTP(v);
+        else if (ofst == PIB_OFST_INTA && length == 1)
+                ; /* nothing to read: leave the result at 0 */
+        else
+                panic_vm(v);
+        return value;
+}
+/*
+ * Perform one emulated MMIO access of size 's' at guest physical address
+ * 'src_pa'.
+ *
+ * Accesses that fall into the PIB range are served directly by
+ * lsapic_read()/lsapic_write().  Anything else is packaged as an
+ * EXIT_REASON_MMIO_INSTRUCTION request and handed over with
+ * vmm_transition(); if the request does not come back in state
+ * STATE_IORESP_READY the VM is panicked.
+ *
+ * @dest: value to store for writes; receives the loaded value for reads.
+ * @dir:  non-zero for a read, zero for a write.
+ * @ma:   not used by this function.
+ *
+ * Local interrupts are disabled for the duration of the access.
+ */
+static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest,
+                                        u16 s, int ma, int dir)
+{
+        struct exit_ctl_data *p = &vcpu->arch.exit_data;
+        unsigned long iot;
+        unsigned long psr;
+        iot = __gpfn_is_io(src_pa >> PAGE_SHIFT);
+        local_irq_save(psr);
+        if (iot == GPFN_PIB) {
+                /* Intercept the access to the PIB range */
+                if (dir)
+                        *dest = lsapic_read(vcpu, src_pa, s);
+                else
+                        lsapic_write(vcpu, src_pa, s, *dest);
+        } else {
+                p->exit_reason = EXIT_REASON_MMIO_INSTRUCTION;
+                p->u.ioreq.addr = src_pa;
+                p->u.ioreq.size = s;
+                p->u.ioreq.dir = dir;
+                if (dir == IOREQ_WRITE)
+                        p->u.ioreq.data = *dest;
+                p->u.ioreq.state = STATE_IOREQ_READY;
+                vmm_transition(vcpu);
+                if (p->u.ioreq.state != STATE_IORESP_READY)
+                        panic_vm(vcpu);
+                else if (dir == IOREQ_READ)
+                        *dest = p->u.ioreq.data;
+        }
+        local_irq_restore(psr);
+}
+/*
+ * Conventions used by the I/O instruction emulation:
+ *   dir:       1 = read, 0 = write
+ *   inst_type: 0 = integer, 1 = floating point
+ */
+#define SL_INTEGER      0       /* integer load/store */
+#define SL_FLOATING     1       /* floating-point load/store */
+/*
+ * emulate_io_inst - emulate the guest load/store that faulted on MMIO.
+ * @vcpu: vcpu whose current instruction (regs->cr_iip, slot ipsr.ri) is emulated
+ * @padr: address passed to mmio_access() for the data transfer
+ * @ma:   forwarded unchanged to mmio_access()
+ *
+ * Decodes the instruction, applies any base-register post-increment, performs
+ * the transfer through mmio_access(), writes a loaded integer back to the
+ * target register and finally advances the guest IP.  If the bundle cannot be
+ * fetched the function returns without side effects so the access is retried.
+ * Unsupported encodings and slot numbers end in panic_vm().
+ */
+void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
+{
+        struct kvm_pt_regs *regs;
+        IA64_BUNDLE bundle;
+        int slot, dir = 0;
+        int inst_type = -1;
+        u16 size = 0;
+        /*
+         * data starts at zero: some decode paths below never assign it, and
+         * its address is always handed to mmio_access().  Presumably a load
+         * narrower than 8 bytes fills only part of it -- TODO confirm.
+         */
+        u64 data = 0, slot1a, slot1b, temp, update_reg;
+        s32 imm;
+        INST64 inst;
+        regs = vcpu_regs(vcpu);
+        if (fetch_code(vcpu, regs->cr_iip, &bundle)) {
+                /* if fetch code fail, return and try again */
+                return;
+        }
+        slot = ((struct ia64_psr *)&(regs->cr_ipsr))->ri;
+        if (!slot) {
+                inst.inst = bundle.slot0;
+        } else if (slot == 1) {
+                /* slot 1 is kept in two bundle fields; glue them together */
+                slot1a = bundle.slot1a;
+                slot1b = bundle.slot1b;
+                inst.inst = slot1a + (slot1b << 18);
+        } else if (slot == 2) {
+                inst.inst = bundle.slot2;
+        } else {
+                /* no such slot: inst would otherwise be decoded uninitialised */
+                panic_vm(vcpu);
+                return;
+        }
+        /* Integer Load/Store */
+        if (inst.M1.major == 4 && inst.M1.m == 0 && inst.M1.x == 0) {
+                inst_type = SL_INTEGER;
+                size = (inst.M1.x6 & 0x3);
+                if ((inst.M1.x6 >> 2) > 0xb) {
+                        /*write*/
+                        dir = IOREQ_WRITE;
+                        data = vcpu_get_gr(vcpu, inst.M4.r2);
+                } else if ((inst.M1.x6 >> 2) < 0xb) {
+                        /*read*/
+                        dir = IOREQ_READ;
+                }
+                /*
+                 * NOTE(review): (x6 >> 2) == 0xb matches neither arm, so dir
+                 * keeps its initial 0 and data its zero initialiser.
+                 */
+        } else if (inst.M2.major == 4 && inst.M2.m == 1 && inst.M2.x == 0) {
+                /* Integer Load + Reg update */
+                inst_type = SL_INTEGER;
+                dir = IOREQ_READ;
+                size = (inst.M2.x6 & 0x3);
+                temp = vcpu_get_gr(vcpu, inst.M2.r3);
+                update_reg = vcpu_get_gr(vcpu, inst.M2.r2);
+                temp += update_reg;
+                vcpu_set_gr(vcpu, inst.M2.r3, temp, 0);
+        } else if (inst.M3.major == 5) {
+                /*Integer Load/Store + Imm update*/
+                inst_type = SL_INTEGER;
+                size = (inst.M3.x6&0x3);
+                if ((inst.M5.x6 >> 2) > 0xb) {
+                        /*write*/
+                        dir = IOREQ_WRITE;
+                        data = vcpu_get_gr(vcpu, inst.M5.r2);
+                        temp = vcpu_get_gr(vcpu, inst.M5.r3);
+                        /*
+                         * Pack s:i:imm7 into the top 9 bits of a 32-bit
+                         * value; the arithmetic shift right by 23 then
+                         * sign-extends the 9-bit immediate.
+                         */
+                        imm = (inst.M5.s << 31) | (inst.M5.i << 30) |
+                                (inst.M5.imm7 << 23);
+                        temp += imm >> 23;
+                        vcpu_set_gr(vcpu, inst.M5.r3, temp, 0);
+                } else if ((inst.M3.x6 >> 2) < 0xb) {
+                        /*read*/
+                        dir = IOREQ_READ;
+                        temp = vcpu_get_gr(vcpu, inst.M3.r3);
+                        imm = (inst.M3.s << 31) | (inst.M3.i << 30) |
+                                (inst.M3.imm7 << 23);
+                        temp += imm >> 23;
+                        vcpu_set_gr(vcpu, inst.M3.r3, temp, 0);
+                }
+        } else if (inst.M9.major == 6 && inst.M9.x6 == 0x3B
+                                && inst.M9.m == 0 && inst.M9.x == 0) {
+                /* Floating-point spill*/
+                struct ia64_fpreg v;
+                inst_type = SL_FLOATING;
+                dir = IOREQ_WRITE;
+                vcpu_get_fpreg(vcpu, inst.M9.f2, &v);
+                /* Write high word. FIXME: this is a kludge!  */
+                v.u.bits[1] &= 0x3ffff;
+                mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE);
+                /* low word goes out through the common tail (1 << 3 bytes) */
+                data = v.u.bits[0];
+                size = 3;
+        } else if (inst.M10.major == 7 && inst.M10.x6 == 0x3B) {
+                /* Floating-point spill + Imm update */
+                struct ia64_fpreg v;
+                inst_type = SL_FLOATING;
+                dir = IOREQ_WRITE;
+                vcpu_get_fpreg(vcpu, inst.M10.f2, &v);
+                temp = vcpu_get_gr(vcpu, inst.M10.r3);
+                imm = (inst.M10.s << 31) | (inst.M10.i << 30) |
+                        (inst.M10.imm7 << 23);
+                temp += imm >> 23;
+                vcpu_set_gr(vcpu, inst.M10.r3, temp, 0);
+                /* Write high word.FIXME: this is a kludge!  */
+                v.u.bits[1] &= 0x3ffff;
+                mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE);
+                data = v.u.bits[0];
+                size = 3;
+        } else if (inst.M10.major == 7 && inst.M10.x6 == 0x31) {
+                /* Floating-point stf8 + Imm update */
+                struct ia64_fpreg v;
+                inst_type = SL_FLOATING;
+                dir = IOREQ_WRITE;
+                size = 3;
+                vcpu_get_fpreg(vcpu, inst.M10.f2, &v);
+                data = v.u.bits[0]; /* Significand.  */
+                temp = vcpu_get_gr(vcpu, inst.M10.r3);
+                imm = (inst.M10.s << 31) | (inst.M10.i << 30) |
+                        (inst.M10.imm7 << 23);
+                temp += imm >> 23;
+                vcpu_set_gr(vcpu, inst.M10.r3, temp, 0);
+        } else if (inst.M15.major == 7 && inst.M15.x6 >= 0x2c
+                        && inst.M15.x6 <= 0x2f) {
+                /* M15 form: only the base update is applied, no transfer */
+                temp = vcpu_get_gr(vcpu, inst.M15.r3);
+                imm = (inst.M15.s << 31) | (inst.M15.i << 30) |
+                        (inst.M15.imm7 << 23);
+                temp += imm >> 23;
+                vcpu_set_gr(vcpu, inst.M15.r3, temp, 0);
+                vcpu_increment_iip(vcpu);
+                return;
+        } else if (inst.M12.major == 6 && inst.M12.m == 1
+                        && inst.M12.x == 1 && inst.M12.x6 == 1) {
+                /* Floating-point Load Pair + Imm ldfp8 M12*/
+                struct ia64_fpreg v;
+                inst_type = SL_FLOATING;
+                dir = IOREQ_READ;
+                size = 8;     /*ldfd*/
+                mmio_access(vcpu, padr, &data, size, ma, dir);
+                v.u.bits[0] = data;
+                v.u.bits[1] = 0x1003E;
+                vcpu_set_fpreg(vcpu, inst.M12.f1, &v);
+                padr += 8;
+                mmio_access(vcpu, padr, &data, size, ma, dir);
+                v.u.bits[0] = data;
+                v.u.bits[1] = 0x1003E;
+                vcpu_set_fpreg(vcpu, inst.M12.f2, &v);
+                padr += 8;
+                /*
+                 * NOTE(review): r3 is written with padr + 16 rather than with
+                 * the previous r3 value + 16 -- confirm this is intended.
+                 */
+                vcpu_set_gr(vcpu, inst.M12.r3, padr, 0);
+                vcpu_increment_iip(vcpu);
+                return;
+        } else {
+                /* unsupported encoding: do not fall into the MMIO tail */
+                panic_vm(vcpu);
+                return;
+        }
+        /* size held log2(bytes) up to here */
+        size = 1 << size;
+        mmio_access(vcpu, padr, &data, size, ma, dir);
+        if (dir != IOREQ_WRITE) {
+                /* only integer loads reach this point with a usable target */
+                if (inst_type == SL_INTEGER)
+                        vcpu_set_gr(vcpu, inst.M1.r1, data, 0);
+                else
+                        panic_vm(vcpu);
+        }
+        vcpu_increment_iip(vcpu);
+}
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
new file mode 100644
index 00000000000..e4f15d641b2
--- /dev/null
+++ b/arch/ia64/kvm/optvfault.S
@@ -0,0 +1,918 @@
+/*
+ * arch/ia64/kvm/optvfault.S
+ * optimize virtualization fault handler
+ *
+ * Copyright (C) 2006 Intel Co
+ *      Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ */
+#include <asm/asmmacro.h>
+#include <asm/processor.h>
+#include "vti.h"
+#include "asm-offsets.h"
+/*
+ * Each ACCE_* macro enables the accelerated handler guarded by the matching
+ * #ifndef in this file; with the macro undefined, that handler starts with a
+ * branch to kvm_virtualization_fault_back.
+ */
+#define ACCE_MOV_FROM_AR
+#define ACCE_MOV_FROM_RR
+#define ACCE_MOV_TO_RR
+#define ACCE_RSM
+#define ACCE_SSM
+#define ACCE_MOV_TO_PSR
+#define ACCE_THASH
+/*
+ * mov r1=ar3
+ *
+ * Fast path that adds ar.itc to the value stored at VMM_VCPU_ITC_OFS_OFFSET
+ * in the vcpu (r21), records the sum at VMM_VCPU_LAST_ITC_OFFSET and hands it
+ * to the target register whose number sits in bits 6..12 of the opcode (r25).
+ * The write goes through the asm_mov_to_reg table (r19 = value,
+ * r30 = continuation), which continues at kvm_resume_to_guest; the incoming
+ * b0 is kept in r24.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_from_ar)
+#ifndef ACCE_MOV_FROM_AR
+        br.many kvm_virtualization_fault_back
+#endif
+        add r18=VMM_VCPU_ITC_OFS_OFFSET, r21
+        add r16=VMM_VCPU_LAST_ITC_OFFSET,r21
+        extr.u r17=r25,6,7              // target register number
+        ;;
+        ld8 r18=[r18]
+        mov r19=ar.itc
+        mov r24=b0                      // keep b0 for kvm_resume_to_guest
+        ;;
+        add r19=r19,r18                 // ar.itc + stored offset
+        addl r20=@gprel(asm_mov_to_reg),gp
+        ;;
+        st8 [r16] = r19
+        adds r30=kvm_resume_to_guest-asm_mov_to_reg,r20 // continuation
+        shladd r17=r17,4,r20            // table entry: base + 16 * regnum
+        ;;
+        mov b0=r17
+        br.sptk.few b0
+        ;;
+END(kvm_asm_mov_from_ar)
+/*
+ * mov r1=rr[r3]
+ *
+ * Reads guest r3 through asm_mov_from_reg, uses its top three bits as the
+ * index into the vcpu array at VMM_VCPU_VRR0_OFFSET (8 bytes per entry) and
+ * writes the loaded value to guest r1 through asm_mov_to_reg, which continues
+ * at kvm_resume_to_guest.  The incoming b0 is kept in r24.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_from_rr)
+#ifndef ACCE_MOV_FROM_RR
+        br.many kvm_virtualization_fault_back
+#endif
+        extr.u r16=r25,20,7             // r3 field of the opcode
+        extr.u r17=r25,6,7              // r1 field of the opcode
+        addl r20=@gprel(asm_mov_from_reg),gp
+        ;;
+        adds r30=kvm_asm_mov_from_rr_back_1-asm_mov_from_reg,r20
+        shladd r16=r16,4,r20            // table entry that reads r3
+        mov r24=b0
+        ;;
+        add r27=VMM_VCPU_VRR0_OFFSET,r21
+        mov b0=r16
+        br.many b0
+        ;;
+kvm_asm_mov_from_rr_back_1:             // r19 = value of guest r3
+        adds r30=kvm_resume_to_guest-asm_mov_from_reg,r20
+        adds r22=asm_mov_to_reg-asm_mov_from_reg,r20
+        shr.u r26=r19,61                // array index
+        ;;
+        shladd r17=r17,4,r22            // table entry that writes r1
+        shladd r27=r26,3,r27            // address of the selected vrr slot
+        ;;
+        ld8 r19=[r27]
+        mov b0=r17
+        br.many b0
+END(kvm_asm_mov_from_rr)
+/*
+ * mov rr[r3]=r2
+ *
+ * Reads guest r3 (region number in bits 61..63) and guest r2 (the new value)
+ * through asm_mov_from_reg, stores the guest value in the vcpu array at
+ * VMM_VCPU_VRR0_OFFSET and builds the value written to the hardware region
+ * register.  Writes that target region 6 are not handled here and go to
+ * kvm_virtualization_fault_back.  The incoming b0 is parked in r22 and moved
+ * to r24 just before leaving.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_to_rr)
+#ifndef ACCE_MOV_TO_RR
+        br.many kvm_virtualization_fault_back
+#endif
+        extr.u r16=r25,20,7             // r3 field of the opcode
+        extr.u r17=r25,13,7             // r2 field of the opcode
+        addl r20=@gprel(asm_mov_from_reg),gp
+        ;;
+        adds r30=kvm_asm_mov_to_rr_back_1-asm_mov_from_reg,r20
+        shladd r16=r16,4,r20            // table entry that reads r3
+        mov r22=b0
+        ;;
+        add r27=VMM_VCPU_VRR0_OFFSET,r21
+        mov b0=r16
+        br.many b0
+        ;;
+kvm_asm_mov_to_rr_back_1:               // r19 = value of guest r3
+        adds r30=kvm_asm_mov_to_rr_back_2-asm_mov_from_reg,r20
+        shr.u r23=r19,61                // region number
+        shladd r17=r17,4,r20            // table entry that reads r2
+        ;;
+        //if rr6, go back
+        cmp.eq p6,p0=6,r23
+        mov b0=r22                      // restore b0 for the slow path
+        (p6) br.cond.dpnt.many kvm_virtualization_fault_back
+        ;;
+        mov r28=r19                     // keep r3 value for mov rr[r28]
+        mov b0=r17
+        br.many b0
+kvm_asm_mov_to_rr_back_2:               // r19 = value of guest r2
+        adds r30=kvm_resume_to_guest-asm_mov_from_reg,r20
+        shladd r27=r23,3,r27            // address of the selected vrr slot
+        ;; // vrr.rid<<4 |0xe
+        st8 [r27]=r19                   // remember the guest's value
+        mov b0=r30
+        ;;
+        extr.u r16=r19,8,26             // 26 bits starting at bit 8
+        extr.u r18 =r19,2,6             // 6 bits starting at bit 2
+        mov r17 =0xe
+        ;;
+        shladd r16 = r16, 4, r17        // (field << 4) + 0xe
+        extr.u r19 =r19,0,8             // keep the low 8 bits
+        ;;
+        shl r16 = r16,8
+        ;;
+        add r19 = r19, r16              // low byte + rewritten upper field
+        ;; //set ve 1
+        dep r19=-1,r19,0,1
+        cmp.lt p6,p0=14,r18
+        ;;
+        (p6) mov r18=14                 // clamp the 6-bit field to 14
+        ;;
+        (p6) dep r19=r18,r19,2,6
+        ;;
+        cmp.eq p6,p0=0,r23
+        ;;
+        cmp.eq.or p6,p0=4,r23           // p6 = region 0 or region 4
+        ;;
+        adds r16=VMM_VCPU_MODE_FLAGS_OFFSET,r21
+        (p6) adds r17=VMM_VCPU_META_SAVED_RR0_OFFSET,r21
+        ;;
+        ld4 r16=[r16]
+        cmp.eq p7,p0=r0,r0              // p7 = true unless cleared below
+        (p6) shladd r17=r23,1,r17       // +0 for region 0, +8 for region 4
+        ;;
+        (p6) st8 [r17]=r19              // record value in the saved slot
+        (p6) tbit.nz p6,p7=r16,0        // p7 = mode-flags bit 0 clear
+        ;;
+        (p7) mov rr[r28]=r19            // skipped for rr0/rr4 if bit 0 set
+        mov r24=r22                     // b0 for kvm_resume_to_guest
+        br.many b0
+END(kvm_asm_mov_to_rr)
+/*
+ * rsm
+ *
+ * Rebuilds the 24-bit immediate from the opcode in r25, clears those bits in
+ * the vpsr stored in the VPD and in cr.ipsr (IC, I, DT and SI are always kept
+ * in cr.ipsr).  If DT is among the cleared bits and bit 0 of the vcpu mode
+ * flags is not yet set, the flag is set and rr0/rr4 are loaded from the two
+ * quadwords at VMM_VCPU_META_RR0_OFFSET.  Leaves through kvm_resume_to_guest
+ * with the incoming b0 kept in r24.
+ */
+GLOBAL_ENTRY(kvm_asm_rsm)
+#ifndef ACCE_RSM
+        br.many kvm_virtualization_fault_back
+#endif
+        add r16=VMM_VPD_BASE_OFFSET,r21
+        extr.u r26=r25,6,21             // immediate bits 0..20
+        extr.u r27=r25,31,2             // immediate bits 21..22
+        ;;
+        ld8 r16=[r16]                   // VPD address
+        extr.u r28=r25,36,1             // immediate bit 23
+        dep r26=r27,r26,21,2
+        ;;
+        add r17=VPD_VPSR_START_OFFSET,r16
+        add r22=VMM_VCPU_MODE_FLAGS_OFFSET,r21
+        //r26 is imm24
+        dep r26=r28,r26,23,1
+        ;;
+        ld8 r18=[r17]                   // current vpsr
+        movl r28=IA64_PSR_IC+IA64_PSR_I+IA64_PSR_DT+IA64_PSR_SI
+        ld4 r23=[r22]                   // mode flags
+        sub r27=-1,r26                  // ~imm24
+        mov r24=b0
+        ;;
+        mov r20=cr.ipsr
+        or r28=r27,r28                  // ipsr mask: ~imm24 plus kept bits
+        and r19=r18,r27                 // new vpsr
+        ;;
+        st8 [r17]=r19
+        and r20=r20,r28
+        /* Commented out for lack of fp lazy algorithm support
+        adds r27=IA64_VCPU_FP_PSR_OFFSET,r21
+        ;;
+        ld8 r27=[r27]
+        ;;
+        tbit.nz p8,p0= r27,IA64_PSR_DFH_BIT
+        ;;
+        (p8) dep r20=-1,r20,IA64_PSR_DFH_BIT,1
+        */
+        ;;
+        mov cr.ipsr=r20
+        tbit.nz p6,p0=r23,0             // mode-flags bit 0 already set
+        ;;
+        tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT    // or DT not being cleared
+        (p6) br.dptk kvm_resume_to_guest
+        ;;
+        add r26=VMM_VCPU_META_RR0_OFFSET,r21
+        add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
+        dep r23=-1,r23,0,1              // set mode-flags bit 0
+        ;;
+        ld8 r26=[r26]
+        ld8 r27=[r27]
+        st4 [r22]=r23
+        dep.z r28=4,61,3                // 4 << 61: selects rr4
+        ;;
+        mov rr[r0]=r26
+        ;;
+        mov rr[r28]=r27
+        ;;
+        srlz.d
+        br.many kvm_resume_to_guest
+END(kvm_asm_rsm)
+/*
+ * ssm
+ *
+ * Rebuilds the 24-bit immediate from the opcode in r25 and ORs it into the
+ * vpsr stored in the VPD and into cr.ipsr.  If bit 0 of the vcpu mode flags
+ * is set and the new vpsr has DT, RT and IT all set, the flag is cleared and
+ * rr0/rr4 are reloaded from the two quadwords at
+ * VMM_VCPU_META_SAVED_RR0_OFFSET.  The tail compares vhpi against the value
+ * derived from vtpr and branches to kvm_asm_dispatch_vexirq or
+ * kvm_resume_to_guest.  The incoming b0 is kept in r24.
+ */
+GLOBAL_ENTRY(kvm_asm_ssm)
+#ifndef ACCE_SSM
+        br.many kvm_virtualization_fault_back
+#endif
+        add r16=VMM_VPD_BASE_OFFSET,r21
+        extr.u r26=r25,6,21             // immediate bits 0..20
+        extr.u r27=r25,31,2             // immediate bits 21..22
+        ;;
+        ld8 r16=[r16]                   // VPD address
+        extr.u r28=r25,36,1             // immediate bit 23
+        dep r26=r27,r26,21,2
+        ;;  //r26 is imm24
+        add r27=VPD_VPSR_START_OFFSET,r16
+        dep r26=r28,r26,23,1
+        ;;  //r29 = old vpsr; r19 below = new vpsr
+        ld8 r29=[r27]
+        mov r24=b0
+        ;;
+        add r22=VMM_VCPU_MODE_FLAGS_OFFSET,r21
+        mov r20=cr.ipsr
+        or r19=r29,r26                  // new vpsr
+        ;;
+        ld4 r23=[r22]                   // mode flags
+        st8 [r27]=r19
+        or r20=r20,r26
+        ;;
+        mov cr.ipsr=r20
+        movl r28=IA64_PSR_DT+IA64_PSR_RT+IA64_PSR_IT
+        ;;
+        and r19=r28,r19                 // r19 now holds only DT/RT/IT bits
+        tbit.z p6,p0=r23,0              // mode-flags bit 0 clear
+        ;;
+        cmp.ne.or p6,p0=r28,r19         // or not all of DT/RT/IT set
+        (p6) br.dptk kvm_asm_ssm_1
+        ;;
+        add r26=VMM_VCPU_META_SAVED_RR0_OFFSET,r21
+        add r27=VMM_VCPU_META_SAVED_RR0_OFFSET+8,r21
+        dep r23=0,r23,0,1               // clear mode-flags bit 0
+        ;;
+        ld8 r26=[r26]
+        ld8 r27=[r27]
+        st4 [r22]=r23
+        dep.z r28=4,61,3                // 4 << 61: selects rr4
+        ;;
+        mov rr[r0]=r26
+        ;;
+        mov rr[r28]=r27
+        ;;
+        srlz.d
+        ;;
+kvm_asm_ssm_1:
+        tbit.nz p6,p0=r29,IA64_PSR_I_BIT        // old vpsr.i already set
+        ;;
+        // NOTE(review): r19 was masked with DT|RT|IT above; unless
+        // IA64_PSR_I_BIT lies inside that mask this test always sets p6 and
+        // the vhpi check below is never reached -- confirm.
+        tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
+        (p6) br.dptk kvm_resume_to_guest
+        ;;
+        add r29=VPD_VTPR_START_OFFSET,r16
+        add r30=VPD_VHPI_START_OFFSET,r16
+        ;;
+        ld8 r29=[r29]                   // vtpr
+        ld8 r30=[r30]                   // vhpi
+        ;;
+        extr.u r17=r29,4,4              // vtpr bits 4..7
+        extr.u r18=r29,16,1             // vtpr bit 16
+        ;;
+        dep r17=r18,r17,4,1             // (bit 16 << 4) | bits 4..7
+        ;;
+        cmp.gt p6,p0=r30,r17
+        (p6) br.dpnt.few kvm_asm_dispatch_vexirq
+        br.many kvm_resume_to_guest
+END(kvm_asm_ssm)
+/*
+ * mov psr.l=r2
+ *
+ * Reads guest r2 through asm_mov_from_reg and replaces the low 32 bits of the
+ * vpsr stored in the VPD with it.  When the DT/RT/IT bits change, rr0/rr4 are
+ * reloaded and bit 0 of the vcpu mode flags is updated (set on the
+ * virtual-to-physical path, cleared on the physical-to-virtual path).
+ * cr.ipsr gets the new low half with IC, I, DT, SI and RT forced on.  If
+ * vpsr.i went from 0 to 1, vhpi is compared against the value derived from
+ * vtpr to choose between kvm_asm_dispatch_vexirq and kvm_resume_to_guest.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_to_psr)
+#ifndef ACCE_MOV_TO_PSR
+        br.many kvm_virtualization_fault_back
+#endif
+        add r16=VMM_VPD_BASE_OFFSET,r21
+        extr.u r26=r25,13,7 //r2
+        ;;
+        ld8 r16=[r16]                   // VPD address
+        addl r20=@gprel(asm_mov_from_reg),gp
+        ;;
+        adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
+        shladd r26=r26,4,r20            // table entry that reads r2
+        mov r24=b0
+        ;;
+        add r27=VPD_VPSR_START_OFFSET,r16
+        mov b0=r26
+        br.many b0
+        ;;
+kvm_asm_mov_to_psr_back:                // r19 = value of guest r2
+        ld8 r17=[r27]                   // old vpsr
+        add r22=VMM_VCPU_MODE_FLAGS_OFFSET,r21
+        dep r19=0,r19,32,32             // keep only the low 32 bits
+        ;;
+        ld4 r23=[r22]                   // mode flags
+        dep r18=0,r17,0,32              // old vpsr, low 32 bits cleared
+        ;;
+        add r30=r18,r19                 // new vpsr
+        movl r28=IA64_PSR_DT+IA64_PSR_RT+IA64_PSR_IT
+        ;;
+        st8 [r27]=r30
+        and r27=r28,r30                 // new DT/RT/IT
+        and r29=r28,r17                 // old DT/RT/IT
+        ;;
+        cmp.eq p5,p0=r29,r27            // p5: DT/RT/IT unchanged
+        cmp.eq p6,p7=r28,r27            // p6: all three set, p7: otherwise
+        (p5) br.many kvm_asm_mov_to_psr_1
+        ;;
+        //virtual to physical
+        (p7) add r26=VMM_VCPU_META_RR0_OFFSET,r21
+        (p7) add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
+        (p7) dep r23=-1,r23,0,1
+        ;;
+        //physical to virtual
+        (p6) add r26=VMM_VCPU_META_SAVED_RR0_OFFSET,r21
+        (p6) add r27=VMM_VCPU_META_SAVED_RR0_OFFSET+8,r21
+        (p6) dep r23=0,r23,0,1
+        ;;
+        ld8 r26=[r26]
+        ld8 r27=[r27]
+        st4 [r22]=r23
+        dep.z r28=4,61,3                // 4 << 61: selects rr4
+        ;;
+        mov rr[r0]=r26
+        ;;
+        mov rr[r28]=r27
+        ;;
+        srlz.d
+        ;;
+kvm_asm_mov_to_psr_1:
+        mov r20=cr.ipsr
+        movl r28=IA64_PSR_IC+IA64_PSR_I+IA64_PSR_DT+IA64_PSR_SI+IA64_PSR_RT
+        ;;
+        or r19=r19,r28                  // guest low half + forced bits
+        dep r20=0,r20,0,32              // ipsr with low 32 bits cleared
+        ;;
+        add r20=r19,r20
+        mov b0=r24
+        ;;
+        /* Comment it out due to short of fp lazy algorithm support
+        adds r27=IA64_VCPU_FP_PSR_OFFSET,r21
+        ;;
+        ld8 r27=[r27]
+        ;;
+        tbit.nz p8,p0=r27,IA64_PSR_DFH_BIT
+        ;;
+        (p8) dep r20=-1,r20,IA64_PSR_DFH_BIT,1
+        ;;
+        */
+        mov cr.ipsr=r20
+        cmp.ne p6,p0=r0,r0              // p6 = false
+        ;;
+        tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT     // old vpsr.i was set
+        tbit.z.or p6,p0=r30,IA64_PSR_I_BIT      // or new vpsr.i is clear
+        (p6) br.dpnt.few kvm_resume_to_guest
+        ;;
+        add r29=VPD_VTPR_START_OFFSET,r16
+        add r30=VPD_VHPI_START_OFFSET,r16
+        ;;
+        ld8 r29=[r29]                   // vtpr
+        ld8 r30=[r30]                   // vhpi
+        ;;
+        extr.u r17=r29,4,4              // vtpr bits 4..7
+        extr.u r18=r29,16,1             // vtpr bit 16
+        ;;
+        dep r17=r18,r17,4,1             // (bit 16 << 4) | bits 4..7
+        ;;
+        cmp.gt p6,p0=r30,r17
+        (p6) br.dpnt.few kvm_asm_dispatch_vexirq
+        br.many kvm_resume_to_guest
+END(kvm_asm_mov_to_psr)
+/*
+ * Advances the guest instruction pointer by one slot (ipsr.ri + 1, or ri = 0
+ * and iip + 0x10 when bit 1 of ri is set), sets r30 to 1 and branches to
+ * kvm_dispatch_vexirq.
+ */
+ENTRY(kvm_asm_dispatch_vexirq)
+//increment iip
+        mov r16=cr.ipsr
+        ;;
+        extr.u r17=r16,IA64_PSR_RI_BIT,2        // current slot number
+        tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1     // p6: bit 1 of ri set
+        ;;
+        (p6) mov r18=cr.iip
+        (p6) mov r17=r0                 // wrap to slot 0
+        (p7) add r17=1,r17              // next slot in the same bundle
+        ;;
+        (p6) add r18=0x10,r18           // next bundle
+        dep r16=r17,r16,IA64_PSR_RI_BIT,2
+        ;;
+        (p6) mov cr.iip=r18
+        mov cr.ipsr=r16
+        mov r30 =1
+        br.many kvm_dispatch_vexirq
+END(kvm_asm_dispatch_vexirq)
+/*
+ * thash
+ *
+ * Following the step comments in the body, computes
+ *   pval = (vaddr & VRN_MASK)
+ *        | (((pta << 3) >> (pta.size + 3)) << pta.size)
+ *        | (((vaddr >> rr.ps) << 3) & ((1UL << pta.size) - 1))
+ * where vaddr is the guest value of the r3 operand, pta is loaded from the
+ * VPD and rr is vcpu->arch.vrr[vaddr >> 61].  The result is written to the
+ * r1 operand through asm_mov_to_reg, which continues at kvm_resume_to_guest.
+ * The incoming b0 is kept in r24.
+ *
+ * TODO: add support when pta.vf = 1
+ */
+GLOBAL_ENTRY(kvm_asm_thash)
+#ifndef ACCE_THASH
+        br.many kvm_virtualization_fault_back
+#endif
+        extr.u r17=r25,20,7             // get r3 from opcode in r25
+        extr.u r18=r25,6,7              // get r1 from opcode in r25
+        addl r20=@gprel(asm_mov_from_reg),gp
+        ;;
+        adds r30=kvm_asm_thash_back1-asm_mov_from_reg,r20
+        shladd r17=r17,4,r20    // get addr of MOVE_FROM_REG(r17)
+        adds r16=VMM_VPD_BASE_OFFSET,r21        // get vcpu.arch.priveregs
+        ;;
+        mov r24=b0
+        ;;
+        ld8 r16=[r16]           // get VPD addr
+        mov b0=r17
+        br.many b0                      // r19 return value
+        ;;
+kvm_asm_thash_back1:
+        shr.u r23=r19,61                // get RR number
+        // r25 (the opcode) is reused as scratch from here on
+        adds r25=VMM_VCPU_VRR0_OFFSET,r21       // get vcpu->arch.vrr[0]'s addr
+        adds r16=VMM_VPD_VPTA_OFFSET,r16        // get vpta
+        ;;
+        shladd r27=r23,3,r25    // get vcpu->arch.vrr[r23]'s addr
+        ld8 r17=[r16]           // get PTA
+        mov r26=1
+        ;;
+        extr.u r29=r17,2,6              // get pta.size
+        ld8 r25=[r27]           // get vcpu->arch.vrr[r23]'s value
+        ;;
+        extr.u r25=r25,2,6              // get rr.ps
+        shl r22=r26,r29         // 1UL << pta.size
+        ;;
+        shr.u r23=r19,r25               // vaddr >> rr.ps
+        adds r26=3,r29          // pta.size + 3
+        shl r27=r17,3           // pta << 3
+        ;;
+        shl r23=r23,3           // (vaddr >> rr.ps) << 3
+        shr.u r27=r27,r26               // (pta << 3) >> (pta.size+3)
+        movl r16=7<<61
+        ;;
+        adds r22=-1,r22         // (1UL << pta.size) - 1
+        shl r27=r27,r29         // ((pta<<3)>>(pta.size+3))<<pta.size
+        and r19=r19,r16         // vaddr & VRN_MASK
+        ;;
+        and r22=r22,r23         // vhpt_offset
+        or r19=r19,r27 // (vadr&VRN_MASK)|(((pta<<3)>>(pta.size + 3))<<pta.size)
+        adds r26=asm_mov_to_reg-asm_mov_from_reg,r20
+        ;;
+        or r19=r19,r22          // calc pval
+        shladd r17=r18,4,r26    // table entry that writes r1
+        adds r30=kvm_resume_to_guest-asm_mov_from_reg,r20
+        ;;
+        mov b0=r17
+        br.many b0
+END(kvm_asm_thash)
+/*
+ * Building blocks for the asm_mov_to_reg / asm_mov_from_reg jump tables.
+ * Callers index the tables with shladd idx,4,base, so every table entry has
+ * to occupy exactly one explicitly braced bundle (16 bytes).
+ */
+/* Table entry for register 0: three nops and no branch. */
+#define MOV_TO_REG0     \
+{;                      \
+        nop.b 0x0;              \
+        nop.b 0x0;              \
+        nop.b 0x0;              \
+        ;;                      \
+};
+/* Table entry: copy r19 into r<n>, then branch to the address in r30. */
+#define MOV_TO_REG(n)   \
+{;                      \
+        mov r##n##=r19; \
+        mov b0=r30;     \
+        br.sptk.many b0;        \
+        ;;                      \
+};
+/* Table entry: copy r<n> into r19, then branch to the address in r30. */
+#define MOV_FROM_REG(n) \
+{;                              \
+        mov r19=r##n##;         \
+        mov b0=r30;             \
+        br.sptk.many b0;                \
+        ;;                              \
+};
+/*
+ * Out-of-line routine asm_mov_to_bank0_reg<n>: parks r2 in r26, carries r19
+ * across the bank switch in r2, writes r<n> between bsw.1 and bsw.0, restores
+ * r2 and branches to the address in r30.
+ */
+#define MOV_TO_BANK0_REG(n)                     \
+ENTRY_MIN_ALIGN(asm_mov_to_bank0_reg##n##);     \
+{;                                              \
+        mov r26=r2;                             \
+        mov r2=r19;                             \
+        bsw.1;                                  \
+        ;;                                              \
+};                                              \
+{;                                              \
+        mov r##n##=r2;                          \
+        nop.b 0x0;                                      \
+        bsw.0;                                  \
+        ;;                                              \
+};                                              \
+{;                                              \
+        mov r2=r26;                             \
+        mov b0=r30;                             \
+        br.sptk.many b0;                                \
+        ;;                                              \
+};                                              \
+END(asm_mov_to_bank0_reg##n##)
+/*
+ * Out-of-line routine asm_mov_from_bank0_reg<n>: parks r2 in r26, reads r<n>
+ * into r2 between bsw.1 and bsw.0, moves it to r19, restores r2 and branches
+ * to the address in r30.
+ */
+#define MOV_FROM_BANK0_REG(n)                   \
+ENTRY_MIN_ALIGN(asm_mov_from_bank0_reg##n##);   \
+{;                                              \
+        mov r26=r2;                             \
+        nop.b 0x0;                                      \
+        bsw.1;                                  \
+        ;;                                              \
+};                                              \
+{;                                              \
+        mov r2=r##n##;                          \
+        nop.b 0x0;                                      \
+        bsw.0;                                  \
+        ;;                                              \
+};                                              \
+{;                                              \
+        mov r19=r2;                             \
+        mov r2=r26;                             \
+        mov b0=r30;                             \
+};                                              \
+{;                                              \
+        nop.b 0x0;                                      \
+        nop.b 0x0;                                      \
+        br.sptk.many b0;                                \
+        ;;                                              \
+};                                              \
+END(asm_mov_from_bank0_reg##n##)
+/* Table entry: one bundle that branches to asm_mov_to_bank0_reg<n>. */
+#define JMP_TO_MOV_TO_BANK0_REG(n)              \
+{;                                              \
+        nop.b 0x0;                                      \
+        nop.b 0x0;                                      \
+        br.sptk.many asm_mov_to_bank0_reg##n##; \
+        ;;                                              \
+}
+/* Table entry: one bundle that branches to asm_mov_from_bank0_reg<n>. */
+#define JMP_TO_MOV_FROM_BANK0_REG(n)            \
+{;                                              \
+        nop.b 0x0;                                      \
+        nop.b 0x0;                                      \
+        br.sptk.many asm_mov_from_bank0_reg##n##;       \
+        ;;                                              \
+}
+/*
+ * Emit asm_mov_from_bank0_reg16 .. asm_mov_from_bank0_reg31, the targets of
+ * the JMP_TO_MOV_FROM_BANK0_REG entries in asm_mov_from_reg.
+ */
+MOV_FROM_BANK0_REG(16)
+MOV_FROM_BANK0_REG(17)
+MOV_FROM_BANK0_REG(18)
+MOV_FROM_BANK0_REG(19)
+MOV_FROM_BANK0_REG(20)
+MOV_FROM_BANK0_REG(21)
+MOV_FROM_BANK0_REG(22)
+MOV_FROM_BANK0_REG(23)
+MOV_FROM_BANK0_REG(24)
+MOV_FROM_BANK0_REG(25)
+MOV_FROM_BANK0_REG(26)
+MOV_FROM_BANK0_REG(27)
+MOV_FROM_BANK0_REG(28)
+MOV_FROM_BANK0_REG(29)
+MOV_FROM_BANK0_REG(30)
+MOV_FROM_BANK0_REG(31)
+/*
+ * mov from reg table
+ *
+ * One bundle per general register number; the handlers in this file index it
+ * with shladd idx,4,base (16 bytes per entry), so the order of the entries
+ * must match the register numbers.  Entry n returns the value of r<n> in r19
+ * and branches to the address in r30.  Entries 16-31 instead branch to the
+ * asm_mov_from_bank0_reg<n> routines.
+ */
+ENTRY(asm_mov_from_reg)
+        MOV_FROM_REG(0)
+        MOV_FROM_REG(1)
+        MOV_FROM_REG(2)
+        MOV_FROM_REG(3)
+        MOV_FROM_REG(4)
+        MOV_FROM_REG(5)
+        MOV_FROM_REG(6)
+        MOV_FROM_REG(7)
+        MOV_FROM_REG(8)
+        MOV_FROM_REG(9)
+        MOV_FROM_REG(10)
+        MOV_FROM_REG(11)
+        MOV_FROM_REG(12)
+        MOV_FROM_REG(13)
+        MOV_FROM_REG(14)
+        MOV_FROM_REG(15)
+        JMP_TO_MOV_FROM_BANK0_REG(16)
+        JMP_TO_MOV_FROM_BANK0_REG(17)
+        JMP_TO_MOV_FROM_BANK0_REG(18)
+        JMP_TO_MOV_FROM_BANK0_REG(19)
+        JMP_TO_MOV_FROM_BANK0_REG(20)
+        JMP_TO_MOV_FROM_BANK0_REG(21)
+        JMP_TO_MOV_FROM_BANK0_REG(22)
+        JMP_TO_MOV_FROM_BANK0_REG(23)
+        JMP_TO_MOV_FROM_BANK0_REG(24)
+        JMP_TO_MOV_FROM_BANK0_REG(25)
+        JMP_TO_MOV_FROM_BANK0_REG(26)
+        JMP_TO_MOV_FROM_BANK0_REG(27)
+        JMP_TO_MOV_FROM_BANK0_REG(28)
+        JMP_TO_MOV_FROM_BANK0_REG(29)
+        JMP_TO_MOV_FROM_BANK0_REG(30)
+        JMP_TO_MOV_FROM_BANK0_REG(31)
+        MOV_FROM_REG(32)
+        MOV_FROM_REG(33)
+        MOV_FROM_REG(34)
+        MOV_FROM_REG(35)
+        MOV_FROM_REG(36)
+        MOV_FROM_REG(37)
+        MOV_FROM_REG(38)
+        MOV_FROM_REG(39)
+        MOV_FROM_REG(40)
+        MOV_FROM_REG(41)
+        MOV_FROM_REG(42)
+        MOV_FROM_REG(43)
+        MOV_FROM_REG(44)
+        MOV_FROM_REG(45)
+        MOV_FROM_REG(46)
+        MOV_FROM_REG(47)
+        MOV_FROM_REG(48)
+        MOV_FROM_REG(49)
+        MOV_FROM_REG(50)
+        MOV_FROM_REG(51)
+        MOV_FROM_REG(52)
+        MOV_FROM_REG(53)
+        MOV_FROM_REG(54)
+        MOV_FROM_REG(55)
+        MOV_FROM_REG(56)
+        MOV_FROM_REG(57)
+        MOV_FROM_REG(58)
+        MOV_FROM_REG(59)
+        MOV_FROM_REG(60)
+        MOV_FROM_REG(61)
+        MOV_FROM_REG(62)
+        MOV_FROM_REG(63)
+        MOV_FROM_REG(64)
+        MOV_FROM_REG(65)
+        MOV_FROM_REG(66)
+        MOV_FROM_REG(67)
+        MOV_FROM_REG(68)
+        MOV_FROM_REG(69)
+        MOV_FROM_REG(70)
+        MOV_FROM_REG(71)
+        MOV_FROM_REG(72)
+        MOV_FROM_REG(73)
+        MOV_FROM_REG(74)
+        MOV_FROM_REG(75)
+        MOV_FROM_REG(76)
+        MOV_FROM_REG(77)
+        MOV_FROM_REG(78)
+        MOV_FROM_REG(79)
+        MOV_FROM_REG(80)
+        MOV_FROM_REG(81)
+        MOV_FROM_REG(82)
+        MOV_FROM_REG(83)
+        MOV_FROM_REG(84)
+        MOV_FROM_REG(85)
+        MOV_FROM_REG(86)
+        MOV_FROM_REG(87)
+        MOV_FROM_REG(88)
+        MOV_FROM_REG(89)
+        MOV_FROM_REG(90)
+        MOV_FROM_REG(91)
+        MOV_FROM_REG(92)
+        MOV_FROM_REG(93)
+        MOV_FROM_REG(94)
+        MOV_FROM_REG(95)
+        MOV_FROM_REG(96)
+        MOV_FROM_REG(97)
+        MOV_FROM_REG(98)
+        MOV_FROM_REG(99)
+        MOV_FROM_REG(100)
+        MOV_FROM_REG(101)
+        MOV_FROM_REG(102)
+        MOV_FROM_REG(103)
+        MOV_FROM_REG(104)
+        MOV_FROM_REG(105)
+        MOV_FROM_REG(106)
+        MOV_FROM_REG(107)
+        MOV_FROM_REG(108)
+        MOV_FROM_REG(109)
+        MOV_FROM_REG(110)
+        MOV_FROM_REG(111)
+        MOV_FROM_REG(112)
+        MOV_FROM_REG(113)
+        MOV_FROM_REG(114)
+        MOV_FROM_REG(115)
+        MOV_FROM_REG(116)
+        MOV_FROM_REG(117)
+        MOV_FROM_REG(118)
+        MOV_FROM_REG(119)
+        MOV_FROM_REG(120)
+        MOV_FROM_REG(121)
+        MOV_FROM_REG(122)
+        MOV_FROM_REG(123)
+        MOV_FROM_REG(124)
+        MOV_FROM_REG(125)
+        MOV_FROM_REG(126)
+        MOV_FROM_REG(127)
+END(asm_mov_from_reg)
+/*
+ * Common exit of the accelerated handlers: reloads r1 from the vcpu slot at
+ * VMM_VCPU_SAVED_GP_OFFSET, advances the guest instruction pointer by one
+ * slot, restores the predicates from r31 and branches into the service at
+ * vsa_base + PAL_VPS_RESUME_NORMAL when vpsr.ic is set, or at
+ * vsa_base + PAL_VPS_RESUME_HANDLER (with r26 loaded from the first quadword
+ * of the VPD) when it is clear.
+ *
+ * must be in bank 0
+ * parameter:
+ * r31: pr
+ * r24: b0
+ */
+ENTRY(kvm_resume_to_guest)
+        adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+        ;;
+        ld8 r1 =[r16]
+        adds r20 = VMM_VCPU_VSA_BASE_OFFSET,r21
+        ;;
+        mov r16=cr.ipsr
+        ;;
+        ld8 r20 = [r20]                 // vsa base
+        adds r19=VMM_VPD_BASE_OFFSET,r21
+        ;;
+        ld8 r25=[r19]                   // VPD address
+        extr.u r17=r16,IA64_PSR_RI_BIT,2        // current slot number
+        tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1     // p6: bit 1 of ri set
+        ;;
+        (p6) mov r18=cr.iip
+        (p6) mov r17=r0                 // wrap to slot 0
+        ;;
+        (p6) add r18=0x10,r18           // next bundle
+        (p7) add r17=1,r17              // next slot in the same bundle
+        ;;
+        (p6) mov cr.iip=r18
+        dep r16=r17,r16,IA64_PSR_RI_BIT,2
+        ;;
+        mov cr.ipsr=r16
+        adds r19= VPD_VPSR_START_OFFSET,r25
+        add r28=PAL_VPS_RESUME_NORMAL,r20
+        add r29=PAL_VPS_RESUME_HANDLER,r20
+        ;;
+        ld8 r19=[r19]                   // vpsr
+        mov b0=r29                      // default target: resume handler
+        cmp.ne p6,p7 = r0,r0
+        ;;
+        tbit.z p6,p7 = r19,IA64_PSR_IC_BIT      // p6: vpsr.ic == 0, p7: vpsr.ic == 1
+        ;;
+        (p6) ld8 r26=[r25]
+        (p7) mov b0=r28
+        mov pr=r31,-2
+        br.sptk.many b0             // call pal service
+        ;;
+END(kvm_resume_to_guest)
+/*
+ * Emit asm_mov_to_bank0_reg16 .. asm_mov_to_bank0_reg31, the targets of the
+ * JMP_TO_MOV_TO_BANK0_REG entries in asm_mov_to_reg.
+ */
+MOV_TO_BANK0_REG(16)
+MOV_TO_BANK0_REG(17)
+MOV_TO_BANK0_REG(18)
+MOV_TO_BANK0_REG(19)
+MOV_TO_BANK0_REG(20)
+MOV_TO_BANK0_REG(21)
+MOV_TO_BANK0_REG(22)
+MOV_TO_BANK0_REG(23)
+MOV_TO_BANK0_REG(24)
+MOV_TO_BANK0_REG(25)
+MOV_TO_BANK0_REG(26)
+MOV_TO_BANK0_REG(27)
+MOV_TO_BANK0_REG(28)
+MOV_TO_BANK0_REG(29)
+MOV_TO_BANK0_REG(30)
+MOV_TO_BANK0_REG(31)
+/*
+ * mov to reg table
+ *
+ * One bundle per general register number; the handlers in this file index it
+ * with shladd idx,4,base (16 bytes per entry), so the order of the entries
+ * must match the register numbers.  Entry n copies r19 into r<n> and branches
+ * to the address in r30.  Entries 16-31 instead branch to the
+ * asm_mov_to_bank0_reg<n> routines.
+ *
+ * NOTE(review): entry 0 (MOV_TO_REG0) holds only nops and no branch, so it
+ * looks like execution would fall through into entry 1 -- presumably a
+ * target of r0 never gets here; confirm.
+ */
+ENTRY(asm_mov_to_reg)
+        MOV_TO_REG0
+        MOV_TO_REG(1)
+        MOV_TO_REG(2)
+        MOV_TO_REG(3)
+        MOV_TO_REG(4)
+        MOV_TO_REG(5)
+        MOV_TO_REG(6)
+        MOV_TO_REG(7)
+        MOV_TO_REG(8)
+        MOV_TO_REG(9)
+        MOV_TO_REG(10)
+        MOV_TO_REG(11)
+        MOV_TO_REG(12)
+        MOV_TO_REG(13)
+        MOV_TO_REG(14)
+        MOV_TO_REG(15)
+        JMP_TO_MOV_TO_BANK0_REG(16)
+        JMP_TO_MOV_TO_BANK0_REG(17)
+        JMP_TO_MOV_TO_BANK0_REG(18)
+        JMP_TO_MOV_TO_BANK0_REG(19)
+        JMP_TO_MOV_TO_BANK0_REG(20)
+        JMP_TO_MOV_TO_BANK0_REG(21)
+        JMP_TO_MOV_TO_BANK0_REG(22)
+        JMP_TO_MOV_TO_BANK0_REG(23)
+        JMP_TO_MOV_TO_BANK0_REG(24)
+        JMP_TO_MOV_TO_BANK0_REG(25)
+        JMP_TO_MOV_TO_BANK0_REG(26)
+        JMP_TO_MOV_TO_BANK0_REG(27)
+        JMP_TO_MOV_TO_BANK0_REG(28)
+        JMP_TO_MOV_TO_BANK0_REG(29)
+        JMP_TO_MOV_TO_BANK0_REG(30)
+        JMP_TO_MOV_TO_BANK0_REG(31)
+        MOV_TO_REG(32)
+        MOV_TO_REG(33)
+        MOV_TO_REG(34)
+        MOV_TO_REG(35)
+        MOV_TO_REG(36)
+        MOV_TO_REG(37)
+        MOV_TO_REG(38)
+        MOV_TO_REG(39)
+        MOV_TO_REG(40)
+        MOV_TO_REG(41)
+        MOV_TO_REG(42)
+        MOV_TO_REG(43)
+        MOV_TO_REG(44)
+        MOV_TO_REG(45)
+        MOV_TO_REG(46)
+        MOV_TO_REG(47)
+        MOV_TO_REG(48)
+        MOV_TO_REG(49)
+        MOV_TO_REG(50)
+        MOV_TO_REG(51)
+        MOV_TO_REG(52)
+        MOV_TO_REG(53)
+        MOV_TO_REG(54)
+        MOV_TO_REG(55)
+        MOV_TO_REG(56)
+        MOV_TO_REG(57)
+        MOV_TO_REG(58)
+        MOV_TO_REG(59)
+        MOV_TO_REG(60)
+        MOV_TO_REG(61)
+        MOV_TO_REG(62)
+        MOV_TO_REG(63)
+        MOV_TO_REG(64)
+        MOV_TO_REG(65)
+        MOV_TO_REG(66)
+        MOV_TO_REG(67)
+        MOV_TO_REG(68)
+        MOV_TO_REG(69)
+        MOV_TO_REG(70)
+        MOV_TO_REG(71)
+        MOV_TO_REG(72)
+        MOV_TO_REG(73)
+        MOV_TO_REG(74)
+        MOV_TO_REG(75)
+        MOV_TO_REG(76)
+        MOV_TO_REG(77)
+        MOV_TO_REG(78)
+        MOV_TO_REG(79)
+        MOV_TO_REG(80)
+        MOV_TO_REG(81)
+        MOV_TO_REG(82)
+        MOV_TO_REG(83)
+        MOV_TO_REG(84)
+        MOV_TO_REG(85)
+        MOV_TO_REG(86)
+        MOV_TO_REG(87)
+        MOV_TO_REG(88)
+        MOV_TO_REG(89)
+        MOV_TO_REG(90)
+        MOV_TO_REG(91)
+        MOV_TO_REG(92)
+        MOV_TO_REG(93)
+        MOV_TO_REG(94)
+        MOV_TO_REG(95)
+        MOV_TO_REG(96)
+        MOV_TO_REG(97)
+        MOV_TO_REG(98)
+        MOV_TO_REG(99)
+        MOV_TO_REG(100)
+        MOV_TO_REG(101)
+        MOV_TO_REG(102)
+        MOV_TO_REG(103)
+        MOV_TO_REG(104)
+        MOV_TO_REG(105)
+        MOV_TO_REG(106)
+        MOV_TO_REG(107)
+        MOV_TO_REG(108)
+        MOV_TO_REG(109)
+        MOV_TO_REG(110)
+        MOV_TO_REG(111)
+        MOV_TO_REG(112)
+        MOV_TO_REG(113)
+        MOV_TO_REG(114)
+        MOV_TO_REG(115)
+        MOV_TO_REG(116)
+        MOV_TO_REG(117)
+        MOV_TO_REG(118)
+        MOV_TO_REG(119)
+        MOV_TO_REG(120)
+        MOV_TO_REG(121)
+        MOV_TO_REG(122)
+        MOV_TO_REG(123)
+        MOV_TO_REG(124)
+        MOV_TO_REG(125)
+        MOV_TO_REG(126)
+        MOV_TO_REG(127)
+END(asm_mov_to_reg)
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
new file mode 100644
index 00000000000..5a33f7ed29a
--- /dev/null
+++ b/arch/ia64/kvm/process.c
@@ -0,0 +1,970 @@
+/*
+ * process.c: handle interruption inject for guests.
+ * Copyright (c) 2005, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *      Shaofan Li (Susue Li) <susie.li@intel.com>
+ *      Xiaoyan Feng (Fleming Feng)  <fleming.feng@intel.com>
+ *      Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
+ *      Xiantao Zhang (xiantao.zhang@intel.com)
+ */
+#include "vcpu.h"
+#include <asm/pal.h>
+#include <asm/sal.h>
+#include <asm/fpswa.h>
+#include <asm/kregs.h>
+#include <asm/tlb.h>
+/*
+ * Host FPSWA (floating-point software assist) interface used by
+ * vmm_fp_emulate(); emulation is refused while this pointer is NULL.
+ */
+fpswa_interface_t *vmm_fpswa_interface;
+/*
+ * Interruption vector offsets.  inject_guest_interruption() adds one of
+ * these to the guest IVA to form the guest's new IIP.
+ */
+#define IA64_VHPT_TRANS_VECTOR                  0x0000
+#define IA64_INST_TLB_VECTOR                    0x0400
+#define IA64_DATA_TLB_VECTOR                    0x0800
+#define IA64_ALT_INST_TLB_VECTOR                0x0c00
+#define IA64_ALT_DATA_TLB_VECTOR                0x1000
+#define IA64_DATA_NESTED_TLB_VECTOR             0x1400
+#define IA64_INST_KEY_MISS_VECTOR               0x1800
+#define IA64_DATA_KEY_MISS_VECTOR               0x1c00
+#define IA64_DIRTY_BIT_VECTOR                   0x2000
+#define IA64_INST_ACCESS_BIT_VECTOR             0x2400
+#define IA64_DATA_ACCESS_BIT_VECTOR             0x2800
+#define IA64_BREAK_VECTOR                       0x2c00
+#define IA64_EXTINT_VECTOR                      0x3000
+#define IA64_PAGE_NOT_PRESENT_VECTOR            0x5000
+#define IA64_KEY_PERMISSION_VECTOR              0x5100
+#define IA64_INST_ACCESS_RIGHTS_VECTOR          0x5200
+#define IA64_DATA_ACCESS_RIGHTS_VECTOR          0x5300
+#define IA64_GENEX_VECTOR                       0x5400
+#define IA64_DISABLED_FPREG_VECTOR              0x5500
+#define IA64_NAT_CONSUMPTION_VECTOR             0x5600
+#define IA64_SPECULATION_VECTOR         0x5700 /* UNUSED */
+#define IA64_DEBUG_VECTOR                       0x5900
+#define IA64_UNALIGNED_REF_VECTOR               0x5a00
+#define IA64_UNSUPPORTED_DATA_REF_VECTOR        0x5b00
+#define IA64_FP_FAULT_VECTOR                    0x5c00
+#define IA64_FP_TRAP_VECTOR                     0x5d00
+#define IA64_LOWERPRIV_TRANSFER_TRAP_VECTOR     0x5e00
+#define IA64_TAKEN_BRANCH_TRAP_VECTOR           0x5f00
+#define IA64_SINGLE_STEP_TRAP_VECTOR            0x6000
+/*
+ * SDM vol2 5.5 - IVA based interruption handling.
+ * Mask of the guest PSR bits that collect_interruption() keeps when an
+ * interruption is delivered; all other bits are cleared before psr.be and
+ * psr.pp are reloaded from the guest DCR.
+ */
+#define INITIAL_PSR_VALUE_AT_INTERRUPTION (IA64_PSR_UP | IA64_PSR_MFL |\
+                        IA64_PSR_MFH | IA64_PSR_PK | IA64_PSR_DT |      \
+                        IA64_PSR_RT | IA64_PSR_MC|IA64_PSR_IT)
+/*
+ * Break immediates (iim) that kvm_ia64_handle_break() treats as PAL and
+ * SAL hypercalls when the guest executes the break at cpl 0.
+ */
+#define DOMN_PAL_REQUEST    0x110000
+#define DOMN_SAL_REQUEST    0x110001
+/*
+ * Translation table from interruption vector index to IVT offset.
+ * Indices 0-19 are spaced 0x400 apart (0x0000-0x4c00); indices 20-67 are
+ * spaced 0x100 apart (0x5000-0x7f00).  The table is only ever read, so it
+ * is declared const.
+ */
+static const u64 vec2off[68] = {0x0, 0x400, 0x800, 0xc00, 0x1000, 0x1400, 0x1800,
+        0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00,
+        0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5100, 0x5200, 0x5300, 0x5400,
+        0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00, 0x5c00, 0x5d00,
+        0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500, 0x6600,
+        0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
+        0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800,
+        0x7900, 0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00
+};
+/*
+ * Capture the interrupted guest state into the guest's interruption
+ * registers and load the guest PSR with its value at interruption delivery.
+ *
+ * When the guest had psr.ic set, IPSR/IIP/IFS/IIPA are written; otherwise
+ * they are left untouched.  In both cases the guest is switched to register
+ * bank 0 and its PSR is reduced to the bits in
+ * INITIAL_PSR_VALUE_AT_INTERRUPTION plus be/pp taken from the guest DCR.
+ */
+static void collect_interruption(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        unsigned long psr = vcpu_get_psr(vcpu);
+        u64 dcr;
+        vcpu_bsw0(vcpu);
+        if (psr & IA64_PSR_IC) {
+                /*
+                 * The machine id/da/dd/ss/ed bits are folded into the
+                 * guest IPSR so that they are still in effect after the
+                 * guest executes rfi.
+                 */
+                const u64 carried = IA64_PSR_ID | IA64_PSR_DA | IA64_PSR_DD
+                                        | IA64_PSR_SS | IA64_PSR_ED;
+                psr |= regs->cr_ipsr & carried;
+                vcpu_set_ipsr(vcpu, psr);
+                /*
+                 * IIP is not advanced here, not even for traps: the
+                 * caller is expected to have set it up already.
+                 */
+                vcpu_set_iip(vcpu, regs->cr_iip);
+                /* the guest sees IFS with its valid bit cleared */
+                vcpu_set_ifs(vcpu, VCPU(vcpu, ifs) & ~IA64_IFS_V);
+                vcpu_set_iipa(vcpu, VMX(vcpu, cr_iipa));
+        }
+        dcr = VCPU(vcpu, dcr);
+        /*
+         * New guest PSR: up/mfl/mfh/pk/dt/rt/mc/it are preserved,
+         * be follows dcr.be and pp follows dcr.pp.
+         */
+        psr &= INITIAL_PSR_VALUE_AT_INTERRUPTION;
+        psr |= dcr & IA64_DCR_BE;
+        /* dcr.pp and psr.pp live at different bit positions */
+        if (dcr & IA64_DCR_PP)
+                psr |= IA64_PSR_PP;
+        else
+                psr &= ~IA64_PSR_PP;
+        vcpu_set_psr(vcpu, psr);
+}
+/*
+ * Deliver interruption 'vec' (an IVT offset) to the guest: clear isr.ir in
+ * the saved ISR, collect the interrupted state, then redirect the guest to
+ * its handler at IVA + vec.
+ */
+void inject_guest_interruption(struct kvm_vcpu *vcpu, u64 vec)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        union ia64_isr isr;
+        /* isr.ir (incomplete register frame) is always reported as 0 */
+        isr.val = VMX(vcpu, cr_isr);
+        isr.ir = 0;
+        VMX(vcpu, cr_isr) = isr.val;
+        collect_interruption(vcpu);
+        regs->cr_iip = vcpu_get_iva(vcpu) + vec;
+}
+/*
+ * Build the ITIR value reported for a fault at 'ifa': only the ps and rid
+ * fields of the guest region register covering 'ifa' are copied, every
+ * other bit is zero.
+ */
+static u64 vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, u64 ifa)
+{
+        union ia64_rr region, itir;
+        region.val = vcpu_get_rr(vcpu, ifa);
+        itir.val = 0;
+        itir.rid = region.rid;
+        itir.ps = region.ps;
+        return itir.val;
+}
+/*
+ * Update the guest's vIFA, vITIR and vIHA for a fault at 'vadr'.
+ * Nothing is written unless vPSR.ic is 1 (SDM Vol2, Table 8-1).
+ * Parameters:
+ *  set_ifa:  non-zero to store vadr in vIFA
+ *  set_itir: non-zero to store the fault ITIR for vadr in vITIR
+ *  set_iha:  non-zero to store the VHPT hash address of vadr in vIHA
+ */
+void set_ifa_itir_iha(struct kvm_vcpu *vcpu, u64 vadr,
+                int set_ifa, int set_itir, int set_iha)
+{
+        long psr = VCPU(vcpu, vpsr);
+        if (!(psr & IA64_PSR_IC))
+                return;
+        if (set_ifa)
+                vcpu_set_ifa(vcpu, vadr);
+        if (set_itir)
+                vcpu_set_itir(vcpu, vcpu_get_itir_on_fault(vcpu, vadr));
+        if (set_iha)
+                vcpu_set_iha(vcpu, vcpu_thash(vcpu, vadr));
+}
+/*
+ * Data TLB Fault
+ *  @ Data TLB vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Records vadr in the guest IFA, ITIR and IHA (only when vPSR.ic is set)
+ * and injects the Data TLB vector into the guest.
+ */
+void dtlb_fault(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        /* If vPSR.ic, IFA, ITIR, IHA */
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 1);
+        inject_guest_interruption(vcpu, IA64_DATA_TLB_VECTOR);
+}
+/*
+ * Instruction TLB Fault
+ *  @ Instruction TLB vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Records vadr in the guest IFA, ITIR and IHA (only when vPSR.ic is set)
+ * and injects the Instruction TLB vector into the guest.
+ */
+void itlb_fault(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        /* If vPSR.ic, IFA, ITIR, IHA */
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 1);
+        inject_guest_interruption(vcpu, IA64_INST_TLB_VECTOR);
+}
+/*
+ * Data Nested TLB Fault
+ *  @ Data Nested TLB Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * No fault address registers are written here; the vector is injected
+ * directly.  kvm_page_fault() uses this path when vPSR.ic is clear.
+ */
+void nested_dtlb(struct kvm_vcpu *vcpu)
+{
+        inject_guest_interruption(vcpu, IA64_DATA_NESTED_TLB_VECTOR);
+}
+/*
+ * Alternate Data TLB Fault
+ *  @ Alternate Data TLB vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Records vadr in the guest IFA and ITIR (IHA is left alone) when vPSR.ic
+ * is set, then injects the Alternate Data TLB vector.
+ */
+void alt_dtlb(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 0);
+        inject_guest_interruption(vcpu, IA64_ALT_DATA_TLB_VECTOR);
+}
+/*
+ * Alternate Instruction TLB Fault
+ *  @ Alternate Instruction TLB vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Records vadr in the guest IFA and ITIR (IHA is left alone) when vPSR.ic
+ * is set, then injects the Alternate Instruction TLB vector.
+ */
+void alt_itlb(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 0);
+        inject_guest_interruption(vcpu, IA64_ALT_INST_TLB_VECTOR);
+}
+/* Deal with:
+ *  VHPT Translation Vector
+ *
+ * Shared implementation behind ivhpt_fault() and dvhpt_fault(): records
+ * vadr in the guest IFA, ITIR and IHA (only when vPSR.ic is set) and
+ * injects the VHPT Translation vector.
+ */
+static void _vhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        /* If vPSR.ic, IFA, ITIR, IHA*/
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 1);
+        inject_guest_interruption(vcpu, IA64_VHPT_TRANS_VECTOR);
+}
+/*
+ * VHPT Instruction Fault
+ *  @ VHPT Translation vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Instruction-side entry point; handled identically to dvhpt_fault().
+ */
+void ivhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        _vhpt_fault(vcpu, vadr);
+}
+/*
+ * VHPT Data Fault
+ *  @ VHPT Translation vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Data-side entry point; handled identically to ivhpt_fault().
+ */
+void dvhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        _vhpt_fault(vcpu, vadr);
+}
+/*
+ * Deal with:
+ *  General Exception vector
+ *
+ * Common injection path for every fault in this file that is delivered on
+ * the General Exception vector; no fault address registers are written.
+ */
+void _general_exception(struct kvm_vcpu *vcpu)
+{
+        inject_guest_interruption(vcpu, IA64_GENEX_VECTOR);
+}
+/*
+ * Illegal Operation Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void illegal_op(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/*
+ * Illegal Dependency Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void illegal_dep(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/*
+ * Reserved Register/Field Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void rsv_reg_field(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/*
+ * Privileged Operation Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void privilege_op(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/*
+ * Unimplemented Data Address Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void unimpl_daddr(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/*
+ * Privileged Register Fault
+ *  @ General Exception Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * Delivered to the guest through _general_exception().
+ */
+void privilege_reg(struct kvm_vcpu *vcpu)
+{
+        _general_exception(vcpu);
+}
+/* Deal with
+ *  Nat consumption vector
+ * Parameter:
+ *  vadr: faulting address; ignored unless t is DATA or INSTRUCTION
+ *  t:    kind of access that consumed the NaT
+ *
+ * For DATA and INSTRUCTION the address is recorded in the guest IFA (when
+ * vPSR.ic is set); ITIR and IHA are never written on this path.
+ */
+static void _nat_consumption_fault(struct kvm_vcpu *vcpu, u64 vadr,
+                                                enum tlb_miss_type t)
+{
+        int has_address = (t == DATA) || (t == INSTRUCTION);
+        if (has_address)
+                set_ifa_itir_iha(vcpu, vadr, 1, 0, 0);
+        inject_guest_interruption(vcpu, IA64_NAT_CONSUMPTION_VECTOR);
+}
+/*
+ * Instruction Nat Page Consumption Fault
+ *  @ Nat Consumption Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * vadr is reported to the guest through IFA (when vPSR.ic is set).
+ */
+void inat_page_consumption(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        _nat_consumption_fault(vcpu, vadr, INSTRUCTION);
+}
+/*
+ * Register Nat Consumption Fault
+ *  @ Nat Consumption Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * There is no faulting address for the REGISTER case, so 0 is passed and
+ * IFA is not written.
+ */
+void rnat_consumption(struct kvm_vcpu *vcpu)
+{
+        _nat_consumption_fault(vcpu, 0, REGISTER);
+}
+/*
+ * Data Nat Page Consumption Fault
+ *  @ Nat Consumption Vector
+ * Refer to SDM Vol2 Table 5-6 & 8-1
+ *
+ * vadr is reported to the guest through IFA (when vPSR.ic is set).
+ */
+void dnat_page_consumption(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        _nat_consumption_fault(vcpu, vadr, DATA);
+}
+/* Deal with
+ *  Page not present vector
+ *
+ * Shared by data_page_not_present() and inst_page_not_present(): records
+ * vadr in the guest IFA and ITIR (only when vPSR.ic is set) and injects
+ * the Page Not Present vector.
+ */
+static void __page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        /* If vPSR.ic, IFA, ITIR */
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 0);
+        inject_guest_interruption(vcpu, IA64_PAGE_NOT_PRESENT_VECTOR);
+}
+/* Page Not Present fault raised by a data access at vadr. */
+void data_page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        __page_not_present(vcpu, vadr);
+}
+/* Page Not Present fault raised by an instruction fetch at vadr. */
+void inst_page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        __page_not_present(vcpu, vadr);
+}
+/* Deal with
+ *  Data access rights vector
+ *
+ * Records vadr in the guest IFA and ITIR (only when vPSR.ic is set) and
+ * injects the Data Access Rights vector.  kvm_page_fault() uses this when
+ * an I/O page is touched with insufficient privilege.
+ */
+void data_access_rights(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        /* If vPSR.ic, IFA, ITIR */
+        set_ifa_itir_iha(vcpu, vadr, 1, 1, 0);
+        inject_guest_interruption(vcpu, IA64_DATA_ACCESS_RIGHTS_VECTOR);
+}
+/*
+ * Run the host FPSWA handler on behalf of the guest.
+ *
+ * fp_fault, bundle, ipsr, fpsr, isr, pr and ifs are forwarded unchanged to
+ * the FPSWA entry point; 'regs' is accepted but not used in this function.
+ * Returns the handler's result, or {-1, 0, 0, 0} when no FPSWA interface
+ * has been registered in vmm_fpswa_interface.
+ *
+ * Region register 7 is switched to the host value for the duration of the
+ * call and restored afterwards; the statement order around the call must
+ * be kept.
+ */
+fpswa_ret_t vmm_fp_emulate(int fp_fault, void *bundle, unsigned long *ipsr,
+                unsigned long *fpsr, unsigned long *isr, unsigned long *pr,
+                unsigned long *ifs, struct kvm_pt_regs *regs)
+{
+        fp_state_t fp_state;
+        fpswa_ret_t ret;
+        struct kvm_vcpu *vcpu = current_vcpu;
+        /* remember the current rr7 so it can be put back after the call */
+        uint64_t old_rr7 = ia64_get_rr(7UL<<61);
+        if (!vmm_fpswa_interface)
+                return (fpswa_ret_t) {-1, 0, 0, 0};
+        /*
+         * Let the fpswa driver use the hardware fp registers directly:
+         * an all-zero fp_state marks no fp register as valid in memory.
+         */
+        memset(&fp_state, 0, sizeof(fp_state_t));
+        /*
+         * unsigned long (*EFI_FPSWA) (
+         *      unsigned long    trap_type,
+         *      void             *Bundle,
+         *      unsigned long    *pipsr,
+         *      unsigned long    *pfsr,
+         *      unsigned long    *pisr,
+         *      unsigned long    *ppreds,
+         *      unsigned long    *pifs,
+         *      void             *fp_state);
+         */
+        /* Call the host fpswa interface directly to virtualize the
+         * guest fpswa request.
+         */
+        ia64_set_rr(7UL << 61, vcpu->arch.host.rr[7]);
+        ia64_srlz_d();
+        ret = (*vmm_fpswa_interface->fpswa) (fp_fault, bundle,
+                        ipsr, fpsr, isr, pr, ifs, &fp_state);
+        ia64_set_rr(7UL << 61, old_rr7);
+        ia64_srlz_d();
+        return ret;
+}
+/*
+ * Handle a floating-point assist fault (fp_fault != 0) or trap
+ * (fp_fault == 0) taken by the guest.
+ *
+ * Returns the FPSWA status on completion, -EAGAIN when the faulting bundle
+ * could not be fetched, or -EACCES when the fetched bundle is all zeroes.
+ */
+unsigned long vmm_handle_fpu_swa(int fp_fault, struct kvm_pt_regs *regs,
+                                        unsigned long isr)
+{
+        struct kvm_vcpu *vcpu = current_vcpu;
+        unsigned long bundle_ip = regs->cr_iip;
+        IA64_BUNDLE insn;
+        fpswa_ret_t result;
+        /*
+         * A trap is raised after the trapping instruction has completed,
+         * so with ipsr.ri == 0 that instruction sits in the previous
+         * bundle.
+         */
+        if (!fp_fault && ia64_psr(regs)->ri == 0)
+                bundle_ip -= 16;
+        if (fetch_code(vcpu, bundle_ip, &insn))
+                return -EAGAIN;
+        if (insn.i64[0] == 0 && insn.i64[1] == 0)
+                return -EACCES;
+        result = vmm_fp_emulate(fp_fault, &insn, &regs->cr_ipsr,
+                        &regs->ar_fpsr, &isr, &regs->pr, &regs->cr_ifs,
+                        regs);
+        return result.status;
+}
+/*
+ * Reflect an interruption taken by the host back into the guest.
+ *
+ *  ifa, isr, iim: interruption state to expose to the guest
+ *  vec:           vector index into vec2off[] (not an IVT offset)
+ *  regs:          guest register frame
+ *
+ * FP faults (vec 32) and FP traps (vec 33) are first offered to the FPSWA
+ * emulation and are reflected only if it fails with something other than
+ * -EAGAIN.  The VM is panicked when vec is outside vec2off[], or when the
+ * guest has psr.ic clear and the vector is not the Data Nested TLB one.
+ */
+void reflect_interruption(u64 ifa, u64 isr, u64 iim,
+                u64 vec, struct kvm_pt_regs *regs)
+{
+        u64 vector;
+        int status;
+        struct kvm_vcpu *vcpu = current_vcpu;
+        u64 vpsr = VCPU(vcpu, vpsr);
+        /* vec indexes vec2off[]; never read past the end of the table */
+        if (vec >= sizeof(vec2off) / sizeof(vec2off[0])) {
+                panic_vm(vcpu);
+                return;
+        }
+        vector = vec2off[vec];
+        if (!(vpsr & IA64_PSR_IC) && (vector != IA64_DATA_NESTED_TLB_VECTOR)) {
+                panic_vm(vcpu);
+                return;
+        }
+        switch (vec) {
+        case 32:        /*IA64_FP_FAULT_VECTOR*/
+                status = vmm_handle_fpu_swa(1, regs, isr);
+                if (!status) {
+                        /* emulated: step past the faulting instruction */
+                        vcpu_increment_iip(vcpu);
+                        return;
+                } else if (-EAGAIN == status)
+                        return;
+                break;
+        case 33:        /*IA64_FP_TRAP_VECTOR*/
+                status = vmm_handle_fpu_swa(0, regs, isr);
+                if (!status)
+                        return ;
+                else if (-EAGAIN == status) {
+                        /* bundle not fetched: back up so it is retried */
+                        vcpu_decrement_iip(vcpu);
+                        return ;
+                }
+                break;
+        default:
+                break;
+        }
+        VCPU(vcpu, isr) = isr;
+        VCPU(vcpu, iipa) = regs->cr_iip;
+        /* break and speculation report iim; everything else reports ifa */
+        if (vector == IA64_BREAK_VECTOR || vector == IA64_SPECULATION_VECTOR)
+                VCPU(vcpu, iim) = iim;
+        else
+                set_ifa_itir_iha(vcpu, ifa, 1, 1, 1);
+        inject_guest_interruption(vcpu, vector);
+}
+/*
+ * Stage a guest PAL call for the host: copy guest gr28-gr31 into the exit
+ * data and mark the exit reason as a PAL call.
+ */
+static void set_pal_call_data(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *ctl = &vcpu->arch.exit_data;
+        /* FIXME: for both static and stacked conventions the firmware
+         * has already placed the parameters in gr28-gr31 before
+         * breaking into the vmm. */
+        ctl->u.pal_data.gr28 = vcpu_get_gr(vcpu, 28);
+        ctl->u.pal_data.gr29 = vcpu_get_gr(vcpu, 29);
+        ctl->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30);
+        ctl->u.pal_data.gr31 = vcpu_get_gr(vcpu, 31);
+        ctl->exit_reason = EXIT_REASON_PAL_CALL;
+}
+/*
+ * Hand the PAL call result back to the guest in gr8-gr11.  The VM is
+ * panicked if the exit data no longer describes a PAL call.
+ */
+static void set_pal_call_result(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *ctl = &vcpu->arch.exit_data;
+        if (ctl->exit_reason != EXIT_REASON_PAL_CALL) {
+                panic_vm(vcpu);
+                return;
+        }
+        vcpu_set_gr(vcpu, 8, ctl->u.pal_data.ret.status, 0);
+        vcpu_set_gr(vcpu, 9, ctl->u.pal_data.ret.v0, 0);
+        vcpu_set_gr(vcpu, 10, ctl->u.pal_data.ret.v1, 0);
+        vcpu_set_gr(vcpu, 11, ctl->u.pal_data.ret.v2, 0);
+}
+/*
+ * Stage a guest SAL call for the host: copy guest gr32-gr39 into
+ * in0-in7 of the exit data and mark the exit reason as a SAL call.
+ */
+static void set_sal_call_data(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *ctl = &vcpu->arch.exit_data;
+        ctl->u.sal_data.in0 = vcpu_get_gr(vcpu, 32);
+        ctl->u.sal_data.in1 = vcpu_get_gr(vcpu, 33);
+        ctl->u.sal_data.in2 = vcpu_get_gr(vcpu, 34);
+        ctl->u.sal_data.in3 = vcpu_get_gr(vcpu, 35);
+        ctl->u.sal_data.in4 = vcpu_get_gr(vcpu, 36);
+        ctl->u.sal_data.in5 = vcpu_get_gr(vcpu, 37);
+        ctl->u.sal_data.in6 = vcpu_get_gr(vcpu, 38);
+        ctl->u.sal_data.in7 = vcpu_get_gr(vcpu, 39);
+        ctl->exit_reason = EXIT_REASON_SAL_CALL;
+}
+/*
+ * Hand the SAL call result back to the guest in gr8-gr11.  The VM is
+ * panicked if the exit data no longer describes a SAL call.
+ */
+static void set_sal_call_result(struct kvm_vcpu *vcpu)
+{
+        struct exit_ctl_data *ctl = &vcpu->arch.exit_data;
+        if (ctl->exit_reason != EXIT_REASON_SAL_CALL) {
+                panic_vm(vcpu);
+                return;
+        }
+        vcpu_set_gr(vcpu, 8, ctl->u.sal_data.ret.r8, 0);
+        vcpu_set_gr(vcpu, 9, ctl->u.sal_data.ret.r9, 0);
+        vcpu_set_gr(vcpu, 10, ctl->u.sal_data.ret.r10, 0);
+        vcpu_set_gr(vcpu, 11, ctl->u.sal_data.ret.r11, 0);
+}
+/*
+ * Handle a break instruction executed by the guest.
+ *
+ * At cpl 0 the immediates DOMN_PAL_REQUEST and DOMN_SAL_REQUEST are
+ * hypercalls: the request is staged, control is handed to the host, the
+ * result is written back and the guest IIP is stepped past the break.
+ * Every other break is reflected to the guest as vector index 11.
+ */
+void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
+                unsigned long isr, unsigned long iim)
+{
+        struct kvm_vcpu *vcpu = current_vcpu;
+        /* Hypercalls are honoured only when cpl = 0. */
+        if (ia64_psr(regs)->cpl != 0)
+                goto reflect;
+        switch (iim) {
+        case DOMN_PAL_REQUEST:
+                set_pal_call_data(vcpu);
+                vmm_transition(vcpu);
+                set_pal_call_result(vcpu);
+                vcpu_increment_iip(vcpu);
+                return;
+        case DOMN_SAL_REQUEST:
+                set_sal_call_data(vcpu);
+                vmm_transition(vcpu);
+                set_sal_call_result(vcpu);
+                vcpu_increment_iip(vcpu);
+                return;
+        default:
+                break;
+        }
+reflect:
+        reflect_interruption(ifa, isr, iim, 11, regs);
+}
+/*
+ * Re-evaluate the guest's pending interrupts.
+ *
+ * With nothing pending the vhpi is reset to NULL_VECTOR.  An unmasked
+ * interrupt with vpsr.i set is published in vhpi and delivered as an
+ * external interrupt (vector index 12).  When masked by an in-service
+ * interrupt a non-zero vhpi is reset; when masked otherwise the pending
+ * vector is only published in vhpi.
+ */
+void check_pending_irq(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        unsigned long psr;
+        int pending, inservice, masked;
+        pending = highest_pending_irq(vcpu);
+        if (pending == NULL_VECTOR) {
+                update_vhpi(vcpu, NULL_VECTOR);
+                return;
+        }
+        inservice = highest_inservice_irq(vcpu);
+        psr = VCPU(vcpu, vpsr);
+        masked = irq_masked(vcpu, pending, inservice);
+        if (masked == IRQ_NO_MASKED && (psr & IA64_PSR_I)) {
+                update_vhpi(vcpu, pending);
+                /* EXT IRQ */
+                reflect_interruption(0, psr & IA64_PSR_RI, 0, 12, regs);
+                return;
+        }
+        if (masked == IRQ_MASKED_BY_INSVC) {
+                if (VCPU(vcpu, vhpi))
+                        update_vhpi(vcpu, NULL_VECTOR);
+                return;
+        }
+        /* masked by vpsr.i or vtpr */
+        update_vhpi(vcpu, pending);
+}
+/*
+ * Deliver an external interrupt (vector index 12) to the guest, passing
+ * the guest's psr.ri bits as the ISR value.  The VM is panicked if the
+ * guest has psr.ic clear.
+ */
+static void generate_exirq(struct kvm_vcpu *vcpu)
+{
+        /*
+         * Must be 64 bits wide: with a plain 'unsigned' the PSR is
+         * truncated to its low 32 bits and the psr.ri field extracted
+         * below (above bit 31) would always read as zero.
+         */
+        unsigned long vpsr;
+        uint64_t isr;
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        vpsr = VCPU(vcpu, vpsr);
+        isr = vpsr & IA64_PSR_RI;
+        if (!(vpsr & IA64_PSR_IC))
+                panic_vm(vcpu);
+        reflect_interruption(0, isr, 0, 12, regs); /* EXT IRQ */
+}
+/*
+ * Raise an external interrupt when the guest's vhpi exceeds the current
+ * masking threshold, which is built from !vpsr.i (bit 5), vtpr.mmi (bit 4)
+ * and vtpr.mic (low bits).
+ */
+void vhpi_detection(struct kvm_vcpu *vcpu)
+{
+        struct ia64_psr cur_psr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+        union ia64_tpr cur_tpr;
+        uint64_t limit, highest;
+        cur_tpr.val = VCPU(vcpu, tpr);
+        limit = ((!cur_psr.i) << 5) | (cur_tpr.mmi << 4) | cur_tpr.mic;
+        highest = VCPU(vcpu, vhpi);
+        /* an interrupt above the threshold has become deliverable */
+        if (highest > limit)
+                generate_exirq(vcpu);
+}
+/*
+ * Work done on the current vcpu just before returning to the guest:
+ * first the deferred timer check, then the interrupt checks.
+ *
+ * Timer: when timer_check and itc_check are both set and the guest ITC has
+ * passed ITM, the ITV vector is pended (and itc_check cleared) unless ITV
+ * bit 16 is set, in which case only timer_pending is recorded.
+ * Interrupts: a newly pended interrupt triggers a full check_pending_irq();
+ * otherwise a requested irq_check runs vhpi_detection().
+ */
+void leave_hypervisor_tail(void)
+{
+        struct kvm_vcpu *vcpu = current_vcpu;
+        if (VMX(vcpu, timer_check)) {
+                VMX(vcpu, timer_check) = 0;
+                if (VMX(vcpu, itc_check)
+                                && vcpu_get_itc(vcpu) > VCPU(vcpu, itm)) {
+                        if (VCPU(vcpu, itv) & (1 << 16)) {
+                                vcpu->arch.timer_pending = 1;
+                        } else {
+                                vcpu_pend_interrupt(vcpu,
+                                                VCPU(vcpu, itv) & 0xff);
+                                VMX(vcpu, itc_check) = 0;
+                        }
+                        VMX(vcpu, last_itc) = VCPU(vcpu, itm) + 1;
+                }
+        }
+        rmb();
+        if (vcpu->arch.irq_new_pending) {
+                vcpu->arch.irq_new_pending = 0;
+                VMX(vcpu, irq_check) = 0;
+                check_pending_irq(vcpu);
+        } else if (VMX(vcpu, irq_check)) {
+                VMX(vcpu, irq_check) = 0;
+                vhpi_detection(vcpu);
+        }
+}
+/*
+ * Set psr.ed in the saved guest IPSR.  kvm_page_fault() calls this for a
+ * data TLB miss whose ISR has the sp bit set, instead of reflecting a fault.
+ */
+static inline void handle_lds(struct kvm_pt_regs *regs)
+{
+        regs->cr_ipsr |= IA64_PSR_ED;
+}
+/*
+ * Resolve a TLB miss taken while the guest runs in physical mode: insert
+ * a VHPT entry for vadr whose pte is the page-number bits of vadr combined
+ * with PHY_PAGE_WB, using the page size of the machine region register
+ * that covers vadr.
+ */
+void physical_tlb_miss(struct kvm_vcpu *vcpu, unsigned long vadr, int type)
+{
+        union ia64_rr region;
+        unsigned long pte;
+        region.val = ia64_get_rr(vadr);
+        pte = (vadr & _PAGE_PPN_MASK) | PHY_PAGE_WB;
+        thash_vhpt_insert(vcpu, pte, (u64)(region.ps << 2), vadr, type);
+}
+/*
+ * Handle a guest TLB miss at 'vadr'.
+ *
+ * 'vec' is used as the miss type and compared against D_TLB / I_TLB.
+ * The miss is resolved in this order:
+ *  1. guest in physical mode: emulate the access if it targets an I/O page
+ *     (vec == 2 only), otherwise insert a physical-mode mapping;
+ *  2. hit in the software vTLB: emulate I/O (data side) or copy the entry
+ *     into the VHPT;
+ *  3. miss in the vTLB: consult the guest's own VHPT and either insert the
+ *     translation or reflect the appropriate fault to the guest.
+ * On the data side, faults that cannot be reported because vPSR.ic is
+ * clear are delivered as Data Nested TLB faults.
+ */
+void kvm_page_fault(u64 vadr , u64 vec, struct kvm_pt_regs *regs)
+{
+        unsigned long vpsr;
+        int type;
+        u64 vhpt_adr, gppa, pteval, rr, itir;
+        union ia64_isr misr;
+        /* NOTE(review): vpta is assigned below but never read here */
+        union ia64_pta vpta;
+        struct thash_data *data;
+        struct kvm_vcpu *v = current_vcpu;
+        vpsr = VCPU(v, vpsr);
+        misr.val = VMX(v, cr_isr);
+        type = vec;
+        /* physical mode, and bits 62:61 of vadr are both zero */
+        if (is_physical_mode(v) && (!(vadr << 1 >> 62))) {
+                /* presumably 2 is the data TLB miss type - TODO confirm */
+                if (vec == 2) {
+                        /* bit 63 of vadr is dropped before the I/O test */
+                        if (__gpfn_is_io((vadr << 1) >> (PAGE_SHIFT + 1))) {
+                                emulate_io_inst(v, ((vadr << 1) >> 1), 4);
+                                return;
+                        }
+                }
+                physical_tlb_miss(v, vadr, type);
+                return;
+        }
+        data = vtlb_lookup(v, vadr, type);
+        if (data != 0) {
+                if (type == D_TLB) {
+                        /* guest physical address: page offset + page base */
+                        gppa = (vadr & ((1UL << data->ps) - 1))
+                                + (data->ppn >> (data->ps - 12) << data->ps);
+                        if (__gpfn_is_io(gppa >> PAGE_SHIFT)) {
+                                /* emulate only if the entry's pl permits
+                                 * the current cpl; otherwise reflect a
+                                 * data access rights fault
+                                 */
+                                if (data->pl >= ((regs->cr_ipsr >>
+                                                IA64_PSR_CPL0_BIT) & 3))
+                                        emulate_io_inst(v, gppa, data->ma);
+                                else {
+                                        vcpu_set_isr(v, misr.val);
+                                        data_access_rights(v, vadr);
+                                }
+                                return ;
+                        }
+                }
+                thash_vhpt_insert(v, data->page_flags, data->itir, vadr, type);
+        } else if (type == D_TLB) {
+                /* isr.sp set: mark psr.ed instead of faulting */
+                if (misr.sp) {
+                        handle_lds(regs);
+                        return;
+                }
+                rr = vcpu_get_rr(v, vadr);
+                itir = rr & (RR_RID_MASK | RR_PS_MASK);
+                /* guest VHPT walker disabled for this reference */
+                if (!vhpt_enabled(v, vadr, misr.rs ? RSE_REF : DATA_REF)) {
+                        if (vpsr & IA64_PSR_IC) {
+                                vcpu_set_isr(v, misr.val);
+                                alt_dtlb(v, vadr);
+                        } else {
+                                nested_dtlb(v);
+                        }
+                        return ;
+                }
+                vpta.val = vcpu_get_pta(v);
+                /* avoid recursively walking (short format) VHPT */
+                vhpt_adr = vcpu_thash(v, vadr);
+                if (!guest_vhpt_lookup(vhpt_adr, &pteval)) {
+                        /* VHPT successfully read.  */
+                        if (!(pteval & _PAGE_P)) {
+                                if (vpsr & IA64_PSR_IC) {
+                                        vcpu_set_isr(v, misr.val);
+                                        dtlb_fault(v, vadr);
+                                } else {
+                                        nested_dtlb(v);
+                                }
+                        } else if ((pteval & _PAGE_MA_MASK) != _PAGE_MA_ST) {
+                                /* present and not _PAGE_MA_ST: install it */
+                                thash_purge_and_insert(v, pteval, itir,
+                                                                vadr, D_TLB);
+                        } else if (vpsr & IA64_PSR_IC) {
+                                vcpu_set_isr(v, misr.val);
+                                dtlb_fault(v, vadr);
+                        } else {
+                                nested_dtlb(v);
+                        }
+                } else {
+                        /* Can't read VHPT.  */
+                        if (vpsr & IA64_PSR_IC) {
+                                vcpu_set_isr(v, misr.val);
+                                dvhpt_fault(v, vadr);
+                        } else {
+                                nested_dtlb(v);
+                        }
+                }
+        } else if (type == I_TLB) {
+                /* with vPSR.ic clear the reported ISR gets ni set */
+                if (!(vpsr & IA64_PSR_IC))
+                        misr.ni = 1;
+                if (!vhpt_enabled(v, vadr, INST_REF)) {
+                        vcpu_set_isr(v, misr.val);
+                        alt_itlb(v, vadr);
+                        return;
+                }
+                vpta.val = vcpu_get_pta(v);
+                vhpt_adr = vcpu_thash(v, vadr);
+                if (!guest_vhpt_lookup(vhpt_adr, &pteval)) {
+                        /* VHPT successfully read.  */
+                        if (pteval & _PAGE_P) {
+                                if ((pteval & _PAGE_MA_MASK) == _PAGE_MA_ST) {
+                                        vcpu_set_isr(v, misr.val);
+                                        itlb_fault(v, vadr);
+                                        return ;
+                                }
+                                rr = vcpu_get_rr(v, vadr);
+                                itir = rr & (RR_RID_MASK | RR_PS_MASK);
+                                thash_purge_and_insert(v, pteval, itir,
+                                                        vadr, I_TLB);
+                        } else {
+                                vcpu_set_isr(v, misr.val);
+                                inst_page_not_present(v, vadr);
+                        }
+                } else {
+                        /* Can't read VHPT.  */
+                        vcpu_set_isr(v, misr.val);
+                        ivhpt_fault(v, vadr);
+                }
+        }
+}
+/*
+ * Reflect an external interrupt (vector index 12) to the guest, passing
+ * the guest's psr.ri bits as the ISR value.
+ */
+void kvm_vexirq(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        u64 psr = VCPU(vcpu, vpsr);
+        reflect_interruption(0, psr & IA64_PSR_RI, 0, 12, regs); /*EXT IRQ*/
+}
+/*
+ * Handle a host external interrupt taken while the guest was running:
+ * with local interrupts disabled, exit to the host with
+ * EXIT_REASON_EXTERNAL_INTERRUPT; once control comes back, request a timer
+ * check for the next return to the guest.
+ */
+void kvm_ia64_handle_irq(struct kvm_vcpu *v)
+{
+        struct exit_ctl_data *p = &v->arch.exit_data;
+        /* irq flags are an unsigned long by kernel convention */
+        unsigned long psr;
+        local_irq_save(psr);
+        p->exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+        vmm_transition(v);
+        local_irq_restore(psr);
+        VMX(v, timer_check) = 1;
+}
+/*
+ * Carry out one queued remote purge request (entry 'pos' of
+ * v->arch.ptc_g_data) on this vcpu.
+ *
+ * Guest region 0 state (vrr[0], psbits[0]) and machine rr0 are temporarily
+ * replaced with values derived from the request, the entries covering the
+ * page-aligned address are purged, and everything is restored.  The
+ * save / switch / purge / restore order must be kept.
+ */
+static void ptc_ga_remote_func(struct kvm_vcpu *v, int pos)
+{
+        u64 oldrid, moldrid, oldpsbits, vaddr;
+        struct kvm_ptc_g *p = &v->arch.ptc_g_data[pos];
+        vaddr = p->vaddr;
+        /* save guest region 0 state, then install the request's rr */
+        oldrid = VMX(v, vrr[0]);
+        VMX(v, vrr[0]) = p->rr;
+        oldpsbits = VMX(v, psbits[0]);
+        VMX(v, psbits[0]) = VMX(v, psbits[REGION_NUMBER(vaddr)]);
+        /* same for the machine region register 0 */
+        moldrid = ia64_get_rr(0x0);
+        ia64_set_rr(0x0, vrrtomrr(p->rr));
+        ia64_srlz_d();
+        vaddr = PAGEALIGN(vaddr, p->ps);
+        thash_purge_entries_remote(v, vaddr, p->ps);
+        /* put back everything that was switched above */
+        VMX(v, vrr[0]) = oldrid;
+        VMX(v, psbits[0]) = oldpsbits;
+        ia64_set_rr(0x0, moldrid);
+        ia64_dv_serialize_data();
+}
+/*
+ * Rebuild the per-vcpu translation state after a resume: re-initialise
+ * the VHPT and the VTLB hash tables and reload the machine PTA from the
+ * VHPT's pta value.
+ */
+static void vcpu_do_resume(struct kvm_vcpu *vcpu)
+{
+        /*Re-init VHPT and VTLB once from resume*/
+        vcpu->arch.vhpt.num = VHPT_NUM_ENTRIES;
+        thash_init(&vcpu->arch.vhpt, VHPT_SHIFT);
+        vcpu->arch.vtlb.num = VTLB_NUM_ENTRIES;
+        thash_init(&vcpu->arch.vtlb, VTLB_SHIFT);
+        ia64_set_pta(vcpu->arch.vhpt.pta.val);
+}
+/*
+ * Service at most one kind of pending vcpu request, in priority order:
+ * KVM_REQ_RESUME, then KVM_REQ_TLB_FLUSH, then KVM_REQ_PTC_G.  For PTC_G
+ * every queued purge request is processed, last queued first.
+ */
+static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
+{
+        if (test_and_clear_bit(KVM_REQ_RESUME, &vcpu->requests)) {
+                vcpu_do_resume(vcpu);
+        } else if (unlikely(test_and_clear_bit(KVM_REQ_TLB_FLUSH,
+                                                &vcpu->requests))) {
+                thash_purge_all(vcpu);
+        } else if (test_and_clear_bit(KVM_REQ_PTC_G, &vcpu->requests)) {
+                while (vcpu->arch.ptc_g_count > 0)
+                        ptc_ga_remote_func(vcpu, --vcpu->arch.ptc_g_count);
+        }
+}
+/*
+ * Switch from the guest context to the host and back.
+ *
+ * The vcpu's VPD is saved through PAL_VPS_SAVE, vmm_trampoline() is
+ * called with the guest and host contexts, and after it returns the VPD
+ * is restored through PAL_VPS_RESTORE and any pending vcpu request is
+ * serviced.  The order of these steps must be kept.
+ */
+void vmm_transition(struct kvm_vcpu *vcpu)
+{
+        ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
+                        0, 0, 0, 0, 0, 0);
+        vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
+        ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
+                                                0, 0, 0, 0, 0, 0);
+        kvm_do_resume_op(vcpu);
+}
diff --git a/arch/ia64/kvm/trampoline.S b/arch/ia64/kvm/trampoline.S
new file mode 100644
index 00000000000..30897d44d61
--- /dev/null
+++ b/arch/ia64/kvm/trampoline.S
@@ -0,0 +1,1038 @@
+/* Save all processor states
+ *
+ * Copyright (c) 2007 Fleming Feng <fleming.feng@intel.com>
+ * Copyright (c) 2007 Anthony Xu   <anthony.xu@intel.com>
+ */
+#include <asm/asmmacro.h>
+#include "asm-offsets.h"
+/* CTX(name) pastes to VMM_CTX_<name>_OFFSET, the byte offset added to a context base register below. */
+#define CTX(name)    VMM_CTX_##name##_OFFSET
+        /*
+         *      r32:            context_t base address
+         *
+         *      Store branch registers b0-b5 into the context.  r2 starts
+         *      at CTX(B0) and receives b0, b2, b4; r3 starts at CTX(B1)
+         *      and receives b1, b3, b5.  Each pointer advances 16 bytes
+         *      per store.  Clobbers r2, r3, r16 and r17.
+         */
+#define SAVE_BRANCH_REGS                        \
+        add     r2 = CTX(B0),r32;               \
+        add     r3 = CTX(B1),r32;               \
+        mov     r16 = b0;                       \
+        mov     r17 = b1;                       \
+        ;;                                      \
+        st8     [r2]=r16,16;                    \
+        st8     [r3]=r17,16;                    \
+        ;;                                      \
+        mov     r16 = b2;                       \
+        mov     r17 = b3;                       \
+        ;;                                      \
+        st8     [r2]=r16,16;                    \
+        st8     [r3]=r17,16;                    \
+        ;;                                      \
+        mov     r16 = b4;                       \
+        mov     r17 = b5;                       \
+        ;;                                      \
+        st8     [r2]=r16;                       \
+        st8     [r3]=r17;                       \
+        ;;
+        /*
+         *      r33:            context_t base address
+         *
+         *      Reload branch registers b0-b5 from the context, reading
+         *      the same slots SAVE_BRANCH_REGS writes: r2 walks from
+         *      CTX(B0) (b0, b2, b4) and r3 from CTX(B1) (b1, b3, b5) in
+         *      16-byte steps.  Clobbers r2, r3, r16 and r17.
+         */
+#define RESTORE_BRANCH_REGS                     \
+        add     r2 = CTX(B0),r33;               \
+        add     r3 = CTX(B1),r33;               \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     b0 = r16;                       \
+        mov     b1 = r17;                       \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     b2 = r16;                       \
+        mov     b3 = r17;                       \
+        ;;                                      \
+        ld8     r16=[r2];                       \
+        ld8     r17=[r3];                       \
+        ;;                                      \
+        mov     b4=r16;                         \
+        mov     b5=r17;                         \
+        ;;
+        /*
+         *      r32: context_t base address
+         *      bsw == 1
+         *
+         *      Spill general registers r4 ~ r7 and also r12 and r13 with
+         *      st8.spill.  r2 starts at CTX(R4) and stores r4, r6, r12;
+         *      r3 starts at CTX(R5) and stores r5, r7, r13.  The first
+         *      step is 16 bytes and the second 48 bytes, so r12 and r13
+         *      land 64 bytes past CTX(R4) and CTX(R5) respectively.
+         *      Clobbers r2 and r3.
+         */
+#define SAVE_GENERAL_REGS                       \
+        add     r2=CTX(R4),r32;                 \
+        add     r3=CTX(R5),r32;                 \
+        ;;                                      \
+.mem.offset 0,0;                                \
+        st8.spill       [r2]=r4,16;             \
+.mem.offset 8,0;                                \
+        st8.spill       [r3]=r5,16;             \
+        ;;                                      \
+.mem.offset 0,0;                                \
+        st8.spill       [r2]=r6,48;             \
+.mem.offset 8,0;                                \
+        st8.spill       [r3]=r7,48;             \
+        ;;                                      \
+.mem.offset 0,0;                                \
+    st8.spill    [r2]=r12;                      \
+.mem.offset 8,0;                                \
+    st8.spill    [r3]=r13;                      \
+    ;;
+        /*
+         *      r33: context_t base address
+         *      bsw == 1
+         *
+         *      Fill r4 ~ r7, r12 and r13 with ld8.fill from the slots
+         *      SAVE_GENERAL_REGS writes: r2 walks from CTX(R4) (r4, r6,
+         *      r12) and r3 from CTX(R5) (r5, r7, r13), stepping 16 and
+         *      then 48 bytes.  Clobbers r2 and r3.
+         */
+#define RESTORE_GENERAL_REGS                    \
+        add     r2=CTX(R4),r33;                 \
+        add     r3=CTX(R5),r33;                 \
+        ;;                                      \
+        ld8.fill        r4=[r2],16;             \
+        ld8.fill        r5=[r3],16;             \
+        ;;                                      \
+        ld8.fill        r6=[r2],48;             \
+        ld8.fill        r7=[r3],48;             \
+        ;;                                      \
+        ld8.fill    r12=[r2];                   \
+        ld8.fill    r13 =[r3];                  \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Store kernel registers ar.k0-ar.k7 into the context.  r2
+         *      starts at CTX(KR0) and receives k0, k2, k4, k6; r3 starts
+         *      at CTX(KR1) and receives k1, k3, k5, k7.  Each pointer
+         *      advances 16 bytes per store.
+         *      Clobbers r2, r3, r16 and r17.
+         */
+#define SAVE_KERNEL_REGS                        \
+        add     r2 = CTX(KR0),r32;              \
+        add     r3 = CTX(KR1),r32;              \
+        mov     r16 = ar.k0;                    \
+        mov     r17 = ar.k1;                    \
+        ;;                                      \
+        st8     [r2] = r16,16;                  \
+        st8     [r3] = r17,16;                  \
+        ;;                                      \
+        mov     r16 = ar.k2;                    \
+        mov     r17 = ar.k3;                    \
+        ;;                                      \
+        st8     [r2] = r16,16;                  \
+        st8     [r3] = r17,16;                  \
+        ;;                                      \
+        mov     r16 = ar.k4;                    \
+        mov     r17 = ar.k5;                    \
+        ;;                                      \
+        st8     [r2] = r16,16;                  \
+        st8     [r3] = r17,16;                  \
+        ;;                                      \
+        mov     r16 = ar.k6;                    \
+        mov     r17 = ar.k7;                    \
+        ;;                                      \
+        st8     [r2] = r16;                     \
+        st8     [r3] = r17;                     \
+        ;;
+        /*
+         *      r33:            context_t base address
+         *
+         *      Reload kernel registers ar.k0-ar.k7 from the slots that
+         *      SAVE_KERNEL_REGS writes: r2 walks from CTX(KR0) (k0, k2,
+         *      k4, k6) and r3 from CTX(KR1) (k1, k3, k5, k7) in 16-byte
+         *      steps.  The final pair of loads still post-increments the
+         *      pointers; the macro does not use them afterwards.
+         *      Clobbers r2, r3, r16 and r17.
+         */
+#define RESTORE_KERNEL_REGS                     \
+        add     r2 = CTX(KR0),r33;              \
+        add     r3 = CTX(KR1),r33;              \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     ar.k0=r16;                      \
+        mov     ar.k1=r17;                      \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     ar.k2=r16;                      \
+        mov     ar.k3=r17;                      \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     ar.k4=r16;                      \
+        mov     ar.k5=r17;                      \
+        ;;                                      \
+        ld8     r16=[r2],16;                    \
+        ld8     r17=[r3],16;                    \
+        ;;                                      \
+        mov     ar.k6=r16;                      \
+        mov     ar.k7=r17;                      \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Store application registers into the context, in this
+         *      order: ar.bspstore, ar.rnat, ar.fcr, ar.eflag, ar.cflg,
+         *      ar.fsr, ar.fir, ar.fdr, ar.unat, ar.fpsr, ar.pfs, ar.lc.
+         *      A single pointer (r2) is moved from one slot to the next
+         *      by the difference of the two CTX() offsets, so the slots
+         *      need not be contiguous.  Clobbers r2 and r16.
+         */
+#define SAVE_APP_REGS                           \
+        add  r2 = CTX(BSPSTORE),r32;            \
+        mov  r16 = ar.bspstore;                 \
+        ;;                                      \
+        st8  [r2] = r16,CTX(RNAT)-CTX(BSPSTORE);\
+        mov  r16 = ar.rnat;                     \
+        ;;                                      \
+        st8  [r2] = r16,CTX(FCR)-CTX(RNAT);     \
+        mov  r16 = ar.fcr;                      \
+        ;;                                      \
+        st8  [r2] = r16,CTX(EFLAG)-CTX(FCR);    \
+        mov  r16 = ar.eflag;                    \
+        ;;                                      \
+        st8  [r2] = r16,CTX(CFLG)-CTX(EFLAG);   \
+        mov  r16 = ar.cflg;                     \
+        ;;                                      \
+        st8  [r2] = r16,CTX(FSR)-CTX(CFLG);     \
+        mov  r16 = ar.fsr;                      \
+        ;;                                      \
+        st8  [r2] = r16,CTX(FIR)-CTX(FSR);      \
+        mov  r16 = ar.fir;                      \
+        ;;                                      \
+        st8  [r2] = r16,CTX(FDR)-CTX(FIR);      \
+        mov  r16 = ar.fdr;                      \
+        ;;                                      \
+        st8  [r2] = r16,CTX(UNAT)-CTX(FDR);     \
+        mov  r16 = ar.unat;                     \
+        ;;                                      \
+        st8  [r2] = r16,CTX(FPSR)-CTX(UNAT);    \
+        mov  r16 = ar.fpsr;                     \
+        ;;                                      \
+        st8  [r2] = r16,CTX(PFS)-CTX(FPSR);     \
+        mov  r16 = ar.pfs;                      \
+        ;;                                      \
+        st8  [r2] = r16,CTX(LC)-CTX(PFS);       \
+        mov  r16 = ar.lc;                       \
+        ;;                                      \
+        st8  [r2] = r16;                        \
+        ;;
+        /*
+         *      r33:            context_t base address
+         *
+         *      Reload application registers from the context, in this
+         *      order: ar.bspstore, ar.rnat, ar.fcr, ar.eflag, ar.cflg,
+         *      ar.fsr, ar.fir, ar.fdr, ar.unat, ar.fpsr, ar.pfs, ar.lc.
+         *      r2 steps between slots by CTX() offset differences, the
+         *      same walk SAVE_APP_REGS uses.  Clobbers r2 and r16.
+         */
+#define RESTORE_APP_REGS                        \
+        add  r2=CTX(BSPSTORE),r33;              \
+        ;;                                      \
+        ld8  r16=[r2],CTX(RNAT)-CTX(BSPSTORE);  \
+        ;;                                      \
+        mov  ar.bspstore=r16;                   \
+        ld8  r16=[r2],CTX(FCR)-CTX(RNAT);       \
+        ;;                                      \
+        mov  ar.rnat=r16;                       \
+        ld8  r16=[r2],CTX(EFLAG)-CTX(FCR);      \
+        ;;                                      \
+        mov  ar.fcr=r16;                        \
+        ld8  r16=[r2],CTX(CFLG)-CTX(EFLAG);     \
+        ;;                                      \
+        mov  ar.eflag=r16;                      \
+        ld8  r16=[r2],CTX(FSR)-CTX(CFLG);       \
+        ;;                                      \
+        mov  ar.cflg=r16;                       \
+        ld8  r16=[r2],CTX(FIR)-CTX(FSR);        \
+        ;;                                      \
+        mov  ar.fsr=r16;                        \
+        ld8  r16=[r2],CTX(FDR)-CTX(FIR);        \
+        ;;                                      \
+        mov  ar.fir=r16;                        \
+        ld8  r16=[r2],CTX(UNAT)-CTX(FDR);       \
+        ;;                                      \
+        mov  ar.fdr=r16;                        \
+        ld8  r16=[r2],CTX(FPSR)-CTX(UNAT);      \
+        ;;                                      \
+        mov  ar.unat=r16;                       \
+        ld8  r16=[r2],CTX(PFS)-CTX(FPSR);       \
+        ;;                                      \
+        mov  ar.fpsr=r16;                       \
+        ld8  r16=[r2],CTX(LC)-CTX(PFS);         \
+        ;;                                      \
+        mov  ar.pfs=r16;                        \
+        ld8  r16=[r2];                          \
+        ;;                                      \
+        mov  ar.lc=r16;                         \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Store control registers cr.dcr, cr.iva and cr.pta into
+         *      the slots at CTX(DCR), CTX(IVA) and CTX(PTA), moving r2
+         *      between them by the offset differences.
+         *      Clobbers r2 and r16.
+         */
+#define SAVE_CTL_REGS                           \
+        add     r2 = CTX(DCR),r32;              \
+        mov     r16 = cr.dcr;                   \
+        ;;                                      \
+        st8     [r2] = r16,CTX(IVA)-CTX(DCR);   \
+        ;;                                      \
+        mov     r16 = cr.iva;                   \
+        ;;                                      \
+        st8     [r2] = r16,CTX(PTA)-CTX(IVA);   \
+        ;;                                      \
+        mov r16 = cr.pta;                       \
+        ;;                                      \
+        st8 [r2] = r16 ;                        \
+        ;;
+        /*
+         *      r33:            context_t base address
+         *
+         *      Reload cr.dcr, cr.iva and cr.pta from the slots at
+         *      CTX(DCR), CTX(IVA) and CTX(PTA).  Every control-register
+         *      write is followed by dv_serialize_data before the next
+         *      load is issued.  Clobbers r2 and r16.
+         */
+#define RESTORE_CTL_REGS                                \
+        add     r2 = CTX(DCR),r33;                      \
+        ;;                                              \
+        ld8     r16 = [r2],CTX(IVA)-CTX(DCR);           \
+        ;;                                              \
+        mov     cr.dcr = r16;                           \
+        dv_serialize_data;                              \
+        ;;                                              \
+        ld8     r16 = [r2],CTX(PTA)-CTX(IVA);           \
+        ;;                                              \
+        mov     cr.iva = r16;                           \
+        dv_serialize_data;                              \
+        ;;                                              \
+        ld8 r16 = [r2];                                 \
+        ;;                                              \
+        mov cr.pta = r16;                               \
+        dv_serialize_data;                              \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Store region registers rr0-rr5 and rr7 into 8-byte slots
+         *      starting at CTX(RR0).  The region index is placed in bits
+         *      63:61 of r18 with dep.z.  rr6 is not read: the store of
+         *      rr5 advances r2 by 16, leaving the rr6 slot untouched.
+         *      NOTE(review): presumably rr6 is left alone because it maps
+         *      the VMM itself -- confirm.
+         *      Clobbers r2, r16, r17 and r18.
+         */
+#define SAVE_REGION_REGS                        \
+        add     r2=CTX(RR0),r32;                \
+        mov     r16=rr[r0];                     \
+        dep.z   r18=1,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        mov     r17=rr[r18];                    \
+        dep.z   r18=2,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r17,8;                     \
+        mov     r16=rr[r18];                    \
+        dep.z   r18=3,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        mov     r17=rr[r18];                    \
+        dep.z   r18=4,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r17,8;                     \
+        mov     r16=rr[r18];                    \
+        dep.z   r18=5,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        mov     r17=rr[r18];                    \
+        dep.z   r18=7,61,3;                     \
+        ;;                                      \
+        st8     [r2]=r17,16;                    \
+        mov     r16=rr[r18];                    \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        ;;
+        /*
+         *      r33:context_t base address
+         *
+         *      Reload region registers rr0-rr5 and rr7 from the slots
+         *      starting at CTX(RR0).  All seven values are first loaded
+         *      into r20-r25 and r27 (the rr6 slot is stepped over), then
+         *      written one region at a time with the index in bits 63:61
+         *      of r18, and the sequence ends with srlz.i.
+         *      Clobbers r2, r18, r20-r25 and r27.
+         */
+#define RESTORE_REGION_REGS     \
+        add     r2=CTX(RR0),r33;\
+        mov r18=r0;             \
+        ;;                      \
+        ld8     r20=[r2],8;     \
+        ;;      /* rr0 */       \
+        ld8     r21=[r2],8;     \
+        ;;      /* rr1 */       \
+        ld8     r22=[r2],8;     \
+        ;;      /* rr2 */       \
+        ld8     r23=[r2],8;     \
+        ;;      /* rr3 */       \
+        ld8     r24=[r2],8;     \
+        ;;      /* rr4 */       \
+        ld8     r25=[r2],16;    \
+        ;;      /* rr5 */       \
+        ld8     r27=[r2];       \
+        ;;      /* rr7 */       \
+        mov rr[r18]=r20;        \
+        dep.z   r18=1,61,3;     \
+        ;;  /* rr1 */           \
+        mov rr[r18]=r21;        \
+        dep.z   r18=2,61,3;     \
+        ;;  /* rr2 */           \
+        mov rr[r18]=r22;        \
+        dep.z   r18=3,61,3;     \
+        ;;  /* rr3 */           \
+        mov rr[r18]=r23;        \
+        dep.z   r18=4,61,3;     \
+        ;;  /* rr4 */           \
+        mov rr[r18]=r24;        \
+        dep.z   r18=5,61,3;     \
+        ;;  /* rr5 */           \
+        mov rr[r18]=r25;        \
+        dep.z   r18=7,61,3;     \
+        ;;  /* rr7 */           \
+        mov rr[r18]=r27;        \
+        ;;                      \
+        srlz.i;                 \
+        ;;
+        /*
+         *      r32:    context_t base address
+         *
+         *      Store ibr[0..7] and dbr[0..7] into consecutive 8-byte
+         *      slots starting at CTX(IBR0) and CTX(DBR0).  Exactly one
+         *      slot is written per register index, so slot N holds
+         *      register N, matching the eight-iteration load loop of
+         *      RESTORE_DEBUG_REGS.
+         *      Clobbers r2, r3, r16, r17 and r18.
+         */
+#define SAVE_DEBUG_REGS                         \
+        add     r2=CTX(IBR0),r32;               \
+        add     r3=CTX(DBR0),r32;               \
+        mov     r16=ibr[r0];                    \
+        mov     r17=dbr[r0];                    \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=1,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=2,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=3,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=4,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=5,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=6,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        add     r18=7,r0;                       \
+        ;;                                      \
+        mov     r16=ibr[r18];                   \
+        mov     r17=dbr[r18];                   \
+        ;;                                      \
+        st8     [r2]=r16,8;                     \
+        st8     [r3]=r17,8;                     \
+        ;;
+/*
+ *      r33:    point to context_t structure
+ *      ar.lc are corrupted.
+ *
+ *      Reload ibr[0..7] and dbr[0..7] from consecutive 8-byte slots
+ *      starting at CTX(IBR0) and CTX(DBR0).  ar.lc is set to 7 and the
+ *      body is repeated through br.cloop, i.e. eight passes; r17 is the
+ *      register index and is incremented on every pass, and srlz.i is
+ *      issued after each pair of writes.
+ *      Uses the numeric local label 1.
+ *      Clobbers r2, r3, r16-r19 and ar.lc.
+ */
+#define RESTORE_DEBUG_REGS                      \
+        add     r2=CTX(IBR0),r33;               \
+        add     r3=CTX(DBR0),r33;               \
+        mov r16=7;                              \
+        mov r17=r0;                             \
+        ;;                                      \
+        mov ar.lc = r16;                        \
+        ;;                                      \
+1:                                              \
+        ld8 r18=[r2],8;                         \
+        ld8 r19=[r3],8;                         \
+        ;;                                      \
+        mov ibr[r17]=r18;                       \
+        mov dbr[r17]=r19;                       \
+        ;;                                      \
+        srlz.i;                                 \
+        ;;                                      \
+        add r17=1,r17;                          \
+        br.cloop.sptk 1b;                       \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Spill floating-point registers f2-f31 into the context
+         *      with stf.spill.nta.  r2 starts at CTX(F2) and takes the
+         *      even-numbered registers, r3 starts at CTX(F3) and takes
+         *      the odd-numbered ones; each pointer advances 32 bytes per
+         *      store.  Clobbers r2 and r3.
+         */
+#define SAVE_FPU_LOW                            \
+        add     r2=CTX(F2),r32;                 \
+        add     r3=CTX(F3),r32;                 \
+        ;;                                      \
+        stf.spill.nta   [r2]=f2,32;             \
+        stf.spill.nta   [r3]=f3,32;             \
+        ;;                                      \
+        stf.spill.nta   [r2]=f4,32;             \
+        stf.spill.nta   [r3]=f5,32;             \
+        ;;                                      \
+        stf.spill.nta   [r2]=f6,32;             \
+        stf.spill.nta   [r3]=f7,32;             \
+        ;;                                      \
+        stf.spill.nta   [r2]=f8,32;             \
+        stf.spill.nta   [r3]=f9,32;             \
+        ;;                                      \
+        stf.spill.nta   [r2]=f10,32;            \
+        stf.spill.nta   [r3]=f11,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f12,32;            \
+        stf.spill.nta   [r3]=f13,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f14,32;            \
+        stf.spill.nta   [r3]=f15,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f16,32;            \
+        stf.spill.nta   [r3]=f17,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f18,32;            \
+        stf.spill.nta   [r3]=f19,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f20,32;            \
+        stf.spill.nta   [r3]=f21,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f22,32;            \
+        stf.spill.nta   [r3]=f23,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f24,32;            \
+        stf.spill.nta   [r3]=f25,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f26,32;            \
+        stf.spill.nta   [r3]=f27,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f28,32;            \
+        stf.spill.nta   [r3]=f29,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f30;               \
+        stf.spill.nta   [r3]=f31;               \
+        ;;
+        /*
+         *      r32:            context_t base address
+         *
+         *      Spill floating-point registers f32-f127 into the context
+         *      with stf.spill.nta.  r2 starts at CTX(F32) and takes the
+         *      even-numbered registers, r3 starts at CTX(F33) and takes
+         *      the odd-numbered ones; each pointer advances 32 bytes per
+         *      store.  Clobbers r2 and r3.
+         */
+#define SAVE_FPU_HIGH                           \
+        add     r2=CTX(F32),r32;                \
+        add     r3=CTX(F33),r32;                \
+        ;;                                      \
+        stf.spill.nta   [r2]=f32,32;            \
+        stf.spill.nta   [r3]=f33,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f34,32;            \
+        stf.spill.nta   [r3]=f35,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f36,32;            \
+        stf.spill.nta   [r3]=f37,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f38,32;            \
+        stf.spill.nta   [r3]=f39,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f40,32;            \
+        stf.spill.nta   [r3]=f41,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f42,32;            \
+        stf.spill.nta   [r3]=f43,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f44,32;            \
+        stf.spill.nta   [r3]=f45,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f46,32;            \
+        stf.spill.nta   [r3]=f47,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f48,32;            \
+        stf.spill.nta   [r3]=f49,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f50,32;            \
+        stf.spill.nta   [r3]=f51,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f52,32;            \
+        stf.spill.nta   [r3]=f53,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f54,32;            \
+        stf.spill.nta   [r3]=f55,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f56,32;            \
+        stf.spill.nta   [r3]=f57,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f58,32;            \
+        stf.spill.nta   [r3]=f59,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f60,32;            \
+        stf.spill.nta   [r3]=f61,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f62,32;            \
+        stf.spill.nta   [r3]=f63,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f64,32;            \
+        stf.spill.nta   [r3]=f65,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f66,32;            \
+        stf.spill.nta   [r3]=f67,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f68,32;            \
+        stf.spill.nta   [r3]=f69,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f70,32;            \
+        stf.spill.nta   [r3]=f71,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f72,32;            \
+        stf.spill.nta   [r3]=f73,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f74,32;            \
+        stf.spill.nta   [r3]=f75,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f76,32;            \
+        stf.spill.nta   [r3]=f77,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f78,32;            \
+        stf.spill.nta   [r3]=f79,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f80,32;            \
+        stf.spill.nta   [r3]=f81,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f82,32;            \
+        stf.spill.nta   [r3]=f83,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f84,32;            \
+        stf.spill.nta   [r3]=f85,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f86,32;            \
+        stf.spill.nta   [r3]=f87,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f88,32;            \
+        stf.spill.nta   [r3]=f89,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f90,32;            \
+        stf.spill.nta   [r3]=f91,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f92,32;            \
+        stf.spill.nta   [r3]=f93,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f94,32;            \
+        stf.spill.nta   [r3]=f95,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f96,32;            \
+        stf.spill.nta   [r3]=f97,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f98,32;            \
+        stf.spill.nta   [r3]=f99,32;            \
+        ;;                                      \
+        stf.spill.nta   [r2]=f100,32;           \
+        stf.spill.nta   [r3]=f101,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f102,32;           \
+        stf.spill.nta   [r3]=f103,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f104,32;           \
+        stf.spill.nta   [r3]=f105,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f106,32;           \
+        stf.spill.nta   [r3]=f107,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f108,32;           \
+        stf.spill.nta   [r3]=f109,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f110,32;           \
+        stf.spill.nta   [r3]=f111,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f112,32;           \
+        stf.spill.nta   [r3]=f113,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f114,32;           \
+        stf.spill.nta   [r3]=f115,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f116,32;           \
+        stf.spill.nta   [r3]=f117,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f118,32;           \
+        stf.spill.nta   [r3]=f119,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f120,32;           \
+        stf.spill.nta   [r3]=f121,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f122,32;           \
+        stf.spill.nta   [r3]=f123,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f124,32;           \
+        stf.spill.nta   [r3]=f125,32;           \
+        ;;                                      \
+        stf.spill.nta   [r2]=f126;              \
+        stf.spill.nta   [r3]=f127;              \
+        ;;
+     /*
+      *      r33:    point to context_t structure
+      *
+      *      Fill floating-point registers f2-f31 from the context with
+      *      ldf.fill.nta.  r2 starts at CTX(F2) and loads the
+      *      even-numbered registers, r3 starts at CTX(F3) and loads the
+      *      odd-numbered ones; each pointer advances 32 bytes per load,
+      *      including after the final pair.  Clobbers r2 and r3.
+      */
+#define RESTORE_FPU_LOW                         \
+    add     r2 = CTX(F2), r33;                  \
+    add     r3 = CTX(F3), r33;                  \
+    ;;                                          \
+    ldf.fill.nta f2 = [r2], 32;                 \
+    ldf.fill.nta f3 = [r3], 32;                 \
+    ;;                                          \
+    ldf.fill.nta f4 = [r2], 32;                 \
+    ldf.fill.nta f5 = [r3], 32;                 \
+    ;;                                          \
+    ldf.fill.nta f6 = [r2], 32;                 \
+    ldf.fill.nta f7 = [r3], 32;                 \
+    ;;                                          \
+    ldf.fill.nta f8 = [r2], 32;                 \
+    ldf.fill.nta f9 = [r3], 32;                 \
+    ;;                                          \
+    ldf.fill.nta f10 = [r2], 32;                \
+    ldf.fill.nta f11 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f12 = [r2], 32;                \
+    ldf.fill.nta f13 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f14 = [r2], 32;                \
+    ldf.fill.nta f15 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f16 = [r2], 32;                \
+    ldf.fill.nta f17 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f18 = [r2], 32;                \
+    ldf.fill.nta f19 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f20 = [r2], 32;                \
+    ldf.fill.nta f21 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f22 = [r2], 32;                \
+    ldf.fill.nta f23 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f24 = [r2], 32;                \
+    ldf.fill.nta f25 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f26 = [r2], 32;                \
+    ldf.fill.nta f27 = [r3], 32;                \
+        ;;                                      \
+    ldf.fill.nta f28 = [r2], 32;                \
+    ldf.fill.nta f29 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f30 = [r2], 32;                \
+    ldf.fill.nta f31 = [r3], 32;                \
+    ;;
+    /*
+     * RESTORE_FPU_HIGH: reload f32-f127 from a saved context.
+     *
+     *      r33:    points to the context_t structure to load from
+     *
+     * r2 starts at CTX(F32) and is used for the even-numbered registers,
+     * r3 starts at CTX(F33) and is used for the odd-numbered registers;
+     * each pointer is post-incremented by 32 after every ldf.fill.
+     * Clobbers r2 and r3.
+     */
+#define RESTORE_FPU_HIGH                        \
+    add     r2 = CTX(F32), r33;                 \
+    add     r3 = CTX(F33), r33;                 \
+    ;;                                          \
+    ldf.fill.nta f32 = [r2], 32;                \
+    ldf.fill.nta f33 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f34 = [r2], 32;                \
+    ldf.fill.nta f35 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f36 = [r2], 32;                \
+    ldf.fill.nta f37 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f38 = [r2], 32;                \
+    ldf.fill.nta f39 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f40 = [r2], 32;                \
+    ldf.fill.nta f41 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f42 = [r2], 32;                \
+    ldf.fill.nta f43 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f44 = [r2], 32;                \
+    ldf.fill.nta f45 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f46 = [r2], 32;                \
+    ldf.fill.nta f47 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f48 = [r2], 32;                \
+    ldf.fill.nta f49 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f50 = [r2], 32;                \
+    ldf.fill.nta f51 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f52 = [r2], 32;                \
+    ldf.fill.nta f53 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f54 = [r2], 32;                \
+    ldf.fill.nta f55 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f56 = [r2], 32;                \
+    ldf.fill.nta f57 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f58 = [r2], 32;                \
+    ldf.fill.nta f59 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f60 = [r2], 32;                \
+    ldf.fill.nta f61 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f62 = [r2], 32;                \
+    ldf.fill.nta f63 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f64 = [r2], 32;                \
+    ldf.fill.nta f65 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f66 = [r2], 32;                \
+    ldf.fill.nta f67 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f68 = [r2], 32;                \
+    ldf.fill.nta f69 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f70 = [r2], 32;                \
+    ldf.fill.nta f71 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f72 = [r2], 32;                \
+    ldf.fill.nta f73 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f74 = [r2], 32;                \
+    ldf.fill.nta f75 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f76 = [r2], 32;                \
+    ldf.fill.nta f77 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f78 = [r2], 32;                \
+    ldf.fill.nta f79 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f80 = [r2], 32;                \
+    ldf.fill.nta f81 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f82 = [r2], 32;                \
+    ldf.fill.nta f83 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f84 = [r2], 32;                \
+    ldf.fill.nta f85 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f86 = [r2], 32;                \
+    ldf.fill.nta f87 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f88 = [r2], 32;                \
+    ldf.fill.nta f89 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f90 = [r2], 32;                \
+    ldf.fill.nta f91 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f92 = [r2], 32;                \
+    ldf.fill.nta f93 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f94 = [r2], 32;                \
+    ldf.fill.nta f95 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f96 = [r2], 32;                \
+    ldf.fill.nta f97 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f98 = [r2], 32;                \
+    ldf.fill.nta f99 = [r3], 32;                \
+    ;;                                          \
+    ldf.fill.nta f100 = [r2], 32;               \
+    ldf.fill.nta f101 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f102 = [r2], 32;               \
+    ldf.fill.nta f103 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f104 = [r2], 32;               \
+    ldf.fill.nta f105 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f106 = [r2], 32;               \
+    ldf.fill.nta f107 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f108 = [r2], 32;               \
+    ldf.fill.nta f109 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f110 = [r2], 32;               \
+    ldf.fill.nta f111 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f112 = [r2], 32;               \
+    ldf.fill.nta f113 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f114 = [r2], 32;               \
+    ldf.fill.nta f115 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f116 = [r2], 32;               \
+    ldf.fill.nta f117 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f118 = [r2], 32;               \
+    ldf.fill.nta f119 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f120 = [r2], 32;               \
+    ldf.fill.nta f121 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f122 = [r2], 32;               \
+    ldf.fill.nta f123 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f124 = [r2], 32;               \
+    ldf.fill.nta f125 = [r3], 32;               \
+    ;;                                          \
+    ldf.fill.nta f126 = [r2], 32;               \
+    ldf.fill.nta f127 = [r3], 32;               \
+    ;;
+        /*
+         * SAVE_PTK_REGS: store the protection key registers (pkr[])
+         * into a context.
+         *
+         *      r32:            context_t base address
+         *
+         * Reads pkr[r17] for r17 = 0, 1, ... and stores each value into
+         * consecutive 8-byte slots starting at CTX(PKR0).  ar.lc is
+         * preset to 7, so the br.cloop loop covers indices 0 through 7.
+         * Clobbers r2, r16, r17, r18 and ar.lc.
+         */
+#define SAVE_PTK_REGS                           \
+    add r2=CTX(PKR0), r32;                      \
+    mov r16=7;                                  \
+    ;;                                          \
+    mov ar.lc=r16;                              \
+    mov r17=r0;                                 \
+    ;;                                          \
+1:                                              \
+    mov r18=pkr[r17];                           \
+    ;;                                          \
+    srlz.i;                                     \
+    ;;                                          \
+    st8 [r2]=r18, 8;                            \
+    ;;                                          \
+    add r17 =1,r17;                             \
+    ;;                                          \
+    br.cloop.sptk 1b;                           \
+    ;;
+/*
+ * RESTORE_PTK_REGS: reload the protection key registers (pkr[])
+ * from a context.
+ *
+ *      r33:    points to the context_t structure to load from
+ *
+ * Loads consecutive 8-byte slots starting at CTX(PKR0) and writes them
+ * to pkr[r17] for r17 = 0, 1, ..., issuing srlz.i after every write.
+ * ar.lc is preset to 7, so the br.cloop loop covers indices 0 through 7.
+ * Clobbers r2, r16, r17, r18; ar.lc is corrupted.
+ */
+#define RESTORE_PTK_REGS                        \
+    add r2=CTX(PKR0), r33;                      \
+    mov r16=7;                                  \
+    ;;                                          \
+    mov ar.lc=r16;                              \
+    mov r17=r0;                                 \
+    ;;                                          \
+1:                                              \
+    ld8 r18=[r2], 8;                            \
+    ;;                                          \
+    mov pkr[r17]=r18;                           \
+    ;;                                          \
+    srlz.i;                                     \
+    ;;                                          \
+    add r17 =1,r17;                             \
+    ;;                                          \
+    br.cloop.sptk 1b;                           \
+    ;;
+/*
+ * void vmm_trampoline( context_t * from,
+ *                      context_t * to)
+ *
+ *      from:   r32     context the current register state is saved into
+ *      to:     r33     context the new register state is loaded from
+ *
+ *  Saves psr, pr, ar.unat, ar.rsc and the register groups handled by the
+ *  SAVE_* macros into *from, then loads the same groups from *to in the
+ *  reverse order and executes br.ret through b0.
+ *  NOTE(review): b0 is presumably reloaded by RESTORE_BRANCH_REGS, so the
+ *  return lands in the 'to' context -- confirm against that macro.
+ *
+ *  Note: interrupts must be disabled before calling this function.
+ */
+GLOBAL_ENTRY(vmm_trampoline)
+    // ---- save the outgoing state into *from (r32) ----
+    mov r16 = psr
+    adds r2 = CTX(PSR), r32
+    ;;
+    st8 [r2] = r16, 8       // psr
+    mov r17 = pr
+    ;;
+    st8 [r2] = r17, 8       // pr
+    mov r18 = ar.unat
+    ;;
+    st8 [r2] = r18          // ar.unat
+    mov r17 = ar.rsc
+    ;;
+    adds r2 = CTX(RSC),r32
+    ;;
+    st8 [r2]= r17           // ar.rsc
+    // ar.rsc is zeroed before flushrs; the value for the incoming
+    // context is loaded from CTX(RSC) at the end of this routine.
+    mov ar.rsc =0
+    flushrs
+    ;;
+    SAVE_GENERAL_REGS
+    ;;
+    SAVE_KERNEL_REGS
+    ;;
+    SAVE_APP_REGS
+    ;;
+    SAVE_BRANCH_REGS
+    ;;
+    SAVE_CTL_REGS
+    ;;
+    SAVE_REGION_REGS
+    ;;
+    // debug registers are currently not saved/restored
+    //SAVE_DEBUG_REGS
+    ;;
+    // clear psr.dfl before touching the low FP registers
+    rsm  psr.dfl
+    ;;
+    srlz.d
+    ;;
+    SAVE_FPU_LOW
+    ;;
+    // clear psr.dfh before touching the high FP registers
+    rsm  psr.dfh
+    ;;
+    srlz.d
+    ;;
+    SAVE_FPU_HIGH
+    ;;
+    SAVE_PTK_REGS
+    ;;
+    // ---- load the incoming state from *to (r33), reverse order ----
+    RESTORE_PTK_REGS
+    ;;
+    RESTORE_FPU_HIGH
+    ;;
+    RESTORE_FPU_LOW
+    ;;
+    //RESTORE_DEBUG_REGS
+    ;;
+    RESTORE_REGION_REGS
+    ;;
+    RESTORE_CTL_REGS
+    ;;
+    RESTORE_BRANCH_REGS
+    ;;
+    RESTORE_APP_REGS
+    ;;
+    RESTORE_KERNEL_REGS
+    ;;
+    RESTORE_GENERAL_REGS
+    ;;
+    // psr, pr and ar.unat are stored in three consecutive 8-byte slots
+    adds r2=CTX(PSR), r33
+    ;;
+    ld8 r16=[r2], 8       // psr
+    ;;
+    mov psr.l=r16
+    ;;
+    srlz.d
+    ;;
+    ld8 r16=[r2], 8       // pr
+    ;;
+    mov pr =r16,-1
+    ld8 r16=[r2]       // unat
+    ;;
+    mov ar.unat=r16
+    ;;
+    adds r2=CTX(RSC),r33
+    ;;
+    ld8 r16 =[r2]
+    ;;
+    mov ar.rsc = r16      // ar.rsc is restored last, just before returning
+    ;;
+    br.ret.sptk.few b0
+END(vmm_trampoline)
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
new file mode 100644
index 00000000000..e44027ce566
--- /dev/null
+++ b/arch/ia64/kvm/vcpu.c
@@ -0,0 +1,2163 @@
+/*
+ * kvm_vcpu.c: handles everything related to virtual CPUs.
+ * Copyright (c) 2005, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Shaofan Li (Susue Li) <susie.li@intel.com>
+ *  Yaozu Dong (Eddie Dong) (Eddie.dong@intel.com)
+ *  Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
+ *  Xiantao Zhang <xiantao.zhang@intel.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/ia64regs.h>
+#include <asm/gcc_intrin.h>
+#include <asm/kregs.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include "asm-offsets.h"
+#include "vcpu.h"
+/*
+ * Guest memory-mode transition table: mm_switch_table[old mode][new mode].
+ *
+ * Special notes:
+ * - Both dimensions are indexed by the (it,dt,rt) bits, in that sequence
+ * - Only existing mode transitions get an action in this table; every
+ *   entry that is not listed is 0
+ * - RSE is placed at lazy mode when emulating guest partial mode
+ * - If gva happens to be rr0 and rr4, only allowed case is identity
+ *   mapping (gva=gpa), or panic! (How?)
+ */
+int mm_switch_table[8][8] = {
+        /*
+         *  From (it,dt,rt) = (0,0,0):
+         *  -> (0,0,0)  switch to self (allowed since 2004/09/12, Kevin).
+         *  -> (0,1,1)  found when OSYa exits efi boot service. Due to
+         *              gva = gpa in this case (same region), data access
+         *              can be satisfied though itlb entry for physical
+         *              emulation is hit.
+         *  -> (1,1,1)  usually occurs in the very early stage of the
+         *              Linux boot up procedure. Another case is in efi
+         *              and pal calls. (see "arch/ia64/kernel/head.S")
+         */
+        [0] = { [0] = SW_SELF, [3] = SW_NOP, [7] = SW_P2V },
+        /*
+         *  From (0,1,1):
+         *  -> (0,0,0)  found in OSYa.
+         *  -> (0,1,1)  switch to self.
+         *  -> (1,1,1)  found in OSYa.
+         */
+        [3] = { [0] = SW_NOP, [3] = SW_SELF, [7] = SW_P2V },
+        /*  From (1,0,0): -> (1,1,1) */
+        [4] = { [7] = SW_P2V },
+        /*
+         *  From (1,0,1):
+         *  -> (1,0,1)  switch to self.
+         *  -> (1,1,1)  usually occurs when Linux returns from the low
+         *              level TLB miss handlers.
+         *              (see "arch/ia64/kernel/ivt.S")
+         */
+        [5] = { [5] = SW_SELF, [7] = SW_P2V },
+        /*
+         *  From (1,1,1):
+         *  -> (0,0,0)  usually occurs in pal and efi calls, which
+         *              require running in physical mode.
+         *              (see "arch/ia64/kernel/head.S")
+         *  -> (1,0,0)
+         *  -> (1,0,1)  usually occurs in the Linux low level TLB miss
+         *              handler. (see "arch/ia64/kernel/ivt.S")
+         *  -> (1,1,1)  switch to self.
+         */
+        [7] = { [0] = SW_V2P, [4] = SW_V2P, [5] = SW_V2P, [7] = SW_SELF },
+        /* Rows 1, 2 and 6 are not listed: all of their entries are 0. */
+};
+/*
+ * Reset the vcpu's mode flags so that GUEST_IN_PHY is the only flag set.
+ */
+void physical_mode_init(struct kvm_vcpu *vcpu)
+{
+        vcpu->arch.mode_flags = GUEST_IN_PHY;
+}
+/*
+ * Load region registers 0 and 4 with the vcpu's metaphysical values
+ * (arch.metaphysical_rr0 / arch.metaphysical_rr4).
+ *
+ * The update runs between ia64_clear_ic() and ia64_set_psr(), which puts
+ * back the psr value returned by ia64_clear_ic(); each region register
+ * write is followed by a data serialization.
+ */
+void switch_to_physical_rid(struct kvm_vcpu *vcpu)
+{
+        unsigned long saved_psr = ia64_clear_ic();
+        ia64_set_rr(VRN0<<VRN_SHIFT, vcpu->arch.metaphysical_rr0);
+        ia64_srlz_d();
+        ia64_set_rr(VRN4<<VRN_SHIFT, vcpu->arch.metaphysical_rr4);
+        ia64_srlz_d();
+        ia64_set_psr(saved_psr);
+}
+/*
+ * Load region registers 0 and 4 with the values kept in
+ * arch.metaphysical_saved_rr0 / arch.metaphysical_saved_rr4.
+ *
+ * The update runs between ia64_clear_ic() and ia64_set_psr(), which puts
+ * back the psr value returned by ia64_clear_ic(); each region register
+ * write is followed by a data serialization.
+ */
+void switch_to_virtual_rid(struct kvm_vcpu *vcpu)
+{
+        unsigned long saved_psr = ia64_clear_ic();
+        ia64_set_rr(VRN0 << VRN_SHIFT, vcpu->arch.metaphysical_saved_rr0);
+        ia64_srlz_d();
+        ia64_set_rr(VRN4 << VRN_SHIFT, vcpu->arch.metaphysical_saved_rr4);
+        ia64_srlz_d();
+        ia64_set_psr(saved_psr);
+}
+/*
+ * Look up the action for a transition from the mode described by opsr
+ * to the mode described by npsr in mm_switch_table.
+ */
+static int mm_switch_action(struct ia64_psr opsr, struct ia64_psr npsr)
+{
+        int from = MODE_IND(opsr);
+        int to = MODE_IND(npsr);
+        return mm_switch_table[from][to];
+}
+/*
+ * Carry out the region-register switch required when the guest moves
+ * from the mode in old_psr to the mode in new_psr.
+ *
+ *  SW_V2P: install the physical-mode rids and set GUEST_IN_PHY.
+ *  SW_P2V: install the virtual-mode rids and clear GUEST_IN_PHY.
+ *  SW_SELF, SW_NOP and any other table value: nothing to do.
+ */
+void switch_mm_mode(struct kvm_vcpu *vcpu, struct ia64_psr old_psr,
+                                        struct ia64_psr new_psr)
+{
+        int action = mm_switch_action(old_psr, new_psr);
+        if (action == SW_V2P) {
+                switch_to_physical_rid(vcpu);
+                /* record that the guest now runs in physical mode */
+                vcpu->arch.mode_flags |= GUEST_IN_PHY;
+        } else if (action == SW_P2V) {
+                switch_to_virtual_rid(vcpu);
+                /* the guest has left physical mode */
+                vcpu->arch.mode_flags &= ~GUEST_IN_PHY;
+        }
+}
+/*
+ * Call switch_mm_mode() when any of the psr.dt, psr.it or psr.rt bits
+ * differs between old_psr and new_psr; otherwise do nothing.
+ */
+void check_mm_mode_switch(struct kvm_vcpu *vcpu,  struct ia64_psr old_psr,
+                                        struct ia64_psr new_psr)
+{
+        if (old_psr.dt == new_psr.dt
+                        && old_psr.it == new_psr.it
+                        && old_psr.rt == new_psr.rt)
+                return;
+        switch_mm_mode(vcpu, old_psr, new_psr);
+}
+/*
+ * In physical mode, inserting a tc/tr for region 0 or 4 would use
+ * RID[0] and RID[4], which are set up for physical mode emulation.
+ * What the inserted tc/tr needs, however, is the rid for virtual mode,
+ * so the original virtual rid has to be restored before the insert.
+ *
+ * Operations which require such a switch include:
+ *  - insertions (itc.*, itr.*)
+ *  - purges (ptc.* and ptr.*)
+ *  - tpa
+ *  - tak
+ *  - thash?, ttag?
+ * All of the above need the actual virtual rid for the destination entry.
+ *
+ * When the vcpu is in physical mode this sets GUEST_PHY_EMUL and
+ * switches to the virtual rids; otherwise it does nothing.
+ */
+void prepare_if_physical_mode(struct kvm_vcpu *vcpu)
+{
+        if (!is_physical_mode(vcpu))
+                return;
+        vcpu->arch.mode_flags |= GUEST_PHY_EMUL;
+        switch_to_virtual_rid(vcpu);
+}
+/*
+ * Counterpart of prepare_if_physical_mode(); recover always follows
+ * prepare.  Switches back to the physical-mode rids when the vcpu is in
+ * physical mode and, in every case, clears GUEST_PHY_EMUL.
+ */
+void recover_if_physical_mode(struct kvm_vcpu *vcpu)
+{
+        if (is_physical_mode(vcpu)) {
+                switch_to_physical_rid(vcpu);
+        }
+        vcpu->arch.mode_flags &= ~GUEST_PHY_EMUL;
+}
+/* Byte offset of member x inside struct kvm_pt_regs, narrowed to u16. */
+#define RPT(x)  ((u16) &((struct kvm_pt_regs *)0)->x)
+/*
+ * gr_info[n] is the offset of static general register rn inside
+ * struct kvm_pt_regs.  Entry 0 is a placeholder: r0 is read-only and
+ * must never be looked up through this table.
+ */
+static u16 gr_info[32] = {
+        /* r0  */ 0,
+        /* r1  */ RPT(r1),  RPT(r2),  RPT(r3),
+        /* r4  */ RPT(r4),  RPT(r5),  RPT(r6),  RPT(r7),
+        /* r8  */ RPT(r8),  RPT(r9),  RPT(r10), RPT(r11),
+        /* r12 */ RPT(r12), RPT(r13), RPT(r14), RPT(r15),
+        /* r16 */ RPT(r16), RPT(r17), RPT(r18), RPT(r19),
+        /* r20 */ RPT(r20), RPT(r21), RPT(r22), RPT(r23),
+        /* r24 */ RPT(r24), RPT(r25), RPT(r26), RPT(r27),
+        /* r28 */ RPT(r28), RPT(r29), RPT(r30), RPT(r31)
+};
+#define IA64_FIRST_STACKED_GR   32
+#define IA64_FIRST_ROTATING_FR  32
+/*
+ * Apply a rotating register base to a register index: returns
+ * reg + rrb, reduced by sor once when the sum reaches or exceeds sor.
+ */
+static inline unsigned long
+rotate_reg(unsigned long sor, unsigned long rrb, unsigned long reg)
+{
+        unsigned long rotated = reg + rrb;
+        return (rotated >= sor) ? rotated - sor : rotated;
+}
+/*
+ * Return the rotated index of floating point register REGNUM.
+ * REGNUM must be in the range 32-127; the rotation base is taken from
+ * bits 25-31 of regs->cr_ifs and the rotating region holds 96 registers.
+ */
+static inline unsigned long fph_index(struct kvm_pt_regs *regs,
+                                                long regnum)
+{
+        unsigned long base = (regs->cr_ifs >> 25) & 0x7f;
+        unsigned long rel = regnum - IA64_FIRST_ROTATING_FR;
+        return rotate_reg(96, base, rel);
+}
+/*
+ * Given a backing store address and a (possibly negative) number of
+ * registers, return the address that lies num_regs registers away.
+ * One extra slot is added (or removed) for every 0x3f register slots
+ * crossed.
+ *
+ * NOTE(review): the quotient is accumulated by repeated subtraction
+ * rather than with '/'; presumably this avoids needing a division
+ * helper in this code -- confirm before replacing it with a division.
+ */
+static inline unsigned long *kvm_rse_skip_regs(unsigned long *addr,
+                                                        long num_regs)
+{
+        long slots = ia64_rse_slot_num(addr) + num_regs;
+        int extra = 0;
+        if (num_regs < 0)
+                slots -= 0x3e;
+        /* at most one of the two loops below runs */
+        for (; slots <= -0x3f; slots += 0x3f)
+                extra--;
+        for (; slots >= 0x3f; slots -= 0x3f)
+                extra++;
+        return addr + num_regs + extra;
+}
+/*
+ * Read stacked general register r1 (r32 and above) of the frame
+ * described by regs->cr_ifs from the register backing store.
+ *
+ * @regs: saved state; cr_ifs supplies sof, sor and rrb.gr, and
+ *        regs->loadrs >> 19 is used as the slot offset of bsp from the
+ *        backing store base (current_vcpu + VMM_RBS_OFFSET)
+ * @r1:   register number; rotation is applied when it falls inside the
+ *        rotating region
+ * @val:  receives the register value
+ * @nat:  when non-NULL, receives the register's NaT bit (0 or 1)
+ *
+ * AR.RSC has its two low (mode) bits cleared while the backing store is
+ * inspected and is put back to its entry value before returning.  The
+ * original code restored it only when @nat was non-NULL, which left RSC
+ * modified for callers that pass a NULL @nat.
+ */
+static void get_rse_reg(struct kvm_pt_regs *regs, unsigned long r1,
+                                        unsigned long *val, int *nat)
+{
+        unsigned long *bsp, *addr, *rnat_addr, *bspstore;
+        unsigned long *kbs = (void *) current_vcpu + VMM_RBS_OFFSET;
+        unsigned long nat_mask;
+        unsigned long old_rsc, new_rsc;
+        long sof = (regs->cr_ifs) & 0x7f;
+        long sor = (((regs->cr_ifs >> 14) & 0xf) << 3);
+        long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
+        long ridx = r1 - 32;
+        if (ridx < sor)
+                ridx = rotate_reg(sor, rrb_gr, ridx);
+        old_rsc = ia64_getreg(_IA64_REG_AR_RSC);
+        new_rsc = old_rsc & (~(0x3));
+        ia64_setreg(_IA64_REG_AR_RSC, new_rsc);
+        bspstore = (unsigned long *)ia64_getreg(_IA64_REG_AR_BSPSTORE);
+        bsp = kbs + (regs->loadrs >> 19);
+        addr = kvm_rse_skip_regs(bsp, -sof + ridx);
+        nat_mask = 1UL << ia64_rse_slot_num(addr);
+        rnat_addr = ia64_rse_rnat_addr(addr);
+        if (addr >= bspstore) {
+                /* slot not yet written to memory: flush, then re-read */
+                ia64_flushrs();
+                ia64_mf();
+                bspstore = (unsigned long *)ia64_getreg(_IA64_REG_AR_BSPSTORE);
+        }
+        *val = *addr;
+        if (nat) {
+                /* NaT bit comes from AR.RNAT or from the stored collection */
+                if (bspstore < rnat_addr)
+                        *nat = (int)!!(ia64_getreg(_IA64_REG_AR_RNAT)
+                                                        & nat_mask);
+                else
+                        *nat = (int)!!((*rnat_addr) & nat_mask);
+        }
+        /* always undo the RSC change, after any AR.RNAT read above */
+        ia64_setreg(_IA64_REG_AR_RSC, old_rsc);
+}
+/*
+ * Write val into stacked general register r1 (r32 and above) of the
+ * frame described by regs->cr_ifs, by storing into the register backing
+ * store.
+ *
+ * The target slot is located from the backing store base
+ * (current_vcpu + VMM_RBS_OFFSET), regs->loadrs and the frame fields of
+ * cr_ifs.  AR.RSC is modified for the duration of the update and put
+ * back to its entry value at the end; interrupts are disabled around
+ * the backing store / RNAT manipulation.
+ *
+ * NOTE(review): the nat argument is never read in this function; the
+ * NaT bit of the written register is always cleared.
+ */
+void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1,
+                                unsigned long val, unsigned long nat)
+{
+        unsigned long *bsp, *bspstore, *addr, *rnat_addr;
+        unsigned long *kbs = (void *) current_vcpu + VMM_RBS_OFFSET;
+        unsigned long nat_mask;
+        unsigned long old_rsc, new_rsc, psr;
+        unsigned long rnat;
+        long sof = (regs->cr_ifs) & 0x7f;
+        long sor = (((regs->cr_ifs >> 14) & 0xf) << 3);
+        long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
+        long ridx = r1 - 32;
+        /* apply register rotation inside the rotating region */
+        if (ridx < sor)
+                ridx = rotate_reg(sor, rrb_gr, ridx);
+        old_rsc = ia64_getreg(_IA64_REG_AR_RSC);
+        /* put RSC to lazy mode, and set loadrs 0 */
+        new_rsc = old_rsc & (~0x3fff0003);
+        ia64_setreg(_IA64_REG_AR_RSC, new_rsc);
+        bsp = kbs + (regs->loadrs >> 19); /* 16 + 3 */
+        addr = kvm_rse_skip_regs(bsp, -sof + ridx);
+        nat_mask = 1UL << ia64_rse_slot_num(addr);
+        rnat_addr = ia64_rse_rnat_addr(addr);
+        local_irq_save(psr);
+        bspstore = (unsigned long *)ia64_getreg(_IA64_REG_AR_BSPSTORE);
+        if (addr >= bspstore) {
+                /*
+                 * Slot is at or above bspstore: flush the stacked
+                 * registers first, patch the slot and its NaT bit, then
+                 * issue loadrs.
+                 */
+                ia64_flushrs();
+                ia64_mf();
+                *addr = val;
+                bspstore = (unsigned long *)ia64_getreg(_IA64_REG_AR_BSPSTORE);
+                rnat = ia64_getreg(_IA64_REG_AR_RNAT);
+                /* clear the NaT bit in AR.RNAT's copy or in memory */
+                if (bspstore < rnat_addr)
+                        rnat = rnat & (~nat_mask);
+                else
+                        *rnat_addr = (*rnat_addr)&(~nat_mask);
+                ia64_mf();
+                ia64_loadrs();
+                ia64_setreg(_IA64_REG_AR_RNAT, rnat);
+        } else {
+                /* Slot is below bspstore: patch it in place. */
+                rnat = ia64_getreg(_IA64_REG_AR_RNAT);
+                *addr = val;
+                /* clear the NaT bit in AR.RNAT's copy or in memory */
+                if (bspstore < rnat_addr)
+                        rnat = rnat&(~nat_mask);
+                else
+                        *rnat_addr = (*rnat_addr) & (~nat_mask);
+                /* bspstore is written back with the value read above */
+                ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore);
+                ia64_setreg(_IA64_REG_AR_RNAT, rnat);
+        }
+        local_irq_restore(psr);
+        ia64_setreg(_IA64_REG_AR_RSC, old_rsc);
+}
+/*
+ * Read general register regnum.
+ *
+ * Stacked registers (regnum >= IA64_FIRST_STACKED_GR) are fetched by
+ * get_rse_reg().  Registers in the [0-31] range are read from regs at
+ * the offset given by gr_info[]; their NaT bit is the bit of
+ * regs->eml_unat selected by bits 8:3 of the slot address.
+ *
+ * @val: receives the register value
+ * @nat: when non-NULL, receives the NaT bit
+ */
+void getreg(unsigned long regnum, unsigned long *val,
+                                int *nat, struct kvm_pt_regs *regs)
+{
+        unsigned long slot;
+        if (regnum >= IA64_FIRST_STACKED_GR) {
+                get_rse_reg(regs, regnum, val, nat);
+                return;
+        }
+        slot = (unsigned long)regs + gr_info[regnum];
+        *val = *(unsigned long *)slot;
+        /* the NaT bit is looked up only when requested */
+        if (!nat)
+                return;
+        *nat = (regs->eml_unat >> ((slot >> 3) & 0x3f)) & 0x1UL;
+}
+/*
+ * Write val into general register regnum.
+ *
+ * Stacked registers (regnum >= IA64_FIRST_STACKED_GR) are handed to
+ * set_rse_reg().  Registers in the [0-31] range are written into regs
+ * at the offset given by gr_info[], and the matching bit of
+ * regs->eml_unat is set when nat is non-zero and cleared otherwise.
+ */
+void setreg(unsigned long regnum, unsigned long val,
+                        int nat, struct kvm_pt_regs *regs)
+{
+        unsigned long slot;
+        unsigned long nat_bit;
+        if (regnum >= IA64_FIRST_STACKED_GR) {
+                set_rse_reg(regs, regnum, val, nat);
+                return;
+        }
+        /* base of the struct plus the register's offset */
+        slot = (unsigned long)regs + gr_info[regnum];
+        *(unsigned long *)slot = val;
+        /* UNAT bit_pos = GR[r3]{8:3} from EAS-2.4 */
+        nat_bit = 1UL << ((slot >> 3) & 0x3f);
+        if (nat)
+                regs->eml_unat |= nat_bit;
+        else
+                regs->eml_unat &= ~nat_bit;
+}
+/*
+ * Return the value of guest general register reg.  Register 0 always
+ * reads as 0; the NaT bit is not fetched.
+ */
+u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        u64 result;
+        if (reg == 0)
+                return 0;
+        getreg(reg, &result, NULL, regs);
+        return result;
+}
+/*
+ * Write value (and NaT state nat) into guest general register reg.
+ * Writes to register 0, and to registers at or beyond sof + 32 (sof
+ * taken from the low 7 bits of cr_ifs), are silently ignored.
+ */
+void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 value, int nat)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        long sof = (regs->cr_ifs) & 0x7f;
+        if (!reg || reg >= sof + 32)
+                return;
+        setreg(reg, value, nat, regs);  /* FIXME: handle NATs later*/
+}
+/*
+ * Spill floating point register regnum into *fpval.
+ *
+ * For regnum >= IA64_FIRST_ROTATING_FR the number is first translated
+ * through fph_index(), which applies the rotation base found in
+ * regs->cr_ifs.  The register is then stored with ia64_stf_spill().
+ * Every case hands a literal register number to ia64_stf_spill(), which
+ * is why all 128 registers are listed explicitly.
+ *
+ * There is no default case: a regnum outside 0-127 stores nothing.
+ */
+void getfpreg(unsigned long regnum, struct ia64_fpreg *fpval,
+                                struct kvm_pt_regs *regs)
+{
+        /* Take floating register rotation into consideration*/
+        if (regnum >= IA64_FIRST_ROTATING_FR)
+                regnum = IA64_FIRST_ROTATING_FR + fph_index(regs, regnum);
+/* One switch case: spill register 'reg' to fpval, then leave the switch. */
+#define CASE_FIXED_FP(reg)                      \
+        case  (reg) :                           \
+                ia64_stf_spill(fpval, reg);     \
+        break
+        switch (regnum) {
+                CASE_FIXED_FP(0);
+                CASE_FIXED_FP(1);
+                CASE_FIXED_FP(2);
+                CASE_FIXED_FP(3);
+                CASE_FIXED_FP(4);
+                CASE_FIXED_FP(5);
+                CASE_FIXED_FP(6);
+                CASE_FIXED_FP(7);
+                CASE_FIXED_FP(8);
+                CASE_FIXED_FP(9);
+                CASE_FIXED_FP(10);
+                CASE_FIXED_FP(11);
+                CASE_FIXED_FP(12);
+                CASE_FIXED_FP(13);
+                CASE_FIXED_FP(14);
+                CASE_FIXED_FP(15);
+                CASE_FIXED_FP(16);
+                CASE_FIXED_FP(17);
+                CASE_FIXED_FP(18);
+                CASE_FIXED_FP(19);
+                CASE_FIXED_FP(20);
+                CASE_FIXED_FP(21);
+                CASE_FIXED_FP(22);
+                CASE_FIXED_FP(23);
+                CASE_FIXED_FP(24);
+                CASE_FIXED_FP(25);
+                CASE_FIXED_FP(26);
+                CASE_FIXED_FP(27);
+                CASE_FIXED_FP(28);
+                CASE_FIXED_FP(29);
+                CASE_FIXED_FP(30);
+                CASE_FIXED_FP(31);
+                CASE_FIXED_FP(32);
+                CASE_FIXED_FP(33);
+                CASE_FIXED_FP(34);
+                CASE_FIXED_FP(35);
+                CASE_FIXED_FP(36);
+                CASE_FIXED_FP(37);
+                CASE_FIXED_FP(38);
+                CASE_FIXED_FP(39);
+                CASE_FIXED_FP(40);
+                CASE_FIXED_FP(41);
+                CASE_FIXED_FP(42);
+                CASE_FIXED_FP(43);
+                CASE_FIXED_FP(44);
+                CASE_FIXED_FP(45);
+                CASE_FIXED_FP(46);
+                CASE_FIXED_FP(47);
+                CASE_FIXED_FP(48);
+                CASE_FIXED_FP(49);
+                CASE_FIXED_FP(50);
+                CASE_FIXED_FP(51);
+                CASE_FIXED_FP(52);
+                CASE_FIXED_FP(53);
+                CASE_FIXED_FP(54);
+                CASE_FIXED_FP(55);
+                CASE_FIXED_FP(56);
+                CASE_FIXED_FP(57);
+                CASE_FIXED_FP(58);
+                CASE_FIXED_FP(59);
+                CASE_FIXED_FP(60);
+                CASE_FIXED_FP(61);
+                CASE_FIXED_FP(62);
+                CASE_FIXED_FP(63);
+                CASE_FIXED_FP(64);
+                CASE_FIXED_FP(65);
+                CASE_FIXED_FP(66);
+                CASE_FIXED_FP(67);
+                CASE_FIXED_FP(68);
+                CASE_FIXED_FP(69);
+                CASE_FIXED_FP(70);
+                CASE_FIXED_FP(71);
+                CASE_FIXED_FP(72);
+                CASE_FIXED_FP(73);
+                CASE_FIXED_FP(74);
+                CASE_FIXED_FP(75);
+                CASE_FIXED_FP(76);
+                CASE_FIXED_FP(77);
+                CASE_FIXED_FP(78);
+                CASE_FIXED_FP(79);
+                CASE_FIXED_FP(80);
+                CASE_FIXED_FP(81);
+                CASE_FIXED_FP(82);
+                CASE_FIXED_FP(83);
+                CASE_FIXED_FP(84);
+                CASE_FIXED_FP(85);
+                CASE_FIXED_FP(86);
+                CASE_FIXED_FP(87);
+                CASE_FIXED_FP(88);
+                CASE_FIXED_FP(89);
+                CASE_FIXED_FP(90);
+                CASE_FIXED_FP(91);
+                CASE_FIXED_FP(92);
+                CASE_FIXED_FP(93);
+                CASE_FIXED_FP(94);
+                CASE_FIXED_FP(95);
+                CASE_FIXED_FP(96);
+                CASE_FIXED_FP(97);
+                CASE_FIXED_FP(98);
+                CASE_FIXED_FP(99);
+                CASE_FIXED_FP(100);
+                CASE_FIXED_FP(101);
+                CASE_FIXED_FP(102);
+                CASE_FIXED_FP(103);
+                CASE_FIXED_FP(104);
+                CASE_FIXED_FP(105);
+                CASE_FIXED_FP(106);
+                CASE_FIXED_FP(107);
+                CASE_FIXED_FP(108);
+                CASE_FIXED_FP(109);
+                CASE_FIXED_FP(110);
+                CASE_FIXED_FP(111);
+                CASE_FIXED_FP(112);
+                CASE_FIXED_FP(113);
+                CASE_FIXED_FP(114);
+                CASE_FIXED_FP(115);
+                CASE_FIXED_FP(116);
+                CASE_FIXED_FP(117);
+                CASE_FIXED_FP(118);
+                CASE_FIXED_FP(119);
+                CASE_FIXED_FP(120);
+                CASE_FIXED_FP(121);
+                CASE_FIXED_FP(122);
+                CASE_FIXED_FP(123);
+                CASE_FIXED_FP(124);
+                CASE_FIXED_FP(125);
+                CASE_FIXED_FP(126);
+                CASE_FIXED_FP(127);
+        }
+#undef CASE_FIXED_FP
+}
+/*
+ * Load the value stored at @fpval into floating-point register @regnum
+ * using ia64_ldf_fill().
+ *
+ * @regnum: register number; numbers at or above IA64_FIRST_ROTATING_FR are
+ *          first remapped through fph_index() to account for rotation.
+ * @fpval:  source value.
+ * @regs:   guest register frame, passed to fph_index().
+ *
+ * Only registers 2..127 have a case; any other number is silently ignored.
+ */
+void setfpreg(unsigned long regnum, struct ia64_fpreg *fpval,
+                                        struct kvm_pt_regs *regs)
+{
+        /* Take floating register rotation into consideration*/
+        if (regnum >= IA64_FIRST_ROTATING_FR)
+                regnum = IA64_FIRST_ROTATING_FR + fph_index(regs, regnum);
+        /* The register operand must be a literal, hence one case per register. */
+#define CASE_FIXED_FP(reg)                      \
+        case (reg) :                            \
+                ia64_ldf_fill(reg, fpval);      \
+        break
+        switch (regnum) {
+                CASE_FIXED_FP(2);
+                CASE_FIXED_FP(3);
+                CASE_FIXED_FP(4);
+                CASE_FIXED_FP(5);
+                CASE_FIXED_FP(6);
+                CASE_FIXED_FP(7);
+                CASE_FIXED_FP(8);
+                CASE_FIXED_FP(9);
+                CASE_FIXED_FP(10);
+                CASE_FIXED_FP(11);
+                CASE_FIXED_FP(12);
+                CASE_FIXED_FP(13);
+                CASE_FIXED_FP(14);
+                CASE_FIXED_FP(15);
+                CASE_FIXED_FP(16);
+                CASE_FIXED_FP(17);
+                CASE_FIXED_FP(18);
+                CASE_FIXED_FP(19);
+                CASE_FIXED_FP(20);
+                CASE_FIXED_FP(21);
+                CASE_FIXED_FP(22);
+                CASE_FIXED_FP(23);
+                CASE_FIXED_FP(24);
+                CASE_FIXED_FP(25);
+                CASE_FIXED_FP(26);
+                CASE_FIXED_FP(27);
+                CASE_FIXED_FP(28);
+                CASE_FIXED_FP(29);
+                CASE_FIXED_FP(30);
+                CASE_FIXED_FP(31);
+                CASE_FIXED_FP(32);
+                CASE_FIXED_FP(33);
+                CASE_FIXED_FP(34);
+                CASE_FIXED_FP(35);
+                CASE_FIXED_FP(36);
+                CASE_FIXED_FP(37);
+                CASE_FIXED_FP(38);
+                CASE_FIXED_FP(39);
+                CASE_FIXED_FP(40);
+                CASE_FIXED_FP(41);
+                CASE_FIXED_FP(42);
+                CASE_FIXED_FP(43);
+                CASE_FIXED_FP(44);
+                CASE_FIXED_FP(45);
+                CASE_FIXED_FP(46);
+                CASE_FIXED_FP(47);
+                CASE_FIXED_FP(48);
+                CASE_FIXED_FP(49);
+                CASE_FIXED_FP(50);
+                CASE_FIXED_FP(51);
+                CASE_FIXED_FP(52);
+                CASE_FIXED_FP(53);
+                CASE_FIXED_FP(54);
+                CASE_FIXED_FP(55);
+                CASE_FIXED_FP(56);
+                CASE_FIXED_FP(57);
+                CASE_FIXED_FP(58);
+                CASE_FIXED_FP(59);
+                CASE_FIXED_FP(60);
+                CASE_FIXED_FP(61);
+                CASE_FIXED_FP(62);
+                CASE_FIXED_FP(63);
+                CASE_FIXED_FP(64);
+                CASE_FIXED_FP(65);
+                CASE_FIXED_FP(66);
+                CASE_FIXED_FP(67);
+                CASE_FIXED_FP(68);
+                CASE_FIXED_FP(69);
+                CASE_FIXED_FP(70);
+                CASE_FIXED_FP(71);
+                CASE_FIXED_FP(72);
+                CASE_FIXED_FP(73);
+                CASE_FIXED_FP(74);
+                CASE_FIXED_FP(75);
+                CASE_FIXED_FP(76);
+                CASE_FIXED_FP(77);
+                CASE_FIXED_FP(78);
+                CASE_FIXED_FP(79);
+                CASE_FIXED_FP(80);
+                CASE_FIXED_FP(81);
+                CASE_FIXED_FP(82);
+                CASE_FIXED_FP(83);
+                CASE_FIXED_FP(84);
+                CASE_FIXED_FP(85);
+                CASE_FIXED_FP(86);
+                CASE_FIXED_FP(87);
+                CASE_FIXED_FP(88);
+                CASE_FIXED_FP(89);
+                CASE_FIXED_FP(90);
+                CASE_FIXED_FP(91);
+                CASE_FIXED_FP(92);
+                CASE_FIXED_FP(93);
+                CASE_FIXED_FP(94);
+                CASE_FIXED_FP(95);
+                CASE_FIXED_FP(96);
+                CASE_FIXED_FP(97);
+                CASE_FIXED_FP(98);
+                CASE_FIXED_FP(99);
+                CASE_FIXED_FP(100);
+                CASE_FIXED_FP(101);
+                CASE_FIXED_FP(102);
+                CASE_FIXED_FP(103);
+                CASE_FIXED_FP(104);
+                CASE_FIXED_FP(105);
+                CASE_FIXED_FP(106);
+                CASE_FIXED_FP(107);
+                CASE_FIXED_FP(108);
+                CASE_FIXED_FP(109);
+                CASE_FIXED_FP(110);
+                CASE_FIXED_FP(111);
+                CASE_FIXED_FP(112);
+                CASE_FIXED_FP(113);
+                CASE_FIXED_FP(114);
+                CASE_FIXED_FP(115);
+                CASE_FIXED_FP(116);
+                CASE_FIXED_FP(117);
+                CASE_FIXED_FP(118);
+                CASE_FIXED_FP(119);
+                CASE_FIXED_FP(120);
+                CASE_FIXED_FP(121);
+                CASE_FIXED_FP(122);
+                CASE_FIXED_FP(123);
+                CASE_FIXED_FP(124);
+                CASE_FIXED_FP(125);
+                CASE_FIXED_FP(126);
+                CASE_FIXED_FP(127);
+        default:
+                /* No case for this register number: nothing is written. */
+                break;
+        }
+        /* Keep the helper macro from leaking into the rest of the file. */
+#undef CASE_FIXED_FP
+}
+/*
+ * Fetch guest floating-point register @reg into *@val by delegating to
+ * getfpreg() with this VCPU's register frame.
+ */
+void vcpu_get_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
+                                                struct ia64_fpreg *val)
+{
+        struct kvm_pt_regs *frame = vcpu_regs(vcpu);
+
+        /* FIXME: handle NATs later */
+        getfpreg(reg, val, frame);
+}
+/*
+ * Store *@val into guest floating-point register @reg via setfpreg().
+ * Requests for registers 0 and 1 are ignored.
+ */
+void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
+                                                struct ia64_fpreg *val)
+{
+        struct kvm_pt_regs *frame = vcpu_regs(vcpu);
+
+        if (reg <= 1)
+                return;
+        /* FIXME: handle NATs later */
+        setfpreg(reg, val, frame);
+}
+/************************************************************************
+ * lsapic timer
+ ***********************************************************************/
+/*
+ * Return the guest's view of the interval time counter: the host AR.ITC
+ * plus the per-VCPU itc_offset.  The value never goes backwards: if the
+ * computed value is below the last value handed out, the last value is
+ * returned again; otherwise the new value is recorded in last_itc.
+ */
+u64 vcpu_get_itc(struct kvm_vcpu *vcpu)
+{
+        unsigned long now;
+
+        now = VMX(vcpu, itc_offset) + ia64_getreg(_IA64_REG_AR_ITC);
+        if (now < VMX(vcpu, last_itc))
+                return VMX(vcpu, last_itc);
+        VMX(vcpu, last_itc) = now;
+        return now;
+}
+static inline void vcpu_set_itm(struct kvm_vcpu *vcpu, u64 val);
+/*
+ * Emulate a guest write of @val to the interval time counter.
+ *
+ * The offset between @val and the host AR.ITC is computed; when the caller
+ * is VCPU 0 it is stored into every one of the MAX_VCPU_NUM VCPU slots
+ * (located at VCPU_SIZE strides from @vcpu) and their last_itc is reset.
+ * Afterwards the timer state of @vcpu is re-evaluated against its ITM.
+ */
+static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
+{
+        long delta = val - ia64_getreg(_IA64_REG_AR_ITC);
+        unsigned long timer_itv = VCPU(vcpu, itv);
+        struct kvm_vcpu *peer;
+        int n;
+
+        if (vcpu->vcpu_id == 0) {
+                for (n = 0; n < MAX_VCPU_NUM; n++) {
+                        peer = (struct kvm_vcpu *)((char *)vcpu +
+                                                        VCPU_SIZE * n);
+                        VMX(peer, itc_offset) = delta;
+                        VMX(peer, last_itc) = 0;
+                }
+        }
+        VMX(vcpu, last_itc) = 0;
+        if (VCPU(vcpu, itm) > val) {
+                /* Match value still ahead of the counter: re-arm. */
+                VMX(vcpu, itc_check) = 1;
+                vcpu_set_itm(vcpu, VCPU(vcpu, itm));
+                return;
+        }
+        VMX(vcpu, itc_check) = 0;
+        vcpu_unpend_interrupt(vcpu, timer_itv);
+}
+/* Return the guest's interval timer match value. */
+static inline u64 vcpu_get_itm(struct kvm_vcpu *vcpu)
+{
+        return (u64)VCPU(vcpu, itm);
+}
+/*
+ * Emulate a guest write of @val to the interval timer match register.
+ * If @val lies ahead of the current guest ITC the timer check is armed,
+ * the vector taken from ITV is un-pended and timer_pending is cleared;
+ * otherwise the timer check is disarmed.
+ */
+static inline void vcpu_set_itm(struct kvm_vcpu *vcpu, u64 val)
+{
+        unsigned long timer_itv = VCPU(vcpu, itv);
+
+        VCPU(vcpu, itm) = val;
+        if (val <= vcpu_get_itc(vcpu)) {
+                VMX(vcpu, itc_check) = 0;
+                return;
+        }
+        VMX(vcpu, itc_check) = 1;
+        vcpu_unpend_interrupt(vcpu, timer_itv);
+        VMX(vcpu, timer_pending) = 0;
+}
+/*
+ * Field extractors for an ITV value: the low 8 bits hold the vector and
+ * bit 16 is tested as the mask flag.  The argument is parenthesized so
+ * that expressions such as ITV_VECTOR(a | b) expand correctly.
+ */
+#define  ITV_VECTOR(itv)    ((itv) & 0xff)
+#define  ITV_IRQ_MASK(itv)  ((itv) & (1 << 16))
+/*
+ * Emulate a guest write of @val to ITV.  If the new value is unmasked and
+ * a timer tick was recorded as pending, deliver it now on the new vector
+ * and clear the pending flag.
+ */
+static inline void vcpu_set_itv(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, itv) = val;
+        if (ITV_IRQ_MASK(val) || !vcpu->arch.timer_pending)
+                return;
+        vcpu_pend_interrupt(vcpu, ITV_VECTOR(val));
+        vcpu->arch.timer_pending = 0;
+}
+/*
+ * Emulate a guest end-of-interrupt write.  The highest in-service vector,
+ * if any, has its in-service bit cleared, the virtual EOI register is
+ * zeroed and irq_new_pending is raised.  @val is not used.
+ */
+static inline void vcpu_set_eoi(struct kvm_vcpu *vcpu, u64 val)
+{
+        int vec = highest_inservice_irq(vcpu);
+
+        if (vec != NULL_VECTOR) {
+                /* 64 vectors per in-service word. */
+                VMX(vcpu, insvc[vec >> 6]) &= ~(1UL << (vec & 63));
+                VCPU(vcpu, eoi) = 0;
+                vcpu->arch.irq_new_pending = 1;
+        }
+}
+/*
+ * Decide whether the highest pending vector may be delivered.
+ * See Table 5-8 in SDM vol2 for the definition.
+ *
+ * @h_pending:   highest pending vector.
+ * @h_inservice: highest in-service vector.
+ *
+ * Returns IRQ_NO_MASKED, IRQ_MASKED_BY_INSVC or IRQ_MASKED_BY_VTPR.
+ */
+int irq_masked(struct kvm_vcpu *vcpu, int h_pending, int h_inservice)
+{
+        union ia64_tpr cur_tpr;
+
+        cur_tpr.val = VCPU(vcpu, tpr);
+
+        /* NMI: an in-service NMI blocks everything, a pending one wins. */
+        if (h_inservice == NMI_VECTOR)
+                return IRQ_MASKED_BY_INSVC;
+        if (h_pending == NMI_VECTOR)
+                return IRQ_NO_MASKED;
+
+        /* ExtINT: blocked while in service, otherwise gated by tpr.mmi. */
+        if (h_inservice == ExtINT_VECTOR)
+                return IRQ_MASKED_BY_INSVC;
+        if (h_pending == ExtINT_VECTOR)
+                return cur_tpr.mmi ? IRQ_MASKED_BY_VTPR : IRQ_NO_MASKED;
+
+        /* Ordinary vectors: compare against in-service, then the TPR class. */
+        if (is_higher_irq(h_pending, h_inservice)) {
+                if (is_higher_class(h_pending,
+                                        cur_tpr.mic + (cur_tpr.mmi << 4)))
+                        return IRQ_NO_MASKED;
+                return IRQ_MASKED_BY_VTPR;
+        }
+        return IRQ_MASKED_BY_INSVC;
+}
+/*
+ * Mark vector @vec as pending in the guest's IRR and flag that new
+ * interrupt state needs to be evaluated.  The bit is set with local
+ * interrupts disabled.
+ */
+void vcpu_pend_interrupt(struct kvm_vcpu *vcpu, u8 vec)
+{
+        unsigned long spsr;     /* saved interrupt state */
+
+        local_irq_save(spsr);
+        /* The previous bit value is not needed here. */
+        test_and_set_bit(vec, &VCPU(vcpu, irr[0]));
+        local_irq_restore(spsr);
+        vcpu->arch.irq_new_pending = 1;
+}
+/*
+ * Clear vector @vec from the guest's IRR.  Only when the bit was actually
+ * set is irq_new_pending raised, followed by a write barrier.
+ */
+void vcpu_unpend_interrupt(struct kvm_vcpu *vcpu, u8 vec)
+{
+        long spsr;
+        int was_pending;
+
+        local_irq_save(spsr);
+        was_pending = test_and_clear_bit(vec, &VCPU(vcpu, irr[0]));
+        local_irq_restore(spsr);
+
+        if (!was_pending)
+                return;
+        vcpu->arch.irq_new_pending = 1;
+        wmb();
+}
+/*
+ * Record the priority class of vector @vec in the virtual VHPI field:
+ * 0 for NULL_VECTOR, 32 for NMI, 16 for ExtINT, otherwise vec / 16.
+ * When vac.a_int is set, PAL_VPS_SET_PENDING_INTERRUPT is then invoked
+ * with the VPD address.
+ */
+void update_vhpi(struct kvm_vcpu *vcpu, int vec)
+{
+        u64 prio = (vec == NULL_VECTOR) ? 0 :
+                   (vec == NMI_VECTOR) ? 32 :
+                   (vec == ExtINT_VECTOR) ? 16 : (vec >> 4);
+
+        VCPU(vcpu, vhpi) = prio;
+        if (!VCPU(vcpu, vac).a_int)
+                return;
+        ia64_call_vsa(PAL_VPS_SET_PENDING_INTERRUPT,
+                        (u64)vcpu->arch.vpd, 0, 0, 0, 0, 0, 0);
+}
+/*
+ * Emulate a guest read of the interrupt vector register.
+ *
+ * Returns the highest pending vector when it is deliverable, moving it
+ * from pending to in-service.  Returns IA64_SPURIOUS_INT_VECTOR when
+ * nothing is pending or the vector is masked; VHPI is updated in those
+ * cases as the code below shows.
+ */
+u64 vcpu_get_ivr(struct kvm_vcpu *vcpu)
+{
+        int pend_vec = highest_pending_irq(vcpu);
+        int insvc_vec = highest_inservice_irq(vcpu);
+        int verdict = irq_masked(vcpu, pend_vec, insvc_vec);
+
+        if (pend_vec == NULL_VECTOR || verdict == IRQ_MASKED_BY_INSVC) {
+                if (VCPU(vcpu, vhpi))
+                        update_vhpi(vcpu, NULL_VECTOR);
+                return IA64_SPURIOUS_INT_VECTOR;
+        }
+        if (verdict == IRQ_MASKED_BY_VTPR) {
+                update_vhpi(vcpu, pend_vec);
+                return IA64_SPURIOUS_INT_VECTOR;
+        }
+
+        /* Deliverable: mark in service, then drop it from pending. */
+        VMX(vcpu, insvc[pend_vec >> 6]) |= (1UL << (pend_vec & 63));
+        vcpu_unpend_interrupt(vcpu, pend_vec);
+        return (u64)pend_vec;
+}
+/**************************************************************************
+  Privileged operation emulation routines
+ **************************************************************************/
+/*
+ * Compute the VHPT entry address for virtual address @vadr.
+ * When pta.vf is set the result comes from PAL_VPS_THASH; otherwise it is
+ * assembled from the region bits of @vadr, the page-indexed offset and
+ * the base held in PTA.
+ */
+u64 vcpu_thash(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        union ia64_pta guest_pta;
+        union ia64_rr guest_rr;
+        u64 offset;
+
+        guest_pta.val = vcpu_get_pta(vcpu);
+        guest_rr.val = vcpu_get_rr(vcpu, vadr);
+        /* 8 bytes per entry, wrapped to the table size. */
+        offset = ((vadr >> guest_rr.ps) << 3) &
+                        ((1UL << (guest_pta.size)) - 1);
+
+        if (guest_pta.vf)
+                return ia64_call_vsa(PAL_VPS_THASH, vadr, guest_rr.val,
+                                guest_pta.val, 0, 0, 0, 0);
+
+        return (vadr & VRN_MASK) | offset |
+                (guest_pta.val << 3 >> (guest_pta.size + 3)
+                                        << (guest_pta.size));
+}
+/*
+ * Compute the translation tag for @vadr: PAL_VPS_TTAG when pta.vf is set,
+ * otherwise the constant 1.
+ */
+u64 vcpu_ttag(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        union ia64_rr guest_rr;
+        union ia64_pta guest_pta;
+
+        guest_pta.val = vcpu_get_pta(vcpu);
+        guest_rr.val = vcpu_get_rr(vcpu, vadr);
+        if (!guest_pta.vf)
+                return 1;
+        return ia64_call_vsa(PAL_VPS_TTAG, vadr, guest_rr.val,
+                                        0, 0, 0, 0, 0);
+}
+/*
+ * Return the protection key of the data translation covering @vadr.
+ * The value 1 is returned when pta.vf is clear, when no VTLB entry is
+ * found, or when the entry is not present.
+ */
+u64 vcpu_tak(struct kvm_vcpu *vcpu, u64 vadr)
+{
+        struct thash_data *entry;
+        union ia64_pta guest_pta;
+
+        guest_pta.val = vcpu_get_pta(vcpu);
+        if (guest_pta.vf == 0)
+                return 1;
+
+        entry = vtlb_lookup(vcpu, vadr, D_TLB);
+        if (entry && entry->p)
+                return entry->key;
+        return 1;
+}
+/* Emulate thash: r1 = vcpu_thash(r3), using the M46 instruction fields. */
+void kvm_thash(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long vadr = vcpu_get_gr(vcpu, inst.M46.r3);
+        unsigned long hash = vcpu_thash(vcpu, vadr);
+
+        vcpu_set_gr(vcpu, inst.M46.r1, hash, 0);
+}
+/* Emulate ttag: r1 = vcpu_ttag(r3), using the M46 instruction fields. */
+void kvm_ttag(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long vadr = vcpu_get_gr(vcpu, inst.M46.r3);
+        unsigned long tag = vcpu_ttag(vcpu, vadr);
+
+        vcpu_set_gr(vcpu, inst.M46.r1, tag, 0);
+}
+/*
+ * Raise the fault required by a translation entry that cannot be used for
+ * tpa: not-present pages and NaT pages.  Returns non-zero if a fault was
+ * raised, zero if @data is usable.
+ */
+static int vcpu_tpa_entry_fault(struct kvm_vcpu *vcpu,
+                                struct thash_data *data, u64 vadr, u64 isr)
+{
+        if (data->p == 0) {
+                vcpu_set_isr(vcpu, isr);
+                data_page_not_present(vcpu, vadr);
+                return 1;
+        }
+        if (data->ma == VA_MATTR_NATPAGE) {
+                vcpu_set_isr(vcpu, isr);
+                dnat_page_consumption(vcpu, vadr);
+                return 1;
+        }
+        return 0;
+}
+
+/*
+ * Translate guest virtual address @vadr to a physical address.
+ *
+ * The VHPT is consulted first, then the VTLB.  On success the result is
+ * stored in *@padr and IA64_NO_FAULT is returned.  Otherwise a fault is
+ * raised for the guest (with ISR.na set, ei/ir copied from the saved
+ * cr_isr) and IA64_FAULT is returned.
+ */
+int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, u64 *padr)
+{
+        struct thash_data *data;
+        union ia64_isr visr, pt_isr;
+        struct ia64_psr vpsr;
+        int vhpt_on;
+
+        pt_isr.val = VMX(vcpu, cr_isr);
+        visr.val = 0;
+        visr.ei = pt_isr.ei;
+        visr.ir = pt_isr.ir;
+        vpsr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+        visr.na = 1;
+
+        data = vhpt_lookup(vadr);
+        if (data) {
+                if (vcpu_tpa_entry_fault(vcpu, data, vadr, visr.val))
+                        return IA64_FAULT;
+                *padr = (data->gpaddr >> data->ps << data->ps) |
+                        (vadr & (PSIZE(data->ps) - 1));
+                return IA64_NO_FAULT;
+        }
+
+        data = vtlb_lookup(vcpu, vadr, D_TLB);
+        if (data) {
+                if (vcpu_tpa_entry_fault(vcpu, data, vadr, visr.val))
+                        return IA64_FAULT;
+                /* ppn is kept in 4K units; rebase it to the entry's size. */
+                *padr = ((data->ppn >> (data->ps - 12)) << data->ps)
+                        | (vadr & (PSIZE(data->ps) - 1));
+                return IA64_NO_FAULT;
+        }
+
+        /* No translation at all: pick the miss fault to raise. */
+        vhpt_on = vhpt_enabled(vcpu, vadr, NA_REF);
+        if (!vpsr.ic) {
+                nested_dtlb(vcpu);
+                return IA64_FAULT;
+        }
+        vcpu_set_isr(vcpu, visr.val);
+        if (vhpt_on)
+                dvhpt_fault(vcpu, vadr);
+        else
+                alt_dtlb(vcpu, vadr);
+        return IA64_FAULT;
+}
+/*
+ * Emulate tpa: translate the address in r3 and, on success, write the
+ * physical address to r1.  Returns IA64_FAULT if vcpu_tpa() reported a
+ * fault, IA64_NO_FAULT otherwise.
+ */
+int kvm_tpa(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long phys;
+        unsigned long virt = vcpu_get_gr(vcpu, inst.M46.r3);
+
+        if (vcpu_tpa(vcpu, virt, &phys) != 0)
+                return IA64_FAULT;
+
+        vcpu_set_gr(vcpu, inst.M46.r1, phys, 0);
+        return IA64_NO_FAULT;
+}
+/* Emulate tak: r1 = vcpu_tak(r3), using the M46 instruction fields. */
+void kvm_tak(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long vadr = vcpu_get_gr(vcpu, inst.M46.r3);
+        unsigned long key = vcpu_tak(vcpu, vadr);
+
+        vcpu_set_gr(vcpu, inst.M46.r1, key, 0);
+}
+/************************************
+ * Insert/Purge translation register/cache
+ ************************************/
+/*
+ * Insert an instruction translation cache entry: hands @pte, @itir and
+ * @ifa to thash_purge_and_insert() with type I_TLB.
+ */
+void vcpu_itc_i(struct kvm_vcpu *vcpu, u64 pte, u64 itir, u64 ifa)
+{
+        thash_purge_and_insert(vcpu, pte, itir, ifa, I_TLB);
+}
+/*
+ * Insert a data translation cache entry: hands @pte, @itir and @ifa to
+ * thash_purge_and_insert() with type D_TLB.
+ */
+void vcpu_itc_d(struct kvm_vcpu *vcpu, u64 pte, u64 itir, u64 ifa)
+{
+        thash_purge_and_insert(vcpu, pte, itir, ifa, D_TLB);
+}
+/*
+ * Install an instruction translation register.
+ *
+ * @slot: index into vcpu->arch.itrs[].
+ * @pte:  page table entry; bits in PAGE_FLAGS_RV_MASK are cleared first.
+ * @itir: supplies the page size.
+ * @ifa:  faulting address; aligned down to the page size.
+ *
+ * NOTE(review): @slot is not range-checked in this function.
+ */
+void vcpu_itr_i(struct kvm_vcpu *vcpu, u64 slot, u64 pte, u64 itir, u64 ifa)
+{
+        struct thash_data *tr;
+        u64 page_size, page_va, region_id;
+
+        page_size = itir_ps(itir);
+        page_va = PAGEALIGN(ifa, page_size);
+        pte &= ~PAGE_FLAGS_RV_MASK;
+        region_id = vcpu_get_rr(vcpu, ifa) & RR_RID_MASK;
+
+        tr = (struct thash_data *)&vcpu->arch.itrs[slot];
+        vcpu_set_tr(tr, pte, itir, page_va, region_id);
+        vcpu_quick_region_set(VMX(vcpu, itr_regions), page_va);
+}
+/*
+ * Install a data translation register.
+ *
+ * @slot: index into vcpu->arch.dtrs[].
+ * @pte:  page table entry; bits in PAGE_FLAGS_RV_MASK are cleared, and
+ *        VTLB_PTE_IO is set when the frame is I/O per __gpfn_is_io().
+ * @itir: supplies the page size.
+ * @ifa:  faulting address; aligned down to the page size.
+ *
+ * Existing entries overlapping the range are purged first, except for
+ * 16M pages.
+ *
+ * NOTE(review): @slot is not range-checked in this function.
+ */
+void vcpu_itr_d(struct kvm_vcpu *vcpu, u64 slot, u64 pte, u64 itir, u64 ifa)
+{
+        u64 gpfn;
+        u64 ps, va, rid;
+        struct thash_data *p_dtr;
+
+        ps = itir_ps(itir);
+        va = PAGEALIGN(ifa, ps);
+        pte &= ~PAGE_FLAGS_RV_MASK;
+        if (ps != _PAGE_SIZE_16M)
+                thash_purge_entries(vcpu, va, ps);
+        gpfn = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
+        if (__gpfn_is_io(gpfn))
+                pte |= VTLB_PTE_IO;
+        rid = vcpu_get_rr(vcpu, va);
+        rid = rid & RR_RID_MASK;
+        p_dtr = (struct thash_data *)&vcpu->arch.dtrs[slot];
+        vcpu_set_tr(p_dtr, pte, itir, va, rid);
+        vcpu_quick_region_set(VMX(vcpu, dtr_regions), va);
+}
+/*
+ * Purge data translation registers overlapping [@ifa, page size @ps):
+ * every overlapping DTR has its page_flags zeroed, then matching hash
+ * entries are purged.
+ */
+void vcpu_ptr_d(struct kvm_vcpu *vcpu, u64 ifa, u64 ps)
+{
+        u64 va = PAGEALIGN(ifa, ps);
+        int hit;
+
+        for (;;) {
+                hit = vtr_find_overlap(vcpu, va, ps, D_TLB);
+                if (hit < 0)
+                        break;
+                vcpu->arch.dtrs[hit].page_flags = 0;
+        }
+        thash_purge_entries(vcpu, va, ps);
+}
+/*
+ * Purge instruction translation registers overlapping [@ifa, page size
+ * @ps): every overlapping ITR has its page_flags zeroed, then matching
+ * hash entries are purged.
+ */
+void vcpu_ptr_i(struct kvm_vcpu *vcpu, u64 ifa, u64 ps)
+{
+        u64 va = PAGEALIGN(ifa, ps);
+        int hit;
+
+        for (;;) {
+                hit = vtr_find_overlap(vcpu, va, ps, I_TLB);
+                if (hit < 0)
+                        break;
+                vcpu->arch.itrs[hit].page_flags = 0;
+        }
+        thash_purge_entries(vcpu, va, ps);
+}
+/* Local purge: drop hash entries for the page of size @ps containing @va. */
+void vcpu_ptc_l(struct kvm_vcpu *vcpu, u64 va, u64 ps)
+{
+        u64 page_va = PAGEALIGN(va, ps);
+
+        thash_purge_entries(vcpu, page_va, ps);
+}
+/*
+ * Purge every translation via thash_purge_all().  @va is accepted for
+ * interface symmetry but is not used.
+ */
+void vcpu_ptc_e(struct kvm_vcpu *vcpu, u64 va)
+{
+        thash_purge_all(vcpu);
+}
+/*
+ * Global purge: fill in the exit data with an EXIT_REASON_PTC_G request
+ * (region register value, address, page size), call vmm_transition(),
+ * and then perform the local purge.  All of this runs with local
+ * interrupts disabled.
+ */
+void vcpu_ptc_ga(struct kvm_vcpu *vcpu, u64 va, u64 ps)
+{
+        struct exit_ctl_data *req = &vcpu->arch.exit_data;
+        long saved_psr;
+
+        local_irq_save(saved_psr);
+
+        req->exit_reason = EXIT_REASON_PTC_G;
+        req->u.ptc_g_data.rr = vcpu_get_rr(vcpu, va);
+        req->u.ptc_g_data.vaddr = va;
+        req->u.ptc_g_data.ps = ps;
+        vmm_transition(vcpu);
+
+        /* Do Local Purge Here*/
+        vcpu_ptc_l(vcpu, va, ps);
+
+        local_irq_restore(saved_psr);
+}
+/* ptc.g is handled exactly like ptc.ga: forward to vcpu_ptc_ga(). */
+void vcpu_ptc_g(struct kvm_vcpu *vcpu, u64 va, u64 ps)
+{
+        vcpu_ptc_ga(vcpu, va, ps);
+}
+/* Emulate ptc.e: read the address from r3 and call vcpu_ptc_e(). */
+void kvm_ptc_e(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+
+        vcpu_ptc_e(vcpu, addr);
+}
+/* Emulate ptc.g: address from r3, page size from the itir value in r2. */
+void kvm_ptc_g(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long size_reg = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_ptc_g(vcpu, addr, itir_ps(size_reg));
+}
+/* Emulate ptc.ga: address from r3, page size from the itir value in r2. */
+void kvm_ptc_ga(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long size_reg = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_ptc_ga(vcpu, addr, itir_ps(size_reg));
+}
+/* Emulate ptc.l: address from r3, page size from the itir value in r2. */
+void kvm_ptc_l(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long size_reg = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_ptc_l(vcpu, addr, itir_ps(size_reg));
+}
+/* Emulate ptr.d: address from r3, page size from the itir value in r2. */
+void kvm_ptr_d(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long size_reg = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_ptr_d(vcpu, addr, itir_ps(size_reg));
+}
+/* Emulate ptr.i: address from r3, page size from the itir value in r2. */
+void kvm_ptr_i(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long addr = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long size_reg = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_ptr_i(vcpu, addr, itir_ps(size_reg));
+}
+/*
+ * Emulate itr.d: slot from r3, pte from r2, with itir and ifa taken from
+ * the guest's control registers.
+ *
+ * NOTE(review): the slot comes straight from a guest register and is not
+ * validated here.
+ */
+void kvm_itr_d(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long tr_slot = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long entry = vcpu_get_gr(vcpu, inst.M45.r2);
+        unsigned long guest_itir = vcpu_get_itir(vcpu);
+        unsigned long guest_ifa = vcpu_get_ifa(vcpu);
+
+        vcpu_itr_d(vcpu, tr_slot, entry, guest_itir, guest_ifa);
+}
+/*
+ * Emulate itr.i: slot from r3, pte from r2, with itir and ifa taken from
+ * the guest's control registers.
+ *
+ * NOTE(review): the slot comes straight from a guest register and is not
+ * validated here.
+ */
+void kvm_itr_i(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long tr_slot = vcpu_get_gr(vcpu, inst.M45.r3);
+        unsigned long entry = vcpu_get_gr(vcpu, inst.M45.r2);
+        unsigned long guest_itir = vcpu_get_itir(vcpu);
+        unsigned long guest_ifa = vcpu_get_ifa(vcpu);
+
+        vcpu_itr_i(vcpu, tr_slot, entry, guest_itir, guest_ifa);
+}
+/*
+ * Emulate itc.d: pte from r2, with itir and ifa taken from the guest's
+ * control registers.
+ */
+void kvm_itc_d(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long guest_itir = vcpu_get_itir(vcpu);
+        unsigned long guest_ifa = vcpu_get_ifa(vcpu);
+        unsigned long entry = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_itc_d(vcpu, entry, guest_itir, guest_ifa);
+}
+/*
+ * Emulate itc.i: pte from r2, with itir and ifa taken from the guest's
+ * control registers.
+ */
+void kvm_itc_i(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long guest_itir = vcpu_get_itir(vcpu);
+        unsigned long guest_ifa = vcpu_get_ifa(vcpu);
+        unsigned long entry = vcpu_get_gr(vcpu, inst.M45.r2);
+
+        vcpu_itc_i(vcpu, entry, guest_itir, guest_ifa);
+}
+/*************************************
+ * Moves to semi-privileged registers
+ *************************************/
+/*
+ * Emulate "mov ar = imm" (M30 form).  The immediate is negated when the
+ * sign field is set, and the result is written to the virtual ITC; the
+ * target AR number is not examined here.
+ */
+void kvm_mov_to_ar_imm(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long imm = inst.M30.imm;
+
+        if (inst.M30.s)
+                imm = -inst.M30.imm;
+        vcpu_set_itc(vcpu, imm);
+}
+/*
+ * Emulate "mov ar = r2" (M29 form): the register value is written to the
+ * virtual ITC; the target AR number is not examined here.
+ */
+void kvm_mov_to_ar_reg(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long src = vcpu_get_gr(vcpu, inst.M29.r2);
+
+        vcpu_set_itc(vcpu, src);
+}
+/*
+ * Emulate "mov r1 = ar" (M31 form): r1 receives the virtual ITC; the
+ * source AR number is not examined here.
+ */
+void kvm_mov_from_ar_reg(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long itc = vcpu_get_itc(vcpu);
+
+        vcpu_set_gr(vcpu, inst.M31.r1, itc, 0);
+}
+/**************************************************************************
+  VCPU protection key register access routines
+ **************************************************************************/
+/*
+ * Read protection key register @reg directly through ia64_get_pkr();
+ * @vcpu is not used.
+ */
+unsigned long vcpu_get_pkr(struct kvm_vcpu *vcpu, unsigned long reg)
+{
+        return (unsigned long)ia64_get_pkr(reg);
+}
+/*
+ * Write @val to protection key register @reg directly through
+ * ia64_set_pkr(); @vcpu is not used.
+ */
+void vcpu_set_pkr(struct kvm_vcpu *vcpu, unsigned long reg, unsigned long val)
+{
+        ia64_set_pkr(reg, val);
+}
+/*
+ * Build the ITIR value to report for a fault at @ifa: only the page size
+ * and region id of the covering region register are kept, every other
+ * bit is zero.
+ */
+unsigned long vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, unsigned long ifa)
+{
+        union ia64_rr guest_rr, result;
+
+        guest_rr.val = vcpu_get_rr(vcpu, ifa);
+
+        result.val = 0;
+        result.ps = guest_rr.ps;
+        result.rid = guest_rr.rid;
+        return result.val;
+}
+/********************************
+ * Moves to privileged registers
+ ********************************/
+/*
+ * Emulate a guest write of @val to the region register selected by @reg
+ * (the region number is reg >> VRN_SHIFT).
+ *
+ * The guest value is always recorded in vcpu->arch.vrr[].  What happens to
+ * the machine register depends on the region:
+ *  - VRN6: the translated value is saved in vmm_rr and the switch is
+ *    requested through vmm_transition() with EXIT_REASON_SWITCH_RR6;
+ *  - VRN0/VRN4: the translated value is saved as the metaphysical rr0/rr4
+ *    and only loaded when the guest is not in physical mode;
+ *  - others: the translated value is loaded immediately.
+ *
+ * Always returns IA64_NO_FAULT.
+ */
+unsigned long vcpu_set_rr(struct kvm_vcpu *vcpu, unsigned long reg,
+                                        unsigned long val)
+{
+        unsigned long rrval;
+        struct exit_ctl_data *p = &vcpu->arch.exit_data;
+        unsigned long psr;
+
+        vcpu->arch.vrr[reg >> VRN_SHIFT] = val;
+        switch ((unsigned long)(reg >> VRN_SHIFT)) {
+        case VRN6:
+                vcpu->arch.vmm_rr = vrrtomrr(val);
+                local_irq_save(psr);
+                p->exit_reason = EXIT_REASON_SWITCH_RR6;
+                vmm_transition(vcpu);
+                local_irq_restore(psr);
+                break;
+        case VRN4:
+                rrval = vrrtomrr(val);
+                vcpu->arch.metaphysical_saved_rr4 = rrval;
+                if (!is_physical_mode(vcpu))
+                        ia64_set_rr(reg, rrval);
+                break;
+        case VRN0:
+                rrval = vrrtomrr(val);
+                vcpu->arch.metaphysical_saved_rr0 = rrval;
+                if (!is_physical_mode(vcpu))
+                        ia64_set_rr(reg, rrval);
+                break;
+        default:
+                ia64_set_rr(reg, vrrtomrr(val));
+                break;
+        }
+        return (IA64_NO_FAULT);
+}
+/* Emulate "mov rr[r3] = r2" (M42 form) via vcpu_set_rr(). */
+void kvm_mov_to_rr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M42.r3);
+        unsigned long value = vcpu_get_gr(vcpu, inst.M42.r2);
+
+        vcpu_set_rr(vcpu, index, value);
+}
+/*
+ * Emulation hook for "mov dbr[r3] = r2".  The body is empty: the guest's
+ * write is discarded.
+ */
+void kvm_mov_to_dbr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+}
+/*
+ * Emulation hook for "mov ibr[r3] = r2".  The body is empty: the guest's
+ * write is discarded.
+ */
+void kvm_mov_to_ibr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+}
+/* Emulate "mov pmc[r3] = r2" (M42 form) via vcpu_set_pmc(). */
+void kvm_mov_to_pmc(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M42.r3);
+        unsigned long value = vcpu_get_gr(vcpu, inst.M42.r2);
+
+        vcpu_set_pmc(vcpu, index, value);
+}
+/* Emulate "mov pmd[r3] = r2" (M42 form) via vcpu_set_pmd(). */
+void kvm_mov_to_pmd(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M42.r3);
+        unsigned long value = vcpu_get_gr(vcpu, inst.M42.r2);
+
+        vcpu_set_pmd(vcpu, index, value);
+}
+/* Emulate "mov pkr[r3] = r2" (M42 form) via vcpu_set_pkr(). */
+void kvm_mov_to_pkr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        u64 index = vcpu_get_gr(vcpu, inst.M42.r3);
+        u64 value = vcpu_get_gr(vcpu, inst.M42.r2);
+
+        vcpu_set_pkr(vcpu, index, value);
+}
+/* Emulate "mov r1 = rr[r3]" (M43 form) via vcpu_get_rr(). */
+void kvm_mov_from_rr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_rr(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/* Emulate "mov r1 = pkr[r3]" (M43 form) via vcpu_get_pkr(). */
+void kvm_mov_from_pkr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_pkr(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/* Emulate "mov r1 = dbr[r3]" (M43 form) via vcpu_get_dbr(). */
+void kvm_mov_from_dbr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_dbr(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/* Emulate "mov r1 = ibr[r3]" (M43 form) via vcpu_get_ibr(). */
+void kvm_mov_from_ibr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_ibr(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/* Emulate "mov r1 = pmc[r3]" (M43 form) via vcpu_get_pmc(). */
+void kvm_mov_from_pmc(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_pmc(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/*
+ * Read CPUID register @reg.  The low byte of CPUID[3] is used as the
+ * highest valid index; anything above it reads as 0.
+ */
+unsigned long vcpu_get_cpuid(struct kvm_vcpu *vcpu, unsigned long reg)
+{
+        /* FIXME: This could get called as a result of a rsvd-reg fault */
+        if (reg <= (ia64_get_cpuid(3) & 0xff))
+                return ia64_get_cpuid(reg);
+        return 0;
+}
+/* Emulate "mov r1 = cpuid[r3]" (M43 form) via vcpu_get_cpuid(). */
+void kvm_mov_from_cpuid(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long index = vcpu_get_gr(vcpu, inst.M43.r3);
+        unsigned long value = vcpu_get_cpuid(vcpu, index);
+
+        vcpu_set_gr(vcpu, inst.M43.r1, value, 0);
+}
+/*
+ * Store @val as the guest's task priority register and set irq_check so
+ * that interrupt state is re-examined.
+ */
+void vcpu_set_tpr(struct kvm_vcpu *vcpu, unsigned long val)
+{
+        VCPU(vcpu, tpr) = val;
+        vcpu->arch.irq_check = 1;
+}
+/*
+ * Emulate "mov cr3 = r2" (M32 form).  The value is always stored into the
+ * virtual control register array; registers 0 (dcr), 1 (itm), 66 (tpr)
+ * and 67 (eoi) additionally go through their dedicated setters.
+ * Always returns 0.
+ */
+unsigned long kvm_mov_to_cr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long value = vcpu_get_gr(vcpu, inst.M32.r2);
+
+        VCPU(vcpu, vcr[inst.M32.cr3]) = value;
+
+        if (inst.M32.cr3 == 0)
+                vcpu_set_dcr(vcpu, value);
+        else if (inst.M32.cr3 == 1)
+                vcpu_set_itm(vcpu, value);
+        else if (inst.M32.cr3 == 66)
+                vcpu_set_tpr(vcpu, value);
+        else if (inst.M32.cr3 == 67)
+                vcpu_set_eoi(vcpu, value);
+
+        return 0;
+}
+/*
+ * Emulate "mov r1 = cr3" (M33 form).  Register 65 (ivr) is read through
+ * vcpu_get_ivr(), register 67 (eoi) reads as 0, and every other register
+ * is taken from the virtual control register array.  Always returns 0.
+ */
+unsigned long kvm_mov_from_cr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long dest = inst.M33.r1;
+        unsigned long result;
+
+        if (inst.M33.cr3 == 65)
+                result = vcpu_get_ivr(vcpu);
+        else if (inst.M33.cr3 == 67)
+                result = 0L;
+        else
+                result = VCPU(vcpu, vcr[inst.M33.cr3]);
+
+        vcpu_set_gr(vcpu, dest, result, 0);
+        return 0;
+}
+/*
+ * Install @val as the guest's virtual PSR and propagate the permitted
+ * bits into the machine PSR image saved in the register frame.
+ */
+void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
+{
+        struct ia64_psr old_psr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        struct ia64_psr new_psr;
+        unsigned long keep;
+
+        /*
+         * A guest PSR with pk, is or vm set is not supported: panic the VM.
+         */
+        if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
+                panic_vm(vcpu);
+
+        /*
+         * id/da/dd/ss/ed/ia become 0 after each successfully executed
+         * instruction, so they are not kept in the virtual PSR.
+         */
+        VCPU(vcpu, vpsr) = val & ~(IA64_PSR_ID | IA64_PSR_DA | IA64_PSR_DD |
+                                   IA64_PSR_SS | IA64_PSR_ED | IA64_PSR_IA);
+
+        /* vpsr.i going 0->1 requires an interrupt re-check. */
+        if ((val & IA64_PSR_I) && !old_psr.i)
+                vcpu->arch.irq_check = 1;
+
+        new_psr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+
+        /*
+         * Every virtual PSR bit goes to the machine PSR image except
+         * ic/i/dt/si/rt/mc/it/bn/vm, which keep their current values.
+         */
+        keep = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_DT | IA64_PSR_SI |
+               IA64_PSR_RT | IA64_PSR_MC | IA64_PSR_IT | IA64_PSR_BN |
+               IA64_PSR_VM;
+        regs->cr_ipsr = (regs->cr_ipsr & keep) | (val & ~keep);
+
+        check_mm_mode_switch(vcpu, old_psr, new_psr);
+}
+/*
+ * Emulate the cover instruction: when vpsr.ic is clear the saved cr_ifs
+ * is copied into the virtual IFS, and cr_ifs is then set to IA64_IFS_V.
+ * Always returns IA64_NO_FAULT.
+ */
+unsigned long vcpu_cover(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *frame = vcpu_regs(vcpu);
+        struct ia64_psr cur_psr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+
+        if (cur_psr.ic == 0)
+                VCPU(vcpu, ifs) = frame->cr_ifs;
+        frame->cr_ifs = IA64_IFS_V;
+        return IA64_NO_FAULT;
+}
+/**************************************************************************
+  VCPU banked general register access routines
+ **************************************************************************/
+/*
+ * Exchange the 16-bit "unat" fields that accompany a switch to bank 0:
+ *   - bits [SLOT, SLOT+16) of *runat are copied into bits 0-15 of *b1unat
+ *   - bits 16-31 of *b0unat are copied into bits [SLOT, SLOT+16) of *runat
+ * *b0unat itself is not modified.  `i` is unused and only kept so existing
+ * callers compile unchanged.
+ *
+ * This used to be an asm block that wrote to operands declared input-only
+ * (%0, %1, %3), which GCC does not allow; the same extract/deposit
+ * sequence is now expressed in C.  The empty asm keeps the compiler
+ * barrier the old "memory" clobber provided.
+ */
+#define vcpu_bsw0_unat(i, b0unat, b1unat, runat, VMM_PT_REGS_R16_SLOT)  \
+        do {                                                            \
+                unsigned long bsw_bits;                                 \
+                (void)(i);                                              \
+                bsw_bits = (*(runat) >> (VMM_PT_REGS_R16_SLOT)) & 0xffffUL; \
+                *(b1unat) = (*(b1unat) & ~0xffffUL) | bsw_bits;         \
+                bsw_bits = (*(b0unat) >> 16) & 0xffffUL;                \
+                *(runat) = (*(runat) &                                  \
+                        ~(0xffffUL << (VMM_PT_REGS_R16_SLOT))) |        \
+                        (bsw_bits << (VMM_PT_REGS_R16_SLOT));           \
+                __asm__ __volatile__ ("" : : : "memory");               \
+        } while (0)
+/*
+ * Emulate "bsw.0".  Acts only when vpsr.bn is set: the 16 registers
+ * starting at regs->r16 are saved into vgr[], reloaded from vbgr[], the
+ * matching unat fields are exchanged and vpsr.bn is cleared.
+ */
+void vcpu_bsw0(struct kvm_vcpu *vcpu)
+{
+        unsigned long i;
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        unsigned long *live = &regs->r16;
+        unsigned long *bank0 = &VCPU(vcpu, vbgr[0]);
+        unsigned long *bank1 = &VCPU(vcpu, vgr[0]);
+        unsigned long *runat = &regs->eml_unat;
+        unsigned long *b0unat = &VCPU(vcpu, vbnat);
+        unsigned long *b1unat = &VCPU(vcpu, vnat);
+        /* vpsr.bn already clear: nothing to switch. */
+        if (!(VCPU(vcpu, vpsr) & IA64_PSR_BN))
+                return;
+        for (i = 0; i < 16; i++) {
+                bank1[i] = live[i];
+                live[i] = bank0[i];
+        }
+        vcpu_bsw0_unat(i, b0unat, b1unat, runat, VMM_PT_REGS_R16_SLOT);
+        VCPU(vcpu, vpsr) &= ~IA64_PSR_BN;
+}
+/*
+ * Exchange the 16-bit "unat" fields that accompany a switch to bank 1:
+ *   - bits [SLOT, SLOT+16) of *runat are copied into bits 16-31 of *b0unat
+ *   - bits 0-15 of *b1unat are copied into bits [SLOT, SLOT+16) of *runat
+ * *b1unat itself is not modified.  `i` is unused and only kept so existing
+ * callers compile unchanged.
+ *
+ * This used to be an asm block that wrote to operands declared input-only
+ * (%0, %1, %3), which GCC does not allow; the same extract/deposit
+ * sequence is now expressed in C.  The empty asm keeps the compiler
+ * barrier the old "memory" clobber provided.
+ */
+#define vcpu_bsw1_unat(i, b0unat, b1unat, runat, VMM_PT_REGS_R16_SLOT)  \
+        do {                                                            \
+                unsigned long bsw_bits;                                 \
+                (void)(i);                                              \
+                bsw_bits = (*(runat) >> (VMM_PT_REGS_R16_SLOT)) & 0xffffUL; \
+                *(b0unat) = (*(b0unat) & ~(0xffffUL << 16)) |           \
+                        (bsw_bits << 16);                               \
+                bsw_bits = *(b1unat) & 0xffffUL;                        \
+                *(runat) = (*(runat) &                                  \
+                        ~(0xffffUL << (VMM_PT_REGS_R16_SLOT))) |        \
+                        (bsw_bits << (VMM_PT_REGS_R16_SLOT));           \
+                __asm__ __volatile__ ("" : : : "memory");               \
+        } while (0)
+/*
+ * Emulate "bsw.1".  Acts only when vpsr.bn is clear: the 16 registers
+ * starting at regs->r16 are saved into vbgr[], reloaded from vgr[], the
+ * matching unat fields are exchanged and vpsr.bn is set.
+ */
+void vcpu_bsw1(struct kvm_vcpu *vcpu)
+{
+        unsigned long i;
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        unsigned long *live = &regs->r16;
+        unsigned long *bank0 = &VCPU(vcpu, vbgr[0]);
+        unsigned long *bank1 = &VCPU(vcpu, vgr[0]);
+        unsigned long *runat = &regs->eml_unat;
+        unsigned long *b0unat = &VCPU(vcpu, vbnat);
+        unsigned long *b1unat = &VCPU(vcpu, vnat);
+        /* vpsr.bn already set: nothing to switch. */
+        if (VCPU(vcpu, vpsr) & IA64_PSR_BN)
+                return;
+        for (i = 0; i < 16; i++) {
+                bank0[i] = live[i];
+                live[i] = bank1[i];
+        }
+        vcpu_bsw1_unat(i, b0unat, b1unat, runat, VMM_PT_REGS_R16_SLOT);
+        VCPU(vcpu, vpsr) |= IA64_PSR_BN;
+}
+/*
+ * Emulate "rfi": select the register bank named by the guest's ipsr.bn,
+ * install ipsr as the new PSR, restore cr_ifs from the guest's ifs when
+ * its top (valid) bit is set, and resume at the guest's iip.
+ */
+void vcpu_rfi(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        unsigned long new_psr = VCPU(vcpu, ipsr);
+        unsigned long saved_ifs;
+        if (new_psr & IA64_PSR_BN)
+                vcpu_bsw1(vcpu);
+        else
+                vcpu_bsw0(vcpu);
+        vcpu_set_psr(vcpu, new_psr);
+        saved_ifs = VCPU(vcpu, ifs);
+        /* Bit 63 of ifs is IA64_IFS_V. */
+        if (saved_ifs & IA64_IFS_V)
+                regs->cr_ifs = saved_ifs;
+        regs->cr_iip = VCPU(vcpu, iip);
+}
+/*
+ * Return the complete guest PSR.  vpsr does not track the
+ * be/up/ac/mfl/mfh/cpl/ri bits, so those are taken from the saved
+ * cr_ipsr; every other bit comes from vpsr.
+ */
+unsigned long vcpu_get_psr(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        const unsigned long untracked = IA64_PSR_BE | IA64_PSR_UP |
+                IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH |
+                IA64_PSR_CPL | IA64_PSR_RI;
+        unsigned long from_vpsr = VCPU(vcpu, vpsr) & ~untracked;
+        unsigned long from_ipsr = regs->cr_ipsr & untracked;
+        return from_vpsr | from_ipsr;
+}
+/*
+ * Emulate "rsm": clear in the guest PSR every bit that is set in the
+ * instruction's 24-bit immediate (M44 fields imm, i2 and i).
+ */
+void kvm_rsm(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long clear_bits;
+        clear_bits = inst.M44.imm | (inst.M44.i2 << 21) | (inst.M44.i << 23);
+        vcpu_set_psr(vcpu, vcpu_get_psr(vcpu) & ~clear_bits);
+}
+/*
+ * Emulate "ssm": set in the guest PSR every bit that is set in the
+ * instruction's 24-bit immediate (M44 fields imm, i2 and i).
+ */
+void kvm_ssm(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long set_bits;
+        set_bits = inst.M44.imm | (inst.M44.i2 << 21) | (inst.M44.i << 23);
+        vcpu_set_psr(vcpu, vcpu_get_psr(vcpu) | set_bits);
+}
+/* Generate a bit mask with a single "dep" instruction.
+ * Parameters:
+ *  bit -- position of the lowest bit of the mask
+ *  len -- number of consecutive bits to set
+ * Evaluates to a __u64 in which -1 has been deposited into r0 at [bit, bit+len); both arguments are "M" asm operands, so presumably they must be compile-time constants.
+ */
+#define MASK(bit,len)                                   \
+({                                                      \
+                __u64   ret;                            \
+                                                        \
+                __asm __volatile("dep %0=-1, r0, %1, %2"\
+                                : "=r" (ret):           \
+                  "M" (bit),                            \
+                  "M" (len));                           \
+                ret;                                    \
+})
+/*
+ * Replace only the low 32 bits of the guest PSR with those of val; the
+ * upper 32 bits keep their current value.
+ */
+void vcpu_set_psr_l(struct kvm_vcpu *vcpu, unsigned long val)
+{
+        unsigned long low = val & MASK(0, 32);
+        unsigned long high = vcpu_get_psr(vcpu) & MASK(32, 32);
+        vcpu_set_psr(vcpu, low | high);
+}
+/*
+ * Emulate "mov psr.l = r2": the value of guest register M35.r2 becomes
+ * the low half of the guest PSR.
+ */
+void kvm_mov_to_psr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        vcpu_set_psr_l(vcpu, vcpu_get_gr(vcpu, inst.M35.r2));
+}
+/*
+ * Emulate "mov r1 = psr": store the guest PSR into guest register
+ * M33.r1, keeping only bits 0-31 and bits 35-36.
+ */
+void kvm_mov_from_psr(struct kvm_vcpu *vcpu, INST64 inst)
+{
+        unsigned long psr = vcpu_get_psr(vcpu);
+        unsigned long visible = MASK(0, 32) | MASK(35, 2);
+        vcpu_set_gr(vcpu, inst.M33.r1, psr & visible, 0);
+}
+/*
+ * Advance the saved guest instruction pointer by one slot: ipsr.ri is
+ * incremented, and stepping past slot 2 wraps to slot 0 of the next
+ * 16-byte bundle.
+ */
+void vcpu_increment_iip(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        struct ia64_psr *ipsr = (struct ia64_psr *)&regs->cr_ipsr;
+        if (ipsr->ri != 2) {
+                ipsr->ri++;
+                return;
+        }
+        ipsr->ri = 0;
+        regs->cr_iip += 16;
+}
+/*
+ * Move the saved guest instruction pointer back by one slot: ipsr.ri is
+ * decremented, and stepping back from slot 0 wraps to slot 2 of the
+ * previous 16-byte bundle.
+ */
+void vcpu_decrement_iip(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pt_regs *regs = vcpu_regs(vcpu);
+        struct ia64_psr *ipsr = (struct ia64_psr *)&regs->cr_ipsr;
+        if (ipsr->ri != 0) {
+                ipsr->ri--;
+                return;
+        }
+        ipsr->ri = 2;
+        regs->cr_iip -= 16;
+}
+/** Emulate a privileged operation.
+ *
+ * The reason for the virtualization fault and the faulting instruction
+ * are not passed in; they are read from VMX(vcpu, cause) and
+ * VMX(vcpu, opcode).  After the handler ran, the guest instruction
+ * pointer is advanced by one slot unless the handler reported a fault or
+ * the instruction was rfi (which loads a new iip itself).  Causes without
+ * a handler fall through to the default case and are skipped the same way.
+ *
+ * @param vcpu virtual cpu
+ * @param regs not referenced by this function
+ */
+void kvm_emulate(struct kvm_vcpu *vcpu, struct kvm_pt_regs *regs)
+{
+        unsigned long status, cause, opcode ;
+        INST64 inst;
+        status = IA64_NO_FAULT;
+        cause = VMX(vcpu, cause);
+        opcode = VMX(vcpu, opcode);
+        inst.inst = opcode;
+        /*
+         * Switch to actual virtual rid in rr0 and rr4,
+         * which is required by some tlb related instructions.
+         */
+        prepare_if_physical_mode(vcpu);
+        switch (cause) {
+        case EVENT_RSM:
+                kvm_rsm(vcpu, inst);
+                break;
+        case EVENT_SSM:
+                kvm_ssm(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_PSR:
+                kvm_mov_to_psr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_PSR:
+                kvm_mov_from_psr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_CR:
+                kvm_mov_from_cr(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_CR:
+                kvm_mov_to_cr(vcpu, inst);
+                break;
+        case EVENT_BSW_0:
+                vcpu_bsw0(vcpu);
+                break;
+        case EVENT_BSW_1:
+                vcpu_bsw1(vcpu);
+                break;
+        case EVENT_COVER:
+                vcpu_cover(vcpu);
+                break;
+        case EVENT_RFI:
+                vcpu_rfi(vcpu);
+                break;
+        case EVENT_ITR_D:
+                kvm_itr_d(vcpu, inst);
+                break;
+        case EVENT_ITR_I:
+                kvm_itr_i(vcpu, inst);
+                break;
+        case EVENT_PTR_D:
+                kvm_ptr_d(vcpu, inst);
+                break;
+        case EVENT_PTR_I:
+                kvm_ptr_i(vcpu, inst);
+                break;
+        case EVENT_ITC_D:
+                kvm_itc_d(vcpu, inst);
+                break;
+        case EVENT_ITC_I:
+                kvm_itc_i(vcpu, inst);
+                break;
+        case EVENT_PTC_L:
+                kvm_ptc_l(vcpu, inst);
+                break;
+        case EVENT_PTC_G:
+                kvm_ptc_g(vcpu, inst);
+                break;
+        case EVENT_PTC_GA:
+                kvm_ptc_ga(vcpu, inst);
+                break;
+        case EVENT_PTC_E:
+                kvm_ptc_e(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_RR:
+                kvm_mov_to_rr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_RR:
+                kvm_mov_from_rr(vcpu, inst);
+                break;
+        case EVENT_THASH:
+                kvm_thash(vcpu, inst);
+                break;
+        case EVENT_TTAG:
+                kvm_ttag(vcpu, inst);
+                break;
+        case EVENT_TPA:
+                /* tpa is the only handler whose status is checked. */
+                status = kvm_tpa(vcpu, inst);
+                break;
+        case EVENT_TAK:
+                kvm_tak(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_AR_IMM:
+                kvm_mov_to_ar_imm(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_AR:
+                kvm_mov_to_ar_reg(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_AR:
+                kvm_mov_from_ar_reg(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_DBR:
+                kvm_mov_to_dbr(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_IBR:
+                kvm_mov_to_ibr(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_PMC:
+                kvm_mov_to_pmc(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_PMD:
+                kvm_mov_to_pmd(vcpu, inst);
+                break;
+        case EVENT_MOV_TO_PKR:
+                kvm_mov_to_pkr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_DBR:
+                kvm_mov_from_dbr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_IBR:
+                kvm_mov_from_ibr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_PMC:
+                kvm_mov_from_pmc(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_PKR:
+                kvm_mov_from_pkr(vcpu, inst);
+                break;
+        case EVENT_MOV_FROM_CPUID:
+                kvm_mov_from_cpuid(vcpu, inst);
+                break;
+        case EVENT_VMSW:
+                /* vmsw is not emulated: report a fault, iip stays put. */
+                status = IA64_FAULT;
+                break;
+        default:
+                break;
+        }
+        /* Every handler except tpa and vmsw is assumed to succeed. */
+        if (status == IA64_NO_FAULT && cause != EVENT_RFI)
+                vcpu_increment_iip(vcpu);
+        recover_if_physical_mode(vcpu);
+}
+/*
+ * Put a vcpu into its initial state: guest in physical mode, all eight
+ * virtual region registers set to 0x38, vpsr with only bn set, and the
+ * interrupt-related VPD fields and the in-service bitmap reset.
+ */
+void init_vcpu(struct kvm_vcpu *vcpu)
+{
+        int i;
+        vcpu->arch.mode_flags = GUEST_IN_PHY;
+        for (i = 0; i < 8; i++)
+                VMX(vcpu, vrr[i]) = 0x38;
+        VCPU(vcpu, vpsr) = IA64_PSR_BN;
+        VCPU(vcpu, dcr) = 0;
+        /* pta.size must not be 0.  The minimum is 15 (32k) */
+        VCPU(vcpu, pta) = 15 << 2;
+        VCPU(vcpu, itv) = 0x10000;
+        VCPU(vcpu, itm) = 0;
+        VMX(vcpu, last_itc) = 0;
+        VCPU(vcpu, lid) = VCPU_LID(vcpu);
+        VCPU(vcpu, ivr) = 0;
+        VCPU(vcpu, tpr) = 0x10000;
+        VCPU(vcpu, eoi) = 0;
+        for (i = 0; i < 4; i++)
+                VCPU(vcpu, irr[i]) = 0;
+        VCPU(vcpu, pmv) = 0x10000;
+        VCPU(vcpu, cmcv) = 0x10000;
+        VCPU(vcpu, lrr0) = 0x10000;   /* default reset value? */
+        VCPU(vcpu, lrr1) = 0x10000;   /* default reset value? */
+        update_vhpi(vcpu, NULL_VECTOR);
+        VLSAPIC_XTP(vcpu) = 0x80;       /* disabled */
+        for (i = 0; i < 4; i++)
+                VLSAPIC_INSVC(vcpu, i) = 0;
+}
+void kvm_init_all_rr(struct kvm_vcpu *vcpu)
+{       /* Loads machine region registers 0-5 and 7 from the vcpu's region-register state. */
+        unsigned long psr;
+        local_irq_save(psr);    /* psr is handed back to ia64_set_psr() at the end */
+        /* WARNING: virtual mode and physical mode must not co-exist in
+         * the same region.
+         */
+        vcpu->arch.metaphysical_saved_rr0 = vrrtomrr(VMX(vcpu, vrr[VRN0]));
+        vcpu->arch.metaphysical_saved_rr4 = vrrtomrr(VMX(vcpu, vrr[VRN4]));
+        if (is_physical_mode(vcpu)) {   /* guest in physical mode: rr0/rr4 get the metaphysical values */
+                if (vcpu->arch.mode_flags & GUEST_PHY_EMUL)
+                        panic_vm(vcpu);
+                ia64_set_rr((VRN0 << VRN_SHIFT), vcpu->arch.metaphysical_rr0);
+                ia64_dv_serialize_data();
+                ia64_set_rr((VRN4 << VRN_SHIFT), vcpu->arch.metaphysical_rr4);
+                ia64_dv_serialize_data();
+        } else {        /* virtual mode: rr0/rr4 get the vrrtomrr() results saved above */
+                ia64_set_rr((VRN0 << VRN_SHIFT),
+                                vcpu->arch.metaphysical_saved_rr0);
+                ia64_dv_serialize_data();
+                ia64_set_rr((VRN4 << VRN_SHIFT),
+                                vcpu->arch.metaphysical_saved_rr4);
+                ia64_dv_serialize_data();
+        }
+        ia64_set_rr((VRN1 << VRN_SHIFT),        /* remaining regions: vrrtomrr() of the virtual rr */
+                        vrrtomrr(VMX(vcpu, vrr[VRN1])));
+        ia64_dv_serialize_data();
+        ia64_set_rr((VRN2 << VRN_SHIFT),
+                        vrrtomrr(VMX(vcpu, vrr[VRN2])));
+        ia64_dv_serialize_data();
+        ia64_set_rr((VRN3 << VRN_SHIFT),
+                        vrrtomrr(VMX(vcpu, vrr[VRN3])));
+        ia64_dv_serialize_data();
+        ia64_set_rr((VRN5 << VRN_SHIFT),
+                        vrrtomrr(VMX(vcpu, vrr[VRN5])));
+        ia64_dv_serialize_data();
+        ia64_set_rr((VRN7 << VRN_SHIFT),        /* note: rr6 is not written by this function */
+                        vrrtomrr(VMX(vcpu, vrr[VRN7])));
+        ia64_dv_serialize_data();
+        ia64_srlz_d();
+        ia64_set_psr(psr);
+}
+/*
+ * Bring up the current vcpu: restore its VPD through PAL_VPS_RESTORE,
+ * initialise the vtlb, the vhpt, the vcpu state and all region
+ * registers, then call vmm_reset_entry().  Returns 0.
+ */
+int vmm_entry(void)
+{
+        struct kvm_vcpu *v = current_vcpu;
+        unsigned long vpd = (unsigned long)v->arch.vpd;
+        ia64_call_vsa(PAL_VPS_RESTORE, vpd, 0, 0, 0, 0, 0, 0);
+        kvm_init_vtlb(v);
+        kvm_init_vhpt(v);
+        init_vcpu(v);
+        kvm_init_all_rr(v);
+        vmm_reset_entry();
+        return 0;
+}
+/*
+ * Report a fatal guest condition: record EXIT_REASON_VM_PANIC in the
+ * vcpu's exit data and call vmm_transition().  Control is not expected
+ * to come back, so the function spins forever if it does.
+ */
+void panic_vm(struct kvm_vcpu *v)
+{
+        v->arch.exit_data.exit_reason = EXIT_REASON_VM_PANIC;
+        vmm_transition(v);
+        /* Never to return */
+        for (;;)
+                ;
+}
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
new file mode 100644
index 00000000000..b0fcfb62c49
--- /dev/null
+++ b/arch/ia64/kvm/vcpu.h
@@ -0,0 +1,740 @@
+/*
+ *  vcpu.h: vcpu routines
+ *      Copyright (c) 2005, Intel Corporation.
+ *      Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
+ *      Yaozu Dong (Eddie Dong) (Eddie.dong@intel.com)
+ *
+ *      Copyright (c) 2007, Intel Corporation.
+ *      Xuefei Xu (Anthony Xu) (Anthony.xu@intel.com)
+ *      Xiantao Zhang (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __KVM_VCPU_H__
+#define __KVM_VCPU_H__
+#include <asm/types.h>
+#include <asm/fpu.h>
+#include <asm/processor.h>
+#ifndef __ASSEMBLY__
+#include "vti.h"
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+typedef unsigned long IA64_INST;
+typedef union U_IA64_BUNDLE {  /* one 128-bit instruction bundle */
+        unsigned long i64[2];   /* raw view: two 64-bit words */
+        struct { unsigned long template:5, slot0:41, slot1a:18,
+                slot1b:23, slot2:41; }; /* slot1 is split 18 + 23 across the two words */
+        /* NOTE: the following does not work because bitfields cannot cross
+           natural size boundaries (5 + 41 + 41 exceeds one 64-bit word):
+           struct { unsigned long template:5, slot0:41, slot1:41, slot2:41; }; */
+} IA64_BUNDLE;
+typedef union U_INST64_A5 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, imm7b:7, r3:2, imm5c:5,
+                imm9d:9, s:1, major:4; };
+} INST64_A5;
+typedef union U_INST64_B4 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, btype:3, un3:3, p:1, b2:3, un11:11, x6:6,
+                wh:2, d:1, un1:1, major:4; };
+} INST64_B4;
+typedef union U_INST64_B8 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, un21:21, x6:6, un4:4, major:4; };
+} INST64_B8;
+typedef union U_INST64_B9 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm20:20, :1, x6:6, :3, i:1, major:4; };
+} INST64_B9;
+typedef union U_INST64_I19 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm20:20, :1, x6:6, x3:3, i:1, major:4; };
+} INST64_I19;
+typedef union U_INST64_I26 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, ar3:7, x6:6, x3:3, :1, major:4; };
+} INST64_I26;
+typedef union U_INST64_I27 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, imm:7, ar3:7, x6:6, x3:3, s:1, major:4; };
+} INST64_I27;
+typedef union U_INST64_I28 { /* not privileged (mov from AR) */
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, :7, ar3:7, x6:6, x3:3, :1, major:4; };
+} INST64_I28;
+typedef union U_INST64_M28 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :14, r3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M28;
+typedef union U_INST64_M29 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, ar3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M29;
+typedef union U_INST64_M30 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, imm:7, ar3:7, x4:4, x2:2,
+                x3:3, s:1, major:4; };
+} INST64_M30;
+typedef union U_INST64_M31 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, :7, ar3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M31;
+typedef union U_INST64_M32 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, cr3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M32;
+typedef union U_INST64_M33 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, :7, cr3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M33;
+typedef union U_INST64_M35 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, :7, x6:6, x3:3, :1, major:4; };
+} INST64_M35;
+typedef union U_INST64_M36 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, :14, x6:6, x3:3, :1, major:4; };
+} INST64_M36;
+typedef union U_INST64_M37 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm20a:20, :1, x4:4, x2:2, x3:3,
+                i:1, major:4; };
+} INST64_M37;
+typedef union U_INST64_M41 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, :7, x6:6, x3:3, :1, major:4; };
+} INST64_M41;
+typedef union U_INST64_M42 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, r3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M42;
+typedef union U_INST64_M43 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, :7, r3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M43;
+typedef union U_INST64_M44 {   /* set/reset system mask (see the INST64 member comment) */
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm:21, x4:4, i2:2, x3:3, i:1, major:4; };        /* kvm_rsm()/kvm_ssm() build imm24 as imm | i2 << 21 | i << 23 */
+} INST64_M44;
+typedef union U_INST64_M45 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, r2:7, r3:7, x6:6, x3:3, :1, major:4; };
+} INST64_M45;
+typedef union U_INST64_M46 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, un7:7, r3:7, x6:6,
+                x3:3, un1:1, major:4; };
+} INST64_M46;
+typedef union U_INST64_M47 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, un14:14, r3:7, x6:6, x3:3, un1:1, major:4; };
+} INST64_M47;
+typedef union U_INST64_M1{
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, un7:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M1;
+typedef union U_INST64_M2{
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, r2:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M2;
+typedef union U_INST64_M3{
+        IA64_INST inst;
+        struct { unsigned long qp:6, r1:7, imm7:7, r3:7, i:1, hint:2,
+                x6:6, s:1, major:4; };
+} INST64_M3;
+typedef union U_INST64_M4 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, un7:7, r2:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M4;
+typedef union U_INST64_M5 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm7:7, r2:7, r3:7, i:1, hint:2,
+                x6:6, s:1, major:4; };
+} INST64_M5;
+typedef union U_INST64_M6 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, f1:7, un7:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M6;
+typedef union U_INST64_M9 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, f2:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M9;
+typedef union U_INST64_M10 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, imm7:7, f2:7, r3:7, i:1, hint:2,
+                x6:6, s:1, major:4; };
+} INST64_M10;
+typedef union U_INST64_M12 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, f1:7, f2:7, r3:7, x:1, hint:2,
+                x6:6, m:1, major:4; };
+} INST64_M12;
+typedef union U_INST64_M15 {
+        IA64_INST inst;
+        struct { unsigned long qp:6, :7, imm7:7, r3:7, i:1, hint:2,
+                x6:6, s:1, major:4; };
+} INST64_M15;
+typedef union U_INST64 {       /* one instruction slot, viewable through any supported format */
+        IA64_INST inst;
+        struct { unsigned long :37, major:4; } generic;        /* only the major opcode, bits 37-40 */
+        INST64_A5 A5;   /* used in build_hypercall_bundle only */
+        INST64_B4 B4;   /* used in build_hypercall_bundle only */
+        INST64_B8 B8;   /* rfi, bsw.[01] */
+        INST64_B9 B9;   /* break.b */
+        INST64_I19 I19; /* used in build_hypercall_bundle only */
+        INST64_I26 I26; /* mov register to ar (I unit) */
+        INST64_I27 I27; /* mov immediate to ar (I unit) */
+        INST64_I28 I28; /* mov from ar (I unit) */
+        INST64_M1  M1;  /* ld integer */
+        INST64_M2  M2;
+        INST64_M3  M3;
+        INST64_M4  M4;  /* st integer */
+        INST64_M5  M5;
+        INST64_M6  M6;  /* ldfd floating point                  */
+        INST64_M9  M9;  /* stfd floating point                  */
+        INST64_M10 M10; /* stfd floating point                  */
+        INST64_M12 M12;     /* ldfd pair floating point                 */
+        INST64_M15 M15; /* lfetch + imm update                  */
+        INST64_M28 M28; /* purge translation cache entry        */
+        INST64_M29 M29; /* mov register to ar (M unit)          */
+        INST64_M30 M30; /* mov immediate to ar (M unit)         */
+        INST64_M31 M31; /* mov from ar (M unit)                 */
+        INST64_M32 M32; /* mov reg to cr                        */
+        INST64_M33 M33; /* mov from cr                          */
+        INST64_M35 M35; /* mov to psr                           */
+        INST64_M36 M36; /* mov from psr                         */
+        INST64_M37 M37; /* break.m                              */
+        INST64_M41 M41; /* translation cache insert             */
+        INST64_M42 M42; /* mov to indirect reg/translation reg insert*/
+        INST64_M43 M43; /* mov from indirect reg                */
+        INST64_M44 M44; /* set/reset system mask                */
+        INST64_M45 M45; /* translation purge                    */
+        INST64_M46 M46; /* translation access (tpa,tak)         */
+        INST64_M47 M47; /* purge translation entry              */
+} INST64;
+#define MASK_41 ((unsigned long)0x1ffffffffff) /* low 41 bits set */
+/* Virtual address memory attributes encoding */
+#define VA_MATTR_WB         0x0
+#define VA_MATTR_UC         0x4
+#define VA_MATTR_UCE        0x5
+#define VA_MATTR_WC         0x6
+#define VA_MATTR_NATPAGE    0x7
+#define PMASK(size)         (~((size) - 1))    /* clears the low bits when size is a power of two */
+#define PSIZE(size)         (1UL<<(size))      /* 2^size */
+#define CLEARLSB(ppn, nbits)    (((ppn) >> (nbits)) << (nbits))        /* ppn with its low nbits bits cleared */
+#define PAGEALIGN(va, ps)       CLEARLSB(va, ps)       /* ps is a bit count, not a byte size */
+#define PAGE_FLAGS_RV_MASK   (0x2|(0x3UL<<50)|(((1UL<<11)-1)<<53))     /* bit 1, bits 50-51, bits 53-63 */
+#define _PAGE_MA_ST     (0x1 <<  2) /* is reserved for software use */
+#define ARCH_PAGE_SHIFT   12
+#define INVALID_TI_TAG (1UL << 63)
+#define VTLB_PTE_P_BIT      0
+#define VTLB_PTE_IO_BIT     60
+#define VTLB_PTE_IO         (1UL<<VTLB_PTE_IO_BIT)
+#define VTLB_PTE_P          (1UL<<VTLB_PTE_P_BIT)
+#define vcpu_quick_region_check(_tr_regions,_ifa)   /* non-zero when the bit for _ifa's region (address bits 61-63) is set in _tr_regions; arguments are parenthesized so compound expressions evaluate as written */ \
+        ((_tr_regions) & (1 << ((unsigned long)(_ifa) >> 61)))
+#define vcpu_quick_region_set(_tr_regions,_ifa)     /* sets the bit for _ifa's region (address bits 61-63) in _tr_regions; arguments are parenthesized so compound expressions evaluate as written */ \
+        do {(_tr_regions) |= (1 << ((unsigned long)(_ifa) >> 61)); } while (0)
+/*
+ * Fill in a translation entry: pte goes to page_flags, and itir, va and
+ * rid go to the fields named itir, vadr and rid.
+ */
+static inline void vcpu_set_tr(struct thash_data *trp,
+                u64 pte, u64 itir, u64 va, u64 rid)
+{
+        trp->page_flags = pte;
+        trp->itir       = itir;
+        trp->vadr       = va;
+        trp->rid        = rid;
+}
+extern u64 kvm_lookup_mpa(u64 gpfn);
+extern u64 kvm_gpa_to_mpa(u64 gpa);
+/* Evaluates to the I/O type bits (pte & GPFN_IO_MASK) of the entry that
+ * kvm_lookup_mpa() returns for gpfn, or to 0 when that entry has
+ * GPFN_INV_MASK set. */
+#define __gpfn_is_io(gpfn)                      \
+        ({                                              \
+         u64 pte, ret = 0;                      \
+         pte = kvm_lookup_mpa(gpfn);            \
+         if (!(pte & GPFN_INV_MASK))            \
+         ret = pte & GPFN_IO_MASK;      \
+         ret;                                   \
+         })
+#endif
+#define IA64_NO_FAULT   0
+#define IA64_FAULT      1
+#define VMM_RBS_OFFSET  ((VMM_TASK_SIZE + 15) & ~15)
+#define SW_BAD  0   /* Bad mode transition */
+#define SW_V2P  1   /* Physical emulation is activated */
+#define SW_P2V  2   /* Exit physical mode emulation */
+#define SW_SELF 3   /* No mode transition */
+#define SW_NOP  4   /* Mode transition, but without action required */
+#define GUEST_IN_PHY    0x1
+#define GUEST_PHY_EMUL  0x2
+#define current_vcpu ((struct kvm_vcpu *) ia64_getreg(_IA64_REG_TP))
+#define VRN_SHIFT       61
+#define VRN_MASK        0xe000000000000000
+#define VRN0            0x0UL
+#define VRN1            0x1UL
+#define VRN2            0x2UL
+#define VRN3            0x3UL
+#define VRN4            0x4UL
+#define VRN5            0x5UL
+#define VRN6            0x6UL
+#define VRN7            0x7UL
+#define IRQ_NO_MASKED         0
+#define IRQ_MASKED_BY_VTPR    1
+#define IRQ_MASKED_BY_INSVC   2   /* masked by inservice IRQ */
+#define PTA_BASE_SHIFT      15
+#define IA64_PSR_VM_BIT     46
+#define IA64_PSR_VM (__IA64_UL(1) << IA64_PSR_VM_BIT)
+/* Interruption Function State */
+#define IA64_IFS_V_BIT      63
+#define IA64_IFS_V  (__IA64_UL(1) << IA64_IFS_V_BIT)
+#define PHY_PAGE_UC (_PAGE_A|_PAGE_D|_PAGE_P|_PAGE_MA_UC|_PAGE_AR_RWX)
+#define PHY_PAGE_WB (_PAGE_A|_PAGE_D|_PAGE_P|_PAGE_MA_WB|_PAGE_AR_RWX)
+#ifndef __ASSEMBLY__
+#include <asm/gcc_intrin.h>
+#define is_physical_mode(v)     /* non-zero when GUEST_IN_PHY is set in v->arch.mode_flags; v is parenthesized so pointer expressions such as p + 1 or &obj work */ \
+        (((v)->arch.mode_flags) & GUEST_IN_PHY)
+#define is_virtual_mode(v)      /* logical negation of is_physical_mode(v) */ \
+        (!is_physical_mode(v))
+#define MODE_IND(psr)   \
+        (((psr).it << 2) + ((psr).dt << 1) + (psr).rt) /* index built from psr.it, psr.dt and psr.rt (0-7 if each is a single bit - confirm against struct ia64_psr) */
+/*
+ * Acquire the 32-bit lock word at x: try to change it from 0 to 1 with
+ * ia64_cmpxchg4_acq(); while that fails, wait until the word reads 0 and
+ * try again.
+ */
+#define _vmm_raw_spin_lock(x)                                           \
+        do {                                                            \
+                __u32 *ia64_spinlock_ptr = (__u32 *) (x);               \
+                while (unlikely(ia64_cmpxchg4_acq(ia64_spinlock_ptr,    \
+                                                        1, 0))) {       \
+                        while (*ia64_spinlock_ptr)                      \
+                                ia64_barrier();                         \
+                }                                                       \
+        } while (0)
+#define _vmm_raw_spin_unlock(x)  /* release: barrier(), then store 0 to the lock word; x is parenthesized so the cast applies to the whole argument */ \
+        do { barrier();                         \
+                ((spinlock_t *)(x))->raw_lock.lock = 0; } \
+while (0)
+void vmm_spin_lock(spinlock_t *lock);
+void vmm_spin_unlock(spinlock_t *lock);
+enum {
+        I_TLB = 1,
+        D_TLB = 2
+};
+union kvm_va { /* NOTE(review): the split here is 60/4 bits, while VRN_SHIFT and vcpu_get_rr() use bits 61-63 for the region - confirm this is intended */
+        struct {
+                unsigned long off : 60;         /* intra-region offset */
+                unsigned long reg :  4;         /* region number */
+        } f;
+        unsigned long l;
+        void *p;
+};
+#define __kvm_pa(x)     ({union kvm_va _v; _v.l = (long) (x);           \
+                                                _v.f.reg = 0; _v.l; })  /* x with its top 4 bits cleared, as unsigned long */
+#define __kvm_va(x)     ({union kvm_va _v; _v.l = (long) (x);           \
+                                _v.f.reg = -1; _v.p; }) /* x with its top 4 bits set, as void * */
+#define _REGION_ID(x)           ({union ia64_rr _v; _v.val = (long)(x); \
+                                                _v.rid; })      /* rid field of region register value x */
+#define _REGION_PAGE_SIZE(x)    ({union ia64_rr _v; _v.val = (long)(x); \
+                                                _v.ps; })       /* ps field of region register value x */
+#define _REGION_HW_WALKER(x)    ({union ia64_rr _v; _v.val = (long)(x); \
+                                                _v.ve; })       /* ve field of region register value x */
+enum vhpt_ref{ DATA_REF, NA_REF, INST_REF, RSE_REF };
+enum tlb_miss_type { INSTRUCTION, DATA, REGISTER };
+#define VCPU(_v, _x) ((_v)->arch.vpd->_x)      /* field _x of the structure behind vcpu->arch.vpd */
+#define VMX(_v, _x)  ((_v)->arch._x)   /* field _x of vcpu->arch itself */
+#define VLSAPIC_INSVC(vcpu, i) ((vcpu)->arch.insvc[i]) /* word i of the in-service array */
+#define VLSAPIC_XTP(_v)        VMX(_v, xtp)    /* vcpu->arch.xtp */
+/* Extract the 6-bit field at bits 2-7 of an itir value. */
+static inline unsigned long itir_ps(unsigned long itir)
+{
+        const unsigned long shifted = itir >> 2;
+        return shifted & 0x3f;
+}
+/**************************************************************************
+  VCPU control register access routines
+ **************************************************************************/
+/* Return the itir field of the vcpu's VPD. */
+static inline u64 vcpu_get_itir(struct kvm_vcpu *vcpu)
+{
+        const u64 value = (u64)VCPU(vcpu, itir);
+        return value;
+}
+static inline void vcpu_set_itir(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, itir) = val; /* store val in the itir field of the vcpu's VPD */
+}
+/* Return the ifa field of the vcpu's VPD. */
+static inline u64 vcpu_get_ifa(struct kvm_vcpu *vcpu)
+{
+        const u64 value = (u64)VCPU(vcpu, ifa);
+        return value;
+}
+static inline void vcpu_set_ifa(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, ifa) = val;  /* store val in the ifa field of the vcpu's VPD */
+}
+/* Return the iva field of the vcpu's VPD. */
+static inline u64 vcpu_get_iva(struct kvm_vcpu *vcpu)
+{
+        const u64 value = (u64)VCPU(vcpu, iva);
+        return value;
+}
+static inline u64 vcpu_get_pta(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, pta));
+}
+static inline u64 vcpu_get_lid(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, lid));
+}
+static inline u64 vcpu_get_tpr(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, tpr));
+}
+static inline u64 vcpu_get_eoi(struct kvm_vcpu *vcpu)
+{
+        return (0UL);           /*reads of eoi always return 0 */
+}
+static inline u64 vcpu_get_irr0(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, irr[0]));
+}
+static inline u64 vcpu_get_irr1(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, irr[1]));
+}
+static inline u64 vcpu_get_irr2(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, irr[2]));
+}
+static inline u64 vcpu_get_irr3(struct kvm_vcpu *vcpu)
+{
+        return ((u64)VCPU(vcpu, irr[3]));
+}
+/*
+ * Write val to _IA64_REG_CR_DCR with ia64_setreg(); unlike the setters
+ * below, nothing is stored in the vcpu and the vcpu argument is unused.
+ */
+static inline void vcpu_set_dcr(struct kvm_vcpu *vcpu, u64 val)
+{
+        ia64_setreg(_IA64_REG_CR_DCR, val);
+}
+/* Store val into the isr member reached through VCPU(). */
+static inline void vcpu_set_isr(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, isr) = val;
+}
+/* Store val into the lid member reached through VCPU(). */
+static inline void vcpu_set_lid(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, lid) = val;
+}
+/* Store val into the ipsr member reached through VCPU(). */
+static inline void vcpu_set_ipsr(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, ipsr) = val;
+}
+/* Store val into the iip member reached through VCPU(). */
+static inline void vcpu_set_iip(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, iip) = val;
+}
+/* Store val into the ifs member reached through VCPU(). */
+static inline void vcpu_set_ifs(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, ifs) = val;
+}
+/* Store val into the iipa member reached through VCPU(). */
+static inline void vcpu_set_iipa(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, iipa) = val;
+}
+/* Store val into the iha member reached through VCPU(). */
+static inline void vcpu_set_iha(struct kvm_vcpu *vcpu, u64 val)
+{
+        VCPU(vcpu, iha) = val;
+}
+/* Return the arch.vrr[] slot selected by the top three bits of reg. */
+static inline u64 vcpu_get_rr(struct kvm_vcpu *vcpu, u64 reg)
+{
+        const u64 slot = reg >> 61;
+        return vcpu->arch.vrr[slot];
+}
+/**************************************************************************
+  VCPU debug breakpoint register access routines
+ **************************************************************************/
+/* Forward (reg, val) to __ia64_set_dbr(); the vcpu argument is unused. */
+static inline void vcpu_set_dbr(struct kvm_vcpu *vcpu, u64 reg, u64 val)
+{
+        __ia64_set_dbr(reg, val);
+}
+/* Forward (reg, val) to ia64_set_ibr(); the vcpu argument is unused. */
+static inline void vcpu_set_ibr(struct kvm_vcpu *vcpu, u64 reg, u64 val)
+{
+        ia64_set_ibr(reg, val);
+}
+/* Return __ia64_get_dbr(reg) as a u64; the vcpu argument is unused. */
+static inline u64 vcpu_get_dbr(struct kvm_vcpu *vcpu, u64 reg)
+{
+        return (u64)__ia64_get_dbr(reg);
+}
+/* Return ia64_get_ibr(reg) as a u64; the vcpu argument is unused. */
+static inline u64 vcpu_get_ibr(struct kvm_vcpu *vcpu, u64 reg)
+{
+        return (u64)ia64_get_ibr(reg);
+}
+/**************************************************************************
+  VCPU performance monitor register access routines
+ **************************************************************************/
+/* Forward (reg, val) to ia64_set_pmc(); the vcpu argument is unused. */
+static inline void vcpu_set_pmc(struct kvm_vcpu *vcpu, u64 reg, u64 val)
+{
+        /* NOTE: Writes to unimplemented PMC registers are discarded */
+        ia64_set_pmc(reg, val);
+}
+/* Forward (reg, val) to ia64_set_pmd(); the vcpu argument is unused. */
+static inline void vcpu_set_pmd(struct kvm_vcpu *vcpu, u64 reg, u64 val)
+{
+        /* NOTE: Writes to unimplemented PMD registers are discarded */
+        ia64_set_pmd(reg, val);
+}
+/* Return ia64_get_pmc(reg) as a u64; the vcpu argument is unused. */
+static inline u64 vcpu_get_pmc(struct kvm_vcpu *vcpu, u64 reg)
+{
+        /* NOTE: Reads from unimplemented PMC registers return zero */
+        return (u64)ia64_get_pmc(reg);
+}
+/* Return ia64_get_pmd(reg) as a u64; the vcpu argument is unused. */
+static inline u64 vcpu_get_pmd(struct kvm_vcpu *vcpu, u64 reg)
+{
+        /* NOTE: Reads from unimplemented PMD registers return zero */
+        return (u64)ia64_get_pmd(reg);
+}
+/*
+ * Rewrite a region-register image: rid becomes (rid << 4) | 0xe, ps is
+ * capped at PAGE_SHIFT and ve is forced to 1.  Returns the new image.
+ */
+static inline unsigned long vrrtomrr(unsigned long val)
+{
+        union ia64_rr mrr;
+        mrr.val = val;
+        mrr.rid = (mrr.rid << 4) | 0xe;
+        if (mrr.ps > PAGE_SHIFT) {
+                /* never report a page size above PAGE_SHIFT */
+                mrr.ps = PAGE_SHIFT;
+        }
+        mrr.ve = 1;
+        return mrr.val;
+}
+/*
+ * Scan eight 32-bit words (256 bits) from dat[7] down to dat[0] and return
+ * the index of the highest set bit, or NULL_VECTOR when all are zero.
+ */
+static inline int highest_bits(int *dat)
+{
+        int word = 8;
+        while (word-- > 0) {
+                u32 chunk = dat[word];
+                u32 pos;
+                if (!chunk)
+                        continue;
+                /* fls() counts from 1, hence the -1 below */
+                pos = fls(chunk);
+                return word * 32 + pos - 1;
+        }
+        return NULL_VECTOR;
+}
+/*
+ * Nonzero when the pending vector outranks the in-service one: either it
+ * is numerically greater, or something is pending while nothing is in
+ * service (inservice == NULL_VECTOR).
+ */
+static inline int is_higher_irq(int pending, int inservice)
+{
+        if (pending > inservice)
+                return 1;
+        return pending != NULL_VECTOR && inservice == NULL_VECTOR;
+}
+/* Nonzero when (pending >> 4) is strictly greater than mic. */
+static inline int is_higher_class(int pending, int mic)
+{
+        const int pending_class = pending >> 4;
+        return pending_class > mic;
+}
+/*
+ * Highest pending vector (0-255), or NULL_VECTOR when nothing is pending.
+ * The NMI_VECTOR and ExtINT_VECTOR bits of irr[0] are tested first, in
+ * that order; otherwise the irr words are scanned by highest_bits().
+ */
+static inline int highest_pending_irq(struct kvm_vcpu *vcpu)
+{
+        const unsigned long nmi_bit = 1UL << NMI_VECTOR;
+        const unsigned long extint_bit = 1UL << ExtINT_VECTOR;
+        if (VCPU(vcpu, irr[0]) & nmi_bit) {
+                return NMI_VECTOR;
+        }
+        if (VCPU(vcpu, irr[0]) & extint_bit) {
+                return ExtINT_VECTOR;
+        }
+        return highest_bits((int *)&VCPU(vcpu, irr[0]));
+}
+/*
+ * Highest in-service vector, or NULL_VECTOR when none.  The NMI_VECTOR and
+ * ExtINT_VECTOR bits of insvc[0] are tested first, in that order;
+ * otherwise the insvc words are scanned by highest_bits().
+ */
+static inline int highest_inservice_irq(struct kvm_vcpu *vcpu)
+{
+        const unsigned long nmi_bit = 1UL << NMI_VECTOR;
+        const unsigned long extint_bit = 1UL << ExtINT_VECTOR;
+        if (VMX(vcpu, insvc[0]) & nmi_bit) {
+                return NMI_VECTOR;
+        }
+        if (VMX(vcpu, insvc[0]) & extint_bit) {
+                return ExtINT_VECTOR;
+        }
+        return highest_bits((int *)&(VMX(vcpu, insvc[0])));
+}
+/*
+ * Prototypes only: none of the routines below has a body in this part of
+ * the header.  NOTE(review): presumably they are implemented in the other
+ * arch/ia64/kvm sources (C and assembly) - confirm against the Makefile.
+ */
+/* Guest register file and PSR access */
+extern void vcpu_get_fpreg(struct kvm_vcpu *vcpu, u64 reg,
+                                        struct ia64_fpreg *val);
+extern void vcpu_set_fpreg(struct kvm_vcpu *vcpu, u64 reg,
+                                        struct ia64_fpreg *val);
+extern u64 vcpu_get_gr(struct kvm_vcpu *vcpu, u64 reg);
+extern void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 val, int nat);
+extern u64 vcpu_get_psr(struct kvm_vcpu *vcpu);
+extern void vcpu_set_psr(struct kvm_vcpu *vcpu, u64 val);
+extern u64 vcpu_thash(struct kvm_vcpu *vcpu, u64 vadr);
+extern void vcpu_bsw0(struct kvm_vcpu *vcpu);
+/* thash / VHPT / VTLB lookup, insert and purge */
+extern void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte,
+                                        u64 itir, u64 va, int type);
+extern struct thash_data *vhpt_lookup(u64 va);
+extern u64 guest_vhpt_lookup(u64 iha, u64 *pte);
+extern void thash_purge_entries(struct kvm_vcpu *v, u64 va, u64 ps);
+extern void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps);
+extern u64 translate_phy_pte(u64 *pte, u64 itir, u64 va);
+extern int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte,
+                u64 itir, u64 ifa, int type);
+extern void thash_purge_all(struct kvm_vcpu *v);
+extern struct thash_data *vtlb_lookup(struct kvm_vcpu *v,
+                                                u64 va, int is_data);
+extern int vtr_find_overlap(struct kvm_vcpu *vcpu, u64 va,
+                                                u64 ps, int is_data);
+/* Instruction pointer stepping and interrupt pend/unpend */
+extern void vcpu_increment_iip(struct kvm_vcpu *v);
+extern void vcpu_decrement_iip(struct kvm_vcpu *vcpu);
+extern void vcpu_pend_interrupt(struct kvm_vcpu *vcpu, u8 vec);
+extern void vcpu_unpend_interrupt(struct kvm_vcpu *vcpu, u8 vec);
+/* Fault-related entry points taking the vcpu (and usually an address) */
+extern void data_page_not_present(struct kvm_vcpu *vcpu, u64 vadr);
+extern void dnat_page_consumption(struct kvm_vcpu *vcpu, u64 vadr);
+extern void alt_dtlb(struct kvm_vcpu *vcpu, u64 vadr);
+extern void nested_dtlb(struct kvm_vcpu *vcpu);
+extern void dvhpt_fault(struct kvm_vcpu *vcpu, u64 vadr);
+extern int vhpt_enabled(struct kvm_vcpu *vcpu, u64 vadr, enum vhpt_ref ref);
+extern void update_vhpi(struct kvm_vcpu *vcpu, int vec);
+extern int irq_masked(struct kvm_vcpu *vcpu, int h_pending, int h_inservice);
+extern int fetch_code(struct kvm_vcpu *vcpu, u64 gip, IA64_BUNDLE *pbundle);
+extern void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma);
+/* VMM entry, context switch and setup (vmm_entry and vmm_trampoline are
+ * the values stored in vmm_info by vmm.c) */
+extern void vmm_transition(struct kvm_vcpu *vcpu);
+extern void vmm_trampoline(union context *from, union context *to);
+extern int vmm_entry(void);
+extern  u64 vcpu_get_itc(struct kvm_vcpu *vcpu);
+extern void vmm_reset_entry(void);
+void kvm_init_vtlb(struct kvm_vcpu *v);
+void kvm_init_vhpt(struct kvm_vcpu *v);
+void thash_init(struct thash_cb *hcb, u64 sz);
+void panic_vm(struct kvm_vcpu *v);
+extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
+                u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+#endif
+#endif  /* __VCPU_H__ */
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
new file mode 100644
index 00000000000..2275bf4e681
--- /dev/null
+++ b/arch/ia64/kvm/vmm.c
@@ -0,0 +1,66 @@
+/*
+ * vmm.c: vmm module interface with kvm module
+ *
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ *  Xiantao Zhang (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+#include<linux/module.h>
+#include<asm/fpswa.h>
+#include "vcpu.h"
+MODULE_AUTHOR("Intel");
+MODULE_LICENSE("GPL");
+/* Label marking the start of the vector table defined in vmm_ivt.S. */
+extern char kvm_ia64_ivt;
+/* Defined elsewhere; assigned from fpswa_interface in kvm_vmm_init(). */
+extern fpswa_interface_t *vmm_fpswa_interface;
+/*
+ * Description of this VMM module that kvm_vmm_init() passes to kvm_init():
+ * owning module, the vmm_entry and vmm_trampoline entry points, and the
+ * address of the interruption vector table.
+ */
+struct kvm_vmm_info vmm_info = {
+        .module      = THIS_MODULE,
+        .vmm_entry   = vmm_entry,
+        .tramp_entry = vmm_trampoline,
+        .vmm_ivt     = (unsigned long)&kvm_ia64_ivt,
+};
+/*
+ * Module init: copy fpswa_interface into vmm_fpswa_interface, then register
+ * vmm_info with the kvm side.  Returns whatever kvm_init() returns.
+ */
+static int __init kvm_vmm_init(void)
+{
+        int ret;
+        vmm_fpswa_interface = fpswa_interface;
+        /* Register vmm data to kvm side */
+        ret = kvm_init(&vmm_info, 1024, THIS_MODULE);
+        return ret;
+}
+/* Module exit: undo the registration performed at init via kvm_exit(). */
+static void __exit kvm_vmm_exit(void)
+{
+        kvm_exit();
+}
+/* Acquire lock by forwarding to _vmm_raw_spin_lock(). */
+void vmm_spin_lock(spinlock_t *lock)
+{
+        _vmm_raw_spin_lock(lock);
+}
+/* Release lock by forwarding to _vmm_raw_spin_unlock(). */
+void vmm_spin_unlock(spinlock_t *lock)
+{
+        _vmm_raw_spin_unlock(lock);
+}
+/* Hook the init/exit routines above into module load and unload. */
+module_init(kvm_vmm_init)
+module_exit(kvm_vmm_exit)
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
new file mode 100644
index 00000000000..3ee5f481c06
--- /dev/null
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -0,0 +1,1424 @@
+/*
+ * /ia64/kvm_ivt.S
+ *
+ * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
+ *      Stephane Eranian <eranian@hpl.hp.com>
+ *      David Mosberger <davidm@hpl.hp.com>
+ * Copyright (C) 2000, 2002-2003 Intel Co
+ *      Asit Mallick <asit.k.mallick@intel.com>
+ *      Suresh Siddha <suresh.b.siddha@intel.com>
+ *      Kenneth Chen <kenneth.w.chen@intel.com>
+ *      Fenghua Yu <fenghua.yu@intel.com>
+ *
+ *
+ * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling
+ * for SMP
+ * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB
+ * handler now uses virtual PT.
+ *
+ * 07/6/20 Xuefei Xu  (Anthony Xu) (anthony.xu@intel.com)
+ *              Supporting Intel virtualization architecture
+ *
+ */
+/*
+ * This file defines the interruption vector table used by the CPU.
+ * It does not include one entry per possible cause of interruption.
+ *
+ * The first 20 entries of the table contain 64 bundles each while the
+ * remaining 48 entries contain only 16 bundles each.
+ *
+ * The 64 bundles are used to allow inlining the whole handler for
+ * critical
+ * interruptions like TLB misses.
+ *
+ *  For each entry, the comment is as follows:
+ *
+ *              // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
+ *  entry offset ----/     /         /                  /          /
+ *  entry number ---------/         /                  /          /
+ *  size of the entry -------------/                  /          /
+ *  vector name -------------------------------------/          /
+ *  interruptions triggering this vector ----------------------/
+ *
+ * The table is 32KB in size and must be aligned on 32KB
+ * boundary.
+ * (The CPU ignores the 15 lower bits of the address)
+ *
+ * Table is based upon EAS2.6 (Oct 1999)
+ */
+#include <asm/asmmacro.h>
+#include <asm/cache.h>
+#include <asm/pgtable.h>
+#include "asm-offsets.h"
+#include "vcpu.h"
+#include "kvm_minstate.h"
+#include "vti.h"
+// PSR_DEFAULT_BITS is psr.ac while the first branch is hard-wired on;
+// the alternative definition (0) is currently compiled out.
+#if 1
+# define PSR_DEFAULT_BITS   psr.ac
+#else
+# define PSR_DEFAULT_BITS   0
+#endif
+/*
+ * KVM_FAULT(n): emit label kvm_fault_<n>, load r19 with n and branch back
+ * to that label forever.  Used for vectors that are not handled.
+ * The last line deliberately has no trailing backslash so that the macro
+ * ends here and cannot absorb whatever line follows it.
+ */
+#define KVM_FAULT(n)    \
+    kvm_fault_##n:;          \
+    mov r19=n;;          \
+    br.sptk.many kvm_fault_##n;         \
+    ;;
+/*
+ * KVM_REFLECT(n): save the predicates in r31, load r19 with n and read
+ * cr.ipsr into r29.  If IA64_PSR_VM_BIT is set in r29 (p7) branch to
+ * kvm_dispatch_reflection, otherwise branch to kvm_panic.
+ * As above, the final line carries no continuation backslash.
+ */
+#define KVM_REFLECT(n)    \
+    mov r31=pr;      /* prepare to save predicates */ \
+    mov r19=n;            \
+    mov r29=cr.ipsr;      \
+    ;;      \
+    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
+(p7)br.sptk.many kvm_dispatch_reflection;        \
+    br.sptk.many kvm_panic;
+// kvm_panic: branch-to-self.  Control that arrives here never leaves;
+// KVM_REFLECT jumps here when the VM bit of the saved PSR is clear.
+GLOBAL_ENTRY(kvm_panic)
+    br.sptk.many kvm_panic
+    ;;
+END(kvm_panic)
+// Start of the vector table: own section, 32KB aligned, exported as
+// kvm_ia64_ivt (vmm.c stores its address in vmm_info.vmm_ivt).
+    .section .text.ivt,"ax"
+    .align 32768    // align on 32KB boundary
+    .global kvm_ia64_ivt
+kvm_ia64_ivt:
+///////////////////////////////////////////////////////////////
+// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
+// Not handled: spins at kvm_fault_0 with r19 = 0.
+ENTRY(kvm_vhpt_miss)
+    KVM_FAULT(0)
+END(kvm_vhpt_miss)
+    .org kvm_ia64_ivt+0x400
+////////////////////////////////////////////////////////////////
+// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
+// Saves pr in r31 and tests the VM bit of cr.ipsr: when it is clear (p6)
+// the miss is passed to kvm_alt_itlb_miss, otherwise r19 is set to 1 and
+// control goes to kvm_itlb_miss_dispatch.
+ENTRY(kvm_itlb_miss)
+    mov r31 = pr
+    mov r29=cr.ipsr;
+    ;;
+    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+    (p6) br.sptk kvm_alt_itlb_miss
+    mov r19 = 1
+    br.sptk kvm_itlb_miss_dispatch
+    // not reached: the branch above is unconditional
+    KVM_FAULT(1);
+END(kvm_itlb_miss)
+    .org kvm_ia64_ivt+0x0800
+//////////////////////////////////////////////////////////////////
+// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
+// Saves pr in r31 and tests the VM bit of cr.ipsr: when it is clear (p6)
+// the miss is passed to kvm_alt_dtlb_miss, otherwise control goes to
+// kvm_dtlb_miss_dispatch.
+// NOTE(review): unlike kvm_itlb_miss, r19 is not loaded before the
+// dispatch branch - confirm kvm_dtlb_miss_dispatch does not rely on it.
+ENTRY(kvm_dtlb_miss)
+    mov r31 = pr
+    mov r29=cr.ipsr;
+    ;;
+    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+(p6)br.sptk kvm_alt_dtlb_miss
+    br.sptk kvm_dtlb_miss_dispatch
+END(kvm_dtlb_miss)
+     .org kvm_ia64_ivt+0x0c00
+////////////////////////////////////////////////////////////////////
+// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
+// Builds a PTE from the faulting address itself: cr.ifa masked to
+// IA64_MAX_PHYS_BITS with the low 12 bits cleared, OR-ed with PAGE_KERNEL,
+// page size IA64_GRANULE_SHIFT, then inserts it with itc.i and returns
+// with rfi.
+// NOTE(review): pr is restored from r31, which this handler never loads;
+// kvm_itlb_miss sets it before branching here, but a direct entry through
+// this vector would restore pr from whatever r31 holds - confirm.
+ENTRY(kvm_alt_itlb_miss)
+    mov r16=cr.ifa    // get address that caused the TLB miss
+    ;;
+    movl r17=PAGE_KERNEL
+    mov r24=cr.ipsr   // loaded but not used below
+    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+    ;;
+    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+    ;;
+    or r19=r17,r19      // insert PTE control bits into r19
+    ;;
+    movl r20=IA64_GRANULE_SHIFT<<2
+    ;;
+    mov cr.itir=r20     // page size for the insert below
+    ;;
+    itc.i r19           // insert the TLB entry
+    mov pr=r31,-1
+    rfi
+END(kvm_alt_itlb_miss)
+    .org kvm_ia64_ivt+0x1000
+/////////////////////////////////////////////////////////////////////
+// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
+// Data-side twin of the Alt ITLB handler: cr.ifa masked to
+// IA64_MAX_PHYS_BITS with the low 12 bits cleared, OR-ed with PAGE_KERNEL,
+// page size IA64_GRANULE_SHIFT, inserted with itc.d, then rfi.
+// NOTE(review): pr is restored from r31, which this handler never loads;
+// kvm_dtlb_miss sets it before branching here, but a direct entry through
+// this vector would restore pr from whatever r31 holds - confirm.
+ENTRY(kvm_alt_dtlb_miss)
+    mov r16=cr.ifa              // get address that caused the TLB miss
+    ;;
+    movl r17=PAGE_KERNEL
+    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+    mov r24=cr.ipsr             // loaded but not used below
+    ;;
+    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+    ;;
+    or r19=r19,r17      // insert PTE control bits into r19
+    ;;
+    movl r20=IA64_GRANULE_SHIFT<<2
+    ;;
+    mov cr.itir=r20     // page size for the insert below
+    ;;
+    itc.d r19           // insert the TLB entry
+    mov pr=r31,-1
+    rfi
+END(kvm_alt_dtlb_miss)
+// Entries 5-10.  Entry 5 is not handled (KVM_FAULT spins with r19 = 5).
+// Entries 6-10 use KVM_REFLECT: with the VM bit set in cr.ipsr they go to
+// kvm_dispatch_reflection with r19 = vector number, otherwise kvm_panic.
+    .org kvm_ia64_ivt+0x1400
+//////////////////////////////////////////////////////////////////////
+// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
+ENTRY(kvm_nested_dtlb_miss)
+    KVM_FAULT(5)
+END(kvm_nested_dtlb_miss)
+    .org kvm_ia64_ivt+0x1800
+/////////////////////////////////////////////////////////////////////
+// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
+ENTRY(kvm_ikey_miss)
+    KVM_REFLECT(6)
+END(kvm_ikey_miss)
+    .org kvm_ia64_ivt+0x1c00
+/////////////////////////////////////////////////////////////////////
+// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
+ENTRY(kvm_dkey_miss)
+    KVM_REFLECT(7)
+END(kvm_dkey_miss)
+    .org kvm_ia64_ivt+0x2000
+////////////////////////////////////////////////////////////////////
+// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
+ENTRY(kvm_dirty_bit)
+    KVM_REFLECT(8)
+END(kvm_dirty_bit)
+    .org kvm_ia64_ivt+0x2400
+////////////////////////////////////////////////////////////////////
+// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
+ENTRY(kvm_iaccess_bit)
+    KVM_REFLECT(9)
+END(kvm_iaccess_bit)
+    .org kvm_ia64_ivt+0x2800
+///////////////////////////////////////////////////////////////////
+// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
+ENTRY(kvm_daccess_bit)
+    KVM_REFLECT(10)
+END(kvm_daccess_bit)
+    .org kvm_ia64_ivt+0x2c00
+/////////////////////////////////////////////////////////////////
+// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
+// Saves state with KVM_SAVE_MIN_WITH_COVER_R19 / KVM_SAVE_REST, turns
+// interruption collection back on and calls kvm_ia64_handle_break with
+// out0 = cr.ifa, out1 = sp + 16, out2 = cr.isr, out3 = cr.iim.  rp is set
+// to ia64_leave_hypervisor, so the C handler returns there.
+ENTRY(kvm_break_fault)
+    mov r31=pr
+    mov r19=11
+    mov r29=cr.ipsr
+    ;;
+    KVM_SAVE_MIN_WITH_COVER_R19
+    ;;
+    alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!)
+    mov out0=cr.ifa
+    mov out2=cr.isr     // FIXME: pity to make this slow access twice
+    mov out3=cr.iim     // FIXME: pity to make this slow access twice
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15)ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    KVM_SAVE_REST
+    mov rp=r14              // return address for the C handler
+    ;;
+    adds out1=16,sp         // NOTE(review): presumably the saved register frame - confirm
+    br.call.sptk.many b6=kvm_ia64_handle_break
+    ;;
+END(kvm_break_fault)
+    .org kvm_ia64_ivt+0x3000
+/////////////////////////////////////////////////////////////////
+// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
+// If the VM bit is set in cr.ipsr (p7) the interrupt is handed to
+// kvm_dispatch_interrupt.  Otherwise a register frame of
+// VMM_PT_REGS_SIZE bytes is built by hand below the current sp, the gp is
+// loaded from r13 + VMM_VCPU_GP_OFFSET, register bank 1 is selected,
+// interruption collection is re-enabled and kvm_ia64_handle_irq is called
+// with out0 = r13 and rp = ia64_leave_nested.
+ENTRY(kvm_interrupt)
+    mov r31=pr          // prepare to save predicates
+    mov r19=12
+    mov r29=cr.ipsr
+    ;;
+    tbit.z p6,p7=r29,IA64_PSR_VM_BIT
+    tbit.z p0,p15=r29,IA64_PSR_I_BIT
+    ;;
+(p7) br.sptk kvm_dispatch_interrupt
+    ;;
+    // From here on the VM bit was clear: save the frame by hand.
+    mov r27=ar.rsc              /* M */
+    mov r20=r1                  /* A */
+    mov r25=ar.unat             /* M */
+    mov r26=ar.pfs              /* I */
+    mov r28=cr.iip              /* M */
+    cover                       /* B (or nothing) */
+    ;;
+    mov r1=sp
+    ;;
+    invala                      /* M */
+    mov r30=cr.ifs
+    ;;
+    addl r1=-VMM_PT_REGS_SIZE,r1
+    ;;
+    adds r17=2*L1_CACHE_BYTES,r1        /* really: biggest cache-line size */
+    adds r16=PT(CR_IPSR),r1
+    ;;
+    lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
+    st8 [r16]=r29                       /* save cr.ipsr */
+    ;;
+    lfetch.fault.excl.nt1 [r17]
+    mov r29=b0
+    ;;
+    adds r16=PT(R8),r1          /* initialize first base pointer */
+    adds r17=PT(R9),r1          /* initialize second base pointer */
+    mov r18=r0                  /* make sure r18 isn't NaT */
+    ;;
+.mem.offset 0,0; st8.spill [r16]=r8,16
+.mem.offset 8,0; st8.spill [r17]=r9,16
+        ;;
+.mem.offset 0,0; st8.spill [r16]=r10,24
+.mem.offset 8,0; st8.spill [r17]=r11,24
+        ;;
+    st8 [r16]=r28,16            /* save cr.iip */
+    st8 [r17]=r30,16            /* save cr.ifs */
+    mov r8=ar.fpsr              /* M */
+    mov r9=ar.csd
+    mov r10=ar.ssd
+    movl r11=FPSR_DEFAULT       /* L-unit */
+    ;;
+    st8 [r16]=r25,16            /* save ar.unat */
+    st8 [r17]=r26,16            /* save ar.pfs */
+    shl r18=r18,16              /* compute ar.rsc to be used for "loadrs" */
+    ;;
+    st8 [r16]=r27,16            /* save ar.rsc */
+    adds r17=16,r17             /* skip over ar_rnat field */
+    ;;
+    st8 [r17]=r31,16            /* save predicates */
+    adds r16=16,r16             /* skip over ar_bspstore field */
+    ;;
+    st8 [r16]=r29,16            /* save b0 */
+    st8 [r17]=r18,16            /* save ar.rsc value for "loadrs" */
+    ;;
+.mem.offset 0,0; st8.spill [r16]=r20,16    /* save original r1 */
+.mem.offset 8,0; st8.spill [r17]=r12,16
+    adds r12=-16,r1
+    /* switch to kernel memory stack (with 16 bytes of scratch) */
+    ;;
+.mem.offset 0,0; st8.spill [r16]=r13,16
+.mem.offset 8,0; st8.spill [r17]=r8,16 /* save ar.fpsr */
+    ;;
+.mem.offset 0,0; st8.spill [r16]=r15,16
+.mem.offset 8,0; st8.spill [r17]=r14,16
+    dep r14=-1,r0,60,4
+    ;;
+.mem.offset 0,0; st8.spill [r16]=r2,16
+.mem.offset 8,0; st8.spill [r17]=r3,16
+    adds r2=VMM_PT_REGS_R16_OFFSET,r1
+    adds r14 = VMM_VCPU_GP_OFFSET,r13
+    ;;
+    mov r8=ar.ccv
+    ld8 r14 = [r14]
+    ;;
+    mov r1=r14       /* establish kernel global pointer */
+    ;;
+    bsw.1            // switch to register bank 1
+    ;;
+    alloc r14=ar.pfs,0,0,1,0    // must be first in an insn group
+    mov out0=r13
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i
+    ;;
+    //(p15) ssm psr.i
+    adds r3=8,r2                // set up second base pointer for SAVE_REST
+    srlz.i                      // ensure everybody knows psr.ic is back on
+    ;;
+    // Save r16-r31, then ar.ccv, f6-f11, b6/b7 and ar.csd/ar.ssd.
+.mem.offset 0,0; st8.spill [r2]=r16,16
+.mem.offset 8,0; st8.spill [r3]=r17,16
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r18,16
+.mem.offset 8,0; st8.spill [r3]=r19,16
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r20,16
+.mem.offset 8,0; st8.spill [r3]=r21,16
+    mov r18=b6
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r22,16
+.mem.offset 8,0; st8.spill [r3]=r23,16
+    mov r19=b7
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r24,16
+.mem.offset 8,0; st8.spill [r3]=r25,16
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r26,16
+.mem.offset 8,0; st8.spill [r3]=r27,16
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r28,16
+.mem.offset 8,0; st8.spill [r3]=r29,16
+    ;;
+.mem.offset 0,0; st8.spill [r2]=r30,16
+.mem.offset 8,0; st8.spill [r3]=r31,32
+    ;;
+    mov ar.fpsr=r11       /* M-unit */
+    st8 [r2]=r8,8         /* ar.ccv */
+    adds r24=PT(B6)-PT(F7),r3
+    ;;
+    stf.spill [r2]=f6,32
+    stf.spill [r3]=f7,32
+    ;;
+    stf.spill [r2]=f8,32
+    stf.spill [r3]=f9,32
+    ;;
+    stf.spill [r2]=f10
+    stf.spill [r3]=f11
+    adds r25=PT(B7)-PT(F11),r3
+    ;;
+    st8 [r24]=r18,16       /* b6 */
+    st8 [r25]=r19,16       /* b7 */
+    ;;
+    st8 [r24]=r9           /* ar.csd */
+    st8 [r25]=r10          /* ar.ssd */
+    ;;
+    srlz.d              // make sure we see the effect of cr.ivr
+    addl r14=@gprel(ia64_leave_nested),gp
+    ;;
+    mov rp=r14          // the C handler returns to ia64_leave_nested
+    br.call.sptk.many b6=kvm_ia64_handle_irq
+    ;;
+END(kvm_interrupt)
+    .global kvm_dispatch_vexirq
+    .org kvm_ia64_ivt+0x3400
+//////////////////////////////////////////////////////////////////////
+// 0x3400 Entry 13 (size 64 bundles) Reserved
+// Two ways in: through the vector (r30 is cleared, so the reload of r1 is
+// skipped) or through the exported label kvm_dispatch_vexirq, where a
+// caller that arrives with r30 == 1 gets r1 reloaded from
+// r21 + VMM_VCPU_SAVED_GP_OFFSET.  Either way state is saved and
+// kvm_vexirq is called with out0 = r13 and rp = ia64_leave_hypervisor.
+ENTRY(kvm_virtual_exirq)
+    mov r31=pr
+    mov r19=13
+    mov r30 =r0
+    ;;
+kvm_dispatch_vexirq:
+    cmp.eq p6,p0 = 1,r30
+    ;;
+(p6)add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
+    ;;
+(p6)ld8 r1 = [r29]
+    ;;
+    KVM_SAVE_MIN_WITH_COVER_R19
+    alloc r14=ar.pfs,0,0,1,0
+    mov out0=r13
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    KVM_SAVE_REST
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    mov rp=r14
+    br.call.sptk.many b6=kvm_vexirq
+END(kvm_virtual_exirq)
+// Entries 14-19 are reserved vectors; each one spins at its own
+// kvm_fault_<n> label with r19 = n (see KVM_FAULT).
+    .org kvm_ia64_ivt+0x3800
+/////////////////////////////////////////////////////////////////////
+// 0x3800 Entry 14 (size 64 bundles) Reserved
+    KVM_FAULT(14)
+    // this code segment is from 2.6.16.13
+    .org kvm_ia64_ivt+0x3c00
+///////////////////////////////////////////////////////////////////////
+// 0x3c00 Entry 15 (size 64 bundles) Reserved
+    KVM_FAULT(15)
+    .org kvm_ia64_ivt+0x4000
+///////////////////////////////////////////////////////////////////////
+// 0x4000 Entry 16 (size 64 bundles) Reserved
+    KVM_FAULT(16)
+    .org kvm_ia64_ivt+0x4400
+//////////////////////////////////////////////////////////////////////
+// 0x4400 Entry 17 (size 64 bundles) Reserved
+    KVM_FAULT(17)
+    .org kvm_ia64_ivt+0x4800
+//////////////////////////////////////////////////////////////////////
+// 0x4800 Entry 18 (size 64 bundles) Reserved
+    KVM_FAULT(18)
+    .org kvm_ia64_ivt+0x4c00
+//////////////////////////////////////////////////////////////////////
+// 0x4c00 Entry 19 (size 64 bundles) Reserved
+    KVM_FAULT(19)
+// Entries 20-23 (first of the 16-bundle slots) all use KVM_REFLECT: with
+// the VM bit set in cr.ipsr they go to kvm_dispatch_reflection with
+// r19 = vector number, otherwise to kvm_panic.
+    .org kvm_ia64_ivt+0x5000
+//////////////////////////////////////////////////////////////////////
+// 0x5000 Entry 20 (size 16 bundles) Page Not Present
+ENTRY(kvm_page_not_present)
+    KVM_REFLECT(20)
+END(kvm_page_not_present)
+    .org kvm_ia64_ivt+0x5100
+///////////////////////////////////////////////////////////////////////
+// 0x5100 Entry 21 (size 16 bundles) Key Permission vector
+ENTRY(kvm_key_permission)
+    KVM_REFLECT(21)
+END(kvm_key_permission)
+    .org kvm_ia64_ivt+0x5200
+//////////////////////////////////////////////////////////////////////
+// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
+ENTRY(kvm_iaccess_rights)
+    KVM_REFLECT(22)
+END(kvm_iaccess_rights)
+    .org kvm_ia64_ivt+0x5300
+//////////////////////////////////////////////////////////////////////
+// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
+ENTRY(kvm_daccess_rights)
+    KVM_REFLECT(23)
+END(kvm_daccess_rights)
+    .org kvm_ia64_ivt+0x5400
+/////////////////////////////////////////////////////////////////////
+// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
+// KVM_REFLECT always leaves through one of its two branches, so the
+// KVM_FAULT(24) that follows is never executed; it only defines the
+// kvm_fault_24 label.
+ENTRY(kvm_general_exception)
+   KVM_REFLECT(24)
+   KVM_FAULT(24)
+END(kvm_general_exception)
+// Entries 25-36.  All use KVM_REFLECT (reflect when the VM bit is set in
+// cr.ipsr, otherwise kvm_panic) except entry 28 (reserved) and entry 29
+// (debug), which spin in KVM_FAULT.
+    .org kvm_ia64_ivt+0x5500
+//////////////////////////////////////////////////////////////////////
+// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
+ENTRY(kvm_disabled_fp_reg)
+    KVM_REFLECT(25)
+END(kvm_disabled_fp_reg)
+    .org kvm_ia64_ivt+0x5600
+////////////////////////////////////////////////////////////////////
+// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
+ENTRY(kvm_nat_consumption)
+    KVM_REFLECT(26)
+END(kvm_nat_consumption)
+    .org kvm_ia64_ivt+0x5700
+/////////////////////////////////////////////////////////////////////
+// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
+ENTRY(kvm_speculation_vector)
+    KVM_REFLECT(27)
+END(kvm_speculation_vector)
+    .org kvm_ia64_ivt+0x5800
+/////////////////////////////////////////////////////////////////////
+// 0x5800 Entry 28 (size 16 bundles) Reserved
+    KVM_FAULT(28)
+    .org kvm_ia64_ivt+0x5900
+///////////////////////////////////////////////////////////////////
+// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
+ENTRY(kvm_debug_vector)
+    KVM_FAULT(29)
+END(kvm_debug_vector)
+    .org kvm_ia64_ivt+0x5a00
+///////////////////////////////////////////////////////////////
+// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
+ENTRY(kvm_unaligned_access)
+    KVM_REFLECT(30)
+END(kvm_unaligned_access)
+    .org kvm_ia64_ivt+0x5b00
+//////////////////////////////////////////////////////////////////////
+// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
+ENTRY(kvm_unsupported_data_reference)
+    KVM_REFLECT(31)
+END(kvm_unsupported_data_reference)
+    .org kvm_ia64_ivt+0x5c00
+////////////////////////////////////////////////////////////////////
+// 0x5c00 Entry 32 (size 16 bundles) Floating Point FAULT (65)
+ENTRY(kvm_floating_point_fault)
+    KVM_REFLECT(32)
+END(kvm_floating_point_fault)
+    .org kvm_ia64_ivt+0x5d00
+/////////////////////////////////////////////////////////////////////
+// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
+ENTRY(kvm_floating_point_trap)
+    KVM_REFLECT(33)
+END(kvm_floating_point_trap)
+    .org kvm_ia64_ivt+0x5e00
+//////////////////////////////////////////////////////////////////////
+// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
+ENTRY(kvm_lower_privilege_trap)
+    KVM_REFLECT(34)
+END(kvm_lower_privilege_trap)
+    .org kvm_ia64_ivt+0x5f00
+//////////////////////////////////////////////////////////////////////
+// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
+ENTRY(kvm_taken_branch_trap)
+    KVM_REFLECT(35)
+END(kvm_taken_branch_trap)
+    .org kvm_ia64_ivt+0x6000
+////////////////////////////////////////////////////////////////////
+// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
+ENTRY(kvm_single_step_trap)
+    KVM_REFLECT(36)
+END(kvm_single_step_trap)
+    .global kvm_virtualization_fault_back
+    .org kvm_ia64_ivt+0x6100
+/////////////////////////////////////////////////////////////////////
+// 0x6100 Entry 37 (size 16 bundles) Virtualization Fault
+// Fast path: r24 is compared against seven EVENT_* codes and a match
+// branches straight to the corresponding kvm_asm_* routine.
+// Slow path (also the exported re-entry label
+// kvm_virtualization_fault_back): r1 is restored, r24 and r25 are stored
+// at the VMM_VCPU_CAUSE_OFFSET and VMM_VCPU_OPCODE_OFFSET slots, and
+// control goes to kvm_dispatch_virtualization_fault with r19 = 37.
+// NOTE(review): r21 is used as the base for every VMM_VCPU_* offset, so it
+// presumably holds the current vcpu pointer - confirm.
+ENTRY(kvm_virtualization_fault)
+    mov r31=pr
+    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+    ;;
+    st8 [r16] = r1              // stash incoming r1 in the saved-gp slot
+    adds r17 = VMM_VCPU_GP_OFFSET, r21
+    ;;
+    ld8 r1 = [r17]              // load r1 from the VMM_VCPU_GP_OFFSET slot
+    cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
+    cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
+    cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
+    cmp.eq p9,p0=EVENT_RSM,r24
+    cmp.eq p10,p0=EVENT_SSM,r24
+    cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
+    cmp.eq p12,p0=EVENT_THASH,r24
+    (p6) br.dptk.many kvm_asm_mov_from_ar
+    (p7) br.dptk.many kvm_asm_mov_from_rr
+    (p8) br.dptk.many kvm_asm_mov_to_rr
+    (p9) br.dptk.many kvm_asm_rsm
+    (p10) br.dptk.many kvm_asm_ssm
+    (p11) br.dptk.many kvm_asm_mov_to_psr
+    (p12) br.dptk.many kvm_asm_thash
+    ;;
+kvm_virtualization_fault_back:
+    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+    ;;
+    ld8 r1 = [r16]              // restore the r1 stashed on entry
+    ;;
+    mov r19=37
+    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+    ;;
+    st8 [r16] = r24             // record the cause
+    st8 [r17] = r25             // record the opcode
+    ;;
+    cmp.ne p6,p0=EVENT_RFI, r24
+    (p6) br.sptk kvm_dispatch_virtualization_fault
+    ;;
+    // EVENT_RFI only: fetch the vifs value through the VPD base pointer
+    adds r18=VMM_VPD_BASE_OFFSET,r21
+    ;;
+    ld8 r18=[r18]
+    ;;
+    adds r18=VMM_VPD_VIFS_OFFSET,r18
+    ;;
+    ld8 r18=[r18]
+    ;;
+    tbit.z p6,p0=r18,63         // bit 63 clear: nothing more to do
+    (p6) br.sptk kvm_dispatch_virtualization_fault
+    ;;
+    // if vifs.v=1, discard the current register frame
+    alloc r18=ar.pfs,0,0,0,0
+    br.sptk kvm_dispatch_virtualization_fault
+END(kvm_virtualization_fault)
+// Entries 38-44 are reserved vectors; each one spins at its own
+// kvm_fault_<n> label with r19 = n (see KVM_FAULT).
+    .org kvm_ia64_ivt+0x6200
+//////////////////////////////////////////////////////////////
+// 0x6200 Entry 38 (size 16 bundles) Reserved
+    KVM_FAULT(38)
+    .org kvm_ia64_ivt+0x6300
+/////////////////////////////////////////////////////////////////
+// 0x6300 Entry 39 (size 16 bundles) Reserved
+    KVM_FAULT(39)
+    .org kvm_ia64_ivt+0x6400
+/////////////////////////////////////////////////////////////////
+// 0x6400 Entry 40 (size 16 bundles) Reserved
+    KVM_FAULT(40)
+    .org kvm_ia64_ivt+0x6500
+//////////////////////////////////////////////////////////////////
+// 0x6500 Entry 41 (size 16 bundles) Reserved
+    KVM_FAULT(41)
+    .org kvm_ia64_ivt+0x6600
+//////////////////////////////////////////////////////////////////
+// 0x6600 Entry 42 (size 16 bundles) Reserved
+    KVM_FAULT(42)
+    .org kvm_ia64_ivt+0x6700
+//////////////////////////////////////////////////////////////////
+// 0x6700 Entry 43 (size 16 bundles) Reserved
+    KVM_FAULT(43)
+    .org kvm_ia64_ivt+0x6800
+//////////////////////////////////////////////////////////////////
+// 0x6800 Entry 44 (size 16 bundles) Reserved
+    KVM_FAULT(44)
+    .org kvm_ia64_ivt+0x6900
+///////////////////////////////////////////////////////////////////
+// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception
+//(17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
+// Not handled: spins at kvm_fault_45 with r19 = 45.
+ENTRY(kvm_ia32_exception)
+    KVM_FAULT(45)
+END(kvm_ia32_exception)
+    .org kvm_ia64_ivt+0x6a00
+////////////////////////////////////////////////////////////////////
+// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
+// Not handled: spins at kvm_fault_46 with r19 = 46, the number of this
+// vector.
+ENTRY(kvm_ia32_intercept)
+    KVM_FAULT(46)
+END(kvm_ia32_intercept)
+    .org kvm_ia64_ivt+0x6b00
+////////////////////////////////////////////////////////////////////
+// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
+// Not handled either.  Giving the slot its own KVM_FAULT keeps the
+// kvm_fault_47 label defined and makes the vector spin in a known place
+// instead of running whatever padding the next .org leaves behind.
+    KVM_FAULT(47)
+// Entries 48-50 are reserved vectors; each one spins at its own
+// kvm_fault_<n> label with r19 = n (see KVM_FAULT).
+    .org kvm_ia64_ivt+0x6c00
+/////////////////////////////////////////////////////////////////////
+// 0x6c00 Entry 48 (size 16 bundles) Reserved
+    KVM_FAULT(48)
+    .org kvm_ia64_ivt+0x6d00
+//////////////////////////////////////////////////////////////////////
+// 0x6d00 Entry 49 (size 16 bundles) Reserved
+    KVM_FAULT(49)
+    .org kvm_ia64_ivt+0x6e00
+//////////////////////////////////////////////////////////////////////
+// 0x6e00 Entry 50 (size 16 bundles) Reserved
+    KVM_FAULT(50)
+    .org kvm_ia64_ivt+0x6f00
+/////////////////////////////////////////////////////////////////////
+// 0x6f00 Entry 51 (size 16 bundles) Reserved
+    KVM_FAULT(51)               // report this slot's own entry number (was 52)
+    .org kvm_ia64_ivt+0x7100    // NOTE(review): previous slot is 0x6f00; nothing is emitted at 0x7000 (entry 52)
+////////////////////////////////////////////////////////////////////
+// 0x7100 Entry 53 (size 16 bundles) Reserved
+    KVM_FAULT(53)
+    .org kvm_ia64_ivt+0x7200
+/////////////////////////////////////////////////////////////////////
+// 0x7200 Entry 54 (size 16 bundles) Reserved
+    KVM_FAULT(54)
+    .org kvm_ia64_ivt+0x7300
+////////////////////////////////////////////////////////////////////
+// 0x7300 Entry 55 (size 16 bundles) Reserved
+    KVM_FAULT(55)
+    .org kvm_ia64_ivt+0x7400
+////////////////////////////////////////////////////////////////////
+// 0x7400 Entry 56 (size 16 bundles) Reserved
+    KVM_FAULT(56)
+    .org kvm_ia64_ivt+0x7500
+/////////////////////////////////////////////////////////////////////
+// 0x7500 Entry 57 (size 16 bundles) Reserved
+    KVM_FAULT(57)
+    .org kvm_ia64_ivt+0x7600
+/////////////////////////////////////////////////////////////////////
+// 0x7600 Entry 58 (size 16 bundles) Reserved
+    KVM_FAULT(58)
+    .org kvm_ia64_ivt+0x7700
+////////////////////////////////////////////////////////////////////
+// 0x7700 Entry 59 (size 16 bundles) Reserved
+    KVM_FAULT(59)
+    .org kvm_ia64_ivt+0x7800
+////////////////////////////////////////////////////////////////////
+// 0x7800 Entry 60 (size 16 bundles) Reserved
+    KVM_FAULT(60)
+    .org kvm_ia64_ivt+0x7900
+/////////////////////////////////////////////////////////////////////
+// 0x7900 Entry 61 (size 16 bundles) Reserved
+    KVM_FAULT(61)
+    .org kvm_ia64_ivt+0x7a00
+/////////////////////////////////////////////////////////////////////
+// 0x7a00 Entry 62 (size 16 bundles) Reserved
+    KVM_FAULT(62)
+    .org kvm_ia64_ivt+0x7b00
+/////////////////////////////////////////////////////////////////////
+// 0x7b00 Entry 63 (size 16 bundles) Reserved
+    KVM_FAULT(63)
+    .org kvm_ia64_ivt+0x7c00
+////////////////////////////////////////////////////////////////////
+// 0x7c00 Entry 64 (size 16 bundles) Reserved
+    KVM_FAULT(64)
+    .org kvm_ia64_ivt+0x7d00
+/////////////////////////////////////////////////////////////////////
+// 0x7d00 Entry 65 (size 16 bundles) Reserved
+    KVM_FAULT(65)
+    .org kvm_ia64_ivt+0x7e00
+/////////////////////////////////////////////////////////////////////
+// 0x7e00 Entry 66 (size 16 bundles) Reserved
+    KVM_FAULT(66)
+    .org kvm_ia64_ivt+0x7f00
+////////////////////////////////////////////////////////////////////
+// 0x7f00 Entry 67 (size 16 bundles) Reserved
+    KVM_FAULT(67)
+    .org kvm_ia64_ivt+0x8000
+// There is no particular reason for this code to be here, other than that
+// there happens to be space here that would go unused otherwise.  If this
+// fault ever gets "unreserved", simply move the following code to a more
+// suitable spot...
+ENTRY(kvm_dtlb_miss_dispatch)   // calls kvm_page_fault(cr.ifa, r15, r12+16)
+    mov r19 = 2                 // set before KVM_SAVE_MIN_WITH_COVER_R19; the itlb variant does not set r19 itself
+    KVM_SAVE_MIN_WITH_COVER_R19
+    alloc r14=ar.pfs,0,0,3,0    // three output registers for the call below
+    mov out0=cr.ifa
+    mov out1=r15
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+    ;;
+    KVM_SAVE_REST
+    KVM_SAVE_EXTRA
+    mov rp=r14                  // rp = ia64_leave_hypervisor_prepare, so the callee returns there
+    ;;
+    adds out2=16,r12
+    br.call.sptk.many b6=kvm_page_fault
+END(kvm_dtlb_miss_dispatch)
+ENTRY(kvm_itlb_miss_dispatch)   // calls kvm_page_fault(cr.ifa, r15, r12+16)
+    KVM_SAVE_MIN_WITH_COVER_R19 // r19 is not set here; presumably the branching code sets it -- TODO confirm
+    alloc r14=ar.pfs,0,0,3,0    // three output registers for the call below
+    mov out0=cr.ifa
+    mov out1=r15
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    KVM_SAVE_REST               // unlike the dtlb variant, KVM_SAVE_EXTRA is not used here
+    mov rp=r14                  // rp = ia64_leave_hypervisor, so the callee returns there
+    ;;
+    adds out2=16,r12
+    br.call.sptk.many b6=kvm_page_fault
+END(kvm_itlb_miss_dispatch)
+ENTRY(kvm_dispatch_reflection)  // calls reflect_interruption(cr.ifa, cr.isr, cr.iim, r15, r12+16)
+    /*
+     * Input:
+     *  psr.ic: off
+     *  r19:    intr type (offset into ivt, see ia64_int.h)
+     *  r31:    contains saved predicates (pr)
+     */
+    KVM_SAVE_MIN_WITH_COVER_R19
+    alloc r14=ar.pfs,0,0,5,0    // five output registers for the call below
+    mov out0=cr.ifa
+    mov out1=cr.isr
+    mov out2=cr.iim
+    mov out3=r15
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    KVM_SAVE_REST
+    mov rp=r14                  // rp = ia64_leave_hypervisor, so the callee returns there
+    ;;
+    adds out4=16,r12
+    br.call.sptk.many b6=reflect_interruption
+END(kvm_dispatch_reflection)
+ENTRY(kvm_dispatch_virtualization_fault)    // calls kvm_emulate(r13, sp+16)
+    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+    ;;
+    st8 [r16] = r24             // store r24 at r21 + VMM_VCPU_CAUSE_OFFSET
+    st8 [r17] = r25             // store r25 at r21 + VMM_VCPU_OPCODE_OFFSET
+    ;;
+    KVM_SAVE_MIN_WITH_COVER_R19
+    ;;
+    alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
+    mov out0=r13        //vcpu
+    adds r3=8,r2                // set up second base pointer
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+    ;;
+    KVM_SAVE_REST
+    KVM_SAVE_EXTRA
+    mov rp=r14                  // rp = ia64_leave_hypervisor_prepare, so the callee returns there
+    ;;
+    adds out1=16,sp         //regs
+    br.call.sptk.many b6=kvm_emulate
+END(kvm_dispatch_virtualization_fault)
+ENTRY(kvm_dispatch_interrupt)   // calls kvm_ia64_handle_irq(r13)
+    KVM_SAVE_MIN_WITH_COVER_R19 // uses r31; defines r2 and r3
+    ;;
+    alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
+    //mov out0=cr.ivr           // pass cr.ivr as first arg
+    adds r3=8,r2                // set up second base pointer for SAVE_REST
+    ;;
+    ssm psr.ic
+    ;;
+    srlz.i
+    ;;
+    //(p15) ssm psr.i
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    KVM_SAVE_REST
+    mov rp=r14                  // rp = ia64_leave_hypervisor, so the callee returns there
+    ;;
+    mov out0=r13                // pass r13 as the only argument (one output register is allocated above)
+    br.call.sptk.many b6=kvm_ia64_handle_irq
+END(kvm_dispatch_interrupt)
+GLOBAL_ENTRY(ia64_leave_nested) // reload the register image saved at r12+16, then rfi
+        rsm psr.i
+        ;;
+        adds r21=PT(PR)+16,r12
+        ;;
+        lfetch [r21],PT(CR_IPSR)-PT(PR)
+        adds r2=PT(B6)+16,r12
+        adds r3=PT(R16)+16,r12
+        ;;
+        lfetch [r21]
+        ld8 r28=[r2],8          // load b6
+        adds r29=PT(R24)+16,r12
+        ld8.fill r16=[r3]
+        adds r3=PT(AR_CSD)-PT(R16),r3
+        adds r30=PT(AR_CCV)+16,r12
+        ;;
+        ld8.fill r24=[r29]
+        ld8 r15=[r30]           // load ar.ccv
+        ;;
+        ld8 r29=[r2],16         // load b7
+        ld8 r30=[r3],16         // load ar.csd
+        ;;
+        ld8 r31=[r2],16         // load ar.ssd
+        ld8.fill r8=[r3],16
+        ;;
+        ld8.fill r9=[r2],16
+        ld8.fill r10=[r3],PT(R17)-PT(R10)
+        ;;
+        ld8.fill r11=[r2],PT(R18)-PT(R11)
+        ld8.fill r17=[r3],16
+        ;;
+        ld8.fill r18=[r2],16
+        ld8.fill r19=[r3],16
+        ;;
+        ld8.fill r20=[r2],16
+        ld8.fill r21=[r3],16
+        mov ar.csd=r30
+        mov ar.ssd=r31
+        ;;
+        rsm psr.i | psr.ic
+        // initiate turning off of interrupt and interruption collection
+        invala                  // invalidate ALAT
+        ;;
+        srlz.i
+        ;;
+        ld8.fill r22=[r2],24
+        ld8.fill r23=[r3],24
+        mov b6=r28
+        ;;
+        ld8.fill r25=[r2],16
+        ld8.fill r26=[r3],16
+        mov b7=r29
+        ;;
+        ld8.fill r27=[r2],16
+        ld8.fill r28=[r3],16
+        ;;
+        ld8.fill r29=[r2],16
+        ld8.fill r30=[r3],24
+        ;;
+        ld8.fill r31=[r2],PT(F9)-PT(R31)
+        adds r3=PT(F10)-PT(F6),r3
+        ;;
+        ldf.fill f9=[r2],PT(F6)-PT(F9)
+        ldf.fill f10=[r3],PT(F8)-PT(F10)
+        ;;
+        ldf.fill f6=[r2],PT(F7)-PT(F6)
+        ;;
+        ldf.fill f7=[r2],PT(F11)-PT(F7)
+        ldf.fill f8=[r3],32
+        ;;
+        srlz.i                  // ensure interruption collection is off
+        mov ar.ccv=r15
+        ;;
+        bsw.0   // switch back to bank 0 (no stop bit required beforehand...)
+        ;;
+        ldf.fill f11=[r2]
+//      mov r18=r13
+//    mov r21=r13
+        adds r16=PT(CR_IPSR)+16,r12
+        adds r17=PT(CR_IIP)+16,r12
+        ;;
+        ld8 r29=[r16],16        // load cr.ipsr
+        ld8 r28=[r17],16        // load cr.iip
+        ;;
+        ld8 r30=[r16],16        // load cr.ifs
+        ld8 r25=[r17],16        // load ar.unat
+        ;;
+        ld8 r26=[r16],16        // load ar.pfs
+        ld8 r27=[r17],16        // load ar.rsc
+        cmp.eq p9,p0=r0,r0
+        // set p9 to indicate that we should restore cr.ifs
+        ;;
+        ld8 r24=[r16],16        // load ar.rnat (may be garbage)
+        ld8 r23=[r17],16        // load ar.bspstore (may be garbage)
+        ;;
+        ld8 r31=[r16],16        // load predicates
+        ld8 r22=[r17],16        // load b0
+        ;;
+        ld8 r19=[r16],16        // load ar.rsc value for "loadrs"
+        ld8.fill r1=[r17],16    // load r1
+        ;;
+        ld8.fill r12=[r16],16
+        ld8.fill r13=[r17],16
+        ;;
+        ld8 r20=[r16],16        // ar.fpsr
+        ld8.fill r15=[r17],16
+        ;;
+        ld8.fill r14=[r16],16
+        ld8.fill r2=[r17]
+        ;;
+        ld8.fill r3=[r16]
+        ;;
+        mov r16=ar.bsp          // get existing backing store pointer
+        ;;
+        mov b0=r22
+        mov ar.pfs=r26
+        mov cr.ifs=r30
+        mov cr.ipsr=r29
+        mov ar.fpsr=r20
+        mov cr.iip=r28
+        ;;
+        mov ar.rsc=r27
+        mov ar.unat=r25
+        mov pr=r31,-1
+        rfi
+END(ia64_leave_nested)
+GLOBAL_ENTRY(ia64_leave_hypervisor_prepare)
+    /*
+     * work.need_resched etc. mustn't get changed by this CPU before it
+     * returns to user- or fsys-mode, hence we disable interrupts early on
+     * (NOTE(review): the rsm psr.i itself is in ia64_leave_hypervisor; this
+     * routine only reloads ar.unat and r4-r7 and then falls through into it).
+     */
+    adds r2 = PT(R4)+16,r12
+    adds r3 = PT(R5)+16,r12
+    adds r8 = PT(EML_UNAT)+16,r12
+    ;;
+    ld8 r8 = [r8]
+    ;;
+    mov ar.unat=r8          // set ar.unat from the EML_UNAT slot before the fills below
+    ;;
+    ld8.fill r4=[r2],16    //load r4
+    ld8.fill r5=[r3],16    //load r5
+    ;;
+    ld8.fill r6=[r2]    //load r6
+    ld8.fill r7=[r3]    //load r7
+    ;;
+END(ia64_leave_hypervisor_prepare)
+//fall through into ia64_leave_hypervisor
+GLOBAL_ENTRY(ia64_leave_hypervisor) // reload the register image saved at r12+16, then branch to PAL_VPS_SYNC_WRITE
+    rsm psr.i
+    ;;
+    br.call.sptk.many b0=leave_hypervisor_tail
+    ;;
+    adds r20=PT(PR)+16,r12
+    adds r8=PT(EML_UNAT)+16,r12
+    ;;
+    ld8 r8=[r8]
+    ;;
+    mov ar.unat=r8
+    ;;
+    lfetch [r20],PT(CR_IPSR)-PT(PR)
+    adds r2 = PT(B6)+16,r12
+    adds r3 = PT(B7)+16,r12
+    ;;
+    lfetch [r20]
+    ;;
+    ld8 r24=[r2],16        /* B6 */
+    ld8 r25=[r3],16        /* B7 */
+    ;;
+    ld8 r26=[r2],16        /* ar_csd */
+    ld8 r27=[r3],16        /* ar_ssd */
+    mov b6 = r24
+    ;;
+    ld8.fill r8=[r2],16
+    ld8.fill r9=[r3],16
+    mov b7 = r25
+    ;;
+    mov ar.csd = r26
+    mov ar.ssd = r27
+    ;;
+    ld8.fill r10=[r2],PT(R15)-PT(R10)
+    ld8.fill r11=[r3],PT(R14)-PT(R11)
+    ;;
+    ld8.fill r15=[r2],PT(R16)-PT(R15)
+    ld8.fill r14=[r3],PT(R17)-PT(R14)
+    ;;
+    ld8.fill r16=[r2],16
+    ld8.fill r17=[r3],16
+    ;;
+    ld8.fill r18=[r2],16
+    ld8.fill r19=[r3],16
+    ;;
+    ld8.fill r20=[r2],16
+    ld8.fill r21=[r3],16
+    ;;
+    ld8.fill r22=[r2],16
+    ld8.fill r23=[r3],16
+    ;;
+    ld8.fill r24=[r2],16
+    ld8.fill r25=[r3],16
+    ;;
+    ld8.fill r26=[r2],16
+    ld8.fill r27=[r3],16
+    ;;
+    ld8.fill r28=[r2],16
+    ld8.fill r29=[r3],16
+    ;;
+    ld8.fill r30=[r2],PT(F6)-PT(R30)
+    ld8.fill r31=[r3],PT(F7)-PT(R31)
+    ;;
+    rsm psr.i | psr.ic
+    // initiate turning off of interrupt and interruption collection
+    invala          // invalidate ALAT
+    ;;
+    srlz.i          // ensure interruption collection is off
+    ;;
+    bsw.0
+    ;;
+    adds r16 = PT(CR_IPSR)+16,r12
+    adds r17 = PT(CR_IIP)+16,r12
+    mov r21=r13         // get current
+    ;;
+    ld8 r31=[r16],16    // load cr.ipsr
+    ld8 r30=[r17],16    // load cr.iip
+    ;;
+    ld8 r29=[r16],16    // load cr.ifs
+    ld8 r28=[r17],16    // load ar.unat
+    ;;
+    ld8 r27=[r16],16    // load ar.pfs
+    ld8 r26=[r17],16    // load ar.rsc
+    ;;
+    ld8 r25=[r16],16    // load ar.rnat
+    ld8 r24=[r17],16    // load ar.bspstore
+    ;;
+    ld8 r23=[r16],16    // load predicates
+    ld8 r22=[r17],16    // load b0
+    ;;
+    ld8 r20=[r16],16    // load ar.rsc value for "loadrs"
+    ld8.fill r1=[r17],16    //load r1
+    ;;
+    ld8.fill r12=[r16],16    //load r12
+    ld8.fill r13=[r17],PT(R2)-PT(R13)    //load r13
+    ;;
+    ld8 r19=[r16],PT(R3)-PT(AR_FPSR)    //load ar_fpsr
+    ld8.fill r2=[r17],PT(AR_CCV)-PT(R2)    //load r2
+    ;;
+    ld8.fill r3=[r16]   //load r3
+    ld8 r18=[r17]       //load ar_ccv
+    ;;
+    mov ar.fpsr=r19
+    mov ar.ccv=r18
+    shr.u r18=r20,16
+    ;;
+kvm_rbs_switch:
+    mov r19=96
+kvm_dont_preserve_current_frame:
+/*
+    * To prevent leaking bits between the hypervisor and guest domain,
+    * we must clear the stacked registers in the "invalid" partition here
+    * (the original remark quotes a rate of 5 registers/cycle on McKinley).
+    */
+#   define pRecurse     p6
+#   define pReturn      p7
+#   define Nregs        14
+    alloc loc0=ar.pfs,2,Nregs-2,2,0
+    shr.u loc1=r18,9            // RNaTslots <= floor(dirtySize / (64*8))
+    sub r19=r19,r18             // r19 = (physStackedSize + 8) - dirtySize
+    ;;
+    mov ar.rsc=r20              // load ar.rsc to be used for "loadrs"
+    shladd in0=loc1,3,r19
+    mov in1=0
+    ;;
+    TEXT_ALIGN(32)
+kvm_rse_clear_invalid:
+    alloc loc0=ar.pfs,2,Nregs-2,2,0
+    cmp.lt pRecurse,p0=Nregs*8,in0
+    // if more than Nregs regs left to clear, (re)curse
+    add out0=-Nregs*8,in0
+    add out1=1,in1              // increment recursion count
+    mov loc1=0
+    mov loc2=0
+    ;;
+    mov loc3=0
+    mov loc4=0
+    mov loc5=0
+    mov loc6=0
+    mov loc7=0
+(pRecurse) br.call.dptk.few b0=kvm_rse_clear_invalid
+    ;;
+    mov loc8=0
+    mov loc9=0
+    cmp.ne pReturn,p0=r0,in1
+    // if recursion count != 0, we need to do a br.ret
+    mov loc10=0
+    mov loc11=0
+(pReturn) br.ret.dptk.many b0
+#       undef pRecurse
+#       undef pReturn
+// loadrs has already been shifted
+    alloc r16=ar.pfs,0,0,0,0    // drop current register frame
+    ;;
+    loadrs
+    ;;
+    mov ar.bspstore=r24
+    ;;
+    mov ar.unat=r28
+    mov ar.rnat=r25
+    mov ar.rsc=r26
+    ;;
+    mov cr.ipsr=r31
+    mov cr.iip=r30
+    mov cr.ifs=r29
+    mov ar.pfs=r27
+    adds r18=VMM_VPD_BASE_OFFSET,r21
+    ;;
+    ld8 r18=[r18]   //vpd
+    adds r17=VMM_VCPU_ISR_OFFSET,r21
+    ;;
+    ld8 r17=[r17]
+    adds r19=VMM_VPD_VPSR_OFFSET,r18
+    ;;
+    ld8 r19=[r19]        //vpsr
+    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
+    ;;
+    ld8 r20=[r20]
+    ;;
+//vsa_sync_write_start
+    mov r25=r18
+    adds r16= VMM_VCPU_GP_OFFSET,r21
+    ;;
+    ld8 r16= [r16] // load gp into r16; it is added to r24 below
+    movl r24=@gprel(ia64_vmm_entry)  // calculate return address
+    ;;
+    add  r24=r24,r16
+    ;;
+    add r16=PAL_VPS_SYNC_WRITE,r20
+    ;;
+    mov b0=r16
+    br.cond.sptk b0         // call the service
+    ;;
+END(ia64_leave_hypervisor)
+// not a literal fall-through: the service above is handed ia64_vmm_entry in r24 as its return address
+GLOBAL_ENTRY(ia64_vmm_entry)    // pick a PAL_VPS_RESUME_* service from vpsr.ic / cr.isr.ir and branch to it
+/*
+ *  must be at bank 0
+ *  parameter:
+ *  r17:cr.isr
+ *  r18:vpd
+ *  r19:vpsr
+ *  r20:__vsa_base
+ *  r22:b0
+ *  r23:predicate
+ */
+    mov r24=r22
+    mov r25=r18
+    tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+    ;;
+    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
+    (p1) br.sptk.many ia64_vmm_entry_out
+    ;;
+    tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT         //p1=cr.isr.ir
+    ;;
+    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
+    (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
+    (p2) ld8 r26=[r25]          // handler path only: r26 = first 8 bytes of the vpd (r25 = r18)
+    ;;
+ia64_vmm_entry_out:
+    mov pr=r23,-2               // restore predicates from r23 under mask -2
+    mov b0=r29
+    ;;
+    br.cond.sptk b0             // call pal service
+END(ia64_vmm_entry)
+/*
+ * extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
+ *                  u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+ *
+ * Branches to the service at (vsa base + proc) with arg1-arg3 in r25-r27 and
+ * the return address in r24; whatever the service leaves in r31 is returned.
+ * XXX: The currently defined services use only 4 args at the max; the rest are not consumed.
+ */
+GLOBAL_ENTRY(ia64_call_vsa)
+    .regstk 4,4,0,0
+rpsave  =   loc0
+pfssave =   loc1
+psrsave =   loc2
+entry   =   loc3
+hostret =   r24
+    alloc   pfssave=ar.pfs,4,4,0,0
+    mov rpsave=rp
+    adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
+    ;;
+    ld8 entry=[entry]           // entry = value stored at r13 + VMM_VCPU_VSA_BASE_OFFSET
+1:  mov hostret=ip
+    mov r25=in1         // copy arguments
+    mov r26=in2
+    mov r27=in3
+    mov psrsave=psr
+    ;;
+    tbit.nz p6,p0=psrsave,14    // IA64_PSR_I
+    tbit.nz p7,p0=psrsave,13    // IA64_PSR_IC
+    ;;
+    add hostret=2f-1b,hostret   // calculate return address
+    add entry=entry,in0
+    ;;
+    rsm psr.i | psr.ic
+    ;;
+    srlz.i
+    mov b6=entry
+    br.cond.sptk b6         // call the service
+2:
+    // Architectural sequence for enabling interrupts if necessary
+(p7)    ssm psr.ic
+    ;;
+(p7)    srlz.i
+    ;;
+//(p6)    ssm psr.i             // psr.i is left off even if it was on at entry (p6)
+    ;;
+    mov rp=rpsave
+    mov ar.pfs=pfssave
+    mov r8=r31                  // return value = r31
+    ;;
+    srlz.d
+    br.ret.sptk rp
+END(ia64_call_vsa)
+#define  INIT_BSPSTORE  ((4<<30)-(12<<20)-0x100)
+GLOBAL_ENTRY(vmm_reset_entry)   // load initial ipsr/ifs/iip, then branch to ia64_vmm_entry
+    //set up ipsr, iip, vpd.vpsr, dcr
+    // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
+    // For DCR: all bits 0
+    adds r14=-VMM_PT_REGS_SIZE, r12
+    ;;
+    movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
+    movl r10=0x8000000000000000
+    adds r16=PT(CR_IIP), r14
+    adds r20=PT(R1), r14
+    ;;
+    rsm psr.ic | psr.i
+    ;;
+    srlz.i
+    ;;
+    bsw.0
+    ;;
+    mov r21 =r13                // copy r13 into r21 while bank 0 is selected
+    ;;
+    bsw.1
+    ;;
+    mov ar.rsc = 0
+    ;;
+    flushrs
+    ;;
+    mov ar.bspstore = 0
+    // clear BSPSTORE
+    ;;
+    mov cr.ipsr=r6
+    mov cr.ifs=r10
+    ld8 r4 = [r16] // Set init iip for first run.
+    ld8 r1 = [r20]
+    ;;
+    mov cr.iip=r4
+    ;;
+    adds r16=VMM_VPD_BASE_OFFSET,r13
+    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
+    ;;
+    ld8 r18=[r16]               // r18 = vpd
+    ld8 r20=[r20]               // r20 = vsa base
+    ;;
+    adds r19=VMM_VPD_VPSR_OFFSET,r18
+    ;;
+    ld8 r19=[r19]               // r19 = vpsr
+    mov r17=r0                  // r17 (cr.isr for ia64_vmm_entry) = 0
+    mov r22=r0                  // r22 (b0 for ia64_vmm_entry) = 0
+    mov r23=r0                  // r23 (predicates for ia64_vmm_entry) = 0
+    br.cond.sptk ia64_vmm_entry // NOTE(review): callee header says "must be at bank 0"; last switch here was bsw.1 -- confirm
+    br.ret.sptk  b0             // not reached: the branch above carries no qualifying predicate
+END(vmm_reset_entry)
diff --git a/arch/ia64/kvm/vti.h b/arch/ia64/kvm/vti.h
new file mode 100644
index 00000000000..f6c5617e16a
--- /dev/null
+++ b/arch/ia64/kvm/vti.h
@@ -0,0 +1,290 @@
+/*
+ * vti.h: prototypes for general VT-related interfaces
+ *      Copyright (c) 2004, Intel Corporation.
+ *
+ *      Xuefei Xu (Anthony Xu) (anthony.xu@intel.com)
+ *      Fred Yang (fred.yang@intel.com)
+ *      Kun Tian (Kevin Tian) (kevin.tian@intel.com)
+ *
+ *      Copyright (c) 2007, Intel Corporation.
+ *      Zhang xiantao <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+#ifndef _KVM_VT_I_H
+#define _KVM_VT_I_H
+#ifndef __ASSEMBLY__
+#include <asm/page.h>
+#include <linux/kvm_host.h>
+/* Selectors for the ia64_itr function: itr.i, itr.d, or both (0x03 = ITR | DTR) */
+#define ITR     0x01
+#define DTR     0x02
+#define IaDTR   0x03
+#define IA64_TR_VMM       6 /* itr6, dtr6 : maps vmm code, vmbuffer */
+#define IA64_TR_VM_DATA   7 /* dtr7       : maps current vm data */
+#define RR6 (6UL<<61)       /* 6 in bits 63:61 */
+#define RR7 (7UL<<61)       /* 7 in bits 63:61 */
+/* config_options in pal_vp_init_env; each value is parenthesized so it expands safely inside any expression */
+#define VP_INITIALIZE   (1UL)
+#define VP_FR_PMC       (1UL<<1)
+#define VP_OPCODE       (1UL<<8)
+#define VP_CAUSE        (1UL<<9)
+#define VP_FW_ACC       (1UL<<63)
+/* init vp env with initializing vm_buffer */
+#define VP_INIT_ENV_INITALIZE  (VP_INITIALIZE | VP_FR_PMC |\
+        VP_OPCODE | VP_CAUSE | VP_FW_ACC)
+/* init vp env without initializing vm_buffer */
+#define VP_INIT_ENV  (VP_FR_PMC | VP_OPCODE | VP_CAUSE | VP_FW_ACC)
+/* Stacked Virt. Initializes a new VPD for the operation of
+ * a new virtual processor in the virtual environment.
+ */
+#define         PAL_VP_CREATE   265
+/* Stacked Virt. Returns the parameters needed to enter a virtual environment. */
+#define         PAL_VP_ENV_INFO 266
+/* Stacked Virt. Allows a logical processor to exit a virtual environment. */
+#define         PAL_VP_EXIT_ENV 267
+/* Stacked Virt. Allows a logical processor to enter a virtual environment. */
+#define         PAL_VP_INIT_ENV 268
+/* Stacked Virt. Register a different host IVT for the virtual processor. */
+#define         PAL_VP_REGISTER 269
+/* Old name of PAL_VP_RESTORE (same index). */
+#define         PAL_VP_RESUME   270
+/* Stacked Virt. Resumes virtual processor operation on the logical processor. */
+#define         PAL_VP_RESTORE  270
+/* Old name of PAL_VP_SAVE (same index). */
+#define         PAL_VP_SUSPEND  271
+/* Stacked Virt. Suspends operation for the specified virtual processor on
+ * the logical processor.
+ */
+#define         PAL_VP_SAVE     271
+/* Stacked Virt. Terminates operation for the specified virtual processor. */
+#define         PAL_VP_TERMINATE 272
+union vac {     /* one 64-bit word; flags are unsigned so a set bit reads back as 1, not -1 */
+        unsigned long value;
+        struct {
+                unsigned int a_int:1;
+                unsigned int a_from_int_cr:1;
+                unsigned int a_to_int_cr:1;
+                unsigned int a_from_psr:1;
+                unsigned int a_from_cpuid:1;
+                unsigned int a_cover:1;
+                unsigned int a_bsw:1;
+                unsigned long reserved:57;
+        };
+};
+union vdc {     /* one 64-bit word; flags are unsigned so a set bit reads back as 1, not -1 */
+        unsigned long value;
+        struct {
+                unsigned int d_vmsw:1;
+                unsigned int d_extint:1;
+                unsigned int d_ibr_dbr:1;
+                unsigned int d_pmc:1;
+                unsigned int d_to_pmd:1;
+                unsigned int d_itm:1;
+                unsigned long reserved:58;
+        };
+};
+struct vpd {    /* byte offsets below assume 8-byte unsigned long; total 65536 bytes */
+        union vac   vac;                        /* 0     (VPD_VAC_START_OFFSET) */
+        union vdc   vdc;                        /* 8     (VPD_VDC_START_OFFSET) */
+        unsigned long  virt_env_vaddr;          /* 16 */
+        unsigned long  reserved1[29];           /* 24 */
+        unsigned long  vhpi;                    /* 256   (VPD_VHPI_START_OFFSET) */
+        unsigned long  reserved2[95];           /* 264 */
+        unsigned long  vgr[16];                 /* 1024  (VPD_VGR_START_OFFSET) */
+        unsigned long  vbgr[16];                /* 1152  (VPD_VBGR_START_OFFSET) */
+        unsigned long  vnat;                    /* 1280  (VPD_VNAT_START_OFFSET) */
+        unsigned long  vbnat;                   /* 1288  (VPD_VBNAT_START_OFFSET) */
+        unsigned long  vcpuid[5];               /* 1296  (VPD_VCPUID_START_OFFSET) */
+        unsigned long  reserved3[11];           /* 1336 */
+        unsigned long  vpsr;                    /* 1424  (VPD_VPSR_START_OFFSET) */
+        unsigned long  vpr;                     /* 1432  (VPD_VPR_START_OFFSET) */
+        unsigned long  reserved4[76];           /* 1440  (VPD_VRSE_CFLE_START_OFFSET points here) */
+        union {                                 /* 2048  (VPD_VCR_START_OFFSET) */
+                unsigned long  vcr[128];        /* indexed view; the struct below names the same 128 words */
+                struct {
+                        unsigned long dcr;      /* vcr[0] */
+                        unsigned long itm;      /* vcr[1] */
+                        unsigned long iva;      /* vcr[2] */
+                        unsigned long rsv1[5];
+                        unsigned long pta;      /* vcr[8] */
+                        unsigned long rsv2[7];
+                        unsigned long ipsr;     /* vcr[16] */
+                        unsigned long isr;
+                        unsigned long rsv3;
+                        unsigned long iip;      /* vcr[19] */
+                        unsigned long ifa;
+                        unsigned long itir;
+                        unsigned long iipa;
+                        unsigned long ifs;
+                        unsigned long iim;
+                        unsigned long iha;      /* vcr[25] */
+                        unsigned long rsv4[38];
+                        unsigned long lid;      /* vcr[64] */
+                        unsigned long ivr;
+                        unsigned long tpr;      /* vcr[66], byte 2576 (VPD_VTPR_START_OFFSET) */
+                        unsigned long eoi;
+                        unsigned long irr[4];   /* vcr[68..71] */
+                        unsigned long itv;
+                        unsigned long pmv;
+                        unsigned long cmcv;     /* vcr[74] */
+                        unsigned long rsv5[5];
+                        unsigned long lrr0;     /* vcr[80] */
+                        unsigned long lrr1;     /* vcr[81] */
+                        unsigned long rsv6[46];
+                };
+        };
+        unsigned long  reserved5[128];          /* 3072  (VPD_VRR_START_OFFSET points here) */
+        unsigned long  reserved6[3456];         /* 4096 */
+        unsigned long  vmm_avail[128];          /* 31744 (VPD_VMM_VAIL_START_OFFSET) */
+        unsigned long  reserved7[4096];         /* 32768 */
+};
+#define PAL_PROC_VM_BIT         (1UL << 40)
+#define PAL_PROC_VMSW_BIT       (1UL << 54)
+/* PAL_VP_ENV_INFO wrapper: v0 goes to *buffer_size, v1 to *vp_env_info. */
+static inline s64 ia64_pal_vp_env_info(u64 *buffer_size, u64 *vp_env_info)
+{
+        struct ia64_pal_retval ret;
+        PAL_CALL_STK(ret, PAL_VP_ENV_INFO, 0, 0, 0);
+        *buffer_size = ret.v0;
+        *vp_env_info = ret.v1;
+        return ret.status;
+}
+static inline s64 ia64_pal_vp_exit_env(u64 iva)
+{
+        struct ia64_pal_retval ret;     /* only .status is reported back */
+        PAL_CALL_STK(ret, PAL_VP_EXIT_ENV, iva, 0, 0);
+        return ret.status;
+}
+static inline s64 ia64_pal_vp_init_env(u64 config_options, u64 pbase_addr,
+                                       u64 vbase_addr, u64 *vsa_base)
+{
+        struct ia64_pal_retval ret;     /* v0 is handed back through *vsa_base */
+        PAL_CALL_STK(ret, PAL_VP_INIT_ENV,
+                     config_options, pbase_addr, vbase_addr);
+        *vsa_base = ret.v0;
+        return ret.status;
+}
+static inline s64 ia64_pal_vp_restore(u64 *vpd, u64 pal_proc_vector)
+{
+        struct ia64_pal_retval ret;     /* only .status is reported back */
+        PAL_CALL_STK(ret, PAL_VP_RESTORE, (u64)vpd, pal_proc_vector, 0);
+        return ret.status;
+}
+static inline s64 ia64_pal_vp_save(u64 *vpd, u64 pal_proc_vector)
+{
+        struct ia64_pal_retval ret;     /* only .status is reported back */
+        PAL_CALL_STK(ret, PAL_VP_SAVE, (u64)vpd, pal_proc_vector, 0);
+        return ret.status;
+}
+#endif
+/* Byte offsets into struct vpd; defined outside the __ASSEMBLY__ guard above */
+#define VPD_VAC_START_OFFSET            0
+#define VPD_VDC_START_OFFSET            8
+#define VPD_VHPI_START_OFFSET           256
+#define VPD_VGR_START_OFFSET            1024
+#define VPD_VBGR_START_OFFSET           1152
+#define VPD_VNAT_START_OFFSET           1280
+#define VPD_VBNAT_START_OFFSET          1288
+#define VPD_VCPUID_START_OFFSET         1296
+#define VPD_VPSR_START_OFFSET           1424
+#define VPD_VPR_START_OFFSET            1432
+#define VPD_VRSE_CFLE_START_OFFSET      1440    /* start of reserved4 in struct vpd */
+#define VPD_VCR_START_OFFSET            2048
+#define VPD_VTPR_START_OFFSET           2576    /* vcr[66] (tpr) */
+#define VPD_VRR_START_OFFSET            3072    /* start of reserved5 in struct vpd */
+#define VPD_VMM_VAIL_START_OFFSET       31744   /* vmm_avail */
+/*Virtualization faults*/
+#define EVENT_MOV_TO_AR                  1
+#define EVENT_MOV_TO_AR_IMM              2
+#define EVENT_MOV_FROM_AR                3
+#define EVENT_MOV_TO_CR                  4
+#define EVENT_MOV_FROM_CR                5
+#define EVENT_MOV_TO_PSR                 6
+#define EVENT_MOV_FROM_PSR               7
+#define EVENT_ITC_D                      8
+#define EVENT_ITC_I                      9
+#define EVENT_MOV_TO_RR                  10
+#define EVENT_MOV_TO_DBR                 11
+#define EVENT_MOV_TO_IBR                 12
+#define EVENT_MOV_TO_PKR                 13
+#define EVENT_MOV_TO_PMC                 14
+#define EVENT_MOV_TO_PMD                 15
+#define EVENT_ITR_D                      16
+#define EVENT_ITR_I                      17
+#define EVENT_MOV_FROM_RR                18
+#define EVENT_MOV_FROM_DBR               19
+#define EVENT_MOV_FROM_IBR               20
+#define EVENT_MOV_FROM_PKR               21
+#define EVENT_MOV_FROM_PMC               22
+#define EVENT_MOV_FROM_CPUID             23
+#define EVENT_SSM                        24
+#define EVENT_RSM                        25
+#define EVENT_PTC_L                      26
+#define EVENT_PTC_G                      27
+#define EVENT_PTC_GA                     28
+#define EVENT_PTR_D                      29
+#define EVENT_PTR_I                      30
+#define EVENT_THASH                      31
+#define EVENT_TTAG                       32
+#define EVENT_TPA                        33
+#define EVENT_TAK                        34
+#define EVENT_PTC_E                      35
+#define EVENT_COVER                      36
+#define EVENT_RFI                        37
+#define EVENT_BSW_0                      38
+#define EVENT_BSW_1                      39
+#define EVENT_VMSW                       40
+/**PAL virtual services offsets */
+#define PAL_VPS_RESUME_NORMAL           0x0000
+#define PAL_VPS_RESUME_HANDLER          0x0400
+#define PAL_VPS_SYNC_READ               0x0800
+#define PAL_VPS_SYNC_WRITE              0x0c00
+#define PAL_VPS_SET_PENDING_INTERRUPT   0x1000
+#define PAL_VPS_THASH                   0x1400
+#define PAL_VPS_TTAG                    0x1800
+#define PAL_VPS_RESTORE                 0x1c00
+#define PAL_VPS_SAVE                    0x2000
+#endif /* _KVM_VT_I_H */
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
new file mode 100644
index 00000000000..def4576d22b
--- /dev/null
+++ b/arch/ia64/kvm/vtlb.c
@@ -0,0 +1,636 @@
+/*
+ * vtlb.c: guest virtual tlb handling module.
+ * Copyright (c) 2004, Intel Corporation.
+ *  Yaozu Dong (Eddie Dong) <Eddie.dong@intel.com>
+ *  Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ *
+ * Copyright (c) 2007, Intel Corporation.
+ *  Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ *  Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include "vcpu.h"
+#include <linux/rwsem.h>
+#include <asm/tlb.h>
+/*
+ * Return non-zero when the translation register entry @trp is present,
+ * carries region id @rid and covers the virtual address @va.
+ */
+static int __is_tr_translated(struct thash_data *trp, u64 rid, u64 va)
+{
+        if (!trp->p || trp->rid != rid)
+                return 0;
+        /* va is u64, so an address below vadr wraps and fails the test. */
+        return (va - trp->vadr) < PSIZE(trp->ps);
+}
+/*
+ * Only for GUEST TR format.
+ * Return 1 when the present entry @trp with region id @rid intersects the
+ * half-open range [sva, eva), and 0 otherwise.
+ */
+static int __is_tr_overlap(struct thash_data *trp, u64 rid, u64 sva, u64 eva)
+{
+        u64 tr_first, tr_last, last;
+        if (!trp->p || trp->rid != rid)
+                return 0;
+        tr_first = trp->vadr;
+        tr_last = tr_first + PSIZE(trp->ps) - 1;
+        last = eva - 1;         /* turn the exclusive end into an inclusive one */
+        return (sva <= tr_last) && (tr_first <= last);
+}
+/*
+ * Issue ia64_ptcl() for @va, passing the page size @ps shifted left by two
+ * bits (presumably the position of the ps field in the operand -- confirm).
+ */
+void machine_tlb_purge(u64 va, u64 ps)
+{
+        ia64_ptcl(va, ps << 2);
+}
+/*
+ * Issue ia64_ptce() over the purge grid described by the current vcpu:
+ * ptce_count[0] outer steps of ptce_stride[0] and ptce_count[1] inner steps
+ * of ptce_stride[1], starting at ptce_base.  The walk runs with local
+ * interrupts disabled and is followed by an instruction serialization.
+ */
+void local_flush_tlb_all(void)
+{
+        int i, j;
+        unsigned long flags, count0, count1;
+        unsigned long stride0, stride1, addr;
+        addr    = current_vcpu->arch.ptce_base;
+        count0  = current_vcpu->arch.ptce_count[0];
+        count1  = current_vcpu->arch.ptce_count[1];
+        stride0 = current_vcpu->arch.ptce_stride[0];
+        stride1 = current_vcpu->arch.ptce_stride[1];
+        local_irq_save(flags);
+        for (i = 0; i < count0; ++i) {
+                for (j = 0; j < count1; ++j) {
+                        ia64_ptce(addr);
+                        addr += stride1;
+                }
+                /* addr keeps the inner advance; stride0 is added on top. */
+                addr += stride0;
+        }
+        local_irq_restore(flags);
+        ia64_srlz_i();          /* srlz.i implies srlz.d */
+}
+/*
+ * Report whether the guest VHPT applies to a reference of kind @ref at
+ * @vadr.  Both the guest region register's ve bit and the guest PTA's ve
+ * bit must be set; the result then depends on the guest PSR bits that
+ * correspond to @ref.  Returns 0 when the VHPT is not enabled.
+ */
+int vhpt_enabled(struct kvm_vcpu *vcpu, u64 vadr, enum vhpt_ref ref)
+{
+        struct ia64_psr guest_psr = *(struct ia64_psr *)&VCPU(vcpu, vpsr);
+        union ia64_rr guest_rr;
+        union ia64_pta guest_pta;
+        guest_rr.val = vcpu_get_rr(vcpu, vadr);
+        guest_pta.val = vcpu_get_pta(vcpu);
+        if (!(guest_rr.ve & guest_pta.ve))
+                return 0;
+        switch (ref) {
+        case DATA_REF:
+        case NA_REF:
+                return guest_psr.dt;
+        case INST_REF:
+                return guest_psr.dt && guest_psr.it && guest_psr.ic;
+        case RSE_REF:
+                return guest_psr.dt && guest_psr.rt;
+        }
+        /* Any other reference kind is treated as "not enabled". */
+        return 0;
+}
+/*
+ * Software hash for the table described by @vpta.
+ *
+ * Computes the slot for address @va under region register value @vrr and
+ * stores the matching tag in *@tag.  The index is built from the low 8 bits
+ * of the region id and the low pfn_bits bits of the page number; the tag
+ * holds region id bits 8..23 in its low 16 bits and the remaining page
+ * number bits above them.  Slots are addressed in 32-byte steps (index << 5)
+ * from vpta.base << PTA_BASE_SHIFT.
+ */
+struct thash_data *vsa_thash(union ia64_pta vpta, u64 va, u64 vrr, u64 *tag)
+{
+        u64 index, pfn, rid, pfn_bits;
+        /* 5 bits for the entry stride, 8 bits taken by the region id. */
+        pfn_bits = vpta.size - 5 - 8;
+        pfn = REGION_OFFSET(va) >> _REGION_PAGE_SIZE(vrr);
+        rid = _REGION_ID(vrr);
+        index = ((rid & 0xff) << pfn_bits)|(pfn & ((1UL << pfn_bits) - 1));
+        *tag = ((rid >> 8) & 0xffff) | ((pfn >> pfn_bits) << 16);
+        return (struct thash_data *)((vpta.base << PTA_BASE_SHIFT) +
+                                (index << 5));
+}
+/*
+ * Search the guest translation registers for an entry covering @va.
+ * @type selects the data TRs (D_TLB) or, for any other value, the
+ * instruction TRs.  Returns the matching entry or NULL.
+ */
+struct thash_data *__vtr_lookup(struct kvm_vcpu *vcpu, u64 va, int type)
+{
+        struct thash_data *trp;
+        u64 rid;
+        int i, count;
+        rid = vcpu_get_rr(vcpu, va) & RR_RID_MASK;
+        if (type == D_TLB) {
+                /* Quick reject: no data TR maps anything in this region. */
+                if (!vcpu_quick_region_check(vcpu->arch.dtr_regions, va))
+                        return NULL;
+                trp = (struct thash_data *)&vcpu->arch.dtrs;
+                count = NDTRS;
+        } else {
+                if (!vcpu_quick_region_check(vcpu->arch.itr_regions, va))
+                        return NULL;
+                trp = (struct thash_data *)&vcpu->arch.itrs;
+                count = NITRS;
+        }
+        for (i = 0; i < count; i++, trp++) {
+                if (__is_tr_translated(trp, rid, va))
+                        return trp;
+        }
+        return NULL;
+}
+/*
+ * Fill the machine VHPT slot that ia64_thash() selects for @ifa.
+ *
+ * @pte:  machine pte to store (bits in PAGE_FLAGS_RV_MASK are cleared)
+ * @itir: only its page size is used, to compute the guest physical address
+ * @ifa:  faulting virtual address
+ * @gpte: guest pte, source of the guest physical page number
+ */
+static void vhpt_insert(u64 pte, u64 itir, u64 ifa, u64 gpte)
+{
+        union ia64_rr rr;
+        struct thash_data *head;
+        unsigned long ps, gpaddr;
+        ps = itir_ps(itir);
+        /* Guest physical address: page frame from gpte, offset from ifa. */
+        gpaddr = ((gpte & _PAGE_PPN_MASK) >> ps << ps) |
+                (ifa & ((1UL << ps) - 1));
+        rr.val = ia64_get_rr(ifa);
+        head = (struct thash_data *)ia64_thash(ifa);
+        /* Invalidate the tag before the rest of the entry is rewritten. */
+        head->etag = INVALID_TI_TAG;
+        ia64_mf();
+        head->page_flags = pte & ~PAGE_FLAGS_RV_MASK;
+        /* The stored itir carries the machine region's page size. */
+        head->itir = rr.ps << 2;
+        head->etag = ia64_ttag(ifa);
+        head->gpaddr = gpaddr;
+}
+/*
+ * Set the dirty-log bits for the guest frames covered by @pte.
+ *
+ * One bit is set when @ps <= PAGE_SHIFT, otherwise 1 << (@ps - PAGE_SHIFT)
+ * consecutive bits starting at the frame number taken from @pte.  The
+ * bitmap is located relative to the vcpu structure itself, and updates are
+ * serialized by the lock at v->arch.dirty_log_lock_pa.
+ */
+void mark_pages_dirty(struct kvm_vcpu *v, u64 pte, u64 ps)
+{
+        u64 i, dirty_pages = 1;
+        u64 base_gfn = (pte&_PAGE_PPN_MASK) >> PAGE_SHIFT;
+        spinlock_t *lock = __kvm_va(v->arch.dirty_log_lock_pa);
+        /* Step back from this vcpu to the area base, then to the dirty log. */
+        void *dirty_bitmap = (void *)v - (KVM_VCPU_OFS + v->vcpu_id * VCPU_SIZE)
+                                                + KVM_MEM_DIRTY_LOG_OFS;
+        dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
+        vmm_spin_lock(lock);
+        for (i = 0; i < dirty_pages; i++) {
+                /* avoid RMW: only write a bit that is still clear */
+                if (!test_bit(base_gfn + i, dirty_bitmap))
+                        set_bit(base_gfn + i , dirty_bitmap);
+        }
+        vmm_spin_unlock(lock);
+}
+/*
+ * Install a translation for @va from guest pte @pte.
+ *
+ * When the guest page size in @itir is at least the machine region's page
+ * size the entry goes into the machine VHPT; otherwise it is inserted
+ * directly with ia64_itc() while psr.ic is cleared.  Pages not flagged
+ * VTLB_PTE_IO are then recorded in the dirty log.
+ *
+ * NOTE(review): translate_phy_pte() returns -1 for I/O pages and that value
+ * is inserted here without a check -- confirm callers never pass I/O ptes.
+ */
+void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type)
+{
+        u64 phy_pte, psr;
+        union ia64_rr mrr;
+        mrr.val = ia64_get_rr(va);
+        /* May set VTLB_PTE_IO in the local copy of pte. */
+        phy_pte = translate_phy_pte(&pte, itir, va);
+        if (itir_ps(itir) >= mrr.ps) {
+                vhpt_insert(phy_pte, itir, va, pte);
+        } else {
+                phy_pte  &= ~PAGE_FLAGS_RV_MASK;
+                psr = ia64_clear_ic();
+                ia64_itc(type, va, phy_pte, itir_ps(itir));
+                ia64_set_psr(psr);
+        }
+        if (!(pte&VTLB_PTE_IO))
+                mark_pages_dirty(v, pte, itir_ps(itir));
+}
+/*
+ * Look @va up in the machine VHPT: return the slot chosen by ia64_thash()
+ * when its tag equals ia64_ttag(va), otherwise NULL.
+ */
+struct thash_data *vhpt_lookup(u64 va)
+{
+        struct thash_data *slot = (struct thash_data *)ia64_thash(va);
+        u64 wanted = ia64_ttag(va);
+        return (slot->etag == wanted) ? slot : NULL;
+}
+/*
+ * Read the guest VHPT entry at @iha into *@pte.
+ *
+ * If a guest data TR covers @iha, its translation is installed first.  The
+ * entry is then fetched with a speculative load while psr.ic and psr.i are
+ * cleared.  Returns 1 when the load produced a NaT (nothing is stored),
+ * otherwise 0 with the low 53 bits of the loaded value stored in *@pte.
+ * psr.ic is set again on exit; psr.i is left cleared (see note below).
+ */
+u64 guest_vhpt_lookup(u64 iha, u64 *pte)
+{
+        u64 ret;
+        struct thash_data *data;
+        data = __vtr_lookup(current_vcpu, iha, D_TLB);
+        if (data != NULL)
+                thash_vhpt_insert(current_vcpu, data->page_flags,
+                        data->itir, iha, D_TLB);
+        /*
+         * %0 is written before %2 is read, so it must be an earlyclobber
+         * output or the compiler may place ret and pte in one register.
+         * r9, p6 and p7 are used as scratch and are declared as clobbers.
+         */
+        asm volatile ("rsm psr.ic|psr.i;;"
+                        "srlz.d;;"
+                        "ld8.s r9=[%1];;"
+                        "tnat.nz p6,p7=r9;;"
+                        "(p6) mov %0=1;"
+                        "(p6) mov r9=r0;"
+                        "(p7) extr.u r9=r9,0,53;;"
+                        "(p7) mov %0=r0;"
+                        "(p7) st8 [%2]=r9;;"
+                        "ssm psr.ic;;"
+                        "srlz.d;;"
+                        /* "ssm psr.i;;" Once interrupts in vmm open, need fix*/
+                        : "=&r"(ret) : "r"(iha), "r"(pte)
+                        : "memory", "r9", "p6", "p7");
+        return ret;
+}
+/*
+ * Purge the software guest TLB for the range of size 2^@ps around @va.
+ *
+ * For every page size recorded in the region's psbits mask, each hash slot
+ * in the range whose tag and page size match is given the invalid tag.
+ */
+static void vtlb_purge(struct kvm_vcpu *v, u64 va, u64 ps)
+{
+        struct thash_cb *hcb = &v->arch.vtlb;
+        struct thash_data *entry;
+        union ia64_rr vrr;
+        u64 base, addr, step, pending, tag, cur_ps, left;
+        vrr.val = vcpu_get_rr(v, va);
+        pending = VMX(v, psbits[(va >> 61)]);
+        base = va & ~((1UL << ps) - 1);
+        while (pending) {
+                /* Take the smallest page size still pending. */
+                cur_ps = __ffs(pending);
+                pending &= ~(1UL << cur_ps);
+                step = PSIZE(cur_ps);
+                vrr.ps = cur_ps;
+                /* One probe when the page is at least as large as the range. */
+                left = 1UL << ((ps < cur_ps) ? 0 : (ps - cur_ps));
+                for (addr = base; left; left--, addr += step) {
+                        entry = vsa_thash(hcb->pta, addr, vrr.val, &tag);
+                        if (entry->etag == tag && entry->ps == cur_ps)
+                                entry->etag = INVALID_TI_TAG;
+                }
+        }
+}
+/*
+ * Purge the machine VHPT and the machine TLB for the range of size 2^@ps
+ * around @va.  The range is walked in steps of the machine region's page
+ * size, and every slot whose tag matches is given the invalid tag.
+ */
+static void vhpt_purge(struct kvm_vcpu *v, u64 va, u64 ps)
+{
+        struct thash_data *slot;
+        union ia64_rr rr;
+        u64 addr, step, tag, left;
+        addr = va & ~((1UL << ps) - 1);
+        rr.val = ia64_get_rr(va);
+        step = PSIZE(rr.ps);
+        for (left = 1UL << ((ps < rr.ps) ? 0 : (ps - rr.ps)); left;
+                                        left--, addr += step) {
+                slot = (struct thash_data *)ia64_thash(addr);
+                tag = ia64_ttag(addr);
+                if (slot->etag == tag)
+                        slot->etag = INVALID_TI_TAG;
+        }
+        machine_tlb_purge(va, ps);
+}
+/*
+ * Insert an entry into the software guest TLB hash.
+ * NOTES:
+ *  1: "va" must be an address covered by the entry being inserted.
+ *  2: The entry is always stored in TLB format.
+ *  3: The caller must make sure the new entry does not overlap any
+ *     existing entry.
+ * The slot is overwritten unconditionally, and the entry's page size is
+ * recorded in the per-region psbits mask.
+ */
+void vtlb_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va)
+{
+        struct thash_data *head;
+        union ia64_rr vrr;
+        u64 tag;
+        struct thash_cb *hcb = &v->arch.vtlb;
+        vrr.val = vcpu_get_rr(v, va);
+        /* Hash with the entry's own page size, not the region's. */
+        vrr.ps = itir_ps(itir);
+        VMX(v, psbits[va >> 61]) |= (1UL << vrr.ps);
+        head = vsa_thash(hcb->pta, va, vrr.val, &tag);
+        head->page_flags = pte;
+        head->itir = itir;
+        head->etag = tag;
+}
+/*
+ * Find a guest translation register that overlaps the range of size 2^@ps
+ * starting at @va.  @type selects the data TRs (D_TLB) or, for any other
+ * value, the instruction TRs.  Returns the index of the first overlapping
+ * entry, or -1 when there is none.
+ */
+int vtr_find_overlap(struct kvm_vcpu *vcpu, u64 va, u64 ps, int type)
+{
+        struct thash_data *trp;
+        u64 rid, end;
+        int i, count;
+        rid = vcpu_get_rr(vcpu, va) & RR_RID_MASK;
+        end = va + PSIZE(ps);
+        if (type == D_TLB) {
+                if (!vcpu_quick_region_check(vcpu->arch.dtr_regions, va))
+                        return -1;
+                trp = (struct thash_data *)&vcpu->arch.dtrs;
+                count = NDTRS;
+        } else {
+                if (!vcpu_quick_region_check(vcpu->arch.itr_regions, va))
+                        return -1;
+                trp = (struct thash_data *)&vcpu->arch.itrs;
+                count = NITRS;
+        }
+        for (i = 0; i < count; i++, trp++) {
+                if (__is_tr_overlap(trp, rid, va, end))
+                        return i;
+        }
+        return -1;
+}
+/*
+ * Purge entries in VTLB and VHPT for the range of size 2^@ps around @va.
+ * The software VTLB is only walked when the quick region check says the
+ * region holds TC entries; the VHPT and machine TLB are always purged.
+ */
+void thash_purge_entries(struct kvm_vcpu *v, u64 va, u64 ps)
+{
+        if (vcpu_quick_region_check(v->arch.tc_regions, va))
+                vtlb_purge(v, va, ps);
+        vhpt_purge(v, va, ps);
+}
+/*
+ * Variant of thash_purge_entries() that purges using REGION_OFFSET(@va);
+ * the quick region check is still made against the original address.
+ * NOTE(review): presumably used for purges requested by another vcpu --
+ * confirm against the callers.
+ */
+void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
+{
+        u64 old_va = va;
+        va = REGION_OFFSET(va);
+        if (vcpu_quick_region_check(v->arch.tc_regions, old_va))
+                vtlb_purge(v, va, ps);
+        vhpt_purge(v, va, ps);
+}
+/*
+ * Translate the guest pte *@pte into a machine pte.
+ *
+ * The guest physical address is formed from the pte's page frame (aligned
+ * to the page size in @itir) and the in-page offset of @va, then looked up
+ * with kvm_lookup_mpa().  If the result has GPFN_IO_MASK bits set, *@pte is
+ * flagged with VTLB_PTE_IO and -1 is returned.  Otherwise the return value
+ * is the guest pte with its ppn field replaced by the machine frame.
+ */
+u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
+{
+        u64 ps, ps_mask, paddr, maddr;
+        union pte_flags phy_pte;
+        ps = itir_ps(itir);
+        ps_mask = ~((1UL << ps) - 1);
+        phy_pte.val = *pte;
+        paddr = *pte;
+        paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
+        maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
+        if (maddr & GPFN_IO_MASK) {
+                *pte |= VTLB_PTE_IO;
+                return -1;
+        }
+        /* Machine frame from the lookup, in-page offset from paddr. */
+        maddr = ((maddr & _PAGE_PPN_MASK) & PAGE_MASK) |
+                                        (paddr & ~PAGE_MASK);
+        phy_pte.ppn = maddr >> ARCH_PAGE_SHIFT;
+        return phy_pte.val;
+}
+/*
+ * Purge overlapping TCs and then insert the new entry to emulate itc ops.
+ *    Notes: Only TC entries can be purged and inserted.
+ *    Returns 1 when the page is MMIO (VTLB_PTE_IO set), otherwise 0.
+ *
+ * Where the entry goes depends on its page size relative to the machine
+ * region's page size:
+ *   equal   - machine VHPT for normal memory, software VTLB for MMIO
+ *   larger  - software VTLB, plus the machine VHPT for normal memory
+ *   smaller - inserted directly with ia64_itc()
+ */
+int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
+                                                u64 ifa, int type)
+{
+        u64 ps;
+        u64 phy_pte;
+        union ia64_rr vrr, mrr;
+        int ret = 0;
+        ps = itir_ps(itir);
+        /* NOTE(review): vrr is assigned here but never read afterwards. */
+        vrr.val = vcpu_get_rr(v, ifa);
+        mrr.val = ia64_get_rr(ifa);
+        phy_pte = translate_phy_pte(&pte, itir, ifa);
+        /* Ensure WB attribute if pte is related to a normal mem page,
+         * which is required by vga acceleration since qemu maps shared
+         * vram buffer with WB.
+         */
+        if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
+                pte &= ~_PAGE_MA_MASK;
+                phy_pte &= ~_PAGE_MA_MASK;
+        }
+        if (pte & VTLB_PTE_IO)
+                ret = 1;
+        /* Drop anything that overlaps the new entry before inserting it. */
+        vtlb_purge(v, ifa, ps);
+        vhpt_purge(v, ifa, ps);
+        if (ps == mrr.ps) {
+                if (!(pte&VTLB_PTE_IO)) {
+                        vhpt_insert(phy_pte, itir, ifa, pte);
+                } else {
+                        vtlb_insert(v, pte, itir, ifa);
+                        vcpu_quick_region_set(VMX(v, tc_regions), ifa);
+                }
+        } else if (ps > mrr.ps) {
+                vtlb_insert(v, pte, itir, ifa);
+                vcpu_quick_region_set(VMX(v, tc_regions), ifa);
+                if (!(pte&VTLB_PTE_IO))
+                        vhpt_insert(phy_pte, itir, ifa, pte);
+        } else {
+                u64 psr;
+                phy_pte  &= ~PAGE_FLAGS_RV_MASK;
+                psr = ia64_clear_ic();
+                ia64_itc(type, ifa, phy_pte, ps);
+                ia64_set_psr(psr);
+        }
+        if (!(pte&VTLB_PTE_IO))
+                mark_pages_dirty(v, pte, ps);
+        return ret;
+}
+/* Reset every entry of @hcb's hash table to the empty, invalid-tag state. */
+static void thash_reset_table(struct thash_cb *hcb)
+{
+        struct thash_data *entry = hcb->hash;
+        int i;
+        for (i = 0; i < hcb->num; i++, entry++) {
+                entry->page_flags = 0;
+                entry->etag = INVALID_TI_TAG;
+                entry->itir = 0;
+                entry->next = 0;
+        }
+}
+/*
+ * Purge all TCs or VHPT entries including those in Hash table.
+ *
+ * Clears the per-region page-size masks, resets the software VTLB and the
+ * VHPT tables, then flushes the local machine TLB.
+ */
+void thash_purge_all(struct kvm_vcpu *v)
+{
+        int region;
+        for (region = 0; region < 8; region++)
+                VMX(v, psbits[region]) = 0;
+        thash_reset_table(&v->arch.vtlb);
+        thash_reset_table(&v->arch.vhpt);
+        local_flush_tlb_all();
+}
+/*
+ * Find the guest translation covering @va.
+ *
+ * The guest translation registers are searched first (@is_data is passed
+ * on as the TR type).  If none matches and the region holds TC entries, the
+ * software VTLB hash is probed once for every page size recorded in the
+ * region's psbits mask.  Returns the entry, or NULL when nothing matches.
+ */
+struct thash_data *vtlb_lookup(struct kvm_vcpu *v, u64 va, int is_data)
+{
+        struct thash_cb *hcb = &v->arch.vtlb;
+        struct thash_data *hit;
+        union ia64_rr vrr;
+        u64 sizes, ps, tag;
+        hit = __vtr_lookup(v, va, is_data);
+        if (hit != NULL)
+                return hit;
+        if (!vcpu_quick_region_check(v->arch.tc_regions, va))
+                return NULL;
+        sizes = VMX(v, psbits[(va >> 61)]);
+        vrr.val = vcpu_get_rr(v, va);
+        for (; sizes; sizes &= ~(1UL << ps)) {
+                /* Probe with the smallest page size not yet tried. */
+                ps = __ffs(sizes);
+                vrr.ps = ps;
+                hit = vsa_thash(hcb->pta, va, vrr.val, &tag);
+                if (hit->etag == tag && hit->ps == ps)
+                        return hit;
+        }
+        return NULL;
+}
+/*
+ * Initialize internal control data before service.
+ *
+ * Builds hcb->pta from the table address with vf and ve set and the size
+ * field set to @sz, then marks all hcb->num entries empty.  hcb->hash and
+ * hcb->num are read here, so the caller must have set them already.
+ */
+void thash_init(struct thash_cb *hcb, u64 sz)
+{
+        int i;
+        struct thash_data *head;
+        /* Start from the table address, then fill in the control fields. */
+        hcb->pta.val = (unsigned long)hcb->hash;
+        hcb->pta.vf = 1;
+        hcb->pta.ve = 1;
+        hcb->pta.size = sz;
+        head = hcb->hash;
+        for (i = 0; i < hcb->num; i++) {
+                head->page_flags = 0;
+                head->itir = 0;
+                head->etag = INVALID_TI_TAG;
+                head->next = 0;
+                head++;
+        }
+}
+/*
+ * Return the table entry for guest frame @gpfn from the u64 array at
+ * KVM_P2M_BASE.  @gpfn is not range-checked.
+ */
+u64 kvm_lookup_mpa(u64 gpfn)
+{
+        u64 *p2m = (u64 *) KVM_P2M_BASE;
+        return p2m[gpfn];
+}
+/*
+ * Convert guest physical address @gpa to a machine address: the page part
+ * comes from the kvm_lookup_mpa() entry with its low PAGE_SHIFT bits
+ * cleared, the in-page offset from @gpa.
+ */
+u64 kvm_gpa_to_mpa(u64 gpa)
+{
+        u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
+        return (pte >> PAGE_SHIFT << PAGE_SHIFT) | (gpa & ~PAGE_MASK);
+}
+/*
+ * Fetch guest bundle code.
+ * INPUT:
+ *  gip: guest ip
+ *  pbundle: used to return fetched bundle.
+ * Returns IA64_NO_FAULT on success.  Returns IA64_FAULT when no guest
+ * physical address was found and the machine VHPT has no entry for @gip;
+ * in that case the machine TLB entry for @gip is purged first.
+ */
+int fetch_code(struct kvm_vcpu *vcpu, u64 gip, IA64_BUNDLE *pbundle)
+{
+        u64     gpip = 0;   /* guest physical IP*/
+        u64     *vpa;
+        struct thash_data    *tlb;
+        u64     maddr;
+        if (!(VCPU(vcpu, vpsr) & IA64_PSR_IT)) {
+                /* I-side physical mode */
+                gpip = gip;
+        } else {
+                tlb = vtlb_lookup(vcpu, gip, I_TLB);
+                if (tlb)
+                        gpip = (tlb->ppn >> (tlb->ps - 12) << tlb->ps) |
+                                (gip & (PSIZE(tlb->ps) - 1));
+        }
+        /*
+         * NOTE(review): gpip == 0 doubles as "not found", so a guest
+         * physical IP of 0 takes the VHPT path below -- confirm intended.
+         */
+        if (gpip) {
+                maddr = kvm_gpa_to_mpa(gpip);
+        } else {
+                tlb = vhpt_lookup(gip);
+                if (tlb == NULL) {
+                        ia64_ptcl(gip, ARCH_PAGE_SHIFT << 2);
+                        return IA64_FAULT;
+                }
+                /* The machine VHPT entry already holds a machine frame. */
+                maddr = (tlb->ppn >> (tlb->ps - 12) << tlb->ps)
+                                        | (gip & (PSIZE(tlb->ps) - 1));
+        }
+        vpa = (u64 *)__kvm_va(maddr);
+        /* A bundle is read as two consecutive 64-bit words. */
+        pbundle->i64[0] = *vpa++;
+        pbundle->i64[1] = *vpa;
+        return IA64_NO_FAULT;
+}
+/*
+ * Set up the vcpu's VHPT control block with VHPT_NUM_ENTRIES entries and
+ * size VHPT_SHIFT, then load the resulting value into the PTA register.
+ */
+void kvm_init_vhpt(struct kvm_vcpu *v)
+{
+        v->arch.vhpt.num = VHPT_NUM_ENTRIES;
+        thash_init(&v->arch.vhpt, VHPT_SHIFT);
+        ia64_set_pta(v->arch.vhpt.pta.val);
+        /*Enable VHPT here?*/
+}
+/*
+ * Set up the vcpu's software VTLB control block with VTLB_NUM_ENTRIES
+ * entries and size VTLB_SHIFT.
+ */
+void kvm_init_vtlb(struct kvm_vcpu *v)
+{
+        v->arch.vtlb.num = VTLB_NUM_ENTRIES;
+        thash_init(&v->arch.vtlb, VTLB_SHIFT);
+}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 20f45a8b87e..4e40c122bf2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -803,3 +803,4 @@ config PPC_CLOCK
 config PPC_LIB_RHEAP
        bool
+source "arch/powerpc/kvm/Kconfig"
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index a86d8d85321..807a2dce626 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -151,6 +151,9 @@ config BOOTX_TEXT
 config PPC_EARLY_DEBUG
        bool "Early debugging (dangerous)"
+        # PPC_EARLY_DEBUG on 440 leaves AS=1 mappings above the TLB high water
+        # mark, which doesn't work with current 440 KVM.
+        depends on !KVM
        help
          Say Y to enable some early debugging facilities that may be available
          for your processor/board combination. Those facilities are hacks
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index e2ec4a91cce..9dcdc036cdf 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -145,6 +145,7 @@ core-y				+= arch/powerpc/kernel/ \
                                   arch/powerpc/platforms/
 core-$(CONFIG_MATH_EMULATION)   += arch/powerpc/math-emu/
 core-$(CONFIG_XMON)             += arch/powerpc/xmon/
+core-$(CONFIG_KVM)              += arch/powerpc/kvm/
 drivers-$(CONFIG_OPROFILE)      += arch/powerpc/oprofile/
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index adf1d09d726..62134845af0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -23,6 +23,9 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/hrtimer.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
 #ifdef CONFIG_PPC64
 #include <linux/time.h>
 #include <linux/hardirq.h>
@@ -324,5 +327,30 @@ int main(void)
        DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
+#ifdef CONFIG_KVM
+        DEFINE(TLBE_BYTES, sizeof(struct tlbe));
+        DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
+        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
+        DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
+        DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
+        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
+        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+        DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+        DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+        DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+        DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
+        DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
+        DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
+        DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
+        DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
+        DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
+        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
+        DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+#endif
        return 0;
 }
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
new file mode 100644
index 00000000000..f5d7a5eab96
--- /dev/null
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -0,0 +1,224 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/mmu-44x.h>
+#include <asm/kvm_ppc.h>
+#include "44x_tlb.h"
+#define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
+#define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW)
+static unsigned int kvmppc_tlb_44x_pos;
+/*
+ * Build the attribute word of a shadow TLB entry from the guest attribute
+ * bits in @attrib.  @usermode is non-zero when the guest runs in user mode.
+ */
+static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
+{
+        /* Keep only the permission and attribute bits. */
+        u32 shadow = attrib & (PPC44x_TLB_PERM_MASK|PPC44x_TLB_ATTR_MASK);
+        if (!usermode) {
+                /* Guest supervisor permissions are expressed as user
+                 * permissions in the shadow entry. */
+                shadow &= ~PPC44x_TLB_USER_PERM_MASK;
+                shadow |= (shadow & PPC44x_TLB_SUPER_PERM_MASK) << 3;
+        }
+        /* The host must always be able to access this memory. */
+        return shadow | PPC44x_TLB_SUPER_PERM_MASK;
+}
+/* Search the guest TLB for a valid entry that covers @eaddr in address
+ * space @as and whose TID is either zero or equal to @pid.  Returns the
+ * index of the first match, or -1 when there is none. */
+int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
+                         unsigned int as)
+{
+        int i;
+        /* XXX Replace loop with fancy data structures. */
+        for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+                struct tlbe *entry = &vcpu->arch.guest_tlb[i];
+                unsigned int tid;
+                if (eaddr < get_tlb_eaddr(entry) || eaddr > get_tlb_end(entry))
+                        continue;
+                tid = get_tlb_tid(entry);
+                /* A TID of zero matches every PID. */
+                if (tid && (tid != pid))
+                        continue;
+                if (!get_tlb_v(entry) || get_tlb_ts(entry) != as)
+                        continue;
+                return i;
+        }
+        return -1;
+}
+/* Look up @eaddr in the guest TLB for an instruction access: the address
+ * space comes from MSR_IS and the PID from the vcpu.  Returns the matching
+ * guest TLB entry or NULL. */
+struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+        unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+        /* Signed, to match kvmppc_44x_tlb_index() and its -1 sentinel. */
+        int index;
+        index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
+        if (index == -1)
+                return NULL;
+        return &vcpu->arch.guest_tlb[index];
+}
+/* Look up @eaddr in the guest TLB for a data access: the address space
+ * comes from MSR_DS and the PID from the vcpu.  Returns the matching guest
+ * TLB entry or NULL. */
+struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+        unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+        /* Signed, to match kvmppc_44x_tlb_index() and its -1 sentinel. */
+        int index;
+        index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
+        if (index == -1)
+                return NULL;
+        return &vcpu->arch.guest_tlb[index];
+}
+/* Non-zero when word2 of @tlbe grants supervisor or user write access. */
+static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
+{
+        return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
+}
+/* Must be called with mmap_sem locked for writing.
+ *
+ * Drop the page behind shadow TLB slot @index.  The page is released as
+ * dirty when the shadow entry is writable and as clean otherwise; an
+ * invalid entry releases nothing.
+ * NOTE(review): kunmap() runs even when the entry is not valid -- confirm
+ * that shadow_pages[index] is always safe to pass here. */
+static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
+                                      unsigned int index)
+{
+        struct tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
+        struct page *page = vcpu->arch.shadow_pages[index];
+        kunmap(page);
+        if (!get_tlb_v(stlbe))
+                return;
+        if (kvmppc_44x_tlbe_is_writable(stlbe))
+                kvm_release_page_dirty(page);
+        else
+                kvm_release_page_clean(page);
+}
+/* Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB.
+ *
+ * Map guest frame @gfn at guest virtual address @gvaddr in the next
+ * round-robin shadow TLB slot, replacing whatever that slot held.
+ *
+ * @asid:  only the low 8 bits are used, as the entry's TID
+ * @flags: guest attribute bits, converted for the shadow entry
+ *
+ * If the guest page cannot be obtained, an error is logged and the shadow
+ * TLB is left unchanged; the victim slot counter has still advanced. */
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
+                    u32 flags)
+{
+        struct page *new_page;
+        struct tlbe *stlbe;
+        hpa_t hpaddr;
+        unsigned int victim;
+        /* Future optimization: don't overwrite the TLB entry containing the
+         * current PC (or stack?). */
+        victim = kvmppc_tlb_44x_pos++;
+        if (kvmppc_tlb_44x_pos > tlb_44x_hwater)
+                kvmppc_tlb_44x_pos = 0;
+        stlbe = &vcpu->arch.shadow_tlb[victim];
+        /* Get reference to new page. */
+        down_write(&current->mm->mmap_sem);
+        new_page = gfn_to_page(vcpu->kvm, gfn);
+        if (is_error_page(new_page)) {
+                printk(KERN_ERR "Couldn't get guest page!\n");
+                kvm_release_page_clean(new_page);
+                /* Do not leave mmap_sem held on the failure path. */
+                up_write(&current->mm->mmap_sem);
+                return;
+        }
+        hpaddr = page_to_phys(new_page);
+        /* Drop reference to old page. */
+        kvmppc_44x_shadow_release(vcpu, victim);
+        up_write(&current->mm->mmap_sem);
+        vcpu->arch.shadow_pages[victim] = new_page;
+        /* XXX Make sure (va, size) doesn't overlap any other
+         * entries. 440x6 user manual says the result would be
+         * "undefined." */
+        /* XXX what about AS? */
+        stlbe->tid = asid & 0xff;
+        /* Force TS=1 for all guest mappings. */
+        /* For now we hardcode 4KB mappings, but it will be important to
+         * use host large pages in the future. */
+        stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
+                       | PPC44x_TLB_4K;
+        /* Low word of the host address plus address bits 32..35. */
+        stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
+        stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
+                                                    vcpu->arch.msr & MSR_PR);
+}
+/* Invalidate every valid shadow TLB entry that covers @eaddr and whose TID
+ * is either zero or equal to the low 8 bits of @asid.  Each matching entry
+ * has its page released and its word0 cleared.  Takes mmap_sem for writing
+ * around the scan. */
+void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, u64 eaddr, u64 asid)
+{
+        unsigned int pid = asid & 0xff;
+        int i;
+        /* XXX Replace loop with fancy data structures. */
+        down_write(&current->mm->mmap_sem);
+        for (i = 0; i <= tlb_44x_hwater; i++) {
+                struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+                unsigned int tid;
+                if (!get_tlb_v(stlbe))
+                        continue;
+                if (eaddr < get_tlb_eaddr(stlbe))
+                        continue;
+                if (eaddr > get_tlb_end(stlbe))
+                        continue;
+                tid = get_tlb_tid(stlbe);
+                if (tid && (tid != pid))
+                        continue;
+                /* Release while the entry is still marked valid. */
+                kvmppc_44x_shadow_release(vcpu, i);
+                stlbe->word0 = 0;
+        }
+        up_write(&current->mm->mmap_sem);
+}
+/* Invalidate all mappings, so that when they fault back in they will get the
+ * proper permission bits.
+ *
+ * Every shadow slot from 0 through tlb_44x_hwater is released and has its
+ * word0 cleared, under mmap_sem held for writing.  @usermode is not used by
+ * the current implementation. */
+void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+{
+        int i;
+        /* XXX Replace loop with fancy data structures. */
+        down_write(&current->mm->mmap_sem);
+        for (i = 0; i <= tlb_44x_hwater; i++) {
+                kvmppc_44x_shadow_release(vcpu, i);
+                vcpu->arch.shadow_tlb[i].word0 = 0;
+        }
+        up_write(&current->mm->mmap_sem);
+}
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
new file mode 100644
index 00000000000..2ccd46b6f6b
--- /dev/null
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -0,0 +1,91 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#ifndef __KVM_POWERPC_TLB_H__
+#define __KVM_POWERPC_TLB_H__
+#include <linux/kvm_host.h>
+#include <asm/mmu-44x.h>
+extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                unsigned int pid, unsigned int as);
+extern struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
+/* TLB helper functions */
+/* Return the 4-bit size field held in bits 4..7 of word0. */
+static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+{
+        return (tlbe->word0 >> 4) & 0xf;
+}
+/* Return the entry's effective base address: word0 with its low 10 bits
+ * cleared. */
+static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+{
+        return tlbe->word0 & 0xfffffc00;
+}
+/* Return the mapping size in bytes: 1KB scaled by 4 for every step of the
+ * size field.
+ * NOTE(review): the shift is done in int, so a size field above 10 would
+ * shift past bit 30 -- confirm such encodings never reach this helper. */
+static inline gva_t get_tlb_bytes(const struct tlbe *tlbe)
+{
+        unsigned int pgsize = get_tlb_size(tlbe);
+        return 1 << 10 << (pgsize << 1);
+}
+/* Return the last effective address covered by the entry (inclusive). */
+static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+{
+        return get_tlb_eaddr(tlbe) + get_tlb_bytes(tlbe) - 1;
+}
+/* Return the real address from word1: its low 4 bits become address bits
+ * 32..35 and its bits 10..31 are kept in place. */
+static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+{
+        /* Widen first so the shift by 32 is done in 64 bits. */
+        u64 word1 = tlbe->word1;
+        return ((word1 & 0xf) << 32) | (word1 & 0xfffffc00);
+}
+/* Return the low 8 bits of the entry's tid field. */
+static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+{
+        return tlbe->tid & 0xff;
+}
+/* Return bit 8 of word0, which callers compare against an address space. */
+static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+{
+        return (tlbe->word0 >> 8) & 0x1;
+}
+/* Return bit 9 of word0, which callers treat as the entry's valid flag. */
+static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+{
+        return (tlbe->word0 >> 9) & 0x1;
+}
+/* Return the low 8 bits of the vcpu's mmucr value. */
+static inline unsigned int get_mmucr_stid(const struct kvm_vcpu *vcpu)
+{
+        return vcpu->arch.mmucr & 0xff;
+}
+/* Return bit 16 of the vcpu's mmucr value. */
+static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu)
+{
+        return (vcpu->arch.mmucr >> 16) & 0x1;
+}
+/* Translate @eaddr through @tlbe: the entry's real address combined with
+ * the offset of @eaddr inside the mapping.  The caller is expected to have
+ * checked that the entry covers @eaddr. */
+static inline gpa_t tlb_xlate(struct tlbe *tlbe, gva_t eaddr)
+{
+        unsigned int pgmask = get_tlb_bytes(tlbe) - 1;
+        return get_tlb_raddr(tlbe) | (eaddr & pgmask);
+}
+#endif /* __KVM_POWERPC_TLB_H__ */
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
new file mode 100644
index 00000000000..6b076010213
--- /dev/null
+++ b/arch/powerpc/kvm/Kconfig
@@ -0,0 +1,42 @@
+#
+# KVM configuration
+#
+menuconfig VIRTUALIZATION
+        bool "Virtualization"
+        ---help---
+          Say Y here to get to see options for using your Linux host to run
+          other operating systems inside virtual machines (guests).
+          This option alone does not add any kernel code.
+          If you say N, all options in this submenu will be skipped and
+          disabled.
+if VIRTUALIZATION
+config KVM
+        bool "Kernel-based Virtual Machine (KVM) support"
+        depends on 44x && EXPERIMENTAL
+        select PREEMPT_NOTIFIERS
+        select ANON_INODES
+        # We can only run on Book E hosts so far
+        select KVM_BOOKE_HOST
+        ---help---
+          Support hosting virtualized guest machines. You will also
+          need to select one or more of the processor modules below.
+          This module provides access to the hardware capabilities through
+          a character device node named /dev/kvm.
+          If unsure, say N.
+# KVM selects this symbol, so it is enabled whenever KVM is enabled; the
+# prompt below cannot switch it off on its own.
+config KVM_BOOKE_HOST
+        bool "KVM host support for Book E PowerPC processors"
+        depends on KVM && 44x
+        ---help---
+          Provides host support for KVM on Book E PowerPC processors. Currently
+          this works on 440 processors only.
+# The virtio options are included inside the "if VIRTUALIZATION" block, so
+# they are only offered when VIRTUALIZATION is enabled.
+source drivers/virtio/Kconfig
+endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
new file mode 100644
index 00000000000..d0d358d367e
--- /dev/null
+++ b/arch/powerpc/kvm/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+# Add the generic KVM directory and this directory to the include path.
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
+# Objects taken from the generic KVM code under virt/kvm/.
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o)
+# kvm.o is the generic objects plus the three PowerPC objects listed here.
+kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
+obj-$(CONFIG_KVM) += kvm.o
+# Extra include path for assembling booke_interrupts.S.
+# NOTE(review): presumably needed for a header generated in $(obj) -- confirm.
+AFLAGS_booke_interrupts.o := -I$(obj)
+# kvm-booke-host.o is built from the three objects listed here.
+kvm-booke-host-objs := booke_host.o booke_interrupts.o 44x_tlb.o
+obj-$(CONFIG_KVM_BOOKE_HOST) += kvm-booke-host.o
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
new file mode 100644
index 00000000000..6d9884a6884
--- /dev/null
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -0,0 +1,615 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <asm/cputable.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include "44x_tlb.h"
+/* Each macro expands to two initializer values for one statistics entry:
+ * the offset of counter x inside struct kvm / struct kvm_vcpu, followed by
+ * the matching KVM_STAT_VM / KVM_STAT_VCPU kind. */
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+/* Table of named per-vcpu exit counters, terminated by a NULL name. Only
+ * VCPU_STAT entries are listed; VM_STAT is not used in this table. */
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+        { "exits",      VCPU_STAT(sum_exits) },
+        { "mmio",       VCPU_STAT(mmio_exits) },
+        { "dcr",        VCPU_STAT(dcr_exits) },
+        { "sig",        VCPU_STAT(signal_exits) },
+        { "light",      VCPU_STAT(light_exits) },
+        { "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
+        { "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
+        { "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
+        { "dtlb_v",     VCPU_STAT(dtlb_virt_miss_exits) },
+        { "sysc",       VCPU_STAT(syscall_exits) },
+        { "isi",        VCPU_STAT(isi_exits) },
+        { "dsi",        VCPU_STAT(dsi_exits) },
+        { "inst_emu",   VCPU_STAT(emulated_inst_exits) },
+        { "dec",        VCPU_STAT(dec_exits) },
+        { "ext_intr",   VCPU_STAT(ext_intr_exits) },
+        { NULL }
+};
+/* Indexed by BOOKE_INTERRUPT_* number: the guest MSR bits that survive
+ * delivery of that interrupt. kvmppc_deliver_interrupt() ANDs the guest
+ * MSR with the entry for the interrupt being delivered, so every bit not
+ * listed is cleared. */
+static const u32 interrupt_msr_mask[16] = {
+        [BOOKE_INTERRUPT_CRITICAL]      = MSR_ME,
+        [BOOKE_INTERRUPT_MACHINE_CHECK] = 0,
+        [BOOKE_INTERRUPT_DATA_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_INST_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_EXTERNAL]      = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_ALIGNMENT]     = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_PROGRAM]       = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_FP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_SYSCALL]       = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_AP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_DECREMENTER]   = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_FIT]           = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_WATCHDOG]      = MSR_ME,
+        [BOOKE_INTERRUPT_DTLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_ITLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
+        [BOOKE_INTERRUPT_DEBUG]         = MSR_ME,
+};
+/* Maps a BOOKE_INTERRUPT_* number to its priority value (0..15). This is
+ * the exact inverse of priority_exception[].
+ * NOTE(review): presumably used by the queue/clear exception helpers to
+ * choose the bit in pending_exceptions -- they are defined elsewhere;
+ * confirm. */
+const unsigned char exception_priority[] = {
+        [BOOKE_INTERRUPT_DATA_STORAGE] = 0,
+        [BOOKE_INTERRUPT_INST_STORAGE] = 1,
+        [BOOKE_INTERRUPT_ALIGNMENT] = 2,
+        [BOOKE_INTERRUPT_PROGRAM] = 3,
+        [BOOKE_INTERRUPT_FP_UNAVAIL] = 4,
+        [BOOKE_INTERRUPT_SYSCALL] = 5,
+        [BOOKE_INTERRUPT_AP_UNAVAIL] = 6,
+        [BOOKE_INTERRUPT_DTLB_MISS] = 7,
+        [BOOKE_INTERRUPT_ITLB_MISS] = 8,
+        [BOOKE_INTERRUPT_MACHINE_CHECK] = 9,
+        [BOOKE_INTERRUPT_DEBUG] = 10,
+        [BOOKE_INTERRUPT_CRITICAL] = 11,
+        [BOOKE_INTERRUPT_WATCHDOG] = 12,
+        [BOOKE_INTERRUPT_EXTERNAL] = 13,
+        [BOOKE_INTERRUPT_FIT] = 14,
+        [BOOKE_INTERRUPT_DECREMENTER] = 15,
+};
+/* Maps a priority value (the array index, 0..15) back to the
+ * BOOKE_INTERRUPT_* number. kvmppc_check_and_deliver_interrupts() indexes
+ * this table with a bit position found in pending_exceptions, scanning
+ * from bit 0 upwards, so earlier entries are considered first. */
+const unsigned char priority_exception[] = {
+        BOOKE_INTERRUPT_DATA_STORAGE,
+        BOOKE_INTERRUPT_INST_STORAGE,
+        BOOKE_INTERRUPT_ALIGNMENT,
+        BOOKE_INTERRUPT_PROGRAM,
+        BOOKE_INTERRUPT_FP_UNAVAIL,
+        BOOKE_INTERRUPT_SYSCALL,
+        BOOKE_INTERRUPT_AP_UNAVAIL,
+        BOOKE_INTERRUPT_DTLB_MISS,
+        BOOKE_INTERRUPT_ITLB_MISS,
+        BOOKE_INTERRUPT_MACHINE_CHECK,
+        BOOKE_INTERRUPT_DEBUG,
+        BOOKE_INTERRUPT_CRITICAL,
+        BOOKE_INTERRUPT_WATCHDOG,
+        BOOKE_INTERRUPT_EXTERNAL,
+        BOOKE_INTERRUPT_FIT,
+        BOOKE_INTERRUPT_DECREMENTER,
+};
+/* Print every valid entry of one PPC44x_TLB_SIZE-entry TLB array, one row
+ * per entry, prefixing the index with 'tag'. The row layout lines up with
+ * the column header printed by kvmppc_dump_tlbs(). */
+static void kvmppc_dump_tlb_array(const struct tlbe *tlb, char tag)
+{
+        int i;
+        for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+                const struct tlbe *tlbe = &tlb[i];
+                if (!(tlbe->word0 & PPC44x_TLB_VALID))
+                        continue;
+                printk(" %c%2d |  %02X | %08X | %08X | %08X |\n",
+                       tag, i, tlbe->tid, tlbe->word0, tlbe->word1,
+                       tlbe->word2);
+        }
+}
+/*
+ * Dump the valid entries of a vcpu's guest TLB (rows tagged 'G') followed
+ * by its shadow TLB (rows tagged 'S') to the kernel log.
+ *
+ * Both tables use the same row format; the shadow rows used to be one
+ * space narrower in the tid column and did not line up with the header.
+ */
+void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
+{
+        printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
+        printk("| %2s | %3s | %8s | %8s | %8s |\n",
+                        "nr", "tid", "word0", "word1", "word2");
+        kvmppc_dump_tlb_array(vcpu->arch.guest_tlb, 'G');
+        kvmppc_dump_tlb_array(vcpu->arch.shadow_tlb, 'S');
+}
+/* TODO: use vcpu_printf() */
+/* Log the guest's pc, msr, lr, ctr, srr0/srr1, pending-exception bitmap
+ * and all 32 GPRs (four per line) with printk. */
+void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
+{
+        int first = 0;
+        printk("pc:   %08x msr:  %08x\n", vcpu->arch.pc, vcpu->arch.msr);
+        printk("lr:   %08x ctr:  %08x\n", vcpu->arch.lr, vcpu->arch.ctr);
+        printk("srr0: %08x srr1: %08x\n", vcpu->arch.srr0, vcpu->arch.srr1);
+        printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
+        /* 'first' is the number of the first register on each row. */
+        while (first < 32) {
+                printk("gpr%02d: %08x %08x %08x %08x\n", first,
+                       vcpu->arch.gpr[first],
+                       vcpu->arch.gpr[first + 1],
+                       vcpu->arch.gpr[first + 2],
+                       vcpu->arch.gpr[first + 3]);
+                first += 4;
+        }
+}
+/* Check if we are ready to deliver the interrupt: returns non-zero when
+ * the guest MSR bit that gates this interrupt is set, or when the
+ * interrupt is not gated by any of the MSR bits tested here. */
+static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
+{
+        switch (interrupt) {
+        case BOOKE_INTERRUPT_CRITICAL:
+        case BOOKE_INTERRUPT_WATCHDOG:
+                return vcpu->arch.msr & MSR_CE;
+        case BOOKE_INTERRUPT_MACHINE_CHECK:
+                return vcpu->arch.msr & MSR_ME;
+        case BOOKE_INTERRUPT_EXTERNAL:
+        case BOOKE_INTERRUPT_DECREMENTER:
+        case BOOKE_INTERRUPT_FIT:
+                return vcpu->arch.msr & MSR_EE;
+        case BOOKE_INTERRUPT_DEBUG:
+                return vcpu->arch.msr & MSR_DE;
+        default:
+                /* Everything else can always be delivered. */
+                return 1;
+        }
+}
+/* Deliver 'interrupt' to the guest: save pc/msr into srr0/srr1, point pc
+ * at ivpr | ivor[interrupt] and mask the MSR with the table entry for
+ * this interrupt. A decrementer interrupt also sets TSR_DIS in the
+ * guest's tsr. */
+static void kvmppc_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
+{
+        if (interrupt == BOOKE_INTERRUPT_DECREMENTER)
+                vcpu->arch.tsr |= TSR_DIS;
+        /* Save the interrupted context before pc and msr are overwritten. */
+        vcpu->arch.srr0 = vcpu->arch.pc;
+        vcpu->arch.srr1 = vcpu->arch.msr;
+        vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[interrupt];
+        kvmppc_set_msr(vcpu, vcpu->arch.msr & interrupt_msr_mask[interrupt]);
+}
+/* Check pending exceptions and deliver one, if possible. Pending bits are
+ * scanned from bit 0 upwards; the first one that can currently be
+ * delivered is cleared and delivered, then the scan stops. */
+void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu)
+{
+        unsigned long *pending = &vcpu->arch.pending_exceptions;
+        const unsigned long nbits = BITS_PER_BYTE * sizeof(*pending);
+        unsigned int priority;
+        for (priority = find_first_bit(pending, nbits);
+             priority <= BOOKE_MAX_INTERRUPT;
+             priority = find_next_bit(pending, nbits, priority + 1)) {
+                unsigned int exception = priority_exception[priority];
+                if (!kvmppc_can_deliver_interrupt(vcpu, exception))
+                        continue;
+                kvmppc_clear_exception(vcpu, exception);
+                kvmppc_deliver_interrupt(vcpu, exception);
+                break;
+        }
+}
+/* Emulate the current guest instruction for an MMIO access and translate
+ * the emulation result into a RESUME_* code. Any result other than
+ * DONE, DO_MMIO or FAIL is a BUG(). */
+static int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+        int r;
+        switch (kvmppc_emulate_instruction(run, vcpu)) {
+        case EMULATE_DONE:
+                /* Future optimization: only reload non-volatiles if they were
+                 * actually modified. */
+                r = RESUME_GUEST_NV;
+                break;
+        case EMULATE_DO_MMIO:
+                /* We must reload nonvolatiles because "update" load/store
+                 * instructions modify register state. */
+                /* Future optimization: only reload non-volatiles if they were
+                 * actually modified. */
+                run->exit_reason = KVM_EXIT_MMIO;
+                r = RESUME_HOST_NV;
+                break;
+        case EMULATE_FAIL:
+                /* XXX Deliver Program interrupt to guest. */
+                printk(KERN_EMERG "%s: emulation failed (%08x)\n", __func__,
+                       vcpu->arch.last_inst);
+                r = RESUME_HOST;
+                break;
+        default:
+                BUG();
+        }
+        return r;
+}
+/**
+ * kvmppc_handle_exit - handle one guest exit and decide how to resume
+ * @run: kvm_run structure; exit_reason (and hw.hardware_exit_reason when
+ *       instruction emulation fails) is written here
+ * @vcpu: the vcpu that exited
+ * @exit_nr: BOOKE_INTERRUPT_* number identifying why the guest exited
+ *
+ * Interrupts are enabled while the exit is processed and disabled again
+ * before returning. After the exit-specific handling, at most one pending
+ * guest exception is delivered and the exit counters are updated. If the
+ * guest would otherwise be resumed but a signal is pending for the current
+ * task, the result is changed to an -EINTR exit to the host. An unknown
+ * exit_nr is a BUG().
+ *
+ * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
+ */
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+        enum emulation_result er;
+        int r = RESUME_HOST;
+        local_irq_enable();
+        run->exit_reason = KVM_EXIT_UNKNOWN;
+        run->ready_for_interrupt_injection = 1;
+        switch (exit_nr) {
+        case BOOKE_INTERRUPT_MACHINE_CHECK:
+                printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
+                kvmppc_dump_vcpu(vcpu);
+                r = RESUME_HOST;
+                break;
+        case BOOKE_INTERRUPT_EXTERNAL:
+        case BOOKE_INTERRUPT_DECREMENTER:
+                /* Since we switched IVPR back to the host's value, the host
+                 * handled this interrupt the moment we enabled interrupts.
+                 * Now we just offer it a chance to reschedule the guest. */
+                /* XXX At this point the TLB still holds our shadow TLB, so if
+                 * we do reschedule the host will fault over it. Perhaps we
+                 * should politely restore the host's entries to minimize
+                 * misses before ceding control. */
+                if (need_resched())
+                        cond_resched();
+                if (exit_nr == BOOKE_INTERRUPT_DECREMENTER)
+                        vcpu->stat.dec_exits++;
+                else
+                        vcpu->stat.ext_intr_exits++;
+                r = RESUME_GUEST;
+                break;
+        case BOOKE_INTERRUPT_PROGRAM:
+                if (vcpu->arch.msr & MSR_PR) {
+                        /* Program traps generated by user-level software must be handled
+                         * by the guest kernel. */
+                        vcpu->arch.esr = vcpu->arch.fault_esr;
+                        kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+                        r = RESUME_GUEST;
+                        break;
+                }
+                er = kvmppc_emulate_instruction(run, vcpu);
+                switch (er) {
+                case EMULATE_DONE:
+                        /* Future optimization: only reload non-volatiles if
+                         * they were actually modified by emulation. */
+                        vcpu->stat.emulated_inst_exits++;
+                        r = RESUME_GUEST_NV;
+                        break;
+                case EMULATE_DO_DCR:
+                        run->exit_reason = KVM_EXIT_DCR;
+                        r = RESUME_HOST;
+                        break;
+                case EMULATE_FAIL:
+                        /* XXX Deliver Program interrupt to guest. */
+                        printk(KERN_CRIT "%s: emulation at %x failed (%08x)\n",
+                               __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+                        /* For debugging, encode the failing instruction and
+                         * report it to userspace. */
+                        run->hw.hardware_exit_reason = ~0ULL << 32;
+                        run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+                        r = RESUME_HOST;
+                        break;
+                default:
+                        BUG();
+                }
+                break;
+        case BOOKE_INTERRUPT_DATA_STORAGE:
+                vcpu->arch.dear = vcpu->arch.fault_dear;
+                vcpu->arch.esr = vcpu->arch.fault_esr;
+                kvmppc_queue_exception(vcpu, exit_nr);
+                vcpu->stat.dsi_exits++;
+                r = RESUME_GUEST;
+                break;
+        case BOOKE_INTERRUPT_INST_STORAGE:
+                vcpu->arch.esr = vcpu->arch.fault_esr;
+                kvmppc_queue_exception(vcpu, exit_nr);
+                vcpu->stat.isi_exits++;
+                r = RESUME_GUEST;
+                break;
+        case BOOKE_INTERRUPT_SYSCALL:
+                kvmppc_queue_exception(vcpu, exit_nr);
+                vcpu->stat.syscall_exits++;
+                r = RESUME_GUEST;
+                break;
+        case BOOKE_INTERRUPT_DTLB_MISS: {
+                struct tlbe *gtlbe;
+                unsigned long eaddr = vcpu->arch.fault_dear;
+                gfn_t gfn;
+                /* Check the guest TLB. */
+                gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr);
+                if (!gtlbe) {
+                        /* The guest didn't have a mapping for it. */
+                        kvmppc_queue_exception(vcpu, exit_nr);
+                        vcpu->arch.dear = vcpu->arch.fault_dear;
+                        vcpu->arch.esr = vcpu->arch.fault_esr;
+                        vcpu->stat.dtlb_real_miss_exits++;
+                        r = RESUME_GUEST;
+                        break;
+                }
+                vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
+                gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
+                if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+                        /* The guest TLB had a mapping, but the shadow TLB
+                         * didn't, and it is RAM. This could be because:
+                         * a) the entry is mapping the host kernel, or
+                         * b) the guest used a large mapping which we're faking
+                         * Either way, we need to satisfy the fault without
+                         * invoking the guest. */
+                        kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
+                                       gtlbe->word2);
+                        vcpu->stat.dtlb_virt_miss_exits++;
+                        r = RESUME_GUEST;
+                } else {
+                        /* Guest has mapped and accessed a page which is not
+                         * actually RAM. */
+                        r = kvmppc_emulate_mmio(run, vcpu);
+                }
+                break;
+        }
+        case BOOKE_INTERRUPT_ITLB_MISS: {
+                struct tlbe *gtlbe;
+                unsigned long eaddr = vcpu->arch.pc;
+                gfn_t gfn;
+                r = RESUME_GUEST;
+                /* Check the guest TLB. */
+                gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr);
+                if (!gtlbe) {
+                        /* The guest didn't have a mapping for it. */
+                        kvmppc_queue_exception(vcpu, exit_nr);
+                        vcpu->stat.itlb_real_miss_exits++;
+                        break;
+                }
+                vcpu->stat.itlb_virt_miss_exits++;
+                gfn = tlb_xlate(gtlbe, eaddr) >> PAGE_SHIFT;
+                if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+                        /* The guest TLB had a mapping, but the shadow TLB
+                         * didn't. This could be because:
+                         * a) the entry is mapping the host kernel, or
+                         * b) the guest used a large mapping which we're faking
+                         * Either way, we need to satisfy the fault without
+                         * invoking the guest. */
+                        kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
+                                       gtlbe->word2);
+                } else {
+                        /* Guest mapped and leaped at non-RAM! */
+                        kvmppc_queue_exception(vcpu,
+                                               BOOKE_INTERRUPT_MACHINE_CHECK);
+                }
+                break;
+        }
+        default:
+                printk(KERN_EMERG "exit_nr %d\n", exit_nr);
+                BUG();
+        }
+        local_irq_disable();
+        kvmppc_check_and_deliver_interrupts(vcpu);
+        /* Do some exit accounting. */
+        vcpu->stat.sum_exits++;
+        if (!(r & RESUME_HOST)) {
+                /* To avoid clobbering exit_reason, only check for signals if
+                 * we aren't already exiting to userspace for some other
+                 * reason. */
+                if (signal_pending(current)) {
+                        run->exit_reason = KVM_EXIT_INTR;
+                        r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+                        vcpu->stat.signal_exits++;
+                } else {
+                        vcpu->stat.light_exits++;
+                }
+        } else {
+                switch (run->exit_reason) {
+                case KVM_EXIT_MMIO:
+                        vcpu->stat.mmio_exits++;
+                        break;
+                case KVM_EXIT_DCR:
+                        vcpu->stat.dcr_exits++;
+                        break;
+                case KVM_EXIT_INTR:
+                        vcpu->stat.signal_exits++;
+                        break;
+                }
+        }
+        return r;
+}
+/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+        struct tlbe *first = &vcpu->arch.guest_tlb[0];
+        struct tlbe *second = &vcpu->arch.guest_tlb[1];
+        /* Guest TLB entry 0: the 16MB identity mapping at address 0. */
+        first->tid = 0;
+        first->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
+        first->word1 = 0;
+        first->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
+        /* Guest TLB entry 1: a 4K identity mapping at 0xef600000 with the
+         * I and G attributes set.
+         * NOTE(review): presumably a device register page -- confirm. */
+        second->tid = 0;
+        second->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
+        second->word1 = 0xef600000;
+        second->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
+                        | PPC44x_TLB_I | PPC44x_TLB_G;
+        vcpu->arch.pc = 0;
+        vcpu->arch.msr = 0;
+        vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
+        /* Eye-catching number so we know if the guest takes an interrupt
+         * before it's programmed its own IVPR. */
+        vcpu->arch.ivpr = 0x55550000;
+        /* Since the guest can directly access the timebase, it must know the
+         * real timebase frequency. Accordingly, it must see the state of
+         * CCR1[TCS]. */
+        vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
+        return 0;
+}
+/*
+ * Copy the vcpu's register state (pc, cr, ctr, lr, xer, msr, srr0/1, pid,
+ * sprg0-7 and the GPRs) into *regs. Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        int i;
+        regs->pc = vcpu->arch.pc;
+        regs->cr = vcpu->arch.cr;
+        regs->ctr = vcpu->arch.ctr;
+        regs->lr = vcpu->arch.lr;
+        regs->xer = vcpu->arch.xer;
+        regs->msr = vcpu->arch.msr;
+        regs->srr0 = vcpu->arch.srr0;
+        regs->srr1 = vcpu->arch.srr1;
+        regs->pid = vcpu->arch.pid;
+        regs->sprg0 = vcpu->arch.sprg0;
+        regs->sprg1 = vcpu->arch.sprg1;
+        regs->sprg2 = vcpu->arch.sprg2;
+        regs->sprg3 = vcpu->arch.sprg3;
+        /* SPRG4-7 are copied to the same-numbered fields, like SPRG0-3.
+         * They used to be shifted by one, which left regs->sprg4 unset
+         * and never reported the vcpu's sprg7. */
+        regs->sprg4 = vcpu->arch.sprg4;
+        regs->sprg5 = vcpu->arch.sprg5;
+        regs->sprg6 = vcpu->arch.sprg6;
+        regs->sprg7 = vcpu->arch.sprg7;
+        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+                regs->gpr[i] = vcpu->arch.gpr[i];
+        return 0;
+}
+/*
+ * Load the vcpu's register state (pc, cr, ctr, lr, xer, msr, srr0/1,
+ * sprg0-7 and the GPRs) from *regs. Always returns 0.
+ *
+ * NOTE(review): regs->pid is reported by kvm_arch_vcpu_ioctl_get_regs()
+ * but is not applied here -- confirm whether that is intentional.
+ */
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        int i;
+        vcpu->arch.pc = regs->pc;
+        vcpu->arch.cr = regs->cr;
+        vcpu->arch.ctr = regs->ctr;
+        vcpu->arch.lr = regs->lr;
+        vcpu->arch.xer = regs->xer;
+        vcpu->arch.msr = regs->msr;
+        vcpu->arch.srr0 = regs->srr0;
+        vcpu->arch.srr1 = regs->srr1;
+        vcpu->arch.sprg0 = regs->sprg0;
+        vcpu->arch.sprg1 = regs->sprg1;
+        vcpu->arch.sprg2 = regs->sprg2;
+        vcpu->arch.sprg3 = regs->sprg3;
+        /* SPRG4-7 are copied from the same-numbered fields, like SPRG0-3.
+         * They used to be shifted by one, which never restored the
+         * vcpu's sprg4 and ignored regs->sprg7. */
+        vcpu->arch.sprg4 = regs->sprg4;
+        vcpu->arch.sprg5 = regs->sprg5;
+        vcpu->arch.sprg6 = regs->sprg6;
+        vcpu->arch.sprg7 = regs->sprg7;
+        for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
+                vcpu->arch.gpr[i] = regs->gpr[i];
+        return 0;
+}
+/* Not implemented: ignores its arguments and always returns -ENOTSUPP. */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+        return -ENOTSUPP;
+}
+/* Not implemented: ignores its arguments and always returns -ENOTSUPP. */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+        return -ENOTSUPP;
+}
+/* Not implemented: ignores its arguments and always returns -ENOTSUPP. */
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        return -ENOTSUPP;
+}
+/* Not implemented: ignores its arguments and always returns -ENOTSUPP. */
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        return -ENOTSUPP;
+}
+/* 'linear_address' is actually an encoding of AS|PID|EADDR .
+ * EADDR is the value converted to gva_t, PID is bits 32-39 and AS is
+ * bit 40. On a guest TLB hit, physical_address and valid = 1 are stored in
+ * *tr; on a miss only valid = 0 is stored. Always returns 0. */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+        const gva_t eaddr = tr->linear_address;
+        const u8 pid = (tr->linear_address >> 32) & 0xff;
+        const u8 as = (tr->linear_address >> 40) & 0x1;
+        const int index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
+        if (index == -1) {
+                tr->valid = 0;
+                return 0;
+        }
+        tr->physical_address = tlb_xlate(&vcpu->arch.guest_tlb[index], eaddr);
+        /* XXX what does "writeable" and "usermode" even mean? */
+        tr->valid = 1;
+        return 0;
+}
diff --git a/arch/powerpc/kvm/booke_host.c b/arch/powerpc/kvm/booke_host.c
new file mode 100644
index 00000000000..b480341bc31
--- /dev/null
+++ b/arch/powerpc/kvm/booke_host.c
@@ -0,0 +1,83 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/kvm_ppc.h>
+unsigned long kvmppc_booke_handlers;
+/*
+ * Module init: build KVM's private copy of the exception handlers, laid
+ * out at the same offsets as the host's IVORs, then register with the
+ * generic KVM code.
+ *
+ * Returns 0 on success, -ENOMEM if the handler pages cannot be allocated,
+ * or the error from kvm_init(). In the last case the handler pages are
+ * freed again so that a failed module load does not leak them.
+ */
+static int __init kvmppc_booke_init(void)
+{
+        unsigned long ivor[16];
+        unsigned long max_ivor = 0;
+        int i;
+        int r;
+        /* We install our own exception handlers by hijacking IVPR. IVPR must
+         * be aligned to 2^16 bytes (its low 16 bits are not used), so we
+         * need a 64KB allocation. */
+        kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                                 VCPU_SIZE_ORDER);
+        if (!kvmppc_booke_handlers)
+                return -ENOMEM;
+        /* XXX make sure our handlers are smaller than Linux's */
+        /* Copy our interrupt handlers to match host IVORs. That way we don't
+         * have to swap the IVORs on every guest/host transition. */
+        ivor[0] = mfspr(SPRN_IVOR0);
+        ivor[1] = mfspr(SPRN_IVOR1);
+        ivor[2] = mfspr(SPRN_IVOR2);
+        ivor[3] = mfspr(SPRN_IVOR3);
+        ivor[4] = mfspr(SPRN_IVOR4);
+        ivor[5] = mfspr(SPRN_IVOR5);
+        ivor[6] = mfspr(SPRN_IVOR6);
+        ivor[7] = mfspr(SPRN_IVOR7);
+        ivor[8] = mfspr(SPRN_IVOR8);
+        ivor[9] = mfspr(SPRN_IVOR9);
+        ivor[10] = mfspr(SPRN_IVOR10);
+        ivor[11] = mfspr(SPRN_IVOR11);
+        ivor[12] = mfspr(SPRN_IVOR12);
+        ivor[13] = mfspr(SPRN_IVOR13);
+        ivor[14] = mfspr(SPRN_IVOR14);
+        ivor[15] = mfspr(SPRN_IVOR15);
+        for (i = 0; i < 16; i++) {
+                /* Track the highest offset so the icache flush below
+                 * covers every handler that was copied. */
+                if (ivor[i] > max_ivor)
+                        max_ivor = ivor[i];
+                memcpy((void *)kvmppc_booke_handlers + ivor[i],
+                       kvmppc_handlers_start + i * kvmppc_handler_len,
+                       kvmppc_handler_len);
+        }
+        flush_icache_range(kvmppc_booke_handlers,
+                           kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
+        r = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+        if (r)
+                free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+        return r;
+}
+/*
+ * Module exit: undo kvmppc_booke_init() in reverse order. KVM is
+ * unregistered first so that the handler pages are only freed once
+ * nothing registered through kvm_init() remains.
+ */
+static void __exit kvmppc_booke_exit(void)
+{
+        kvm_exit();
+        free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+}
+module_init(kvmppc_booke_init)
+module_exit(kvmppc_booke_exit)
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
new file mode 100644
index 00000000000..3b653b5309b
--- /dev/null
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -0,0 +1,436 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu-44x.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+/* MSR bits that are ORed into the guest's MSR image before it is placed in
+ * SRR1 for the final rfi into the guest (see the end of lightweight_exit). */
+#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS)
+/* Byte offset of guest GPR n from the vcpu pointer: 4 bytes per register. */
+#define VCPU_GPR(n)     (VCPU_GPRS + (n * 4))
+/* The host stack layout: */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  4
+#define HOST_RUN        8 /* kvm_run pointer passed to __kvmppc_vcpu_run. */
+/* r2 is special: it holds 'current', and it is made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option. */
+#define HOST_R2         12
+#define HOST_NV_GPRS    16 /* Start of the save area for host r14-r31. */
+#define HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * 4))
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + 4)
+#define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */
+#define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
+/* Each NEED_*_MASK has bit (1 << exit number) set for every exit on which
+ * kvmppc_resume_host must capture the named piece of fault state (the
+ * faulting instruction word, DEAR, or ESR respectively). */
+#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_INST_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_PROGRAM) | \
+                       (1<<BOOKE_INTERRUPT_DTLB_MISS))
+/* Emit one first-level exception stub, labelled kvmppc_handler_<ivor_nr>.
+ * On entry SPRG1 holds the vcpu pointer (stored there by lightweight_exit).
+ * The stub parks guest r4 in SPRG0, saves guest r5, r6 and CTR in the vcpu,
+ * loads the exit number into r5 and jumps to kvmppc_resume_host. */
+.macro KVM_HANDLER ivor_nr
+_GLOBAL(kvmppc_handler_\ivor_nr)
+        /* Get pointer to vcpu and record exit number. */
+        mtspr   SPRN_SPRG0, r4
+        mfspr   r4, SPRN_SPRG1
+        stw     r5, VCPU_GPR(r5)(r4)
+        stw     r6, VCPU_GPR(r6)(r4)
+        mfctr   r5
+        /* Build the absolute address of kvmppc_resume_host in r6 and branch
+         * through CTR (presumably because these stubs are copied to another
+         * address at init time, so a relative branch would not land there). */
+        lis     r6, kvmppc_resume_host@h
+        stw     r5, VCPU_CTR(r4)
+        li      r5, \ivor_nr
+        ori     r6, r6, kvmppc_resume_host@l
+        mtctr   r6
+        bctr
+.endm
+/* Start of the stub table. kvmppc_booke_init() copies stub number i to the
+ * offset read from host IVORi, so the order below must follow IVOR0..IVOR15. */
+_GLOBAL(kvmppc_handlers_start)
+KVM_HANDLER BOOKE_INTERRUPT_CRITICAL
+KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK
+KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE
+KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE
+KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL
+KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT
+KVM_HANDLER BOOKE_INTERRUPT_PROGRAM
+KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL
+KVM_HANDLER BOOKE_INTERRUPT_SYSCALL
+KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL
+KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER
+KVM_HANDLER BOOKE_INTERRUPT_FIT
+KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG
+KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS
+KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS
+KVM_HANDLER BOOKE_INTERRUPT_DEBUG
+/* Size in bytes of one stub, measured as the distance between the first two
+ * stub labels (assumes the first two exit numbers expand to 0 and 1 and that
+ * every KVM_HANDLER expansion has the same length). */
+_GLOBAL(kvmppc_handler_len)
+        .long kvmppc_handler_1 - kvmppc_handler_0
+/* Common second-level exit path, reached from every KVM_HANDLER stub.
+ * Saves the remaining guest state into the vcpu, restores enough host state
+ * to run C code, calls kvmppc_handle_exit(), and then either re-enters the
+ * guest (lightweight_exit) or returns to the caller of __kvmppc_vcpu_run
+ * (heavyweight_exit).
+ *
+ * Registers:
+ *  SPRG0: guest r4
+ *  r4: vcpu pointer
+ *  r5: KVM exit number
+ * Guest r5, r6 and CTR were already saved by the stub.
+ */
+_GLOBAL(kvmppc_resume_host)
+        stw     r3, VCPU_GPR(r3)(r4)
+        mfcr    r3
+        stw     r3, VCPU_CR(r4)
+        stw     r7, VCPU_GPR(r7)(r4)
+        stw     r8, VCPU_GPR(r8)(r4)
+        stw     r9, VCPU_GPR(r9)(r4)
+        /* r6 = 1 << exit number, tested against the NEED_*_MASK bitmasks. */
+        li      r6, 1
+        slw     r6, r6, r5
+        /* Save the faulting instruction and all GPRs for emulation. */
+        andi.   r7, r6, NEED_INST_MASK
+        beq     ..skip_inst_copy
+        mfspr   r9, SPRN_SRR0
+        /* Temporarily set MSR[DS] around the load of the word at SRR0, then
+         * put the previous MSR back (presumably so the load is translated
+         * like a guest access; the guest always runs with IS/DS set, see
+         * KVMPPC_MSR_MASK). */
+        mfmsr   r8
+        ori     r7, r8, MSR_DS
+        mtmsr   r7
+        isync
+        lwz     r9, 0(r9)
+        mtmsr   r8
+        isync
+        stw     r9, VCPU_LAST_INST(r4)
+        stw     r15, VCPU_GPR(r15)(r4)
+        stw     r16, VCPU_GPR(r16)(r4)
+        stw     r17, VCPU_GPR(r17)(r4)
+        stw     r18, VCPU_GPR(r18)(r4)
+        stw     r19, VCPU_GPR(r19)(r4)
+        stw     r20, VCPU_GPR(r20)(r4)
+        stw     r21, VCPU_GPR(r21)(r4)
+        stw     r22, VCPU_GPR(r22)(r4)
+        stw     r23, VCPU_GPR(r23)(r4)
+        stw     r24, VCPU_GPR(r24)(r4)
+        stw     r25, VCPU_GPR(r25)(r4)
+        stw     r26, VCPU_GPR(r26)(r4)
+        stw     r27, VCPU_GPR(r27)(r4)
+        stw     r28, VCPU_GPR(r28)(r4)
+        stw     r29, VCPU_GPR(r29)(r4)
+        stw     r30, VCPU_GPR(r30)(r4)
+        stw     r31, VCPU_GPR(r31)(r4)
+..skip_inst_copy:
+        /* Also grab DEAR and ESR before the host can clobber them. */
+        andi.   r7, r6, NEED_DEAR_MASK
+        beq     ..skip_dear
+        mfspr   r9, SPRN_DEAR
+        stw     r9, VCPU_FAULT_DEAR(r4)
+..skip_dear:
+        andi.   r7, r6, NEED_ESR_MASK
+        beq     ..skip_esr
+        mfspr   r9, SPRN_ESR
+        stw     r9, VCPU_FAULT_ESR(r4)
+..skip_esr:
+        /* Save remaining volatile guest register state to vcpu. */
+        stw     r0, VCPU_GPR(r0)(r4)
+        stw     r1, VCPU_GPR(r1)(r4)
+        stw     r2, VCPU_GPR(r2)(r4)
+        stw     r10, VCPU_GPR(r10)(r4)
+        stw     r11, VCPU_GPR(r11)(r4)
+        stw     r12, VCPU_GPR(r12)(r4)
+        stw     r13, VCPU_GPR(r13)(r4)
+        stw     r14, VCPU_GPR(r14)(r4) /* We need a NV GPR below. */
+        mflr    r3
+        stw     r3, VCPU_LR(r4)
+        mfxer   r3
+        stw     r3, VCPU_XER(r4)
+        /* Guest r4 was parked in SPRG0 by the stub; move it into the vcpu. */
+        mfspr   r3, SPRN_SPRG0
+        stw     r3, VCPU_GPR(r4)(r4)
+        /* SRR0 is recorded as the guest PC. */
+        mfspr   r3, SPRN_SRR0
+        stw     r3, VCPU_PC(r4)
+        /* Restore host stack pointer and PID before IVPR, since the host
+         * exception handlers use them. */
+        lwz     r1, VCPU_HOST_STACK(r4)
+        lwz     r3, VCPU_HOST_PID(r4)
+        mtspr   SPRN_PID, r3
+        /* Restore host IVPR before re-enabling interrupts. We cheat and know
+         * that Linux IVPR is always 0xc0000000. */
+        lis     r3, 0xc000
+        mtspr   SPRN_IVPR, r3
+        /* Switch to kernel stack and jump to handler. */
+        /* At the call r3 = kvm_run pointer, r4 = vcpu pointer and r5 still
+         * holds the exit number (presumably the three C arguments). */
+        LOAD_REG_ADDR(r3, kvmppc_handle_exit)
+        mtctr   r3
+        lwz     r3, HOST_RUN(r1)
+        lwz     r2, HOST_R2(r1)
+        mr      r14, r4 /* Save vcpu pointer. */
+        bctrl   /* kvmppc_handle_exit() */
+        /* Restore vcpu pointer and the nonvolatiles we used. */
+        mr      r4, r14
+        lwz     r14, VCPU_GPR(r14)(r4)
+        /* Sometimes instruction emulation must restore complete GPR state. */
+        andi.   r5, r3, RESUME_FLAG_NV
+        beq     ..skip_nv_load
+        lwz     r15, VCPU_GPR(r15)(r4)
+        lwz     r16, VCPU_GPR(r16)(r4)
+        lwz     r17, VCPU_GPR(r17)(r4)
+        lwz     r18, VCPU_GPR(r18)(r4)
+        lwz     r19, VCPU_GPR(r19)(r4)
+        lwz     r20, VCPU_GPR(r20)(r4)
+        lwz     r21, VCPU_GPR(r21)(r4)
+        lwz     r22, VCPU_GPR(r22)(r4)
+        lwz     r23, VCPU_GPR(r23)(r4)
+        lwz     r24, VCPU_GPR(r24)(r4)
+        lwz     r25, VCPU_GPR(r25)(r4)
+        lwz     r26, VCPU_GPR(r26)(r4)
+        lwz     r27, VCPU_GPR(r27)(r4)
+        lwz     r28, VCPU_GPR(r28)(r4)
+        lwz     r29, VCPU_GPR(r29)(r4)
+        lwz     r30, VCPU_GPR(r30)(r4)
+        lwz     r31, VCPU_GPR(r31)(r4)
+..skip_nv_load:
+        /* Should we return to the guest? */
+        /* RESUME_FLAG_HOST clear in the return value means re-enter the guest. */
+        andi.   r5, r3, RESUME_FLAG_HOST
+        beq     lightweight_exit
+        srawi   r3, r3, 2 /* Shift -ERR back down. */
+heavyweight_exit:
+        /* Not returning to guest. */
+        /* We already saved guest volatile register state; now save the
+         * non-volatiles. */
+        stw     r15, VCPU_GPR(r15)(r4)
+        stw     r16, VCPU_GPR(r16)(r4)
+        stw     r17, VCPU_GPR(r17)(r4)
+        stw     r18, VCPU_GPR(r18)(r4)
+        stw     r19, VCPU_GPR(r19)(r4)
+        stw     r20, VCPU_GPR(r20)(r4)
+        stw     r21, VCPU_GPR(r21)(r4)
+        stw     r22, VCPU_GPR(r22)(r4)
+        stw     r23, VCPU_GPR(r23)(r4)
+        stw     r24, VCPU_GPR(r24)(r4)
+        stw     r25, VCPU_GPR(r25)(r4)
+        stw     r26, VCPU_GPR(r26)(r4)
+        stw     r27, VCPU_GPR(r27)(r4)
+        stw     r28, VCPU_GPR(r28)(r4)
+        stw     r29, VCPU_GPR(r29)(r4)
+        stw     r30, VCPU_GPR(r30)(r4)
+        stw     r31, VCPU_GPR(r31)(r4)
+        /* Load host non-volatile register state from host stack. */
+        lwz     r14, HOST_NV_GPR(r14)(r1)
+        lwz     r15, HOST_NV_GPR(r15)(r1)
+        lwz     r16, HOST_NV_GPR(r16)(r1)
+        lwz     r17, HOST_NV_GPR(r17)(r1)
+        lwz     r18, HOST_NV_GPR(r18)(r1)
+        lwz     r19, HOST_NV_GPR(r19)(r1)
+        lwz     r20, HOST_NV_GPR(r20)(r1)
+        lwz     r21, HOST_NV_GPR(r21)(r1)
+        lwz     r22, HOST_NV_GPR(r22)(r1)
+        lwz     r23, HOST_NV_GPR(r23)(r1)
+        lwz     r24, HOST_NV_GPR(r24)(r1)
+        lwz     r25, HOST_NV_GPR(r25)(r1)
+        lwz     r26, HOST_NV_GPR(r26)(r1)
+        lwz     r27, HOST_NV_GPR(r27)(r1)
+        lwz     r28, HOST_NV_GPR(r28)(r1)
+        lwz     r29, HOST_NV_GPR(r29)(r1)
+        lwz     r30, HOST_NV_GPR(r30)(r1)
+        lwz     r31, HOST_NV_GPR(r31)(r1)
+        /* Return to kvm_vcpu_run(). */
+        /* Pop the frame pushed by __kvmppc_vcpu_run and reload the LR that
+         * it stored in the caller's frame. */
+        lwz     r4, HOST_STACK_LR(r1)
+        addi    r1, r1, HOST_STACK_SIZE
+        mtlr    r4
+        /* r3 still contains the return code from kvmppc_handle_exit(). */
+        blr
+/* Enter the guest. Pushes a host stack frame, saves the host's non-volatile
+ * GPRs in it, loads the guest's non-volatile GPRs and falls through into
+ * lightweight_exit, which performs the actual switch and ends in rfi.
+ * Control comes back to the caller through heavyweight_exit in
+ * kvmppc_resume_host.
+ *
+ * Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+        stwu    r1, -HOST_STACK_SIZE(r1)
+        stw     r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */
+        /* Save host state to stack. */
+        stw     r3, HOST_RUN(r1)
+        mflr    r3
+        stw     r3, HOST_STACK_LR(r1)
+        /* Save host non-volatile register state to stack. */
+        stw     r14, HOST_NV_GPR(r14)(r1)
+        stw     r15, HOST_NV_GPR(r15)(r1)
+        stw     r16, HOST_NV_GPR(r16)(r1)
+        stw     r17, HOST_NV_GPR(r17)(r1)
+        stw     r18, HOST_NV_GPR(r18)(r1)
+        stw     r19, HOST_NV_GPR(r19)(r1)
+        stw     r20, HOST_NV_GPR(r20)(r1)
+        stw     r21, HOST_NV_GPR(r21)(r1)
+        stw     r22, HOST_NV_GPR(r22)(r1)
+        stw     r23, HOST_NV_GPR(r23)(r1)
+        stw     r24, HOST_NV_GPR(r24)(r1)
+        stw     r25, HOST_NV_GPR(r25)(r1)
+        stw     r26, HOST_NV_GPR(r26)(r1)
+        stw     r27, HOST_NV_GPR(r27)(r1)
+        stw     r28, HOST_NV_GPR(r28)(r1)
+        stw     r29, HOST_NV_GPR(r29)(r1)
+        stw     r30, HOST_NV_GPR(r30)(r1)
+        stw     r31, HOST_NV_GPR(r31)(r1)
+        /* Load guest non-volatiles. */
+        lwz     r14, VCPU_GPR(r14)(r4)
+        lwz     r15, VCPU_GPR(r15)(r4)
+        lwz     r16, VCPU_GPR(r16)(r4)
+        lwz     r17, VCPU_GPR(r17)(r4)
+        lwz     r18, VCPU_GPR(r18)(r4)
+        lwz     r19, VCPU_GPR(r19)(r4)
+        lwz     r20, VCPU_GPR(r20)(r4)
+        lwz     r21, VCPU_GPR(r21)(r4)
+        lwz     r22, VCPU_GPR(r22)(r4)
+        lwz     r23, VCPU_GPR(r23)(r4)
+        lwz     r24, VCPU_GPR(r24)(r4)
+        lwz     r25, VCPU_GPR(r25)(r4)
+        lwz     r26, VCPU_GPR(r26)(r4)
+        lwz     r27, VCPU_GPR(r27)(r4)
+        lwz     r28, VCPU_GPR(r28)(r4)
+        lwz     r29, VCPU_GPR(r29)(r4)
+        lwz     r30, VCPU_GPR(r30)(r4)
+        lwz     r31, VCPU_GPR(r31)(r4)
+/* Guest (re-)entry point, also branched to from kvmppc_resume_host when the
+ * exit was handled without returning to the caller. Expects r4 = vcpu pointer
+ * and r1 = host stack frame. */
+lightweight_exit:
+        /* Remember host r2 and PID, then switch to the guest's PID. */
+        stw     r2, HOST_R2(r1)
+        mfspr   r3, SPRN_PID
+        stw     r3, VCPU_HOST_PID(r4)
+        lwz     r3, VCPU_PID(r4)
+        mtspr   SPRN_PID, r3
+        /* Prevent all TLB updates. */
+        /* Done by clearing EE, CE, ME and DE in the MSR. */
+        mfmsr   r5
+        lis     r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
+        ori     r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
+        andc    r6, r5, r6
+        mtmsr   r6
+        /* Save the host's non-pinned TLB mappings, and load the guest mappings
+         * over them. Leave the host's "pinned" kernel mappings in place. */
+        /* XXX optimization: use generation count to avoid swapping unmodified
+         * entries. */
+        /* Loop registers: r6 = TLB index, r8 = tlb_44x_hwater (loop bound),
+         * r3 = cursor into VCPU_HOST_TLB, r9 = cursor into VCPU_SHADOW_TLB.
+         * Both arrays use four words per entry: MMUCR, PAGEID, XLAT, ATTRIB.
+         * NOTE(review): the loop covers indices 0..tlb_44x_hwater-1; confirm
+         * whether entry tlb_44x_hwater itself should also be swapped. */
+        mfspr   r10, SPRN_MMUCR                 /* Save host MMUCR. */
+        lis     r8, tlb_44x_hwater@ha
+        lwz     r8, tlb_44x_hwater@l(r8)
+        addi    r3, r4, VCPU_HOST_TLB - 4
+        addi    r9, r4, VCPU_SHADOW_TLB - 4
+        li      r6, 0
+1:
+        /* Save host entry. */
+        tlbre   r7, r6, PPC44x_TLB_PAGEID
+        mfspr   r5, SPRN_MMUCR
+        stwu    r5, 4(r3)
+        stwu    r7, 4(r3)
+        tlbre   r7, r6, PPC44x_TLB_XLAT
+        stwu    r7, 4(r3)
+        tlbre   r7, r6, PPC44x_TLB_ATTRIB
+        stwu    r7, 4(r3)
+        /* Load guest entry. */
+        lwzu    r7, 4(r9)
+        mtspr   SPRN_MMUCR, r7
+        lwzu    r7, 4(r9)
+        tlbwe   r7, r6, PPC44x_TLB_PAGEID
+        lwzu    r7, 4(r9)
+        tlbwe   r7, r6, PPC44x_TLB_XLAT
+        lwzu    r7, 4(r9)
+        tlbwe   r7, r6, PPC44x_TLB_ATTRIB
+        /* Increment index. */
+        addi    r6, r6, 1
+        cmpw    r6, r8
+        blt     1b
+        mtspr   SPRN_MMUCR, r10                 /* Restore host MMUCR. */
+        iccci   0, 0 /* XXX hack */
+        /* Load some guest volatiles. */
+        lwz     r0, VCPU_GPR(r0)(r4)
+        lwz     r2, VCPU_GPR(r2)(r4)
+        lwz     r9, VCPU_GPR(r9)(r4)
+        lwz     r10, VCPU_GPR(r10)(r4)
+        lwz     r11, VCPU_GPR(r11)(r4)
+        lwz     r12, VCPU_GPR(r12)(r4)
+        lwz     r13, VCPU_GPR(r13)(r4)
+        lwz     r3, VCPU_LR(r4)
+        mtlr    r3
+        lwz     r3, VCPU_XER(r4)
+        mtxer   r3
+        /* Switch the IVPR. XXX If we take a TLB miss after this we're screwed,
+         * so how do we make sure vcpu won't fault? */
+        /* The new IVPR is the value of kvmppc_booke_handlers, i.e. the pages
+         * that hold the copied KVM_HANDLER stubs. */
+        lis     r8, kvmppc_booke_handlers@ha
+        lwz     r8, kvmppc_booke_handlers@l(r8)
+        mtspr   SPRN_IVPR, r8
+        /* Save vcpu pointer for the exception handlers. */
+        mtspr   SPRN_SPRG1, r4
+        /* Can't switch the stack pointer until after IVPR is switched,
+         * because host interrupt handlers would get confused. */
+        lwz     r1, VCPU_GPR(r1)(r4)
+        /* XXX handle USPRG0 */
+        /* Host interrupt handlers may have clobbered these guest-readable
+         * SPRGs, so we need to reload them here with the guest's values. */
+        lwz     r3, VCPU_SPRG4(r4)
+        mtspr   SPRN_SPRG4, r3
+        lwz     r3, VCPU_SPRG5(r4)
+        mtspr   SPRN_SPRG5, r3
+        lwz     r3, VCPU_SPRG6(r4)
+        mtspr   SPRN_SPRG6, r3
+        lwz     r3, VCPU_SPRG7(r4)
+        mtspr   SPRN_SPRG7, r3
+        /* Finish loading guest volatiles and jump to guest. */
+        lwz     r3, VCPU_CTR(r4)
+        mtctr   r3
+        lwz     r3, VCPU_CR(r4)
+        mtcr    r3
+        lwz     r5, VCPU_GPR(r5)(r4)
+        lwz     r6, VCPU_GPR(r6)(r4)
+        lwz     r7, VCPU_GPR(r7)(r4)
+        lwz     r8, VCPU_GPR(r8)(r4)
+        /* SRR0/SRR1 receive the guest PC and the guest MSR with the
+         * KVMPPC_MSR_MASK bits forced on; rfi then loads both. */
+        lwz     r3, VCPU_PC(r4)
+        mtsrr0  r3
+        lwz     r3, VCPU_MSR(r4)
+        oris    r3, r3, KVMPPC_MSR_MASK@h
+        ori     r3, r3, KVMPPC_MSR_MASK@l
+        mtsrr1  r3
+        /* r3 and r4 go last: r4 is the base register for every load above. */
+        lwz     r3, VCPU_GPR(r3)(r4)
+        lwz     r4, VCPU_GPR(r4)(r4)
+        rfi
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
new file mode 100644
index 00000000000..a03fe0c8069
--- /dev/null
+++ b/arch/powerpc/kvm/emulate.c
@@ -0,0 +1,760 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <asm/dcr.h>
+#include <asm/dcr-regs.h>
+#include <asm/time.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_ppc.h>
+#include "44x_tlb.h"
+/* Instruction decoding */
+/* Primary opcode: the six most significant bits of the instruction word. */
+static inline unsigned int get_op(u32 inst)
+{
+        return (inst >> 26) & 0x3f;
+}
+/* Extended opcode: the ten bits directly above the lowest bit. */
+static inline unsigned int get_xop(u32 inst)
+{
+        return (inst & 0x7fe) >> 1;
+}
+/* SPR number: two 5-bit halves of the instruction field, swapped so that
+ * instruction bits 16-20 become the low half and bits 11-15 the high half. */
+static inline unsigned int get_sprn(u32 inst)
+{
+        unsigned int low_half = (inst >> 16) & 0x1f;
+        unsigned int high_half = (inst >> 6) & 0x3e0;
+        return low_half | high_half;
+}
+/* DCR number: same split-and-swapped 10-bit encoding as get_sprn(). */
+static inline unsigned int get_dcrn(u32 inst)
+{
+        unsigned int dcrn = (inst >> 6) & 0x3e0;
+        dcrn |= (inst >> 16) & 0x1f;
+        return dcrn;
+}
+/* RT (target register) field: instruction bits 21-25, counted from bit 0
+ * as the least significant bit. */
+static inline unsigned int get_rt(u32 inst)
+{
+        return (inst & 0x03e00000) >> 21;
+}
+/* RS (source register) field: occupies the same bits as RT. */
+static inline unsigned int get_rs(u32 inst)
+{
+        return (inst & 0x03e00000) >> 21;
+}
+/* RA field: instruction bits 16-20, counted from bit 0 as the least
+ * significant bit. */
+static inline unsigned int get_ra(u32 inst)
+{
+        return (inst & 0x001f0000) >> 16;
+}
+/* RB field: instruction bits 11-15, counted from bit 0 as the least
+ * significant bit. */
+static inline unsigned int get_rb(u32 inst)
+{
+        return (inst & 0x0000f800) >> 11;
+}
+/* Rc flag: the least significant bit of the instruction word. */
+static inline unsigned int get_rc(u32 inst)
+{
+        const u32 rc_mask = 0x1;
+        return inst & rc_mask;
+}
+/* WS (TLB word select) field: occupies the same bits as RB. */
+static inline unsigned int get_ws(u32 inst)
+{
+        return (inst & 0x0000f800) >> 11;
+}
+/* D field: the low 16 bits of the instruction word, returned as an unsigned
+ * value (no sign extension is applied here). */
+static inline unsigned int get_d(u32 inst)
+{
+        const u32 d_mask = 0xffff;
+        return inst & d_mask;
+}
+/*
+ * Decide whether a guest TLB entry may be given a shadow mapping right now.
+ *
+ * Returns 1 only if the entry is valid, its TS bit matches the guest's
+ * current MSR[IS], and its real address falls inside a memory slot;
+ * returns 0 otherwise.
+ */
+static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+                             const struct tlbe *tlbe)
+{
+        gpa_t gpa;
+        /* Reject invalid entries and entries for the other address space.
+         * XXX what about IS != DS? */
+        if (!get_tlb_v(tlbe) ||
+            get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+                return 0;
+        /* A guest-physical address without a memslot is not RAM. */
+        gpa = get_tlb_raddr(tlbe);
+        return gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT) ? 1 : 0;
+}
+/*
+ * Emulate a guest tlbwe: update one word of the guest TLB entry selected by
+ * GPR[ra] with GPR[rs], drop any shadow mapping derived from the old entry,
+ * and install a new shadow mapping if the updated entry is host-safe.
+ *
+ * Returns EMULATE_DONE on success, or EMULATE_FAIL if the index is out of
+ * range or the WS field does not select a known TLB word.
+ */
+static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
+{
+        u64 eaddr;
+        u64 raddr;
+        u64 asid;
+        u32 flags;
+        struct tlbe *tlbe;
+        unsigned int ra;
+        unsigned int rs;
+        unsigned int ws;
+        unsigned int index;
+        ra = get_ra(inst);
+        rs = get_rs(inst);
+        ws = get_ws(inst);
+        index = vcpu->arch.gpr[ra];
+        /* Valid indices are 0 .. PPC44x_TLB_SIZE - 1. The guest controls this
+         * value, so reject everything else before indexing guest_tlb[]. */
+        if (index >= PPC44x_TLB_SIZE) {
+                printk("%s: index %u\n", __func__, index);
+                kvmppc_dump_vcpu(vcpu);
+                return EMULATE_FAIL;
+        }
+        tlbe = &vcpu->arch.guest_tlb[index];
+        /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+        if (tlbe->word0 & PPC44x_TLB_VALID) {
+                eaddr = get_tlb_eaddr(tlbe);
+                asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
+                kvmppc_mmu_invalidate(vcpu, eaddr, asid);
+        }
+        /* WS selects which of the three entry words is written. Writing the
+         * PAGEID word also latches the TID from the guest's MMUCR. */
+        switch (ws) {
+        case PPC44x_TLB_PAGEID:
+                tlbe->tid = vcpu->arch.mmucr & 0xff;
+                tlbe->word0 = vcpu->arch.gpr[rs];
+                break;
+        case PPC44x_TLB_XLAT:
+                tlbe->word1 = vcpu->arch.gpr[rs];
+                break;
+        case PPC44x_TLB_ATTRIB:
+                tlbe->word2 = vcpu->arch.gpr[rs];
+                break;
+        default:
+                return EMULATE_FAIL;
+        }
+        if (tlbe_is_host_safe(vcpu, tlbe)) {
+                eaddr = get_tlb_eaddr(tlbe);
+                raddr = get_tlb_raddr(tlbe);
+                asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
+                flags = tlbe->word2 & 0xffff;
+                /* Create a 4KB mapping on the host. If the guest wanted a
+                 * large page, only the first 4KB is mapped here and the rest
+                 * are mapped on the fly. */
+                kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
+        }
+        return EMULATE_DONE;
+}
+/*
+ * Re-arm or cancel the host timer that stands in for the guest decrementer,
+ * depending on whether the guest has TCR[DIE] set.
+ */
+static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
+{
+        unsigned long delay_jiffies;
+        /* Decrementer interrupts disabled: no timer is needed. */
+        if (!(vcpu->arch.tcr & TCR_DIE)) {
+                del_timer(&vcpu->arch.dec_timer);
+                return;
+        }
+        /* DEC counts at the timebase rate, so dividing by the number of
+         * timebase ticks per jiffy converts the guest value to host jiffies. */
+        delay_jiffies = vcpu->arch.dec / tb_ticks_per_jiffy;
+        mod_timer(&vcpu->arch.dec_timer, get_jiffies_64() + delay_jiffies);
+}
+/* Emulate rfi: the guest PC is taken from the guest's SRR0, and the guest MSR
+ * is set from the guest's SRR1 through kvmppc_set_msr(). */
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+        vcpu->arch.pc = vcpu->arch.srr0;
+        kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+}
+/* XXX to do:
+ * lhax
+ * lhaux
+ * lswx
+ * lswi
+ * stswx
+ * stswi
+ * lha
+ * lhau
+ * lmw
+ * stmw
+ *
+ * XXX is_bigendian should depend on MMU mapping or MSR[LE]
+ */
+int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+        u32 inst = vcpu->arch.last_inst;
+        u32 ea;
+        int ra;
+        int rb;
+        int rc;
+        int rs;
+        int rt;
+        int sprn;
+        int dcrn;
+        enum emulation_result emulated = EMULATE_DONE;
+        int advance = 1;
+        switch (get_op(inst)) {
+        case 3:                                                 /* trap */
+                printk("trap!\n");
+                kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+                advance = 0;
+                break;
+        case 19:
+                switch (get_xop(inst)) {
+                case 50:                                        /* rfi */
+                        kvmppc_emul_rfi(vcpu);
+                        advance = 0;
+                        break;
+                default:
+                        emulated = EMULATE_FAIL;
+                        break;
+                }
+                break;
+        case 31:
+                switch (get_xop(inst)) {
+                case 83:                                        /* mfmsr */
+                        rt = get_rt(inst);
+                        vcpu->arch.gpr[rt] = vcpu->arch.msr;
+                        break;
+                case 87:                                        /* lbzx */
+                        rt = get_rt(inst);
+                        emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+                        break;
+                case 131:                                       /* wrtee */
+                        rs = get_rs(inst);
+                        vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+                                         | (vcpu->arch.gpr[rs] & MSR_EE);
+                        break;
+                case 146:                                       /* mtmsr */
+                        rs = get_rs(inst);
+                        kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+                        break;
+                case 163:                                       /* wrteei */
+                        vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+                                         | (inst & MSR_EE);
+                        break;
+                case 215:                                       /* stbx */
+                        rs = get_rs(inst);
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       1, 1);
+                        break;
+                case 247:                                       /* stbux */
+                        rs = get_rs(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        ea = vcpu->arch.gpr[rb];
+                        if (ra)
+                                ea += vcpu->arch.gpr[ra];
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       1, 1);
+                        vcpu->arch.gpr[rs] = ea;
+                        break;
+                case 279:                                       /* lhzx */
+                        rt = get_rt(inst);
+                        emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+                        break;
+                case 311:                                       /* lhzux */
+                        rt = get_rt(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        ea = vcpu->arch.gpr[rb];
+                        if (ra)
+                                ea += vcpu->arch.gpr[ra];
+                        emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+                        vcpu->arch.gpr[ra] = ea;
+                        break;
+                case 323:                                       /* mfdcr */
+                        dcrn = get_dcrn(inst);
+                        rt = get_rt(inst);
+                        /* The guest may access CPR0 registers to determine the timebase
+                         * frequency, and it must know the real host frequency because it
+                         * can directly access the timebase registers.
+                         *
+                         * It would be possible to emulate those accesses in userspace,
+                         * but userspace can really only figure out the end frequency.
+                         * We could decompose that into the factors that compute it, but
+                         * that's tricky math, and it's easier to just report the real
+                         * CPR0 values.
+                         */
+                        switch (dcrn) {
+                        case DCRN_CPR0_CONFIG_ADDR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
+                                break;
+                        case DCRN_CPR0_CONFIG_DATA:
+                                local_irq_disable();
+                                mtdcr(DCRN_CPR0_CONFIG_ADDR,
+                                      vcpu->arch.cpr0_cfgaddr);
+                                vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
+                                local_irq_enable();
+                                break;
+                        default:
+                                run->dcr.dcrn = dcrn;
+                                run->dcr.data =  0;
+                                run->dcr.is_write = 0;
+                                vcpu->arch.io_gpr = rt;
+                                vcpu->arch.dcr_needed = 1;
+                                emulated = EMULATE_DO_DCR;
+                        }
+                        break;
+                case 339:                                       /* mfspr */
+                        sprn = get_sprn(inst);
+                        rt = get_rt(inst);
+                        switch (sprn) {
+                        case SPRN_SRR0:
+                                vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
+                        case SPRN_SRR1:
+                                vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
+                        case SPRN_MMUCR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
+                        case SPRN_PID:
+                                vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
+                        case SPRN_IVPR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+                        case SPRN_CCR0:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
+                        case SPRN_CCR1:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
+                        case SPRN_PVR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
+                        case SPRN_DEAR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+                        case SPRN_ESR:
+                                vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+                        case SPRN_DBCR0:
+                                vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+                        case SPRN_DBCR1:
+                                vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+                        /* Note: mftb and TBRL/TBWL are user-accessible, so
+                         * the guest can always access the real TB anyways.
+                         * In fact, we probably will never see these traps. */
+                        case SPRN_TBWL:
+                                vcpu->arch.gpr[rt] = mftbl(); break;
+                        case SPRN_TBWU:
+                                vcpu->arch.gpr[rt] = mftbu(); break;
+                        case SPRN_SPRG0:
+                                vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break;
+                        case SPRN_SPRG1:
+                                vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break;
+                        case SPRN_SPRG2:
+                                vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break;
+                        case SPRN_SPRG3:
+                                vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break;
+                        /* Note: SPRG4-7 are user-readable, so we don't get
+                         * a trap. */
+                        case SPRN_IVOR0:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
+                        case SPRN_IVOR1:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
+                        case SPRN_IVOR2:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
+                        case SPRN_IVOR3:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
+                        case SPRN_IVOR4:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
+                        case SPRN_IVOR5:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
+                        case SPRN_IVOR6:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
+                        case SPRN_IVOR7:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
+                        case SPRN_IVOR8:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
+                        case SPRN_IVOR9:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
+                        case SPRN_IVOR10:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
+                        case SPRN_IVOR11:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
+                        case SPRN_IVOR12:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
+                        case SPRN_IVOR13:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
+                        case SPRN_IVOR14:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
+                        case SPRN_IVOR15:
+                                vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
+                        default:
+                                printk("mfspr: unknown spr %x\n", sprn);
+                                vcpu->arch.gpr[rt] = 0;
+                                break;
+                        }
+                        break;
+                case 407:                                       /* sthx */
+                        rs = get_rs(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       2, 1);
+                        break;
+                case 439:                                       /* sthux */
+                        rs = get_rs(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        ea = vcpu->arch.gpr[rb];
+                        if (ra)
+                                ea += vcpu->arch.gpr[ra];
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       2, 1);
+                        vcpu->arch.gpr[ra] = ea;
+                        break;
+                case 451:                                       /* mtdcr */
+                        dcrn = get_dcrn(inst);
+                        rs = get_rs(inst);
+                        /* emulate some access in kernel */
+                        switch (dcrn) {
+                        case DCRN_CPR0_CONFIG_ADDR:
+                                vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
+                                break;
+                        default:
+                                run->dcr.dcrn = dcrn;
+                                run->dcr.data = vcpu->arch.gpr[rs];
+                                run->dcr.is_write = 1;
+                                vcpu->arch.dcr_needed = 1;
+                                emulated = EMULATE_DO_DCR;
+                        }
+                        break;
+                case 467:                                       /* mtspr */
+                        sprn = get_sprn(inst);
+                        rs = get_rs(inst);
+                        switch (sprn) {
+                        case SPRN_SRR0:
+                                vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SRR1:
+                                vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_MMUCR:
+                                vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
+                        case SPRN_PID:
+                                vcpu->arch.pid = vcpu->arch.gpr[rs]; break;
+                        case SPRN_CCR0:
+                                vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_CCR1:
+                                vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_DEAR:
+                                vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+                        case SPRN_ESR:
+                                vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+                        case SPRN_DBCR0:
+                                vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_DBCR1:
+                                vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+                        /* XXX We need to context-switch the timebase for
+                         * watchdog and FIT. */
+                        case SPRN_TBWL: break;
+                        case SPRN_TBWU: break;
+                        case SPRN_DEC:
+                                vcpu->arch.dec = vcpu->arch.gpr[rs];
+                                kvmppc_emulate_dec(vcpu);
+                                break;
+                        case SPRN_TSR:
+                                vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+                        case SPRN_TCR:
+                                vcpu->arch.tcr = vcpu->arch.gpr[rs];
+                                kvmppc_emulate_dec(vcpu);
+                                break;
+                        case SPRN_SPRG0:
+                                vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG1:
+                                vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG2:
+                                vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG3:
+                                vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
+                        /* Note: SPRG4-7 are user-readable. These values are
+                         * loaded into the real SPRGs when resuming the
+                         * guest. */
+                        case SPRN_SPRG4:
+                                vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG5:
+                                vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG6:
+                                vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_SPRG7:
+                                vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVPR:
+                                vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR0:
+                                vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR1:
+                                vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR2:
+                                vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR3:
+                                vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR4:
+                                vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR5:
+                                vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR6:
+                                vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR7:
+                                vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR8:
+                                vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR9:
+                                vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR10:
+                                vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR11:
+                                vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR12:
+                                vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR13:
+                                vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR14:
+                                vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
+                        case SPRN_IVOR15:
+                                vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
+                        default:
+                                printk("mtspr: unknown spr %x\n", sprn);
+                                emulated = EMULATE_FAIL;
+                                break;
+                        }
+                        break;
+                case 470:                                       /* dcbi */
+                        /* Do nothing. The guest is performing dcbi because
+                         * hardware DMA is not snooped by the dcache, but
+                         * emulated DMA either goes through the dcache as
+                         * normal writes, or the host kernel has handled dcache
+                         * coherence. */
+                        break;
+                case 534:                                       /* lwbrx */
+                        rt = get_rt(inst);
+                        emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0);
+                        break;
+                case 566:                                       /* tlbsync */
+                        break;
+                case 662:                                       /* stwbrx */
+                        rs = get_rs(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       4, 0);
+                        break;
+                case 978:                                       /* tlbwe */
+                        emulated = kvmppc_emul_tlbwe(vcpu, inst);
+                        break;
+                case 914:       {                               /* tlbsx */
+                        int index;
+                        unsigned int as = get_mmucr_sts(vcpu);
+                        unsigned int pid = get_mmucr_stid(vcpu);
+                        rt = get_rt(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        rc = get_rc(inst);
+                        ea = vcpu->arch.gpr[rb];
+                        if (ra)
+                                ea += vcpu->arch.gpr[ra];
+                        index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
+                        if (rc) {
+                                if (index < 0)
+                                        vcpu->arch.cr &= ~0x20000000;
+                                else
+                                        vcpu->arch.cr |= 0x20000000;
+                        }
+                        vcpu->arch.gpr[rt] = index;
+                        }
+                        break;
+                case 790:                                       /* lhbrx */
+                        rt = get_rt(inst);
+                        emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
+                        break;
+                case 918:                                       /* sthbrx */
+                        rs = get_rs(inst);
+                        ra = get_ra(inst);
+                        rb = get_rb(inst);
+                        emulated = kvmppc_handle_store(run, vcpu,
+                                                       vcpu->arch.gpr[rs],
+                                                       2, 0);
+                        break;
+                case 966:                                       /* iccci */
+                        break;
+                default:
+                        printk("unknown: op %d xop %d\n", get_op(inst),
+                                get_xop(inst));
+                        emulated = EMULATE_FAIL;
+                        break;
+                }
+                break;
+        case 32:                                                /* lwz */
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+                break;
+        case 33:                                                /* lwzu */
+                ra = get_ra(inst);
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        case 34:                                                /* lbz */
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+                break;
+        case 35:                                                /* lbzu */
+                ra = get_ra(inst);
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        case 36:                                                /* stw */
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               4, 1);
+                break;
+        case 37:                                                /* stwu */
+                ra = get_ra(inst);
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               4, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        case 38:                                                /* stb */
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               1, 1);
+                break;
+        case 39:                                                /* stbu */
+                ra = get_ra(inst);
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               1, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        case 40:                                                /* lhz */
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+                break;
+        case 41:                                                /* lhzu */
+                ra = get_ra(inst);
+                rt = get_rt(inst);
+                emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        case 44:                                                /* sth */
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               2, 1);
+                break;
+        case 45:                                                /* sthu */
+                ra = get_ra(inst);
+                rs = get_rs(inst);
+                emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+                                               2, 1);
+                vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+                break;
+        default:
+                printk("unknown op %d\n", get_op(inst));
+                emulated = EMULATE_FAIL;
+                break;
+        }
+        if (advance)
+                vcpu->arch.pc += 4; /* Advance past emulated instruction. */
+        return emulated;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
new file mode 100644
index 00000000000..bad40bd2d3a
--- /dev/null
+++ b/arch/powerpc/kvm/powerpc.c
@@ -0,0 +1,436 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <asm/cputable.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+/* Identity translation: the guest frame number is returned unchanged and
+ * 'kvm' is not consulted. */
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+        return gfn;
+}
+/* Placeholder: always returns 0 until a real implementation exists. */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+        /* XXX implement me */
+        return 0;
+}
+/* Unconditionally reports the vcpu as runnable (returns 1). */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+        return 1;
+}
+/*
+ * Emulate the current guest instruction and translate the emulator's verdict
+ * into a RESUME_* code:
+ *   EMULATE_DONE    -> RESUME_GUEST_NV
+ *   EMULATE_DO_MMIO -> RESUME_HOST_NV, with run->exit_reason = KVM_EXIT_MMIO
+ *   EMULATE_FAIL    -> RESUME_HOST, after logging the failing instruction
+ * Any other emulation result hits BUG().
+ */
+int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+        enum emulation_result outcome;
+        int resume;
+        outcome = kvmppc_emulate_instruction(run, vcpu);
+        if (outcome == EMULATE_DONE) {
+                /* Future optimization: only reload non-volatiles if they were
+                 * actually modified. */
+                resume = RESUME_GUEST_NV;
+        } else if (outcome == EMULATE_DO_MMIO) {
+                run->exit_reason = KVM_EXIT_MMIO;
+                /* Non-volatiles must be reloaded because "update" load/store
+                 * instructions modify register state. */
+                /* Future optimization: only reload non-volatiles if they were
+                 * actually modified. */
+                resume = RESUME_HOST_NV;
+        } else if (outcome == EMULATE_FAIL) {
+                /* XXX Deliver Program interrupt to guest. */
+                printk(KERN_EMERG "%s: emulation failed (%08x)\n", __func__,
+                       vcpu->arch.last_inst);
+                resume = RESUME_HOST;
+        } else {
+                BUG();
+        }
+        return resume;
+}
+/* Intentionally empty: this implementation does nothing on hardware enable. */
+void kvm_arch_hardware_enable(void *garbage)
+{
+}
+/* Intentionally empty: this implementation does nothing on hardware disable. */
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+/* No setup work is performed; always returns 0. */
+int kvm_arch_hardware_setup(void)
+{
+        return 0;
+}
+/* Intentionally empty counterpart of kvm_arch_hardware_setup(). */
+void kvm_arch_hardware_unsetup(void)
+{
+}
+/*
+ * Store a compatibility verdict through 'rtn', which is written as an int:
+ * 0 when cur_cpu_spec->platform is exactly "ppc440", -ENOTSUPP otherwise.
+ */
+void kvm_arch_check_processor_compat(void *rtn)
+{
+        int *verdict = rtn;
+        if (strcmp(cur_cpu_spec->platform, "ppc440") != 0)
+                *verdict = -ENOTSUPP;
+        else
+                *verdict = 0;
+}
+/*
+ * Allocate a zero-filled struct kvm.
+ * Returns the new object, or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+struct kvm *kvm_arch_create_vm(void)
+{
+        struct kvm *vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+        if (vm == NULL)
+                return ERR_PTR(-ENOMEM);
+        return vm;
+}
+/* Free every populated entry of kvm->vcpus[] and reset the slot to NULL. */
+static void kvmppc_free_vcpus(struct kvm *kvm)
+{
+        unsigned int slot;
+        for (slot = 0; slot < KVM_MAX_VCPUS; slot++) {
+                if (!kvm->vcpus[slot])
+                        continue;
+                kvm_arch_vcpu_free(kvm->vcpus[slot]);
+                kvm->vcpus[slot] = NULL;
+        }
+}
+/*
+ * Tear down a VM: release its vcpus first, then its guest memory via
+ * kvm_free_physmem(), and finally the struct kvm itself.
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+        kvmppc_free_vcpus(kvm);
+        kvm_free_physmem(kvm);
+        kfree(kvm);
+}
+/*
+ * Capability query: returns 1 for KVM_CAP_USER_MEMORY and 0 for every other
+ * extension number.
+ */
+int kvm_dev_ioctl_check_extension(long ext)
+{
+        if (ext == KVM_CAP_USER_MEMORY)
+                return 1;
+        return 0;
+}
+/* No arch-specific device ioctls are handled; every request gets -EINVAL. */
+long kvm_arch_dev_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+        return -EINVAL;
+}
+/* Performs no arch-specific work for a memory region change; returns 0. */
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc)
+{
+        return 0;
+}
+/*
+ * Allocate a zeroed vcpu from kvm_vcpu_cache and initialise it with
+ * kvm_vcpu_init().
+ * Returns the vcpu on success. On failure returns ERR_PTR(-ENOMEM) if the
+ * allocation failed, or ERR_PTR(err) with kvm_vcpu_init()'s error after
+ * giving the object back to the cache.
+ */
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+        struct kvm_vcpu *vcpu;
+        int rc;
+        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+        if (!vcpu)
+                return ERR_PTR(-ENOMEM);
+        rc = kvm_vcpu_init(vcpu, kvm, id);
+        if (rc) {
+                kmem_cache_free(kvm_vcpu_cache, vcpu);
+                return ERR_PTR(rc);
+        }
+        return vcpu;
+}
+/*
+ * Release a vcpu obtained from kvm_arch_vcpu_create().
+ *
+ * The decrementer timer is registered in kvm_arch_vcpu_init() with this vcpu
+ * as its callback argument, so it must be stopped before the memory goes
+ * back to kvm_vcpu_cache; otherwise a still-pending timer would run
+ * kvmppc_decrementer_func() on freed memory.
+ * NOTE(review): assumes kvm_arch_vcpu_init() has run for every vcpu that
+ * reaches this function (presumably via kvm_vcpu_init()) -- confirm.
+ */
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+        /* Also waits for a handler that is currently executing. */
+        del_timer_sync(&vcpu->arch.dec_timer);
+        kvm_vcpu_uninit(vcpu);
+        kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+/* Thin wrapper: destroying a vcpu is the same as freeing it. */
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+        kvm_arch_vcpu_free(vcpu);
+}
+/*
+ * Returns non-zero when the bit for the decrementer interrupt's priority is
+ * set in vcpu->arch.pending_exceptions.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        unsigned int dec_prio;
+        dec_prio = exception_priority[BOOKE_INTERRUPT_DECREMENTER];
+        return test_bit(dec_prio, &vcpu->arch.pending_exceptions);
+}
+/*
+ * Timer callback for the decrementer: 'data' carries the vcpu pointer and a
+ * decrementer exception is queued on that vcpu.
+ */
+static void kvmppc_decrementer_func(unsigned long data)
+{
+        kvmppc_queue_exception((struct kvm_vcpu *)data,
+                               BOOKE_INTERRUPT_DECREMENTER);
+}
+/*
+ * Register kvmppc_decrementer_func() as the handler of the vcpu's
+ * decrementer timer, passing the vcpu itself as callback data.
+ * Always returns 0.
+ */
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+        unsigned long cookie = (unsigned long)vcpu;
+        setup_timer(&vcpu->arch.dec_timer, kvmppc_decrementer_func, cookie);
+        return 0;
+}
+/* Intentionally empty: no arch-specific vcpu teardown is done here. */
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+/* Intentionally empty: nothing is done when a vcpu is loaded onto 'cpu'. */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+/* Intentionally empty counterpart of kvm_arch_vcpu_load(). */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+/* Intentionally empty: this implementation takes no action for 'cpu'. */
+void decache_vcpus_on_cpu(int cpu)
+{
+}
+/* Guest debugging is not implemented; always fails with -ENOTSUPP. */
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                    struct kvm_debug_guest *dbg)
+{
+        return -ENOTSUPP;
+}
+/*
+ * Finish a DCR read: copy run->dcr.data into the guest GPR selected by
+ * vcpu->arch.io_gpr.
+ */
+static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
+                                     struct kvm_run *run)
+{
+        vcpu->arch.gpr[vcpu->arch.io_gpr] = run->dcr.data;
+}
+/*
+ * Finish an MMIO load: move the bytes found in run->mmio.data into the guest
+ * GPR selected by vcpu->arch.io_gpr.
+ *
+ * Lengths larger than a GPR are logged and ignored. For 1, 2 and 4 byte
+ * accesses the data is taken as-is when vcpu->arch.mmio_is_bigendian is set,
+ * and read with the little-endian accessors otherwise (a single byte needs
+ * no conversion). Other lengths leave the register untouched.
+ */
+static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *run)
+{
+        u32 *dest = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+        if (run->mmio.len > sizeof(*dest)) {
+                printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
+                return;
+        }
+        switch (run->mmio.len) {
+        case 4:
+                if (vcpu->arch.mmio_is_bigendian)
+                        *dest = *(u32 *)run->mmio.data;
+                else
+                        *dest = ld_le32((u32 *)run->mmio.data);
+                break;
+        case 2:
+                if (vcpu->arch.mmio_is_bigendian)
+                        *dest = *(u16 *)run->mmio.data;
+                else
+                        *dest = ld_le16((u16 *)run->mmio.data);
+                break;
+        case 1:
+                /* Byte order is irrelevant for a single byte. */
+                *dest = *(u8 *)run->mmio.data;
+                break;
+        }
+}
+/*
+ * Describe an MMIO load in 'run' so it can be completed later.
+ *
+ * @rt:           guest GPR index that will receive the loaded value
+ * @bytes:        access width in bytes
+ * @is_bigendian: non-zero if the data must not be byte-swapped on completion
+ *
+ * Records the address (vcpu->arch.paddr_accessed), width and direction in
+ * run->mmio, remembers the destination register and endianness in the vcpu,
+ * and flags the vcpu as having an MMIO read outstanding.
+ * Always returns EMULATE_DO_MMIO.
+ */
+int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int rt, unsigned int bytes, int is_bigendian)
+{
+        if (bytes > sizeof(run->mmio.data)) {
+                /* Log the width requested now: run->mmio.len has not been
+                 * updated yet and still holds an earlier access's length. */
+                printk(KERN_ERR "%s: bad MMIO length: %u\n", __func__,
+                       bytes);
+        }
+        run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+        run->mmio.len = bytes;
+        run->mmio.is_write = 0;
+        vcpu->arch.io_gpr = rt;
+        vcpu->arch.mmio_is_bigendian = is_bigendian;
+        vcpu->mmio_needed = 1;
+        vcpu->mmio_is_write = 0;
+        return EMULATE_DO_MMIO;
+}
+/*
+ * Describe an MMIO store of 'val' in 'run'.
+ *
+ * @val:          value to be stored
+ * @bytes:        access width in bytes; only 1, 2 and 4 place data in the
+ *                buffer
+ * @is_bigendian: non-zero to store 'val' as-is, zero to store it with the
+ *                little-endian accessors
+ *
+ * Records the address (vcpu->arch.paddr_accessed), width and direction in
+ * run->mmio, flags the vcpu as having an MMIO write outstanding, and writes
+ * the value at the start of run->mmio.data.
+ * Always returns EMULATE_DO_MMIO.
+ */
+int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                        u32 val, unsigned int bytes, int is_bigendian)
+{
+        void *data = run->mmio.data;
+        if (bytes > sizeof(run->mmio.data)) {
+                /* Log the width requested now: run->mmio.len has not been
+                 * updated yet and still holds an earlier access's length. */
+                printk(KERN_ERR "%s: bad MMIO length: %u\n", __func__,
+                       bytes);
+        }
+        run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+        run->mmio.len = bytes;
+        run->mmio.is_write = 1;
+        vcpu->mmio_needed = 1;
+        vcpu->mmio_is_write = 1;
+        /* Store the value at the lowest bytes in 'data'. */
+        if (is_bigendian) {
+                switch (bytes) {
+                case 4: *(u32 *)data = val; break;
+                case 2: *(u16 *)data = val; break;
+                case 1: *(u8  *)data = val; break;
+                }
+        } else {
+                /* Store LE value into 'data'. */
+                switch (bytes) {
+                case 4: st_le32(data, val); break;
+                case 2: st_le16(data, val); break;
+                case 1: *(u8 *)data = val; break;
+                }
+        }
+        return EMULATE_DO_MMIO;
+}
+/*
+ * Enter the guest on behalf of the run ioctl.
+ *
+ * Steps, in order:
+ *  1. install the vcpu's signal mask if one is active;
+ *  2. complete an MMIO or DCR access left outstanding by the previous exit
+ *     (only reads copy data back into a guest register);
+ *  3. let kvmppc_check_and_deliver_interrupts() process the vcpu;
+ *  4. run the guest with local interrupts disabled;
+ *  5. restore the saved signal mask.
+ * Returns whatever __kvmppc_vcpu_run() returned.
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+        int ret;
+        sigset_t saved_mask;
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &vcpu->sigset, &saved_mask);
+        if (vcpu->mmio_needed) {
+                /* A finished store needs no write-back into registers. */
+                if (!vcpu->mmio_is_write)
+                        kvmppc_complete_mmio_load(vcpu, run);
+                vcpu->mmio_needed = 0;
+        } else if (vcpu->arch.dcr_needed) {
+                if (!vcpu->arch.dcr_is_write)
+                        kvmppc_complete_dcr_load(vcpu, run);
+                vcpu->arch.dcr_needed = 0;
+        }
+        kvmppc_check_and_deliver_interrupts(vcpu);
+        local_irq_disable();
+        kvm_guest_enter();
+        ret = __kvmppc_vcpu_run(run, vcpu);
+        kvm_guest_exit();
+        local_irq_enable();
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &saved_mask, NULL);
+        return ret;
+}
+/*
+ * Queue an external interrupt exception on the vcpu. The contents of 'irq'
+ * are not examined. Always returns 0.
+ */
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
+{
+        kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+        return 0;
+}
+/* Reading the MP state is not implemented; always fails with -EINVAL. */
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        return -EINVAL;
+}
+/* Setting the MP state is not implemented; always fails with -EINVAL. */
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        return -EINVAL;
+}
+/*
+ * Arch-specific vcpu ioctl dispatcher. The vcpu is taken from
+ * filp->private_data.
+ *
+ * Only KVM_INTERRUPT is handled: the struct kvm_interrupt is copied in from
+ * user space and passed to kvm_vcpu_ioctl_interrupt().
+ * Returns that call's result, -EFAULT if the copy fails, or -EINVAL for any
+ * other ioctl number.
+ */
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+        struct kvm_vcpu *vcpu = filp->private_data;
+        void __user *uptr = (void __user *)arg;
+        struct kvm_interrupt irq;
+        if (ioctl != KVM_INTERRUPT)
+                return -EINVAL;
+        if (copy_from_user(&irq, uptr, sizeof(irq)))
+                return -EFAULT;
+        return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+}
+/* Dirty page logging is not implemented; always fails with -ENOTSUPP. */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+        return -ENOTSUPP;
+}
+/* No arch-specific VM ioctls are handled; every request gets -EINVAL. */
+long kvm_arch_vm_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
+{
+        return -EINVAL;
+}
+/* No arch-wide initialisation is performed; 'opaque' is unused. Returns 0. */
+int kvm_arch_init(void *opaque)
+{
+        return 0;
+}
+/* Intentionally empty counterpart of kvm_arch_init(). */
+void kvm_arch_exit(void)
+{
+}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f6a68e178fc..8f5f02160ff 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -62,6 +62,10 @@ config GENERIC_LOCKBREAK
        default y
        depends on SMP && PREEMPT
+config PGSTE
+        bool
+        default y if KVM
 mainmenu "Linux Kernel Configuration"
 config S390
@@ -69,6 +73,7 @@ config S390
        select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
+        select HAVE_KVM if 64BIT
 source "init/Kconfig"
@@ -515,6 +520,13 @@ config ZFCPDUMP
          Select this option if you want to build an zfcpdump enabled kernel.
          Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
+config S390_GUEST
+bool "s390 guest support (EXPERIMENTAL)"
+        depends on 64BIT && EXPERIMENTAL
+        select VIRTIO
+        select VIRTIO_RING
+        help
+          Select this option if you want to run the kernel as a guest under s390 Linux.
 endmenu
 source "net/Kconfig"
@@ -536,3 +548,5 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 source "lib/Kconfig"
+source "arch/s390/kvm/Kconfig"
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index f708be367b0..792a4e7743c 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -87,7 +87,7 @@ LDFLAGS_vmlinux := -e start
 head-y          := arch/s390/kernel/head.o arch/s390/kernel/init_task.o
 core-y          += arch/s390/mm/ arch/s390/kernel/ arch/s390/crypto/ \
-                   arch/s390/appldata/ arch/s390/hypfs/
+                   arch/s390/appldata/ arch/s390/hypfs/ arch/s390/kvm/
 libs-y          += arch/s390/lib/
 drivers-y       += drivers/s390/
 drivers-$(CONFIG_MATHEMU) += arch/s390/math-emu/
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 540a67f979b..68ec4083bf7 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -144,6 +144,10 @@ static noinline __init void detect_machine_type(void)
        /* Running on a P/390 ? */
        if (cpuinfo->cpu_id.machine == 0x7490)
                machine_flags |= 4;
+        /* Running under KVM ? */
+        if (cpuinfo->cpu_id.version == 0xfe)
+                machine_flags |= 64;
 }
 #ifdef CONFIG_64BIT
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 7141147e6b6..a9d18aafa5f 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -316,7 +316,11 @@ static int __init early_parse_ipldelay(char *p)
 early_param("ipldelay", early_parse_ipldelay);
 #ifdef CONFIG_S390_SWITCH_AMODE
+#ifdef CONFIG_PGSTE
+unsigned int switch_amode = 1;
+#else
 unsigned int switch_amode = 0;
+#endif
 EXPORT_SYMBOL_GPL(switch_amode);
 static void set_amode_and_uaccess(unsigned long user_amode,
@@ -797,9 +801,13 @@ setup_arch(char **cmdline_p)
               "This machine has an IEEE fpu\n" :
               "This machine has no IEEE fpu\n");
 #else /* CONFIG_64BIT */
-        printk((MACHINE_IS_VM) ?
+        if (MACHINE_IS_VM)
-               "We are running under VM (64 bit mode)\n" :
+                printk("We are running under VM (64 bit mode)\n");
-               "We are running native (64 bit mode)\n");
+        else if (MACHINE_IS_KVM) {
+                printk("We are running under KVM (64 bit mode)\n");
+                add_preferred_console("ttyS", 1, NULL);
+        } else
+                printk("We are running native (64 bit mode)\n");
 #endif /* CONFIG_64BIT */
        /* Save unparsed command line copy for /proc/cmdline */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index c5f05b3fb2c..ca90ee3f930 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -110,6 +110,7 @@ void account_system_vtime(struct task_struct *tsk)
        S390_lowcore.steal_clock -= cputime << 12;
        account_system_time(tsk, 0, cputime);
 }
+EXPORT_SYMBOL_GPL(account_system_vtime);
 static inline void set_vtimer(__u64 expires)
 {
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
new file mode 100644
index 00000000000..1761b74d639
--- /dev/null
+++ b/arch/s390/kvm/Kconfig
@@ -0,0 +1,46 @@
+#
+# KVM configuration
+#
+config HAVE_KVM
+       bool
+menuconfig VIRTUALIZATION
+        bool "Virtualization"
+        default y
+        ---help---
+          Say Y here to get to see options for using your Linux host to run other
+          operating systems inside virtual machines (guests).
+          This option alone does not add any kernel code.
+          If you say N, all options in this submenu will be skipped and disabled.
+if VIRTUALIZATION
+config KVM
+        tristate "Kernel-based Virtual Machine (KVM) support"
+        depends on HAVE_KVM && EXPERIMENTAL
+        select PREEMPT_NOTIFIERS
+        select ANON_INODES
+        select S390_SWITCH_AMODE
+        select PREEMPT
+        ---help---
+          Support hosting paravirtualized guest machines using the SIE
+          virtualization capability on the mainframe. This should work
+          on any 64bit machine.
+          This module provides access to the hardware capabilities through
+          a character device node named /dev/kvm.
+          To compile this as a module, choose M here: the module
+          will be called kvm.
+          If unsure, say N.
+config KVM_TRACE
+       bool
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/virtio/Kconfig
+endif # VIRTUALIZATION
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
new file mode 100644
index 00000000000..e5221ec0b8e
--- /dev/null
+++ b/arch/s390/kvm/Makefile
@@ -0,0 +1,14 @@
+# Makefile for kernel virtual machines on s390
+#
+# Copyright IBM Corp. 2008
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (version 2 only)
+# as published by the Free Software Foundation.
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o)
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/s390/kvm
+kvm-objs := $(common-objs) kvm-s390.o sie64a.o intercept.o interrupt.o priv.o sigp.o diag.o
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
new file mode 100644
index 00000000000..f639a152869
--- /dev/null
+++ b/arch/s390/kvm/diag.c
@@ -0,0 +1,67 @@
+/*
+ * diag.c - handling diagnose instructions
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include "kvm-s390.h"
+/*
+ * Handle diagnose 0x44 (dispatched from kvm_s390_handle_diag()): log the
+ * event, bump the diagnose_44 statistics counter and call schedule() once.
+ * Always returns 0.
+ */
+static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
+{
+        VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
+        vcpu->stat.diagnose_44++;
+        /* drop the vcpu around schedule() and take it back afterwards */
+        vcpu_put(vcpu);
+        schedule();
+        vcpu_load(vcpu);
+        return 0;
+}
+/*
+ * Handle diagnose 0x308.  The subcode is the low 16 bits of the guest
+ * register selected by the low nibble of the ipa field.  Subcode 3 starts
+ * the reset request with KVM_S390_RESET_CLEAR, subcode 4 starts it empty;
+ * every other subcode is rejected with -ENOTSUPP and leaves the vcpu
+ * untouched.  For accepted subcodes the CPUSTAT_RUNNING flag is cleared,
+ * the subsystem, ipl and cpu-init reset bits are added, the exit reason is
+ * set to KVM_EXIT_S390_RESET and -EREMOTE is returned.
+ */
+static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
+{
+        unsigned int reg = vcpu->arch.sie_block->ipa & 0xf;
+        unsigned long subcode = vcpu->arch.guest_gprs[reg] & 0xffff;
+        VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode);
+        if (subcode == 3)
+                vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR;
+        else if (subcode == 4)
+                vcpu->run->s390_reset_flags = 0;
+        else
+                return -ENOTSUPP;
+        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+        /* these three resets are requested for both subcodes */
+        vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM |
+                                       KVM_S390_RESET_IPL |
+                                       KVM_S390_RESET_CPU_INIT;
+        vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
+        VCPU_EVENT(vcpu, 3, "requesting userspace resets %lx",
+          vcpu->run->s390_reset_flags);
+        return -EREMOTE;
+}
+/*
+ * Dispatch an intercepted diagnose instruction.  The function code is
+ * extracted from the ipb field (mask 0xfff0000, shifted right by 16).
+ * Codes 0x44 and 0x308 have handlers; any other code returns -ENOTSUPP.
+ */
+int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
+{
+        int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
+        if (code == 0x44)
+                return __diag_time_slice_end(vcpu);
+        if (code == 0x308)
+                return __diag_ipl_functions(vcpu);
+        return -ENOTSUPP;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
new file mode 100644
index 00000000000..4e0633c413f
--- /dev/null
+++ b/arch/s390/kvm/gaccess.h
@@ -0,0 +1,274 @@
+/*
+ * gaccess.h -  access guest memory
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ */
+#ifndef __KVM_S390_GACCESS_H
+#define __KVM_S390_GACCESS_H
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+#include <asm/uaccess.h>
+/*
+ * Translate a guest address into the user space address that backs it.
+ *
+ * Prefixing is applied first: addresses within the first two pages are
+ * redirected to the vcpu's prefix area, and addresses within the two pages
+ * of the prefix area are redirected to the first two pages.  The result is
+ * then checked against the guest memory size and offset by guest_origin.
+ *
+ * Returns the user space pointer, or ERR_PTR(-EFAULT) if the (prefixed)
+ * address lies outside guest memory.  Only the first byte is checked; the
+ * access width is not known here.
+ */
+static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu,
+                                               u64 guestaddr)
+{
+        u64 prefix  = vcpu->arch.sie_block->prefix;
+        u64 origin  = vcpu->kvm->arch.guest_origin;
+        u64 memsize = vcpu->kvm->arch.guest_memsize;
+        if (guestaddr < 2 * PAGE_SIZE)
+                guestaddr += prefix;
+        else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE))
+                guestaddr -= prefix;
+        /*
+         * memsize is a size, not a last address (the copy_* helpers accept
+         * addr + n == memsize), so the last valid byte is memsize - 1.
+         */
+        if (guestaddr >= memsize)
+                return (void __user __force *) ERR_PTR(-EFAULT);
+        guestaddr += origin;
+        return (void __user *) guestaddr;
+}
+/*
+ * Read a 64 bit value from guest address @guestaddr into *@result.
+ * A guest address that is not 8 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of get_user().
+ */
+static inline int get_guest_u64(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u64 *result)
+{
+        void __user *hostptr;
+        BUG_ON(guestaddr & 7);
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return get_user(*result, (u64 __user *) hostptr);
+}
+/*
+ * Read a 32 bit value from guest address @guestaddr into *@result.
+ * A guest address that is not 4 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of get_user().
+ */
+static inline int get_guest_u32(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u32 *result)
+{
+        void __user *hostptr;
+        BUG_ON(guestaddr & 3);
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return get_user(*result, (u32 __user *) hostptr);
+}
+/*
+ * Read a 16 bit value from guest address @guestaddr into *@result.
+ * A guest address that is not 2 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of get_user().
+ */
+static inline int get_guest_u16(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u16 *result)
+{
+        void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
+        BUG_ON(guestaddr & 1);
+        /* strip the __user address space explicitly, like the other accessors */
+        if (IS_ERR((void __force *) uptr))
+                return PTR_ERR((void __force *) uptr);
+        return get_user(*result, (u16 __user *) uptr);
+}
+/*
+ * Read a single byte from guest address @guestaddr into *@result.
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of get_user().
+ */
+static inline int get_guest_u8(struct kvm_vcpu *vcpu, u64 guestaddr,
+                               u8 *result)
+{
+        void __user *hostptr;
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return get_user(*result, (u8 __user *) hostptr);
+}
+/*
+ * Store the 64 bit @value at guest address @guestaddr.
+ * A guest address that is not 8 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of put_user().
+ */
+static inline int put_guest_u64(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u64 value)
+{
+        void __user *hostptr;
+        BUG_ON(guestaddr & 7);
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return put_user(value, (u64 __user *) hostptr);
+}
+/*
+ * Store the 32 bit @value at guest address @guestaddr.
+ * A guest address that is not 4 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of put_user().
+ */
+static inline int put_guest_u32(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u32 value)
+{
+        void __user *hostptr;
+        BUG_ON(guestaddr & 3);
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return put_user(value, (u32 __user *) hostptr);
+}
+/*
+ * Store the 16 bit @value at guest address @guestaddr.
+ * A guest address that is not 2 byte aligned triggers BUG_ON().
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of put_user().
+ */
+static inline int put_guest_u16(struct kvm_vcpu *vcpu, u64 guestaddr,
+                                u16 value)
+{
+        void __user *hostptr;
+        BUG_ON(guestaddr & 1);
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return put_user(value, (u16 __user *) hostptr);
+}
+/*
+ * Store the single byte @value at guest address @guestaddr.
+ * Returns the error encoded by __guestaddr_to_user() if the translation
+ * fails, otherwise the return value of put_user().
+ */
+static inline int put_guest_u8(struct kvm_vcpu *vcpu, u64 guestaddr,
+                               u8 value)
+{
+        void __user *hostptr;
+        hostptr = __guestaddr_to_user(vcpu, guestaddr);
+        if (IS_ERR((void __force *) hostptr))
+                return PTR_ERR((void __force *) hostptr);
+        return put_user(value, (u8 __user *) hostptr);
+}
+/*
+ * Byte-wise fallback for copy_to_guest(): stores @n bytes from @from at
+ * consecutive guest addresses starting at @guestdest, translating every
+ * byte individually through put_guest_u8().  Stops at the first negative
+ * return code and returns it; returns 0 once all bytes are stored.
+ */
+static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu, u64 guestdest,
+                                       const void *from, unsigned long n)
+{
+        const u8 *bytes = from;
+        unsigned long done;
+        int rc;
+        for (done = 0; done != n; done++) {
+                rc = put_guest_u8(vcpu, guestdest + done, bytes[done]);
+                if (rc < 0)
+                        return rc;
+        }
+        return 0;
+}
+/*
+ * Copy @n bytes from kernel buffer @from to guest address @guestdest,
+ * applying prefixing.
+ *
+ * If the range straddles one of the boundaries at which the prefix
+ * translation changes (end of the first two pages, start of the prefix
+ * area, end of the prefix area) the copy is done byte by byte through
+ * __copy_to_guest_slow().  Otherwise the whole range is translated once,
+ * checked against the guest memory size (and against wrap-around) and
+ * copied with copy_to_user().
+ *
+ * Returns -EFAULT if the range is outside guest memory, otherwise the
+ * return value of the copy routine used.
+ */
+static inline int copy_to_guest(struct kvm_vcpu *vcpu, u64 guestdest,
+                                const void *from, unsigned long n)
+{
+        u64 prefix  = vcpu->arch.sie_block->prefix;
+        u64 origin  = vcpu->kvm->arch.guest_origin;
+        u64 memsize = vcpu->kvm->arch.guest_memsize;
+        u64 end = guestdest + n;
+        if (((guestdest < 2 * PAGE_SIZE) && (end > 2 * PAGE_SIZE)) ||
+            ((guestdest < prefix) && (end > prefix)) ||
+            ((guestdest < prefix + 2 * PAGE_SIZE) &&
+             (end > prefix + 2 * PAGE_SIZE)))
+                return __copy_to_guest_slow(vcpu, guestdest, from, n);
+        if (guestdest < 2 * PAGE_SIZE)
+                guestdest += prefix;
+        else if ((guestdest >= prefix) && (guestdest < prefix + 2 * PAGE_SIZE))
+                guestdest -= prefix;
+        /* re-evaluate the end of the range after prefixing */
+        end = guestdest + n;
+        if ((end > memsize) || (end < guestdest))
+                return -EFAULT;
+        return copy_to_user((void __user *) (guestdest + origin), from, n);
+}
+/*
+ * Byte-wise fallback for copy_from_guest(): reads @n bytes from
+ * consecutive guest addresses starting at @guestsrc into @to, translating
+ * every byte individually through get_guest_u8().  Stops at the first
+ * negative return code and returns it; returns 0 once all bytes are read.
+ */
+static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to,
+                                         u64 guestsrc, unsigned long n)
+{
+        u8 *bytes = to;
+        unsigned long done;
+        int rc;
+        for (done = 0; done != n; done++) {
+                rc = get_guest_u8(vcpu, guestsrc + done, bytes + done);
+                if (rc < 0)
+                        return rc;
+        }
+        return 0;
+}
+/*
+ * Copy @n bytes from guest address @guestsrc into kernel buffer @to,
+ * applying prefixing.
+ *
+ * If the range straddles one of the boundaries at which the prefix
+ * translation changes (end of the first two pages, start of the prefix
+ * area, end of the prefix area) the copy is done byte by byte through
+ * __copy_from_guest_slow().  Otherwise the whole range is translated once,
+ * checked against the guest memory size (and against wrap-around) and
+ * copied with copy_from_user().
+ *
+ * Returns -EFAULT if the range is outside guest memory, otherwise the
+ * return value of the copy routine used.
+ */
+static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
+                                  u64 guestsrc, unsigned long n)
+{
+        u64 prefix  = vcpu->arch.sie_block->prefix;
+        u64 origin  = vcpu->kvm->arch.guest_origin;
+        u64 memsize = vcpu->kvm->arch.guest_memsize;
+        u64 end = guestsrc + n;
+        if (((guestsrc < 2 * PAGE_SIZE) && (end > 2 * PAGE_SIZE)) ||
+            ((guestsrc < prefix) && (end > prefix)) ||
+            ((guestsrc < prefix + 2 * PAGE_SIZE) &&
+             (end > prefix + 2 * PAGE_SIZE)))
+                return __copy_from_guest_slow(vcpu, to, guestsrc, n);
+        if (guestsrc < 2 * PAGE_SIZE)
+                guestsrc += prefix;
+        else if ((guestsrc >= prefix) && (guestsrc < prefix + 2 * PAGE_SIZE))
+                guestsrc -= prefix;
+        /* re-evaluate the end of the range after prefixing */
+        end = guestsrc + n;
+        if ((end > memsize) || (end < guestsrc))
+                return -EFAULT;
+        return copy_from_user(to, (void __user *) (guestsrc + origin), n);
+}
+/*
+ * Copy @n bytes from kernel buffer @from to guest address @guestdest
+ * without applying prefixing.  Returns -EFAULT if the range exceeds the
+ * guest memory size or wraps around, otherwise the return value of
+ * copy_to_user().
+ */
+static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu, u64 guestdest,
+                                         const void *from, unsigned long n)
+{
+        u64 origin  = vcpu->kvm->arch.guest_origin;
+        u64 memsize = vcpu->kvm->arch.guest_memsize;
+        u64 end = guestdest + n;
+        if ((end > memsize) || (end < guestdest))
+                return -EFAULT;
+        return copy_to_user((void __user *) (guestdest + origin), from, n);
+}
+/*
+ * Copy @n bytes from guest address @guestsrc into kernel buffer @to
+ * without applying prefixing.  Returns -EFAULT if the range exceeds the
+ * guest memory size or wraps around, otherwise the return value of
+ * copy_from_user().
+ */
+static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
+                                           u64 guestsrc, unsigned long n)
+{
+        u64 origin  = vcpu->kvm->arch.guest_origin;
+        u64 memsize = vcpu->kvm->arch.guest_memsize;
+        u64 end = guestsrc + n;
+        if ((end > memsize) || (end < guestsrc))
+                return -EFAULT;
+        return copy_from_user(to, (void __user *) (guestsrc + origin), n);
+}
+#endif
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
new file mode 100644
index 00000000000..349581a2610
--- /dev/null
+++ b/arch/s390/kvm/intercept.c
@@ -0,0 +1,216 @@
+/*
+ * intercept.c - in-kernel handling for sie intercepts
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <asm/kvm_host.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+/*
+ * Emulate an intercepted 0xeb instruction whose last ipb byte is 0x2f
+ * (traced as "lctg"); other 0xeb instructions return -ENOTSUPP.
+ *
+ * Control registers reg1 through reg3 (wrapping from 15 to 0) are loaded
+ * with consecutive 64 bit values read from the guest at base2 + disp2,
+ * where disp2 is the 20 bit displacement assembled from the ipb field.
+ *
+ * A misaligned operand address gets a specification exception, and a
+ * faulting guest access gets an addressing exception injected (the load
+ * stops there).  Returns 0, or the result of the program interrupt
+ * injection for a misaligned operand.
+ */
+static int handle_lctg(struct kvm_vcpu *vcpu)
+{
+        int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+        int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
+                        ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
+        u64 useraddr;
+        int reg, rc;
+        vcpu->stat.instruction_lctg++;
+        if ((vcpu->arch.sie_block->ipb & 0xff) != 0x2f)
+                return -ENOTSUPP;
+        useraddr = disp2;
+        if (base2)
+                useraddr += vcpu->arch.guest_gprs[base2];
+        /*
+         * get_guest_u64() BUG()s on a misaligned address, and useraddr is
+         * fully guest controlled: reject it here instead of crashing the
+         * host.
+         */
+        if (useraddr & 7)
+                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+        reg = reg1;
+        VCPU_EVENT(vcpu, 5, "lctg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
+                   disp2);
+        do {
+                rc = get_guest_u64(vcpu, useraddr,
+                                   &vcpu->arch.sie_block->gcr[reg]);
+                if (rc == -EFAULT) {
+                        kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                        break;
+                }
+                useraddr += 8;
+                if (reg == reg3)
+                        break;
+                reg = (reg + 1) % 16;
+        } while (1);
+        return 0;
+}
+/*
+ * Emulate an intercepted 0xb7 instruction (traced as "lctl").
+ *
+ * The low 32 bits of control registers reg1 through reg3 (wrapping from 15
+ * to 0) are replaced with consecutive 32 bit values read from the guest at
+ * base2 + disp2; the upper 32 bits of each register are preserved.
+ *
+ * A misaligned operand address gets a specification exception, and a
+ * faulting guest access gets an addressing exception injected (the load
+ * stops there).  Returns 0, or the result of the program interrupt
+ * injection for a misaligned operand.
+ */
+static int handle_lctl(struct kvm_vcpu *vcpu)
+{
+        int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+        int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 useraddr;
+        u32 val = 0;
+        int reg, rc;
+        vcpu->stat.instruction_lctl++;
+        useraddr = disp2;
+        if (base2)
+                useraddr += vcpu->arch.guest_gprs[base2];
+        /*
+         * get_guest_u32() BUG()s on a misaligned address, and useraddr is
+         * fully guest controlled: reject it here instead of crashing the
+         * host.
+         */
+        if (useraddr & 3)
+                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+        VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
+                   disp2);
+        reg = reg1;
+        do {
+                rc = get_guest_u32(vcpu, useraddr, &val);
+                if (rc == -EFAULT) {
+                        kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                        break;
+                }
+                /* only the low word of the control register is replaced */
+                vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
+                vcpu->arch.sie_block->gcr[reg] |= val;
+                useraddr += 4;
+                if (reg == reg3)
+                        break;
+                reg = (reg + 1) % 16;
+        } while (1);
+        return 0;
+}
+/*
+ * Handlers for intercepted instructions, indexed by the upper byte of
+ * sie_block->ipa (see handle_instruction()).  Slots that are not listed
+ * stay NULL, which makes handle_instruction() return -ENOTSUPP.
+ */
+static intercept_handler_t instruction_handlers[256] = {
+        [0x83] = kvm_s390_handle_diag,
+        [0xae] = kvm_s390_handle_sigp,
+        [0xb2] = kvm_s390_handle_priv,
+        [0xb7] = handle_lctl,
+        [0xeb] = handle_lctg,
+};
+/*
+ * Intercept handler that needs no work besides statistics: counts
+ * intercept code 0x10 as an external request and 0x14 as an external
+ * interrupt, ignores every other code.  Always returns 0.
+ */
+static int handle_noop(struct kvm_vcpu *vcpu)
+{
+        if (vcpu->arch.sie_block->icptcode == 0x10)
+                vcpu->stat.exit_external_request++;
+        else if (vcpu->arch.sie_block->icptcode == 0x14)
+                vcpu->stat.exit_external_interrupt++;
+        return 0;
+}
+/*
+ * Handle a stop intercept.  The vcpu is marked as not running, then the
+ * pending action bits are processed under the local interrupt lock:
+ *
+ *  - ACTION_STORE_ON_STOP: store the cpu status; the result is the error
+ *    from __kvm_s390_vcpu_store_status() or -ENOTSUPP on success.
+ *  - ACTION_STOP_ON_STOP: the result is -ENOTSUPP.
+ *
+ * Returns 0 if neither action was pending.
+ */
+static int handle_stop(struct kvm_vcpu *vcpu)
+{
+        int rc = 0;
+        vcpu->stat.exit_stop_request++;
+        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+        spin_lock_bh(&vcpu->arch.local_int.lock);
+        if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
+                vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
+                rc = __kvm_s390_vcpu_store_status(vcpu,
+                                                  KVM_S390_STORE_STATUS_NOADDR);
+                if (rc >= 0)
+                        rc = -ENOTSUPP;
+        }
+        /*
+         * No else branch resetting rc here: it would throw away the
+         * result of the store-on-stop handling above.
+         */
+        if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
+                vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
+                VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
+                rc = -ENOTSUPP;
+        }
+        spin_unlock_bh(&vcpu->arch.local_int.lock);
+        return rc;
+}
+/*
+ * Handle a validity intercept.  The reason code is taken from the upper
+ * half of the ipb field.  Only reason 0x37 is handled: one page at
+ * guest_origin + prefix is faulted in for writing and 0 is returned.
+ * Every other reason is logged and answered with -ENOTSUPP.
+ */
+static int handle_validity(struct kvm_vcpu *vcpu)
+{
+        int viwhy = vcpu->arch.sie_block->ipb >> 16;
+        vcpu->stat.exit_validity++;
+        if (viwhy != 0x37) {
+                VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
+                           viwhy);
+                return -ENOTSUPP;
+        }
+        fault_in_pages_writeable((char __user *)
+                                 vcpu->kvm->arch.guest_origin +
+                                 vcpu->arch.sie_block->prefix,
+                                 PAGE_SIZE);
+        return 0;
+}
+/*
+ * Handle an instruction intercept: look up the handler registered in
+ * instruction_handlers[] for the upper byte of the ipa field and run it.
+ * Returns the handler's result, or -ENOTSUPP if no handler is registered.
+ */
+static int handle_instruction(struct kvm_vcpu *vcpu)
+{
+        intercept_handler_t handler;
+        vcpu->stat.exit_instruction++;
+        handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
+        return handler ? handler(vcpu) : -ENOTSUPP;
+}
+/*
+ * Handle a program interruption intercept: count it and inject a program
+ * interrupt with the code found in sie_block->iprcc.  Returns the result
+ * of kvm_s390_inject_program_int().
+ */
+static int handle_prog(struct kvm_vcpu *vcpu)
+{
+        vcpu->stat.exit_program_interruption++;
+        return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
+}
+/*
+ * Handle a combined instruction and program interruption intercept.  Both
+ * handle_instruction() and handle_prog() are always run, in that order.
+ * If the instruction handler reports -ENOTSUPP the intercept code is
+ * rewritten to 0x04.  A non-zero instruction result takes precedence over
+ * the result of the program interrupt injection.
+ */
+static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
+{
+        int instr_rc, prog_rc;
+        vcpu->stat.exit_instr_and_program++;
+        instr_rc = handle_instruction(vcpu);
+        prog_rc = handle_prog(vcpu);
+        if (instr_rc == -ENOTSUPP)
+                vcpu->arch.sie_block->icptcode = 0x04;
+        return instr_rc ? instr_rc : prog_rc;
+}
+/*
+ * Intercept handlers, indexed by sie_block->icptcode >> 2 (see
+ * kvm_handle_sie_intercept()).  The table has 0x48 >> 2 entries; slots
+ * that are not listed stay NULL.
+ */
+static const intercept_handler_t intercept_funcs[0x48 >> 2] = {
+        [0x00 >> 2] = handle_noop,
+        [0x04 >> 2] = handle_instruction,
+        [0x08 >> 2] = handle_prog,
+        [0x0C >> 2] = handle_instruction_and_prog,
+        [0x10 >> 2] = handle_noop,
+        [0x14 >> 2] = handle_noop,
+        [0x1C >> 2] = kvm_s390_handle_wait,
+        [0x20 >> 2] = handle_validity,
+        [0x28 >> 2] = handle_stop,
+};
+/*
+ * Dispatch a SIE intercept to its handler in intercept_funcs[].
+ *
+ * Intercept codes are multiples of 4 and are used, shifted right by 2, as
+ * table index.  Returns the handler's result, or -ENOTSUPP for a code that
+ * is not a multiple of 4, lies beyond the table or has no handler.
+ */
+int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
+{
+        intercept_handler_t func;
+        u8 code = vcpu->arch.sie_block->icptcode;
+        /*
+         * Bound the index by the real table size; the previous check
+         * "code > 0x48" let code 0x48 index one slot past the end.
+         */
+        if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
+                return -ENOTSUPP;
+        func = intercept_funcs[code >> 2];
+        if (func)
+                return func(vcpu);
+        return -ENOTSUPP;
+}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
new file mode 100644
index 00000000000..fcd1ed8015c
--- /dev/null
+++ b/arch/s390/kvm/interrupt.c
@@ -0,0 +1,592 @@
+/*
+ * interrupt.c - handling kvm guest interrupts
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ */
+#include <asm/lowcore.h>
+#include <asm/uaccess.h>
+#include <linux/kvm_host.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+/* Returns 1 if PSW_MASK_EXT is clear in the guest PSW mask, else 0. */
+static int psw_extint_disabled(struct kvm_vcpu *vcpu)
+{
+        return (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT) == 0;
+}
+/*
+ * Returns 1 if none of PSW_MASK_PER, PSW_MASK_IO and PSW_MASK_EXT is set
+ * in the guest PSW mask, 0 if at least one of them is set.
+ */
+static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
+{
+        return !(vcpu->arch.sie_block->gpsw.mask &
+                 (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT));
+}
+/*
+ * Decide whether @inti can be delivered to @vcpu right now.
+ *
+ * Emergency, service and virtio interrupts need PSW_MASK_EXT set in the
+ * guest PSW plus a bit in guest control register 0 (0x4000 for emergency,
+ * 0x200 for service and virtio).  Program interrupts, sigp stop, sigp set
+ * prefix and restart are always deliverable.  Unknown types BUG().
+ *
+ * Returns 1 if deliverable, 0 otherwise.
+ */
+static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
+                                      struct interrupt_info *inti)
+{
+        unsigned long cr0_bit;
+        switch (inti->type) {
+        case KVM_S390_INT_EMERGENCY:
+                cr0_bit = 0x4000ul;
+                break;
+        case KVM_S390_INT_SERVICE:
+        case KVM_S390_INT_VIRTIO:
+                cr0_bit = 0x200ul;
+                break;
+        case KVM_S390_PROGRAM_INT:
+        case KVM_S390_SIGP_STOP:
+        case KVM_S390_SIGP_SET_PREFIX:
+        case KVM_S390_RESTART:
+                return 1;
+        default:
+                BUG();
+                return 0;
+        }
+        if (psw_extint_disabled(vcpu))
+                return 0;
+        return (vcpu->arch.sie_block->gcr[0] & cr0_bit) ? 1 : 0;
+}
+/*
+ * Mark @vcpu as idle: set CPUSTAT_WAIT in its cpuflags and set its bit in
+ * the idle mask of the floating interrupt structure.  BUG()s if vcpu_id
+ * is larger than KVM_MAX_VCPUS - 1.
+ */
+static void __set_cpu_idle(struct kvm_vcpu *vcpu)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
+        atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+        set_bit(vcpu->vcpu_id, li->float_int->idle_mask);
+}
+/*
+ * Mark @vcpu as no longer idle: clear CPUSTAT_WAIT in its cpuflags and
+ * clear its bit in the idle mask of the floating interrupt structure.
+ * BUG()s if vcpu_id is larger than KVM_MAX_VCPUS - 1.
+ */
+static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
+        atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+        clear_bit(vcpu->vcpu_id, li->float_int->idle_mask);
+}
+/*
+ * Clear the ecall-pending, io, external and stop interrupt indicator bits
+ * in the vcpu's cpuflags and reset sie_block->lctl to 0.  Called by
+ * kvm_s390_deliver_pending_interrupts() before it walks the pending
+ * interrupts.
+ */
+static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
+{
+        atomic_clear_mask(CPUSTAT_ECALL_PEND |
+                CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
+                &vcpu->arch.sie_block->cpuflags);
+        vcpu->arch.sie_block->lctl = 0x0000;
+}
+/* Atomically set the bits of @flag in the cpuflags of @vcpu's SIE block. */
+static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
+{
+        atomic_set_mask(flag, &vcpu->arch.sie_block->cpuflags);
+}
+/*
+ * Set the SIE block indicator that belongs to the interrupt @inti.
+ *
+ * For emergency, service and virtio interrupts: if PSW_MASK_EXT is set in
+ * the guest PSW, LCTL_CR0 is set in sie_block->lctl, otherwise
+ * CPUSTAT_EXT_INT is set in cpuflags.  For sigp stop CPUSTAT_STOP_INT is
+ * set.  Any other interrupt type BUG()s.
+ */
+static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
+                                      struct interrupt_info *inti)
+{
+        switch (inti->type) {
+        case KVM_S390_SIGP_STOP:
+                __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+                break;
+        case KVM_S390_INT_EMERGENCY:
+        case KVM_S390_INT_SERVICE:
+        case KVM_S390_INT_VIRTIO:
+                if (!psw_extint_disabled(vcpu))
+                        vcpu->arch.sie_block->lctl |= LCTL_CR0;
+                else
+                        __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+                break;
+        default:
+                BUG();
+        }
+}
+/*
+ * Deliver the single interrupt @inti to @vcpu.
+ *
+ * For emergency, service, virtio, restart and program interrupts the
+ * interruption data is stored at the corresponding lowcore offsets in
+ * guest memory, the current guest PSW is saved as old PSW and the new PSW
+ * is loaded from guest memory into the SIE block.  Sigp stop only sets
+ * the intercept indicator, sigp set prefix updates the SIE block
+ * directly.  Unknown types BUG().
+ *
+ * If any guest access returned -EFAULT, an addressing program interrupt
+ * is injected; when that happens while delivering a program interrupt
+ * the host BUG()s ("recursive program check").
+ */
+static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
+                                   struct interrupt_info *inti)
+{
+        /* ilc value stored for a program interrupt, indexed by ipa >> 14 */
+        const unsigned short table[] = { 2, 4, 4, 6 };
+        int rc, exception = 0;
+        switch (inti->type) {
+        case KVM_S390_INT_EMERGENCY:
+                VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
+                vcpu->stat.deliver_emergency_signal++;
+                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
+                if (rc == -EFAULT)
+                        exception = 1;
+                /* save the current PSW as old PSW, then load the new one */
+                rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                         &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                        __LC_EXT_NEW_PSW, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                break;
+        case KVM_S390_INT_SERVICE:
+                VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
+                           inti->ext.ext_params);
+                vcpu->stat.deliver_service_signal++;
+                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                         &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                        __LC_EXT_NEW_PSW, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
+                if (rc == -EFAULT)
+                        exception = 1;
+                break;
+        case KVM_S390_INT_VIRTIO:
+                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%lx",
+                           inti->ext.ext_params, inti->ext.ext_params2);
+                vcpu->stat.deliver_virtio_interrupt++;
+                rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, 0x0d00);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                         &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                        __LC_EXT_NEW_PSW, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = put_guest_u64(vcpu, __LC_PFAULT_INTPARM,
+                        inti->ext.ext_params2);
+                if (rc == -EFAULT)
+                        exception = 1;
+                break;
+        case KVM_S390_SIGP_STOP:
+                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
+                vcpu->stat.deliver_stop_signal++;
+                /* nothing is stored in guest memory for a stop request */
+                __set_intercept_indicator(vcpu, inti);
+                break;
+        case KVM_S390_SIGP_SET_PREFIX:
+                VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
+                           inti->prefix.address);
+                vcpu->stat.deliver_prefix_signal++;
+                vcpu->arch.sie_block->prefix = inti->prefix.address;
+                vcpu->arch.sie_block->ihcpu = 0xffff;
+                break;
+        case KVM_S390_RESTART:
+                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
+                vcpu->stat.deliver_restart_signal++;
+                rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
+                  restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                        offsetof(struct _lowcore, restart_psw), sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                break;
+        case KVM_S390_PROGRAM_INT:
+                VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
+                           inti->pgm.code,
+                           table[vcpu->arch.sie_block->ipa >> 14]);
+                vcpu->stat.deliver_program_int++;
+                rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = put_guest_u16(vcpu, __LC_PGM_ILC,
+                        table[vcpu->arch.sie_block->ipa >> 14]);
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
+                         &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                        __LC_PGM_NEW_PSW, sizeof(psw_t));
+                if (rc == -EFAULT)
+                        exception = 1;
+                break;
+        default:
+                BUG();
+        }
+        /* at least one guest access above failed with -EFAULT */
+        if (exception) {
+                VCPU_EVENT(vcpu, 1, "%s", "program exception while delivering"
+                           " interrupt");
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                if (inti->type == KVM_S390_PROGRAM_INT) {
+                        printk(KERN_WARNING "kvm: recursive program check\n");
+                        BUG();
+                }
+        }
+}
+/*
+ * Try to deliver an external interrupt with code 0x1004 to @vcpu.
+ *
+ * Nothing is done (return 0) if PSW_MASK_EXT is clear in the guest PSW or
+ * bit 0x800 is clear in guest control register 0.  Otherwise the
+ * interrupt code is stored, the current PSW is saved as external old PSW
+ * and the external new PSW is loaded.  If any of these guest accesses
+ * returns -EFAULT an addressing program interrupt is injected and 0 is
+ * returned; on success 1 is returned.
+ */
+static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
+{
+        int faulted = 0;
+        if (psw_extint_disabled(vcpu) ||
+            !(vcpu->arch.sie_block->gcr[0] & 0x800ul))
+                return 0;
+        if (put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004) == -EFAULT)
+                faulted = 1;
+        if (copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                          &vcpu->arch.sie_block->gpsw,
+                          sizeof(psw_t)) == -EFAULT)
+                faulted = 1;
+        if (copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                            __LC_EXT_NEW_PSW, sizeof(psw_t)) == -EFAULT)
+                faulted = 1;
+        if (!faulted)
+                return 1;
+        VCPU_EVENT(vcpu, 1, "%s", "program exception while delivering" \
+                   " ckc interrupt");
+        kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+        return 0;
+}
+/*
+ * Returns 1 if the interrupt list @head contains at least one entry that
+ * __interrupt_is_deliverable() accepts for @vcpu, else 0.  The caller
+ * holds the lock protecting the list.
+ */
+static int __pending_list_has_deliverable(struct kvm_vcpu *vcpu,
+                                          struct list_head *head)
+{
+        struct interrupt_info *inti;
+        list_for_each_entry(inti, head, list)
+                if (__interrupt_is_deliverable(vcpu, inti))
+                        return 1;
+        return 0;
+}
+/*
+ * Returns 1 if @vcpu has an interrupt that could be delivered now, else 0.
+ *
+ * Checked in this order, stopping at the first hit: the local interrupt
+ * list, the floating interrupt list (each only if marked active, and each
+ * under its own lock), and finally the clock comparator, which counts if
+ * ckc lies before the guest clock (get_clock() + epoch), PSW_MASK_EXT is
+ * set in the guest PSW and bit 0x800 is set in guest control register 0.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        struct float_interrupt *fi = vcpu->arch.local_int.float_int;
+        int rc = 0;
+        if (atomic_read(&li->active)) {
+                spin_lock_bh(&li->lock);
+                rc = __pending_list_has_deliverable(vcpu, &li->list);
+                spin_unlock_bh(&li->lock);
+        }
+        if ((!rc) && atomic_read(&fi->active)) {
+                spin_lock_bh(&fi->lock);
+                rc = __pending_list_has_deliverable(vcpu, &fi->list);
+                spin_unlock_bh(&fi->lock);
+        }
+        if ((!rc) &&
+            (vcpu->arch.sie_block->ckc <
+                get_clock() + vcpu->arch.sie_block->epoch) &&
+            (!psw_extint_disabled(vcpu)) &&
+            (vcpu->arch.sie_block->gcr[0] & 0x800ul))
+                rc = 1;
+        return rc;
+}
+/* Always returns 0; @vcpu is not examined. */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        return 0;
+}
+/*
+ * Handle a wait state intercept by putting the vcpu thread to sleep until
+ * there is work for it.
+ *
+ * Returns 0 immediately if an interrupt is already deliverable or the
+ * clock comparator has already expired, and -ENOTSUPP for a disabled wait
+ * (per, io and external interrupts all masked in the guest PSW).
+ * Otherwise, if external interrupts and the clock comparator subclass
+ * (bit 0x800 of control register 0) are enabled, ckc_timer is armed for
+ * the remaining time.  The thread then sleeps interruptibly on
+ * local_int.wq until a local or floating interrupt is queued, the timer
+ * fires (timer_due) or a signal is pending, and 0 is returned.
+ */
+int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
+{
+        u64 now, sltime;
+        DECLARE_WAITQUEUE(wait, current);
+        vcpu->stat.exit_wait_state++;
+        if (kvm_cpu_has_interrupt(vcpu))
+                return 0;
+        if (psw_interrupts_disabled(vcpu)) {
+                VCPU_EVENT(vcpu, 3, "%s", "disabled wait");
+                __unset_cpu_idle(vcpu);
+                return -ENOTSUPP; /* disabled wait */
+        }
+        if (psw_extint_disabled(vcpu) ||
+            (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))) {
+                VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
+                goto no_timer;
+        }
+        now = get_clock() + vcpu->arch.sie_block->epoch;
+        if (vcpu->arch.sie_block->ckc < now) {
+                __unset_cpu_idle(vcpu);
+                return 0;
+        }
+        /*
+         * Convert the clock delta to jiffies, rounding up.
+         * 0xf4240000 == 1000000 << 12, presumably the number of clock
+         * units per second.
+         */
+        sltime = (vcpu->arch.sie_block->ckc - now) / (0xf4240000ul / HZ) + 1;
+        vcpu->arch.ckc_timer.expires = jiffies + sltime;
+        add_timer(&vcpu->arch.ckc_timer);
+        VCPU_EVENT(vcpu, 5, "enabled wait timer:%lx jiffies", sltime);
+no_timer:
+        /* lock order: floating interrupt lock first, then the local one */
+        spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+        spin_lock_bh(&vcpu->arch.local_int.lock);
+        __set_cpu_idle(vcpu);
+        vcpu->arch.local_int.timer_due = 0;
+        add_wait_queue(&vcpu->arch.local_int.wq, &wait);
+        while (list_empty(&vcpu->arch.local_int.list) &&
+                list_empty(&vcpu->arch.local_int.float_int->list) &&
+                (!vcpu->arch.local_int.timer_due) &&
+                !signal_pending(current)) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                spin_unlock_bh(&vcpu->arch.local_int.lock);
+                spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+                vcpu_put(vcpu);
+                schedule();
+                vcpu_load(vcpu);
+                spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+                spin_lock_bh(&vcpu->arch.local_int.lock);
+        }
+        __unset_cpu_idle(vcpu);
+        __set_current_state(TASK_RUNNING);
+        /*
+         * Remove the entry from the queue it was added to.  This used to
+         * name vcpu->wq, which left the on-stack entry linked into
+         * local_int.wq after return.
+         */
+        remove_wait_queue(&vcpu->arch.local_int.wq, &wait);
+        spin_unlock_bh(&vcpu->arch.local_int.lock);
+        spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+        del_timer(&vcpu->arch.ckc_timer);
+        return 0;
+}
+/*
+ * Timer callback; @data carries the struct kvm_vcpu pointer.
+ * Flags the timer as due and wakes a task sleeping on the vcpu's local
+ * interrupt wait queue, all under the local interrupt lock.
+ */
+void kvm_s390_idle_wakeup(unsigned long data)
+{
+        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        spin_lock_bh(&li->lock);
+        li->timer_due = 1;
+        if (waitqueue_active(&li->wq))
+                wake_up_interruptible(&li->wq);
+        spin_unlock_bh(&li->lock);
+}
+/*
+ * Drain one interrupt queue: repeatedly unlink the first deliverable entry
+ * under @lock, then deliver and free it with the lock dropped. Entries
+ * that are skipped get their intercept indicator set. @active is cleared
+ * whenever the queue is found empty. Stops when a pass finds nothing
+ * deliverable.
+ */
+static void __deliver_from_queue(struct kvm_vcpu *vcpu, spinlock_t *lock,
+                                 struct list_head *queue, atomic_t *active)
+{
+        struct interrupt_info *cur, *tmp, *found;
+        for (;;) {
+                found = NULL;
+                spin_lock_bh(lock);
+                list_for_each_entry_safe(cur, tmp, queue, list) {
+                        if (__interrupt_is_deliverable(vcpu, cur)) {
+                                list_del(&cur->list);
+                                found = cur;
+                                break;
+                        }
+                        __set_intercept_indicator(vcpu, cur);
+                }
+                if (list_empty(queue))
+                        atomic_set(active, 0);
+                spin_unlock_bh(lock);
+                if (!found)
+                        return;
+                __do_deliver_interrupt(vcpu, found);
+                kfree(found);
+        }
+}
+/*
+ * Deliver everything that is currently deliverable to @vcpu: first the
+ * vcpu-local interrupts, then a clock comparator interrupt if the
+ * comparator value is already in the past, then the floating interrupts.
+ */
+void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        struct float_interrupt *fi = li->float_int;
+        __reset_intercept_indicators(vcpu);
+        if (atomic_read(&li->active))
+                __deliver_from_queue(vcpu, &li->lock, &li->list, &li->active);
+        if (vcpu->arch.sie_block->ckc <
+            get_clock() + vcpu->arch.sie_block->epoch)
+                __try_deliver_ckc_interrupt(vcpu);
+        if (atomic_read(&fi->active))
+                __deliver_from_queue(vcpu, &fi->lock, &fi->list, &fi->active);
+}
+/*
+ * Queue a program interrupt with interruption code @code for @vcpu from
+ * kernel context. The entry is put at the head of the local list.
+ *
+ * The BUG_ON below enforces that nobody is sleeping on the vcpu's wait
+ * queue when this is called.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
+ */
+int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        struct interrupt_info *inti;
+        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+        if (!inti)
+                return -ENOMEM;
+        inti->type = KVM_S390_PROGRAM_INT;
+        inti->pgm.code = code;
+        VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
+        spin_lock_bh(&li->lock);
+        list_add(&inti->list, &li->list);
+        atomic_set(&li->active, 1);
+        BUG_ON(waitqueue_active(&li->wq));
+        spin_unlock_bh(&li->lock);
+        return 0;
+}
+/*
+ * Queue a floating (VM-wide) interrupt described by @s390int and kick one
+ * vcpu so that it gets delivered.
+ *
+ * Only KVM_S390_INT_VIRTIO and KVM_S390_INT_SERVICE are accepted here.
+ * An idle vcpu is preferred as the target; otherwise the next vcpu in
+ * round-robin order is used. If no vcpu exists yet the interrupt simply
+ * stays queued on the floating list.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL for an
+ * interrupt type that cannot be injected per VM.
+ */
+int kvm_s390_inject_vm(struct kvm *kvm,
+                       struct kvm_s390_interrupt *s390int)
+{
+        struct local_interrupt *li;
+        struct float_interrupt *fi;
+        struct interrupt_info *inti;
+        int sigcpu, tries;
+        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+        if (!inti)
+                return -ENOMEM;
+        switch (s390int->type) {
+        case KVM_S390_INT_VIRTIO:
+                VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%lx",
+                         s390int->parm, s390int->parm64);
+                inti->type = s390int->type;
+                inti->ext.ext_params = s390int->parm;
+                inti->ext.ext_params2 = s390int->parm64;
+                break;
+        case KVM_S390_INT_SERVICE:
+                VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
+                inti->type = s390int->type;
+                inti->ext.ext_params = s390int->parm;
+                break;
+        case KVM_S390_PROGRAM_INT:
+        case KVM_S390_SIGP_STOP:
+        case KVM_S390_INT_EMERGENCY:
+        default:
+                kfree(inti);
+                return -EINVAL;
+        }
+        mutex_lock(&kvm->lock);
+        fi = &kvm->arch.float_int;
+        spin_lock_bh(&fi->lock);
+        list_add_tail(&inti->list, &fi->list);
+        atomic_set(&fi->active, 1);
+        sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+        if (sigcpu < KVM_MAX_VCPUS) {
+                li = fi->local_int[sigcpu];
+        } else {
+                /*
+                 * No idle vcpu: pick the next existing one round-robin.
+                 * The search is bounded (one full cycle including the
+                 * wrap-around step) so that a VM without any vcpu cannot
+                 * make us spin forever with the locks held.
+                 */
+                li = NULL;
+                for (tries = 0; !li && tries <= KVM_MAX_VCPUS; tries++) {
+                        sigcpu = fi->next_rr_cpu++;
+                        if (sigcpu == KVM_MAX_VCPUS)
+                                sigcpu = fi->next_rr_cpu = 0;
+                        li = fi->local_int[sigcpu];
+                }
+        }
+        if (li) {
+                spin_lock_bh(&li->lock);
+                atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+                if (waitqueue_active(&li->wq))
+                        wake_up_interruptible(&li->wq);
+                spin_unlock_bh(&li->lock);
+        }
+        spin_unlock_bh(&fi->lock);
+        mutex_unlock(&kvm->lock);
+        return 0;
+}
+/*
+ * Queue a vcpu-local interrupt described by @s390int on @vcpu and wake the
+ * vcpu if it is sleeping.
+ *
+ * Program interrupts (parm must fit into 16 bits) go to the head of the
+ * list, all other accepted types to the tail. Virtio and service
+ * interrupts are rejected here.
+ *
+ * Returns 0 on success, -ENOMEM or -EINVAL on failure.
+ */
+int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
+                         struct kvm_s390_interrupt *s390int)
+{
+        struct local_interrupt *li = &vcpu->arch.local_int;
+        struct interrupt_info *irq;
+        irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+        if (!irq)
+                return -ENOMEM;
+        switch (s390int->type) {
+        case KVM_S390_PROGRAM_INT:
+                if (s390int->parm & 0xffff0000)
+                        goto out_invalid;
+                irq->type = s390int->type;
+                irq->pgm.code = s390int->parm;
+                VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)",
+                           s390int->parm);
+                break;
+        case KVM_S390_SIGP_STOP:
+        case KVM_S390_RESTART:
+        case KVM_S390_SIGP_SET_PREFIX:
+        case KVM_S390_INT_EMERGENCY:
+                VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
+                irq->type = s390int->type;
+                break;
+        case KVM_S390_INT_VIRTIO:
+        case KVM_S390_INT_SERVICE:
+        default:
+                goto out_invalid;
+        }
+        mutex_lock(&vcpu->kvm->lock);
+        spin_lock_bh(&li->lock);
+        if (irq->type == KVM_S390_PROGRAM_INT)
+                list_add(&irq->list, &li->list);
+        else
+                list_add_tail(&irq->list, &li->list);
+        atomic_set(&li->active, 1);
+        if (irq->type == KVM_S390_SIGP_STOP)
+                li->action_bits |= ACTION_STOP_ON_STOP;
+        atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+        if (waitqueue_active(&li->wq))
+                wake_up_interruptible(&li->wq);
+        spin_unlock_bh(&li->lock);
+        mutex_unlock(&vcpu->kvm->lock);
+        return 0;
+out_invalid:
+        kfree(irq);
+        return -EINVAL;
+}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
new file mode 100644
index 00000000000..98d1e73e01f
--- /dev/null
+++ b/arch/s390/kvm/kvm-s390.c
@@ -0,0 +1,685 @@
+/*
+ * kvm-s390.c --  hosting zSeries kernel virtual machines
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ *               Heiko Carstens <heiko.carstens@de.ibm.com>
+ */
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <asm/lowcore.h>
+#include <asm/pgtable.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+/* expands to the two initializers "offset of stat.x in struct kvm_vcpu"
+ * and KVM_STAT_VCPU, for use inside the table below */
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+/* name -> per-vcpu statistics counter table; terminated by a NULL name */
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+        { "userspace_handled", VCPU_STAT(exit_userspace) },
+        { "exit_validity", VCPU_STAT(exit_validity) },
+        { "exit_stop_request", VCPU_STAT(exit_stop_request) },
+        { "exit_external_request", VCPU_STAT(exit_external_request) },
+        { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
+        { "exit_instruction", VCPU_STAT(exit_instruction) },
+        { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
+        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+        { "instruction_lctg", VCPU_STAT(instruction_lctg) },
+        { "instruction_lctl", VCPU_STAT(instruction_lctl) },
+        { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
+        { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
+        { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
+        { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
+        { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
+        { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
+        { "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
+        { "exit_wait_state", VCPU_STAT(exit_wait_state) },
+        { "instruction_stidp", VCPU_STAT(instruction_stidp) },
+        { "instruction_spx", VCPU_STAT(instruction_spx) },
+        { "instruction_stpx", VCPU_STAT(instruction_stpx) },
+        { "instruction_stap", VCPU_STAT(instruction_stap) },
+        { "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
+        { "instruction_stsch", VCPU_STAT(instruction_stsch) },
+        { "instruction_chsc", VCPU_STAT(instruction_chsc) },
+        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
+        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
+        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
+        { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
+        { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
+        { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
+        { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
+        { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
+        { "diagnose_44", VCPU_STAT(diagnose_44) },
+        { NULL }
+};
+/* Section: not file related */
+/*
+ * Hardware enable hook. Intentionally empty: every s390 is
+ * virtualization enabled, so there is nothing to switch on.
+ * @garbage is unused.
+ */
+void kvm_arch_hardware_enable(void *garbage)
+{
+        return;
+}
+/* Hardware disable hook; intentionally a no-op, @garbage is unused. */
+void kvm_arch_hardware_disable(void *garbage)
+{
+        return;
+}
+/* Intentionally a no-op in this implementation; @cpu is unused. */
+void decache_vcpus_on_cpu(int cpu)
+{
+        return;
+}
+/* Hardware setup hook; nothing to set up, always returns 0. */
+int kvm_arch_hardware_setup(void)
+{
+        const int rc = 0;
+        return rc;
+}
+/* Hardware teardown hook; intentionally a no-op. */
+void kvm_arch_hardware_unsetup(void)
+{
+        return;
+}
+/* Processor compatibility hook; intentionally a no-op, @rtn is not
+ * written. */
+void kvm_arch_check_processor_compat(void *rtn)
+{
+        return;
+}
+/* Arch init hook; nothing to initialise, @opaque is unused. Returns 0. */
+int kvm_arch_init(void *opaque)
+{
+        const int rc = 0;
+        return rc;
+}
+/* Arch exit hook; intentionally a no-op. */
+void kvm_arch_exit(void)
+{
+        return;
+}
+/* Section: device related */
+/*
+ * Arch specific device ioctls. Only KVM_S390_ENABLE_SIE is known; it
+ * returns the result of s390_enable_sie(). Everything else yields
+ * -EINVAL. @filp and @arg are unused.
+ */
+long kvm_arch_dev_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+        switch (ioctl) {
+        case KVM_S390_ENABLE_SIE:
+                return s390_enable_sie();
+        default:
+                return -EINVAL;
+        }
+}
+/* Extension query; no extension is reported, returns 0 for any @ext. */
+int kvm_dev_ioctl_check_extension(long ext)
+{
+        const int supported = 0;
+        return supported;
+}
+/* Section: vm related */
+/*
+ * Dirty memory log ioctl for a memory slot.
+ * Stub: neither @kvm nor @log is touched and 0 is returned.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                               struct kvm_dirty_log *log)
+{
+        const int rc = 0;
+        return rc;
+}
+/*
+ * Arch specific VM ioctls. KVM_S390_INTERRUPT copies a
+ * struct kvm_s390_interrupt from user space and injects it as a floating
+ * interrupt. Returns -EFAULT if the copy fails and -EINVAL for any other
+ * ioctl number.
+ */
+long kvm_arch_vm_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
+{
+        struct kvm *kvm = filp->private_data;
+        struct kvm_s390_interrupt irq;
+        if (ioctl != KVM_S390_INTERRUPT)
+                return -EINVAL;
+        if (copy_from_user(&irq, (void __user *)arg, sizeof(irq)))
+                return -EFAULT;
+        return kvm_s390_inject_vm(kvm, &irq);
+}
+/*
+ * Allocate and initialise a VM: enable SIE for the current process,
+ * allocate the kvm structure and its SCA page, register a debug feature
+ * named "kvm-<pid>" and set up the floating interrupt list.
+ *
+ * Returns the new VM or an ERR_PTR() carrying the s390_enable_sie()
+ * error, or -ENOMEM when an allocation or the debug registration fails.
+ */
+struct kvm *kvm_arch_create_vm(void)
+{
+        char dbf_name[16];
+        struct kvm *kvm;
+        int err = s390_enable_sie();
+        if (err)
+                return ERR_PTR(err);
+        err = -ENOMEM;
+        kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+        if (!kvm)
+                return ERR_PTR(err);
+        kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
+        if (!kvm->arch.sca)
+                goto free_kvm;
+        sprintf(dbf_name, "kvm-%u", current->pid);
+        kvm->arch.dbf = debug_register(dbf_name, 8, 2, 8 * sizeof(long));
+        if (!kvm->arch.dbf)
+                goto free_sca;
+        spin_lock_init(&kvm->arch.float_int.lock);
+        INIT_LIST_HEAD(&kvm->arch.float_int.list);
+        debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
+        VM_EVENT(kvm, 3, "%s", "vm created");
+        /* each VM holds a module reference, dropped in destroy_vm */
+        try_module_get(THIS_MODULE);
+        return kvm;
+free_sca:
+        free_page((unsigned long)(kvm->arch.sca));
+free_kvm:
+        kfree(kvm);
+        return ERR_PTR(err);
+}
+/*
+ * Tear down a VM: unregister its debug feature, free the SCA page and the
+ * kvm structure, and drop the module reference taken at creation.
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+        unsigned long sca_page;
+        debug_unregister(kvm->arch.dbf);
+        sca_page = (unsigned long)(kvm->arch.sca);
+        free_page(sca_page);
+        kfree(kvm);
+        module_put(THIS_MODULE);
+}
+/* Section: vcpu related */
+/* Per-vcpu init hook; nothing to do here, always returns 0. */
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+        const int rc = 0;
+        return rc;
+}
+/*
+ * Referenced by the kvm common code but not expected to be called on
+ * s390; reaching it is treated as a bug.
+ */
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+        BUG();
+}
+/*
+ * Switch register context from host to guest: stash the host floating
+ * point and access registers, then load the guest copies (with the guest
+ * fpc masked by FPC_VALID_MASK). If a signal is already pending for the
+ * current task, CPUSTAT_STOP_INT is set in the SIE block's cpuflags.
+ * @cpu is unused.
+ */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+        save_fp_regs(&vcpu->arch.host_fpregs);
+        save_access_regs(vcpu->arch.host_acrs);
+        vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
+        restore_fp_regs(&vcpu->arch.guest_fpregs);
+        restore_access_regs(vcpu->arch.guest_acrs);
+        if (!signal_pending(current))
+                return;
+        atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
+}
+/*
+ * Counterpart of kvm_arch_vcpu_load(): save the guest floating point and
+ * access registers, then reload the host copies saved at load time.
+ */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+        /* guest state out first ... */
+        save_fp_regs(&vcpu->arch.guest_fpregs);
+        save_access_regs(vcpu->arch.guest_acrs);
+        /* ... then host state back in */
+        restore_fp_regs(&vcpu->arch.host_fpregs);
+        restore_access_regs(vcpu->arch.host_acrs);
+}
+/*
+ * Bring the vcpu's architected state to its initial reset values. This
+ * equals the initial cpu reset in pop, except that we don't switch to
+ * ESA. The floating point control register is cleared both in the saved
+ * guest copy and in the hardware register.
+ */
+static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
+{
+        struct sie_block *scb = vcpu->arch.sie_block;
+        scb->gpsw.mask = 0UL;
+        scb->gpsw.addr = 0UL;
+        scb->prefix    = 0UL;
+        scb->ihcpu     = 0xffff;
+        scb->cputm     = 0UL;
+        scb->ckc       = 0UL;
+        scb->todpr     = 0;
+        /* all control registers zero, except cr0 and cr14 */
+        memset(scb->gcr, 0, 16 * sizeof(__u64));
+        scb->gcr[0]  = 0xE0UL;
+        scb->gcr[14] = 0xC2000000UL;
+        vcpu->arch.guest_fpregs.fpc = 0;
+        asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
+        scb->gbea = 1;
+}
+/*
+ * One-time vcpu setup: program the static SIE block fields, prepare the
+ * clock comparator wakeup timer (callback kvm_s390_idle_wakeup with the
+ * vcpu as argument) and take the host cpu id with the version field
+ * overridden to 0xfe. Always returns 0.
+ */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+        struct sie_block *scb = vcpu->arch.sie_block;
+        atomic_set(&scb->cpuflags, CPUSTAT_ZARCH);
+        scb->gmslm = 0xffffffffffUL;
+        scb->gmsor = 0x000000000000;
+        scb->ecb   = 2;
+        scb->eca   = 0xC1002001U;
+        setup_timer(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup,
+                 (unsigned long) vcpu);
+        get_cpu_id(&vcpu->arch.cpu_id);
+        vcpu->arch.cpu_id.version = 0xfe;
+        return 0;
+}
+/*
+ * Allocate vcpu @id for @kvm together with its SIE control block, hook
+ * the block into the VM's SCA and register the vcpu's local interrupt
+ * structure with the floating interrupt bookkeeping.
+ *
+ * Returns the new vcpu or an ERR_PTR(): -ENOMEM on allocation failure or
+ * the error reported by kvm_vcpu_init(). On failure everything that was
+ * published for this vcpu is withdrawn again and all memory is freed.
+ */
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+                                      unsigned int id)
+{
+        struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+        int rc = -ENOMEM;
+        if (!vcpu)
+                goto out_nomem;
+        vcpu->arch.sie_block = (struct sie_block *) get_zeroed_page(GFP_KERNEL);
+        if (!vcpu->arch.sie_block)
+                goto out_free_cpu;
+        vcpu->arch.sie_block->icpua = id;
+        BUG_ON(!kvm->arch.sca);
+        BUG_ON(kvm->arch.sca->cpu[id].sda);
+        kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+        /* the SIE block stores the SCA address split in two 32 bit halves */
+        vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
+        vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
+        spin_lock_init(&vcpu->arch.local_int.lock);
+        INIT_LIST_HEAD(&vcpu->arch.local_int.list);
+        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
+        spin_lock_bh(&kvm->arch.float_int.lock);
+        kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
+        init_waitqueue_head(&vcpu->arch.local_int.wq);
+        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
+        spin_unlock_bh(&kvm->arch.float_int.lock);
+        rc = kvm_vcpu_init(vcpu, kvm, id);
+        if (rc)
+                goto out_unpublish;
+        VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
+                 vcpu->arch.sie_block);
+        try_module_get(THIS_MODULE);
+        return vcpu;
+out_unpublish:
+        /* nobody may keep a pointer into the vcpu we are about to free */
+        spin_lock_bh(&kvm->arch.float_int.lock);
+        kvm->arch.float_int.local_int[id] = NULL;
+        spin_unlock_bh(&kvm->arch.float_int.lock);
+        kvm->arch.sca->cpu[id].sda = 0;
+        free_page((unsigned long)(vcpu->arch.sie_block));
+out_free_cpu:
+        kfree(vcpu);
+out_nomem:
+        return ERR_PTR(rc);
+}
+/*
+ * Free a vcpu: log the event, release the SIE block page and the vcpu
+ * structure, and drop the module reference taken at creation.
+ */
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+        unsigned long sie_page;
+        VCPU_EVENT(vcpu, 3, "%s", "destroy cpu");
+        sie_page = (unsigned long)(vcpu->arch.sie_block);
+        free_page(sie_page);
+        kfree(vcpu);
+        module_put(THIS_MODULE);
+}
+/*
+ * Referenced by the kvm common code but never expected to be called on
+ * s390; reaching it is treated as a bug. The return only satisfies the
+ * signature.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+        const int runnable = 0;
+        BUG();
+        return runnable;
+}
+/* ioctl backend: perform an initial cpu reset between vcpu_load() and
+ * vcpu_put(). Always returns 0. */
+static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
+{
+        const int rc = 0;
+        vcpu_load(vcpu);
+        kvm_s390_vcpu_initial_reset(vcpu);
+        vcpu_put(vcpu);
+        return rc;
+}
+/* Copy the general purpose registers from @regs into the vcpu's saved
+ * guest state. Always returns 0. */
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        const size_t len = sizeof(regs->gprs);
+        vcpu_load(vcpu);
+        memcpy(vcpu->arch.guest_gprs, regs->gprs, len);
+        vcpu_put(vcpu);
+        return 0;
+}
+/* Copy the vcpu's saved general purpose registers out into @regs.
+ * Always returns 0. */
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+        const size_t len = sizeof(regs->gprs);
+        vcpu_load(vcpu);
+        memcpy(regs->gprs, vcpu->arch.guest_gprs, len);
+        vcpu_put(vcpu);
+        return 0;
+}
+/* Load access registers and control registers from @sregs into the vcpu
+ * (control registers live in the SIE block). Always returns 0. */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+        struct sie_block *scb;
+        vcpu_load(vcpu);
+        memcpy(vcpu->arch.guest_acrs, sregs->acrs, sizeof(sregs->acrs));
+        scb = vcpu->arch.sie_block;
+        memcpy(scb->gcr, sregs->crs, sizeof(sregs->crs));
+        vcpu_put(vcpu);
+        return 0;
+}
+/* Copy the vcpu's access registers and control registers (from the SIE
+ * block) out into @sregs. Always returns 0. */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+        struct sie_block *scb;
+        vcpu_load(vcpu);
+        memcpy(sregs->acrs, vcpu->arch.guest_acrs, sizeof(sregs->acrs));
+        scb = vcpu->arch.sie_block;
+        memcpy(sregs->crs, scb->gcr, sizeof(sregs->crs));
+        vcpu_put(vcpu);
+        return 0;
+}
+/* Load floating point registers and the fpc from @fpu into the vcpu's
+ * saved guest state. Always returns 0. */
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        const size_t len = sizeof(fpu->fprs);
+        vcpu_load(vcpu);
+        memcpy(vcpu->arch.guest_fpregs.fprs, fpu->fprs, len);
+        vcpu->arch.guest_fpregs.fpc = fpu->fpc;
+        vcpu_put(vcpu);
+        return 0;
+}
+/* Copy the vcpu's saved floating point registers and fpc out into @fpu.
+ * Always returns 0. */
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+        const size_t len = sizeof(fpu->fprs);
+        vcpu_load(vcpu);
+        memcpy(fpu->fprs, vcpu->arch.guest_fpregs.fprs, len);
+        fpu->fpc = vcpu->arch.guest_fpregs.fpc;
+        vcpu_put(vcpu);
+        return 0;
+}
+/*
+ * Set the guest PSW to @psw. Refused with -EBUSY once CPUSTAT_RUNNING is
+ * set in the vcpu's cpuflags; returns 0 otherwise.
+ */
+static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
+{
+        int rc = -EBUSY;
+        vcpu_load(vcpu);
+        if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)) {
+                vcpu->arch.sie_block->gpsw = psw;
+                rc = 0;
+        }
+        vcpu_put(vcpu);
+        return rc;
+}
+/* Address translation ioctl: not implemented yet, always -EINVAL. */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+        const int rc = -EINVAL;
+        return rc;
+}
+/* Guest debugging ioctl: not implemented yet, always -EINVAL. */
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                    struct kvm_debug_guest *dbg)
+{
+        const int rc = -EINVAL;
+        return rc;
+}
+/* Read mp state ioctl: not implemented yet, always -EINVAL. */
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        const int rc = -EINVAL;
+        return rc;
+}
+/* Write mp state ioctl: not implemented yet, always -EINVAL. */
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        const int rc = -EINVAL;
+        return rc;
+}
+/*
+ * Enter the guest once via sie64a() and return after the next exit.
+ * The statement order matters: guest entry/exit accounting is done with
+ * interrupts disabled around kvm_guest_enter()/kvm_guest_exit().
+ */
+static void __vcpu_run(struct kvm_vcpu *vcpu)
+{
+        /* 16 bytes starting at guest_gprs[14] are mirrored into the SIE
+         * block's gg14 field before entry ... */
+        memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16);
+        if (need_resched())
+                schedule();
+        /* clear the intercept code left over from the previous exit */
+        vcpu->arch.sie_block->icptcode = 0;
+        local_irq_disable();
+        kvm_guest_enter();
+        local_irq_enable();
+        VCPU_EVENT(vcpu, 6, "entering sie flags %x",
+                   atomic_read(&vcpu->arch.sie_block->cpuflags));
+        sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs);
+        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+                   vcpu->arch.sie_block->icptcode);
+        local_irq_disable();
+        kvm_guest_exit();
+        local_irq_enable();
+        /* ... and copied back from gg14 after the exit */
+        memcpy(&vcpu->arch.guest_gprs[14], &vcpu->arch.sie_block->gg14, 16);
+}
+/*
+ * Run the vcpu until an intercept needs userspace or a signal is pending.
+ *
+ * On entry @kvm_run->exit_reason tells how the previous exit was handled:
+ * for KVM_EXIT_S390_SIEIC the (possibly modified) PSW is taken back from
+ * @kvm_run. Any exit reason other than SIEIC, UNKNOWN or S390_RESET is
+ * rejected with -EINVAL.
+ *
+ * Returns 0 when @kvm_run has been prepared for userspace, -EINTR when a
+ * signal interrupted the run loop, -EINVAL for a bad exit_reason, or
+ * another negative error from intercept handling.
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+        int rc;
+        sigset_t sigsaved;
+        vcpu_load(vcpu);
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+        BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL);
+        switch (kvm_run->exit_reason) {
+        case KVM_EXIT_S390_SIEIC:
+                vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
+                vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
+                break;
+        case KVM_EXIT_UNKNOWN:
+        case KVM_EXIT_S390_RESET:
+                break;
+        default:
+                /*
+                 * exit_reason is read back from the kvm_run area, which
+                 * userspace is able to modify (see the KVM API), so an
+                 * unexpected value is an input error and must not bring
+                 * the host down.
+                 */
+                rc = -EINVAL;
+                goto out;
+        }
+        /* mark the cpu running only once the request has been validated */
+        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+        might_sleep();
+        do {
+                kvm_s390_deliver_pending_interrupts(vcpu);
+                __vcpu_run(vcpu);
+                rc = kvm_handle_sie_intercept(vcpu);
+        } while (!signal_pending(current) && !rc);
+        if (signal_pending(current) && !rc)
+                rc = -EINTR;
+        if (rc == -ENOTSUPP) {
+                /* intercept cannot be handled in-kernel, prepare kvm-run */
+                kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
+                kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
+                kvm_run->s390_sieic.mask     = vcpu->arch.sie_block->gpsw.mask;
+                kvm_run->s390_sieic.addr     = vcpu->arch.sie_block->gpsw.addr;
+                kvm_run->s390_sieic.ipa      = vcpu->arch.sie_block->ipa;
+                kvm_run->s390_sieic.ipb      = vcpu->arch.sie_block->ipb;
+                rc = 0;
+        }
+        if (rc == -EREMOTE) {
+                /* intercept was handled, but userspace support is needed
+                 * kvm_run has been prepared by the handler */
+                rc = 0;
+        }
+out:
+        if (vcpu->sigset_active)
+                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+        vcpu_put(vcpu);
+        vcpu->stat.exit_userspace++;
+        return rc;
+}
+/*
+ * Copy @n bytes from @from to guest address @guestdest, using
+ * copy_to_guest() when @prefix is non-zero and copy_to_guest_absolute()
+ * otherwise. Returns whatever the chosen copy routine returns.
+ */
+static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
+                       unsigned long n, int prefix)
+{
+        return prefix ? copy_to_guest(vcpu, guestdest, from, n)
+                      : copy_to_guest_absolute(vcpu, guestdest, from, n);
+}
+/*
+ * Store the vcpu status (registers, PSW, prefix, timers) into guest
+ * memory at @addr, laid out like struct save_area_s390x.
+ *
+ * Two special values of @addr are recognised:
+ * KVM_S390_STORE_STATUS_NOADDR:   store at SAVE_AREA_BASE, absolute
+ * KVM_S390_STORE_STATUS_PREFIXED: store at SAVE_AREA_BASE, via the prefix
+ * In both cases the architecture mode byte (1) is written to location 163
+ * first. Any other @addr is used as an absolute address.
+ *
+ * Returns 0 on success or -EFAULT as soon as one guest copy fails.
+ */
+int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+        const unsigned char archmode = 1;
+        struct sie_block *scb = vcpu->arch.sie_block;
+        int prefix = 0;
+        if (addr == KVM_S390_STORE_STATUS_NOADDR) {
+                if (__guestcopy(vcpu, 163ul, &archmode, 1, 0))
+                        return -EFAULT;
+                addr = SAVE_AREA_BASE;
+        } else if (addr == KVM_S390_STORE_STATUS_PREFIXED) {
+                if (__guestcopy(vcpu, 163ul, &archmode, 1, 1))
+                        return -EFAULT;
+                addr = SAVE_AREA_BASE;
+                prefix = 1;
+        }
+        /* the copies run in this order and stop at the first failure */
+        if (__guestcopy(vcpu, addr + offsetof(struct save_area_s390x, fp_regs),
+                        vcpu->arch.guest_fpregs.fprs, 128, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, gp_regs),
+                        vcpu->arch.guest_gprs, 128, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, psw),
+                        &scb->gpsw, 16, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, pref_reg),
+                        &scb->prefix, 4, prefix) ||
+            __guestcopy(vcpu,
+                        addr + offsetof(struct save_area_s390x, fp_ctrl_reg),
+                        &vcpu->arch.guest_fpregs.fpc, 4, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, tod_reg),
+                        &scb->todpr, 4, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, timer),
+                        &scb->cputm, 8, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, clk_cmp),
+                        &scb->ckc, 8, prefix) ||
+            __guestcopy(vcpu, addr + offsetof(struct save_area_s390x, acc_regs),
+                        &vcpu->arch.guest_acrs, 64, prefix) ||
+            __guestcopy(vcpu,
+                        addr + offsetof(struct save_area_s390x, ctrl_regs),
+                        &scb->gcr, 128, prefix))
+                return -EFAULT;
+        return 0;
+}
+/* ioctl backend: store status at @addr between vcpu_load() and
+ * vcpu_put(); returns the result of __kvm_s390_vcpu_store_status(). */
+static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+        int ret;
+        vcpu_load(vcpu);
+        ret = __kvm_s390_vcpu_store_status(vcpu, addr);
+        vcpu_put(vcpu);
+        return ret;
+}
+/*
+ * Arch specific vcpu ioctls:
+ *   KVM_S390_INTERRUPT       inject a vcpu-local interrupt (arg: user
+ *                            pointer to struct kvm_s390_interrupt)
+ *   KVM_S390_STORE_STATUS    store status (arg: address)
+ *   KVM_S390_SET_INITIAL_PSW set the PSW (arg: user pointer to psw_t)
+ *   KVM_S390_INITIAL_RESET   initial cpu reset
+ * Returns -EFAULT when a user copy fails and -EINVAL for unknown ioctls.
+ */
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+        struct kvm_vcpu *vcpu = filp->private_data;
+        void __user *uptr = (void __user *)arg;
+        if (ioctl == KVM_S390_INTERRUPT) {
+                struct kvm_s390_interrupt irq;
+                if (copy_from_user(&irq, uptr, sizeof(irq)))
+                        return -EFAULT;
+                return kvm_s390_inject_vcpu(vcpu, &irq);
+        }
+        if (ioctl == KVM_S390_STORE_STATUS)
+                return kvm_s390_vcpu_store_status(vcpu, arg);
+        if (ioctl == KVM_S390_SET_INITIAL_PSW) {
+                psw_t new_psw;
+                if (copy_from_user(&new_psw, uptr, sizeof(new_psw)))
+                        return -EFAULT;
+                return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, new_psw);
+        }
+        if (ioctl == KVM_S390_INITIAL_RESET)
+                return kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+        return -EINVAL;
+}
+/* Section: memory related */
+/*
+ * Register the guest memory. Exactly one slot (slot 0) is supported; it
+ * has to start at guest physical address zero, and both its userland
+ * address and its size have to be page aligned. The userland range may
+ * be fragmented into several vmas, and it is okay to mmap() and munmap()
+ * inside it at any time after this call.
+ *
+ * @old and @user_alloc are unused. Returns 0 or -EINVAL.
+ */
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                                struct kvm_userspace_memory_region *mem,
+                                struct kvm_memory_slot old,
+                                int user_alloc)
+{
+        if (mem->slot || mem->guest_phys_addr)
+                return -EINVAL;
+        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+            (mem->memory_size & (PAGE_SIZE - 1)))
+                return -EINVAL;
+        kvm->arch.guest_origin = mem->userspace_addr;
+        kvm->arch.guest_memsize = mem->memory_size;
+        /* FIXME: running CPUs should be interrupted here so that they pick
+           up the new memory configuration without a race. Then again,
+           changing the memory layout while virtual CPUs are running is
+           usually bad programming practice. */
+        return 0;
+}
+/* No gfn aliasing in this implementation: @gfn is returned unchanged. */
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+        gfn_t unaliased = gfn;
+        return unaliased;
+}
+/* Module entry point: register with the kvm common code, passing no
+ * opaque data and sizeof(struct kvm_vcpu) as the vcpu size. */
+static int __init kvm_s390_init(void)
+{
+        int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+        return rc;
+}
+/* Module exit point: unregister from the kvm common code. */
+static void __exit kvm_s390_exit(void)
+{
+        kvm_exit();
+        return;
+}
+/* module load / unload entry points */
+module_init(kvm_s390_init);
+module_exit(kvm_s390_exit);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
new file mode 100644
index 00000000000..3893cf12eac
--- /dev/null
+++ b/arch/s390/kvm/kvm-s390.h
@@ -0,0 +1,64 @@
+/*
+ * kvm-s390.h - definitions for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#ifndef ARCH_S390_KVM_S390_H
+#define ARCH_S390_KVM_S390_H
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
+int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
+/*
+ * VM_EVENT - log a printf-style event for a VM.
+ * @d_kvm:      pointer to the struct kvm whose arch.dbf receives the event
+ * @d_loglevel: level handed to debug_sprintf_event()
+ * @d_string:   format string literal; "\n" is appended by the macro
+ * @d_args:     format arguments; at least one must be supplied (use "%s"
+ *              with a string for plain messages)
+ *
+ * @d_kvm is parenthesized so that any pointer expression can be passed.
+ */
+#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
+do { \
+        debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \
+          d_args); \
+} while (0)
+/*
+ * VCPU_EVENT - log a printf-style event for a vcpu.
+ * @d_vcpu:     pointer to the struct kvm_vcpu the event belongs to
+ * @d_loglevel: level handed to debug_sprintf_event()
+ * @d_string:   format string literal; the macro prefixes it with the vcpu id
+ *              and the guest PSW mask/address and appends "\n"
+ * @d_args:     format arguments; at least one must be supplied (use "%s"
+ *              with a string for plain messages)
+ *
+ * @d_vcpu is parenthesized so that any pointer expression can be passed. It
+ * is evaluated several times, so it must not have side effects.
+ */
+#define VCPU_EVENT(d_vcpu, d_loglevel, d_string, d_args...)\
+do { \
+        debug_sprintf_event((d_vcpu)->kvm->arch.dbf, d_loglevel, \
+          "%02d[%016lx-%016lx]: " d_string "\n", (d_vcpu)->vcpu_id, \
+          (d_vcpu)->arch.sie_block->gpsw.mask, \
+          (d_vcpu)->arch.sie_block->gpsw.addr, \
+          d_args); \
+} while (0)
+/*
+ * Returns non-zero when CPUSTAT_STOP_INT is set in the cpuflags of the
+ * vcpu's sie block, zero otherwise.
+ */
+static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
+{
+        int flags = atomic_read(&vcpu->arch.sie_block->cpuflags);
+        return flags & CPUSTAT_STOP_INT;
+}
+int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
+void kvm_s390_idle_wakeup(unsigned long data);
+void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
+int kvm_s390_inject_vm(struct kvm *kvm,
+                struct kvm_s390_interrupt *s390int);
+int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
+                struct kvm_s390_interrupt *s390int);
+int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+/* implemented in priv.c */
+int kvm_s390_handle_priv(struct kvm_vcpu *vcpu);
+/* implemented in sigp.c */
+int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
+/* implemented in kvm-s390.c */
+int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
+                                 unsigned long addr);
+/* implemented in diag.c */
+int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
+#endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
new file mode 100644
index 00000000000..1465946325c
--- /dev/null
+++ b/arch/s390/kvm/priv.c
@@ -0,0 +1,323 @@
+/*
+ * priv.c - handling privileged instructions
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/errno.h>
+#include <asm/current.h>
+#include <asm/debug.h>
+#include <asm/ebcdic.h>
+#include <asm/sysinfo.h>
+#include "gaccess.h"
+#include "kvm-s390.h"
+/*
+ * Handler registered at priv_handlers[0x10]: load a new prefix for this vcpu
+ * from the guest word addressed by the instruction's second operand.
+ *
+ * A misaligned operand raises PGM_SPECIFICATION; an unreadable operand or a
+ * prefix area that is not backed by guest memory raises PGM_ADDRESSING.
+ * Always returns 0.
+ */
+static int handle_set_prefix(struct kvm_vcpu *vcpu)
+{
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 operand2 = disp2;
+        u32 new_prefix = 0;
+        u8 probe;
+        vcpu->stat.instruction_spx++;
+        /* effective address = displacement (+ base register, if any) */
+        if (base2)
+                operand2 += vcpu->arch.guest_gprs[base2];
+        /* must be word boundary */
+        if (operand2 & 3) {
+                kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+                return 0;
+        }
+        /* fetch the requested prefix from guest memory */
+        if (get_guest_u32(vcpu, operand2, &new_prefix)) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        new_prefix &= 0x7fffe000u;
+        /* both pages of the new prefix area must be readable guest memory */
+        if (copy_from_guest_absolute(vcpu, &probe, new_prefix, 1) ||
+            copy_from_guest_absolute(vcpu, &probe, new_prefix + PAGE_SIZE, 1)) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        vcpu->arch.sie_block->prefix = new_prefix;
+        vcpu->arch.sie_block->ihcpu = 0xffff;
+        VCPU_EVENT(vcpu, 5, "setting prefix to %x", new_prefix);
+        return 0;
+}
+/*
+ * Handler registered at priv_handlers[0x11]: store this vcpu's prefix into
+ * the guest word addressed by the instruction's second operand.
+ *
+ * A misaligned operand raises PGM_SPECIFICATION; a failing guest store
+ * raises PGM_ADDRESSING. Always returns 0.
+ */
+static int handle_store_prefix(struct kvm_vcpu *vcpu)
+{
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 operand2 = disp2;
+        u32 prefix;
+        vcpu->stat.instruction_stpx++;
+        /* effective address = displacement (+ base register, if any) */
+        if (base2)
+                operand2 += vcpu->arch.guest_gprs[base2];
+        /* must be word boundary */
+        if (operand2 & 3) {
+                kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+                return 0;
+        }
+        prefix = vcpu->arch.sie_block->prefix & 0x7fffe000u;
+        /* write the value into guest memory */
+        if (put_guest_u32(vcpu, operand2, prefix)) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        VCPU_EVENT(vcpu, 5, "storing prefix to %x", prefix);
+        return 0;
+}
+/*
+ * Handler registered at priv_handlers[0x12]: store the vcpu id as a 16 bit
+ * value at the guest address given by the instruction's second operand.
+ *
+ * An odd address raises PGM_SPECIFICATION; -EFAULT from the guest store
+ * raises PGM_ADDRESSING. Always returns 0.
+ */
+static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
+{
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 useraddr = disp2;
+        vcpu->stat.instruction_stap++;
+        /* effective address = displacement (+ base register, if any) */
+        if (base2)
+                useraddr += vcpu->arch.guest_gprs[base2];
+        /* must be halfword boundary */
+        if (useraddr & 1) {
+                kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+                return 0;
+        }
+        if (put_guest_u16(vcpu, useraddr, vcpu->vcpu_id) == -EFAULT) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        VCPU_EVENT(vcpu, 5, "storing cpu address to %lx", useraddr);
+        return 0;
+}
+/*
+ * Handler shared by priv_handlers[0x29], [0x2a] and [0x2b] (storage key
+ * operations). It only counts the event and moves the guest PSW address back
+ * by 4 so that the guest executes the instruction again. Always returns 0.
+ */
+static int handle_skey(struct kvm_vcpu *vcpu)
+{
+        vcpu->stat.instruction_storage_key++;
+        /* rewind the guest PSW so the instruction is retried */
+        vcpu->arch.sie_block->gpsw.addr -= 4;
+        VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+        return 0;
+}
+/*
+ * Handler registered at priv_handlers[0x34] (store subchannel): nothing is
+ * stored, the guest just gets condition code 3. Always returns 0.
+ */
+static int handle_stsch(struct kvm_vcpu *vcpu)
+{
+        vcpu->stat.instruction_stsch++;
+        VCPU_EVENT(vcpu, 4, "%s", "store subchannel - CC3");
+        /* condition code 3: both condition code bits of the PSW mask set */
+        vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+        return 0;
+}
+/*
+ * Handler registered at priv_handlers[0x5f] (channel subsystem call): no
+ * work is done, the guest just gets condition code 3. Always returns 0.
+ */
+static int handle_chsc(struct kvm_vcpu *vcpu)
+{
+        vcpu->stat.instruction_chsc++;
+        VCPU_EVENT(vcpu, 4, "%s", "channel subsystem call - CC3");
+        /* condition code 3: both condition code bits of the PSW mask set */
+        vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+        return 0;
+}
+/*
+ * Issue the stfl instruction on the host and return the facility list word
+ * read afterwards from the lowcore (S390_lowcore.stfl_fac_list).
+ *
+ * NOTE(review): the exception table entry maps label 0 to itself, presumably
+ * so that a fault raised by stfl simply continues behind the instruction -
+ * confirm against the s390 exception table conventions.
+ */
+static unsigned int kvm_stfl(void)
+{
+        asm volatile(
+                "       .insn   s,0xb2b10000,0(0)\n" /* stfl */
+                "0:\n"
+                EX_TABLE(0b, 0b));
+        return S390_lowcore.stfl_fac_list;
+}
+/*
+ * Handler registered at priv_handlers[0xb1]: copy the host facility list,
+ * with the stfle bit masked out, to the stfl_fac_list offset of the guest
+ * lowcore. -EFAULT from the copy raises PGM_ADDRESSING. Always returns 0.
+ */
+static int handle_stfl(struct kvm_vcpu *vcpu)
+{
+        unsigned int facility_list;
+        int rc;
+        vcpu->stat.instruction_stfl++;
+        /* host facilities minus bit 24: no stfle */
+        facility_list = kvm_stfl() & ~(1UL<<24);
+        rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
+                           &facility_list, sizeof(facility_list));
+        if (rc == -EFAULT) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        VCPU_EVENT(vcpu, 5, "store facility list value %x",
+                   facility_list);
+        return 0;
+}
+/*
+ * Handler registered at priv_handlers[0x02]: store vcpu->arch.stidp_data as
+ * a 64 bit value at the guest address given by the second operand.
+ *
+ * An address that is not doubleword aligned raises PGM_SPECIFICATION;
+ * -EFAULT from the guest store raises PGM_ADDRESSING. Always returns 0.
+ */
+static int handle_stidp(struct kvm_vcpu *vcpu)
+{
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 operand2 = disp2;
+        vcpu->stat.instruction_stidp++;
+        /* effective address = displacement (+ base register, if any) */
+        if (base2)
+                operand2 += vcpu->arch.guest_gprs[base2];
+        /* must be doubleword boundary */
+        if (operand2 & 7) {
+                kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+                return 0;
+        }
+        if (put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data) == -EFAULT) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                return 0;
+        }
+        VCPU_EVENT(vcpu, 5, "%s", "store cpu id");
+        return 0;
+}
+/*
+ * Fill @mem with the 3.2.2 system information block seen by the guest.
+ *
+ * The host's own 3.2.2 data is fetched first (an empty list is assumed when
+ * stsi() reports -ENOSYS), the existing vm entries are shifted down by one
+ * slot (the list is capped at 8 entries) and entry 0 is filled in to
+ * describe this KVM guest. The cpu numbers are the count of non-NULL
+ * entries in the VM's local_int table.
+ */
+static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
+{
+        struct float_interrupt *fi = &vcpu->kvm->arch.float_int;
+        int nr_cpus = 0;
+        int i;
+        /* count the vcpus that are registered with the floating interrupts */
+        spin_lock_bh(&fi->lock);
+        for (i = 0; i < KVM_MAX_VCPUS; i++) {
+                if (fi->local_int[i] != NULL)
+                        nr_cpus++;
+        }
+        spin_unlock_bh(&fi->lock);
+        /* deal with other level 3 hypervisors */
+        if (stsi(mem, 3, 2, 2) == -ENOSYS)
+                mem->count = 0;
+        if (mem->count < 8)
+                mem->count++;
+        /* make room at index 0, moving from the end towards the front */
+        for (i = mem->count - 1; i > 0; i--)
+                memcpy(&mem->vm[i], &mem->vm[i - 1], sizeof(mem->vm[0]));
+        memcpy(mem->vm[0].name, "KVMguest", 8);
+        ASCEBC(mem->vm[0].name, 8);
+        memcpy(mem->vm[0].cpi, "KVM/Linux       ", 16);
+        ASCEBC(mem->vm[0].cpi, 16);
+        mem->vm[0].caf = 1000;
+        mem->vm[0].cpus_total = nr_cpus;
+        mem->vm[0].cpus_configured = nr_cpus;
+        mem->vm[0].cpus_standby = 0;
+        mem->vm[0].cpus_reserved = 0;
+}
+/*
+ * Handler registered at priv_handlers[0x7d] (stsi).
+ *
+ * The function code is taken from bits 28-31 of guest gpr 0, selector 1 from
+ * the low byte of gpr 0 and selector 2 from the low 16 bits of gpr 1.
+ *   fc 0    - gpr 0 is set to 3 << 28 and the condition code is cleared
+ *   fc 1, 2 - the host's stsi() result is passed through to the guest
+ *   fc 3    - only selector 2.2 is handled, see handle_stsi_3_2_2()
+ *   other   - condition code 3
+ * For fc > 0 the operand must be page aligned, otherwise PGM_SPECIFICATION
+ * is injected. On success one page is copied to the guest, the condition
+ * code is cleared and gpr 0 is zeroed. Any failure ends with condition
+ * code 3; a failing copy to the guest additionally injects PGM_ADDRESSING.
+ *
+ * Returns the result of kvm_s390_inject_program_int() for a misaligned
+ * operand, 0 in all other cases.
+ */
+static int handle_stsi(struct kvm_vcpu *vcpu)
+{
+        int fc = (vcpu->arch.guest_gprs[0] & 0xf0000000) >> 28;
+        int sel1 = vcpu->arch.guest_gprs[0] & 0xff;
+        int sel2 = vcpu->arch.guest_gprs[1] & 0xffff;
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u64 operand2;
+        unsigned long mem;
+        vcpu->stat.instruction_stsi++;
+        VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
+        /* effective address = displacement (+ base register, if any) */
+        operand2 = disp2;
+        if (base2)
+                operand2 += vcpu->arch.guest_gprs[base2];
+        /* the result buffer must be page aligned, except for fc 0 */
+        if (operand2 & 0xfff && fc > 0)
+                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+        switch (fc) {
+        case 0:
+                /* nothing is stored: report 3 in gpr 0, condition code 0 */
+                vcpu->arch.guest_gprs[0] = 3 << 28;
+                vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+                return 0;
+        case 1: /* same handling for 1 and 2 */
+        case 2:
+                mem = get_zeroed_page(GFP_KERNEL);
+                if (!mem)
+                        goto out_fail;
+                if (stsi((void *) mem, fc, sel1, sel2) == -ENOSYS)
+                        goto out_mem;
+                break;
+        case 3:
+                if (sel1 != 2 || sel2 != 2)
+                        goto out_fail;
+                mem = get_zeroed_page(GFP_KERNEL);
+                if (!mem)
+                        goto out_fail;
+                handle_stsi_3_2_2(vcpu, (void *) mem);
+                break;
+        default:
+                goto out_fail;
+        }
+        /* hand the whole page to the guest */
+        if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
+                kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+                goto out_mem;
+        }
+        free_page(mem);
+        /* success: condition code 0, gpr 0 cleared */
+        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+        vcpu->arch.guest_gprs[0] = 0;
+        return 0;
+out_mem:
+        /* failure after the page was allocated */
+        free_page(mem);
+out_fail:
+        /* condition code 3 */
+        vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+        return 0;
+}
+/*
+ * Dispatch table for kvm_s390_handle_priv(), indexed by the low byte of the
+ * intercepted instruction's ipa field. Slots without an initializer stay
+ * NULL, which the dispatcher reports as -ENOTSUPP.
+ */
+static intercept_handler_t priv_handlers[256] = {
+        [0x02] = handle_stidp,
+        [0x10] = handle_set_prefix,
+        [0x11] = handle_store_prefix,
+        [0x12] = handle_store_cpu_address,
+        [0x29] = handle_skey,
+        [0x2a] = handle_skey,
+        [0x2b] = handle_skey,
+        [0x34] = handle_stsch,
+        [0x5f] = handle_chsc,
+        [0x7d] = handle_stsi,
+        [0xb1] = handle_stfl,
+};
+/*
+ * Dispatch an intercepted instruction to its entry in priv_handlers, chosen
+ * by the low byte of the ipa field.
+ *
+ * Returns the handler's result, or -ENOTSUPP when no handler is registered.
+ */
+int kvm_s390_handle_priv(struct kvm_vcpu *vcpu)
+{
+        unsigned int slot = vcpu->arch.sie_block->ipa & 0x00ff;
+        if (!priv_handlers[slot])
+                return -ENOTSUPP;
+        return priv_handlers[slot](vcpu);
+}
diff --git a/arch/s390/kvm/sie64a.S b/arch/s390/kvm/sie64a.S
new file mode 100644
index 00000000000..934fd6a885f
--- /dev/null
+++ b/arch/s390/kvm/sie64a.S
@@ -0,0 +1,47 @@
+/*
+ * sie64a.S - low level sie call
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
+ */
+#include <linux/errno.h>
+#include <asm/asm-offsets.h>
+SP_R5 = 5 * 8   # offset into stackframe
+SP_R6 = 6 * 8
+/*
+ * sie64a calling convention:
+ * %r2 pointer to sie control block
+ * %r3 guest register save area
+ *
+ * Returns 0 in %r2 after a normal exit from the sie instruction, or -EFAULT
+ * when a fault on the sie instruction is redirected to sie_err through the
+ * __ex_table entry at the end of this file. On both paths guest gprs 0-13
+ * are written back to the save area before the saved registers are restored.
+ */
+        .globl  sie64a
+sie64a:
+        /* copy the save area pointer to %r5 so it is stored in slot SP_R5 */
+        lgr     %r5,%r3
+        stmg    %r5,%r14,SP_R5(%r15)    # save register on entry
+        lgr     %r14,%r2                # pointer to sie control block
+        lmg     %r0,%r13,0(%r3)         # load guest gprs 0-13
+sie_inst:
+        sie     0(%r14)
+        /* normal exit: reload the save area pointer stored at SP_R5 */
+        lg      %r14,SP_R5(%r15)
+        stmg    %r0,%r13,0(%r14)        # save guest gprs 0-13
+        lghi    %r2,0
+        /* restore %r6-%r14 saved on entry and return */
+        lmg     %r6,%r14,SP_R6(%r15)
+        br      %r14
+sie_err:
+        /* fault on sie: same epilogue, but the return value is -EFAULT */
+        lg      %r14,SP_R5(%r15)
+        stmg    %r0,%r13,0(%r14)        # save guest gprs 0-13
+        lghi    %r2,-EFAULT
+        lmg     %r6,%r14,SP_R6(%r15)
+        br      %r14
+        /* exception table: a fault at sie_inst continues at sie_err */
+        .section __ex_table,"a"
+        .quad   sie_inst,sie_err
+        .previous
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
new file mode 100644
index 00000000000..0a236acfb5f
--- /dev/null
+++ b/arch/s390/kvm/sigp.c
@@ -0,0 +1,288 @@
+/*
+ * sigp.c - handling interprocessor communication
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include "gaccess.h"
+#include "kvm-s390.h"
+/* sigp order codes */
+#define SIGP_SENSE             0x01
+#define SIGP_EXTERNAL_CALL     0x02
+#define SIGP_EMERGENCY         0x03
+#define SIGP_START             0x04
+#define SIGP_STOP              0x05
+#define SIGP_RESTART           0x06
+#define SIGP_STOP_STORE_STATUS 0x09
+#define SIGP_INITIAL_CPU_RESET 0x0b
+#define SIGP_CPU_RESET         0x0c
+#define SIGP_SET_PREFIX        0x0d
+#define SIGP_STORE_STATUS_ADDR 0x0e
+#define SIGP_SET_ARCH          0x12
+/* cpu status bits */
+#define SIGP_STAT_EQUIPMENT_CHECK   0x80000000UL
+#define SIGP_STAT_INCORRECT_STATE   0x00000200UL
+#define SIGP_STAT_INVALID_PARAMETER 0x00000100UL
+#define SIGP_STAT_EXT_CALL_PENDING  0x00000080UL
+#define SIGP_STAT_STOPPED           0x00000040UL
+#define SIGP_STAT_OPERATOR_INTERV   0x00000020UL
+#define SIGP_STAT_CHECK_STOP        0x00000010UL
+#define SIGP_STAT_INOPERATIVE       0x00000004UL
+#define SIGP_STAT_INVALID_ORDER     0x00000002UL
+#define SIGP_STAT_RECEIVER_CHECK    0x00000001UL
+/*
+ * SIGP SENSE: report whether the vcpu with address @cpu_addr is running.
+ *
+ * Returns 3 (not operational) when @cpu_addr is out of range or no vcpu is
+ * registered for it. Otherwise the low 32 bits of @reg are cleared,
+ * SIGP_STAT_STOPPED is set in them when the target is not running, and
+ * 1 (status stored) is returned.
+ */
+static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg)
+{
+        struct float_interrupt *fi = &vcpu->kvm->arch.float_int;
+        int rc;
+        if (cpu_addr >= KVM_MAX_VCPUS)
+                return 3; /* not operational */
+        spin_lock_bh(&fi->lock);
+        if (fi->local_int[cpu_addr] == NULL) {
+                rc = 3; /* not operational */
+        } else {
+                int running = atomic_read(fi->local_int[cpu_addr]->cpuflags)
+                              & CPUSTAT_RUNNING;
+                /* the status goes into the low word of the register */
+                *reg &= 0xffffffff00000000UL;
+                if (!running)
+                        *reg |= SIGP_STAT_STOPPED;
+                rc = 1; /* status stored */
+        }
+        spin_unlock_bh(&fi->lock);
+        VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
+        return rc;
+}
+/*
+ * SIGP EMERGENCY SIGNAL: queue a KVM_S390_INT_EMERGENCY interrupt on the
+ * vcpu with address @cpu_addr and wake it up if it is waiting.
+ *
+ * Returns 0 (order accepted), 3 (not operational) when @cpu_addr is out of
+ * range or no vcpu is registered for it, or -ENOMEM when the interrupt
+ * descriptor cannot be allocated.
+ */
+static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
+{
+        struct float_interrupt *fi = &vcpu->kvm->arch.float_int;
+        struct local_interrupt *target;
+        struct interrupt_info *irq;
+        int rc = 3; /* not operational until the target has been found */
+        if (cpu_addr >= KVM_MAX_VCPUS)
+                return rc;
+        /* allocate before taking the locks */
+        irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+        if (!irq)
+                return -ENOMEM;
+        irq->type = KVM_S390_INT_EMERGENCY;
+        spin_lock_bh(&fi->lock);
+        target = fi->local_int[cpu_addr];
+        if (target != NULL) {
+                spin_lock_bh(&target->lock);
+                list_add_tail(&irq->list, &target->list);
+                atomic_set(&target->active, 1);
+                atomic_set_mask(CPUSTAT_EXT_INT, target->cpuflags);
+                if (waitqueue_active(&target->wq))
+                        wake_up_interruptible(&target->wq);
+                spin_unlock_bh(&target->lock);
+                rc = 0; /* order accepted */
+        } else {
+                /* nobody to deliver to: drop the descriptor again */
+                kfree(irq);
+        }
+        spin_unlock_bh(&fi->lock);
+        VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
+        return rc;
+}
+/*
+ * SIGP STOP / STOP AND STORE STATUS: queue a KVM_S390_SIGP_STOP interrupt on
+ * the vcpu with address @cpu_addr, flag the requested actions and wake the
+ * vcpu up if it is waiting. A non-zero @store additionally requests
+ * ACTION_STORE_ON_STOP.
+ *
+ * Returns 0 (order accepted), 3 (not operational) when @cpu_addr is out of
+ * range or no vcpu is registered for it, or -ENOMEM when the interrupt
+ * descriptor cannot be allocated.
+ */
+static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
+{
+        struct float_interrupt *fi = &vcpu->kvm->arch.float_int;
+        struct local_interrupt *target;
+        struct interrupt_info *irq;
+        int rc = 3; /* not operational until the target has been found */
+        if (cpu_addr >= KVM_MAX_VCPUS)
+                return rc;
+        /* allocate before taking the locks */
+        irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+        if (!irq)
+                return -ENOMEM;
+        irq->type = KVM_S390_SIGP_STOP;
+        spin_lock_bh(&fi->lock);
+        target = fi->local_int[cpu_addr];
+        if (target != NULL) {
+                spin_lock_bh(&target->lock);
+                list_add_tail(&irq->list, &target->list);
+                atomic_set(&target->active, 1);
+                atomic_set_mask(CPUSTAT_STOP_INT, target->cpuflags);
+                if (store)
+                        target->action_bits |= ACTION_STORE_ON_STOP;
+                target->action_bits |= ACTION_STOP_ON_STOP;
+                if (waitqueue_active(&target->wq))
+                        wake_up_interruptible(&target->wq);
+                spin_unlock_bh(&target->lock);
+                rc = 0; /* order accepted */
+        } else {
+                /* nobody to deliver to: drop the descriptor again */
+                kfree(irq);
+        }
+        spin_unlock_bh(&fi->lock);
+        VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
+        return rc;
+}
+/*
+ * SIGP SET ARCHITECTURE: evaluate the low byte of @parameter.
+ *   0     - switch to ESA/390 mode: unsupported, a warning is logged and
+ *           3 (not operational) is returned
+ *   1, 2  - 0 (order accepted)
+ *   other - -ENOTSUPP
+ */
+static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
+{
+        int rc;
+        switch (parameter & 0xff) {
+        case 0:
+                /*
+                 * The message is newline terminated so that it is emitted
+                 * as a complete line and does not run into later output.
+                 */
+                printk(KERN_WARNING "kvm: request to switch to ESA/390 mode"
+                                                        " not supported\n");
+                rc = 3; /* not operational */
+                break;
+        case 1:
+        case 2:
+                rc = 0; /* order accepted */
+                break;
+        default:
+                rc = -ENOTSUPP;
+        }
+        return rc;
+}
+/*
+ * SIGP SET PREFIX: queue a KVM_S390_SIGP_SET_PREFIX request carrying the new
+ * prefix @address on the vcpu with address @cpu_addr.
+ *
+ * @reg points to the guest register that receives status bits.
+ *
+ * Returns
+ *   0 - order accepted
+ *   1 - status stored: SIGP_STAT_INVALID_PARAMETER when the two pages of the
+ *       new prefix area cannot be read, SIGP_STAT_INCORRECT_STATE when the
+ *       target does not exist or is still running
+ *   2 - busy (the request descriptor could not be allocated)
+ */
+static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
+                             u64 *reg)
+{
+        struct float_interrupt *fi = &vcpu->kvm->arch.float_int;
+        struct local_interrupt *li;
+        struct interrupt_info *inti;
+        int rc;
+        u8 tmp;
+        /* make sure that the new value is valid memory */
+        address = address & 0x7fffe000u;
+        if ((copy_from_guest(vcpu, &tmp,
+                (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) ||
+           (copy_from_guest(vcpu, &tmp, (u64) (address +
+                        vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) {
+                *reg |= SIGP_STAT_INVALID_PARAMETER;
+                return 1; /* invalid parameter */
+        }
+        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+        if (!inti)
+                return 2; /* busy */
+        spin_lock_bh(&fi->lock);
+        /*
+         * Range-check cpu_addr before it is used as an index; reading
+         * local_int[] first would access memory behind the array for
+         * addresses >= KVM_MAX_VCPUS.
+         */
+        li = (cpu_addr < KVM_MAX_VCPUS) ? fi->local_int[cpu_addr] : NULL;
+        if (li == NULL) {
+                rc = 1; /* incorrect state */
+                /* set the status bit; "&=" would only clear other bits */
+                *reg |= SIGP_STAT_INCORRECT_STATE;
+                kfree(inti);
+                goto out_fi;
+        }
+        spin_lock_bh(&li->lock);
+        /* cpu must be in stopped state */
+        if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+                rc = 1; /* incorrect state */
+                *reg |= SIGP_STAT_INCORRECT_STATE;
+                kfree(inti);
+                goto out_li;
+        }
+        inti->type = KVM_S390_SIGP_SET_PREFIX;
+        inti->prefix.address = address;
+        list_add_tail(&inti->list, &li->list);
+        atomic_set(&li->active, 1);
+        if (waitqueue_active(&li->wq))
+                wake_up_interruptible(&li->wq);
+        rc = 0; /* order accepted */
+        VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
+out_li:
+        spin_unlock_bh(&li->lock);
+out_fi:
+        spin_unlock_bh(&fi->lock);
+        return rc;
+}
+/*
+ * Handle an intercepted sigp instruction.
+ *
+ * r1 and r3 are taken from the ipa field, the order code is the low byte of
+ * the second operand address (displacement plus optional base register) and
+ * the target cpu address is the low 16 bits of guest gpr r3. The parameter
+ * comes from the odd register of the r1 pair: gpr r1 when r1 is odd,
+ * gpr r1 + 1 otherwise.
+ *
+ * Returns -ENOTSUPP for orders that are not handled here (including
+ * SIGP_RESTART), a negative error from the order handler, or 0 after the
+ * handler's result has been stored as the guest's condition code.
+ */
+int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
+{
+        int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+        int r3 = vcpu->arch.sie_block->ipa & 0x000f;
+        int base2 = vcpu->arch.sie_block->ipb >> 28;
+        int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+        u32 parameter;
+        u16 cpu_addr = vcpu->arch.guest_gprs[r3];
+        u8 order_code;
+        int rc;
+        /* only the low 8 bits of the operand address form the order code */
+        order_code = disp2;
+        if (base2)
+                order_code += vcpu->arch.guest_gprs[base2];
+        /* the parameter lives in the odd register of the r1 pair */
+        if (r1 % 2)
+                parameter = vcpu->arch.guest_gprs[r1];
+        else
+                parameter = vcpu->arch.guest_gprs[r1 + 1];
+        switch (order_code) {
+        case SIGP_SENSE:
+                vcpu->stat.instruction_sigp_sense++;
+                rc = __sigp_sense(vcpu, cpu_addr,
+                                  &vcpu->arch.guest_gprs[r1]);
+                break;
+        case SIGP_EMERGENCY:
+                vcpu->stat.instruction_sigp_emergency++;
+                rc = __sigp_emergency(vcpu, cpu_addr);
+                break;
+        case SIGP_STOP:
+                vcpu->stat.instruction_sigp_stop++;
+                rc = __sigp_stop(vcpu, cpu_addr, 0);
+                break;
+        case SIGP_STOP_STORE_STATUS:
+                vcpu->stat.instruction_sigp_stop++;
+                rc = __sigp_stop(vcpu, cpu_addr, 1);
+                break;
+        case SIGP_SET_ARCH:
+                vcpu->stat.instruction_sigp_arch++;
+                rc = __sigp_set_arch(vcpu, parameter);
+                break;
+        case SIGP_SET_PREFIX:
+                vcpu->stat.instruction_sigp_prefix++;
+                rc = __sigp_set_prefix(vcpu, cpu_addr, parameter,
+                                       &vcpu->arch.guest_gprs[r1]);
+                break;
+        case SIGP_RESTART:
+                vcpu->stat.instruction_sigp_restart++;
+                /* user space must know about restart */
+                /* intentional fall through: only counted, then -ENOTSUPP */
+        default:
+                return -ENOTSUPP;
+        }
+        /* negative values are errors, not condition codes */
+        if (rc < 0)
+                return rc;
+        /* store rc (0-3) as condition code in the guest PSW mask */
+        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+        vcpu->arch.sie_block->gpsw.mask |= (rc & 3ul) << 44;
+        return 0;
+}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index fd072013f88..5c1aea97cd1 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -30,11 +30,27 @@
 #define TABLES_PER_PAGE 4
 #define FRAG_MASK       15UL
 #define SECOND_HALVES   10UL
+/*
+ * Initialise a page table page for an mm that uses pgstes (variant for the
+ * first branch of the surrounding #if). The page is handled in four chunks
+ * of PAGE_SIZE/4 bytes: the chunks at table + 0 and table + 512 are filled
+ * through clear_table() with _PAGE_TYPE_EMPTY, the chunks at table + 256 and
+ * table + 768 are zeroed.
+ * NOTE(review): presumably the zeroed chunks hold the pgstes that belong to
+ * the preceding page table entries - confirm.
+ */
+void clear_table_pgstes(unsigned long *table)
+{
+        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
+        memset(table + 256, 0, PAGE_SIZE/4);
+        clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
+        memset(table + 768, 0, PAGE_SIZE/4);
+}
 #else
 #define ALLOC_ORDER     2
 #define TABLES_PER_PAGE 2
 #define FRAG_MASK       3UL
 #define SECOND_HALVES   2UL
+/*
+ * Initialise a page table page for an mm that uses pgstes (variant for the
+ * #else branch). The first PAGE_SIZE/2 bytes are filled through
+ * clear_table() with _PAGE_TYPE_EMPTY, the PAGE_SIZE/2 bytes starting at
+ * table + 256 are zeroed.
+ * NOTE(review): presumably the zeroed half holds the pgstes that belong to
+ * the page table entries in the first half - confirm.
+ */
+void clear_table_pgstes(unsigned long *table)
+{
+        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
+        memset(table + 256, 0, PAGE_SIZE/2);
+}
 #endif
 unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
@@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
        unsigned long *table;
        unsigned long bits;
-        bits = mm->context.noexec ? 3UL : 1UL;
+        bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
        spin_lock(&mm->page_table_lock);
        page = NULL;
        if (!list_empty(&mm->context.pgtable_list)) {
@@ -170,7 +186,10 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                pgtable_page_ctor(page);
                page->flags &= ~FRAG_MASK;
                table = (unsigned long *) page_to_phys(page);
-                clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
+                if (mm->context.pgstes)
+                        clear_table_pgstes(table);
+                else
+                        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock(&mm->page_table_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        }
@@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        struct page *page;
        unsigned long bits;
-        bits = mm->context.noexec ? 3UL : 1UL;
+        bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock(&mm->page_table_lock);
@@ -228,3 +247,43 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
        mm->context.noexec = 0;
        update_mm(mm, tsk);
 }
+/*
+ * switch on pgstes for its userspace process (for kvm)
+ *
+ * The current task's mm is replaced by a copy made with dup_mm() while
+ * context.pgstes is temporarily set on the old mm.
+ *
+ * Returns 0 when pgstes are enabled (or already were), -EINVAL when the task
+ * has no mm, the mm has more than one user, differs from active_mm or has an
+ * ioctx_list, and -ENOMEM when the mm cannot be duplicated.
+ *
+ * NOTE(review): dup_mm() is called with the task lock held - confirm that it
+ * cannot sleep in this context.
+ */
+int s390_enable_sie(void)
+{
+        struct task_struct *tsk = current;
+        struct mm_struct *mm;
+        int rc;
+        task_lock(tsk);
+        /* a task without an mm must be rejected before tsk->mm is used */
+        rc = -EINVAL;
+        if (!tsk->mm)
+                goto unlock;
+        /* nothing to do if pgstes are already switched on */
+        rc = 0;
+        if (tsk->mm->context.pgstes)
+                goto unlock;
+        rc = -EINVAL;
+        if (atomic_read(&tsk->mm->mm_users) > 1 ||
+            tsk->mm != tsk->active_mm || tsk->mm->ioctx_list)
+                goto unlock;
+        tsk->mm->context.pgstes = 1;    /* dirty little tricks .. */
+        mm = dup_mm(tsk);
+        tsk->mm->context.pgstes = 0;
+        rc = -ENOMEM;
+        if (!mm)
+                goto unlock;
+        /* drop the old mm and switch the task over to the copy */
+        mmput(tsk->mm);
+        tsk->mm = tsk->active_mm = mm;
+        preempt_disable();
+        update_mm(mm, tsk);
+        cpu_set(smp_processor_id(), mm->cpu_vm_mask);
+        preempt_enable();
+        rc = 0;
+unlock:
+        task_unlock(tsk);
+        return rc;
+}
+EXPORT_SYMBOL_GPL(s390_enable_sie);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fadf794483..e5790fe9e33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -373,6 +373,25 @@ config VMI
          at the moment), by linking the kernel to a GPL-ed ROM module
          provided by the hypervisor.
+config KVM_CLOCK
+        bool "KVM paravirtualized clock"
+        select PARAVIRT
+        depends on !(X86_VISWS || X86_VOYAGER)
+        help
+          Turning on this option will allow you to run a paravirtualized clock
+          when running over the KVM hypervisor. Instead of relying on a PIT
+          (or probably other) emulation by the underlying device model, the host
+          provides the guest with timing infrastructure such as time of day, and
+          system time.
+config KVM_GUEST
+        bool "KVM Guest support"
+        select PARAVIRT
+        depends on !(X86_VISWS || X86_VOYAGER)
+        help
+         This option enables various optimizations for running under the KVM
+         hypervisor.
 source "arch/x86/lguest/Kconfig"
 config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90e092d0af0..fa19c381954 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)     += test_nx.o
 obj-$(CONFIG_VMI)               += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_GUEST)         += kvm.o
+obj-$(CONFIG_KVM_CLOCK)         += kvmclock.o
 obj-$(CONFIG_PARAVIRT)          += paravirt.o paravirt_patch_$(BITS).o
 ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2251d0ae957..26855381790 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,6 +25,7 @@
 #include <asm/hpet.h>
 #include <linux/kdebug.h>
 #include <asm/smp.h>
+#include <asm/reboot.h>
 #include <mach_ipi.h>
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
-void machine_crash_shutdown(struct pt_regs *regs)
+void native_machine_crash_shutdown(struct pt_regs *regs)
 {
        /* This function is only called after the system
         * has panicked or is otherwise in a critical state.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 00000000000..8b7a3cf37d2
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,248 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ *   Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#define MMU_QUEUE_SIZE 1024
+/*
+ * Per-CPU state used to batch MMU ops before they are sent to the host.
+ */
+struct kvm_para_state {
+        /* Queued MMU op records, copied in back to back. */
+        u8 mmu_queue[MMU_QUEUE_SIZE];
+        /* Number of bytes of mmu_queue currently in use. */
+        int mmu_queue_len;
+        /* Lazy mode recorded by kvm_enter_lazy_mmu()/kvm_leave_lazy_mmu(). */
+        enum paravirt_lazy_mode mode;
+};
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+/*
+ * Return the para_state slot of the CPU we are running on, as reported by
+ * raw_smp_processor_id().
+ */
+static struct kvm_para_state *kvm_para_state(void)
+{
+        int cpu = raw_smp_processor_id();
+        return &per_cpu(para_state, cpu);
+}
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+        /*
+         * Intentionally empty: installed as pv_cpu_ops.io_delay when the
+         * host advertises KVM_FEATURE_NOP_IO_DELAY.
+         */
+}
+/*
+ * Hand a buffer of MMU op records to the host through KVM_HC_MMU_OP.
+ * The hypercall's return value is used as the number of bytes consumed; the
+ * call is repeated on the remainder until nothing is left.
+ * NOTE(review): a zero or negative return value is not handled and would
+ * keep this loop from terminating -- confirm the host never returns one.
+ */
+static void kvm_mmu_op(void *buffer, unsigned len)
+{
+        unsigned long addr_lo, addr_hi;
+        int consumed;
+        for (;;) {
+                addr_lo = __pa(buffer);
+                addr_hi = 0;   /* on i386 __pa() always returns <4G */
+                consumed = kvm_hypercall3(KVM_HC_MMU_OP, len, addr_lo, addr_hi);
+                buffer += consumed;
+                len -= consumed;
+                if (!len)
+                        break;
+        }
+}
+/* Send any queued MMU ops to the host and mark the queue empty. */
+static void mmu_queue_flush(struct kvm_para_state *state)
+{
+        if (!state->mmu_queue_len)
+                return;
+        kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
+        state->mmu_queue_len = 0;
+}
+/*
+ * Issue an MMU op, batching it when this CPU is in lazy MMU mode.
+ * Outside PARAVIRT_LAZY_MMU the op goes to the host immediately.  In lazy
+ * mode it is appended to the per-CPU queue, which is flushed first if the
+ * new record would not fit.
+ */
+static void kvm_deferred_mmu_op(void *buffer, int len)
+{
+        struct kvm_para_state *st = kvm_para_state();
+        u8 *tail;
+        if (st->mode == PARAVIRT_LAZY_MMU) {
+                if (st->mmu_queue_len + len > sizeof(st->mmu_queue))
+                        mmu_queue_flush(st);
+                tail = st->mmu_queue + st->mmu_queue_len;
+                memcpy(tail, buffer, len);
+                st->mmu_queue_len += len;
+        } else {
+                kvm_mmu_op(buffer, len);
+        }
+}
+/*
+ * Build a KVM_MMU_OP_WRITE_PTE record that stores val into the page table
+ * entry at dest, and pass it to kvm_deferred_mmu_op().  The record carries
+ * the physical address of the entry.
+ */
+static void kvm_mmu_write(void *dest, u64 val)
+{
+        __u64 pte_phys;
+        struct kvm_mmu_op_write_pte wpte;
+#ifdef CONFIG_HIGHPTE
+        struct page *page;
+        unsigned long dst = (unsigned long) dest;
+        /* Look the page up from the mapping, then add the in-page offset. */
+        page = kmap_atomic_to_page(dest);
+        pte_phys = page_to_pfn(page);
+        pte_phys <<= PAGE_SHIFT;
+        pte_phys += (dst & ~(PAGE_MASK));
+#else
+        pte_phys = (unsigned long)__pa(dest);
+#endif
+        wpte.header.op = KVM_MMU_OP_WRITE_PTE;
+        wpte.pte_val = val;
+        wpte.pte_phys = pte_phys;
+        kvm_deferred_mmu_op(&wpte, sizeof wpte);
+}
+/*
+ * We only need to hook operations that are MMU writes.  We hook these so that
+ * we can use lazy MMU mode to batch these operations.  We could probably
+ * improve the performance of the host code if we used some of the information
+ * here to simplify processing of batched writes.
+ *
+ * Every hook below forwards the raw entry value to kvm_mmu_write(); the
+ * clear hooks write 0.  The mm and addr arguments are not used.
+ */
+static void kvm_set_pte(pte_t *ptep, pte_t pte)
+{
+        kvm_mmu_write(ptep, pte_val(pte));
+}
+static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                           pte_t *ptep, pte_t pte)
+{
+        kvm_mmu_write(ptep, pte_val(pte));
+}
+static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+        kvm_mmu_write(pmdp, pmd_val(pmd));
+}
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+/* These four hooks are only built for CONFIG_X86_PAE. */
+static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+        kvm_mmu_write(ptep, pte_val(pte));
+}
+static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
+                                pte_t *ptep, pte_t pte)
+{
+        kvm_mmu_write(ptep, pte_val(pte));
+}
+static void kvm_pte_clear(struct mm_struct *mm,
+                          unsigned long addr, pte_t *ptep)
+{
+        kvm_mmu_write(ptep, 0);
+}
+static void kvm_pmd_clear(pmd_t *pmdp)
+{
+        kvm_mmu_write(pmdp, 0);
+}
+#endif /* CONFIG_X86_PAE */
+static void kvm_set_pud(pud_t *pudp, pud_t pud)
+{
+        kvm_mmu_write(pudp, pud_val(pud));
+}
+#if PAGETABLE_LEVELS == 4
+static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+        kvm_mmu_write(pgdp, pgd_val(pgd));
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* PAGETABLE_LEVELS >= 3 */
+/* Submit a KVM_MMU_OP_FLUSH_TLB record through the deferred-op path. */
+static void kvm_flush_tlb(void)
+{
+        struct kvm_mmu_op_flush_tlb op = {
+                .header.op = KVM_MMU_OP_FLUSH_TLB,
+        };
+        kvm_deferred_mmu_op(&op, sizeof(op));
+}
+/*
+ * Send a KVM_MMU_OP_RELEASE_PT record for the page frame pfn.  Unlike the
+ * write and flush ops this one is sent at once with kvm_mmu_op() and is
+ * never queued.
+ */
+static void kvm_release_pt(u32 pfn)
+{
+        struct kvm_mmu_op_release_pt op = {
+                .header.op = KVM_MMU_OP_RELEASE_PT,
+                .pt_phys = (u64)pfn << PAGE_SHIFT,
+        };
+        kvm_mmu_op(&op, sizeof(op));
+}
+/*
+ * Enter lazy MMU mode through the generic paravirt helper, then record the
+ * resulting mode in this CPU's state so kvm_deferred_mmu_op() starts queuing.
+ */
+static void kvm_enter_lazy_mmu(void)
+{
+        struct kvm_para_state *state = kvm_para_state();
+        paravirt_enter_lazy_mmu();
+        state->mode = paravirt_get_lazy_mode();
+}
+/*
+ * Leave lazy mode: send the queued MMU ops to the host first, then leave the
+ * current lazy mode and record the new mode in this CPU's state.
+ */
+static void kvm_leave_lazy_mmu(void)
+{
+        struct kvm_para_state *state = kvm_para_state();
+        mmu_queue_flush(state);
+        paravirt_leave_lazy(paravirt_get_lazy_mode());
+        state->mode = paravirt_get_lazy_mode();
+}
+/*
+ * Install the KVM paravirt hooks.  pv_info is always updated; the io_delay
+ * hook and the MMU hooks are installed only when the host advertises
+ * KVM_FEATURE_NOP_IO_DELAY and KVM_FEATURE_MMU_OP respectively.
+ */
+static void paravirt_ops_setup(void)
+{
+        pv_info.name = "KVM";
+        pv_info.paravirt_enabled = 1;
+        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+                pv_cpu_ops.io_delay = kvm_io_delay;
+        if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
+                pv_mmu_ops.set_pte = kvm_set_pte;
+                pv_mmu_ops.set_pte_at = kvm_set_pte_at;
+                pv_mmu_ops.set_pmd = kvm_set_pmd;
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+                pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
+                pv_mmu_ops.set_pte_present = kvm_set_pte_present;
+                pv_mmu_ops.pte_clear = kvm_pte_clear;
+                pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+#endif
+                pv_mmu_ops.set_pud = kvm_set_pud;
+#if PAGETABLE_LEVELS == 4
+                pv_mmu_ops.set_pgd = kvm_set_pgd;
+#endif
+#endif
+                pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
+                /* One release handler serves all three page-table levels. */
+                pv_mmu_ops.release_pte = kvm_release_pt;
+                pv_mmu_ops.release_pmd = kvm_release_pt;
+                pv_mmu_ops.release_pud = kvm_release_pt;
+                pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
+                pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+        }
+}
+/* Install the KVM paravirt hooks when kvm_para_available() reports KVM. */
+void __init kvm_guest_init(void)
+{
+        if (kvm_para_available())
+                paravirt_ops_setup();
+}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 00000000000..ddee04043ae
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,187 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+    Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <asm/reboot.h>
+#define KVM_SCALE 22
+static int kvmclock = 1;
+/*
+ * Handler for the "no-kvmclock" early parameter: clears the kvmclock flag so
+ * kvmclock_init() leaves the native clock in place.  The argument value is
+ * ignored.
+ */
+static int parse_no_kvmclock(char *arg)
+{
+        kvmclock = 0;
+        return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+/* The hypervisor will put information about time periodically here */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+/*
+ * Scale the TSC cycles elapsed since last_tsc by the current CPU's
+ * tsc_to_system_mul and shift the product right by KVM_SCALE.
+ */
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+        u64 cycles = native_read_tsc() - last_tsc;
+        u64 scaled = cycles * get_clock(smp_processor_id(), tsc_to_system_mul);
+        return scaled >> KVM_SCALE;
+}
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+        u32 wc_sec, wc_nsec;
+        u64 delta;
+        struct timespec ts;
+        int version, nsec;
+        int low, high;
+        /* Split the physical address of wall_clock into two MSR halves. */
+        low = (int)__pa(&wall_clock);
+        high = ((u64)__pa(&wall_clock) >> 32);
+        /* Remember the clock reading taken just before the request. */
+        delta = kvm_clock_read();
+        native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+        /*
+         * Retry until wc_version is the same before and after reading the
+         * fields, and is even.
+         */
+        do {
+                version = wall_clock.wc_version;
+                rmb();
+                wc_sec = wall_clock.wc_sec;
+                wc_nsec = wall_clock.wc_nsec;
+                rmb();
+        } while ((wall_clock.wc_version != version) || (version & 1));
+        /* Add the time that passed while reading to the nanosecond part. */
+        delta = kvm_clock_read() - delta;
+        delta += wc_nsec;
+        /* do_div() leaves whole seconds in delta and returns the remainder. */
+        nsec = do_div(delta, NSEC_PER_SEC);
+        set_normalized_timespec(&ts, wc_sec + delta, nsec);
+        /*
+         * Of all mechanisms of time adjustment I've tested, this one
+         * was the champion!
+         */
+        /* NOTE(review): the extra second is empirical; no rationale is given. */
+        return ts.tv_sec + 1;
+}
+/* Setting the wall clock is a no-op here: now is ignored and 0 is returned. */
+int kvm_set_wallclock(unsigned long now)
+{
+        return 0;
+}
+/*
+ * This is our read_clock function. The host puts a tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
+static cycle_t kvm_clock_read(void)
+{
+        u64 last_tsc, now;
+        int cpu;
+        /* Stay on one CPU while its hv_clock fields and the TSC are read. */
+        preempt_disable();
+        cpu = smp_processor_id();
+        last_tsc = get_clock(cpu, tsc_timestamp);
+        now = get_clock(cpu, system_time);
+        /* Add the scaled TSC time elapsed since the host's last update. */
+        now += kvm_get_delta(last_tsc);
+        preempt_enable();
+        return now;
+}
+/*
+ * Clocksource backed by kvm_clock_read().  mult is 1 << KVM_SCALE and shift
+ * is KVM_SCALE, so the two cancel each other out.
+ */
+static struct clocksource kvm_clock = {
+        .name = "kvm-clock",
+        .read = kvm_clock_read,
+        .rating = 400,
+        .mask = CLOCKSOURCE_MASK(64),
+        .mult = 1 << KVM_SCALE,
+        .shift = KVM_SCALE,
+        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+/*
+ * Register the current CPU's hv_clock area with the host by writing its
+ * physical address, with bit 0 set, to MSR_KVM_SYSTEM_TIME.
+ * Returns the result of native_write_msr_safe().
+ */
+static int kvm_register_clock(void)
+{
+        struct kvm_vcpu_time_info *info;
+        int low, high;
+        info = &per_cpu(hv_clock, smp_processor_id());
+        low = (int)__pa(info) | 1;
+        high = ((u64)__pa(info) >> 32);
+        return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+}
+/*
+ * setup_secondary_clock hook: register this CPU's hv_clock area with the
+ * host, then run the native secondary APIC clock setup.
+ */
+static void kvm_setup_secondary_clock(void)
+{
+        /*
+         * Now that the first cpu already had this clocksource initialized,
+         * we shouldn't fail.
+         */
+        WARN_ON(kvm_register_clock());
+        /* ok, done with our trickery, call native */
+        setup_secondary_APIC_clock();
+}
+/*
+ * After the clock is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shut down, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will keep being written. So before any
+ * kind of shutdown from our side, we unregister the clock by writing anything
+ * that does not have the 'enable' bit set in the msr
+ */
+#ifdef CONFIG_KEXEC
+/* Crash path: unregister the clock, then run the native crash shutdown. */
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+        native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+        native_machine_crash_shutdown(regs);
+}
+#endif
+/*
+ * machine_ops.shutdown hook: write 0 to MSR_KVM_SYSTEM_TIME to unregister
+ * the clock, then run the native shutdown.
+ */
+static void kvm_shutdown(void)
+{
+        native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+        native_machine_shutdown();
+}
+/*
+ * Switch the guest to the KVM paravirtual clock.
+ * Does nothing unless we run on KVM, "no-kvmclock" was not given, the host
+ * offers KVM_FEATURE_CLOCKSOURCE and registering the boot CPU's clock area
+ * succeeds.  It then installs the wallclock, sched_clock, secondary-clock
+ * and shutdown hooks and registers the clocksource.
+ */
+void __init kvmclock_init(void)
+{
+        if (!kvm_para_available())
+                return;
+        if (!kvmclock || !kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
+                return;
+        if (kvm_register_clock())
+                return;
+        pv_time_ops.get_wallclock = kvm_get_wallclock;
+        pv_time_ops.set_wallclock = kvm_set_wallclock;
+        pv_time_ops.sched_clock = kvm_clock_read;
+        pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+        /* Make every shutdown path unregister the clock first. */
+        machine_ops.shutdown = kvm_shutdown;
+#ifdef CONFIG_KEXEC
+        machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+        clocksource_register(&kvm_clock);
+}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1791a751a77..a4a838306b2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void)
        }
 }
-static void native_machine_shutdown(void)
+void native_machine_shutdown(void)
 {
        /* Stop the cpus and apics */
 #ifdef CONFIG_SMP
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = {
        .shutdown = native_machine_shutdown,
        .emergency_restart = native_machine_emergency_restart,
        .restart = native_machine_restart,
-        .halt = native_machine_halt
+        .halt = native_machine_halt,
+#ifdef CONFIG_KEXEC
+        .crash_shutdown = native_machine_crash_shutdown,
+#endif
 };
 void machine_power_off(void)
@@ -498,3 +501,9 @@ void machine_halt(void)
        machine_ops.halt();
 }
+#ifdef CONFIG_KEXEC
+/* Dispatch the crash shutdown through machine_ops.crash_shutdown. */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+        machine_ops.crash_shutdown(regs);
+}
+#endif
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 44cc9b93393..2283422af79 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -47,6 +47,7 @@
 #include <linux/pfn.h>
 #include <linux/pci.h>
 #include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
 #include <video/edid.h>
@@ -820,6 +821,10 @@ void __init setup_arch(char **cmdline_p)
        max_low_pfn = setup_memory();
+#ifdef CONFIG_KVM_CLOCK
+        kvmclock_init();
+#endif
 #ifdef CONFIG_VMI
        /*
         * Must be after max_low_pfn is determined, and before kernel
@@ -827,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
         */
        vmi_init();
 #endif
+        kvm_guest_init();
        /*
         * NOTE: before this point _nobody_ is allowed to allocate
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 60e64c8eee9..a94fb959a87 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -42,6 +42,7 @@
 #include <linux/ctype.h>
 #include <linux/uaccess.h>
 #include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -384,6 +385,10 @@ void __init setup_arch(char **cmdline_p)
        io_delay_init();
+#ifdef CONFIG_KVM_CLOCK
+        kvmclock_init();
+#endif
 #ifdef CONFIG_SMP
        /* setup to use the early static init tables during kernel startup */
        x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -488,6 +493,8 @@ void __init setup_arch(char **cmdline_p)
        init_apic_mappings();
        ioapic_init_mappings();
+        kvm_guest_init();
        /*
         * We trust e820 completely. No explicit ROM probing in memory.
         */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e793c0..8d45fabc5f3 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,7 @@ if VIRTUALIZATION
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
-        depends on HAVE_KVM && EXPERIMENTAL
+        depends on HAVE_KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        ---help---
@@ -50,6 +50,17 @@ config KVM_AMD
          Provides support for KVM on AMD processors equipped with the AMD-V
          (SVM) extensions.
+config KVM_TRACE
+        bool "KVM trace support"
+        depends on KVM && MARKERS && SYSFS
+        select RELAY
+        select DEBUG_FS
+        default n
+        ---help---
+          This option allows reading a trace of kvm-related events through
+          relayfs.  Note the ABI is not considered stable and will be
+          modified in future updates.
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index ffdd0b31078..c97d35c218d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,14 @@
 #
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+ifeq ($(CONFIG_KVM_TRACE),y)
+common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+endif
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+        i8254.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 00000000000..361e3161127
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,611 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ *   Sheng Yang <sheng.yang@intel.com>
+ *   Based on QEMU and Xen.
+ */
+#include <linux/kvm_host.h>
+#include "irq.h"
+#include "i8254.h"
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+/*
+ * Compute with 96 bit intermediate result: (a*b)/c
+ *
+ * a is split into 32-bit halves, each multiplied by b, and the product is
+ * divided by c in two steps (high word first, remainder carried into the
+ * low word).
+ * NOTE(review): the union places 'low' before 'high', which matches the u64
+ * layout only on little-endian machines.
+ */
+static u64 muldiv64(u64 a, u32 b, u32 c)
+{
+        union {
+                u64 ll;
+                struct {
+                        u32 low, high;
+                } l;
+        } u, res;
+        u64 rl, rh;
+        u.ll = a;
+        /* Partial products of the low and high halves of a. */
+        rl = (u64)u.l.low * (u64)b;
+        rh = (u64)u.l.high * (u64)b;
+        /* Carry the upper 32 bits of the low product into the high one. */
+        rh += (rl >> 32);
+        res.l.high = div64_64(rh, c);
+        /* Remainder of the high division becomes the top of the low dividend. */
+        res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+        return res.ll;
+}
+/*
+ * Update the gate input of a channel.  In modes 1, 2, 3 and 5 a rising edge
+ * (new value above the stored gate) restarts counting by resetting
+ * count_load_time; in every other mode only the gate value is stored.
+ * Warns if pit_state.lock is not held.
+ */
+static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+{
+        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        struct kvm_kpit_channel_state *chan = &ps->channels[channel];
+        WARN_ON(!mutex_is_locked(&ps->lock));
+        switch (chan->mode) {
+        case 1:
+        case 2:
+        case 3:
+        case 5:
+                /* Restart counting on rising edge. */
+                if (val > chan->gate)
+                        chan->count_load_time = ktime_get();
+                break;
+        default:
+                /* Modes 0 and 4. XXX: just disable/enable counting */
+                break;
+        }
+        chan->gate = val;
+}
+/* Return the stored gate value of a channel; warns if the lock is not held. */
+int pit_get_gate(struct kvm *kvm, int channel)
+{
+        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        WARN_ON(!mutex_is_locked(&ps->lock));
+        return ps->channels[channel].gate;
+}
+/*
+ * Compute the current counter value of a channel from the time elapsed
+ * since its count was loaded.  Warns if pit_state.lock is not held.
+ */
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+        struct kvm_kpit_channel_state *c =
+                &kvm->arch.vpit->pit_state.channels[channel];
+        s64 d, t;
+        int counter;
+        WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+        /* t: ns since the count was loaded; d: the same span in PIT ticks. */
+        t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+        d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+        switch (c->mode) {
+        case 0:
+        case 1:
+        case 4:
+        case 5:
+                /* Counts down once and wraps at 16 bits. */
+                counter = (c->count - d) & 0xffff;
+                break;
+        case 3:
+                /* XXX: may be incorrect for odd counts */
+                counter = c->count - (mod_64((2 * d), c->count));
+                break;
+        default:
+                /* Remaining modes reload: count down modulo the loaded count. */
+                counter = c->count - mod_64(d, c->count);
+                break;
+        }
+        return counter;
+}
+/*
+ * Compute the current state (0 or 1) of a channel's OUT pin from the time
+ * elapsed since its count was loaded.  Warns if pit_state.lock is not held.
+ */
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+        struct kvm_kpit_channel_state *c =
+                &kvm->arch.vpit->pit_state.channels[channel];
+        s64 d, t;
+        int out;
+        WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+        /* t: ns since the count was loaded; d: the same span in PIT ticks. */
+        t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+        d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+        switch (c->mode) {
+        default:
+        case 0:
+                /* High once the count has elapsed. */
+                out = (d >= c->count);
+                break;
+        case 1:
+                /* High until the count has elapsed. */
+                out = (d < c->count);
+                break;
+        case 2:
+                /* High only on exact multiples of the count, except at 0. */
+                out = ((mod_64(d, c->count) == 0) && (d != 0));
+                break;
+        case 3:
+                /* High for the first half (rounded up) of each period. */
+                out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+                break;
+        case 4:
+        case 5:
+                /* High only on the tick where the count elapses. */
+                out = (d == c->count);
+                break;
+        }
+        return out;
+}
+/*
+ * Latch the current count of a channel unless a latch is already pending.
+ * count_latched is set to the channel's rw_mode.  Warns if pit_state.lock
+ * is not held.
+ */
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        struct kvm_kpit_channel_state *chan = &ps->channels[channel];
+        WARN_ON(!mutex_is_locked(&ps->lock));
+        if (chan->count_latched)
+                return;
+        chan->latched_count = pit_get_count(kvm, channel);
+        chan->count_latched = chan->rw_mode;
+}
+/*
+ * Latch the status byte of a channel unless one is already latched.
+ * Layout: OUT in bit 7, rw_mode from bit 4, mode from bit 1, bcd in bit 0.
+ * Warns if pit_state.lock is not held.
+ */
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        struct kvm_kpit_channel_state *chan = &ps->channels[channel];
+        int out;
+        WARN_ON(!mutex_is_locked(&ps->lock));
+        if (chan->status_latched)
+                return;
+        /* TODO: Return NULL COUNT (bit 6). */
+        out = pit_get_out(kvm, channel);
+        chan->status = ((out << 7) |
+                        (chan->rw_mode << 4) |
+                        (chan->mode << 1) |
+                        chan->bcd);
+        chan->status_latched = 1;
+}
+/*
+ * Body of the PIT hrtimer callback: count one more pending tick, wake vcpu 0
+ * if it is waiting, and advance the expiry by one period.
+ * Returns 1 when the timer has a non-zero period, 0 otherwise.
+ */
+int __pit_timer_fn(struct kvm_kpit_state *ps)
+{
+        struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
+        struct kvm_kpit_timer *pt = &ps->pit_timer;
+        atomic_inc(&pt->pending);
+        smp_mb__after_atomic_inc();
+        /* FIXME: handle case where the guest is in guest mode */
+        if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+                vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                wake_up_interruptible(&vcpu0->wq);
+        }
+        /* Move the expiry forward and record it in nanoseconds. */
+        pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+        pt->scheduled = ktime_to_ns(pt->timer.expires);
+        return (pt->period == 0 ? 0 : 1);
+}
+/*
+ * Return the number of pending PIT ticks for vcpu 0; always 0 for other
+ * vcpus or when the VM has no PIT.
+ */
+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+        if (!pit || vcpu->vcpu_id != 0)
+                return 0;
+        return atomic_read(&pit->pit_state.pit_timer.pending);
+}
+/*
+ * hrtimer callback: recover the PIT state that embeds the timer and map the
+ * result of __pit_timer_fn() onto the hrtimer restart codes.
+ */
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+        struct kvm_kpit_state *ps =
+                container_of(data, struct kvm_kpit_state, pit_timer.timer);
+        return __pit_timer_fn(ps) ? HRTIMER_RESTART : HRTIMER_NORESTART;
+}
+/* Cancel the PIT hrtimer. */
+static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+{
+        pr_debug("pit: execute del timer!\n");
+        hrtimer_cancel(&pt->timer);
+}
+/*
+ * (Re)arm the PIT hrtimer for a count of val PIT ticks.
+ * is_period selects a periodic timer (period = interval) or a one-shot
+ * (period = 0).  Any running timer is cancelled and the pending tick count
+ * is reset.
+ */
+static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+{
+        s64 interval;
+        /* Convert PIT ticks to nanoseconds. */
+        interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+        pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+        /* TODO: the new value should only take effect after a retrigger */
+        hrtimer_cancel(&pt->timer);
+        pt->period = (is_period == 0) ? 0 : interval;
+        pt->timer.function = pit_timer_fn;
+        atomic_set(&pt->pending, 0);
+        hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+                      HRTIMER_MODE_ABS);
+}
+/*
+ * Load a new count into a channel and stamp the load time.  For channel 0
+ * the hrtimer is also re-armed or cancelled according to the channel mode.
+ * Warns if pit_state.lock is not held.
+ */
+static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        WARN_ON(!mutex_is_locked(&ps->lock));
+        pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+        /*
+         * Though spec said the state of 8254 is undefined after power-up,
+         * seems some tricky OS like Windows XP depends on IRQ0 interrupt
+         * when booting up.
+         * So here setting initialize rate for it, and not a specific number
+         */
+        /* A count of 0 is loaded as 0x10000. */
+        if (val == 0)
+                val = 0x10000;
+        ps->channels[channel].count_load_time = ktime_get();
+        ps->channels[channel].count = val;
+        /* Only channel 0 drives the hrtimer. */
+        if (channel != 0)
+                return;
+        /* Two types of timer
+         * mode 1 is one shot, mode 2 is periodic, otherwise delete the timer */
+        switch (ps->channels[0].mode) {
+        case 1:
+                create_pit_timer(&ps->pit_timer, val, 0);
+                break;
+        case 2:
+                create_pit_timer(&ps->pit_timer, val, 1);
+                break;
+        default:
+                destroy_pit_timer(&ps->pit_timer);
+        }
+}
+/* Locked wrapper: take pit_state.lock around pit_load_count(). */
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+        mutex_lock(&kvm->arch.vpit->pit_state.lock);
+        pit_load_count(kvm, channel, val);
+        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+}
+/*
+ * Handle a guest write to one of the four PIT ports.
+ * Port 3 is the control word: it either issues a read-back command, latches
+ * a channel's count, or programs a channel's access mode, counting mode and
+ * BCD flag.  Ports 0-2 write the count of the matching channel, one byte at
+ * a time according to the channel's write state.
+ * Only the first byte of data is used, whatever len is.
+ */
+static void pit_ioport_write(struct kvm_io_device *this,
+                             gpa_t addr, int len, const void *data)
+{
+        struct kvm_pit *pit = (struct kvm_pit *)this->private;
+        struct kvm_kpit_state *pit_state = &pit->pit_state;
+        struct kvm *kvm = pit->kvm;
+        int channel, access;
+        struct kvm_kpit_channel_state *s;
+        /*
+         * Read a single byte: the buffer may be only one byte long, and on
+         * x86 this is the same byte the old 32-bit load masked out.
+         */
+        u32 val = *(const u8 *)data;
+        addr &= KVM_PIT_CHANNEL_MASK;
+        mutex_lock(&pit_state->lock);
+        if (val != 0)
+                pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
+                          (unsigned int)addr, len, val);
+        if (addr == 3) {
+                channel = val >> 6;
+                if (channel == 3) {
+                        /* Read-Back Command. */
+                        for (channel = 0; channel < 3; channel++) {
+                                /* Bits 1-3 select channels 0-2. */
+                                if (val & (2 << channel)) {
+                                        /* Bits 5 and 4 are active low. */
+                                        if (!(val & 0x20))
+                                                pit_latch_count(kvm, channel);
+                                        if (!(val & 0x10))
+                                                pit_latch_status(kvm, channel);
+                                }
+                        }
+                } else {
+                        /* Select Counter <channel>. */
+                        s = &pit_state->channels[channel];
+                        access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+                        if (access == 0) {
+                                pit_latch_count(kvm, channel);
+                        } else {
+                                s->rw_mode = access;
+                                s->read_state = access;
+                                s->write_state = access;
+                                s->mode = (val >> 1) & 7;
+                                /* Encodings 6 and 7 alias modes 2 and 3. */
+                                if (s->mode > 5)
+                                        s->mode -= 4;
+                                s->bcd = val & 1;
+                        }
+                }
+        } else {
+                /* Write Count. */
+                s = &pit_state->channels[addr];
+                switch (s->write_state) {
+                default:
+                case RW_STATE_LSB:
+                        pit_load_count(kvm, addr, val);
+                        break;
+                case RW_STATE_MSB:
+                        pit_load_count(kvm, addr, val << 8);
+                        break;
+                case RW_STATE_WORD0:
+                        /* Low byte first: hold it until the high byte arrives. */
+                        s->write_latch = val;
+                        s->write_state = RW_STATE_WORD1;
+                        break;
+                case RW_STATE_WORD1:
+                        pit_load_count(kvm, addr, s->write_latch | (val << 8));
+                        s->write_state = RW_STATE_WORD0;
+                        break;
+                }
+        }
+        mutex_unlock(&pit_state->lock);
+}
+/*
+ * Handle a guest read from one of the PIT counter ports (0-2).
+ * A latched status byte is returned first, then a latched count, otherwise
+ * the live count; 16-bit values are returned one byte per read according to
+ * the channel's read state.  At most sizeof(int) bytes are copied to data.
+ * Reads of port 3 (the control word) are ignored and leave data untouched.
+ */
+static void pit_ioport_read(struct kvm_io_device *this,
+                            gpa_t addr, int len, void *data)
+{
+        struct kvm_pit *pit = (struct kvm_pit *)this->private;
+        struct kvm_kpit_state *pit_state = &pit->pit_state;
+        struct kvm *kvm = pit->kvm;
+        int ret, count;
+        struct kvm_kpit_channel_state *s;
+        addr &= KVM_PIT_CHANNEL_MASK;
+        /*
+         * Port 3 has no channel behind it; indexing channels[] with it would
+         * go past the three channels handled by the write path.
+         */
+        if (addr == 3)
+                return;
+        s = &pit_state->channels[addr];
+        mutex_lock(&pit_state->lock);
+        if (s->status_latched) {
+                s->status_latched = 0;
+                ret = s->status;
+        } else if (s->count_latched) {
+                switch (s->count_latched) {
+                default:
+                case RW_STATE_LSB:
+                        ret = s->latched_count & 0xff;
+                        s->count_latched = 0;
+                        break;
+                case RW_STATE_MSB:
+                        ret = s->latched_count >> 8;
+                        s->count_latched = 0;
+                        break;
+                case RW_STATE_WORD0:
+                        /* Low byte now, high byte on the next read. */
+                        ret = s->latched_count & 0xff;
+                        s->count_latched = RW_STATE_MSB;
+                        break;
+                }
+        } else {
+                switch (s->read_state) {
+                default:
+                case RW_STATE_LSB:
+                        count = pit_get_count(kvm, addr);
+                        ret = count & 0xff;
+                        break;
+                case RW_STATE_MSB:
+                        count = pit_get_count(kvm, addr);
+                        ret = (count >> 8) & 0xff;
+                        break;
+                case RW_STATE_WORD0:
+                        count = pit_get_count(kvm, addr);
+                        ret = count & 0xff;
+                        s->read_state = RW_STATE_WORD1;
+                        break;
+                case RW_STATE_WORD1:
+                        count = pit_get_count(kvm, addr);
+                        ret = (count >> 8) & 0xff;
+                        s->read_state = RW_STATE_WORD0;
+                        break;
+                }
+        }
+        if (len > sizeof(ret))
+                len = sizeof(ret);
+        memcpy(data, (char *)&ret, len);
+        mutex_unlock(&pit_state->lock);
+}
+/*
+ * Tell the I/O bus whether @addr belongs to the PIT: true for the
+ * KVM_PIT_MEM_LENGTH ports starting at KVM_PIT_BASE_ADDRESS.
+ */
+static int pit_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+        if (addr < KVM_PIT_BASE_ADDRESS)
+                return 0;
+        return addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH;
+}
+/*
+ * PIO write handler for the speaker port.
+ *
+ * @this: I/O device whose ->private points at the owning struct kvm_pit
+ * @addr: guest port (unused; speaker_in_range() matches a single port)
+ * @len:  number of bytes the guest wrote
+ * @data: bytes written by the guest
+ *
+ * Bit 1 of the written value is stored as speaker_data_on and bit 0 drives
+ * the gate of channel 2.
+ */
+static void speaker_ioport_write(struct kvm_io_device *this,
+                                 gpa_t addr, int len, const void *data)
+{
+        struct kvm_pit *pit = (struct kvm_pit *)this->private;
+        struct kvm_kpit_state *pit_state = &pit->pit_state;
+        struct kvm *kvm = pit->kvm;
+        u32 val = 0;
+        /*
+         * Copy only what the guest supplied: dereferencing @data as a u32
+         * would read past a shorter access and drop the const qualifier.
+         */
+        if (len > sizeof(val))
+                len = sizeof(val);
+        memcpy(&val, data, len);
+        mutex_lock(&pit_state->lock);
+        pit_state->speaker_data_on = (val >> 1) & 1;
+        pit_set_gate(kvm, 2, val & 1);
+        mutex_unlock(&pit_state->lock);
+}
+/*
+ * PIO read handler for the speaker port.
+ *
+ * Builds the returned value from: channel 2 gate (bit 0), speaker_data_on
+ * (bit 1), a refresh-clock toggle derived from the current time (bit 4) and
+ * the channel 2 output (bit 5).  At most sizeof(int) bytes are copied to
+ * @data.
+ */
+static void speaker_ioport_read(struct kvm_io_device *this,
+                                gpa_t addr, int len, void *data)
+{
+        struct kvm_pit *pit = (struct kvm_pit *)this->private;
+        struct kvm_kpit_state *ps = &pit->pit_state;
+        struct kvm *kvm = pit->kvm;
+        unsigned int toggle;
+        int ret;
+        /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+        toggle = (unsigned int)ktime_to_ns(ktime_get()) >> 14;
+        toggle &= 1;
+        mutex_lock(&ps->lock);
+        ret = ps->speaker_data_on << 1;
+        ret |= pit_get_gate(kvm, 2);
+        ret |= pit_get_out(kvm, 2) << 5;
+        ret |= toggle << 4;
+        if (len > sizeof(ret))
+                len = sizeof(ret);
+        memcpy(data, (char *)&ret, len);
+        mutex_unlock(&ps->lock);
+}
+/* The speaker device claims exactly one port: KVM_SPEAKER_BASE_ADDRESS. */
+static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+        return addr == KVM_SPEAKER_BASE_ADDRESS;
+}
+/*
+ * Put the PIT back into its initial state: every channel gets mode 0xff and
+ * a count of 0 loaded, gates are opened on channels 0 and 1 and closed on
+ * channel 2.  Afterwards the pending-tick counter is cleared and injection
+ * is re-armed.
+ */
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+        struct kvm_kpit_state *ps = &pit->pit_state;
+        int ch;
+        mutex_lock(&ps->lock);
+        for (ch = 0; ch < 3; ch++) {
+                struct kvm_kpit_channel_state *chan = &ps->channels[ch];
+                chan->mode = 0xff;
+                chan->gate = (ch != 2);
+                pit_load_count(pit->kvm, ch, 0);
+        }
+        mutex_unlock(&ps->lock);
+        atomic_set(&ps->pit_timer.pending, 0);
+        ps->inject_pending = 1;
+}
+/*
+ * Allocate and initialise the in-kernel PIT for @kvm.
+ *
+ * Returns the new PIT (also stored in kvm->arch.vpit), or NULL when the
+ * allocation fails.
+ *
+ * The PIT and speaker devices are placed on the PIO bus only after the
+ * state, the back pointers and the hrtimer have been initialised and the
+ * PIT has been reset: the port handlers read pit->kvm before taking the
+ * state lock, so they must never be reachable on a half-built device.
+ */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+{
+        struct kvm_pit *pit;
+        struct kvm_kpit_state *pit_state;
+        pit = kzalloc(sizeof(*pit), GFP_KERNEL);
+        if (!pit)
+                return NULL;
+        pit_state = &pit->pit_state;
+        mutex_init(&pit_state->lock);
+        mutex_lock(&pit_state->lock);
+        pit->kvm = kvm;
+        pit_state->pit = pit;
+        hrtimer_init(&pit_state->pit_timer.timer,
+                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+        /* Describe the PIO devices; they are registered further down */
+        pit->dev.read = pit_ioport_read;
+        pit->dev.write = pit_ioport_write;
+        pit->dev.in_range = pit_in_range;
+        pit->dev.private = pit;
+        pit->speaker_dev.read = speaker_ioport_read;
+        pit->speaker_dev.write = speaker_ioport_write;
+        pit->speaker_dev.in_range = speaker_in_range;
+        pit->speaker_dev.private = pit;
+        kvm->arch.vpit = pit;
+        mutex_unlock(&pit_state->lock);
+        /* kvm_pit_reset() takes the state lock itself */
+        kvm_pit_reset(pit);
+        /* Fully initialised: now let guest port accesses reach the devices */
+        kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+        kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+        return pit;
+}
+/*
+ * Tear down the PIT of @kvm, if one was created: cancel its hrtimer under
+ * the state lock, then free the structure.
+ */
+void kvm_free_pit(struct kvm *kvm)
+{
+        struct kvm_pit *pit = kvm->arch.vpit;
+        if (!pit)
+                return;
+        mutex_lock(&pit->pit_state.lock);
+        hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+        mutex_unlock(&pit->pit_state.lock);
+        kfree(pit);
+}
+/*
+ * Pulse interrupt line 0 on both interrupt controllers of @kvm: the line is
+ * raised and then lowered first on the IOAPIC, then on the PIC.  The whole
+ * sequence runs under kvm->lock.
+ */
+void __inject_pit_timer_intr(struct kvm *kvm)
+{
+        mutex_lock(&kvm->lock);
+        /* IOAPIC pin 0: assert, then de-assert */
+        kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
+        kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
+        /* PIC line 0: assert, then de-assert */
+        kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
+        kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+        mutex_unlock(&kvm->lock);
+}
+/*
+ * Inject a pending PIT timer interrupt for the VM of @vcpu, if appropriate.
+ *
+ * Does nothing when @vcpu is NULL or the VM has no in-kernel PIT.  @vcpu is
+ * only dereferenced after the NULL check.
+ */
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+{
+        struct kvm *kvm;
+        struct kvm_pit *pit;
+        struct kvm_kpit_state *ps;
+        if (!vcpu)
+                return;
+        kvm = vcpu->kvm;
+        pit = kvm->arch.vpit;
+        if (!pit)
+                return;
+        ps = &pit->pit_state;
+        /* Try to inject pending interrupts when:
+         * 1. Pending exists
+         * 2. Last interrupt was accepted or waited for too long time*/
+        if (atomic_read(&ps->pit_timer.pending) &&
+            (ps->inject_pending ||
+            (jiffies - ps->last_injected_time
+                        >= KVM_MAX_PIT_INTR_INTERVAL))) {
+                ps->inject_pending = 0;
+                __inject_pit_timer_intr(kvm);
+                ps->last_injected_time = jiffies;
+        }
+}
+/*
+ * Bookkeeping after vector @vec has been delivered to @vcpu.
+ *
+ * When PIT ticks are pending and @vec is the vector line 0 is routed to
+ * (PIC: IRQ0 unmasked and irq_base == @vec; IOAPIC: redirection entry 0
+ * unmasked with that vector), one pending tick is consumed, injection is
+ * re-armed and the channel 0 load time is restarted.
+ *
+ * Does nothing when @vcpu is NULL or the VM has no in-kernel PIT.  @vcpu is
+ * only dereferenced after the NULL check.
+ */
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+        struct kvm_arch *arch;
+        struct kvm_kpit_state *ps;
+        if (!vcpu)
+                return;
+        arch = &vcpu->kvm->arch;
+        if (!arch->vpit)
+                return;
+        ps = &arch->vpit->pit_state;
+        if (atomic_read(&ps->pit_timer.pending) &&
+        (((arch->vpic->pics[0].imr & 1) == 0 &&
+          arch->vpic->pics[0].irq_base == vec) ||
+          (arch->vioapic->redirtbl[0].fields.vector == vec &&
+          arch->vioapic->redirtbl[0].fields.mask != 1))) {
+                ps->inject_pending = 1;
+                atomic_dec(&ps->pit_timer.pending);
+                ps->channels[0].count_load_time = ktime_get();
+        }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 00000000000..db25c2a6c8c
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,63 @@
+#ifndef __I8254_H
+#define __I8254_H
+#include "iodev.h"
+/* Host-side timer state backing the emulated PIT. */
+struct kvm_kpit_timer {
+        /* Initialised in kvm_create_pit(), cancelled in kvm_free_pit() */
+        struct hrtimer timer;
+        int irq; /* NOTE(review): presumably the IRQ line; not used in view */
+        s64 period; /* unit: ns */
+        s64 scheduled; /* NOTE(review): usage not visible here */
+        ktime_t last_update; /* NOTE(review): usage not visible here */
+        /*
+         * Ticks not yet acknowledged: tested before injecting, decremented
+         * in kvm_pit_timer_intr_post(), cleared by kvm_pit_reset().
+         */
+        atomic_t pending;
+};
+/* Per-channel register state of the emulated PIT. */
+struct kvm_kpit_channel_state {
+        u32 count; /* can be 65536 */
+        u16 latched_count; /* value handed out while count_latched is set */
+        u8 count_latched; /* 0, or the RW_STATE_* of the next latched byte */
+        u8 status_latched; /* non-zero: next read returns 'status' */
+        u8 status;
+        u8 read_state; /* RW_STATE_* driving reads of the live counter */
+        u8 write_state; /* RW_STATE_* driving counter writes */
+        u8 write_latch;
+        u8 rw_mode;
+        u8 mode; /* kvm_pit_reset() stores 0xff here */
+        u8 bcd; /* not supported */
+        u8 gate; /* timer start */
+        ktime_t count_load_time; /* refreshed in kvm_pit_timer_intr_post() */
+};
+/* Complete PIT state: the three channels plus injection bookkeeping. */
+struct kvm_kpit_state {
+        struct kvm_kpit_channel_state channels[3];
+        struct kvm_kpit_timer pit_timer;
+        u32    speaker_data_on; /* bit 1 of the last speaker port write */
+        struct mutex lock; /* taken by the port handlers and kvm_pit_reset() */
+        struct kvm_pit *pit; /* back pointer, set in kvm_create_pit() */
+        bool inject_pending; /* if inject pending interrupts */
+        unsigned long last_injected_time; /* jiffies of the last injection */
+};
+/* The in-kernel PIT device, reachable through kvm->arch.vpit. */
+struct kvm_pit {
+        /* NOTE(review): spelled with three 's'; no user visible in view */
+        unsigned long base_addresss;
+        struct kvm_io_device dev; /* PIT counter ports */
+        struct kvm_io_device speaker_dev; /* speaker port */
+        struct kvm *kvm; /* owning VM */
+        struct kvm_kpit_state pit_state;
+};
+/* First PIT port and number of consecutive ports, see pit_in_range() */
+#define KVM_PIT_BASE_ADDRESS        0x40
+/* The single port claimed by speaker_in_range() */
+#define KVM_SPEAKER_BASE_ADDRESS    0x61
+#define KVM_PIT_MEM_LENGTH          4
+/* NOTE(review): presumably the PIT input clock in Hz -- confirm */
+#define KVM_PIT_FREQ                1193181
+/*
+ * Maximum number of jiffies to wait for an acknowledgement before injecting
+ * again.  Parenthesised so the expansion stays one operand in any
+ * expression.
+ */
+#define KVM_MAX_PIT_INTR_INTERVAL   (HZ / 100)
+/* Mask applied to the port number to obtain the register offset */
+#define KVM_PIT_CHANNEL_MASK        0x3
+/* Interrupt injection and post-delivery bookkeeping */
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+/* NOTE(review): definition not in view; presumably loads a channel count */
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
+/* Lifetime: create returns NULL on allocation failure */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_free_pit(struct kvm *kvm);
+void kvm_pit_reset(struct kvm_pit *pit);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e5714759e97..ce1f583459b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -23,6 +23,22 @@
 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "i8254.h"
+/*
+ * Report whether @vcpu has timer events waiting to be processed.
+ * The result is the bitwise OR of the PIT and the local APIC answers.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        int pit_pending = pit_has_pending_timer(vcpu);
+        int apic_pending = apic_has_pending_timer(vcpu);
+        return pit_pending | apic_pending;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 /*
 * check if there is pending interrupt without
@@ -66,6 +82,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
        kvm_inject_apic_timer_irqs(vcpu);
+        kvm_inject_pit_timer_irqs(vcpu);
        /* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
@@ -73,6 +90,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
        kvm_apic_timer_intr_post(vcpu, vec);
+        kvm_pit_timer_intr_post(vcpu, vec);
        /* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index fa5ed5d59b5..1802134b836 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index ecdfe97e463..65ef0fc2c03 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -39,6 +39,8 @@ struct vcpu_svm {
        unsigned long host_db_regs[NUM_DB_REGS];
        unsigned long host_dr6;
        unsigned long host_dr7;
+        u32 *msrpm;
 };
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68a6b151193..57ac4e4c556 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                } else
                        apic_clear_vector(vector, apic->regs + APIC_TMR);
-                if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
                        kvm_vcpu_kick(vcpu);
-                else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+                else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-                        vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+                        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_INIT:
                if (level) {
-                        if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+                        if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
                                printk(KERN_DEBUG
                                       "INIT on a runnable vcpu %d\n",
                                       vcpu->vcpu_id);
-                        vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+                        vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                        kvm_vcpu_kick(vcpu);
                } else {
                        printk(KERN_DEBUG
@@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_STARTUP:
                printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
                       vcpu->vcpu_id, vector);
-                if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        vcpu->arch.sipi_vector = vector;
-                        vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+                        vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
        apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
                           PRIx64 ", "
                           "timer initial count 0x%x, period %lldns, "
-                           "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+                           "expire @ 0x%016" PRIx64 ".\n", __func__,
                           APIC_BUS_CYCLE_NS, ktime_to_ns(now),
                           apic_get_reg(apic, APIC_TMICT),
                           apic->timer.period,
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
        /* too common printing */
        if (offset != APIC_EOI)
                apic_debug("%s: offset 0x%x with length 0x%x, and value is "
-                           "0x%x\n", __FUNCTION__, offset, len, val);
+                           "0x%x\n", __func__, offset, len, val);
        offset &= 0xff0;
@@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
+EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
@@ -869,7 +870,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        struct kvm_lapic *apic;
        int i;
-        apic_debug("%s\n", __FUNCTION__);
+        apic_debug("%s\n", __func__);
        ASSERT(vcpu);
        apic = vcpu->arch.apic;
@@ -907,7 +908,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        apic_update_ppr(apic);
        apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
-                   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+                   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
                   vcpu, kvm_apic_id(apic),
                   vcpu->arch.apic_base, apic->base_address);
 }
@@ -940,7 +941,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        atomic_inc(&apic->timer.pending);
        if (waitqueue_active(q)) {
-                apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+                apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                wake_up_interruptible(q);
        }
        if (apic_lvtt_period(apic)) {
@@ -952,6 +953,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        return result;
 }
+/*
+ * Return the pending-tick count of @vcpu's local APIC timer, or 0 when the
+ * vcpu has no in-kernel local APIC.
+ */
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+        struct kvm_lapic *apic = vcpu->arch.apic;
+        if (!apic)
+                return 0;
+        return atomic_read(&apic->timer.pending);
+}
 static int __inject_apic_timer_irq(struct kvm_lapic *apic)
 {
        int vector;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e55af12e11b..2ad6f548167 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,11 +27,22 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+/*
+ * Setting this variable to true enables Two-Dimensional Paging, where the
+ * hardware walks two page tables:
+ * 1. guest-virtual to guest-physical
+ * 2. while doing 1., guest-physical to host-physical
+ * If the hardware supports this, shadow paging is not needed.
+ */
+bool tdp_enabled = false;
 #undef MMU_DEBUG
 #undef AUDIT
@@ -101,8 +112,6 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 #define PT64_LEVEL_BITS 9
@@ -159,6 +168,13 @@ static int dbg = 1;
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+/*
+ * Buffer descriptor for paravirtual MMU operations.
+ * NOTE(review): the users are outside this view; the field notes below are
+ * inferred from names only -- confirm against kvm_pv_mmu_op().
+ */
+struct kvm_pv_mmu_op_buffer {
+        void *ptr; /* presumably the current position in the data */
+        unsigned len; /* presumably bytes still available */
+        unsigned processed; /* presumably bytes already consumed */
+        /* 512 bytes of storage, aligned to sizeof(long) */
+        char buf[512] __aligned(sizeof(long));
+};
 struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte)
 static int is_shadow_present_pte(u64 pte)
 {
-        pte &= ~PT_SHADOW_IO_MARK;
        return pte != shadow_trap_nonpresent_pte
                && pte != shadow_notrap_nonpresent_pte;
 }
+/*
+ * Return non-zero when PT_PAGE_SIZE_MASK is set in @pte.  The masked value
+ * itself is returned (converted to int), so callers should only test the
+ * result for truth.
+ */
+static int is_large_pte(u64 pte)
+{
+        return pte & PT_PAGE_SIZE_MASK;
+}
 static int is_writeble_pte(unsigned long pte)
 {
        return pte & PT_WRITABLE_MASK;
@@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte)
        return pte & PT_DIRTY_MASK;
 }
-static int is_io_pte(unsigned long pte)
+static int is_rmap_pte(u64 pte)
 {
-        return pte & PT_SHADOW_IO_MARK;
+        return is_shadow_present_pte(pte);
 }
-static int is_rmap_pte(u64 pte)
+static pfn_t spte_to_pfn(u64 pte)
 {
-        return is_shadow_present_pte(pte);
+        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 static gfn_t pse36_gfn_delta(u32 gpte)
@@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 }
 /*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+        /* Large-page numbers of the gfn and of the slot's first gfn */
+        gfn_t gfn_hpage = gfn / KVM_PAGES_PER_HPAGE;
+        gfn_t base_hpage = slot->base_gfn / KVM_PAGES_PER_HPAGE;
+        unsigned long idx = gfn_hpage - base_hpage;
+        return &slot->lpage_info[idx].write_count;
+}
+/*
+ * Bump the write count of the large-page region containing @gfn, warning if
+ * it exceeds KVM_PAGES_PER_HPAGE.
+ */
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+        int *nr_shadowed = slot_largepage_idx(gfn, slot);
+        ++*nr_shadowed;
+        WARN_ON(*nr_shadowed > KVM_PAGES_PER_HPAGE);
+}
+/*
+ * Drop the write count of the large-page region containing @gfn, warning if
+ * it goes negative.
+ */
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+        int *nr_shadowed = slot_largepage_idx(gfn, slot);
+        --*nr_shadowed;
+        WARN_ON(*nr_shadowed < 0);
+}
+/*
+ * Return the write count of the large-page region containing @gfn, or 1
+ * when @gfn is not covered by any memory slot.
+ */
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+        if (!slot)
+                return 1;
+        return *slot_largepage_idx(gfn, slot);
+}
+/*
+ * Return 1 when the host virtual address backing @gfn lies in a hugetlb
+ * VMA of the current process, 0 otherwise (including when @gfn has no valid
+ * host address).
+ */
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+        unsigned long hva = gfn_to_hva(kvm, gfn);
+        struct vm_area_struct *vma;
+        if (kvm_is_error_hva(hva))
+                return 0;
+        vma = find_vma(current->mm, hva);
+        return vma && is_vm_hugetlb_page(vma);
+}
+/*
+ * Decide whether @large_gfn may be mapped with a large page: the region
+ * must have a zero write count, be hugetlb-backed on the host, and its slot
+ * must not have a dirty bitmap.
+ */
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+        struct kvm *kvm = vcpu->kvm;
+        struct kvm_memory_slot *slot;
+        if (has_wrprotected_page(kvm, large_gfn) ||
+            !host_largepage_backed(kvm, large_gfn))
+                return 0;
+        slot = gfn_to_memslot(kvm, large_gfn);
+        return !(slot && slot->dirty_bitmap);
+}
+/*
 * Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function get called
 */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 {
        struct kvm_memory_slot *slot;
+        unsigned long idx;
        slot = gfn_to_memslot(kvm, gfn);
-        return &slot->rmap[gfn - slot->base_gfn];
+        if (!lpage)
+                return &slot->rmap[gfn - slot->base_gfn];
+        idx = (gfn / KVM_PAGES_PER_HPAGE) -
+              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+        return &slot->lpage_info[idx].rmap_pde;
 }
 /*
@@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
        struct kvm_mmu_page *sp;
        struct kvm_rmap_desc *desc;
@@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
        gfn = unalias_gfn(vcpu->kvm, gfn);
        sp = page_header(__pa(spte));
        sp->gfns[spte - sp->spt] = gfn;
-        rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+        rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
        if (!*rmapp) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                *rmapp = (unsigned long)spte;
@@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        struct kvm_mmu_page *sp;
-        struct page *page;
+        pfn_t pfn;
        unsigned long *rmapp;
        int i;
        if (!is_rmap_pte(*spte))
                return;
        sp = page_header(__pa(spte));
-        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+        pfn = spte_to_pfn(*spte);
-        mark_page_accessed(page);
+        if (*spte & PT_ACCESSED_MASK)
+                kvm_set_pfn_accessed(pfn);
        if (is_writeble_pte(*spte))
-                kvm_release_page_dirty(page);
+                kvm_release_pfn_dirty(pfn);
        else
-                kvm_release_page_clean(page);
+                kvm_release_pfn_clean(pfn);
-        rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+        rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
        if (!*rmapp) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
@@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
        int write_protected = 0;
        gfn = unalias_gfn(kvm, gfn);
-        rmapp = gfn_to_rmap(kvm, gfn);
+        rmapp = gfn_to_rmap(kvm, gfn, 0);
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
@@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
                }
                spte = rmap_next(kvm, rmapp, spte);
        }
+        if (write_protected) {
+                pfn_t pfn;
+                spte = rmap_next(kvm, rmapp, NULL);
+                pfn = spte_to_pfn(*spte);
+                kvm_set_pfn_dirty(pfn);
+        }
+        /* check for huge page mappings */
+        rmapp = gfn_to_rmap(kvm, gfn, 1);
+        spte = rmap_next(kvm, rmapp, NULL);
+        while (spte) {
+                BUG_ON(!spte);
+                BUG_ON(!(*spte & PT_PRESENT_MASK));
+                BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+                pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+                if (is_writeble_pte(*spte)) {
+                        rmap_remove(kvm, spte);
+                        --kvm->stat.lpages;
+                        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                        write_protected = 1;
+                }
+                spte = rmap_next(kvm, rmapp, spte);
+        }
        if (write_protected)
                kvm_flush_remote_tlbs(kvm);
+        account_shadowed(kvm, gfn);
 }
 #ifdef MMU_DEBUG
@@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt)
        u64 *end;
        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-                if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+                if (*pos != shadow_trap_nonpresent_pte) {
-                        printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                        printk(KERN_ERR "%s: %p %llx\n", __func__,
                               pos, *pos);
                        return 0;
                }
@@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-        return gfn;
+        return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
        struct kvm_mmu_page *sp;
        struct hlist_node *node;
-        pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        index = kvm_page_table_hashfn(gfn);
        bucket = &kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry(sp, node, bucket, hash_link)
-                if (sp->gfn == gfn && !sp->role.metaphysical) {
+                if (sp->gfn == gfn && !sp->role.metaphysical
+                    && !sp->role.invalid) {
                        pgprintk("%s: found role %x\n",
-                                 __FUNCTION__, sp->role.word);
+                                 __func__, sp->role.word);
                        return sp;
                }
        return NULL;
@@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-        pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+        pgprintk("%s: looking gfn %lx role %x\n", __func__,
                 gfn, role.word);
-        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry(sp, node, bucket, hash_link)
                if (sp->gfn == gfn && sp->role.word == role.word) {
                        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-                        pgprintk("%s: found\n", __FUNCTION__);
+                        pgprintk("%s: found\n", __func__);
                        return sp;
                }
        ++vcpu->kvm->stat.mmu_cache_miss;
        sp = kvm_mmu_alloc_page(vcpu, parent_pte);
        if (!sp)
                return sp;
-        pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+        pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
        sp->gfn = gfn;
        sp->role = role;
        hlist_add_head(&sp->hash_link, bucket);
-        vcpu->arch.mmu.prefetch_page(vcpu, sp);
        if (!metaphysical)
                rmap_write_protect(vcpu->kvm, gfn);
+        vcpu->arch.mmu.prefetch_page(vcpu, sp);
        return sp;
 }
@@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
                ent = pt[i];
+                if (is_shadow_present_pte(ent)) {
+                        if (!is_large_pte(ent)) {
+                                ent &= PT64_BASE_ADDR_MASK;
+                                mmu_page_remove_parent_pte(page_header(ent),
+                                                           &pt[i]);
+                        } else {
+                                --kvm->stat.lpages;
+                                rmap_remove(kvm, &pt[i]);
+                        }
+                }
                pt[i] = shadow_trap_nonpresent_pte;
-                if (!is_shadow_present_pte(ent))
-                        continue;
-                ent &= PT64_BASE_ADDR_MASK;
-                mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
        }
        kvm_flush_remote_tlbs(kvm);
 }
@@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
        }
        kvm_mmu_page_unlink_children(kvm, sp);
        if (!sp->root_count) {
+                if (!sp->role.metaphysical)
+                        unaccount_shadowed(kvm, sp->gfn);
                hlist_del(&sp->hash_link);
                kvm_mmu_free_page(kvm, sp);
-        } else
+        } else {
                list_move(&sp->link, &kvm->arch.active_mmu_pages);
+                sp->role.invalid = 1;
+                kvm_reload_remote_mmus(kvm);
+        }
        kvm_mmu_reset_last_pte_updated(kvm);
 }
@@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
        struct hlist_node *node, *n;
        int r;
-        pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
        r = 0;
-        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        index = kvm_page_table_hashfn(gfn);
        bucket = &kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
                if (sp->gfn == gfn && !sp->role.metaphysical) {
-                        pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                        pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
                                 sp->role.word);
                        kvm_mmu_zap_page(kvm, sp);
                        r = 1;
@@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
        struct kvm_mmu_page *sp;
        while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
-                pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+                pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
                kvm_mmu_zap_page(kvm, sp);
        }
 }
@@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
-                         int *ptwrite, gfn_t gfn, struct page *page)
+                         int *ptwrite, int largepage, gfn_t gfn,
+                         pfn_t pfn, bool speculative)
 {
        u64 spte;
        int was_rmapped = 0;
        int was_writeble = is_writeble_pte(*shadow_pte);
-        hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
        pgprintk("%s: spte %llx access %x write_fault %d"
                 " user_fault %d gfn %lx\n",
-                 __FUNCTION__, *shadow_pte, pt_access,
+                 __func__, *shadow_pte, pt_access,
                 write_fault, user_fault, gfn);
        if (is_rmap_pte(*shadow_pte)) {
-                if (host_pfn != page_to_pfn(page)) {
+                /*
+                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+                 * the parent of the now unreachable PTE.
+                 */
+                if (largepage && !is_large_pte(*shadow_pte)) {
+                        struct kvm_mmu_page *child;
+                        u64 pte = *shadow_pte;
+                        child = page_header(pte & PT64_BASE_ADDR_MASK);
+                        mmu_page_remove_parent_pte(child, shadow_pte);
+                } else if (pfn != spte_to_pfn(*shadow_pte)) {
                        pgprintk("hfn old %lx new %lx\n",
-                                 host_pfn, page_to_pfn(page));
+                                 spte_to_pfn(*shadow_pte), pfn);
                        rmap_remove(vcpu->kvm, shadow_pte);
+                } else {
+                        if (largepage)
+                                was_rmapped = is_large_pte(*shadow_pte);
+                        else
+                                was_rmapped = 1;
                }
-                else
-                        was_rmapped = 1;
        }
        /*
@@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
         * demand paging).
         */
        spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+        if (!speculative)
+                pte_access |= PT_ACCESSED_MASK;
        if (!dirty)
                pte_access &= ~ACC_WRITE_MASK;
        if (!(pte_access & ACC_EXEC_MASK))
@@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
        spte |= PT_PRESENT_MASK;
        if (pte_access & ACC_USER_MASK)
                spte |= PT_USER_MASK;
+        if (largepage)
+                spte |= PT_PAGE_SIZE_MASK;
-        if (is_error_page(page)) {
+        spte |= (u64)pfn << PAGE_SHIFT;
-                set_shadow_pte(shadow_pte,
-                               shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
-                kvm_release_page_clean(page);
-                return;
-        }
-        spte |= page_to_phys(page);
        if ((pte_access & ACC_WRITE_MASK)
            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                }
                shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-                if (shadow) {
+                if (shadow ||
+                   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
                        pgprintk("%s: found shadow page for %lx, marking ro\n",
-                                 __FUNCTION__, gfn);
+                                 __func__, gfn);
                        pte_access &= ~ACC_WRITE_MASK;
                        if (is_writeble_pte(spte)) {
                                spte &= ~PT_WRITABLE_MASK;
@@ -964,18 +1119,25 @@ unshadowed:
        if (pte_access & ACC_WRITE_MASK)
                mark_page_dirty(vcpu->kvm, gfn);
-        pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+        pgprintk("%s: setting spte %llx\n", __func__, spte);
+        pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+                 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+                 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
        set_shadow_pte(shadow_pte, spte);
+        if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+            && (spte & PT_PRESENT_MASK))
+                ++vcpu->kvm->stat.lpages;
        page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
        if (!was_rmapped) {
-                rmap_add(vcpu, shadow_pte, gfn);
+                rmap_add(vcpu, shadow_pte, gfn, largepage);
                if (!is_rmap_pte(*shadow_pte))
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
        } else {
                if (was_writeble)
-                        kvm_release_page_dirty(page);
+                        kvm_release_pfn_dirty(pfn);
                else
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
        }
        if (!ptwrite || !*ptwrite)
                vcpu->arch.last_pte_updated = shadow_pte;
@@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-                           gfn_t gfn, struct page *page)
+                           int largepage, gfn_t gfn, pfn_t pfn,
+                           int level)
 {
-        int level = PT32E_ROOT_LEVEL;
        hpa_t table_addr = vcpu->arch.mmu.root_hpa;
        int pt_write = 0;
@@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
                if (level == 1) {
                        mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                     0, write, 1, &pt_write, gfn, page);
+                                     0, write, 1, &pt_write, 0, gfn, pfn, false);
-                        return pt_write || is_io_pte(table[index]);
+                        return pt_write;
+                }
+                if (largepage && level == 2) {
+                        mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+                                     0, write, 1, &pt_write, 1, gfn, pfn, false);
+                        return pt_write;
                }
                if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
                                                     1, ACC_ALL, &table[index]);
                        if (!new_table) {
                                pgprintk("nonpaging_map: ENOMEM\n");
-                                kvm_release_page_clean(page);
+                                kvm_release_pfn_clean(pfn);
                                return -ENOMEM;
                        }
@@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
        int r;
+        int largepage = 0;
-        struct page *page;
+        pfn_t pfn;
-        down_read(&vcpu->kvm->slots_lock);
        down_read(&current->mm->mmap_sem);
-        page = gfn_to_page(vcpu->kvm, gfn);
+        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+                largepage = 1;
+        }
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
+        /* mmio */
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
+                return 1;
+        }
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
-        r = __nonpaging_map(vcpu, v, write, gfn, page);
+        r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
+                         PT32E_ROOT_LEVEL);
        spin_unlock(&vcpu->kvm->mmu_lock);
-        up_read(&vcpu->kvm->slots_lock);
        return r;
 }
@@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                sp = page_header(root);
                --sp->root_count;
+                if (!sp->root_count && sp->role.invalid)
+                        kvm_mmu_zap_page(vcpu->kvm, sp);
                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                spin_unlock(&vcpu->kvm->mmu_lock);
                return;
@@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                        root &= PT64_BASE_ADDR_MASK;
                        sp = page_header(root);
                        --sp->root_count;
+                        if (!sp->root_count && sp->role.invalid)
+                                kvm_mmu_zap_page(vcpu->kvm, sp);
                }
                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
        }
@@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
        int i;
        gfn_t root_gfn;
        struct kvm_mmu_page *sp;
+        int metaphysical = 0;
        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                hpa_t root = vcpu->arch.mmu.root_hpa;
                ASSERT(!VALID_PAGE(root));
+                if (tdp_enabled)
+                        metaphysical = 1;
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                                      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
+                                      PT64_ROOT_LEVEL, metaphysical,
+                                      ACC_ALL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
                vcpu->arch.mmu.root_hpa = root;
                return;
        }
 #endif
+        metaphysical = !is_paging(vcpu);
+        if (tdp_enabled)
+                metaphysical = 1;
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                } else if (vcpu->arch.mmu.root_level == 0)
                        root_gfn = 0;
                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-                                      PT32_ROOT_LEVEL, !is_paging(vcpu),
+                                      PT32_ROOT_LEVEL, metaphysical,
                                      ACC_ALL, NULL);
                root = __pa(sp->spt);
                ++sp->root_count;
@@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        gfn_t gfn;
        int r;
-        pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
@@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                             error_code & PFERR_WRITE_MASK, gfn);
 }
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+                                u32 error_code)
+{
+        pfn_t pfn;
+        int r;
+        int largepage = 0;
+        gfn_t gfn = gpa >> PAGE_SHIFT;
+        ASSERT(vcpu);
+        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+        r = mmu_topup_memory_caches(vcpu);
+        if (r)
+                return r;
+        down_read(&current->mm->mmap_sem);
+        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+                largepage = 1;
+        }
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
+        up_read(&current->mm->mmap_sem);
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
+                return 1;
+        }
+        spin_lock(&vcpu->kvm->mmu_lock);
+        kvm_mmu_free_some_pages(vcpu);
+        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+                         largepage, gfn, pfn, TDP_ROOT_LEVEL);
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        return r;
+}
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
        mmu_free_roots(vcpu);
@@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
-        pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+        pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
        mmu_free_roots(vcpu);
 }
@@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+        struct kvm_mmu *context = &vcpu->arch.mmu;
+        context->new_cr3 = nonpaging_new_cr3;
+        context->page_fault = tdp_page_fault;
+        context->free = nonpaging_free;
+        context->prefetch_page = nonpaging_prefetch_page;
+        context->shadow_root_level = TDP_ROOT_LEVEL;
+        context->root_hpa = INVALID_PAGE;
+        if (!is_paging(vcpu)) {
+                context->gva_to_gpa = nonpaging_gva_to_gpa;
+                context->root_level = 0;
+        } else if (is_long_mode(vcpu)) {
+                context->gva_to_gpa = paging64_gva_to_gpa;
+                context->root_level = PT64_ROOT_LEVEL;
+        } else if (is_pae(vcpu)) {
+                context->gva_to_gpa = paging64_gva_to_gpa;
+                context->root_level = PT32E_ROOT_LEVEL;
+        } else {
+                context->gva_to_gpa = paging32_gva_to_gpa;
+                context->root_level = PT32_ROOT_LEVEL;
+        }
+        return 0;
+}
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
                return paging32_init_context(vcpu);
 }
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+        vcpu->arch.update_pte.pfn = bad_pfn;
+        if (tdp_enabled)
+                return init_kvm_tdp_mmu(vcpu);
+        else
+                return init_kvm_softmmu(vcpu);
+}
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
@@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
        pte = *spte;
        if (is_shadow_present_pte(pte)) {
-                if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+                if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+                    is_large_pte(pte))
                        rmap_remove(vcpu->kvm, spte);
                else {
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
                }
        }
        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+        if (is_large_pte(pte))
+                --vcpu->kvm->stat.lpages;
 }
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp,
                                  u64 *spte,
-                                  const void *new, int bytes,
+                                  const void *new)
-                                  int offset_in_pte)
 {
-        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+        if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+            && !vcpu->arch.update_pte.largepage) {
                ++vcpu->kvm->stat.mmu_pde_zapped;
                return;
        }
        ++vcpu->kvm->stat.mmu_pte_updated;
        if (sp->role.glevels == PT32_ROOT_LEVEL)
-                paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+                paging32_update_pte(vcpu, sp, spte, new);
        else
-                paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+                paging64_update_pte(vcpu, sp, spte, new);
 }
 static bool need_remote_flush(u64 old, u64 new)
@@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        gfn_t gfn;
        int r;
        u64 gpte = 0;
-        struct page *page;
+        pfn_t pfn;
+        vcpu->arch.update_pte.largepage = 0;
        if (bytes != 4 && bytes != 8)
                return;
@@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
        down_read(&current->mm->mmap_sem);
-        page = gfn_to_page(vcpu->kvm, gfn);
+        if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+                vcpu->arch.update_pte.largepage = 1;
+        }
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
+                return;
+        }
        vcpu->arch.update_pte.gfn = gfn;
-        vcpu->arch.update_pte.page = page;
+        vcpu->arch.update_pte.pfn = pfn;
 }
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        struct hlist_node *node, *n;
        struct hlist_head *bucket;
        unsigned index;
-        u64 entry;
+        u64 entry, gentry;
        u64 *spte;
        unsigned offset = offset_in_page(gpa);
        unsigned pte_size;
@@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int level;
        int flooded = 0;
        int npte;
+        int r;
-        pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
        mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
@@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                vcpu->arch.last_pt_write_count = 1;
                vcpu->arch.last_pte_updated = NULL;
        }
-        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
                if (sp->gfn != gfn || sp->role.metaphysical)
@@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                                continue;
                }
                spte = &sp->spt[page_offset / sizeof(*spte)];
+                if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+                        gentry = 0;
+                        r = kvm_read_guest_atomic(vcpu->kvm,
+                                                  gpa & ~(u64)(pte_size - 1),
+                                                  &gentry, pte_size);
+                        new = (const void *)&gentry;
+                        if (r < 0)
+                                new = NULL;
+                }
                while (npte--) {
                        entry = *spte;
                        mmu_pte_write_zap_pte(vcpu, sp, spte);
-                        mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+                        if (new)
-                                              page_offset & (pte_size - 1));
+                                mmu_pte_write_new_pte(vcpu, sp, spte, new);
                        mmu_pte_write_flush_tlb(vcpu, entry, *spte);
                        ++spte;
                }
        }
        kvm_mmu_audit(vcpu, "post pte write");
        spin_unlock(&vcpu->kvm->mmu_lock);
-        if (vcpu->arch.update_pte.page) {
+        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
-                kvm_release_page_clean(vcpu->arch.update_pte.page);
+                kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
-                vcpu->arch.update_pte.page = NULL;
+                vcpu->arch.update_pte.pfn = bad_pfn;
        }
 }
@@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
        gpa_t gpa;
        int r;
-        down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-        up_read(&vcpu->kvm->slots_lock);
        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -1577,6 +1859,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_enable_tdp(void)
+{
+        tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu_page *sp;
@@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm)
        kvm_flush_remote_tlbs(kvm);
 }
-void kvm_mmu_module_exit(void)
+void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+{
+        struct kvm_mmu_page *page;
+        page = container_of(kvm->arch.active_mmu_pages.prev,
+                            struct kvm_mmu_page, link);
+        kvm_mmu_zap_page(kvm, page);
+}
+static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+{
+        struct kvm *kvm;
+        struct kvm *kvm_freed = NULL;
+        int cache_count = 0;
+        spin_lock(&kvm_lock);
+        list_for_each_entry(kvm, &vm_list, vm_list) {
+                int npages;
+                spin_lock(&kvm->mmu_lock);
+                npages = kvm->arch.n_alloc_mmu_pages -
+                         kvm->arch.n_free_mmu_pages;
+                cache_count += npages;
+                if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+                        kvm_mmu_remove_one_alloc_mmu_page(kvm);
+                        cache_count--;
+                        kvm_freed = kvm;
+                }
+                nr_to_scan--;
+                spin_unlock(&kvm->mmu_lock);
+        }
+        if (kvm_freed)
+                list_move_tail(&kvm_freed->vm_list, &vm_list);
+        spin_unlock(&kvm_lock);
+        return cache_count;
+}
+static struct shrinker mmu_shrinker = {
+        .shrink = mmu_shrink,
+        .seeks = DEFAULT_SEEKS * 10,
+};
+void mmu_destroy_caches(void)
 {
        if (pte_chain_cache)
                kmem_cache_destroy(pte_chain_cache);
@@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void)
                kmem_cache_destroy(mmu_page_header_cache);
 }
+void kvm_mmu_module_exit(void)
+{
+        mmu_destroy_caches();
+        unregister_shrinker(&mmu_shrinker);
+}
 int kvm_mmu_module_init(void)
 {
        pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void)
        if (!mmu_page_header_cache)
                goto nomem;
+        register_shrinker(&mmu_shrinker);
        return 0;
 nomem:
-        kvm_mmu_module_exit();
+        mmu_destroy_caches();
        return -ENOMEM;
 }
@@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
        return nr_mmu_pages;
 }
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+                                unsigned len)
+{
+        if (len > buffer->len)
+                return NULL;
+        return buffer->ptr;
+}
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+                                unsigned len)
+{
+        void *ret;
+        ret = pv_mmu_peek_buffer(buffer, len);
+        if (!ret)
+                return ret;
+        buffer->ptr += len;
+        buffer->len -= len;
+        buffer->processed += len;
+        return ret;
+}
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+                             gpa_t addr, gpa_t value)
+{
+        int bytes = 8;
+        int r;
+        if (!is_long_mode(vcpu) && !is_pae(vcpu))
+                bytes = 4;
+        r = mmu_topup_memory_caches(vcpu);
+        if (r)
+                return r;
+        if (!emulator_write_phys(vcpu, addr, &value, bytes))
+                return -EFAULT;
+        return 1;
+}
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+        kvm_x86_ops->tlb_flush(vcpu);
+        return 1;
+}
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+        spin_lock(&vcpu->kvm->mmu_lock);
+        mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        return 1;
+}
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+                             struct kvm_pv_mmu_op_buffer *buffer)
+{
+        struct kvm_mmu_op_header *header;
+        header = pv_mmu_peek_buffer(buffer, sizeof *header);
+        if (!header)
+                return 0;
+        switch (header->op) {
+        case KVM_MMU_OP_WRITE_PTE: {
+                struct kvm_mmu_op_write_pte *wpte;
+                wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+                if (!wpte)
+                        return 0;
+                return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+                                        wpte->pte_val);
+        }
+        case KVM_MMU_OP_FLUSH_TLB: {
+                struct kvm_mmu_op_flush_tlb *ftlb;
+                ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+                if (!ftlb)
+                        return 0;
+                return kvm_pv_mmu_flush_tlb(vcpu);
+        }
+        case KVM_MMU_OP_RELEASE_PT: {
+                struct kvm_mmu_op_release_pt *rpt;
+                rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+                if (!rpt)
+                        return 0;
+                return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+        }
+        default: return 0;
+        }
+}
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+                  gpa_t addr, unsigned long *ret)
+{
+        int r;
+        struct kvm_pv_mmu_op_buffer buffer;
+        buffer.ptr = buffer.buf;
+        buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
+        buffer.processed = 0;
+        r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+        if (r)
+                goto out;
+        while (buffer.len) {
+                r = kvm_pv_mmu_op_one(vcpu, &buffer);
+                if (r < 0)
+                        goto out;
+                if (r == 0)
+                        break;
+        }
+        r = 1;
+out:
+        *ret = buffer.processed;
+        return r;
+}
 #ifdef AUDIT
 static const char *audit_msg;
@@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
                        audit_mappings_page(vcpu, ent, va, level - 1);
                } else {
                        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-                        struct page *page = gpa_to_page(vcpu, gpa);
+                        hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
-                        hpa_t hpa = page_to_phys(page);
                        if (is_shadow_present_pte(ent)
                            && (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
                                 && !is_error_hpa(hpa))
                                printk(KERN_ERR "audit: (%s) notrap shadow,"
                                       " valid guest gva %lx\n", audit_msg, va);
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
                }
        }
@@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
        if (n_rmap != n_actual)
                printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-                       __FUNCTION__, audit_msg, n_rmap, n_actual);
+                       __func__, audit_msg, n_rmap, n_actual);
 }
 static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
                if (*rmapp)
                        printk(KERN_ERR "%s: (%s) shadow page has writable"
                               " mappings: gfn %lx role %x\n",
-                               __FUNCTION__, audit_msg, sp->gfn,
+                               __func__, audit_msg, sp->gfn,
                               sp->role.word);
        }
 }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fce19ec7a2..e64e9f56a65 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,12 @@
 #include <linux/kvm_host.h>
+#ifdef CONFIG_X86_64
+#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
+#else
+#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
+#endif
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
        if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ecc0856268c..156fe10288a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
        unsigned index, pt_access, pte_access;
        gpa_t pte_gpa;
-        pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+        pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
        walker->level = vcpu->arch.mmu.root_level;
        pte = vcpu->arch.cr3;
@@ -155,7 +155,7 @@ walk:
                pte_gpa += index * sizeof(pt_element_t);
                walker->table_gfn[walker->level - 1] = table_gfn;
                walker->pte_gpa[walker->level - 1] = pte_gpa;
-                pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                pgprintk("%s: table_gfn[%d] %lx\n", __func__,
                         walker->level - 1, table_gfn);
                kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
@@ -222,7 +222,7 @@ walk:
        walker->pt_access = pt_access;
        walker->pte_access = pte_access;
        pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-                 __FUNCTION__, (u64)pte, pt_access, pte_access);
+                 __func__, (u64)pte, pt_access, pte_access);
        return 1;
 not_present:
@@ -243,31 +243,30 @@ err:
 }
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-                              u64 *spte, const void *pte, int bytes,
+                              u64 *spte, const void *pte)
-                              int offset_in_pte)
 {
        pt_element_t gpte;
        unsigned pte_access;
-        struct page *npage;
+        pfn_t pfn;
+        int largepage = vcpu->arch.update_pte.largepage;
        gpte = *(const pt_element_t *)pte;
        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-                if (!offset_in_pte && !is_present_pte(gpte))
+                if (!is_present_pte(gpte))
                        set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
                return;
        }
-        if (bytes < sizeof(pt_element_t))
+        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-                return;
-        pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
        pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
        if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
                return;
-        npage = vcpu->arch.update_pte.page;
+        pfn = vcpu->arch.update_pte.pfn;
-        if (!npage)
+        if (is_error_pfn(pfn))
                return;
-        get_page(npage);
+        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+                     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+                     pfn, true);
 }
 /*
@@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *walker,
-                         int user_fault, int write_fault, int *ptwrite,
+                         int user_fault, int write_fault, int largepage,
-                         struct page *page)
+                         int *ptwrite, pfn_t pfn)
 {
        hpa_t shadow_addr;
        int level;
@@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                shadow_ent = ((u64 *)__va(shadow_addr)) + index;
                if (level == PT_PAGE_TABLE_LEVEL)
                        break;
-                if (is_shadow_present_pte(*shadow_ent)) {
+                if (largepage && level == PT_DIRECTORY_LEVEL)
+                        break;
+                if (is_shadow_present_pte(*shadow_ent)
+                    && !is_large_pte(*shadow_ent)) {
                        shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
                        continue;
                }
+                if (is_large_pte(*shadow_ent))
+                        rmap_remove(vcpu->kvm, shadow_ent);
                if (level - 1 == PT_PAGE_TABLE_LEVEL
                    && walker->level == PT_DIRECTORY_LEVEL) {
                        metaphysical = 1;
@@ -329,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                                                  walker->pte_gpa[level - 2],
                                                  &curr_pte, sizeof(curr_pte));
                        if (r || curr_pte != walker->ptes[level - 2]) {
-                                kvm_release_page_clean(page);
+                                kvm_release_pfn_clean(pfn);
                                return NULL;
                        }
                }
@@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
                     user_fault, write_fault,
                     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                     ptwrite, walker->gfn, page);
+                     ptwrite, largepage, walker->gfn, pfn, false);
        return shadow_ent;
 }
@@ -371,16 +378,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        u64 *shadow_pte;
        int write_pt = 0;
        int r;
-        struct page *page;
+        pfn_t pfn;
+        int largepage = 0;
-        pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
        kvm_mmu_audit(vcpu, "pre page fault");
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
-        down_read(&vcpu->kvm->slots_lock);
        /*
         * Look up the shadow pte for the faulting address.
         */
@@ -391,40 +398,45 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!r) {
-                pgprintk("%s: guest page fault\n", __FUNCTION__);
+                pgprintk("%s: guest page fault\n", __func__);
                inject_page_fault(vcpu, addr, walker.error_code);
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-                up_read(&vcpu->kvm->slots_lock);
                return 0;
        }
        down_read(&current->mm->mmap_sem);
-        page = gfn_to_page(vcpu->kvm, walker.gfn);
+        if (walker.level == PT_DIRECTORY_LEVEL) {
+                gfn_t large_gfn;
+                large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+                if (is_largepage_backed(vcpu, large_gfn)) {
+                        walker.gfn = large_gfn;
+                        largepage = 1;
+                }
+        }
+        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
        up_read(&current->mm->mmap_sem);
+        /* mmio */
+        if (is_error_pfn(pfn)) {
+                pgprintk("gfn %x is mmio\n", walker.gfn);
+                kvm_release_pfn_clean(pfn);
+                return 1;
+        }
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
        shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                  &write_pt, page);
+                                  largepage, &write_pt, pfn);
-        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                 shadow_pte, *shadow_pte, write_pt);
        if (!write_pt)
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-        /*
-         * mmio: emulate if accessible, otherwise its a guest fault.
-         */
-        if (shadow_pte && is_io_pte(*shadow_pte)) {
-                spin_unlock(&vcpu->kvm->mmu_lock);
-                up_read(&vcpu->kvm->slots_lock);
-                return 1;
-        }
        ++vcpu->stat.pf_fixed;
        kvm_mmu_audit(vcpu, "post page fault (fixed)");
        spin_unlock(&vcpu->kvm->mmu_lock);
-        up_read(&vcpu->kvm->slots_lock);
        return write_pt;
 }
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c87338..00000000000
--- a/arch/x86/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __SEGMENT_DESCRIPTOR_H
-#define __SEGMENT_DESCRIPTOR_H
-struct segment_descriptor {
-        u16 limit_low;
-        u16 base_low;
-        u8  base_mid;
-        u8  type : 4;
-        u8  system : 1;
-        u8  dpl : 2;
-        u8  present : 1;
-        u8  limit_high : 4;
-        u8  avl : 1;
-        u8  long_mode : 1;
-        u8  default_op : 1;
-        u8  granularity : 1;
-        u8  base_high;
-} __attribute__((packed));
-#ifdef CONFIG_X86_64
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct segment_descriptor_64 {
-        struct segment_descriptor s;
-        u32 base_higher;
-        u32 pad_zero;
-};
-#endif
-#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a582f1090e..89e0be2c10d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,18 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
+/*
+ * Every DEBUGCTL bit above the low six (0x3f).  svm_set_msr() refuses a
+ * write to MSR_IA32_DEBUGCTLMSR that sets any of these bits.
+ */
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+/*
+ * enable NPT for AMD64 and X86 with PAE
+ *
+ * This is only the build-time default: svm_hardware_setup() clears
+ * npt_enabled again when the CPU does not report SVM_FEATURE_NPT or when
+ * the "npt" module parameter below is 0.
+ */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled = false;
+#endif
+/* Module parameter: load with npt=0 to turn nested paging off (readable by all). */
+static int npt = 1;
+module_param(npt, int, S_IRUGO);
 static void kvm_reput_irq(struct vcpu_svm *svm);
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_svm, vcpu);
 }
-unsigned long iopm_base;
+static unsigned long iopm_base;
-unsigned long msrpm_base;
 struct kvm_ldttss_desc {
        u16 limit0;
@@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-        if (!(efer & EFER_LMA))
+        if (!npt_enabled && !(efer & EFER_LMA))
                efer &= ~EFER_LME;
        to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
@@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        if (!svm->next_rip) {
-                printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+                printk(KERN_DEBUG "%s: NOP\n", __func__);
                return;
        }
        if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
                printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-                       __FUNCTION__,
+                       __func__,
                       svm->vmcb->save.rip,
                       svm->next_rip);
@@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage)
        struct svm_cpu_data *svm_data;
        uint64_t efer;
-#ifdef CONFIG_X86_64
-        struct desc_ptr gdt_descr;
-#else
        struct desc_ptr gdt_descr;
-#endif
        struct desc_struct *gdt;
        int me = raw_smp_processor_id();
@@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage)
        svm_data->asid_generation = 1;
        svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
        svm_data->next_asid = svm_data->max_asid + 1;
-        svm_features = cpuid_edx(SVM_CPUID_FUNC);
        asm volatile ("sgdt %0" : "=m"(gdt_descr));
        gdt = (struct desc_struct *)gdt_descr.address;
@@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
        BUG();
 }
+/*
+ * Initialise an MSR permission map.
+ *
+ * @msrpm: start of the map; PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) bytes are
+ *         written, so the caller must pass a buffer of at least that size.
+ *
+ * The whole map is first filled with 0xff, then set_msr_interception() is
+ * called with both trailing flags set to 1 for a fixed list of MSRs.
+ * NOTE(review): the meaning of those two flags is defined by
+ * set_msr_interception(), whose body is not visible here; the pattern
+ * (all-ones default, then exceptions for frequently accessed MSRs) suggests
+ * they grant the guest direct read and write access - confirm.
+ */
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+#ifdef CONFIG_X86_64
+        /* MSRs that are only handled on 64-bit builds */
+        set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
+        set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
+        set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
+        set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
+        set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
+        set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
+#endif
+        set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+}
+/*
+ * Turn on last-branch-record virtualization for one vcpu.
+ *
+ * Sets lbr_ctl in the VMCB control area to 1 and calls
+ * set_msr_interception() with both flags set to 1 for the four
+ * last-branch / last-interrupt MSRs in this vcpu's own permission map
+ * (svm->msrpm).  Counterpart of svm_disable_lbrv().
+ * NOTE(review): presumably flags of 1 let the guest access these MSRs
+ * without an intercept - confirm against set_msr_interception().
+ */
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+        u32 *msrpm = svm->msrpm;
+        svm->vmcb->control.lbr_ctl = 1;
+        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+/*
+ * Turn off last-branch-record virtualization for one vcpu.
+ *
+ * Mirror image of svm_enable_lbrv(): clears lbr_ctl in the VMCB control
+ * area and calls set_msr_interception() with both flags set to 0 for the
+ * same four MSRs in svm->msrpm.
+ */
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+        u32 *msrpm = svm->msrpm;
+        svm->vmcb->control.lbr_ctl = 0;
+        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
 static __init int svm_hardware_setup(void)
 {
        int cpu;
        struct page *iopm_pages;
-        struct page *msrpm_pages;
+        void *iopm_va;
-        void *iopm_va, *msrpm_va;
        int r;
        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
@@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void)
        clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+        if (boot_cpu_has(X86_FEATURE_NX))
+                kvm_enable_efer_bits(EFER_NX);
-        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+        for_each_online_cpu(cpu) {
+                r = svm_cpu_init(cpu);
+                if (r)
+                        goto err;
+        }
-        r = -ENOMEM;
+        svm_features = cpuid_edx(SVM_CPUID_FUNC);
-        if (!msrpm_pages)
-                goto err_1;
-        msrpm_va = page_address(msrpm_pages);
+        if (!svm_has(SVM_FEATURE_NPT))
-        memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+                npt_enabled = false;
-        msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
-#ifdef CONFIG_X86_64
+        if (npt_enabled && !npt) {
-        set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
+                printk(KERN_INFO "kvm: Nested Paging disabled\n");
-        set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
+                npt_enabled = false;
-        set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
+        }
-        set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
-        set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
-        set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
-        set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
-        set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
-        set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
-        set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
-        for_each_online_cpu(cpu) {
+        if (npt_enabled) {
-                r = svm_cpu_init(cpu);
+                printk(KERN_INFO "kvm: Nested Paging enabled\n");
-                if (r)
+                kvm_enable_tdp();
-                        goto err_2;
        }
        return 0;
-err_2:
+err:
-        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-        msrpm_base = 0;
-err_1:
        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
        iopm_base = 0;
        return r;
@@ -421,9 +458,8 @@ err_1:
 static __exit void svm_hardware_unsetup(void)
 {
-        __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
-        iopm_base = msrpm_base = 0;
+        iopm_base = 0;
 }
 static void init_seg(struct vmcb_seg *seg)
@@ -443,15 +479,14 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
-static void init_vmcb(struct vmcb *vmcb)
+static void init_vmcb(struct vcpu_svm *svm)
 {
-        struct vmcb_control_area *control = &vmcb->control;
+        struct vmcb_control_area *control = &svm->vmcb->control;
-        struct vmcb_save_area *save = &vmcb->save;
+        struct vmcb_save_area *save = &svm->vmcb->save;
        control->intercept_cr_read =    INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
-                                        INTERCEPT_CR4_MASK |
+                                        INTERCEPT_CR4_MASK;
-                                        INTERCEPT_CR8_MASK;
        control->intercept_cr_write =   INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
@@ -471,23 +506,13 @@ static void init_vmcb(struct vmcb *vmcb)
                                        INTERCEPT_DR7_MASK;
        control->intercept_exceptions = (1 << PF_VECTOR) |
-                                        (1 << UD_VECTOR);
+                                        (1 << UD_VECTOR) |
+                                        (1 << MC_VECTOR);
        control->intercept =    (1ULL << INTERCEPT_INTR) |
                                (1ULL << INTERCEPT_NMI) |
                                (1ULL << INTERCEPT_SMI) |
-                /*
-                 * selective cr0 intercept bug?
-                 *      0:   0f 22 d8                mov    %eax,%cr3
-                 *      3:   0f 20 c0                mov    %cr0,%eax
-                 *      6:   0d 00 00 00 80          or     $0x80000000,%eax
-                 *      b:   0f 22 c0                mov    %eax,%cr0
-                 * set cr3 ->interception
-                 * get cr0 ->interception
-                 * set cr0 -> no interception
-                 */
-                /*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
                                (1ULL << INTERCEPT_CPUID) |
                                (1ULL << INTERCEPT_INVD) |
                                (1ULL << INTERCEPT_HLT) |
@@ -508,7 +533,7 @@ static void init_vmcb(struct vmcb *vmcb)
                                (1ULL << INTERCEPT_MWAIT);
        control->iopm_base_pa = iopm_base;
-        control->msrpm_base_pa = msrpm_base;
+        control->msrpm_base_pa = __pa(svm->msrpm);
        control->tsc_offset = 0;
        control->int_ctl = V_INTR_MASKING_MASK;
@@ -550,13 +575,30 @@ static void init_vmcb(struct vmcb *vmcb)
        save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
+        if (npt_enabled) {
+                /* Setup VMCB for Nested Paging */
+                control->nested_ctl = 1;
+                control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+                control->intercept_exceptions &= ~(1 << PF_VECTOR);
+                control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
+                                                INTERCEPT_CR3_MASK);
+                control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
+                                                 INTERCEPT_CR3_MASK);
+                save->g_pat = 0x0007040600070406ULL;
+                /* enable caching because the QEMU Bios doesn't enable it */
+                save->cr0 = X86_CR0_ET;
+                save->cr3 = 0;
+                save->cr4 = 0;
+        }
+        force_new_asid(&svm->vcpu);
 }
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-        init_vmcb(svm->vmcb);
+        init_vmcb(svm);
        if (vcpu->vcpu_id != 0) {
                svm->vmcb->save.rip = 0;
@@ -571,6 +613,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 {
        struct vcpu_svm *svm;
        struct page *page;
+        struct page *msrpm_pages;
        int err;
        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -589,12 +632,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
                goto uninit;
        }
+        err = -ENOMEM;
+        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+        if (!msrpm_pages)
+                goto uninit;
+        svm->msrpm = page_address(msrpm_pages);
+        svm_vcpu_init_msrpm(svm->msrpm);
        svm->vmcb = page_address(page);
        clear_page(svm->vmcb);
        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
        svm->asid_generation = 0;
        memset(svm->db_regs, 0, sizeof(svm->db_regs));
-        init_vmcb(svm->vmcb);
+        init_vmcb(svm);
        fx_init(&svm->vcpu);
        svm->vcpu.fpu_active = 1;
@@ -617,6 +667,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -731,6 +782,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
        var->unusable = !var->present;
 }
+/*
+ * Return the cpl field of the vcpu's VMCB save area.
+ * Installed as the .get_cpl callback in svm_x86_ops.
+ */
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+        return save->cpl;
+}
 static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -784,6 +842,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                }
        }
 #endif
+        if (npt_enabled)
+                goto set;
        if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
                svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
                vcpu->fpu_active = 1;
@@ -791,18 +852,29 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vcpu->arch.cr0 = cr0;
        cr0 |= X86_CR0_PG | X86_CR0_WP;
-        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        if (!vcpu->fpu_active) {
                svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
                cr0 |= X86_CR0_TS;
        }
+set:
+        /*
+         * re-enable caching here because the QEMU bios
+         * does not do it - this results in some delay at
+         * reboot
+         */
+        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
 }
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       vcpu->arch.cr4 = cr4;
+        unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
-       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+        vcpu->arch.cr4 = cr4;
+        if (!npt_enabled)
+                cr4 |= X86_CR4_PAE;
+        cr4 |= host_cr4_mce;
+        to_svm(vcpu)->vmcb->save.cr4 = cr4;
 }
 static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -833,13 +905,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 }
-/* FIXME:
-        svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
-        svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-*/
 static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 {
        return -EOPNOTSUPP;
@@ -920,7 +985,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
        }
        default:
                printk(KERN_DEBUG "%s: unexpected dr %u\n",
-                       __FUNCTION__, dr);
+                       __func__, dr);
                *exception = UD_VECTOR;
                return;
        }
@@ -962,6 +1027,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
+/*
+ * Exit handler registered for SVM_EXIT_EXCP_BASE + MC_VECTOR.
+ *
+ * Neither @svm nor @kvm_run is used; the handler only raises software
+ * interrupt 0x12 (vector 18) on the host and then returns 1.
+ */
+static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+        /*
+         * On an #MC intercept the MCE handler is not called automatically in
+         * the host. So do it by hand here.
+         */
+        asm volatile (
+                "int $0x12\n");
+        /* not sure if we ever come back to this point */
+        return 1;
+}
 static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        /*
@@ -969,7 +1047,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-        init_vmcb(svm->vmcb);
+        init_vmcb(svm);
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@ -1033,9 +1111,18 @@ static int invalid_op_interception(struct vcpu_svm *svm,
 static int task_switch_interception(struct vcpu_svm *svm,
                                    struct kvm_run *kvm_run)
 {
-        pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
+        u16 tss_selector;
-        kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-        return 0;
+        tss_selector = (u16)svm->vmcb->control.exit_info_1;
+        if (svm->vmcb->control.exit_info_2 &
+            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+                return kvm_task_switch(&svm->vcpu, tss_selector,
+                                       TASK_SWITCH_IRET);
+        if (svm->vmcb->control.exit_info_2 &
+            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+                return kvm_task_switch(&svm->vcpu, tss_selector,
+                                       TASK_SWITCH_JMP);
+        return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
 }
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1049,7 +1136,7 @@ static int emulate_on_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
        if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
-                pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+                pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
        return 1;
 }
@@ -1179,8 +1266,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
                svm->vmcb->save.sysenter_esp = data;
                break;
        case MSR_IA32_DEBUGCTLMSR:
-                pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+                if (!svm_has(SVM_FEATURE_LBRV)) {
-                                __FUNCTION__, data);
+                        pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+                                        __func__, data);
+                        break;
+                }
+                if (data & DEBUGCTL_RESERVED_BITS)
+                        return 1;
+                svm->vmcb->save.dbgctl = data;
+                if (data & (1ULL<<0))
+                        svm_enable_lbrv(svm);
+                else
+                        svm_disable_lbrv(svm);
                break;
        case MSR_K7_EVNTSEL0:
        case MSR_K7_EVNTSEL1:
@@ -1265,6 +1363,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
+        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
        [SVM_EXIT_INTR]                         = nop_on_interception,
        [SVM_EXIT_NMI]                          = nop_on_interception,
        [SVM_EXIT_SMI]                          = nop_on_interception,
@@ -1290,14 +1389,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
        [SVM_EXIT_MONITOR]                      = invalid_op_interception,
        [SVM_EXIT_MWAIT]                        = invalid_op_interception,
+        [SVM_EXIT_NPF]                          = pf_interception,
 };
 static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 exit_code = svm->vmcb->control.exit_code;
+        if (npt_enabled) {
+                int mmu_reload = 0;
+                if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
+                        svm_set_cr0(vcpu, svm->vmcb->save.cr0);
+                        mmu_reload = 1;
+                }
+                vcpu->arch.cr0 = svm->vmcb->save.cr0;
+                vcpu->arch.cr3 = svm->vmcb->save.cr3;
+                if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+                        if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+                                kvm_inject_gp(vcpu, 0);
+                                return 1;
+                        }
+                }
+                if (mmu_reload) {
+                        kvm_mmu_reset_context(vcpu);
+                        kvm_mmu_load(vcpu);
+                }
+        }
        kvm_reput_irq(svm);
        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
@@ -1308,10 +1427,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        }
        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
-            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+            exit_code != SVM_EXIT_NPF)
                printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
                       "exit_code 0x%x\n",
-                       __FUNCTION__, svm->vmcb->control.exit_int_info,
+                       __func__, svm->vmcb->control.exit_int_info,
                       exit_code);
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
@@ -1364,6 +1484,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
        svm_inject_irq(svm, irq);
 }
+/*
+ * Decide whether guest writes to CR8 must be intercepted.
+ *
+ * Does nothing when the irqchip is not in the kernel or when the local
+ * APIC has a vapic_addr set.  Otherwise the CR8 write intercept is cleared
+ * and then set again only if an interrupt is pending in the IRR whose
+ * priority class (vector & 0xf0) does not exceed the current TPR.
+ * Called from svm_intr_assist() on every exit path.
+ */
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_svm *svm = to_svm(vcpu);
+        struct vmcb *vmcb = svm->vmcb;
+        int max_irr, tpr;
+        if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
+                return;
+        /* start from "no intercept"; re-enable below only if needed */
+        vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+        max_irr = kvm_lapic_find_highest_irr(vcpu);
+        /* -1: nothing pending, leave the intercept off */
+        if (max_irr == -1)
+                return;
+        /* shift the 4-bit CR8 value into the upper nibble for the comparison */
+        tpr = kvm_lapic_get_cr8(vcpu) << 4;
+        if (tpr >= (max_irr & 0xf0))
+                vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+}
 static void svm_intr_assist(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1376,14 +1517,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
                              SVM_EVTINJ_VEC_MASK;
                vmcb->control.exit_int_info = 0;
                svm_inject_irq(svm, intr_vector);
-                return;
+                goto out;
        }
        if (vmcb->control.int_ctl & V_IRQ_MASK)
-                return;
+                goto out;
        if (!kvm_cpu_has_interrupt(vcpu))
-                return;
+                goto out;
        if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
@@ -1391,12 +1532,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
                /* unable to deliver irq, set pending irq */
                vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
                svm_inject_irq(svm, 0x0);
-                return;
+                goto out;
        }
        /* Okay, we can deliver the interrupt: grab it and update PIC state. */
        intr_vector = kvm_cpu_get_interrupt(vcpu);
        svm_inject_irq(svm, intr_vector);
        kvm_timer_intr_post(vcpu, intr_vector);
+out:
+        update_cr8_intercept(vcpu);
 }
 static void kvm_reput_irq(struct vcpu_svm *svm)
@@ -1482,6 +1625,29 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 }
+/*
+ * Copy the V_TPR field of the VMCB int_ctl word into the local APIC via
+ * kvm_lapic_set_tpr().  Only done when CR8 writes are not intercepted;
+ * svm_vcpu_run() calls this after the guest has run.
+ */
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_svm *svm = to_svm(vcpu);
+        if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+                kvm_lapic_set_tpr(vcpu, cr8);
+        }
+}
+/*
+ * Load the value returned by kvm_get_cr8() into the V_TPR bits of the VMCB
+ * int_ctl word.  Skipped when the irqchip is not in the kernel;
+ * svm_vcpu_run() calls this before entering the guest.
+ */
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_svm *svm = to_svm(vcpu);
+        u64 cr8;
+        if (!irqchip_in_kernel(vcpu->kvm))
+                return;
+        cr8 = kvm_get_cr8(vcpu);
+        /* replace only the V_TPR bits, keep the rest of int_ctl */
+        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1491,6 +1657,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        pre_svm_run(svm);
+        sync_lapic_to_cr8(vcpu);
        save_host_msrs(vcpu);
        fs_selector = read_fs();
        gs_selector = read_gs();
@@ -1499,6 +1667,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        svm->host_dr6 = read_dr6();
        svm->host_dr7 = read_dr7();
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
+        /* required for live migration with NPT */
+        if (npt_enabled)
+                svm->vmcb->save.cr3 = vcpu->arch.cr3;
        if (svm->vmcb->save.dr7 & 0xff) {
                write_dr7(0);
@@ -1635,6 +1806,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        stgi();
+        sync_cr8_to_lapic(vcpu);
        svm->next_rip = 0;
 }
@@ -1642,6 +1815,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+        if (npt_enabled) {
+                svm->vmcb->control.nested_cr3 = root;
+                force_new_asid(vcpu);
+                return;
+        }
        svm->vmcb->save.cr3 = root;
        force_new_asid(vcpu);
@@ -1709,6 +1888,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_segment_base = svm_get_segment_base,
        .get_segment = svm_get_segment,
        .set_segment = svm_set_segment,
+        .get_cpl = svm_get_cpl,
        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
index 5fd50491b55..1b8afa78e86 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
 #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+/*
+ * Bit positions in exit_info_2 that task_switch_interception() tests to
+ * tell whether an intercepted task switch was caused by IRET or by JMP;
+ * if neither bit is set it treats the switch as a CALL.
+ */
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
 #define SVM_EXIT_READ_CR0       0x000
 #define SVM_EXIT_READ_CR3       0x003
 #define SVM_EXIT_READ_CR4       0x004
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 00000000000..622aa10f692
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,59 @@
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+/*
+ * In-memory image of a 32-bit task state segment.
+ *
+ * Field order follows the x86 32-bit TSS format: back link, the three
+ * privileged stack pointer/segment pairs, CR3, EIP, EFLAGS, the general
+ * registers, the segment selectors, the LDT selector and finally the
+ * 16-bit "t" word and the I/O map field.  Selectors are stored in full
+ * 32-bit slots.
+ */
+struct tss_segment_32 {
+        u32 prev_task_link;
+        u32 esp0;
+        u32 ss0;
+        u32 esp1;
+        u32 ss1;
+        u32 esp2;
+        u32 ss2;
+        u32 cr3;
+        u32 eip;
+        u32 eflags;
+        u32 eax;
+        u32 ecx;
+        u32 edx;
+        u32 ebx;
+        u32 esp;
+        u32 ebp;
+        u32 esi;
+        u32 edi;
+        u32 es;
+        u32 cs;
+        u32 ss;
+        u32 ds;
+        u32 fs;
+        u32 gs;
+        u32 ldt_selector;
+        u16 t;
+        u16 io_map;
+};
+/*
+ * In-memory image of a 16-bit task state segment.
+ *
+ * All fields are 16 bits wide.  Compared with struct tss_segment_32 there
+ * is no CR3, FS, GS or I/O map field.
+ */
+struct tss_segment_16 {
+        u16 prev_task_link;
+        u16 sp0;
+        u16 ss0;
+        u16 sp1;
+        u16 ss1;
+        u16 sp2;
+        u16 ss2;
+        u16 ip;
+        u16 flag;
+        u16 ax;
+        u16 cx;
+        u16 dx;
+        u16 bx;
+        u16 sp;
+        u16 bp;
+        u16 si;
+        u16 di;
+        u16 es;
+        u16 cs;
+        u16 ss;
+        u16 ds;
+        u16 ldt;
+};
+#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e1462880d1..8e5d6645b90 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -17,7 +17,6 @@
 #include "irq.h"
 #include "vmx.h"
-#include "segment_descriptor.h"
 #include "mmu.h"
 #include <linux/kvm_host.h>
@@ -37,6 +36,12 @@ MODULE_LICENSE("GPL");
 static int bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, 0);
+/*
+ * Load-time switches, both on by default and registered with permission 0.
+ * NOTE(review): the code that consumes enable_vpid lies outside this part
+ * of the patch - confirm where it is checked.
+ */
+static int enable_vpid = 1;
+module_param(enable_vpid, bool, 0);
+/* When 0, cpu_has_vmx_virtualize_apic_accesses() reports false. */
+static int flexpriority_enabled = 1;
+module_param(flexpriority_enabled, bool, 0);
 struct vmcs {
        u32 revision_id;
        u32 abort;
@@ -71,6 +76,7 @@ struct vcpu_vmx {
                        unsigned rip;
                } irq;
        } rmode;
+        int vpid;
 };
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -85,6 +91,10 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
+static struct page *vmx_msr_bitmap;
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
 static struct vmcs_config {
        int size;
@@ -176,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info)
                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
+/*
+ * Non-zero when CPU_BASED_USE_MSR_BITMAPS is set in the primary
+ * execution controls recorded in vmcs_config.  The result is the masked
+ * bit itself, not a normalised 0/1.
+ */
+static inline int cpu_has_vmx_msr_bitmap(void)
+{
+        return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+}
 static inline int cpu_has_vmx_tpr_shadow(void)
 {
        return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
@@ -194,8 +209,9 @@ static inline int cpu_has_secondary_exec_ctrls(void)
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-        return (vmcs_config.cpu_based_2nd_exec_ctrl &
+        return flexpriority_enabled
-                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+                && (vmcs_config.cpu_based_2nd_exec_ctrl &
+                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 }
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -204,6 +220,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
                (irqchip_in_kernel(kvm)));
 }
+static inline int cpu_has_vmx_vpid(void) /*
+ * Returns nonzero when the secondary execution controls recorded in
+ * vmcs_config include SECONDARY_EXEC_ENABLE_VPID, zero otherwise.
+ * This reports hardware/config capability only; the enable_vpid module
+ * parameter is checked separately by allocate_vpid().
+ */
+{
+        return (vmcs_config.cpu_based_2nd_exec_ctrl &
+                SECONDARY_EXEC_ENABLE_VPID);
+}
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -214,6 +236,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
        return -1;
 }
+static inline void __invvpid(int ext, u16 vpid, gva_t gva) /*
+ * Execute ASM_VMX_INVVPID with extent type @ext on a descriptor built from
+ * @vpid and @gva.  The descriptor is 128 bits: a 16-bit VPID, 48 reserved
+ * bits initialised to zero, then the 64-bit guest virtual address.
+ * Nothing is returned; a failed instruction ends in ud2 (see below).
+ */
+{
+    struct {
+        u64 vpid : 16;
+        u64 rsvd : 48;
+        u64 gva;
+    } operand = { vpid, 0, gva };
+    asm volatile (ASM_VMX_INVVPID
+                  /*
+                   * "ja" is taken only when CF==0 and ZF==0, skipping the
+                   * ud2.  CF==1 or ZF==1 --> ud2 executes; no error code is
+                   * handed back to the caller.
+                   */
+                  "; ja 1f ; ud2 ; 1:"
+                  : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -257,6 +293,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
        vmx->launched = 0;
 }
+static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) /*
+ * Issue a single-context INVVPID (VMX_VPID_EXTENT_SINGLE_CONTEXT) for the
+ * VPID stored in @vmx, passing 0 as the address operand.
+ */
+{
+        if (vmx->vpid == 0) /* 0 is what allocate_vpid() leaves when no VPID was assigned */
+                return;
+        __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
 static unsigned long vmcs_readl(unsigned long field)
 {
        unsigned long value;
@@ -353,7 +397,7 @@ static void reload_tss(void)
         * VT restores TR but not its size.  Useless.
         */
        struct descriptor_table gdt;
-        struct segment_descriptor *descs;
+        struct desc_struct *descs;
        get_gdt(&gdt);
        descs = (void *)gdt.base;
@@ -485,11 +529,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(vmx->vmcs);
-        u64 tsc_this, delta;
+        u64 tsc_this, delta, new_offset;
        if (vcpu->cpu != cpu) {
                vcpu_clear(vmx);
                kvm_migrate_apic_timer(vcpu);
+                vpid_sync_vcpu_all(vmx);
        }
        if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -524,8 +569,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * Make sure the time stamp counter is monotonous.
                 */
                rdtscll(tsc_this);
-                delta = vcpu->arch.host_tsc - tsc_this;
+                if (tsc_this < vcpu->arch.host_tsc) {
-                vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+                        delta = vcpu->arch.host_tsc - tsc_this;
+                        new_offset = vmcs_read64(TSC_OFFSET) + delta;
+                        vmcs_write64(TSC_OFFSET, new_offset);
+                }
        }
 }
@@ -596,7 +644,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 {
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                     nr | INTR_TYPE_EXCEPTION
-                     | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+                     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
                     | INTR_INFO_VALID_MASK);
        if (has_error_code)
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -959,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING;
        opt = CPU_BASED_TPR_SHADOW |
+              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control) < 0)
@@ -971,7 +1020,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
                min = 0;
                opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-                        SECONDARY_EXEC_WBINVD_EXITING;
+                        SECONDARY_EXEC_WBINVD_EXITING |
+                        SECONDARY_EXEC_ENABLE_VPID;
                if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
                        return -EIO;
@@ -1080,6 +1130,10 @@ static __init int hardware_setup(void)
 {
        if (setup_vmcs_config(&vmcs_config) < 0)
                return -EIO;
+        if (boot_cpu_has(X86_FEATURE_NX))
+                kvm_enable_efer_bits(EFER_NX);
        return alloc_kvm_area();
 }
@@ -1214,7 +1268,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
                printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-                       __FUNCTION__);
+                       __func__);
                vmcs_write32(GUEST_TR_AR_BYTES,
                             (guest_tr_ar & ~AR_TYPE_MASK)
                             | AR_TYPE_BUSY_64_TSS);
@@ -1239,6 +1293,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 #endif
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu) /*
+ * TLB-flush hook for VMX: forwards to vpid_sync_vcpu_all() for this vcpu,
+ * which does nothing when the vcpu has no VPID (vpid == 0).
+ */
+{
+        vpid_sync_vcpu_all(to_vmx(vcpu));
+}
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
@@ -1275,6 +1334,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+        vmx_flush_tlb(vcpu);
        vmcs_writel(GUEST_CR3, cr3);
        if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
@@ -1288,14 +1348,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        vcpu->arch.cr4 = cr4;
 }
-#ifdef CONFIG_X86_64
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
        vcpu->arch.shadow_efer = efer;
+        if (!msr)
+                return;
        if (efer & EFER_LMA) {
                vmcs_write32(VM_ENTRY_CONTROLS,
                                     vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1312,8 +1372,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        setup_msrs(vmx);
 }
-#endif
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1344,6 +1402,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
        var->unusable = (ar >> 16) & 1;
 }
+static int vmx_get_cpl(struct kvm_vcpu *vcpu) /*
+ * Compute the guest's current privilege level:
+ *   - 0 when CR0.PE is clear in the cached cr0,
+ *   - 3 when EFLAGS.VM is set,
+ *   - otherwise the low two bits of the CS selector read via
+ *     vmx_get_segment().
+ */
+{
+        struct kvm_segment kvm_seg;
+        if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+                return 0;
+        if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+                return 3;
+        vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
+        return kvm_seg.selector & 3; /* RPL field of the CS selector */
+}
 static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
        u32 ar;
@@ -1433,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm)
        int ret = 0;
        int r;
-        down_read(&kvm->slots_lock);
        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
        if (r < 0)
                goto out;
@@ -1456,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm)
        ret = 1;
 out:
-        up_read(&kvm->slots_lock);
        return ret;
 }
@@ -1494,6 +1564,46 @@ out:
        return r;
 }
+static void allocate_vpid(struct vcpu_vmx *vmx) /*
+ * Assign a VPID to @vmx from the global vmx_vpid_bitmap.
+ * vmx->vpid is left at 0 when the enable_vpid parameter is off, when
+ * cpu_has_vmx_vpid() reports no support, or when no bit below
+ * VMX_NR_VPIDS is free.  The search and claim run under vmx_vpid_lock.
+ */
+{
+        int vpid;
+        vmx->vpid = 0; /* default: no VPID */
+        if (!enable_vpid || !cpu_has_vmx_vpid())
+                return;
+        spin_lock(&vmx_vpid_lock);
+        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+        if (vpid < VMX_NR_VPIDS) { /* a free slot was found */
+                vmx->vpid = vpid;
+                __set_bit(vpid, vmx_vpid_bitmap);
+        }
+        spin_unlock(&vmx_vpid_lock);
+}
+void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) /*
+ * Clear the read and write bits for @msr in the bitmap page @msr_bitmap.
+ * Does nothing when cpu_has_vmx_msr_bitmap() is false.  An @msr outside
+ * the two ranges handled below is silently ignored.
+ */
+{
+        void *va;
+        if (!cpu_has_vmx_msr_bitmap())
+                return;
+        /*
+         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+         * have the write-low and read-high bitmap offsets the wrong way round.
+         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+         */
+        va = kmap(msr_bitmap); /* temporary mapping; released by kunmap() below */
+        if (msr <= 0x1fff) {
+                __clear_bit(msr, va + 0x000); /* read-low */
+                __clear_bit(msr, va + 0x800); /* write-low */
+        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+                msr &= 0x1fff; /* bit index within the high-range bitmaps */
+                __clear_bit(msr, va + 0x400); /* read-high */
+                __clear_bit(msr, va + 0xc00); /* write-high */
+        }
+        kunmap(msr_bitmap);
+}
 /*
 * Sets up the vmcs for emulated real mode.
 */
@@ -1511,6 +1621,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
        vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+        if (cpu_has_vmx_msr_bitmap())
+                vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
        /* Control */
@@ -1532,6 +1645,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
                        exec_control &=
                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+                if (vmx->vpid == 0)
+                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
@@ -1613,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        u64 msr;
        int ret;
+        down_read(&vcpu->kvm->slots_lock);
        if (!init_rmode_tss(vmx->vcpu.kvm)) {
                ret = -ENOMEM;
                goto out;
@@ -1621,7 +1737,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmx->vcpu.arch.rmode.active = 0;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-        set_cr8(&vmx->vcpu, 0);
+        kvm_set_cr8(&vmx->vcpu, 0);
        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
        if (vmx->vcpu.vcpu_id == 0)
                msr |= MSR_IA32_APICBASE_BSP;
@@ -1704,18 +1820,22 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write64(APIC_ACCESS_ADDR,
                             page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+        if (vmx->vpid != 0)
+                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
        vmx->vcpu.arch.cr0 = 0x60000010;
        vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
        vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
        vmx_set_efer(&vmx->vcpu, 0);
-#endif
        vmx_fpu_activate(&vmx->vcpu);
        update_exception_bitmap(&vmx->vcpu);
-        return 0;
+        vpid_sync_vcpu_all(vmx);
+        ret = 0;
 out:
+        up_read(&vcpu->kvm->slots_lock);
        return ret;
 }
@@ -1723,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
        if (vcpu->arch.rmode.active) {
                vmx->rmode.irq.pending = true;
                vmx->rmode.irq.vector = irq;
@@ -1844,7 +1966,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
                                                !is_page_fault(intr_info))
                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-                       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+                       "intr info 0x%x\n", __func__, vect_info, intr_info);
        if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
                int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
@@ -1869,10 +1991,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        error_code = 0;
        rip = vmcs_readl(GUEST_RIP);
-        if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
+                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
+                            (u32)((u64)cr2 >> 32), handler);
                return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
@@ -1901,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
                                     struct kvm_run *kvm_run)
 {
        ++vcpu->stat.irq_exits;
+        KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
        return 1;
 }
@@ -1958,25 +2083,27 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
+                KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
+                            (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
                switch (cr) {
                case 0:
                        vcpu_load_rsp_rip(vcpu);
-                        set_cr0(vcpu, vcpu->arch.regs[reg]);
+                        kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
                        vcpu_load_rsp_rip(vcpu);
-                        set_cr3(vcpu, vcpu->arch.regs[reg]);
+                        kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 4:
                        vcpu_load_rsp_rip(vcpu);
-                        set_cr4(vcpu, vcpu->arch.regs[reg]);
+                        kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                        set_cr8(vcpu, vcpu->arch.regs[reg]);
+                        kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        if (irqchip_in_kernel(vcpu->kvm))
                                return 1;
@@ -1990,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                vcpu->arch.cr0 &= ~X86_CR0_TS;
                vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
                vmx_fpu_activate(vcpu);
+                KVMTRACE_0D(CLTS, vcpu, handler);
                skip_emulated_instruction(vcpu);
                return 1;
        case 1: /*mov from cr*/
@@ -1998,18 +2126,24 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        vcpu_load_rsp_rip(vcpu);
                        vcpu->arch.regs[reg] = vcpu->arch.cr3;
                        vcpu_put_rsp_rip(vcpu);
+                        KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
+                                    (u32)vcpu->arch.regs[reg],
+                                    (u32)((u64)vcpu->arch.regs[reg] >> 32),
+                                    handler);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                        vcpu->arch.regs[reg] = get_cr8(vcpu);
+                        vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
                        vcpu_put_rsp_rip(vcpu);
+                        KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
+                                    (u32)vcpu->arch.regs[reg], handler);
                        skip_emulated_instruction(vcpu);
                        return 1;
                }
                break;
        case 3: /* lmsw */
-                lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+                kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -2049,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        val = 0;
                }
                vcpu->arch.regs[reg] = val;
+                KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
        } else {
                /* mov to dr */
        }
@@ -2073,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                return 1;
        }
+        KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
+                    handler);
        /* FIXME: handling of bits 32:63 of rax, rdx */
        vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
        vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
@@ -2086,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+        KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
+                    handler);
        if (vmx_set_msr(vcpu, ecx, data) != 0) {
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -2110,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+        KVMTRACE_0D(PEND_INTR, vcpu, handler);
        /*
         * If the user space waits to inject interrupts, exit as soon as
         * possible
@@ -2152,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
        offset = exit_qualification & 0xffful;
+        KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
        er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
        if (er !=  EMULATE_DONE) {
@@ -2163,6 +2309,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
+static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /*
+ * Exit handler for EXIT_REASON_TASK_SWITCH.  Decodes the exit
+ * qualification and hands the work to kvm_task_switch(), whose return
+ * value is returned unchanged.  @kvm_run is not used here.
+ */
+{
+        unsigned long exit_qualification;
+        u16 tss_selector;
+        int reason;
+        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+        reason = (u32)exit_qualification >> 30; /* bits 31:30 of the low 32 bits */
+        tss_selector = exit_qualification; /* truncated to the low 16 bits */
+        return kvm_task_switch(vcpu, tss_selector, reason);
+}
 /*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2185,6 +2345,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
+        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 };
 static const int kvm_vmx_max_exit_handlers =
@@ -2200,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vectoring_info = vmx->idt_vectoring_info;
+        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
+                    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
        if (unlikely(vmx->fail)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2210,7 +2374,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                                exit_reason != EXIT_REASON_EXCEPTION_NMI)
                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-                       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+                       "exit reason is 0x%x\n", __func__, exit_reason);
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -2221,10 +2385,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        return 0;
 }
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
 static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 {
        int max_irr, tpr;
@@ -2285,11 +2445,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
                        return;
                }
+                KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-                if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
+                if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
                        vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
                                vmcs_read32(IDT_VECTORING_ERROR_CODE));
                if (unlikely(has_ext_irq))
@@ -2470,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        /* We need to handle NMIs before interrupts are enabled */
-        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
+                KVMTRACE_0D(NMI, vcpu, handler);
                asm("int $2");
+        }
 }
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
@@ -2489,6 +2653,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        spin_lock(&vmx_vpid_lock);
+        if (vmx->vpid != 0)
+                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+        spin_unlock(&vmx_vpid_lock);
        vmx_free_vmcs(vcpu);
        kfree(vmx->host_msrs);
        kfree(vmx->guest_msrs);
@@ -2505,6 +2673,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        if (!vmx)
                return ERR_PTR(-ENOMEM);
+        allocate_vpid(vmx);
        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
        if (err)
                goto free_vcpu;
@@ -2591,14 +2761,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_segment_base = vmx_get_segment_base,
        .get_segment = vmx_get_segment,
        .set_segment = vmx_set_segment,
+        .get_cpl = vmx_get_cpl,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
        .set_efer = vmx_set_efer,
-#endif
        .get_idt = vmx_get_idt,
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
@@ -2626,7 +2795,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 static int __init vmx_init(void)
 {
-        void *iova;
+        void *va;
        int r;
        vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2639,28 +2808,48 @@ static int __init vmx_init(void)
                goto out;
        }
+        vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+        if (!vmx_msr_bitmap) {
+                r = -ENOMEM;
+                goto out1;
+        }
        /*
         * Allow direct access to the PC debug port (it is often used for I/O
         * delays, but the vmexits simply slow things down).
         */
-        iova = kmap(vmx_io_bitmap_a);
+        va = kmap(vmx_io_bitmap_a);
-        memset(iova, 0xff, PAGE_SIZE);
+        memset(va, 0xff, PAGE_SIZE);
-        clear_bit(0x80, iova);
+        clear_bit(0x80, va);
        kunmap(vmx_io_bitmap_a);
-        iova = kmap(vmx_io_bitmap_b);
+        va = kmap(vmx_io_bitmap_b);
-        memset(iova, 0xff, PAGE_SIZE);
+        memset(va, 0xff, PAGE_SIZE);
        kunmap(vmx_io_bitmap_b);
+        va = kmap(vmx_msr_bitmap);
+        memset(va, 0xff, PAGE_SIZE);
+        kunmap(vmx_msr_bitmap);
+        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
        if (r)
-                goto out1;
+                goto out2;
+        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
+        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
+        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
+        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
+        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
        if (bypass_guest_pf)
                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
        return 0;
+out2:
+        __free_page(vmx_msr_bitmap);
 out1:
        __free_page(vmx_io_bitmap_b);
 out:
@@ -2670,6 +2859,7 @@ out:
 static void __exit vmx_exit(void)
 {
+        __free_page(vmx_msr_bitmap);
        __free_page(vmx_io_bitmap_b);
        __free_page(vmx_io_bitmap_a);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index d52ae8d7303..5dff4606b98 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -49,6 +49,7 @@
 * Definitions of Secondary Processor-Based VM-Execution Controls.
 */
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING           0x00000040
@@ -65,6 +66,7 @@
 /* VMCS Encodings */
 enum vmcs_field {
+        VIRTUAL_PROCESSOR_ID            = 0x00000000,
        GUEST_ES_SELECTOR               = 0x00000800,
        GUEST_CS_SELECTOR               = 0x00000802,
        GUEST_SS_SELECTOR               = 0x00000804,
@@ -231,12 +233,12 @@ enum vmcs_field {
 */
 #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
 #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
+#define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
 #define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
 #define VECTORING_INFO_VECTOR_MASK              INTR_INFO_VECTOR_MASK
 #define VECTORING_INFO_TYPE_MASK                INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK       INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK        INTR_INFO_DELIVER_CODE_MASK
 #define VECTORING_INFO_VALID_MASK               INTR_INFO_VALID_MASK
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
@@ -321,4 +323,8 @@ enum vmcs_field {
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT        9
+#define VMX_NR_VPIDS                            (1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT          1
+#define VMX_VPID_EXTENT_ALL_CONTEXT             2
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01552bd1f..0ce556372a4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,10 +15,12 @@
 */
 #include <linux/kvm_host.h>
-#include "segment_descriptor.h"
 #include "irq.h"
 #include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include <linux/clocksource.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
@@ -28,6 +30,7 @@
 #include <asm/uaccess.h>
 #include <asm/msr.h>
+#include <asm/desc.h>
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                               \
@@ -41,7 +44,15 @@
                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+/* EFER defaults:
+ * - enable syscall by default because it is emulated by KVM
+ * - enable LME and LMA by default on 64-bit KVM
+ */
+#ifdef CONFIG_X86_64
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -63,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "irq_window", VCPU_STAT(irq_window_exits) },
        { "halt_exits", VCPU_STAT(halt_exits) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
        { "irq_exits", VCPU_STAT(irq_exits) },
        { "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -78,6 +90,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+        { "largepages", VM_STAT(lpages) },
        { NULL }
 };
@@ -85,7 +98,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
-        struct segment_descriptor *d;
+        struct desc_struct *d;
        unsigned long table_base;
        unsigned long v;
@@ -101,13 +114,12 @@ unsigned long segment_base(u16 selector)
                asm("sldt %0" : "=g"(ldt_selector));
                table_base = segment_base(ldt_selector);
        }
-        d = (struct segment_descriptor *)(table_base + (selector & ~7));
+        d = (struct desc_struct *)(table_base + (selector & ~7));
-        v = d->base_low | ((unsigned long)d->base_mid << 16) |
+        v = d->base0 | ((unsigned long)d->base1 << 16) |
-                ((unsigned long)d->base_high << 24);
+                ((unsigned long)d->base2 << 24);
 #ifdef CONFIG_X86_64
-        if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-                v |= ((unsigned long) \
+                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-                      ((struct segment_descriptor_64 *)d)->base_higher) << 32;
 #endif
        return v;
 }
@@ -145,11 +157,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
                           u32 error_code)
 {
        ++vcpu->stat.pf_guest;
-        if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+        if (vcpu->arch.exception.pending) {
-                printk(KERN_DEBUG "kvm: inject_page_fault:"
+                if (vcpu->arch.exception.nr == PF_VECTOR) {
-                       " double fault 0x%lx\n", addr);
+                        printk(KERN_DEBUG "kvm: inject_page_fault:"
-                vcpu->arch.exception.nr = DF_VECTOR;
+                                        " double fault 0x%lx\n", addr);
-                vcpu->arch.exception.error_code = 0;
+                        vcpu->arch.exception.nr = DF_VECTOR;
+                        vcpu->arch.exception.error_code = 0;
+                } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+                        /* triple fault -> shutdown */
+                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                }
                return;
        }
        vcpu->arch.cr2 = addr;
@@ -184,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        int ret;
        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
-        down_read(&vcpu->kvm->slots_lock);
        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
                                  offset * sizeof(u64), sizeof(pdpte));
        if (ret < 0) {
@@ -201,10 +217,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-        up_read(&vcpu->kvm->slots_lock);
        return ret;
 }
+EXPORT_SYMBOL_GPL(load_pdptrs);
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
@@ -215,18 +231,16 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        if (is_long_mode(vcpu) || !is_pae(vcpu))
                return false;
-        down_read(&vcpu->kvm->slots_lock);
        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
        if (r < 0)
                goto out;
        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-        up_read(&vcpu->kvm->slots_lock);
        return changed;
 }
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        if (cr0 & CR0_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
@@ -284,15 +298,18 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        kvm_mmu_reset_context(vcpu);
        return;
 }
-EXPORT_SYMBOL_GPL(set_cr0);
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-        set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+        KVMTRACE_1D(LMSW, vcpu,
+                    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
+                    handler);
 }
-EXPORT_SYMBOL_GPL(lmsw);
+EXPORT_SYMBOL_GPL(kvm_lmsw);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        if (cr4 & CR4_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
@@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        vcpu->arch.cr4 = cr4;
        kvm_mmu_reset_context(vcpu);
 }
-EXPORT_SYMBOL_GPL(set_cr4);
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
                kvm_mmu_flush_tlb(vcpu);
@@ -359,7 +376,6 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                 */
        }
-        down_read(&vcpu->kvm->slots_lock);
        /*
         * Does the new cr3 value map to physical memory? (Note, we
         * catch an invalid cr3 even in real-mode, because it would
@@ -375,11 +391,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                vcpu->arch.cr3 = cr3;
                vcpu->arch.mmu.new_cr3(vcpu);
        }
-        up_read(&vcpu->kvm->slots_lock);
 }
-EXPORT_SYMBOL_GPL(set_cr3);
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        if (cr8 & CR8_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
@@ -391,16 +406,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
        else
                vcpu->arch.cr8 = cr8;
 }
-EXPORT_SYMBOL_GPL(set_cr8);
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
        if (irqchip_in_kernel(vcpu->kvm))
                return kvm_lapic_get_cr8(vcpu);
        else
                return vcpu->arch.cr8;
 }
-EXPORT_SYMBOL_GPL(get_cr8);
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
 /*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -415,7 +430,8 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-        MSR_IA32_TIME_STAMP_COUNTER,
+        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+        MSR_IA32_PERF_STATUS,
 };
 static unsigned num_msrs_to_save;
@@ -424,11 +440,9 @@ static u32 emulated_msrs[] = {
        MSR_IA32_MISC_ENABLE,
 };
-#ifdef CONFIG_X86_64
 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-        if (efer & EFER_RESERVED_BITS) {
+        if (efer & efer_reserved_bits) {
                printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
                       efer);
                kvm_inject_gp(vcpu, 0);
@@ -450,7 +464,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
        vcpu->arch.shadow_efer = efer;
 }
-#endif
+void kvm_enable_efer_bits(u64 mask)
+{
+       efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 /*
 * Writes msr value into into the appropriate "register".
@@ -470,26 +489,86 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
        return kvm_set_msr(vcpu, index, *data);
 }
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+        static int version;
+        struct kvm_wall_clock wc;
+        struct timespec wc_ts;
+        if (!wall_clock)
+                return;
+        version++;
+        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+        wc_ts = current_kernel_time();
+        wc.wc_sec = wc_ts.tv_sec;
+        wc.wc_nsec = wc_ts.tv_nsec;
+        wc.wc_version = version;
+        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+        version++;
+        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+        struct timespec ts;
+        unsigned long flags;
+        struct kvm_vcpu_arch *vcpu = &v->arch;
+        void *shared_kaddr;
+        if ((!vcpu->time_page))
+                return;
+        /* Keep irq disabled to prevent changes to the clock */
+        local_irq_save(flags);
+        kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+                          &vcpu->hv_clock.tsc_timestamp);
+        ktime_get_ts(&ts);
+        local_irq_restore(flags);
+        /* With all the info we got, fill in the values */
+        vcpu->hv_clock.system_time = ts.tv_nsec +
+                                     (NSEC_PER_SEC * (u64)ts.tv_sec);
+        /*
+         * The interface expects us to write an even number signaling that the
+         * update is finished. Since the guest won't see the intermediate
+         * state, we just write "2" at the end
+         */
+        vcpu->hv_clock.version = 2;
+        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+                sizeof(vcpu->hv_clock));
+        kunmap_atomic(shared_kaddr, KM_USER0);
+        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
-#ifdef CONFIG_X86_64
        case MSR_EFER:
                set_efer(vcpu, data);
                break;
-#endif
        case MSR_IA32_MC0_STATUS:
                pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                       __FUNCTION__, data);
+                       __func__, data);
                break;
        case MSR_IA32_MCG_STATUS:
                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                        __FUNCTION__, data);
+                        __func__, data);
                break;
        case MSR_IA32_MCG_CTL:
                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-                        __FUNCTION__, data);
+                        __func__, data);
                break;
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
@@ -501,6 +580,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
+        case MSR_KVM_WALL_CLOCK:
+                vcpu->kvm->arch.wall_clock = data;
+                kvm_write_wall_clock(vcpu->kvm, data);
+                break;
+        case MSR_KVM_SYSTEM_TIME: {
+                if (vcpu->arch.time_page) {
+                        kvm_release_page_dirty(vcpu->arch.time_page);
+                        vcpu->arch.time_page = NULL;
+                }
+                vcpu->arch.time = data;
+                /* we verify if the enable bit is set... */
+                if (!(data & 1))
+                        break;
+                /* ...but clean it before doing the actual write */
+                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+                vcpu->arch.hv_clock.tsc_to_system_mul =
+                                        clocksource_khz2mult(tsc_khz, 22);
+                vcpu->arch.hv_clock.tsc_shift = 22;
+                down_read(&current->mm->mmap_sem);
+                vcpu->arch.time_page =
+                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+                up_read(&current->mm->mmap_sem);
+                if (is_error_page(vcpu->arch.time_page)) {
+                        kvm_release_page_clean(vcpu->arch.time_page);
+                        vcpu->arch.time_page = NULL;
+                }
+                kvm_write_guest_time(vcpu);
+                break;
+        }
        default:
                pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
                return 1;
@@ -540,7 +655,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MC0_MISC+12:
        case MSR_IA32_MC0_MISC+16:
        case MSR_IA32_UCODE_REV:
-        case MSR_IA32_PERF_STATUS:
        case MSR_IA32_EBL_CR_POWERON:
                /* MTRR registers */
        case 0xfe:
@@ -556,11 +670,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
-#ifdef CONFIG_X86_64
+        case MSR_IA32_PERF_STATUS:
+                /* TSC increment by tick */
+                data = 1000ULL;
+                /* CPU multiplier */
+                data |= (((uint64_t)4ULL) << 40);
+                break;
        case MSR_EFER:
                data = vcpu->arch.shadow_efer;
                break;
-#endif
+        case MSR_KVM_WALL_CLOCK:
+                data = vcpu->kvm->arch.wall_clock;
+                break;
+        case MSR_KVM_SYSTEM_TIME:
+                data = vcpu->arch.time;
+                break;
        default:
                pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
                return 1;
@@ -584,9 +708,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
        vcpu_load(vcpu);
+        down_read(&vcpu->kvm->slots_lock);
        for (i = 0; i < msrs->nmsrs; ++i)
                if (do_msr(vcpu, entries[i].index, &entries[i].data))
                        break;
+        up_read(&vcpu->kvm->slots_lock);
        vcpu_put(vcpu);
@@ -688,11 +814,24 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_SET_TSS_ADDR:
        case KVM_CAP_EXT_CPUID:
+        case KVM_CAP_CLOCKSOURCE:
+        case KVM_CAP_PIT:
+        case KVM_CAP_NOP_IO_DELAY:
+        case KVM_CAP_MP_STATE:
                r = 1;
                break;
        case KVM_CAP_VAPIC:
                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                break;
+        case KVM_CAP_NR_VCPUS:
+                r = KVM_MAX_VCPUS;
+                break;
+        case KVM_CAP_NR_MEMSLOTS:
+                r = KVM_MEMORY_SLOTS;
+                break;
+        case KVM_CAP_PV_MMU:
+                r = !tdp_enabled;
+                break;
        default:
                r = 0;
                break;
@@ -763,6 +902,7 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        kvm_x86_ops->vcpu_load(vcpu, cpu);
+        kvm_write_guest_time(vcpu);
 }
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -958,32 +1098,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        }
        /* function 4 and 0xb have additional index. */
        case 4: {
-                int index, cache_type;
+                int i, cache_type;
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until cache_type is zero */
-                for (index = 1; *nent < maxnent; ++index) {
+                for (i = 1; *nent < maxnent; ++i) {
-                        cache_type = entry[index - 1].eax & 0x1f;
+                        cache_type = entry[i - 1].eax & 0x1f;
                        if (!cache_type)
                                break;
-                        do_cpuid_1_ent(&entry[index], function, index);
+                        do_cpuid_1_ent(&entry[i], function, i);
-                        entry[index].flags |=
+                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
                }
                break;
        }
        case 0xb: {
-                int index, level_type;
+                int i, level_type;
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until level_type is zero */
-                for (index = 1; *nent < maxnent; ++index) {
+                for (i = 1; *nent < maxnent; ++i) {
-                        level_type = entry[index - 1].ecx & 0xff;
+                        level_type = entry[i - 1].ecx & 0xff;
                        if (!level_type)
                                break;
-                        do_cpuid_1_ent(&entry[index], function, index);
+                        do_cpuid_1_ent(&entry[i], function, i);
-                        entry[index].flags |=
+                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
                }
@@ -1365,6 +1505,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        return r;
 }
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+        int r = 0;
+        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+        return r;
+}
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+        int r = 0;
+        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+        kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+        return r;
+}
 /*
 * Get (and clear) the dirty memory log for a memory slot.
 */
@@ -1457,6 +1614,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
                } else
                        goto out;
                break;
+        case KVM_CREATE_PIT:
+                r = -ENOMEM;
+                kvm->arch.vpit = kvm_create_pit(kvm);
+                if (kvm->arch.vpit)
+                        r = 0;
+                break;
        case KVM_IRQ_LINE: {
                struct kvm_irq_level irq_event;
@@ -1512,6 +1675,37 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+        case KVM_GET_PIT: {
+                struct kvm_pit_state ps;
+                r = -EFAULT;
+                if (copy_from_user(&ps, argp, sizeof ps))
+                        goto out;
+                r = -ENXIO;
+                if (!kvm->arch.vpit)
+                        goto out;
+                r = kvm_vm_ioctl_get_pit(kvm, &ps);
+                if (r)
+                        goto out;
+                r = -EFAULT;
+                if (copy_to_user(argp, &ps, sizeof ps))
+                        goto out;
+                r = 0;
+                break;
+        }
+        case KVM_SET_PIT: {
+                struct kvm_pit_state ps;
+                r = -EFAULT;
+                if (copy_from_user(&ps, argp, sizeof ps))
+                        goto out;
+                r = -ENXIO;
+                if (!kvm->arch.vpit)
+                        goto out;
+                r = kvm_vm_ioctl_set_pit(kvm, &ps);
+                if (r)
+                        goto out;
+                r = 0;
+                break;
+        }
        default:
                ;
        }
@@ -1570,7 +1764,6 @@ int emulator_read_std(unsigned long addr,
        void *data = val;
        int r = X86EMUL_CONTINUE;
-        down_read(&vcpu->kvm->slots_lock);
        while (bytes) {
                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
                unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1785,6 @@ int emulator_read_std(unsigned long addr,
                addr += tocopy;
        }
 out:
-        up_read(&vcpu->kvm->slots_lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1803,7 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
-        down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-        up_read(&vcpu->kvm->slots_lock);
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1646,19 +1836,15 @@ mmio:
        return X86EMUL_UNHANDLEABLE;
 }
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                               const void *val, int bytes)
+                          const void *val, int bytes)
 {
        int ret;
-        down_read(&vcpu->kvm->slots_lock);
        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-        if (ret < 0) {
+        if (ret < 0)
-                up_read(&vcpu->kvm->slots_lock);
                return 0;
-        }
        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-        up_read(&vcpu->kvm->slots_lock);
        return 1;
 }
@@ -1670,9 +1856,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
        struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
-        down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-        up_read(&vcpu->kvm->slots_lock);
        if (gpa == UNMAPPED_GVA) {
                kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1933,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                char *kaddr;
                u64 val;
-                down_read(&vcpu->kvm->slots_lock);
                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
                if (gpa == UNMAPPED_GVA ||
@@ -1769,9 +1952,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
                kunmap_atomic(kaddr, KM_USER0);
                kvm_release_page_dirty(page);
-        emul_write:
-                up_read(&vcpu->kvm->slots_lock);
        }
+emul_write:
 #endif
        return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -1802,7 +1984,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
                *dest = kvm_x86_ops->get_dr(vcpu, dr);
                return X86EMUL_CONTINUE;
        default:
-                pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+                pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
                return X86EMUL_UNHANDLEABLE;
        }
 }
@@ -1840,7 +2022,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-struct x86_emulate_ops emulate_ops = {
+static struct x86_emulate_ops emulate_ops = {
        .read_std            = emulator_read_std,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
@@ -2091,6 +2273,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.guest_page_offset = 0;
        vcpu->arch.pio.rep = 0;
+        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+                            handler);
+        else
+                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+                            handler);
        kvm_x86_ops->cache_regs(vcpu);
        memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
        kvm_x86_ops->decache_regs(vcpu);
@@ -2129,6 +2318,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.guest_page_offset = offset_in_page(address);
        vcpu->arch.pio.rep = rep;
+        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+                            handler);
+        else
+                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+                            handler);
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
                return 1;
@@ -2163,10 +2359,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                kvm_x86_ops->skip_emulated_instruction(vcpu);
        for (i = 0; i < nr_pages; ++i) {
-                down_read(&vcpu->kvm->slots_lock);
                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
                vcpu->arch.pio.guest_pages[i] = page;
-                up_read(&vcpu->kvm->slots_lock);
                if (!page) {
                        kvm_inject_gp(vcpu, 0);
                        free_pio_guest_pages(vcpu);
@@ -2238,10 +2432,13 @@ void kvm_arch_exit(void)
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.halt_exits;
+        KVMTRACE_0D(HLT, vcpu, handler);
        if (irqchip_in_kernel(vcpu->kvm)) {
-                vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+                up_read(&vcpu->kvm->slots_lock);
                kvm_vcpu_block(vcpu);
-                if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+                down_read(&vcpu->kvm->slots_lock);
+                if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
                        return -EINTR;
                return 1;
        } else {
@@ -2251,9 +2448,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+                           unsigned long a1)
+{
+        if (is_long_mode(vcpu))
+                return a0;
+        else
+                return a0 | ((gpa_t)a1 << 32);
+}
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
+        int r = 1;
        kvm_x86_ops->cache_regs(vcpu);
@@ -2263,6 +2470,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        a2 = vcpu->arch.regs[VCPU_REGS_RDX];
        a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+        KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
        if (!is_long_mode(vcpu)) {
                nr &= 0xFFFFFFFF;
                a0 &= 0xFFFFFFFF;
@@ -2275,13 +2484,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
                break;
+        case KVM_HC_MMU_OP:
+                r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+                break;
        default:
                ret = -KVM_ENOSYS;
                break;
        }
        vcpu->arch.regs[VCPU_REGS_RAX] = ret;
        kvm_x86_ops->decache_regs(vcpu);
-        return 0;
+        ++vcpu->stat.hypercalls;
+        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -2329,7 +2542,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
                   unsigned long *rflags)
 {
-        lmsw(vcpu, msw);
+        kvm_lmsw(vcpu, msw);
        *rflags = kvm_x86_ops->get_rflags(vcpu);
 }
@@ -2346,9 +2559,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
        case 4:
                return vcpu->arch.cr4;
        case 8:
-                return get_cr8(vcpu);
+                return kvm_get_cr8(vcpu);
        default:
-                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
                return 0;
        }
 }
@@ -2358,23 +2571,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
 {
        switch (cr) {
        case 0:
-                set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
                *rflags = kvm_x86_ops->get_rflags(vcpu);
                break;
        case 2:
                vcpu->arch.cr2 = val;
                break;
        case 3:
-                set_cr3(vcpu, val);
+                kvm_set_cr3(vcpu, val);
                break;
        case 4:
-                set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+                kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
                break;
        case 8:
-                set_cr8(vcpu, val & 0xfUL);
+                kvm_set_cr8(vcpu, val & 0xfUL);
                break;
        default:
-                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
        }
 }
@@ -2447,6 +2660,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
        }
        kvm_x86_ops->decache_regs(vcpu);
        kvm_x86_ops->skip_emulated_instruction(vcpu);
+        KVMTRACE_5D(CPUID, vcpu, function,
+                    (u32)vcpu->arch.regs[VCPU_REGS_RAX],
+                    (u32)vcpu->arch.regs[VCPU_REGS_RBX],
+                    (u32)vcpu->arch.regs[VCPU_REGS_RCX],
+                    (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -2469,7 +2687,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
                              struct kvm_run *kvm_run)
 {
        kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-        kvm_run->cr8 = get_cr8(vcpu);
+        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
        if (irqchip_in_kernel(vcpu->kvm))
                kvm_run->ready_for_interrupt_injection = 1;
@@ -2509,16 +2727,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
-        if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
                pr_debug("vcpu %d received sipi with vector # %x\n",
                       vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
                r = kvm_x86_ops->vcpu_reset(vcpu);
                if (r)
                        return r;
-                vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        }
+        down_read(&vcpu->kvm->slots_lock);
        vapic_enter(vcpu);
 preempted:
@@ -2526,6 +2745,10 @@ preempted:
                kvm_x86_ops->guest_debug_pre(vcpu);
 again:
+        if (vcpu->requests)
+                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+                        kvm_mmu_unload(vcpu);
        r = kvm_mmu_reload(vcpu);
        if (unlikely(r))
                goto out;
@@ -2539,6 +2762,11 @@ again:
                        r = 0;
                        goto out;
                }
+                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+                        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+                        r = 0;
+                        goto out;
+                }
        }
        kvm_inject_pending_timer_irqs(vcpu);
@@ -2557,6 +2785,14 @@ again:
                goto out;
        }
+        if (vcpu->requests)
+                if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
+                        local_irq_enable();
+                        preempt_enable();
+                        r = 1;
+                        goto out;
+                }
        if (signal_pending(current)) {
                local_irq_enable();
                preempt_enable();
@@ -2566,6 +2802,13 @@ again:
                goto out;
        }
+        vcpu->guest_mode = 1;
+        /*
+         * Make sure that guest_mode assignment won't happen after
+         * testing the pending IRQ vector bitmap.
+         */
+        smp_wmb();
        if (vcpu->arch.exception.pending)
                __queue_exception(vcpu);
        else if (irqchip_in_kernel(vcpu->kvm))
@@ -2575,13 +2818,15 @@ again:
        kvm_lapic_sync_to_vapic(vcpu);
-        vcpu->guest_mode = 1;
+        up_read(&vcpu->kvm->slots_lock);
        kvm_guest_enter();
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
                        kvm_x86_ops->tlb_flush(vcpu);
+        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
        kvm_x86_ops->run(vcpu, kvm_run);
        vcpu->guest_mode = 0;
@@ -2601,6 +2846,8 @@ again:
        preempt_enable();
+        down_read(&vcpu->kvm->slots_lock);
        /*
         * Profile KVM exit RIPs:
         */
@@ -2628,14 +2875,18 @@ again:
        }
 out:
+        up_read(&vcpu->kvm->slots_lock);
        if (r > 0) {
                kvm_resched(vcpu);
+                down_read(&vcpu->kvm->slots_lock);
                goto preempted;
        }
        post_kvm_run_save(vcpu, kvm_run);
+        down_read(&vcpu->kvm->slots_lock);
        vapic_exit(vcpu);
+        up_read(&vcpu->kvm->slots_lock);
        return r;
 }
@@ -2647,7 +2898,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        vcpu_load(vcpu);
-        if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
                vcpu_put(vcpu);
                return -EAGAIN;
@@ -2658,7 +2909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        /* re-sync apic's tpr */
        if (!irqchip_in_kernel(vcpu->kvm))
-                set_cr8(vcpu, kvm_run->cr8);
+                kvm_set_cr8(vcpu, kvm_run->cr8);
        if (vcpu->arch.pio.cur_count) {
                r = complete_pio(vcpu);
@@ -2670,9 +2921,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                vcpu->mmio_read_completed = 1;
                vcpu->mmio_needed = 0;
+                down_read(&vcpu->kvm->slots_lock);
                r = emulate_instruction(vcpu, kvm_run,
                                        vcpu->arch.mmio_fault_cr2, 0,
                                        EMULTYPE_NO_DECODE);
+                up_read(&vcpu->kvm->slots_lock);
                if (r == EMULATE_DO_MMIO) {
                        /*
                         * Read-modify-write.  Back to userspace.
@@ -2773,7 +3027,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 static void get_segment(struct kvm_vcpu *vcpu,
                        struct kvm_segment *var, int seg)
 {
-        return kvm_x86_ops->get_segment(vcpu, var, seg);
+        kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2816,7 +3070,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        sregs->cr2 = vcpu->arch.cr2;
        sregs->cr3 = vcpu->arch.cr3;
        sregs->cr4 = vcpu->arch.cr4;
-        sregs->cr8 = get_cr8(vcpu);
+        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.shadow_efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
@@ -2836,12 +3090,438 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
+/*
+ * Report the vcpu's multiprocessing state: copies vcpu->arch.mp_state into
+ * mp_state->mp_state.  The read is bracketed by vcpu_load()/vcpu_put().
+ * Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        vcpu_load(vcpu);
+        mp_state->mp_state = vcpu->arch.mp_state;
+        vcpu_put(vcpu);
+        return 0;
+}
+/*
+ * Overwrite the vcpu's multiprocessing state with mp_state->mp_state.  The
+ * value is stored as given; no validation is done in this function.  The
+ * write is bracketed by vcpu_load()/vcpu_put().  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+        vcpu_load(vcpu);
+        vcpu->arch.mp_state = mp_state->mp_state;
+        vcpu_put(vcpu);
+        return 0;
+}
 static void set_segment(struct kvm_vcpu *vcpu,
                        struct kvm_segment *var, int seg)
 {
-        return kvm_x86_ops->set_segment(vcpu, var, seg);
+        kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+/*
+ * Translate a raw segment descriptor (*seg_desc) plus the selector that
+ * referenced it into the kvm_segment representation (*kvm_desct).
+ *
+ * The base and limit are reassembled from their split descriptor fields and
+ * the attribute bits are copied one by one.  A null selector marks the
+ * resulting segment unusable.
+ *
+ * NOTE(review): the limit is copied as stored in the descriptor; the g bit
+ * is only forwarded, not applied to the limit -- confirm consumers expect
+ * that.
+ */
+static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
+                                   struct kvm_segment *kvm_desct)
+{
+        /*
+         * Do the shifts in u32: narrow fields promote to int, and shifting
+         * a value with bit 7 set left by 24 would reach the sign bit and
+         * could sign-extend into a wider destination.
+         */
+        kvm_desct->base = seg_desc->base0;
+        kvm_desct->base |= (u32)seg_desc->base1 << 16;
+        kvm_desct->base |= (u32)seg_desc->base2 << 24;
+        kvm_desct->limit = seg_desc->limit0;
+        kvm_desct->limit |= (u32)seg_desc->limit << 16;
+        kvm_desct->selector = selector;
+        kvm_desct->type = seg_desc->type;
+        kvm_desct->present = seg_desc->p;
+        kvm_desct->dpl = seg_desc->dpl;
+        kvm_desct->db = seg_desc->d;
+        kvm_desct->s = seg_desc->s;
+        kvm_desct->l = seg_desc->l;
+        kvm_desct->g = seg_desc->g;
+        kvm_desct->avl = seg_desc->avl;
+        /* a null selector yields an unusable segment */
+        kvm_desct->unusable = selector ? 0 : 1;
+        kvm_desct->padding = 0;
+}
+/*
+ * Fill *dtable with the base and limit of the descriptor table that
+ * 'selector' refers to.  Bit 2 of the selector picks the table: clear means
+ * the GDT (queried through kvm_x86_ops->get_gdt), set means the table
+ * described by the current LDTR segment.  An unusable LDTR is reported with
+ * a limit of 0.
+ */
+static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
+                                           u16 selector,
+                                           struct descriptor_table *dtable)
+{
+        struct kvm_segment ldtr;
+        if (!(selector & (1 << 2))) {
+                kvm_x86_ops->get_gdt(vcpu, dtable);
+                return;
+        }
+        get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+        dtable->limit = ldtr.unusable ? 0 : ldtr.limit;
+        dtable->base = ldtr.base;
+}
+/*
+ * Read the descriptor that 'selector' refers to from the guest's descriptor
+ * table into *seg_desc.  Only 8-byte descriptors are handled.
+ *
+ * If the descriptor does not fit inside the table limit, a #GP exception is
+ * queued with the selector (low two bits cleared) as error code and 1 is
+ * returned.  Otherwise the result of kvm_read_guest() is returned.
+ *
+ * NOTE(review): dtable.base + index * 8 is handed to kvm_read_guest()
+ * without any address translation -- confirm that is what it expects.
+ */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+                                         struct desc_struct *seg_desc)
+{
+        struct descriptor_table dtable;
+        u16 index = selector >> 3;
+        get_segment_descritptor_dtable(vcpu, selector, &dtable);
+        /* the last byte of the 8-byte entry must lie within the limit */
+        if (dtable.limit < index * 8 + 7) {
+                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+                return 1;
+        }
+        return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+/*
+ * Write *seg_desc back to the guest descriptor table slot that 'selector'
+ * refers to.  Only 8-byte descriptors are handled.
+ *
+ * Returns 1 without queueing any exception if the slot lies beyond the
+ * table limit; otherwise returns the result of kvm_write_guest().
+ */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+                                         struct desc_struct *seg_desc)
+{
+        struct descriptor_table dtable;
+        u16 index = selector >> 3;
+        get_segment_descritptor_dtable(vcpu, selector, &dtable);
+        /* the last byte of the 8-byte entry must lie within the limit */
+        if (dtable.limit < index * 8 + 7)
+                return 1;
+        return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+/*
+ * Reassemble the 32-bit base address of a TSS from the three base fields of
+ * its descriptor.  vcpu is not used.
+ */
+static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+                             struct desc_struct *seg_desc)
+{
+        u32 base_addr;
+        /*
+         * Shift in u32: narrow fields promote to int, and a left shift by
+         * 24 of a value with bit 7 set would otherwise hit the sign bit.
+         */
+        base_addr = seg_desc->base0;
+        base_addr |= (u32)seg_desc->base1 << 16;
+        base_addr |= (u32)seg_desc->base2 << 24;
+        return base_addr;
+}
+/*
+ * Copy the 32-bit TSS described by *seg_desc from guest memory into *tss.
+ * Returns the result of kvm_read_guest().
+ */
+static int load_tss_segment32(struct kvm_vcpu *vcpu,
+                              struct desc_struct *seg_desc,
+                              struct tss_segment_32 *tss)
+{
+        return kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, seg_desc),
+                              tss, sizeof(*tss));
+}
+/*
+ * Copy *tss into guest memory at the base of the 32-bit TSS described by
+ * *seg_desc.  Returns the result of kvm_write_guest().
+ */
+static int save_tss_segment32(struct kvm_vcpu *vcpu,
+                              struct desc_struct *seg_desc,
+                              struct tss_segment_32 *tss)
+{
+        return kvm_write_guest(vcpu->kvm, get_tss_base_addr(vcpu, seg_desc),
+                               tss, sizeof(*tss));
+}
+/*
+ * Copy the 16-bit TSS described by *seg_desc from guest memory into *tss.
+ * Returns the result of kvm_read_guest().
+ */
+static int load_tss_segment16(struct kvm_vcpu *vcpu,
+                              struct desc_struct *seg_desc,
+                              struct tss_segment_16 *tss)
+{
+        return kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, seg_desc),
+                              tss, sizeof(*tss));
+}
+/*
+ * Copy *tss into guest memory at the base of the 16-bit TSS described by
+ * *seg_desc.  Returns the result of kvm_write_guest().
+ */
+static int save_tss_segment16(struct kvm_vcpu *vcpu,
+                              struct desc_struct *seg_desc,
+                              struct tss_segment_16 *tss)
+{
+        return kvm_write_guest(vcpu->kvm, get_tss_base_addr(vcpu, seg_desc),
+                               tss, sizeof(*tss));
+}
+/*
+ * Return the selector currently loaded in segment register 'seg' (one of
+ * the VCPU_SREG_* indices), as reported by get_segment().
+ */
+static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+        struct kvm_segment kvm_seg;
+        get_segment(vcpu, &kvm_seg, seg);
+        return kvm_seg.selector;
+}
+/*
+ * Fetch the guest descriptor for 'selector' and convert it into *kvm_seg.
+ * Returns 1 if the descriptor could not be loaded (*kvm_seg is then left
+ * untouched), 0 on success.
+ */
+static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
+                                                u16 selector,
+                                                struct kvm_segment *kvm_seg)
+{
+        struct desc_struct seg_desc;
+        if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
+                return 1;
+        seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
+        return 0;
+}
+/*
+ * Load segment register 'seg' from the guest descriptor named by
+ * 'selector'.  type_bits is OR-ed into the descriptor type before it is
+ * installed.  For registers other than SS, CS and LDTR, a descriptor whose
+ * s bit is clear is installed as unusable.
+ *
+ * Returns 1 if the descriptor could not be loaded, 0 otherwise.
+ */
+static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+                                   int type_bits, int seg)
+{
+        struct kvm_segment desc;
+        int check_s_bit;
+        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &desc))
+                return 1;
+        desc.type |= type_bits;
+        check_s_bit = !(seg == VCPU_SREG_SS || seg == VCPU_SREG_CS ||
+                        seg == VCPU_SREG_LDTR);
+        if (check_s_bit && !desc.s)
+                desc.unusable = 1;
+        set_segment(vcpu, &desc, seg);
+        return 0;
+}
+/*
+ * Snapshot the vcpu's current state into a 32-bit TSS image: cr3, rip,
+ * rflags, the eight general purpose registers from vcpu->arch.regs, and the
+ * selectors of ES/CS/SS/DS/FS/GS and LDTR.
+ *
+ * NOTE(review): prev_task_link is filled with the current TR selector here,
+ * i.e. in the image of the task being left -- confirm this is intended.
+ */
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+                                struct tss_segment_32 *tss)
+{
+        tss->cr3 = vcpu->arch.cr3;
+        tss->eip = vcpu->arch.rip;
+        tss->eflags = kvm_x86_ops->get_rflags(vcpu);
+        tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
+        tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+        tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
+        tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
+        tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
+        tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
+        tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
+        tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
+        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+        tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+        tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+        tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+        tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+/*
+ * Install the state held in a 32-bit TSS image into the vcpu: cr3, rip,
+ * rflags (bit 1 forced on), the general purpose registers, and then the
+ * segment registers in the order LDTR, ES, CS, SS, DS, FS, GS.
+ *
+ * Segment loading stops at the first descriptor that fails to load.
+ * Returns 1 in that case and 0 when every segment was loaded.
+ */
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+                                  struct tss_segment_32 *tss)
+{
+        kvm_set_cr3(vcpu, tss->cr3);
+        vcpu->arch.rip = tss->eip;
+        kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
+        vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
+        vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
+        vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
+        vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
+        vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
+        vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
+        vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
+        vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+        /* short-circuit evaluation keeps the order and the early exit */
+        return load_segment_descriptor(vcpu, tss->ldt_selector, 0,
+                                       VCPU_SREG_LDTR) ||
+               load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES) ||
+               load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS) ||
+               load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS) ||
+               load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS) ||
+               load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS) ||
+               load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS);
+}
+/*
+ * Snapshot the vcpu's current state into a 16-bit TSS image: rip, rflags,
+ * the eight general purpose registers from vcpu->arch.regs, and the
+ * selectors of ES/CS/SS/DS and LDTR.  Values are stored through the TSS
+ * fields' own types.
+ *
+ * NOTE(review): prev_task_link is filled with the current TR selector here,
+ * i.e. in the image of the task being left -- confirm this is intended.
+ */
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+                                struct tss_segment_16 *tss)
+{
+        tss->ip = vcpu->arch.rip;
+        tss->flag = kvm_x86_ops->get_rflags(vcpu);
+        tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
+        tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
+        tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
+        tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
+        tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
+        tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
+        tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
+        tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+        tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+        tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+/*
+ * Install the state held in a 16-bit TSS image into the vcpu: rip, rflags
+ * (bit 1 forced on), the general purpose registers, and then the segment
+ * registers in the order LDTR, ES, CS, SS, DS.
+ *
+ * Segment loading stops at the first descriptor that fails to load.
+ * Returns 1 in that case and 0 when every segment was loaded.
+ */
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+                                 struct tss_segment_16 *tss)
+{
+        vcpu->arch.rip = tss->ip;
+        kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
+        vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
+        vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
+        vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
+        vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
+        vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
+        vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
+        vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
+        vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+        /* short-circuit evaluation keeps the order and the early exit */
+        return load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR) ||
+               load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES) ||
+               load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS) ||
+               load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS) ||
+               load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS);
+}
+/*
+ * Perform the state hand-over of a task switch between two 16-bit TSSs.
+ *
+ * The current TSS (cseg_desc) is read, updated with the vcpu's present
+ * state and written back; then the new TSS (nseg_desc) is read and its
+ * state installed in the vcpu.  tss_selector is not used here.
+ *
+ * Returns 1 on success and 0 if reading either TSS or loading the new state
+ * failed.  The result of writing back the old TSS is not checked.
+ */
+int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+                       struct desc_struct *cseg_desc,
+                       struct desc_struct *nseg_desc)
+{
+        struct tss_segment_16 tss_image;
+        if (load_tss_segment16(vcpu, cseg_desc, &tss_image))
+                return 0;
+        save_state_to_tss16(vcpu, &tss_image);
+        save_tss_segment16(vcpu, cseg_desc, &tss_image);
+        if (load_tss_segment16(vcpu, nseg_desc, &tss_image))
+                return 0;
+        if (load_state_from_tss16(vcpu, &tss_image))
+                return 0;
+        return 1;
+}
+/*
+ * Perform the state hand-over of a task switch between two 32-bit TSSs.
+ *
+ * The current TSS (cseg_desc) is read, updated with the vcpu's present
+ * state and written back; then the new TSS (nseg_desc) is read and its
+ * state installed in the vcpu.  tss_selector is not used here.
+ *
+ * Returns 1 on success and 0 if reading either TSS or loading the new state
+ * failed.  The result of writing back the old TSS is not checked.
+ */
+int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+                       struct desc_struct *cseg_desc,
+                       struct desc_struct *nseg_desc)
+{
+        struct tss_segment_32 tss_image;
+        if (load_tss_segment32(vcpu, cseg_desc, &tss_image))
+                return 0;
+        save_state_to_tss32(vcpu, &tss_image);
+        save_tss_segment32(vcpu, cseg_desc, &tss_image);
+        if (load_tss_segment32(vcpu, nseg_desc, &tss_image))
+                return 0;
+        if (load_state_from_tss32(vcpu, &tss_image))
+                return 0;
+        return 1;
 }
+/*
+ * Emulate a task switch to the TSS named by tss_selector.
+ *
+ * reason is one of the TASK_SWITCH_* values.  It selects whether the
+ * privilege check is done (skipped for IRET), whether the outgoing TSS is
+ * marked available (IRET, JMP), whether NT is cleared (IRET) or set
+ * (CALL, GATE), and whether the incoming TSS is marked busy (all but IRET).
+ *
+ * Returns 1 if the switch completed or one of the checks below queued an
+ * exception, and 0 if a descriptor or TSS could not be loaded.
+ */
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+        struct kvm_segment tr_seg;
+        struct desc_struct cseg_desc;
+        struct desc_struct nseg_desc;
+        int ret = 0;
+        get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+        if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+                goto out;
+        if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
+                goto out;
+        if (reason != TASK_SWITCH_IRET) {
+                int cpl;
+                cpl = kvm_x86_ops->get_cpl(vcpu);
+                if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+                        return 1;
+                }
+        }
+        /* the new TSS must be present and at least 0x68 bytes long */
+        if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+                return 1;
+        }
+        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+                /*
+                 * Clear the busy flag of the outgoing TSS.  The flag is
+                 * bit 1 of the descriptor type (a busy TSS is loaded into
+                 * TR with type 11 below), so the mask must be 1 << 1.
+                 */
+                cseg_desc.type &= ~(1 << 1);
+                save_guest_segment_descriptor(vcpu, tr_seg.selector,
+                                              &cseg_desc);
+        }
+        if (reason == TASK_SWITCH_IRET) {
+                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+                kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+        }
+        kvm_x86_ops->skip_emulated_instruction(vcpu);
+        kvm_x86_ops->cache_regs(vcpu);
+        /* bit 3 of the type distinguishes a 32-bit TSS from a 16-bit one */
+        if (nseg_desc.type & 8)
+                ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
+                                         &nseg_desc);
+        else
+                ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
+                                         &nseg_desc);
+        if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+                kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+        }
+        if (reason != TASK_SWITCH_IRET) {
+                /* mark the incoming TSS busy (bit 1 of the type) */
+                nseg_desc.type |= (1 << 1);
+                save_guest_segment_descriptor(vcpu, tss_selector,
+                                              &nseg_desc);
+        }
+        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+        seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+        tr_seg.type = 11;
+        set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+        kvm_x86_ops->decache_regs(vcpu);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
@@ -2862,12 +3542,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
-        set_cr8(vcpu, sregs->cr8);
+        kvm_set_cr8(vcpu, sregs->cr8);
        mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
        kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
        kvm_set_apic_base(vcpu, sregs->apic_base);
        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
@@ -3141,9 +3819,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
        if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
-                vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
-                vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page) {
@@ -3175,7 +3853,9 @@ fail:
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
        kvm_free_lapic(vcpu);
+        down_read(&vcpu->kvm->slots_lock);
        kvm_mmu_destroy(vcpu);
+        up_read(&vcpu->kvm->slots_lock);
        free_page((unsigned long)vcpu->arch.pio_data);
 }
@@ -3219,10 +3899,13 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kvm_free_physmem(kvm);
+        if (kvm->arch.apic_access_page)
+                put_page(kvm->arch.apic_access_page);
        kfree(kvm);
 }
@@ -3278,8 +3961,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-        return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
+        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-               || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
 }
 static void vcpu_kick_intr(void *info)
@@ -3293,11 +3976,17 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
        int ipi_pcpu = vcpu->cpu;
+        int cpu = get_cpu();
        if (waitqueue_active(&vcpu->wq)) {
                wake_up_interruptible(&vcpu->wq);
                ++vcpu->stat.halt_wakeup;
        }
-        if (vcpu->guest_mode)
+        /*
+         * We may be called synchronously with irqs disabled in guest mode,
+         * so we need not call smp_call_function_single() in that case.
+         */
+        if (vcpu->guest_mode && vcpu->cpu != cpu)
                smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+        put_cpu();
 }
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 79586003397..2ca08386f99 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -65,6 +65,14 @@
 #define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
 #define String      (1<<10)     /* String instruction (rep capable) */
 #define Stack       (1<<11)     /* Stack instruction (push/pop) */
+#define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
+#define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/*
+ * Group numbers.  An opcode table entry with the Group flag set carries one
+ * of these in its low bits (GroupMask).  Each group owns eight consecutive
+ * slots in group_table, starting at index <group> * 8.
+ */
+enum {
+        Group1_80, Group1_81, Group1_82, Group1_83,
+        Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+};
 static u16 opcode_table[256] = {
        /* 0x00 - 0x07 */
@@ -123,14 +131,14 @@ static u16 opcode_table[256] = {
        ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
        ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
        /* 0x80 - 0x87 */
-        ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+        Group | Group1_80, Group | Group1_81,
-        ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+        Group | Group1_82, Group | Group1_83,
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        /* 0x88 - 0x8F */
        ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
        ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-        0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+        0, ModRM | DstReg, 0, Group | Group1A,
        /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -164,16 +172,15 @@ static u16 opcode_table[256] = {
        0, 0, 0, 0,
        /* 0xF0 - 0xF7 */
        0, 0, 0, 0,
-        ImplicitOps, ImplicitOps,
+        ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
-        ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        /* 0xF8 - 0xFF */
        ImplicitOps, 0, ImplicitOps, ImplicitOps,
-        0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+        0, 0, Group | Group4, Group | Group5,
 };
 static u16 twobyte_table[256] = {
        /* 0x00 - 0x0F */
-        0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+        0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
        ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
        /* 0x10 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -229,6 +236,56 @@ static u16 twobyte_table[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
+/*
+ * Decode flags for opcodes whose modrm reg field (bits 3:5) extends the
+ * opcode.  Indexed by (group << 3) + reg, eight entries per group; an entry
+ * of 0 means the encoding is not emulated.
+ */
+static u16 group_table[] = {
+        /* opcode 0x80 */
+        [Group1_80*8] =
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        /* opcode 0x81 */
+        [Group1_81*8] =
+        DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+        DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+        DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+        DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+        /* opcode 0x82 */
+        [Group1_82*8] =
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+        /* opcode 0x83 */
+        [Group1_83*8] =
+        DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+        DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+        DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+        DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+        /* opcode 0x8F */
+        [Group1A*8] =
+        DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+        /* opcode 0xF6: byte operands */
+        [Group3_Byte*8] =
+        ByteOp | SrcImm | DstMem | ModRM, 0,
+        ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+        0, 0, 0, 0,
+        /* opcode 0xF7: full-size operands, so no entry may carry ByteOp */
+        [Group3*8] =
+        DstMem | SrcImm | ModRM, 0,
+        DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+        0, 0, 0, 0,
+        /* opcode 0xFE */
+        [Group4*8] =
+        ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+        0, 0, 0, 0, 0, 0,
+        /* opcode 0xFF */
+        [Group5*8] =
+        DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+        SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+        /* two-byte opcode 0x0F 0x01, memory forms */
+        [Group7*8] =
+        0, 0, ModRM | SrcMem, ModRM | SrcMem,
+        SrcNone | ModRM | DstMem | Mov, 0,
+        SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
+};
+/*
+ * Alternate decode flags for groups flagged GroupDual: consulted instead of
+ * group_table when the modrm mod field is 3.  Indexed the same way,
+ * (group << 3) + modrm reg field.  Only Group7 has entries here.
+ */
+static u16 group2_table[] = {
+        [Group7*8] =
+        SrcNone | ModRM, 0, 0, 0,
+        SrcNone | ModRM | DstMem | Mov, 0,
+        SrcMem16 | ModRM | Mov, 0,
+};
 /* EFLAGS bit definitions. */
 #define EFLG_OF (1<<11)
 #define EFLG_DF (1<<10)
@@ -317,7 +374,7 @@ static u16 twobyte_table[256] = {
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
        do {                                                                 \
-                unsigned long _tmp;                                          \
+                unsigned long __tmp;                                         \
                switch ((_dst).bytes) {                                      \
                case 1:                                                      \
                        __asm__ __volatile__ (                               \
@@ -325,7 +382,7 @@ static u16 twobyte_table[256] = {
                                _op"b %"_bx"3,%1; "                          \
                                _POST_EFLAGS("0", "4", "2")                  \
                                : "=m" (_eflags), "=m" ((_dst).val),         \
-                                  "=&r" (_tmp)                               \
+                                  "=&r" (__tmp)                              \
                                : _by ((_src).val), "i" (EFLAGS_MASK));      \
                        break;                                               \
                default:                                                     \
@@ -426,29 +483,40 @@ static u16 twobyte_table[256] = {
        (_type)_x;                                                      \
 })
+/*
+ * Mask selecting the low c->ad_bytes bytes of an address.
+ *
+ * The callers visible here (address_mask, register_address_increment) only
+ * use it when ad_bytes != sizeof(unsigned long); for ad_bytes equal to
+ * sizeof(unsigned long) the shift count would equal the width of the type.
+ */
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+        return (1UL << (c->ad_bytes << 3)) - 1;
+}
 /* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg)                                               \
+static inline unsigned long
-        ((c->ad_bytes == sizeof(unsigned long)) ?                       \
+address_mask(struct decode_cache *c, unsigned long reg)
-                (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+{
-#define register_address(base, reg)                                     \
+        if (c->ad_bytes == sizeof(unsigned long))
-        ((base) + address_mask(reg))
+                return reg;
-#define register_address_increment(reg, inc)                            \
+        else
-        do {                                                            \
+                return reg & ad_mask(c);
-                /* signed type ensures sign extension to long */        \
+}
-                int _inc = (inc);                                       \
-                if (c->ad_bytes == sizeof(unsigned long))               \
-                        (reg) += _inc;                                  \
-                else                                                    \
-                        (reg) = ((reg) &                                \
-                                 ~((1UL << (c->ad_bytes << 3)) - 1)) |  \
-                                (((reg) + _inc) &                       \
-                                 ((1UL << (c->ad_bytes << 3)) - 1));    \
-        } while (0)
-#define JMP_REL(rel)                                                    \
+static inline unsigned long
-        do {                                                            \
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
-                register_address_increment(c->eip, rel);                \
+{
-        } while (0)
+        return base + address_mask(c, reg);
+}
+/*
+ * Add 'inc' (may be negative) to the address register *reg, honouring the
+ * current address size.  With a full-width address size the addition is
+ * plain; otherwise only the low ad_bytes bytes are updated (wrapping within
+ * that width) and the upper bits of *reg are preserved.
+ */
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+        if (c->ad_bytes == sizeof(unsigned long))
+                *reg += inc;
+        else
+                *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
+/*
+ * Relative jump: advance c->eip by 'rel', with the address-size wrapping
+ * applied by register_address_increment().
+ */
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+        register_address_increment(c, &c->eip, rel);
+}
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
                              struct x86_emulate_ops *ops,
@@ -763,7 +831,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        struct decode_cache *c = &ctxt->decode;
        int rc = 0;
        int mode = ctxt->mode;
-        int def_op_bytes, def_ad_bytes;
+        int def_op_bytes, def_ad_bytes, group;
        /* Shadow copy of register state. Committed on successful emulation. */
@@ -864,12 +932,24 @@ done_prefixes:
                        c->b = insn_fetch(u8, 1, c->eip);
                        c->d = twobyte_table[c->b];
                }
+        }
-                /* Unrecognised? */
+        if (c->d & Group) {
-                if (c->d == 0) {
+                group = c->d & GroupMask;
-                        DPRINTF("Cannot emulate %02x\n", c->b);
+                c->modrm = insn_fetch(u8, 1, c->eip);
-                        return -1;
+                --c->eip;
-                }
+                group = (group << 3) + ((c->modrm >> 3) & 7);
+                if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+                        c->d = group2_table[group];
+                else
+                        c->d = group_table[group];
+        }
+        /* Unrecognised? */
+        if (c->d == 0) {
+                DPRINTF("Cannot emulate %02x\n", c->b);
+                return -1;
        }
        if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
@@ -924,6 +1004,7 @@ done_prefixes:
                 */
                if ((c->d & ModRM) && c->modrm_mod == 3) {
                        c->src.type = OP_REG;
+                        c->src.val = c->modrm_val;
                        break;
                }
                c->src.type = OP_MEM;
@@ -967,6 +1048,7 @@ done_prefixes:
        case DstMem:
                if ((c->d & ModRM) && c->modrm_mod == 3) {
                        c->dst.type = OP_REG;
+                        c->dst.val = c->dst.orig_val = c->modrm_val;
                        break;
                }
                c->dst.type = OP_MEM;
@@ -984,8 +1066,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
        c->dst.type  = OP_MEM;
        c->dst.bytes = c->op_bytes;
        c->dst.val = c->src.val;
-        register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+        register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-        c->dst.ptr = (void *) register_address(ctxt->ss_base,
+        c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
                                               c->regs[VCPU_REGS_RSP]);
 }
@@ -995,13 +1077,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
        struct decode_cache *c = &ctxt->decode;
        int rc;
-        rc = ops->read_std(register_address(ctxt->ss_base,
+        rc = ops->read_std(register_address(c, ctxt->ss_base,
                                            c->regs[VCPU_REGS_RSP]),
                           &c->dst.val, c->dst.bytes, ctxt->vcpu);
        if (rc != 0)
                return rc;
-        register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+        register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
        return 0;
 }
@@ -1043,26 +1125,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
        switch (c->modrm_reg) {
        case 0 ... 1:   /* test */
-                /*
-                 * Special case in Grp3: test has an immediate
-                 * source operand.
-                 */
-                c->src.type = OP_IMM;
-                c->src.ptr = (unsigned long *)c->eip;
-                c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-                if (c->src.bytes == 8)
-                        c->src.bytes = 4;
-                switch (c->src.bytes) {
-                case 1:
-                        c->src.val = insn_fetch(s8, 1, c->eip);
-                        break;
-                case 2:
-                        c->src.val = insn_fetch(s16, 2, c->eip);
-                        break;
-                case 4:
-                        c->src.val = insn_fetch(s32, 4, c->eip);
-                        break;
-                }
                emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
                break;
        case 2: /* not */
@@ -1076,7 +1138,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
                rc = X86EMUL_UNHANDLEABLE;
                break;
        }
-done:
        return rc;
 }
@@ -1084,7 +1145,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
                               struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
-        int rc;
        switch (c->modrm_reg) {
        case 0: /* inc */
@@ -1094,36 +1154,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
                emulate_1op("dec", c->dst, ctxt->eflags);
                break;
        case 4: /* jmp abs */
-                if (c->b == 0xff)
+                c->eip = c->src.val;
-                        c->eip = c->dst.val;
-                else {
-                        DPRINTF("Cannot emulate %02x\n", c->b);
-                        return X86EMUL_UNHANDLEABLE;
-                }
                break;
        case 6: /* push */
+                emulate_push(ctxt);
-                /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-                if (ctxt->mode == X86EMUL_MODE_PROT64) {
-                        c->dst.bytes = 8;
-                        rc = ops->read_std((unsigned long)c->dst.ptr,
-                                           &c->dst.val, 8, ctxt->vcpu);
-                        if (rc != 0)
-                                return rc;
-                }
-                register_address_increment(c->regs[VCPU_REGS_RSP],
-                                           -c->dst.bytes);
-                rc = ops->write_emulated(register_address(ctxt->ss_base,
-                                    c->regs[VCPU_REGS_RSP]), &c->dst.val,
-                                    c->dst.bytes, ctxt->vcpu);
-                if (rc != 0)
-                        return rc;
-                c->dst.type = OP_NONE;
                break;
-        default:
-                DPRINTF("Cannot emulate %02x\n", c->b);
-                return X86EMUL_UNHANDLEABLE;
        }
        return 0;
 }
@@ -1361,19 +1396,19 @@ special_insn:
                c->dst.type  = OP_MEM;
                c->dst.bytes = c->op_bytes;
                c->dst.val = c->src.val;
-                register_address_increment(c->regs[VCPU_REGS_RSP],
+                register_address_increment(c, &c->regs[VCPU_REGS_RSP],
                                           -c->op_bytes);
                c->dst.ptr = (void *) register_address(
-                        ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+                        c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
                break;
        case 0x58 ... 0x5f: /* pop reg */
        pop_instruction:
-                if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                if ((rc = ops->read_std(register_address(c, ctxt->ss_base,
                        c->regs[VCPU_REGS_RSP]), c->dst.ptr,
                        c->op_bytes, ctxt->vcpu)) != 0)
                        goto done;
-                register_address_increment(c->regs[VCPU_REGS_RSP],
+                register_address_increment(c, &c->regs[VCPU_REGS_RSP],
                                           c->op_bytes);
                c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
@@ -1393,9 +1428,9 @@ special_insn:
                                1,
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
-                                address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                                address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
                                (ctxt->eflags & EFLG_DF),
-                                register_address(ctxt->es_base,
+                                register_address(c, ctxt->es_base,
                                                 c->regs[VCPU_REGS_RDI]),
                                c->rep_prefix,
                                c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1409,9 +1444,9 @@ special_insn:
                                0,
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
-                                address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                                address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
                                (ctxt->eflags & EFLG_DF),
-                                register_address(c->override_base ?
+                                register_address(c, c->override_base ?
                                                        *c->override_base :
                                                        ctxt->ds_base,
                                                 c->regs[VCPU_REGS_RSI]),
@@ -1425,7 +1460,7 @@ special_insn:
                int rel = insn_fetch(s8, 1, c->eip);
                if (test_cc(c->b, ctxt->eflags))
-                        JMP_REL(rel);
+                        jmp_rel(c, rel);
                break;
        }
        case 0x80 ... 0x83:     /* Grp1 */
@@ -1477,7 +1512,7 @@ special_insn:
        case 0x88 ... 0x8b:     /* mov */
                goto mov;
        case 0x8d: /* lea r16/r32, m */
-                c->dst.val = c->modrm_val;
+                c->dst.val = c->modrm_ea;
                break;
        case 0x8f:              /* pop (sole member of Grp1a) */
                rc = emulate_grp1a(ctxt, ops);
@@ -1501,27 +1536,27 @@ special_insn:
        case 0xa4 ... 0xa5:     /* movs */
                c->dst.type = OP_MEM;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-                c->dst.ptr = (unsigned long *)register_address(
+                c->dst.ptr = (unsigned long *)register_address(c,
                                                   ctxt->es_base,
                                                   c->regs[VCPU_REGS_RDI]);
-                if ((rc = ops->read_emulated(register_address(
+                if ((rc = ops->read_emulated(register_address(c,
                      c->override_base ? *c->override_base :
                                        ctxt->ds_base,
                                        c->regs[VCPU_REGS_RSI]),
                                        &c->dst.val,
                                        c->dst.bytes, ctxt->vcpu)) != 0)
                        goto done;
-                register_address_increment(c->regs[VCPU_REGS_RSI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RSI],
                                       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
-                register_address_increment(c->regs[VCPU_REGS_RDI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RDI],
                                       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
        case 0xa6 ... 0xa7:     /* cmps */
                c->src.type = OP_NONE; /* Disable writeback. */
                c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-                c->src.ptr = (unsigned long *)register_address(
+                c->src.ptr = (unsigned long *)register_address(c,
                                c->override_base ? *c->override_base :
                                                   ctxt->ds_base,
                                                   c->regs[VCPU_REGS_RSI]);
@@ -1533,7 +1568,7 @@ special_insn:
                c->dst.type = OP_NONE; /* Disable writeback. */
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-                c->dst.ptr = (unsigned long *)register_address(
+                c->dst.ptr = (unsigned long *)register_address(c,
                                                   ctxt->es_base,
                                                   c->regs[VCPU_REGS_RDI]);
                if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
@@ -1546,10 +1581,10 @@ special_insn:
                emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-                register_address_increment(c->regs[VCPU_REGS_RSI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RSI],
                                       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
                                                                  : c->src.bytes);
-                register_address_increment(c->regs[VCPU_REGS_RDI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RDI],
                                       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                                  : c->dst.bytes);
@@ -1557,11 +1592,11 @@ special_insn:
        case 0xaa ... 0xab:     /* stos */
                c->dst.type = OP_MEM;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-                c->dst.ptr = (unsigned long *)register_address(
+                c->dst.ptr = (unsigned long *)register_address(c,
                                                   ctxt->es_base,
                                                   c->regs[VCPU_REGS_RDI]);
                c->dst.val = c->regs[VCPU_REGS_RAX];
-                register_address_increment(c->regs[VCPU_REGS_RDI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RDI],
                                       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
@@ -1569,7 +1604,7 @@ special_insn:
                c->dst.type = OP_REG;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
                c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-                if ((rc = ops->read_emulated(register_address(
+                if ((rc = ops->read_emulated(register_address(c,
                                c->override_base ? *c->override_base :
                                                   ctxt->ds_base,
                                                 c->regs[VCPU_REGS_RSI]),
@@ -1577,7 +1612,7 @@ special_insn:
                                                 c->dst.bytes,
                                                 ctxt->vcpu)) != 0)
                        goto done;
-                register_address_increment(c->regs[VCPU_REGS_RSI],
+                register_address_increment(c, &c->regs[VCPU_REGS_RSI],
                                       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
@@ -1616,14 +1651,14 @@ special_insn:
                        goto cannot_emulate;
                }
                c->src.val = (unsigned long) c->eip;
-                JMP_REL(rel);
+                jmp_rel(c, rel);
                c->op_bytes = c->ad_bytes;
                emulate_push(ctxt);
                break;
        }
        case 0xe9: /* jmp rel */
        case 0xeb: /* jmp rel short */
-                JMP_REL(c->src.val);
+                jmp_rel(c, c->src.val);
                c->dst.type = OP_NONE; /* Disable writeback. */
                break;
        case 0xf4:              /* hlt */
@@ -1690,6 +1725,8 @@ twobyte_insn:
                                goto done;
                        kvm_emulate_hypercall(ctxt->vcpu);
+                        /* Disable writeback. */
+                        c->dst.type = OP_NONE;
                        break;
                case 2: /* lgdt */
                        rc = read_descriptor(ctxt, ops, c->src.ptr,
@@ -1697,6 +1734,8 @@ twobyte_insn:
                        if (rc)
                                goto done;
                        realmode_lgdt(ctxt->vcpu, size, address);
+                        /* Disable writeback. */
+                        c->dst.type = OP_NONE;
                        break;
                case 3: /* lidt/vmmcall */
                        if (c->modrm_mod == 3 && c->modrm_rm == 1) {
@@ -1712,27 +1751,25 @@ twobyte_insn:
                                        goto done;
                                realmode_lidt(ctxt->vcpu, size, address);
                        }
+                        /* Disable writeback. */
+                        c->dst.type = OP_NONE;
                        break;
                case 4: /* smsw */
-                        if (c->modrm_mod != 3)
+                        c->dst.bytes = 2;
-                                goto cannot_emulate;
+                        c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
-                        *(u16 *)&c->regs[c->modrm_rm]
-                                = realmode_get_cr(ctxt->vcpu, 0);
                        break;
                case 6: /* lmsw */
-                        if (c->modrm_mod != 3)
+                        realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
-                                goto cannot_emulate;
+                                      &ctxt->eflags);
-                        realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
-                                                  &ctxt->eflags);
                        break;
                case 7: /* invlpg*/
                        emulate_invlpg(ctxt->vcpu, memop);
+                        /* Disable writeback. */
+                        c->dst.type = OP_NONE;
                        break;
                default:
                        goto cannot_emulate;
                }
-                /* Disable writeback. */
-                c->dst.type = OP_NONE;
                break;
        case 0x06:
                emulate_clts(ctxt->vcpu);
@@ -1823,7 +1860,7 @@ twobyte_insn:
                        goto cannot_emulate;
                }
                if (test_cc(c->b, ctxt->eflags))
-                        JMP_REL(rel);
+                        jmp_rel(c, rel);
                c->dst.type = OP_NONE;
                break;
        }
diff --git a/drivers/s390/Makefile b/drivers/s390/Makefile
index 5a888704a8d..4f4e7cf105d 100644
--- a/drivers/s390/Makefile
+++ b/drivers/s390/Makefile
@@ -5,7 +5,7 @@
 CFLAGS_sysinfo.o += -Iinclude/math-emu -Iarch/s390/math-emu -w
 obj-y += s390mach.o sysinfo.o s390_rdev.o
-obj-y += cio/ block/ char/ crypto/ net/ scsi/
+obj-y += cio/ block/ char/ crypto/ net/ scsi/ kvm/
 drivers-y += drivers/s390/built-in.o
diff --git a/drivers/s390/kvm/Makefile b/drivers/s390/kvm/Makefile
new file mode 100644
index 00000000000..4a5ec39f9ca
--- /dev/null
+++ b/drivers/s390/kvm/Makefile
@@ -0,0 +1,9 @@
+# Makefile for kvm guest drivers on s390
+#
+# Copyright IBM Corp. 2008
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (version 2 only)
+# as published by the Free Software Foundation.
+obj-$(CONFIG_VIRTIO) += kvm_virtio.o
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
new file mode 100644
index 00000000000..bbef3764fbf
--- /dev/null
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -0,0 +1,338 @@
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/interrupt.h>
+#include <linux/virtio_ring.h>
+#include <asm/io.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#include <asm/setup.h>
+#include <asm/s390_ext.h>
+#define VIRTIO_SUBCODE_64 0x0D00
+/*
+ * The pointer to our page of device descriptions.
+ */
+static void *kvm_devices;
+/*
+ * Unique numbering for kvm devices.
+ */
+static unsigned int dev_index;
+struct kvm_device {
+        struct virtio_device vdev;      /* embedded virtio device, see to_kvmdev() */
+        struct kvm_device_desc *desc;   /* descriptor this device was created from */
+};
+#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
+/*
+ * Layout of one device entry in the descriptor page:
+ * - struct kvm_device_desc (the device descriptor)
+ * - desc->num_vq entries of struct kvm_vqconfig (virtqueue configuration)
+ * - feature bits (2 * desc->feature_len bytes)
+ * - config space (desc->config_len bytes)
+ *
+ * The virtqueue configuration array starts directly behind the descriptor.
+ */
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+        const struct kvm_device_desc *end_of_desc = &desc[1];
+        return (struct kvm_vqconfig *)end_of_desc;
+}
+/* The feature bits follow the desc->num_vq virtqueue configuration entries. */
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+        struct kvm_vqconfig *past_vqs = kvm_vq_config(desc) + desc->num_vq;
+        return (u8 *)past_vqs;
+}
+/* The config space follows the 2 * desc->feature_len bytes of feature bits. */
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+        u8 *features = kvm_vq_features(desc);
+        return &features[desc->feature_len * 2];
+}
+/*
+ * Number of bytes this device occupies in the config page: the descriptor
+ * itself, its virtqueue configurations, both feature bitmaps and the
+ * config space.
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+        size_t total = sizeof(*desc);
+        total += desc->num_vq * sizeof(struct kvm_vqconfig);
+        total += desc->feature_len * 2;
+        total += desc->config_len;
+        return total;
+}
+/*
+ * Test (and acknowledge) a single feature bit.
+ *
+ * The feature area returned by kvm_vq_features() holds two bitmaps of
+ * desc->feature_len bytes each: the bits offered by the host, followed by
+ * the bits acknowledged by the guest.
+ *
+ * Returns true if the host offers @fbit (the bit is then also set in the
+ * acknowledge bitmap), false if the bit is not offered or lies outside the
+ * bitmap.
+ */
+static bool kvm_feature(struct virtio_device *vdev, unsigned fbit)
+{
+        struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+        u8 *features;
+        /*
+         * Valid byte indices are 0 .. feature_len - 1.  An index equal to
+         * feature_len would read from the acknowledge half and write past
+         * the end of the feature area, so it must be rejected as well.
+         */
+        if (fbit / 8 >= desc->feature_len)
+                return false;
+        features = kvm_vq_features(desc);
+        if (!(features[fbit / 8] & (1 << (fbit % 8))))
+                return false;
+        /*
+         * We set the matching bit in the other half of the bitmap to tell the
+         * Host we want to use this feature.
+         */
+        features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8));
+        return true;
+}
+/*
+ * Copy @len bytes starting at @offset out of the device's config space
+ * into @buf.  Accesses beyond desc->config_len trigger BUG_ON().
+ */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+                   void *buf, unsigned len)
+{
+        struct kvm_device *kdev = to_kvmdev(vdev);
+        struct kvm_device_desc *desc = kdev->desc;
+        const u8 *from;
+        BUG_ON(offset + len > desc->config_len);
+        from = kvm_vq_configspace(desc) + offset;
+        memcpy(buf, from, len);
+}
+/*
+ * Copy @len bytes from @buf into the device's config space at @offset.
+ * Accesses beyond desc->config_len trigger BUG_ON().
+ */
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+                   const void *buf, unsigned len)
+{
+        struct kvm_device *kdev = to_kvmdev(vdev);
+        struct kvm_device_desc *desc = kdev->desc;
+        u8 *to;
+        BUG_ON(offset + len > desc->config_len);
+        to = kvm_vq_configspace(desc) + offset;
+        memcpy(to, buf, len);
+}
+/*
+ * The status word lives in the status field of the device descriptor.
+ * Reading it is a plain memory access; writing it (kvm_set_status) also
+ * issues a hypercall so the host learns about the change.
+ */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+        struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+        return desc->status;
+}
+/*
+ * Store a new (non-zero) status in the device descriptor and tell the host
+ * about it, passing the descriptor address as hypercall argument.
+ */
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+        struct kvm_device_desc *desc;
+        BUG_ON(status == 0);
+        desc = to_kvmdev(vdev)->desc;
+        desc->status = status;
+        kvm_hypercall1(KVM_S390_VIRTIO_SET_STATUS, (unsigned long) desc);
+}
+/*
+ * Reset the device through the KVM_S390_VIRTIO_RESET hypercall, identifying
+ * the device by its descriptor address. The Host will zero the status and
+ * all the features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+        struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+        kvm_hypercall1(KVM_S390_VIRTIO_RESET, (unsigned long) desc);
+}
+/*
+ * Notification hook handed to the virtio_ring code: kick the Host with a
+ * hypercall.  The address stored in the virtqueue's configuration tells
+ * the Host which virtqueue is meant.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+        const struct kvm_vqconfig *vq_cfg = vq->priv;
+        kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, vq_cfg->address);
+}
+/*
+ * This routine looks up the virtqueue with the given index in the
+ * configuration of this device and sets it up.
+ *
+ * Returns the new virtqueue, ERR_PTR(-ENOENT) if index is not below
+ * desc->num_vq, or ERR_PTR(-ENOMEM) if adding the shared memory or creating
+ * the vring fails.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+                                    unsigned index,
+                                    void (*callback)(struct virtqueue *vq))
+{
+        struct kvm_device *kdev = to_kvmdev(vdev);
+        struct kvm_vqconfig *config;
+        struct virtqueue *vq;
+        int err;
+        if (index >= kdev->desc->num_vq)
+                return ERR_PTR(-ENOENT);
+        config = kvm_vq_config(kdev->desc)+index;
+        if (add_shared_memory(config->address,
+                                vring_size(config->num, PAGE_SIZE))) {
+                err = -ENOMEM;
+                goto out;
+        }
+        vq = vring_new_virtqueue(config->num, vdev, (void *) config->address,
+                                 kvm_notify, callback);
+        if (!vq) {
+                err = -ENOMEM;
+                goto unmap;
+        }
+        /*
+         * register a callback token
+         * The host will send this via the external interrupt parameter
+         */
+        config->token = (u64) vq;
+        vq->priv = config;      /* lets kvm_notify/kvm_del_vq find the config */
+        return vq;
+unmap:
+        remove_shared_memory(config->address, vring_size(config->num,
+                             PAGE_SIZE));
+out:
+        return ERR_PTR(err);
+}
+/*
+ * Tear down a virtqueue created by kvm_find_vq: delete the vring, then
+ * remove the shared memory that backed it.  The configuration pointer is
+ * fetched before the virtqueue is deleted.
+ */
+static void kvm_del_vq(struct virtqueue *vq)
+{
+        struct kvm_vqconfig *vq_cfg = vq->priv;
+        vring_del_virtqueue(vq);
+        remove_shared_memory(vq_cfg->address,
+                             vring_size(vq_cfg->num, PAGE_SIZE));
+}
+/*
+ * The config ops structure as defined by virtio config.
+ * Each kvm device gets a pointer to this table in add_kvm_device().
+ */
+static struct virtio_config_ops kvm_vq_configspace_ops = {
+        .feature = kvm_feature,
+        .get = kvm_get,
+        .set = kvm_set,
+        .get_status = kvm_get_status,
+        .set_status = kvm_set_status,
+        .reset = kvm_reset,
+        .find_vq = kvm_find_vq,
+        .del_vq = kvm_del_vq,
+};
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_s390/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device kvm_root = {
+        .parent = NULL,                 /* no parent device */
+        .bus_id = "kvm_s390",
+};
+/*
+ * Allocate a kvm_device for descriptor @d and register it with virtio;
+ * appropriate drivers are loaded by the device model.  Every call consumes
+ * one dev_index value, even when the allocation fails.
+ */
+static void add_kvm_device(struct kvm_device_desc *d)
+{
+        struct kvm_device *kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+        struct virtio_device *vdev;
+        if (kdev == NULL) {
+                printk(KERN_EMERG "Cannot allocate kvm dev %u\n",
+                       dev_index++);
+                return;
+        }
+        vdev = &kdev->vdev;
+        vdev->dev.parent = &kvm_root;
+        vdev->index = dev_index++;
+        vdev->id.device = d->type;
+        vdev->config = &kvm_vq_configspace_ops;
+        kdev->desc = d;
+        if (register_virtio_device(vdev) == 0)
+                return;
+        /* Registration failed: report it and drop the unused device. */
+        printk(KERN_ERR "Failed to register kvm device %u\n",
+               vdev->index);
+        kfree(kdev);
+}
+/*
+ * Walk the device page descriptor by descriptor and add a kvm device for
+ * each one.  A descriptor of type 0 marks the end of the device list.
+ */
+static void scan_devices(void)
+{
+        unsigned int offset = 0;
+        while (offset < PAGE_SIZE) {
+                struct kvm_device_desc *desc = kvm_devices + offset;
+                if (!desc->type)
+                        return;
+                add_kvm_device(desc);
+                /* The next descriptor starts right behind this device. */
+                offset += desc_size(desc);
+        }
+}
+/*
+ * we emulate the request_irq behaviour on top of s390 extints
+ *
+ * The interrupt parameter is read from the lowcore at __LC_PFAULT_INTPARM
+ * and handed to vring_interrupt().  Interrupts whose subcode (taken from
+ * S390_lowcore.cpu_addr) does not have VIRTIO_SUBCODE_64 in its upper byte
+ * are ignored.
+ * NOTE(review): the parameter is presumably the token stored by
+ * kvm_find_vq, i.e. the virtqueue pointer - confirm against the host side.
+ */
+static void kvm_extint_handler(u16 code)
+{
+        void *data = (void *) *(long *) __LC_PFAULT_INTPARM;
+        u16 subcode = S390_lowcore.cpu_addr;
+        if ((subcode & 0xff00) != VIRTIO_SUBCODE_64)
+                return;
+        vring_interrupt(0, data);
+}
+/*
+ * Init function for virtio
+ * devices are in a single page above top of "normal" mem
+ *
+ * Registers the kvm_s390 root device, adds the device page (located at
+ * max_pfn << PAGE_SHIFT) as shared memory, hooks up the external interrupt
+ * handler and finally scans the page for devices.
+ *
+ * Returns 0 on success, -ENODEV when not running under KVM, the error from
+ * device_register(), or -ENOMEM if the device page cannot be added.
+ */
+static int __init kvm_devices_init(void)
+{
+        int rc;
+        if (!MACHINE_IS_KVM)
+                return -ENODEV;
+        rc = device_register(&kvm_root);
+        if (rc) {
+                /* Terminate the message so it does not merge with the next one. */
+                printk(KERN_ERR "Could not register kvm_s390 root device\n");
+                return rc;
+        }
+        if (add_shared_memory((max_pfn) << PAGE_SHIFT, PAGE_SIZE)) {
+                device_unregister(&kvm_root);
+                return -ENOMEM;
+        }
+        kvm_devices  = (void *) (max_pfn << PAGE_SHIFT);
+        /* NOTE(review): presumably enables the extint subclass in CR0 - confirm */
+        ctl_set_bit(0, 9);
+        register_external_interrupt(0x2603, kvm_extint_handler);
+        scan_devices();
+        return 0;
+}
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
diff --git a/include/asm-ia64/gcc_intrin.h b/include/asm-ia64/gcc_intrin.h
index de2ed2cbdd8..2fe292c275f 100644
--- a/include/asm-ia64/gcc_intrin.h
+++ b/include/asm-ia64/gcc_intrin.h
@@ -21,6 +21,10 @@
 #define ia64_invala_fr(regnum)  asm volatile ("invala.e f%0" :: "i"(regnum))
+#define ia64_flushrs() asm volatile ("flushrs;;":::"memory")
+#define ia64_loadrs() asm volatile ("loadrs;;":::"memory")
 extern void ia64_bad_param_for_setreg (void);
 extern void ia64_bad_param_for_getreg (void);
@@ -517,6 +521,14 @@ do {										\
 #define ia64_ptrd(addr, size)                                           \
        asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
+#define ia64_ttag(addr)                                                 \
+({                                                                        \
+        __u64 ia64_intri_res;                                              \
+        asm volatile ("ttag %0=%1" : "=r"(ia64_intri_res) : "r" (addr));   \
+        ia64_intri_res;                                                  \
+})
 /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
 #define ia64_lfhint_none   0
diff --git a/include/asm-ia64/kvm.h b/include/asm-ia64/kvm.h
index 030d29b4b26..eb2d3559d08 100644
--- a/include/asm-ia64/kvm.h
+++ b/include/asm-ia64/kvm.h
@@ -1,6 +1,205 @@
-#ifndef __LINUX_KVM_IA64_H
+#ifndef __ASM_IA64_KVM_H
-#define __LINUX_KVM_IA64_H
+#define __ASM_IA64_KVM_H
-/* ia64 does not support KVM */
+/*
+ * asm-ia64/kvm.h: kvm structure definitions  for ia64
+ *
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <asm/types.h>
+#include <asm/fpu.h>
+#include <linux/ioctl.h>
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+#define KVM_IOAPIC_NUM_PINS  24
+struct kvm_ioapic_state {
+        __u64 base_address;
+        __u32 ioregsel;
+        __u32 id;
+        __u32 irr;
+        __u32 pad;
+        union {
+                __u64 bits;             /* whole entry as one 64-bit value */
+                struct {
+                        __u8 vector;
+                        __u8 delivery_mode:3;
+                        __u8 dest_mode:1;
+                        __u8 delivery_status:1;
+                        __u8 polarity:1;
+                        __u8 remote_irr:1;
+                        __u8 trig_mode:1;
+                        __u8 mask:1;
+                        __u8 reserve:7;
+                        __u8 reserved[4];
+                        __u8 dest_id;
+                } fields;               /* same entry, field by field */
+        } redirtbl[KVM_IOAPIC_NUM_PINS];        /* one entry per pin */
+};
+#define KVM_IRQCHIP_PIC_MASTER   0
+#define KVM_IRQCHIP_PIC_SLAVE    1
+#define KVM_IRQCHIP_IOAPIC       2
+/* Parenthesized so the value stays intact inside larger expressions. */
+#define KVM_CONTEXT_SIZE        (8*1024)
+/*
+ * Register context.  The dummy member sizes the union to at least
+ * KVM_CONTEXT_SIZE bytes; the anonymous struct holds the saved registers.
+ */
+union context {
+        /* 8K size */
+        char    dummy[KVM_CONTEXT_SIZE];
+        struct {
+                unsigned long       psr;
+                unsigned long       pr;
+                unsigned long       caller_unat;
+                unsigned long       pad;
+                unsigned long       gr[32];
+                unsigned long       ar[128];
+                unsigned long       br[8];
+                unsigned long       cr[128];
+                unsigned long       rr[8];
+                unsigned long       ibr[8];
+                unsigned long       dbr[8];
+                unsigned long       pkr[8];
+                struct ia64_fpreg   fr[128];
+        };
+};
+struct thash_data {
+        union {
+                struct {
+                        unsigned long p    :  1; /* 0 */
+                        unsigned long rv1  :  1; /* 1 */
+                        unsigned long ma   :  3; /* 2-4 */
+                        unsigned long a    :  1; /* 5 */
+                        unsigned long d    :  1; /* 6 */
+                        unsigned long pl   :  2; /* 7-8 */
+                        unsigned long ar   :  3; /* 9-11 */
+                        unsigned long ppn  : 38; /* 12-49 */
+                        unsigned long rv2  :  2; /* 50-51 */
+                        unsigned long ed   :  1; /* 52 */
+                        unsigned long ig1  : 11; /* 53-63 */
+                };
+                struct {
+                        unsigned long __rv1 : 53;     /* 0-52 */
+                        unsigned long contiguous : 1; /* 53 */
+                        unsigned long tc : 1;         /* 54 TR or TC */
+                        unsigned long cl : 1;
+                        /* 55 I side or D side cache line */
+                        unsigned long len  :  4;      /* 56-59 */
+                        unsigned long io  : 1;  /* 60 entry is for io or not */
+                        unsigned long nomap : 1;
+                        /* 61 entry can't be inserted into machine TLB. */
+                        unsigned long checked : 1;
+                        /* 62 for VTLB/VHPT sanity check */
+                        unsigned long invalid : 1;
+                        /* 63 invalid entry */
+                };
+                unsigned long page_flags;
+        };                  /* same for VHPT and TLB */
+        union {
+                struct {
+                        unsigned long rv3  :  2; /* 0-1 */
+                        unsigned long ps   :  6; /* 2-7 */
+                        unsigned long key  : 24; /* 8-31 */
+                        unsigned long rv4  : 32; /* 32-63 */
+                };
+                unsigned long itir;
+        };
+        union {
+                struct {
+                        unsigned long ig2  :  12; /* 0-11 */
+                        unsigned long vpn  :  49; /* 12-60 */
+                        unsigned long vrn  :   3; /* 61-63 */
+                };
+                unsigned long ifa;
+                unsigned long vadr;
+                struct {
+                        unsigned long tag  :  63; /* 0-62 */
+                        unsigned long ti   :  1;  /* 63 */
+                };
+                unsigned long etag;
+        };
+        union {
+                struct thash_data *next;
+                unsigned long rid;
+                unsigned long gpaddr;
+        };
+};
+#define NITRS   8
+#define NDTRS   8
+struct saved_vpd {      /* NOTE(review): presumably a saved copy of VPD state - confirm */
+        unsigned long  vhpi;
+        unsigned long  vgr[16];
+        unsigned long  vbgr[16];
+        unsigned long  vnat;
+        unsigned long  vbnat;
+        unsigned long  vcpuid[5];
+        unsigned long  vpsr;
+        unsigned long  vpr;
+        unsigned long  vcr[128];
+};
+struct kvm_regs {
+        char *saved_guest;
+        char *saved_stack;
+        struct saved_vpd vpd;
+        /* Arch-regs */
+        int mp_state;
+        unsigned long vmm_rr;
+        /* TR and TC.  */
+        struct thash_data itrs[NITRS];
+        struct thash_data dtrs[NDTRS];
+        /* Bit is set if there is a tr/tc for the region.  */
+        unsigned char itr_regions;
+        unsigned char dtr_regions;
+        unsigned char tc_regions;
+        char irq_check;
+        unsigned long saved_itc;
+        unsigned long itc_check;
+        unsigned long timer_check;
+        unsigned long timer_pending;
+        unsigned long last_itc;
+        unsigned long vrr[8];
+        unsigned long ibr[8];
+        unsigned long dbr[8];
+        unsigned long insvc[4];         /* Interrupt in service.  */
+        unsigned long xtp;
+        unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
+        unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
+        unsigned long metaphysical_saved_rr0; /* from kvm_arch          */
+        unsigned long metaphysical_saved_rr4; /* from kvm_arch          */
+        unsigned long fp_psr;       /* used for lazy float register */
+        unsigned long saved_gp;
+        /* for physical emulation */
+};
+struct kvm_sregs {
+};
+struct kvm_fpu {
+};
 #endif
diff --git a/include/asm-ia64/kvm_host.h b/include/asm-ia64/kvm_host.h
new file mode 100644
index 00000000000..c082c208c1f
--- /dev/null
+++ b/include/asm-ia64/kvm_host.h
@@ -0,0 +1,524 @@
+/*
+ * kvm_host.h: used for kvm module, and hold ia64-specific sections.
+ *
+ * Copyright (C) 2007, Intel Corporation.
+ *
+ * Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_KVM_HOST_H
+#define __ASM_KVM_HOST_H
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+#include <asm/pal.h>
+#include <asm/sal.h>
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+/* define exit reasons from vmm to kvm*/
+#define EXIT_REASON_VM_PANIC            0
+#define EXIT_REASON_MMIO_INSTRUCTION    1
+#define EXIT_REASON_PAL_CALL            2
+#define EXIT_REASON_SAL_CALL            3
+#define EXIT_REASON_SWITCH_RR6          4
+#define EXIT_REASON_VM_DESTROY          5
+#define EXIT_REASON_EXTERNAL_INTERRUPT  6
+#define EXIT_REASON_IPI                 7
+#define EXIT_REASON_PTC_G               8
+/*Define vmm address space and vm data space.*/
+#define KVM_VMM_SIZE (16UL<<20)
+#define KVM_VMM_SHIFT 24
+#define KVM_VMM_BASE 0xD000000000000000UL
+#define VMM_SIZE (8UL<<20)
+/*
+ * Define vm_buffer, used by PAL Services, base address.
+ * Note: vmbuffer is in the VMM-BLOCK, the size must be < 8M
+ */
+#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
+#define KVM_VM_BUFFER_SIZE (8UL<<20)
+/*Define Virtual machine data layout.*/
+#define KVM_VM_DATA_SHIFT  24
+#define KVM_VM_DATA_SIZE (1UL << KVM_VM_DATA_SHIFT)
+#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VMM_SIZE)
+#define KVM_P2M_BASE    KVM_VM_DATA_BASE
+#define KVM_P2M_OFS     0
+#define KVM_P2M_SIZE    (8UL << 20)
+#define KVM_VHPT_BASE   (KVM_P2M_BASE + KVM_P2M_SIZE)
+#define KVM_VHPT_OFS    KVM_P2M_SIZE
+#define KVM_VHPT_BLOCK_SIZE   (2UL << 20)
+#define VHPT_SHIFT      18
+#define VHPT_SIZE       (1UL << VHPT_SHIFT)
+#define VHPT_NUM_ENTRIES (1<<(VHPT_SHIFT-5))
+#define KVM_VTLB_BASE   (KVM_VHPT_BASE+KVM_VHPT_BLOCK_SIZE)
+#define KVM_VTLB_OFS    (KVM_VHPT_OFS+KVM_VHPT_BLOCK_SIZE)
+#define KVM_VTLB_BLOCK_SIZE   (1UL<<20)
+#define VTLB_SHIFT      17
+#define VTLB_SIZE       (1UL<<VTLB_SHIFT)
+#define VTLB_NUM_ENTRIES (1<<(VTLB_SHIFT-5))
+#define KVM_VPD_BASE   (KVM_VTLB_BASE+KVM_VTLB_BLOCK_SIZE)
+#define KVM_VPD_OFS    (KVM_VTLB_OFS+KVM_VTLB_BLOCK_SIZE)
+#define KVM_VPD_BLOCK_SIZE   (2UL<<20)
+#define VPD_SHIFT       16
+#define VPD_SIZE        (1UL<<VPD_SHIFT)
+#define KVM_VCPU_BASE   (KVM_VPD_BASE+KVM_VPD_BLOCK_SIZE)
+#define KVM_VCPU_OFS    (KVM_VPD_OFS+KVM_VPD_BLOCK_SIZE)
+#define KVM_VCPU_BLOCK_SIZE   (2UL<<20)
+#define VCPU_SHIFT 18
+#define VCPU_SIZE (1UL<<VCPU_SHIFT)
+/*
+ * Number of VCPU_SIZE slots that fit in the vcpu block (2MiB / 256KiB = 8).
+ * The replacement list is parenthesized so the division is evaluated before
+ * any operator in the surrounding expression, e.g. x / MAX_VCPU_NUM or
+ * x % MAX_VCPU_NUM.
+ */
+#define MAX_VCPU_NUM (KVM_VCPU_BLOCK_SIZE/VCPU_SIZE)
+#define KVM_VM_BASE     (KVM_VCPU_BASE+KVM_VCPU_BLOCK_SIZE)
+#define KVM_VM_OFS      (KVM_VCPU_OFS+KVM_VCPU_BLOCK_SIZE)
+#define KVM_VM_BLOCK_SIZE     (1UL<<19)
+#define KVM_MEM_DIRTY_LOG_BASE (KVM_VM_BASE+KVM_VM_BLOCK_SIZE)
+#define KVM_MEM_DIRTY_LOG_OFS  (KVM_VM_OFS+KVM_VM_BLOCK_SIZE)
+#define KVM_MEM_DIRTY_LOG_SIZE (1UL<<19)
+/*
+ * Get the vpd, vhpt, vtlb and vcpu base address of slot n: the area base
+ * plus n times the per-slot size.  The argument is parenthesized so that an
+ * expression such as VPD_ADDR(i + 1) scales the whole index rather than only
+ * its last term.
+ */
+#define VPD_ADDR(n) (KVM_VPD_BASE+(n)*VPD_SIZE)
+#define VHPT_ADDR(n) (KVM_VHPT_BASE+(n)*VHPT_SIZE)
+#define VTLB_ADDR(n) (KVM_VTLB_BASE+(n)*VTLB_SIZE)
+#define VCPU_ADDR(n) (KVM_VCPU_BASE+(n)*VCPU_SIZE)
+/*IO section definitions*/
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+#define STATE_IOREQ_NONE        0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+/*Guest Physical address layout.*/
+#define GPFN_MEM        (0UL << 60) /* Guest pfn is normal mem */
+#define GPFN_FRAME_BUFFER   (1UL << 60) /* VGA framebuffer */
+#define GPFN_LOW_MMIO       (2UL << 60) /* Low MMIO range */
+#define GPFN_PIB        (3UL << 60) /* PIB base */
+#define GPFN_IOSAPIC        (4UL << 60) /* IOSAPIC base */
+#define GPFN_LEGACY_IO      (5UL << 60) /* Legacy I/O base */
+#define GPFN_GFW        (6UL << 60) /* Guest Firmware */
+#define GPFN_HIGH_MMIO      (7UL << 60) /* High MMIO range */
+#define GPFN_IO_MASK        (7UL << 60) /* Guest pfn is I/O type */
+#define GPFN_INV_MASK       (1UL << 63) /* Guest pfn is invalid */
+#define INVALID_MFN       (~0UL)
+#define MEM_G   (1UL << 30)
+#define MEM_M   (1UL << 20)
+#define MMIO_START       (3 * MEM_G)
+#define MMIO_SIZE        (512 * MEM_M)
+#define VGA_IO_START     0xA0000UL
+#define VGA_IO_SIZE      0x20000
+#define LEGACY_IO_START  (MMIO_START + MMIO_SIZE)
+#define LEGACY_IO_SIZE   (64 * MEM_M)
+#define IO_SAPIC_START   0xfec00000UL
+#define IO_SAPIC_SIZE    0x100000
+#define PIB_START 0xfee00000UL
+#define PIB_SIZE 0x200000
+#define GFW_START        (4 * MEM_G - 16 * MEM_M)
+#define GFW_SIZE         (16 * MEM_M)
+/*Deliver mode, defined for ioapic.c*/
+#define dest_Fixed IOSAPIC_FIXED
+#define dest_LowestPrio IOSAPIC_LOWEST_PRIORITY
+#define NMI_VECTOR                      2
+#define ExtINT_VECTOR                   0
+#define NULL_VECTOR                     (-1)
+#define IA64_SPURIOUS_INT_VECTOR        0x0f
+#define VCPU_LID(v) (((u64)(v)->vcpu_id) << 24)
+/*
+ *Delivery mode
+ */
+#define SAPIC_DELIV_SHIFT      8
+#define SAPIC_FIXED            0x0
+#define SAPIC_LOWEST_PRIORITY  0x1
+#define SAPIC_PMI              0x2
+#define SAPIC_NMI              0x4
+#define SAPIC_INIT             0x5
+#define SAPIC_EXTINT           0x7
+/*
+ * vcpu->requests bit members for arch
+ */
+#define KVM_REQ_PTC_G           32
+#define KVM_REQ_RESUME          33
+#define KVM_PAGES_PER_HPAGE     1
+struct kvm;
+struct kvm_vcpu;
+struct kvm_guest_debug{
+};
+/*
+ * MMIO request descriptor; carried as the ioreq member of the union in
+ * struct exit_ctl_data.
+ */
+struct kvm_mmio_req {
+        uint64_t addr;          /*  physical address            */
+        uint64_t size;          /*  size in bytes               */
+        uint64_t data;          /*  data (or paddr of data)     */
+        /* NOTE(review): presumably one of the STATE_IOREQ_xxx or
+         * STATE_IORESP_READY values -- confirm against the users. */
+        uint8_t state:4;
+        /* Values match IOREQ_READ (1) and IOREQ_WRITE (0). */
+        uint8_t dir:1;          /*  1=read, 0=write             */
+};
+/*Pal data struct */
+struct kvm_pal_call{
+        /*In area*/
+        uint64_t gr28;
+        uint64_t gr29;
+        uint64_t gr30;
+        uint64_t gr31;
+        /*Out area*/
+        struct ia64_pal_retval ret;
+};
+/* Sal data structure */
+struct kvm_sal_call{
+        /*In area*/
+        uint64_t in0;
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t in5;
+        uint64_t in6;
+        uint64_t in7;
+        struct sal_ret_values ret;
+};
+/*Guest change rr6*/
+struct kvm_switch_rr6 {
+        uint64_t old_rr;
+        uint64_t new_rr;
+};
+union ia64_ipi_a{
+        unsigned long val;
+        struct {
+                unsigned long rv  : 3;
+                unsigned long ir  : 1;
+                unsigned long eid : 8;
+                unsigned long id  : 8;
+                unsigned long ib_base : 44;
+        };
+};
+union ia64_ipi_d {
+        unsigned long val;
+        struct {
+                unsigned long vector : 8;
+                unsigned long dm  : 3;
+                unsigned long ig  : 53;
+        };
+};
+/*ipi check exit data*/
+struct kvm_ipi_data{
+        union ia64_ipi_a addr;
+        union ia64_ipi_d data;
+};
+/*global purge data*/
+struct kvm_ptc_g {
+        unsigned long vaddr;
+        unsigned long rr;
+        unsigned long ps;
+        struct kvm_vcpu *vcpu;
+};
+/*
+ * Exit control data.  An instance is embedded as exit_data in
+ * struct kvm_vcpu_arch.
+ * NOTE(review): exit_reason presumably holds one of the EXIT_REASON_xxx
+ * values and selects which member of u is meaningful -- confirm against the
+ * exit handlers.
+ */
+struct exit_ctl_data{
+        uint32_t exit_reason;
+        uint32_t vm_status;
+        /* Per-exit payload; all members share the same storage. */
+        union {
+                struct kvm_mmio_req     ioreq;
+                struct kvm_pal_call     pal_data;
+                struct kvm_sal_call     sal_data;
+                struct kvm_switch_rr6   rr_data;
+                struct kvm_ipi_data     ipi_data;
+                struct kvm_ptc_g        ptc_g_data;
+        } u;
+};
+union pte_flags {
+        unsigned long val;
+        struct {
+                unsigned long p    :  1; /*0      */
+                unsigned long      :  1; /* 1     */
+                unsigned long ma   :  3; /* 2-4   */
+                unsigned long a    :  1; /* 5     */
+                unsigned long d    :  1; /* 6     */
+                unsigned long pl   :  2; /* 7-8   */
+                unsigned long ar   :  3; /* 9-11  */
+                unsigned long ppn  : 38; /* 12-49 */
+                unsigned long      :  2; /* 50-51 */
+                unsigned long ed   :  1; /* 52    */
+        };
+};
+union ia64_pta {
+        unsigned long val;
+        struct {
+                unsigned long ve : 1;
+                unsigned long reserved0 : 1;
+                unsigned long size : 6;
+                unsigned long vf : 1;
+                unsigned long reserved1 : 6;
+                unsigned long base : 49;
+        };
+};
+struct thash_cb {
+        /* THASH base information */
+        struct thash_data       *hash; /* hash table pointer */
+        union ia64_pta          pta;
+        int           num;
+};
+struct kvm_vcpu_stat {
+};
+struct kvm_vcpu_arch {
+        int launched;
+        int last_exit;
+        int last_run_cpu;
+        int vmm_tr_slot;
+        int vm_tr_slot;
+#define KVM_MP_STATE_RUNNABLE          0
+#define KVM_MP_STATE_UNINITIALIZED     1
+#define KVM_MP_STATE_INIT_RECEIVED     2
+#define KVM_MP_STATE_HALTED            3
+        int mp_state;
+#define MAX_PTC_G_NUM                   3
+        int ptc_g_count;
+        struct kvm_ptc_g ptc_g_data[MAX_PTC_G_NUM];
+        /*halt timer to wake up sleepy vcpus*/
+        struct hrtimer hlt_timer;
+        long ht_active;
+        struct kvm_lapic *apic;    /* kernel irqchip context */
+        struct vpd *vpd;
+        /* Exit data for vmm_transition*/
+        struct exit_ctl_data exit_data;
+        cpumask_t cache_coherent_map;
+        unsigned long vmm_rr;
+        unsigned long host_rr6;
+        unsigned long psbits[8];
+        unsigned long cr_iipa;
+        unsigned long cr_isr;
+        unsigned long vsa_base;
+        unsigned long dirty_log_lock_pa;
+        unsigned long __gp;
+        /* TR and TC.  */
+        struct thash_data itrs[NITRS];
+        struct thash_data dtrs[NDTRS];
+        /* Bit is set if there is a tr/tc for the region.  */
+        unsigned char itr_regions;
+        unsigned char dtr_regions;
+        unsigned char tc_regions;
+        /* purge all */
+        unsigned long ptce_base;
+        unsigned long ptce_count[2];
+        unsigned long ptce_stride[2];
+        /* itc/itm */
+        unsigned long last_itc;
+        long itc_offset;
+        unsigned long itc_check;
+        unsigned long timer_check;
+        unsigned long timer_pending;
+        unsigned long vrr[8];
+        unsigned long ibr[8];
+        unsigned long dbr[8];
+        unsigned long insvc[4];         /* Interrupt in service.  */
+        unsigned long xtp;
+        unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */
+        unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */
+        unsigned long metaphysical_saved_rr0; /* from kvm_arch          */
+        unsigned long metaphysical_saved_rr4; /* from kvm_arch          */
+        unsigned long fp_psr;       /*used for lazy float register */
+        unsigned long saved_gp;
+        /* for physical emulation */
+        int mode_flags;
+        struct thash_cb vtlb;
+        struct thash_cb vhpt;
+        char irq_check;
+        char irq_new_pending;
+        unsigned long opcode;
+        unsigned long cause;
+        union context host;
+        union context guest;
+};
+struct kvm_vm_stat {
+        u64 remote_tlb_flush;
+};
+struct kvm_sal_data {
+        unsigned long boot_ip;
+        unsigned long boot_gp;
+};
+struct kvm_arch {
+        unsigned long   vm_base;
+        unsigned long   metaphysical_rr0;
+        unsigned long   metaphysical_rr4;
+        unsigned long   vmm_init_rr;
+        unsigned long   vhpt_base;
+        unsigned long   vtlb_base;
+        unsigned long   vpd_base;
+        spinlock_t dirty_log_lock;
+        struct kvm_ioapic *vioapic;
+        struct kvm_vm_stat stat;
+        struct kvm_sal_data rdv_sal_data;
+};
+union cpuid3_t {
+        u64 value;
+        struct {
+                u64 number : 8;
+                u64 revision : 8;
+                u64 model : 8;
+                u64 family : 8;
+                u64 archrev : 8;
+                u64 rv : 24;
+        };
+};
+struct kvm_pt_regs {
+        /* The following registers are saved by SAVE_MIN: */
+        unsigned long b6;  /* scratch */
+        unsigned long b7;  /* scratch */
+        unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
+        unsigned long ar_ssd; /* reserved for future use (scratch) */
+        unsigned long r8;  /* scratch (return value register 0) */
+        unsigned long r9;  /* scratch (return value register 1) */
+        unsigned long r10; /* scratch (return value register 2) */
+        unsigned long r11; /* scratch (return value register 3) */
+        unsigned long cr_ipsr; /* interrupted task's psr */
+        unsigned long cr_iip;  /* interrupted task's instruction pointer */
+        unsigned long cr_ifs;  /* interrupted task's function state */
+        unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
+        unsigned long ar_pfs;  /* prev function state  */
+        unsigned long ar_rsc;  /* RSE configuration */
+        /* The following two are valid only if cr_ipsr.cpl > 0: */
+        unsigned long ar_rnat;  /* RSE NaT */
+        unsigned long ar_bspstore; /* RSE bspstore */
+        unsigned long pr;  /* 64 predicate registers (1 bit each) */
+        unsigned long b0;  /* return pointer (bp) */
+        unsigned long loadrs;  /* size of dirty partition << 16 */
+        unsigned long r1;  /* the gp pointer */
+        unsigned long r12; /* interrupted task's memory stack pointer */
+        unsigned long r13; /* thread pointer */
+        unsigned long ar_fpsr;  /* floating point status (preserved) */
+        unsigned long r15;  /* scratch */
+        /* The remaining registers are NOT saved for system calls.  */
+        unsigned long r14;  /* scratch */
+        unsigned long r2;  /* scratch */
+        unsigned long r3;  /* scratch */
+        unsigned long r16;  /* scratch */
+        unsigned long r17;  /* scratch */
+        unsigned long r18;  /* scratch */
+        unsigned long r19;  /* scratch */
+        unsigned long r20;  /* scratch */
+        unsigned long r21;  /* scratch */
+        unsigned long r22;  /* scratch */
+        unsigned long r23;  /* scratch */
+        unsigned long r24;  /* scratch */
+        unsigned long r25;  /* scratch */
+        unsigned long r26;  /* scratch */
+        unsigned long r27;  /* scratch */
+        unsigned long r28;  /* scratch */
+        unsigned long r29;  /* scratch */
+        unsigned long r30;  /* scratch */
+        unsigned long r31;  /* scratch */
+        unsigned long ar_ccv;  /* compare/exchange value (scratch) */
+        /*
+         * Floating point registers that the kernel considers scratch:
+         */
+        struct ia64_fpreg f6;  /* scratch */
+        struct ia64_fpreg f7;  /* scratch */
+        struct ia64_fpreg f8;  /* scratch */
+        struct ia64_fpreg f9;  /* scratch */
+        struct ia64_fpreg f10;  /* scratch */
+        struct ia64_fpreg f11;  /* scratch */
+        unsigned long r4;  /* preserved */
+        unsigned long r5;  /* preserved */
+        unsigned long r6;  /* preserved */
+        unsigned long r7;  /* preserved */
+        unsigned long eml_unat;    /* used for emulating instruction */
+        unsigned long pad0;     /* alignment pad */
+};
+/*
+ * Return the struct kvm_pt_regs whose end lies exactly IA64_STK_OFFSET bytes
+ * past the start of the vcpu structure, i.e. the last kvm_pt_regs-sized slot
+ * below that address.
+ */
+static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
+{
+        unsigned long stack_end = (unsigned long) v + IA64_STK_OFFSET;
+        struct kvm_pt_regs *past_last = (struct kvm_pt_regs *) stack_end;
+        return past_last - 1;
+}
+typedef int kvm_vmm_entry(void);
+typedef void kvm_tramp_entry(union context *host, union context *guest);
+struct kvm_vmm_info{
+        struct module   *module;
+        kvm_vmm_entry   *vmm_entry;
+        kvm_tramp_entry *tramp_entry;
+        unsigned long   vmm_ivt;
+};
+int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+void kvm_sal_emul(struct kvm_vcpu *vcpu);
+#endif
diff --git a/include/asm-ia64/kvm_para.h b/include/asm-ia64/kvm_para.h
new file mode 100644
index 00000000000..9f9796bb344
--- /dev/null
+++ b/include/asm-ia64/kvm_para.h
@@ -0,0 +1,29 @@
+#ifndef __IA64_KVM_PARA_H
+#define __IA64_KVM_PARA_H
+/*
+ * asm-ia64/kvm_para.h
+ *
+ * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+/* Report the arch paravirtual feature word; this implementation always yields 0. */
+static inline unsigned int kvm_arch_para_features(void)
+{
+        const unsigned int no_features = 0;
+        return no_features;
+}
+#endif
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 741f7ecb986..6aff126fc07 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -119,6 +119,69 @@ struct ia64_psr {
        __u64 reserved4 : 19;
 };
+/*
+ * Bit-field view of a 64-bit ISR value; val aliases the same 64 bits.
+ * Bit ranges below assume fields are allocated from bit 0 upward, the same
+ * convention the annotations on union ia64_itir use.  The widths sum to 64.
+ */
+union ia64_isr {
+        __u64  val;
+        struct {
+                __u64 code : 16;        /* 0-15  */
+                __u64 vector : 8;       /* 16-23 */
+                __u64 reserved1 : 8;    /* 24-31 */
+                __u64 x : 1;            /* 32    */
+                __u64 w : 1;            /* 33    */
+                __u64 r : 1;            /* 34    */
+                __u64 na : 1;           /* 35    */
+                __u64 sp : 1;           /* 36    */
+                __u64 rs : 1;           /* 37    */
+                __u64 ir : 1;           /* 38    */
+                __u64 ni : 1;           /* 39    */
+                __u64 so : 1;           /* 40    */
+                __u64 ei : 2;           /* 41-42 */
+                __u64 ed : 1;           /* 43    */
+                __u64 reserved2 : 20;   /* 44-63 */
+        };
+};
+/*
+ * Bit-field view of a 64-bit LID value; val aliases the same 64 bits.
+ * Bit ranges assume fields are allocated from bit 0 upward, the same
+ * convention the annotations on union ia64_itir use.
+ */
+union ia64_lid {
+        __u64 val;
+        struct {
+                __u64  rv  : 16;        /* 0-15  */
+                __u64  eid : 8;         /* 16-23 */
+                __u64  id  : 8;         /* 24-31 */
+                __u64  ig  : 32;        /* 32-63 */
+        };
+};
+/*
+ * Bit-field view of a 64-bit TPR value; val aliases the same 64 bits.
+ * Bit ranges assume fields are allocated from bit 0 upward, the same
+ * convention the annotations on union ia64_itir use.
+ */
+union ia64_tpr {
+        __u64 val;
+        struct {
+                __u64 ig0 : 4;          /* 0-3   */
+                __u64 mic : 4;          /* 4-7   */
+                __u64 rsv : 8;          /* 8-15  */
+                __u64 mmi : 1;          /* 16    */
+                __u64 ig1 : 47;         /* 17-63 */
+        };
+};
+union ia64_itir {
+        __u64 val;
+        struct {
+                __u64 rv3  :  2; /* 0-1 */
+                __u64 ps   :  6; /* 2-7 */
+                __u64 key  : 24; /* 8-31 */
+                __u64 rv4  : 32; /* 32-63 */
+        };
+};
+union  ia64_rr {
+        __u64 val;
+        struct {
+                __u64  ve       :  1;  /* enable hw walker */
+                __u64  reserved0:  1;  /* reserved */
+                __u64  ps       :  6;  /* log page size */
+                __u64  rid      : 24;  /* region id */
+                __u64  reserved1: 32;  /* reserved */
+        };
+};
 /*
 * CPU type, hardware bug flags, and per-CPU state.  Frequently used
 * state comes earlier:
diff --git a/include/asm-powerpc/kvm.h b/include/asm-powerpc/kvm.h
index d1b530fbf8d..f993e4198d5 100644
--- a/include/asm-powerpc/kvm.h
+++ b/include/asm-powerpc/kvm.h
@@ -1,6 +1,55 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
 #ifndef __LINUX_KVM_POWERPC_H
 #define __LINUX_KVM_POWERPC_H
-/* powerpc does not support KVM */
+#include <asm/types.h>
+struct kvm_regs {
+        __u64 pc;
+        __u64 cr;
+        __u64 ctr;
+        __u64 lr;
+        __u64 xer;
+        __u64 msr;
+        __u64 srr0;
+        __u64 srr1;
+        __u64 pid;
+        __u64 sprg0;
+        __u64 sprg1;
+        __u64 sprg2;
+        __u64 sprg3;
+        __u64 sprg4;
+        __u64 sprg5;
+        __u64 sprg6;
+        __u64 sprg7;
+        __u64 gpr[32];
+};
+struct kvm_sregs {
+};
+struct kvm_fpu {
+        __u64 fpr[32];
+};
-#endif
+#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/include/asm-powerpc/kvm_asm.h b/include/asm-powerpc/kvm_asm.h
new file mode 100644
index 00000000000..2197764796d
--- /dev/null
+++ b/include/asm-powerpc/kvm_asm.h
@@ -0,0 +1,55 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#ifndef __POWERPC_KVM_ASM_H__
+#define __POWERPC_KVM_ASM_H__
+/*
+ * IVPR must be 64KiB-aligned.
+ * VCPU_SIZE_LOG evaluates to 4 + 12 = 16, so VCPU_SIZE_BYTES is
+ * 1<<16 = 64KiB, in line with the 64K TLB page size selected by
+ * VCPU_TLB_PGSZ.
+ * NOTE(review): the 12 is presumably log2 of a 4KiB page, making
+ * VCPU_SIZE_ORDER a page-allocation order -- confirm against the allocator
+ * call that uses it.
+ */
+#define VCPU_SIZE_ORDER 4
+#define VCPU_SIZE_LOG   (VCPU_SIZE_ORDER + 12)
+#define VCPU_TLB_PGSZ   PPC44x_TLB_64K
+#define VCPU_SIZE_BYTES (1<<VCPU_SIZE_LOG)
+#define BOOKE_INTERRUPT_CRITICAL 0
+#define BOOKE_INTERRUPT_MACHINE_CHECK 1
+#define BOOKE_INTERRUPT_DATA_STORAGE 2
+#define BOOKE_INTERRUPT_INST_STORAGE 3
+#define BOOKE_INTERRUPT_EXTERNAL 4
+#define BOOKE_INTERRUPT_ALIGNMENT 5
+#define BOOKE_INTERRUPT_PROGRAM 6
+#define BOOKE_INTERRUPT_FP_UNAVAIL 7
+#define BOOKE_INTERRUPT_SYSCALL 8
+#define BOOKE_INTERRUPT_AP_UNAVAIL 9
+#define BOOKE_INTERRUPT_DECREMENTER 10
+#define BOOKE_INTERRUPT_FIT 11
+#define BOOKE_INTERRUPT_WATCHDOG 12
+#define BOOKE_INTERRUPT_DTLB_MISS 13
+#define BOOKE_INTERRUPT_ITLB_MISS 14
+#define BOOKE_INTERRUPT_DEBUG 15
+#define BOOKE_MAX_INTERRUPT 15
+#define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
+#define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
+#define RESUME_GUEST            0
+#define RESUME_GUEST_NV         RESUME_FLAG_NV
+#define RESUME_HOST             RESUME_FLAG_HOST
+#define RESUME_HOST_NV          (RESUME_FLAG_HOST|RESUME_FLAG_NV)
+#endif /* __POWERPC_KVM_ASM_H__ */
diff --git a/include/asm-powerpc/kvm_host.h b/include/asm-powerpc/kvm_host.h
new file mode 100644
index 00000000000..04ffbb8e0a3
--- /dev/null
+++ b/include/asm-powerpc/kvm_host.h
@@ -0,0 +1,152 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#ifndef __POWERPC_KVM_HOST_H__
+#define __POWERPC_KVM_HOST_H__
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/kvm_types.h>
+#include <asm/kvm_asm.h>
+#define KVM_MAX_VCPUS 1
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+/*
+ * We don't currently support large pages.
+ * The constant is built from an unsigned long: 1<<31 shifts into the sign
+ * bit of a 32-bit int, which is undefined behaviour and in practice yields a
+ * negative value that misbehaves when mixed with unsigned frame numbers.
+ */
+#define KVM_PAGES_PER_HPAGE (1UL<<31)
+struct kvm;
+struct kvm_run;
+struct kvm_vcpu;
+struct kvm_vm_stat {
+        u32 remote_tlb_flush;
+};
+struct kvm_vcpu_stat {
+        u32 sum_exits;
+        u32 mmio_exits;
+        u32 dcr_exits;
+        u32 signal_exits;
+        u32 light_exits;
+        /* Account for special types of light exits: */
+        u32 itlb_real_miss_exits;
+        u32 itlb_virt_miss_exits;
+        u32 dtlb_real_miss_exits;
+        u32 dtlb_virt_miss_exits;
+        u32 syscall_exits;
+        u32 isi_exits;
+        u32 dsi_exits;
+        u32 emulated_inst_exits;
+        u32 dec_exits;
+        u32 ext_intr_exits;
+};
+struct tlbe {
+        u32 tid; /* Only the low 8 bits are used. */
+        u32 word0;
+        u32 word1;
+        u32 word2;
+};
+struct kvm_arch {
+};
+struct kvm_vcpu_arch {
+        /* Unmodified copy of the guest's TLB. */
+        struct tlbe guest_tlb[PPC44x_TLB_SIZE];
+        /* TLB that's actually used when the guest is running. */
+        struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
+        /* Pages which are referenced in the shadow TLB. */
+        struct page *shadow_pages[PPC44x_TLB_SIZE];
+        /* Copy of the host's TLB. */
+        struct tlbe host_tlb[PPC44x_TLB_SIZE];
+        u32 host_stack;
+        u32 host_pid;
+        u64 fpr[32];
+        u32 gpr[32];
+        u32 pc;
+        u32 cr;
+        u32 ctr;
+        u32 lr;
+        u32 xer;
+        u32 msr;
+        u32 mmucr;
+        u32 sprg0;
+        u32 sprg1;
+        u32 sprg2;
+        u32 sprg3;
+        u32 sprg4;
+        u32 sprg5;
+        u32 sprg6;
+        u32 sprg7;
+        u32 srr0;
+        u32 srr1;
+        u32 csrr0;
+        u32 csrr1;
+        u32 dsrr0;
+        u32 dsrr1;
+        u32 dear;
+        u32 esr;
+        u32 dec;
+        u32 decar;
+        u32 tbl;
+        u32 tbu;
+        u32 tcr;
+        u32 tsr;
+        u32 ivor[16];
+        u32 ivpr;
+        u32 pir;
+        u32 pid;
+        u32 pvr;
+        u32 ccr0;
+        u32 ccr1;
+        u32 dbcr0;
+        u32 dbcr1;
+        u32 last_inst;
+        u32 fault_dear;
+        u32 fault_esr;
+        gpa_t paddr_accessed;
+        u8 io_gpr; /* GPR used as IO source/target */
+        u8 mmio_is_bigendian;
+        u8 dcr_needed;
+        u8 dcr_is_write;
+        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
+        struct timer_list dec_timer;
+        unsigned long pending_exceptions;
+};
+struct kvm_guest_debug {
+        int enabled;
+        unsigned long bp[4];
+        int singlestep;
+};
+#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/include/asm-powerpc/kvm_para.h b/include/asm-powerpc/kvm_para.h
new file mode 100644
index 00000000000..2d48f6a63d0
--- /dev/null
+++ b/include/asm-powerpc/kvm_para.h
@@ -0,0 +1,37 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#ifndef __POWERPC_KVM_PARA_H__
+#define __POWERPC_KVM_PARA_H__
+#ifdef __KERNEL__
+/* Always returns 0 in this implementation. */
+static inline int kvm_para_available(void)
+{
+        const int available = 0;
+        return available;
+}
+/* Report the arch paravirtual feature word; this implementation always yields 0. */
+static inline unsigned int kvm_arch_para_features(void)
+{
+        const unsigned int no_features = 0;
+        return no_features;
+}
+#endif /* __KERNEL__ */
+#endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/include/asm-powerpc/kvm_ppc.h b/include/asm-powerpc/kvm_ppc.h
new file mode 100644
index 00000000000..7ac820308a7
--- /dev/null
+++ b/include/asm-powerpc/kvm_ppc.h
@@ -0,0 +1,88 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+#ifndef __POWERPC_KVM_PPC_H__
+#define __POWERPC_KVM_PPC_H__
+/* This file exists just so we can dereference kvm_vcpu, avoiding nested header
+ * dependencies. */
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+struct kvm_tlb {
+        struct tlbe guest_tlb[PPC44x_TLB_SIZE];
+        struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
+};
+enum emulation_result {
+        EMULATE_DONE,         /* no further processing */
+        EMULATE_DO_MMIO,      /* kvm_run filled with MMIO request */
+        EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
+        EMULATE_FAIL,         /* can't emulate this instruction */
+};
+extern const unsigned char exception_priority[];
+extern const unsigned char priority_exception[];
+extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern char kvmppc_handlers_start[];
+extern unsigned long kvmppc_handler_len;
+extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
+extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                              unsigned int rt, unsigned int bytes,
+                              int is_bigendian);
+extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                               u32 val, unsigned int bytes, int is_bigendian);
+extern int kvmppc_emulate_instruction(struct kvm_run *run,
+                                      struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
+                           u64 asid, u32 flags);
+extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, u64 eaddr, u64 asid);
+extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
+extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
+/*
+ * Set, in vcpu->arch.pending_exceptions, the bit whose index
+ * exception_priority[] gives for this exception number.
+ */
+static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception)
+{
+        unsigned long *pending = &vcpu->arch.pending_exceptions;
+        unsigned int prio = exception_priority[exception];
+        set_bit(prio, pending);
+}
+/*
+ * Clear, in vcpu->arch.pending_exceptions, the bit whose index
+ * exception_priority[] gives for this exception number.
+ */
+static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
+{
+        unsigned long *pending = &vcpu->arch.pending_exceptions;
+        unsigned int prio = exception_priority[exception];
+        clear_bit(prio, pending);
+}
+/*
+ * Store new_msr as the vcpu's MSR.  If the MSR_PR bit differs between the
+ * old and new values, kvmppc_mmu_priv_switch() is called first with the new
+ * MSR_PR state.
+ */
+static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+        u32 changed = new_msr ^ vcpu->arch.msr;
+        if (changed & MSR_PR)
+                kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+        vcpu->arch.msr = new_msr;
+}
+#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/include/asm-powerpc/mmu-44x.h b/include/asm-powerpc/mmu-44x.h
index c8b02d97f75..a825524c981 100644
--- a/include/asm-powerpc/mmu-44x.h
+++ b/include/asm-powerpc/mmu-44x.h
@@ -53,6 +53,8 @@
 #ifndef __ASSEMBLY__
+extern unsigned int tlb_44x_hwater;
 typedef struct {
        unsigned long id;
        unsigned long vdso_base;
diff --git a/include/asm-s390/Kbuild b/include/asm-s390/Kbuild
index e92b429d2be..13c9805349f 100644
--- a/include/asm-s390/Kbuild
+++ b/include/asm-s390/Kbuild
@@ -7,6 +7,7 @@ header-y += tape390.h
 header-y += ucontext.h
 header-y += vtoc.h
 header-y += zcrypt.h
+header-y += kvm.h
 unifdef-y += cmb.h
 unifdef-y += debug.h
diff --git a/include/asm-s390/kvm.h b/include/asm-s390/kvm.h
index 573f2a35138..d74002f9579 100644
--- a/include/asm-s390/kvm.h
+++ b/include/asm-s390/kvm.h
@@ -1,6 +1,45 @@
 #ifndef __LINUX_KVM_S390_H
 #define __LINUX_KVM_S390_H
-/* s390 does not support KVM */
+/*
+ * asm-s390/kvm.h - KVM s390 specific structures and definitions
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ *               Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#include <asm/types.h>
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP
+ *
+ * Placeholder type: the structure has no members on s390.
+ * (A struct without members is a GNU C extension.)
+ */
+struct kvm_pic_state {
+        /* no PIC for s390 */
+};
+struct kvm_ioapic_state {      /* placeholder type without members (GNU C) */
+        /* no IOAPIC for s390 */
+};
+/* for KVM_GET_REGS and KVM_SET_REGS
+ *
+ * Sixteen 64-bit values, 128 bytes in total.
+ */
+struct kvm_regs {
+        /* general purpose regs for s390 */
+        __u64 gprs[16];
+};
+/* for KVM_GET_SREGS and KVM_SET_SREGS
+ *
+ * NOTE(review): the member descriptions below are inferred from the names
+ * only; confirm against the s390 KVM implementation.
+ */
+struct kvm_sregs {
+        __u32 acrs[16];         /* presumably the access registers */
+        __u64 crs[16];          /* presumably the control registers */
+};
+/* for KVM_GET_FPU and KVM_SET_FPU
+ *
+ * The struct is not packed: if __u64 is 8-byte aligned on the target ABI,
+ * the compiler inserts 4 bytes of padding between fpc and fprs[]
+ * (TODO confirm for the ABIs this header is built for).
+ */
+struct kvm_fpu {
+        __u32 fpc;              /* presumably the floating point control register */
+        __u64 fprs[16];         /* presumably the floating point registers */
+};
 #endif
diff --git a/include/asm-s390/kvm_host.h b/include/asm-s390/kvm_host.h
new file mode 100644
index 00000000000..f8204a4f2e0
--- /dev/null
+++ b/include/asm-s390/kvm_host.h
@@ -0,0 +1,234 @@
+/*
+ * asm-s390/kvm_host.h - definition for kernel virtual machines on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Carsten Otte <cotte@de.ibm.com>
+ */
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
+#include <linux/kvm_host.h>
+#include <asm/debug.h>
+#define KVM_MAX_VCPUS 64
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+struct kvm_guest_debug {       /* no members on s390 (empty struct is a GNU C extension) */
+};
+/*
+ * One per-cpu entry of struct sca_block (which embeds 64 of them).
+ *
+ * The structure is packed, so the member sizes alone define the layout:
+ * scn (4 bytes) and the 4-byte pad place sda at offset 8, and the whole
+ * entry is 32 bytes.  The pad must therefore be __u32; a __u64 pad would
+ * move sda to offset 12 and grow the entry to 36 bytes, shifting every
+ * following entry in the cpu[] array.
+ */
+struct sca_entry {
+        atomic_t scn;                   /* 0x00 */
+        __u32   reserved;               /* 0x04 */
+        __u64   sda;                    /* 0x08 */
+        __u64   reserved2[2];           /* 0x10 */
+} __attribute__((packed));
+struct sca_block {                     /* packed; byte offsets in the comments */
+        __u64   ipte_control;           /* 0x00 */
+        __u64   reserved[5];            /* 0x08 */
+        __u64   mcn;                    /* 0x30 */
+        __u64   reserved2;              /* 0x38 */
+        struct sca_entry cpu[64];       /* 0x40, array of 64 entries */
+} __attribute__((packed));
+#define KVM_PAGES_PER_HPAGE 256
+#define CPUSTAT_HOST       0x80000000
+#define CPUSTAT_WAIT       0x10000000
+#define CPUSTAT_ECALL_PEND 0x08000000
+#define CPUSTAT_STOP_INT   0x04000000
+#define CPUSTAT_IO_INT     0x02000000
+#define CPUSTAT_EXT_INT    0x01000000
+#define CPUSTAT_RUNNING    0x00800000
+#define CPUSTAT_RETAINED   0x00400000
+#define CPUSTAT_TIMING_SUB 0x00020000
+#define CPUSTAT_SIE_SUB    0x00010000
+#define CPUSTAT_RRF        0x00008000
+#define CPUSTAT_SLSV       0x00004000
+#define CPUSTAT_SLSR       0x00002000
+#define CPUSTAT_ZARCH      0x00000800
+#define CPUSTAT_MCDS       0x00000100
+#define CPUSTAT_SM         0x00000080
+#define CPUSTAT_G          0x00000008
+#define CPUSTAT_J          0x00000002
+#define CPUSTAT_P          0x00000001
+struct sie_block {     /* packed; comments give byte offsets, total size 0x200; type of the first argument of sie64a() */
+        atomic_t cpuflags;              /* 0x0000 */
+        __u32   prefix;                 /* 0x0004 */
+        __u8    reserved8[32];          /* 0x0008 */
+        __u64   cputm;                  /* 0x0028 */
+        __u64   ckc;                    /* 0x0030 */
+        __u64   epoch;                  /* 0x0038 */
+        __u8    reserved40[4];          /* 0x0040 */
+#define LCTL_CR0        0x8000          /* NOTE(review): presumably a bit for the lctl member below - confirm */
+        __u16   lctl;                   /* 0x0044 */
+        __s16   icpua;                  /* 0x0046 */
+        __u32   ictl;                   /* 0x0048 */
+        __u32   eca;                    /* 0x004c */
+        __u8    icptcode;               /* 0x0050 */
+        __u8    reserved51;             /* 0x0051 */
+        __u16   ihcpu;                  /* 0x0052 */
+        __u8    reserved54[2];          /* 0x0054 */
+        __u16   ipa;                    /* 0x0056 */
+        __u32   ipb;                    /* 0x0058 */
+        __u32   scaoh;                  /* 0x005c */
+        __u8    reserved60;             /* 0x0060 */
+        __u8    ecb;                    /* 0x0061 */
+        __u8    reserved62[2];          /* 0x0062 */
+        __u32   scaol;                  /* 0x0064 */
+        __u8    reserved68[4];          /* 0x0068 */
+        __u32   todpr;                  /* 0x006c */
+        __u8    reserved70[16];         /* 0x0070 */
+        __u64   gmsor;                  /* 0x0080 */
+        __u64   gmslm;                  /* 0x0088 */
+        psw_t   gpsw;                   /* 0x0090 */
+        __u64   gg14;                   /* 0x00a0 */
+        __u64   gg15;                   /* 0x00a8 */
+        __u8    reservedb0[30];         /* 0x00b0 */
+        __u16   iprcc;                  /* 0x00ce */
+        __u8    reservedd0[48];         /* 0x00d0 */
+        __u64   gcr[16];                /* 0x0100 */
+        __u64   gbea;                   /* 0x0180 */
+        __u8    reserved188[120];       /* 0x0188 */
+} __attribute__((packed));
+struct kvm_vcpu_stat { /* all members are u32; NOTE(review): presumably per-vcpu event counters - confirm against the users */
+        u32 exit_userspace;
+        u32 exit_external_request;
+        u32 exit_external_interrupt;
+        u32 exit_stop_request;
+        u32 exit_validity;
+        u32 exit_instruction;
+        u32 instruction_lctl;
+        u32 instruction_lctg;
+        u32 exit_program_interruption;
+        u32 exit_instr_and_program;
+        u32 deliver_emergency_signal;
+        u32 deliver_service_signal;
+        u32 deliver_virtio_interrupt;
+        u32 deliver_stop_signal;
+        u32 deliver_prefix_signal;
+        u32 deliver_restart_signal;
+        u32 deliver_program_int;
+        u32 exit_wait_state;
+        u32 instruction_stidp;
+        u32 instruction_spx;
+        u32 instruction_stpx;
+        u32 instruction_stap;
+        u32 instruction_storage_key;
+        u32 instruction_stsch;
+        u32 instruction_chsc;
+        u32 instruction_stsi;
+        u32 instruction_stfl;
+        u32 instruction_sigp_sense;
+        u32 instruction_sigp_emergency;
+        u32 instruction_sigp_stop;
+        u32 instruction_sigp_arch;
+        u32 instruction_sigp_prefix;
+        u32 instruction_sigp_restart;
+        u32 diagnose_44;
+};
+struct io_info {       /* 'io' member of the union in struct interrupt_info; NOTE(review): the 0x0b8.. values are not offsets within this struct, presumably lowcore offsets - confirm */
+        __u16        subchannel_id;            /* 0x0b8 */
+        __u16        subchannel_nr;            /* 0x0ba */
+        __u32        io_int_parm;              /* 0x0bc */
+        __u32        io_int_word;              /* 0x0c0 */
+};
+struct ext_info {      /* 'ext' member of the union in struct interrupt_info */
+        __u32 ext_params;
+        __u64 ext_params2;
+};
+#define PGM_OPERATION            0x01
+#define PGM_PRIVILEGED_OPERATION 0x02
+#define PGM_EXECUTE              0x03
+#define PGM_PROTECTION           0x04
+#define PGM_ADDRESSING           0x05
+#define PGM_SPECIFICATION        0x06
+#define PGM_DATA                 0x07
+struct pgm_info {      /* 'pgm' member of the union in struct interrupt_info */
+        __u16 code;     /* NOTE(review): presumably one of the PGM_* values defined above - confirm */
+};
+struct prefix_info {   /* 'prefix' member of the union in struct interrupt_info */
+        __u32 address;
+};
+struct interrupt_info {        /* list node carrying one of four payload kinds */
+        struct list_head list;  /* linkage into a list of interrupt_info */
+        u64     type;           /* NOTE(review): presumably selects the valid union member - confirm */
+        union {                 /* anonymous: members are accessed directly, e.g. ->io */
+                struct io_info io;
+                struct ext_info ext;
+                struct pgm_info pgm;
+                struct prefix_info prefix;
+        };
+};
+/* for local_interrupt.action_bits */
+#define ACTION_STORE_ON_STOP 1
+#define ACTION_STOP_ON_STOP  2
+struct local_interrupt {       /* embedded in struct kvm_vcpu_arch as local_int */
+        spinlock_t lock;
+        struct list_head list;
+        atomic_t active;
+        struct float_interrupt *float_int;      /* pointer to a struct float_interrupt (one is embedded in struct kvm_arch) */
+        int timer_due; /* event indicator for waitqueue below */
+        wait_queue_head_t wq;
+        atomic_t *cpuflags;     /* NOTE(review): presumably points at sie_block.cpuflags - confirm */
+        unsigned int action_bits;       /* NOTE(review): presumably a mask of the ACTION_* values above - confirm */
+};
+/*
+ * Interrupt state that is not bound to a single vcpu; embedded in
+ * struct kvm_arch and referenced from every struct local_interrupt.
+ *
+ * idle_mask is sized as a bitmap with one bit per vcpu.  The word count
+ * is therefore rounded up in units of bits per long (8 * sizeof(long)),
+ * not in bytes per long, which would reserve eight times more than needed.
+ * Both arrays use KVM_MAX_VCPUS instead of a bare 64.
+ */
+struct float_interrupt {
+        spinlock_t lock;
+        struct list_head list;
+        atomic_t active;
+        int next_rr_cpu;
+        unsigned long idle_mask[(KVM_MAX_VCPUS + 8 * sizeof(long) - 1)
+                                / (8 * sizeof(long))];
+        struct local_interrupt *local_int[KVM_MAX_VCPUS];
+};
+struct kvm_vcpu_arch { /* s390 part of the per-vcpu state */
+        struct sie_block *sie_block;    /* same type as the first argument of sie64a() */
+        unsigned long     guest_gprs[16];
+        s390_fp_regs      host_fpregs;  /* NOTE(review): host_*/guest_* pairs are presumably swapped around guest execution - confirm */
+        unsigned int      host_acrs[NUM_ACRS];
+        s390_fp_regs      guest_fpregs;
+        unsigned int      guest_acrs[NUM_ACRS];
+        struct local_interrupt local_int;
+        struct timer_list ckc_timer;
+        union  {                        /* both members overlay the same storage */
+                cpuid_t   cpu_id;
+                u64       stidp_data;
+        };
+};
+struct kvm_vm_stat {   /* s390 has a single u32 member here */
+        u32 remote_tlb_flush;
+};
+struct kvm_arch{       /* s390 part of the per-VM state */
+        unsigned long guest_origin;
+        unsigned long guest_memsize;
+        struct sca_block *sca;  /* pointer to the struct sca_block defined above */
+        debug_info_t *dbf;      /* NOTE(review): presumably a handle from <asm/debug.h>, which this header includes - confirm */
+        struct float_interrupt float_int;       /* embedded, not a pointer */
+};
+extern int sie64a(struct sie_block *, __u64 *);
+#endif
diff --git a/include/asm-s390/kvm_para.h b/include/asm-s390/kvm_para.h
new file mode 100644
index 00000000000..2c503796b61
--- /dev/null
+++ b/include/asm-s390/kvm_para.h
@@ -0,0 +1,150 @@
+/*
+ * asm-s390/kvm_para.h - definition for paravirtual devices on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#ifndef __S390_KVM_PARA_H
+#define __S390_KVM_PARA_H
+/*
+ * Hypercalls for KVM on s390. The calling convention is similar to the
+ * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1
+ * as hypercall number and R7 as parameter 6. The return value is
+ * written to R2. We use the diagnose instruction as hypercall. To avoid
+ * conflicts with existing diagnoses for LPAR and z/VM, we do not use
+ * the instruction encoded number, but specify the number in R1 and
+ * use 0x500 as KVM hypercall
+ *
+ * Copyright IBM Corp. 2007,2008
+ * Author(s): Christian Borntraeger <borntraeger@de.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+static inline long kvm_hypercall0(unsigned long nr)     /* hypercall without parameters */
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register long __rc asm("2");                    /* R2: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* diagnose with code 0x500 */
+                      : "=d" (__rc) : "d" (__nr): "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)   /* hypercall with one parameter */
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1) : "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
+                               unsigned long p2)
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register unsigned long __p2 asm("3") = p2;      /* R3: parameter 2 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2)
+                      : "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
+                               unsigned long p2, unsigned long p3)
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register unsigned long __p2 asm("3") = p2;      /* R3: parameter 2 */
+        register unsigned long __p3 asm("4") = p3;      /* R4: parameter 3 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
+                        "d" (__p3) : "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
+                               unsigned long p2, unsigned long p3,
+                               unsigned long p4)
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register unsigned long __p2 asm("3") = p2;      /* R3: parameter 2 */
+        register unsigned long __p3 asm("4") = p3;      /* R4: parameter 3 */
+        register unsigned long __p4 asm("5") = p4;      /* R5: parameter 4 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
+                        "d" (__p3), "d" (__p4) : "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
+                               unsigned long p2, unsigned long p3,
+                               unsigned long p4, unsigned long p5)
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register unsigned long __p2 asm("3") = p2;      /* R3: parameter 2 */
+        register unsigned long __p3 asm("4") = p3;      /* R4: parameter 3 */
+        register unsigned long __p4 asm("5") = p4;      /* R5: parameter 4 */
+        register unsigned long __p5 asm("6") = p5;      /* R6: parameter 5 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
+                        "d" (__p3), "d" (__p4), "d" (__p5)  : "memory", "cc");
+        return __rc;
+}
+static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
+                               unsigned long p2, unsigned long p3,
+                               unsigned long p4, unsigned long p5,
+                               unsigned long p6)
+{
+        register unsigned long __nr asm("1") = nr;      /* R1: hypercall number */
+        register unsigned long __p1 asm("2") = p1;      /* R2: parameter 1 */
+        register unsigned long __p2 asm("3") = p2;      /* R3: parameter 2 */
+        register unsigned long __p3 asm("4") = p3;      /* R4: parameter 3 */
+        register unsigned long __p4 asm("5") = p4;      /* R5: parameter 4 */
+        register unsigned long __p5 asm("6") = p5;      /* R6: parameter 5 */
+        register unsigned long __p6 asm("7") = p6;      /* R7: parameter 6 */
+        register long __rc asm("2");                    /* R2 again: return value */
+        asm volatile ("diag 2,4,0x500\n"                /* "0" ties __p1 to output operand 0 (__rc) */
+                      : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
+                        "d" (__p3), "d" (__p4), "d" (__p5), "d" (__p6)
+                      : "memory", "cc");
+        return __rc;
+}
+/*
+ * Report whether paravirtualization is available.  On s390 this is
+ * unconditional, so the answer is always 1.
+ */
+static inline int kvm_para_available(void)
+{
+        return 1;
+}
+/*
+ * Return the paravirtual feature bits.  None are assigned for kvm on s390
+ * at present, hence the constant 0.
+ */
+static inline unsigned int kvm_arch_para_features(void)
+{
+        return 0;
+}
+#endif /* __S390_KVM_PARA_H */
diff --git a/include/asm-s390/kvm_virtio.h b/include/asm-s390/kvm_virtio.h
new file mode 100644
index 00000000000..5c871a990c2
--- /dev/null
+++ b/include/asm-s390/kvm_virtio.h
@@ -0,0 +1,53 @@
+/*
+ * kvm_virtio.h - definition for virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Christian Borntraeger <borntraeger@de.ibm.com>
+ */
+#ifndef __KVM_S390_VIRTIO_H
+#define __KVM_S390_VIRTIO_H
+#include <linux/types.h>
+struct kvm_device_desc {       /* five single-byte members, then the variable-length config[] at offset 5 */
+        /* The device type: console, network, disk etc.  Type 0 terminates. */
+        __u8 type;
+        /* The number of virtqueues (first in config array) */
+        __u8 num_vq;
+        /*
+         * The number of bytes of feature bits.  Multiply by 2: one for host
+         * features and one for guest acknowledgements.
+         */
+        __u8 feature_len;
+        /* The number of bytes of the config array after virtqueues. */
+        __u8 config_len;
+        /* A status byte, written by the Guest. */
+        __u8 status;
+        __u8 config[0];         /* zero-length array (GNU C): adds nothing to sizeof; contents are sized by num_vq, feature_len and config_len above */
+};
+/*
+ * This is how we expect the device configuration field for a virtqueue
+ * to be laid out in config space.
+ *
+ * NOTE(review): the struct is not packed, so sizeof() may include tail
+ * padding after 'num' (two 8-byte members followed by a 2-byte one);
+ * confirm this matches the layout the host writes.
+ */
+struct kvm_vqconfig {
+        /* The token returned with an interrupt. Set by the guest */
+        __u64 token;
+        /* The address of the virtio ring */
+        __u64 address;
+        /* The number of entries in the virtio_ring */
+        __u16 num;
+};
+#define KVM_S390_VIRTIO_NOTIFY          0
+#define KVM_S390_VIRTIO_RESET           1
+#define KVM_S390_VIRTIO_SET_STATUS      2
+#endif
diff --git a/include/asm-s390/lowcore.h b/include/asm-s390/lowcore.h
index 5de3efb3144..0bc51d52a89 100644
--- a/include/asm-s390/lowcore.h
+++ b/include/asm-s390/lowcore.h
@@ -381,27 +381,32 @@ struct _lowcore
        /* whether the kernel died with panic() or not */
        __u32        panic_magic;              /* 0xe00 */
-        __u8         pad13[0x1200-0xe04];      /* 0xe04 */
+        __u8         pad13[0x11b8-0xe04];      /* 0xe04 */
+        /* 64 bit extparam used for pfault, diag 250 etc  */
+        __u64        ext_params2;               /* 0x11B8 */
+        __u8         pad14[0x1200-0x11C0];      /* 0x11C0 */
        /* System info area */ 
        __u64        floating_pt_save_area[16]; /* 0x1200 */
        __u64        gpregs_save_area[16];      /* 0x1280 */
        __u32        st_status_fixed_logout[4]; /* 0x1300 */
-        __u8         pad14[0x1318-0x1310];      /* 0x1310 */
+        __u8         pad15[0x1318-0x1310];      /* 0x1310 */
        __u32        prefixreg_save_area;       /* 0x1318 */
        __u32        fpt_creg_save_area;        /* 0x131c */
-        __u8         pad15[0x1324-0x1320];      /* 0x1320 */
+        __u8         pad16[0x1324-0x1320];      /* 0x1320 */
        __u32        tod_progreg_save_area;     /* 0x1324 */
        __u32        cpu_timer_save_area[2];    /* 0x1328 */
        __u32        clock_comp_save_area[2];   /* 0x1330 */
-        __u8         pad16[0x1340-0x1338];      /* 0x1338 */ 
+        __u8         pad17[0x1340-0x1338];      /* 0x1338 */
        __u32        access_regs_save_area[16]; /* 0x1340 */ 
        __u64        cregs_save_area[16];       /* 0x1380 */
        /* align to the top of the prefix area */
-        __u8         pad17[0x2000-0x1400];      /* 0x1400 */
+        __u8         pad18[0x2000-0x1400];      /* 0x1400 */
 #endif /* !__s390x__ */
 } __attribute__((packed)); /* End structure*/
diff --git a/include/asm-s390/mmu.h b/include/asm-s390/mmu.h
index 1698e29c5b2..5dd5e7b3476 100644
--- a/include/asm-s390/mmu.h
+++ b/include/asm-s390/mmu.h
@@ -7,6 +7,7 @@ typedef struct {
        unsigned long asce_bits;
        unsigned long asce_limit;
        int noexec;
+        int pgstes;
 } mm_context_t;
 #endif
diff --git a/include/asm-s390/mmu_context.h b/include/asm-s390/mmu_context.h
index b5a34c6f91a..4c2fbf48c9c 100644
--- a/include/asm-s390/mmu_context.h
+++ b/include/asm-s390/mmu_context.h
@@ -20,7 +20,13 @@ static inline int init_new_context(struct task_struct *tsk,
 #ifdef CONFIG_64BIT
        mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #endif
-        mm->context.noexec = s390_noexec;
+        if (current->mm->context.pgstes) {
+                mm->context.noexec = 0;
+                mm->context.pgstes = 1;
+        } else {
+                mm->context.noexec = s390_noexec;
+                mm->context.pgstes = 0;
+        }
        mm->context.asce_limit = STACK_TOP_MAX;
        crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
        return 0;
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index 65154dc9a9e..4c0698c0dda 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -30,6 +30,7 @@
 */
 #ifndef __ASSEMBLY__
 #include <linux/mm_types.h>
+#include <asm/bitops.h>
 #include <asm/bug.h>
 #include <asm/processor.h>
@@ -258,6 +259,13 @@ extern char empty_zero_page[PAGE_SIZE];
 * swap pte is 1011 and 0001, 0011, 0101, 0111 are invalid.
 */
+/* Page status table bits for virtualization */
+#define RCP_PCL_BIT     55
+#define RCP_HR_BIT      54
+#define RCP_HC_BIT      53
+#define RCP_GR_BIT      50
+#define RCP_GC_BIT      49
 #ifndef __s390x__
 /* Bits in the segment table address-space-control-element */
@@ -513,6 +521,48 @@ static inline int pte_file(pte_t pte)
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(a,b)  (pte_val(a) == pte_val(b))
+/*
+ * Acquire the lock bit RCP_PCL_BIT in the word located PTRS_PER_PTE
+ * entries behind @ptep.  Preemption is disabled first, then the bit is
+ * retried until test_and_set_bit() reports that it was clear before.
+ * Without CONFIG_PGSTE the function is empty.
+ */
+static inline void rcp_lock(pte_t *ptep)
+{
+#ifdef CONFIG_PGSTE
+        unsigned long *status = (unsigned long *) &ptep[PTRS_PER_PTE];
+        preempt_disable();
+        for (;;) {
+                if (!test_and_set_bit(RCP_PCL_BIT, status))
+                        break;
+        }
+#endif
+}
+/*
+ * Release the lock bit taken by rcp_lock(): clear RCP_PCL_BIT in the word
+ * located PTRS_PER_PTE entries behind @ptep, then re-enable preemption.
+ * Without CONFIG_PGSTE the function is empty.
+ */
+static inline void rcp_unlock(pte_t *ptep)
+{
+#ifdef CONFIG_PGSTE
+        clear_bit(RCP_PCL_BIT, (unsigned long *) &ptep[PTRS_PER_PTE]);
+        preempt_enable();
+#endif
+}
+/* forward declaration for SetPageUptodate in page-flags.h*/
+static inline void page_clear_dirty(struct page *page);
+#include <linux/page-flags.h>
+/*
+ * Exchange changed/referenced information between the storage key of the
+ * page mapped by @ptep and the status word located PTRS_PER_PTE entries
+ * behind @ptep:
+ *  - _PAGE_CHANGED / _PAGE_REFERENCED found in the storage key are recorded
+ *    as RCP_GC_BIT / RCP_GR_BIT in the status word;
+ *  - RCP_HC_BIT / RCP_HR_BIT are cleared in the status word and, if they
+ *    were set, turned into SetPageDirty() / SetPageReferenced().
+ * Without CONFIG_PGSTE the function is empty.
+ */
+static inline void ptep_rcp_copy(pte_t *ptep)
+{
+#ifdef CONFIG_PGSTE
+        unsigned long *status = (unsigned long *) &ptep[PTRS_PER_PTE];
+        struct page *pg = virt_to_page(pte_val(*ptep));
+        unsigned int key = page_get_storage_key(page_to_phys(pg));
+        /* storage key -> status word */
+        if (key & _PAGE_CHANGED)
+                set_bit_simple(RCP_GC_BIT, status);
+        if (key & _PAGE_REFERENCED)
+                set_bit_simple(RCP_GR_BIT, status);
+        /* status word -> struct page flags */
+        if (test_and_clear_bit_simple(RCP_HC_BIT, status))
+                SetPageDirty(pg);
+        if (test_and_clear_bit_simple(RCP_HR_BIT, status))
+                SetPageReferenced(pg);
+#endif
+}
 /*
 * query functions pte_write/pte_dirty/pte_young only work if
 * pte_present() is true. Undefined behaviour if not..
@@ -599,6 +649,8 @@ static inline void pmd_clear(pmd_t *pmd)
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
+        if (mm->context.pgstes)
+                ptep_rcp_copy(ptep);
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        if (mm->context.noexec)
                pte_val(ptep[PTRS_PER_PTE]) = _PAGE_TYPE_EMPTY;
@@ -667,6 +719,24 @@ static inline pte_t pte_mkyoung(pte_t pte)
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t *ptep)
 {
+#ifdef CONFIG_PGSTE
+        unsigned long physpage;
+        int young;
+        unsigned long *pgste;
+        if (!vma->vm_mm->context.pgstes)
+                return 0;
+        physpage = pte_val(*ptep) & PAGE_MASK;
+        pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
+        young = ((page_get_storage_key(physpage) & _PAGE_REFERENCED) != 0);
+        rcp_lock(ptep);
+        if (young)
+                set_bit_simple(RCP_GR_BIT, pgste);
+        young |= test_and_clear_bit_simple(RCP_HR_BIT, pgste);
+        rcp_unlock(ptep);
+        return young;
+#endif
        return 0;
 }
@@ -674,7 +744,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
 {
-        /* No need to flush TLB; bits are in storage key */
+        /* No need to flush the TLB:
+         * On s390 reference bits are in the storage key and never in the TLB.
+         * With virtualization we handle the reference bit; without it we
+         * can simply return. */
+#ifdef CONFIG_PGSTE
+        return ptep_test_and_clear_young(vma, address, ptep);
+#endif
        return 0;
 }
@@ -693,15 +769,25 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
                        : "=m" (*ptep) : "m" (*ptep),
                          "a" (pto), "a" (address));
        }
-        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
 }
 static inline void ptep_invalidate(struct mm_struct *mm,
                                   unsigned long address, pte_t *ptep)
 {
+        if (mm->context.pgstes) {
+                rcp_lock(ptep);
+                __ptep_ipte(address, ptep);
+                ptep_rcp_copy(ptep);
+                pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+                rcp_unlock(ptep);
+                return;
+        }
        __ptep_ipte(address, ptep);
-        if (mm->context.noexec)
+        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+        if (mm->context.noexec) {
                __ptep_ipte(address, ptep + PTRS_PER_PTE);
+                pte_val(*(ptep + PTRS_PER_PTE)) = _PAGE_TYPE_EMPTY;
+        }
 }
 /*
@@ -966,6 +1052,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 extern int add_shared_memory(unsigned long start, unsigned long size);
 extern int remove_shared_memory(unsigned long start, unsigned long size);
+extern int s390_enable_sie(void);
 /*
 * No page table caches to initialise
diff --git a/include/asm-s390/setup.h b/include/asm-s390/setup.h
index a76a6b8fd88..aaf4b518b94 100644
--- a/include/asm-s390/setup.h
+++ b/include/asm-s390/setup.h
@@ -62,6 +62,7 @@ extern unsigned long machine_flags;
 #define MACHINE_IS_VM           (machine_flags & 1)
 #define MACHINE_IS_P390         (machine_flags & 4)
 #define MACHINE_HAS_MVPG        (machine_flags & 16)
+#define MACHINE_IS_KVM          (machine_flags & 64)
 #define MACHINE_HAS_IDTE        (machine_flags & 128)
 #define MACHINE_HAS_DIAG9C      (machine_flags & 256)
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
index 7a71120426a..80eefef2cc7 100644
--- a/include/asm-x86/kvm.h
+++ b/include/asm-x86/kvm.h
@@ -188,4 +188,45 @@ struct kvm_cpuid2 {
        struct kvm_cpuid_entry2 entries[0];
 };
+/* for KVM_GET_PIT and KVM_SET_PIT
+ *
+ * State of one channel; struct kvm_pit_state carries three of these.
+ * The members before count_load_time add up to 16 bytes (4 + 2 + 10 * 1),
+ * so count_load_time sits at offset 16 with no padding hole in front.
+ */
+struct kvm_pit_channel_state {
+        __u32 count; /* can be 65536 */
+        __u16 latched_count;
+        __u8 count_latched;
+        __u8 status_latched;
+        __u8 status;
+        __u8 read_state;
+        __u8 write_state;
+        __u8 write_latch;
+        __u8 rw_mode;
+        __u8 mode;
+        __u8 bcd;
+        __u8 gate;
+        __s64 count_load_time;
+};
+struct kvm_pit_state {         /* the channel state above, three times */
+        struct kvm_pit_channel_state channels[3];
+};
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
 #endif
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 68ee390b284..9d963cd6533 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -20,6 +20,13 @@
 #include <asm/desc.h>
+#define KVM_MAX_VCPUS 16
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_PIO_PAGE_OFFSET 1
 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
 #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
 #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |    \
@@ -39,6 +46,13 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
 #define DE_VECTOR 0
 #define UD_VECTOR 6
 #define NM_VECTOR 7
@@ -48,6 +62,7 @@
 #define SS_VECTOR 12
 #define GP_VECTOR 13
 #define PF_VECTOR 14
+#define MC_VECTOR 18
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
@@ -58,7 +73,8 @@
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_NUM_MMU_PAGES 1024
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
@@ -106,6 +122,12 @@ enum {
 #define KVM_NR_MEM_OBJS 40
+struct kvm_guest_debug {       /* x86 guest debugging state */
+        int enabled;
+        unsigned long bp[4];    /* NOTE(review): presumably up to four breakpoint addresses - confirm */
+        int singlestep;
+};
 /*
 * We don't want allocation failures within the mmu code, so we preallocate
 * enough memory for a single page fault in a cache.
@@ -140,6 +162,7 @@ union kvm_mmu_page_role {
                unsigned pad_for_nice_hex_output:6;
                unsigned metaphysical:1;
                unsigned access:3;
+                unsigned invalid:1;
        };
 };
@@ -204,11 +227,6 @@ struct kvm_vcpu_arch {
        u64 shadow_efer;
        u64 apic_base;
        struct kvm_lapic *apic;    /* kernel irqchip context */
-#define VCPU_MP_STATE_RUNNABLE          0
-#define VCPU_MP_STATE_UNINITIALIZED     1
-#define VCPU_MP_STATE_INIT_RECEIVED     2
-#define VCPU_MP_STATE_SIPI_RECEIVED     3
-#define VCPU_MP_STATE_HALTED            4
        int mp_state;
        int sipi_vector;
        u64 ia32_misc_enable_msr;
@@ -226,8 +244,9 @@ struct kvm_vcpu_arch {
        u64  *last_pte_updated;
        struct {
-                gfn_t gfn;          /* presumed gfn during guest pte update */
+                gfn_t gfn;      /* presumed gfn during guest pte update */
-                struct page *page;  /* page corresponding to that gfn */
+                pfn_t pfn;      /* pfn corresponding to that gfn */
+                int largepage;
        } update_pte;
        struct i387_fxsave_struct host_fx_image;
@@ -261,6 +280,11 @@ struct kvm_vcpu_arch {
        /* emulate context */
        struct x86_emulate_ctxt emulate_ctxt;
+        gpa_t time;
+        struct kvm_vcpu_time_info hv_clock;
+        unsigned int time_offset;
+        struct page *time_page;
 };
 struct kvm_mem_alias {
@@ -283,10 +307,13 @@ struct kvm_arch{
        struct list_head active_mmu_pages;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
+        struct kvm_pit *vpit;
        int round_robin_prev_vcpu;
        unsigned int tss_addr;
        struct page *apic_access_page;
+        gpa_t wall_clock;
 };
 struct kvm_vm_stat {
@@ -298,6 +325,7 @@ struct kvm_vm_stat {
        u32 mmu_recycled;
        u32 mmu_cache_miss;
        u32 remote_tlb_flush;
+        u32 lpages;
 };
 struct kvm_vcpu_stat {
@@ -320,6 +348,7 @@ struct kvm_vcpu_stat {
        u32 fpu_reload;
        u32 insn_emulation;
        u32 insn_emulation_fail;
+        u32 hypercalls;
 };
 struct descriptor_table {
@@ -355,6 +384,7 @@ struct kvm_x86_ops {
        u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
        void (*get_segment)(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
+        int (*get_cpl)(struct kvm_vcpu *vcpu);
        void (*set_segment)(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
        void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -410,6 +440,15 @@ void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                          const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+                  gpa_t addr, unsigned long *ret);
+extern bool tdp_enabled;
 enum emulation_result {
        EMULATE_DONE,       /* no further processing */
        EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
@@ -429,6 +468,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
                     unsigned long *rflags);
+void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -448,12 +488,14 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
                    unsigned long value);
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-unsigned long get_cr8(struct kvm_vcpu *vcpu);
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
@@ -491,6 +533,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_enable_tdp(void);
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
@@ -600,6 +644,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 #define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
 #define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
 #define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVVPID           ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 #define MSR_IA32_TIME_STAMP_COUNTER             0x010
@@ -610,4 +655,30 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 #define RMODE_TSS_SIZE                                                  \
        (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+enum {
+        TASK_SWITCH_CALL = 0,
+        TASK_SWITCH_IRET = 1,
+        TASK_SWITCH_JMP = 2,
+        TASK_SWITCH_GATE = 3,
+};
+#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 5, d1, d2, d3, d4, d5)
+#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 4, d1, d2, d3, d4, 0)
+#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 3, d1, d2, d3, 0, 0)
+#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 2, d1, d2, 0, 0, 0)
+#define KVMTRACE_1D(evt, vcpu, d1, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 1, d1, 0, 0, 0, 0)
+#define KVMTRACE_0D(evt, vcpu, name) \
+        trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                                vcpu, 0, 0, 0, 0, 0, 0)
 #endif
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index c6f3fd8d8c5..50984594207 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -10,10 +10,65 @@
 * paravirtualization, the appropriate feature bit should be checked.
 */
 #define KVM_CPUID_FEATURES      0x40000001
+#define KVM_FEATURE_CLOCKSOURCE         0
+#define KVM_FEATURE_NOP_IO_DELAY        1
+#define KVM_FEATURE_MMU_OP              2
+#define MSR_KVM_WALL_CLOCK  0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MAX_MMU_OP_BATCH           32
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE            1
+#define KVM_MMU_OP_FLUSH_TLB            2
+#define KVM_MMU_OP_RELEASE_PT           3
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+        __u32 op;
+        __u32 pad;
+};
+struct kvm_mmu_op_write_pte {
+        struct kvm_mmu_op_header header;
+        __u64 pte_phys;
+        __u64 pte_val;
+};
+struct kvm_mmu_op_flush_tlb {
+        struct kvm_mmu_op_header header;
+};
+struct kvm_mmu_op_release_pt {
+        struct kvm_mmu_op_header header;
+        __u64 pt_phys;
+};
 #ifdef __KERNEL__
 #include <asm/processor.h>
+/* Xen binary-compatible interface. See the Xen headers for details. */
+struct kvm_vcpu_time_info {
+        uint32_t version;
+        uint32_t pad0;
+        uint64_t tsc_timestamp;
+        uint64_t system_time;
+        uint32_t tsc_to_system_mul;
+        int8_t   tsc_shift;
+        int8_t   pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+struct kvm_wall_clock {
+        uint32_t wc_version;
+        uint32_t wc_sec;
+        uint32_t wc_nsec;
+} __attribute__((__packed__));
+extern void kvmclock_init(void);
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
 * trap that we will then rewrite to the appropriate instruction.
 */
diff --git a/include/asm-x86/reboot.h b/include/asm-x86/reboot.h
index 6b5233b4f84..e63741f1939 100644
--- a/include/asm-x86/reboot.h
+++ b/include/asm-x86/reboot.h
@@ -15,5 +15,7 @@ struct machine_ops {
 extern struct machine_ops machine_ops;
 void machine_real_restart(unsigned char *code, int length);
+void native_machine_crash_shutdown(struct pt_regs *regs);
+void native_machine_shutdown(void);
 #endif  /* _ASM_REBOOT_H */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c1ec04fd000..a281afeddfb 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -8,11 +8,18 @@
 */
 #include <asm/types.h>
+#include <linux/compiler.h>
 #include <linux/ioctl.h>
 #include <asm/kvm.h>
 #define KVM_API_VERSION 12
+/* for KVM_TRACE_ENABLE */
+struct kvm_user_trace_setup {
+        __u32 buf_size; /* size of each per-cpu sub_buffer */
+        __u32 buf_nr; /* number of sub_buffers per cpu */
+};
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
        __u32 slot;
@@ -73,6 +80,9 @@ struct kvm_irqchip {
 #define KVM_EXIT_INTR             10
 #define KVM_EXIT_SET_TPR          11
 #define KVM_EXIT_TPR_ACCESS       12
+#define KVM_EXIT_S390_SIEIC       13
+#define KVM_EXIT_S390_RESET       14
+#define KVM_EXIT_DCR              15
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -137,6 +147,27 @@ struct kvm_run {
                        __u32 is_write;
                        __u32 pad;
                } tpr_access;
+                /* KVM_EXIT_S390_SIEIC */
+                struct {
+                        __u8 icptcode;
+                        __u64 mask; /* psw upper half */
+                        __u64 addr; /* psw lower half */
+                        __u16 ipa;
+                        __u32 ipb;
+                } s390_sieic;
+                /* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+                __u64 s390_reset_flags;
+                /* KVM_EXIT_DCR */
+                struct {
+                        __u32 dcrn;
+                        __u32 data;
+                        __u8  is_write;
+                } dcr;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -204,6 +235,74 @@ struct kvm_vapic_addr {
        __u64 vapic_addr;
 };
+/* for KVM_SET_MPSTATE */
+#define KVM_MP_STATE_RUNNABLE          0
+#define KVM_MP_STATE_UNINITIALIZED     1
+#define KVM_MP_STATE_INIT_RECEIVED     2
+#define KVM_MP_STATE_HALTED            3
+#define KVM_MP_STATE_SIPI_RECEIVED     4
+struct kvm_mp_state {
+        __u32 mp_state;
+};
+struct kvm_s390_psw {
+        __u64 mask;
+        __u64 addr;
+};
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP              0xfffe0000u
+#define KVM_S390_PROGRAM_INT            0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX        0xfffe0002u
+#define KVM_S390_RESTART                0xfffe0003u
+#define KVM_S390_INT_VIRTIO             0xffff2603u
+#define KVM_S390_INT_SERVICE            0xffff2401u
+#define KVM_S390_INT_EMERGENCY          0xffff1201u
+struct kvm_s390_interrupt {
+        __u32 type;
+        __u32 parm;
+        __u64 parm64;
+};
+#define KVM_TRC_SHIFT           16
+/*
+ * kvm trace categories
+ */
+#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1)) /* only 12 bits */
+/*
+ * kvm trace action
+ */
+#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
+#define KVM_TRC_HEAD_SIZE       12
+#define KVM_TRC_CYCLE_SIZE      8
+#define KVM_TRC_EXTRA_MAX       7
+/* This structure represents a single trace buffer record. */
+struct kvm_trace_rec {
+        __u32 event:28;
+        __u32 extra_u32:3;
+        __u32 cycle_in:1;
+        __u32 pid;
+        __u32 vcpu_id;
+        union {
+                struct {
+                        __u32 cycle_lo, cycle_hi;
+                        __u32 extra_u32[KVM_TRC_EXTRA_MAX];
+                } cycle;
+                struct {
+                        __u32 extra_u32[KVM_TRC_EXTRA_MAX];
+                } nocycle;
+        } u;
+};
 #define KVMIO 0xAE
 /*
@@ -212,6 +311,8 @@ struct kvm_vapic_addr {
 #define KVM_GET_API_VERSION       _IO(KVMIO,   0x00)
 #define KVM_CREATE_VM             _IO(KVMIO,   0x01) /* returns a VM fd */
 #define KVM_GET_MSR_INDEX_LIST    _IOWR(KVMIO, 0x02, struct kvm_msr_list)
+#define KVM_S390_ENABLE_SIE       _IO(KVMIO,   0x06)
 /*
 * Check if a kvm extension is available.  Argument is extension number,
 * return is 1 (yes) or 0 (no, sorry).
@@ -222,7 +323,12 @@ struct kvm_vapic_addr {
 */
 #define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
 #define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+/*
+ * ioctls for kvm trace
+ */
+#define KVM_TRACE_ENABLE          _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define KVM_TRACE_PAUSE           _IO(KVMIO,  0x07)
+#define KVM_TRACE_DISABLE         _IO(KVMIO,  0x08)
 /*
 * Extension capability list.
 */
@@ -233,6 +339,13 @@ struct kvm_vapic_addr {
 #define KVM_CAP_SET_TSS_ADDR 4
 #define KVM_CAP_VAPIC 6
 #define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
+#define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
+#define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
+#define KVM_CAP_PIT 11
+#define KVM_CAP_NOP_IO_DELAY 12
+#define KVM_CAP_PV_MMU 13
+#define KVM_CAP_MP_STATE 14
 /*
 * ioctls for VM fds
@@ -255,6 +368,9 @@ struct kvm_vapic_addr {
 #define KVM_IRQ_LINE              _IOW(KVMIO, 0x61, struct kvm_irq_level)
 #define KVM_GET_IRQCHIP           _IOWR(KVMIO, 0x62, struct kvm_irqchip)
 #define KVM_SET_IRQCHIP           _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT            _IO(KVMIO,  0x64)
+#define KVM_GET_PIT               _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT               _IOR(KVMIO,  0x66, struct kvm_pit_state)
 /*
 * ioctls for vcpu fds
@@ -281,5 +397,17 @@ struct kvm_vapic_addr {
 #define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO,  0x92, struct kvm_tpr_access_ctl)
 /* Available with KVM_CAP_VAPIC */
 #define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
+/* valid for virtual machine (for floating interrupt) _and_ vcpu */
+#define KVM_S390_INTERRUPT        _IOW(KVMIO,  0x94, struct kvm_s390_interrupt)
+/* store status for s390 */
+#define KVM_S390_STORE_STATUS_NOADDR    (-1ul)
+#define KVM_S390_STORE_STATUS_PREFIXED  (-2ul)
+#define KVM_S390_STORE_STATUS     _IOW(KVMIO,  0x95, unsigned long)
+/* initial ipl psw for s390 */
+#define KVM_S390_SET_INITIAL_PSW  _IOW(KVMIO,  0x96, struct kvm_s390_psw)
+/* initial reset for s390 */
+#define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
+#define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
+#define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 928b0d59e9b..398978972b7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/preempt.h>
+#include <linux/marker.h>
 #include <asm/signal.h>
 #include <linux/kvm.h>
@@ -24,29 +25,18 @@
 #include <asm/kvm_host.h>
-#define KVM_MAX_VCPUS 4
-#define KVM_MEMORY_SLOTS 8
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-#define KVM_PIO_PAGE_OFFSET 1
 /*
 * vcpu->requests bit members
 */
 #define KVM_REQ_TLB_FLUSH          0
 #define KVM_REQ_MIGRATE_TIMER      1
 #define KVM_REQ_REPORT_TPR_ACCESS  2
+#define KVM_REQ_MMU_RELOAD         3
+#define KVM_REQ_TRIPLE_FAULT       4
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-struct kvm_guest_debug {
-        int enabled;
-        unsigned long bp[4];
-        int singlestep;
-};
 /*
 * It would be nice to use something smarter than a linear search, TBD...
 * Thankfully we dont expect many devices to register (famous last words :),
@@ -67,7 +57,9 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 struct kvm_vcpu {
        struct kvm *kvm;
+#ifdef CONFIG_PREEMPT_NOTIFIERS
        struct preempt_notifier preempt_notifier;
+#endif
        int vcpu_id;
        struct mutex mutex;
        int   cpu;
@@ -100,6 +92,10 @@ struct kvm_memory_slot {
        unsigned long flags;
        unsigned long *rmap;
        unsigned long *dirty_bitmap;
+        struct {
+                unsigned long rmap_pde;
+                int write_count;
+        } *lpage_info;
        unsigned long userspace_addr;
        int user_alloc;
 };
@@ -114,11 +110,11 @@ struct kvm {
                                        KVM_PRIVATE_MEM_SLOTS];
        struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
        struct list_head vm_list;
-        struct file *filp;
        struct kvm_io_bus mmio_bus;
        struct kvm_io_bus pio_bus;
        struct kvm_vm_stat stat;
        struct kvm_arch arch;
+        atomic_t users_count;
 };
 /* The guest did something we don't support. */
@@ -145,14 +141,19 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
                  struct module *module);
 void kvm_exit(void);
+void kvm_get_kvm(struct kvm *kvm);
+void kvm_put_kvm(struct kvm *kvm);
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
 extern struct page *bad_page;
+extern pfn_t bad_pfn;
 int is_error_page(struct page *page);
+int is_error_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem,
@@ -166,8 +167,19 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                                int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
+void kvm_set_page_dirty(struct page *page);
+void kvm_set_page_accessed(struct page *page);
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+void kvm_release_pfn_dirty(pfn_t);
+void kvm_release_pfn_clean(pfn_t pfn);
+void kvm_set_pfn_dirty(pfn_t pfn);
+void kvm_set_pfn_accessed(pfn_t pfn);
+void kvm_get_pfn(pfn_t pfn);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                        int len);
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
@@ -188,6 +200,7 @@ void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_reload_remote_mmus(struct kvm *kvm);
 long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
@@ -223,6 +236,10 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs);
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state);
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state);
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
                                    struct kvm_debug_guest *dbg);
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
@@ -255,6 +272,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 static inline void kvm_guest_enter(void)
@@ -296,5 +314,18 @@ struct kvm_stats_debugfs_item {
        struct dentry *dentry;
 };
 extern struct kvm_stats_debugfs_item debugfs_entries[];
+extern struct dentry *kvm_debugfs_dir;
+#ifdef CONFIG_KVM_TRACE
+int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
+void kvm_trace_cleanup(void);
+#else
+static inline
+int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
+{
+        return -EINVAL;
+}
+#define kvm_trace_cleanup() ((void)0)
+#endif
 #endif
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 5497aac0d2f..3ddce03766c 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -11,8 +11,11 @@
 /* Return values for hypercalls */
 #define KVM_ENOSYS              1000
+#define KVM_EFAULT              EFAULT
+#define KVM_E2BIG               E2BIG
-#define KVM_HC_VAPIC_POLL_IRQ            1
+#define KVM_HC_VAPIC_POLL_IRQ           1
+#define KVM_HC_MMU_OP                   2
 /*
 * hypercalls use architecture specific
@@ -20,6 +23,12 @@
 #include <asm/kvm_para.h>
 #ifdef __KERNEL__
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#endif
 static inline int kvm_para_has_feature(unsigned int feature)
 {
        if (kvm_arch_para_features() & (1UL << feature))
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 1c4e46decb2..9b6f395c962 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -38,6 +38,8 @@ typedef unsigned long  hva_t;
 typedef u64            hpa_t;
 typedef unsigned long  hfn_t;
+typedef hfn_t pfn_t;
 struct kvm_pio_request {
        unsigned long count;
        int cur_count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0bd97044ab..9a4f3e63e3b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1798,6 +1798,8 @@ extern void mmput(struct mm_struct *);
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
+/* Allocate a new mm structure and copy contents from tsk->mm */
+extern struct mm_struct *dup_mm(struct task_struct *tsk);
 extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
 extern void flush_thread(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index cb46befdd3a..c674aa8d3c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -521,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+struct mm_struct *dup_mm(struct task_struct *tsk)
 {
        struct mm_struct *mm, *oldmm = current->mm;
        int err;
diff --git a/mm/rmap.c b/mm/rmap.c
index 997f06907b6..e9bb6b1093f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked,
 {
        int referenced = 0;
-        if (page_test_and_clear_young(page))
-                referenced++;
        if (TestClearPageReferenced(page))
                referenced++;
@@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked,
                        unlock_page(page);
                }
        }
+        if (page_test_and_clear_young(page))
+                referenced++;
        return referenced;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b2e12893e3f..c82cf15730a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -40,6 +40,7 @@
 #include <linux/kvm_para.h>
 #include <linux/pagemap.h>
 #include <linux/mman.h>
+#include <linux/swap.h>
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -59,7 +60,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 static __read_mostly struct preempt_ops kvm_preempt_ops;
-static struct dentry *debugfs_dir;
+struct dentry *kvm_debugfs_dir;
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
@@ -119,6 +120,29 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
        smp_call_function_mask(cpus, ack_flush, NULL, 1);
 }
+void kvm_reload_remote_mmus(struct kvm *kvm)
+{
+        int i, cpu;
+        cpumask_t cpus;
+        struct kvm_vcpu *vcpu;
+        cpus_clear(cpus);
+        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                vcpu = kvm->vcpus[i];
+                if (!vcpu)
+                        continue;
+                if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+                        continue;
+                cpu = vcpu->cpu;
+                if (cpu != -1 && cpu != raw_smp_processor_id())
+                        cpu_set(cpu, cpus);
+        }
+        if (cpus_empty(cpus))
+                return;
+        smp_call_function_mask(cpus, ack_flush, NULL, 1);
+}
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
        struct page *page;
@@ -170,6 +194,7 @@ static struct kvm *kvm_create_vm(void)
        mutex_init(&kvm->lock);
        kvm_io_bus_init(&kvm->mmio_bus);
        init_rwsem(&kvm->slots_lock);
+        atomic_set(&kvm->users_count, 1);
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
@@ -189,9 +214,13 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                vfree(free->dirty_bitmap);
+        if (!dont || free->lpage_info != dont->lpage_info)
+                vfree(free->lpage_info);
        free->npages = 0;
        free->dirty_bitmap = NULL;
        free->rmap = NULL;
+        free->lpage_info = NULL;
 }
 void kvm_free_physmem(struct kvm *kvm)
@@ -215,11 +244,25 @@ static void kvm_destroy_vm(struct kvm *kvm)
        mmdrop(mm);
 }
+void kvm_get_kvm(struct kvm *kvm)
+{
+        atomic_inc(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm);
+void kvm_put_kvm(struct kvm *kvm)
+{
+        if (atomic_dec_and_test(&kvm->users_count))
+                kvm_destroy_vm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm);
 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
        struct kvm *kvm = filp->private_data;
-        kvm_destroy_vm(kvm);
+        kvm_put_kvm(kvm);
        return 0;
 }
@@ -301,6 +344,25 @@ int __kvm_set_memory_region(struct kvm *kvm,
                new.user_alloc = user_alloc;
                new.userspace_addr = mem->userspace_addr;
        }
+        if (npages && !new.lpage_info) {
+                int largepages = npages / KVM_PAGES_PER_HPAGE;
+                if (npages % KVM_PAGES_PER_HPAGE)
+                        largepages++;
+                if (base_gfn % KVM_PAGES_PER_HPAGE)
+                        largepages++;
+                new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+                if (!new.lpage_info)
+                        goto out_free;
+                memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+                if (base_gfn % KVM_PAGES_PER_HPAGE)
+                        new.lpage_info[0].write_count = 1;
+                if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
+                        new.lpage_info[largepages-1].write_count = 1;
+        }
        /* Allocate page dirty bitmap if needed */
        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
@@ -397,6 +459,12 @@ int is_error_page(struct page *page)
 }
 EXPORT_SYMBOL_GPL(is_error_page);
+int is_error_pfn(pfn_t pfn)
+{
+        return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_error_pfn);
 static inline unsigned long bad_hva(void)
 {
        return PAGE_OFFSET;
@@ -444,7 +512,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot;
@@ -458,7 +526,7 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 /*
 * Requires current->mm->mmap_sem to be held
 */
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
        struct page *page[1];
        unsigned long addr;
@@ -469,7 +537,7 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr)) {
                get_page(bad_page);
-                return bad_page;
+                return page_to_pfn(bad_page);
        }
        npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
@@ -477,27 +545,71 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
        if (npages != 1) {
                get_page(bad_page);
-                return bad_page;
+                return page_to_pfn(bad_page);
        }
-        return page[0];
+        return page_to_pfn(page[0]);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+        return pfn_to_page(gfn_to_pfn(kvm, gfn));
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 void kvm_release_page_clean(struct page *page)
 {
-        put_page(page);
+        kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+void kvm_release_pfn_clean(pfn_t pfn)
+{
+        put_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 void kvm_release_page_dirty(struct page *page)
 {
+        kvm_release_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+void kvm_release_pfn_dirty(pfn_t pfn)
+{
+        kvm_set_pfn_dirty(pfn);
+        kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+void kvm_set_page_dirty(struct page *page)
+{
+        kvm_set_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
+void kvm_set_pfn_dirty(pfn_t pfn)
+{
+        struct page *page = pfn_to_page(pfn);
        if (!PageReserved(page))
                SetPageDirty(page);
-        put_page(page);
 }
-EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+void kvm_set_pfn_accessed(pfn_t pfn)
+{
+        mark_page_accessed(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+void kvm_get_pfn(pfn_t pfn)
+{
+        get_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_get_pfn);
 static int next_segment(unsigned long len, int offset)
 {
@@ -554,7 +666,9 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
+        pagefault_disable();
        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+        pagefault_enable();
        if (r)
                return -EFAULT;
        return 0;
@@ -651,6 +765,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
         * We will block until either an interrupt or a signal wakes us up
         */
        while (!kvm_cpu_has_interrupt(vcpu)
+               && !kvm_cpu_has_pending_timer(vcpu)
               && !signal_pending(current)
               && !kvm_arch_vcpu_runnable(vcpu)) {
                set_current_state(TASK_INTERRUPTIBLE);
@@ -678,8 +793,10 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
+#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
+#endif
        else
                return VM_FAULT_SIGBUS;
        get_page(page);
@@ -701,11 +818,11 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
-        fput(vcpu->kvm->filp);
+        kvm_put_kvm(vcpu->kvm);
        return 0;
 }
-static struct file_operations kvm_vcpu_fops = {
+static const struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .compat_ioctl   = kvm_vcpu_ioctl,
@@ -723,9 +840,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
        r = anon_inode_getfd(&fd, &inode, &file,
                             "kvm-vcpu", &kvm_vcpu_fops, vcpu);
-        if (r)
+        if (r) {
+                kvm_put_kvm(vcpu->kvm);
                return r;
-        atomic_inc(&vcpu->kvm->filp->f_count);
+        }
        return fd;
 }
@@ -760,6 +878,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
        mutex_unlock(&kvm->lock);
        /* Now it's all set up, let userspace reach it */
+        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto unlink;
@@ -802,28 +921,39 @@ static long kvm_vcpu_ioctl(struct file *filp,
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
                break;
        case KVM_GET_REGS: {
-                struct kvm_regs kvm_regs;
+                struct kvm_regs *kvm_regs;
-                memset(&kvm_regs, 0, sizeof kvm_regs);
+                r = -ENOMEM;
-                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
+                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
-                if (r)
+                if (!kvm_regs)
                        goto out;
+                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+                if (r)
+                        goto out_free1;
                r = -EFAULT;
-                if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
+                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
-                        goto out;
+                        goto out_free1;
                r = 0;
+out_free1:
+                kfree(kvm_regs);
                break;
        }
        case KVM_SET_REGS: {
-                struct kvm_regs kvm_regs;
+                struct kvm_regs *kvm_regs;
-                r = -EFAULT;
+                r = -ENOMEM;
-                if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+                if (!kvm_regs)
                        goto out;
-                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+                r = -EFAULT;
+                if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
+                        goto out_free2;
+                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
                if (r)
-                        goto out;
+                        goto out_free2;
                r = 0;
+out_free2:
+                kfree(kvm_regs);
                break;
        }
        case KVM_GET_SREGS: {
@@ -851,6 +981,30 @@ static long kvm_vcpu_ioctl(struct file *filp,
                r = 0;
                break;
        }
+        case KVM_GET_MP_STATE: {
+                struct kvm_mp_state mp_state;
+                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+                if (r)
+                        goto out;
+                r = -EFAULT;
+                if (copy_to_user(argp, &mp_state, sizeof mp_state))
+                        goto out;
+                r = 0;
+                break;
+        }
+        case KVM_SET_MP_STATE: {
+                struct kvm_mp_state mp_state;
+                r = -EFAULT;
+                if (copy_from_user(&mp_state, argp, sizeof mp_state))
+                        goto out;
+                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
+                if (r)
+                        goto out;
+                r = 0;
+                break;
+        }
        case KVM_TRANSLATE: {
                struct kvm_translation tr;
@@ -1005,7 +1159,7 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
-static struct file_operations kvm_vm_fops = {
+static const struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .compat_ioctl   = kvm_vm_ioctl,
@@ -1024,12 +1178,10 @@ static int kvm_dev_ioctl_create_vm(void)
                return PTR_ERR(kvm);
        r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
        if (r) {
-                kvm_destroy_vm(kvm);
+                kvm_put_kvm(kvm);
                return r;
        }
-        kvm->filp = file;
        return fd;
 }
@@ -1059,7 +1211,15 @@ static long kvm_dev_ioctl(struct file *filp,
                r = -EINVAL;
                if (arg)
                        goto out;
-                r = 2 * PAGE_SIZE;
+                r = PAGE_SIZE;     /* struct kvm_run */
+#ifdef CONFIG_X86
+                r += PAGE_SIZE;    /* pio data page */
+#endif
+                break;
+        case KVM_TRACE_ENABLE:
+        case KVM_TRACE_PAUSE:
+        case KVM_TRACE_DISABLE:
+                r = kvm_trace_ioctl(ioctl, arg);
                break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
@@ -1232,9 +1392,9 @@ static void kvm_init_debug(void)
 {
        struct kvm_stats_debugfs_item *p;
-        debugfs_dir = debugfs_create_dir("kvm", NULL);
+        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
        for (p = debugfs_entries; p->name; ++p)
-                p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
+                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
 }
@@ -1245,7 +1405,7 @@ static void kvm_exit_debug(void)
        for (p = debugfs_entries; p->name; ++p)
                debugfs_remove(p->dentry);
-        debugfs_remove(debugfs_dir);
+        debugfs_remove(kvm_debugfs_dir);
 }
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
@@ -1272,6 +1432,7 @@ static struct sys_device kvm_sysdev = {
 };
 struct page *bad_page;
+pfn_t bad_pfn;
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
@@ -1313,6 +1474,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
                goto out;
        }
+        bad_pfn = page_to_pfn(bad_page);
        r = kvm_arch_hardware_setup();
        if (r < 0)
                goto out_free_0;
@@ -1386,6 +1549,7 @@ EXPORT_SYMBOL_GPL(kvm_init);
 void kvm_exit(void)
 {
+        kvm_trace_cleanup();
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        sysdev_unregister(&kvm_sysdev);
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
new file mode 100644
index 00000000000..0e495470788
--- /dev/null
+++ b/virt/kvm/kvm_trace.c
@@ -0,0 +1,276 @@
+/*
+ * kvm trace
+ *
+ * It is designed to allow debugging traces of kvm to be generated
+ * on UP / SMP machines.  Each trace entry can be timestamped so that
+ * it's possible to reconstruct a chronological record of trace events.
+ * The implementation refers to blktrace kernel support.
+ *
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * Authors: Feng(Eric) Liu, eric.e.liu@intel.com
+ *
+ * Date:    Feb 2008
+ */
+#include <linux/module.h>
+#include <linux/relay.h>
+#include <linux/debugfs.h>
+#include <linux/kvm_host.h>
+#define KVM_TRACE_STATE_RUNNING         (1 << 0) /* kvm_add_trace() only records in this state */
+#define KVM_TRACE_STATE_PAUSE           (1 << 1) /* entered from RUNNING by kvm_trace_pause() */
+#define KVM_TRACE_STATE_CLEARUP         (1 << 2) /* set by kvm_trace_cleanup() while tearing down */
+struct kvm_trace {
+        int trace_state; /* one of the KVM_TRACE_STATE_* values */
+        struct rchan *rchan; /* relay channel returned by relay_open() */
+        struct dentry *lost_file; /* debugfs "lost_records" file */
+        atomic_t lost_records; /* bumped each time a full relay buffer is hit */
+};
+static struct kvm_trace *kvm_trace; /* assigned in do_kvm_trace_enable(); NULL before that */
+struct kvm_trace_probe {
+        const char *name; /* marker name handed to marker_probe_register() */
+        const char *format; /* marker format string handed to marker_probe_register() */
+        u32 cycle_in; /* copied into each record; non-zero adds a cycle stamp */
+        marker_probe_func *probe_func; /* callback registered for the marker */
+};
+/*
+ * Number of bytes to write for one trace record: KVM_TRC_HEAD_SIZE plus
+ * @extra payload bytes, plus KVM_TRC_CYCLE_SIZE when @cycle is non-zero.
+ */
+static inline int calc_rec_size(int cycle, int extra)
+{
+        int total = KVM_TRC_HEAD_SIZE;
+        total += extra;
+        if (cycle)
+                total += KVM_TRC_CYCLE_SIZE;
+        return total;
+}
+/*
+ * Marker probe callback: assemble one kvm_trace_rec from the marker's
+ * variadic arguments and write it to the relay channel.
+ *
+ * @probe_private is the struct kvm_trace_probe given at registration.
+ * Arguments are pulled from @args in this order: event id (u32), vcpu
+ * pointer, number of extra u32 values, then the extra values themselves.
+ * The extra count is clamped to KVM_TRC_EXTRA_MAX (after a WARN_ON).
+ * Nothing is written unless tracing is in the RUNNING state.
+ */
+static void kvm_add_trace(void *probe_private, void *call_data,
+                          const char *format, va_list *args)
+{
+        struct kvm_trace_probe *probe = probe_private;
+        struct kvm_trace *trace = kvm_trace;
+        struct kvm_trace_rec rec;
+        struct kvm_vcpu *vcpu;
+        int idx, nr_extra, rec_len;
+        /* Drop the event unless tracing is actively running. */
+        if (unlikely(trace->trace_state != KVM_TRACE_STATE_RUNNING))
+                return;
+        /* The order of the va_arg() calls is significant; keep it. */
+        rec.event = va_arg(*args, u32);
+        vcpu = va_arg(*args, struct kvm_vcpu *);
+        nr_extra = va_arg(*args, u32);
+        rec.pid = current->tgid;
+        rec.vcpu_id = vcpu->vcpu_id;
+        WARN_ON(nr_extra > KVM_TRC_EXTRA_MAX);
+        nr_extra = min_t(u32, nr_extra, KVM_TRC_EXTRA_MAX);
+        rec.extra_u32 = nr_extra;
+        rec.cycle_in = probe->cycle_in;
+        if (!rec.cycle_in) {
+                for (idx = 0; idx < rec.extra_u32; idx++)
+                        rec.u.nocycle.extra_u32[idx] = va_arg(*args, u32);
+        } else {
+                u64 stamp = get_cycles();
+                rec.u.cycle.cycle_lo = (u32)stamp;
+                rec.u.cycle.cycle_hi = (u32)(stamp >> 32);
+                for (idx = 0; idx < rec.extra_u32; idx++)
+                        rec.u.cycle.extra_u32[idx] = va_arg(*args, u32);
+        }
+        rec_len = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32));
+        relay_write(trace->rchan, &rec, rec_len);
+}
+/*
+ * Markers hooked by the tracer.  Both use kvm_add_trace(); only the
+ * entry/exit marker asks for a cycle stamp in its records.
+ */
+static struct kvm_trace_probe kvm_trace_probes[] = {
+        {
+                .name           = "kvm_trace_entryexit",
+                .format         = "%u %p %u %u %u %u %u %u",
+                .cycle_in       = 1,
+                .probe_func     = kvm_add_trace,
+        },
+        {
+                .name           = "kvm_trace_handler",
+                .format         = "%u %p %u %u %u %u %u %u",
+                .cycle_in       = 0,
+                .probe_func     = kvm_add_trace,
+        },
+};
+/*
+ * Read handler behind the "lost_records" debugfs attribute: @data is the
+ * struct kvm_trace, and its lost_records counter is stored in *@val.
+ * Always returns 0.
+ */
+static int lost_records_get(void *data, u64 *val)
+{
+        *val = atomic_read(&((struct kvm_trace *)data)->lost_records);
+        return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n");
+/*
+ *  The relay channel is used in "no-overwrite" mode: it keeps track of how
+ *  many times we encountered a full subbuffer, so that the user-space app
+ *  can be told that records were lost.
+ */
+/*
+ * relay subbuf_start hook.  Returns 1 while the buffer is not full; once
+ * it is full, bumps the owning kvm_trace's lost_records count and
+ * returns 0.
+ */
+static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+                                     void *prev_subbuf, size_t prev_padding)
+{
+        if (relay_buf_full(buf)) {
+                struct kvm_trace *trace = buf->chan->private_data;
+                atomic_inc(&trace->lost_records);
+                return 0;
+        }
+        return 1;
+}
+/*
+ * relay create_buf_file hook: expose @buf as a debugfs file under @parent
+ * using relay_file_operations.  @is_global is left untouched.
+ *
+ * NOTE(review): the name is misspelled ("callack"); it is kept as is
+ * because kvm_relay_callbacks refers to it by this name.
+ */
+static struct dentry *kvm_create_buf_file_callack(const char *filename,
+                                                 struct dentry *parent,
+                                                 int mode,
+                                                 struct rchan_buf *buf,
+                                                 int *is_global)
+{
+        struct dentry *entry;
+        entry = debugfs_create_file(filename, mode, parent, buf,
+                                    &relay_file_operations);
+        return entry;
+}
+/* relay remove_buf_file hook: drop the debugfs entry; always returns 0. */
+static int kvm_remove_buf_file_callback(struct dentry *dentry)
+{
+        const int ok = 0;
+        debugfs_remove(dentry);
+        return ok;
+}
+/* Hooks passed to relay_open() for the "trace" channel. */
+static struct rchan_callbacks kvm_relay_callbacks = {
+        .create_buf_file        = kvm_create_buf_file_callack,
+        .remove_buf_file        = kvm_remove_buf_file_callback,
+        .subbuf_start           = kvm_subbuf_start_callback,
+};
+/*
+ * Allocate the tracing state, create the "lost_records" debugfs file and
+ * the "trace" relay channel sized from @kuts, publish the state through
+ * kvm_trace, register the marker probes and switch to RUNNING.
+ *
+ * Returns 0 on success, -EINVAL if buf_size or buf_nr is zero, -ENOMEM if
+ * the state cannot be allocated, or -EIO if the debugfs file or the relay
+ * channel cannot be created.  A probe that fails to register is only
+ * logged; it does not fail the call.
+ */
+static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts)
+{
+        struct kvm_trace *trace;
+        int idx, err;
+        if (!kuts->buf_size || !kuts->buf_nr)
+                return -EINVAL;
+        trace = kzalloc(sizeof(*trace), GFP_KERNEL);
+        if (!trace)
+                return -ENOMEM;
+        atomic_set(&trace->lost_records, 0);
+        trace->lost_file = debugfs_create_file("lost_records", 0444,
+                                               kvm_debugfs_dir, trace,
+                                               &kvm_trace_lost_ops);
+        if (!trace->lost_file)
+                goto out_free;
+        trace->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size,
+                                  kuts->buf_nr, &kvm_relay_callbacks, trace);
+        if (!trace->rchan)
+                goto out_remove;
+        /* Publish before registering probes: kvm_add_trace() reads it. */
+        kvm_trace = trace;
+        for (idx = 0; idx < ARRAY_SIZE(kvm_trace_probes); idx++) {
+                struct kvm_trace_probe *probe = &kvm_trace_probes[idx];
+                err = marker_probe_register(probe->name, probe->format,
+                                            probe->probe_func, probe);
+                if (err)
+                        printk(KERN_INFO "Unable to register probe %s\n",
+                               probe->name);
+        }
+        kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING;
+        return 0;
+out_remove:
+        debugfs_remove(trace->lost_file);
+out_free:
+        kfree(trace);
+        return -EIO;
+}
+/*
+ * KVM_TRACE_ENABLE backend: copy the setup structure from user space and
+ * start tracing with it.  Returns -EFAULT if the copy fails, otherwise
+ * whatever do_kvm_trace_enable() returns.
+ */
+static int kvm_trace_enable(char __user *arg)
+{
+        struct kvm_user_trace_setup setup;
+        if (copy_from_user(&setup, arg, sizeof(setup)))
+                return -EFAULT;
+        return do_kvm_trace_enable(&setup);
+}
+/*
+ * KVM_TRACE_PAUSE backend: move a RUNNING trace to PAUSE and flush the
+ * relay channel.  Returns 0 on success, or -EINVAL when there is no
+ * trace state or it is not RUNNING.
+ */
+static int kvm_trace_pause(void)
+{
+        struct kvm_trace *trace = kvm_trace;
+        if (trace == NULL || trace->trace_state != KVM_TRACE_STATE_RUNNING)
+                return -EINVAL;
+        trace->trace_state = KVM_TRACE_STATE_PAUSE;
+        relay_flush(trace->rchan);
+        return 0;
+}
+/*
+ * Tear down tracing: unregister the marker probes, close the relay
+ * channel, remove the "lost_records" debugfs file and free the state.
+ * Does nothing when there is no trace state or it is neither RUNNING
+ * nor PAUSE.
+ *
+ * This runs for the KVM_TRACE_DISABLE ioctl and again from kvm_exit(),
+ * so it has to tolerate repeated calls.  The global pointer is therefore
+ * cleared before the state is freed; otherwise a later call (or
+ * kvm_trace_pause()) would dereference, and possibly free again, memory
+ * that has already been released.
+ */
+void kvm_trace_cleanup(void)
+{
+        struct kvm_trace *kt = kvm_trace;
+        int i;
+        if (kt == NULL)
+                return;
+        if (kt->trace_state != KVM_TRACE_STATE_RUNNING &&
+            kt->trace_state != KVM_TRACE_STATE_PAUSE)
+                return;
+        kt->trace_state = KVM_TRACE_STATE_CLEARUP;
+        for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
+                struct kvm_trace_probe *p = &kvm_trace_probes[i];
+                marker_probe_unregister(p->name, p->probe_func, p);
+        }
+        /* Unpublish first so nothing can reach the state once it is freed. */
+        kvm_trace = NULL;
+        relay_close(kt->rchan);
+        debugfs_remove(kt->lost_file);
+        kfree(kt);
+}
+/*
+ * Dispatch the KVM_TRACE_* ioctls.  The caller needs CAP_SYS_ADMIN,
+ * otherwise -EPERM is returned.  ENABLE and PAUSE return their handler's
+ * result, DISABLE always returns 0, and any other @ioctl yields -EINVAL.
+ */
+int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
+{
+        void __user *uarg = (void __user *)arg;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        switch (ioctl) {
+        case KVM_TRACE_ENABLE:
+                return kvm_trace_enable(uarg);
+        case KVM_TRACE_PAUSE:
+                return kvm_trace_pause();
+        case KVM_TRACE_DISABLE:
+                kvm_trace_cleanup();
+                return 0;
+        }
+        return -EINVAL;
+}
author	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-27 13:13:52 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-27 13:13:52 -0400
commit	42cadc86008aae0fd9ff31642dc01ed50723cf32 (patch)
tree	b05d4c8f0561bad5a0183a89fb23ce4c8ee1653c
parent	fba5c1af5c4fd6645fe62ea84ccde0981282cf66 (diff)
parent	66c0b394f08fd89236515c1c84485ea712a157be (diff)