Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/camellia_glue.c | 64
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S | 643
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c | 151
-rw-r--r--  arch/x86/include/asm/dma-contiguous.h | 1
-rw-r--r--  arch/x86/include/asm/jump_label.h | 9
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/pci.h | 30
-rw-r--r--  arch/x86/include/asm/pgtable.h | 34
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 37
-rw-r--r--  arch/x86/include/asm/vmx.h | 2
-rw-r--r--  arch/x86/include/asm/xen/events.h | 1
-rw-r--r--  arch/x86/include/asm/xor_avx.h | 4
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 2
-rw-r--r--  arch/x86/kernel/devicetree.c | 3
-rw-r--r--  arch/x86/kernel/early-quirks.c | 154
-rw-r--r--  arch/x86/kernel/entry_32.S | 3
-rw-r--r--  arch/x86/kernel/jump_label.c | 70
-rw-r--r--  arch/x86/kernel/paravirt.c | 5
-rw-r--r--  arch/x86/kernel/pvclock.c | 44
-rw-r--r--  arch/x86/kernel/x86_init.c | 24
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 38
-rw-r--r--  arch/x86/kvm/mmu.c | 181
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 178
-rw-r--r--  arch/x86/kvm/pmu.c | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 441
-rw-r--r--  arch/x86/kvm/x86.c | 224
-rw-r--r--  arch/x86/lguest/boot.c | 10
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 8
-rw-r--r--  arch/x86/mm/tlb.c | 14
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 18
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 24
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 2
-rw-r--r--  arch/x86/um/os-Linux/prctl.c | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 16
-rw-r--r--  arch/x86/xen/enlighten.c | 16
-rw-r--r--  arch/x86/xen/irq.c | 25
-rw-r--r--  arch/x86/xen/p2m.c | 26
-rw-r--r--  arch/x86/xen/setup.c | 29
-rw-r--r--  arch/x86/xen/smp.c | 34
-rw-r--r--  arch/x86/xen/spinlock.c | 45
47 files changed, 2137 insertions, 532 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c0ed72c02a2..30c40f08a3d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2032,7 +2032,6 @@ menu "Bus options (PCI etc.)"
 config PCI
 	bool "PCI support"
 	default y
-	select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
 	---help---
 	  Find out whether you have a PCI motherboard. PCI is the name of a
 	  bus system, i.e. the way the CPU talks to the other stuff inside
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 6c63c358a7e6..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -81,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 5cb86ccd4acb..c171dcbf192d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 }
 
 /* camellia sboxes */
-const u64 camellia_sp10011110[256] = {
+__visible const u64 camellia_sp10011110[256] = {
 	0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
 	0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
 	0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
@@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = {
 	0x9e00009e9e9e9e00ULL,
 };
 
-const u64 camellia_sp22000222[256] = {
+__visible const u64 camellia_sp22000222[256] = {
 	0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
 	0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
 	0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
@@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = {
 	0x3d3d0000003d3d3dULL,
 };
 
-const u64 camellia_sp03303033[256] = {
+__visible const u64 camellia_sp03303033[256] = {
 	0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
 	0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
 	0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
@@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = {
 	0x004f4f004f004f4fULL,
 };
 
-const u64 camellia_sp00444404[256] = {
+__visible const u64 camellia_sp00444404[256] = {
 	0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
 	0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
 	0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
@@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = {
 	0x00009e9e9e9e009eULL,
 };
 
-const u64 camellia_sp02220222[256] = {
+__visible const u64 camellia_sp02220222[256] = {
 	0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
 	0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
 	0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
@@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = {
 	0x003d3d3d003d3d3dULL,
 };
 
-const u64 camellia_sp30333033[256] = {
+__visible const u64 camellia_sp30333033[256] = {
 	0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
 	0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
 	0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
@@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = {
 	0x4f004f4f4f004f4fULL,
 };
 
-const u64 camellia_sp44044404[256] = {
+__visible const u64 camellia_sp44044404[256] = {
 	0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
 	0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
 	0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
@@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = {
 	0x9e9e009e9e9e009eULL,
 };
 
-const u64 camellia_sp11101110[256] = {
+__visible const u64 camellia_sp11101110[256] = {
 	0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
 	0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
 	0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
@@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
 	/* modified for FLinv(kl2) */
-	dw = (subRL[1] & subRL[9]) >> 32,
-		subRL[1] ^= rol32(dw, 1);
+	dw = (subRL[1] & subRL[9]) >> 32;
+	subRL[1] ^= rol32(dw, 1);
 
 	/* round 8 */
 	subRL[11] ^= subRL[1];
@@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
 	/* modified for FLinv(kl4) */
-	dw = (subRL[1] & subRL[17]) >> 32,
-		subRL[1] ^= rol32(dw, 1);
+	dw = (subRL[1] & subRL[17]) >> 32;
+	subRL[1] ^= rol32(dw, 1);
 
 	/* round 14 */
 	subRL[19] ^= subRL[1];
@@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	} else {
 		subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
 		/* modified for FLinv(kl6) */
-		dw = (subRL[1] & subRL[25]) >> 32,
-			subRL[1] ^= rol32(dw, 1);
+		dw = (subRL[1] & subRL[25]) >> 32;
+		subRL[1] ^= rol32(dw, 1);
 
 		/* round 20 */
 		subRL[27] ^= subRL[1];
@@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 		kw4 ^= (kw4 & ~subRL[24]) << 32;
 		/* modified for FL(kl5) */
-		dw = (kw4 & subRL[24]) >> 32,
-			kw4 ^= rol32(dw, 1);
+		dw = (kw4 & subRL[24]) >> 32;
+		kw4 ^= rol32(dw, 1);
 	}
 
 	/* round 17 */
@@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	kw4 ^= (kw4 & ~subRL[16]) << 32;
 	/* modified for FL(kl3) */
-	dw = (kw4 & subRL[16]) >> 32,
-		kw4 ^= rol32(dw, 1);
+	dw = (kw4 & subRL[16]) >> 32;
+	kw4 ^= rol32(dw, 1);
 
 	/* round 11 */
 	subRL[14] ^= kw4;
@@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	kw4 ^= (kw4 & ~subRL[8]) << 32;
 	/* modified for FL(kl1) */
-	dw = (kw4 & subRL[8]) >> 32,
-		kw4 ^= rol32(dw, 1);
+	dw = (kw4 & subRL[8]) >> 32;
+	kw4 ^= rol32(dw, 1);
 
 	/* round 5 */
 	subRL[6] ^= kw4;
@@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]);		/* round 5 */
 
 	tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
-	dw = tl & (subRL[8] >> 32),			/* FL(kl1) */
-		tr = subRL[10] ^ rol32(dw, 1);
+	dw = tl & (subRL[8] >> 32);			/* FL(kl1) */
+	tr = subRL[10] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(7, subRL[6] ^ tt);		/* round 6 */
@@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(9, subRL[9]);			/* FLinv(kl2) */
 
 	tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
-	dw = tl & (subRL[9] >> 32),			/* FLinv(kl2) */
-		tr = subRL[7] ^ rol32(dw, 1);
+	dw = tl & (subRL[9] >> 32);			/* FLinv(kl2) */
+	tr = subRL[7] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(10, subRL[11] ^ tt);		/* round 7 */
@@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]);	/* round 11 */
 
 	tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
-	dw = tl & (subRL[16] >> 32),			/* FL(kl3) */
-		tr = subRL[18] ^ rol32(dw, 1);
+	dw = tl & (subRL[16] >> 32);			/* FL(kl3) */
+	tr = subRL[18] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(15, subRL[14] ^ tt);		/* round 12 */
@@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(17, subRL[17]);			/* FLinv(kl4) */
 
 	tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
-	dw = tl & (subRL[17] >> 32),			/* FLinv(kl4) */
-		tr = subRL[15] ^ rol32(dw, 1);
+	dw = tl & (subRL[17] >> 32);			/* FLinv(kl4) */
+	tr = subRL[15] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(18, subRL[19] ^ tt);		/* round 13 */
@@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 		SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]);	/* kw3 */
 	} else {
 		tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
-		dw = tl & (subRL[24] >> 32),			/* FL(kl5) */
-			tr = subRL[26] ^ rol32(dw, 1);
+		dw = tl & (subRL[24] >> 32);			/* FL(kl5) */
+		tr = subRL[26] ^ rol32(dw, 1);
 		tt = (tr | ((u64)tl << 32));
 
 		SET_SUBKEY_LR(23, subRL[22] ^ tt);		/* round 18 */
@@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 		SET_SUBKEY_LR(25, subRL[25]);			/* FLinv(kl6) */
 
 		tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
-		dw = tl & (subRL[25] >> 32),			/* FLinv(kl6) */
-			tr = subRL[23] ^ rol32(dw, 1);
+		dw = tl & (subRL[25] >> 32);			/* FLinv(kl6) */
+		tr = subRL[23] ^ rol32(dw, 1);
 		tt = (tr | ((u64)tl << 32));
 
 		SET_SUBKEY_LR(26, subRL[27] ^ tt);		/* round 19 */
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..35e97569d05f
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,643 @@
1########################################################################
2# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
3#
4# Copyright (c) 2013, Intel Corporation
5#
6# Authors:
7# Erdinc Ozturk <erdinc.ozturk@intel.com>
8# Vinodh Gopal <vinodh.gopal@intel.com>
9# James Guilford <james.guilford@intel.com>
10# Tim Chen <tim.c.chen@linux.intel.com>
11#
12# This software is available to you under a choice of one of two
13# licenses. You may choose to be licensed under the terms of the GNU
14# General Public License (GPL) Version 2, available from the file
15# COPYING in the main directory of this source tree, or the
16# OpenIB.org BSD license below:
17#
18# Redistribution and use in source and binary forms, with or without
19# modification, are permitted provided that the following conditions are
20# met:
21#
22# * Redistributions of source code must retain the above copyright
23# notice, this list of conditions and the following disclaimer.
24#
25# * Redistributions in binary form must reproduce the above copyright
26# notice, this list of conditions and the following disclaimer in the
27# documentation and/or other materials provided with the
28# distribution.
29#
30# * Neither the name of the Intel Corporation nor the names of its
31# contributors may be used to endorse or promote products derived from
32# this software without specific prior written permission.
33#
34#
35# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
36# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
39# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46########################################################################
47# Function API:
48# UINT16 crc_t10dif_pcl(
49# UINT16 init_crc, //initial CRC value, 16 bits
50# const unsigned char *buf, //buffer pointer to calculate CRC on
51# UINT64 len //buffer length in bytes (64-bit data)
52# );
53#
54# Reference paper titled "Fast CRC Computation for Generic
55# Polynomials Using PCLMULQDQ Instruction"
56# URL: http://www.intel.com/content/dam/www/public/us/en/documents
57# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
58#
59#
60
61#include <linux/linkage.h>
62
63.text
64
65#define arg1 %rdi
66#define arg2 %rsi
67#define arg3 %rdx
68
69#define arg1_low32 %edi
70
71ENTRY(crc_t10dif_pcl)
72.align 16
73
74 # adjust the 16-bit initial_crc value, scale it to 32 bits
75 shl $16, arg1_low32
76
77 # Allocate Stack Space
78 mov %rsp, %rcx
79 sub $16*2, %rsp
80 # align stack to 16 byte boundary
81 and $~(0x10 - 1), %rsp
82
83 # check if smaller than 256
84 cmp $256, arg3
85
86 # for sizes less than 128, we can't fold 64B at a time...
87 jl _less_than_128
88
89
90 # load the initial crc value
91 movd arg1_low32, %xmm10 # initial crc
92
93 # crc value does not need to be byte-reflected, but it needs
94 # to be moved to the high part of the register.
95 # because data will be byte-reflected and will align with
96 # initial crc at correct place.
97 pslldq $12, %xmm10
98
99 movdqa SHUF_MASK(%rip), %xmm11
100 # receive the initial 64B data, xor the initial crc value
101 movdqu 16*0(arg2), %xmm0
102 movdqu 16*1(arg2), %xmm1
103 movdqu 16*2(arg2), %xmm2
104 movdqu 16*3(arg2), %xmm3
105 movdqu 16*4(arg2), %xmm4
106 movdqu 16*5(arg2), %xmm5
107 movdqu 16*6(arg2), %xmm6
108 movdqu 16*7(arg2), %xmm7
109
110 pshufb %xmm11, %xmm0
111 # XOR the initial_crc value
112 pxor %xmm10, %xmm0
113 pshufb %xmm11, %xmm1
114 pshufb %xmm11, %xmm2
115 pshufb %xmm11, %xmm3
116 pshufb %xmm11, %xmm4
117 pshufb %xmm11, %xmm5
118 pshufb %xmm11, %xmm6
119 pshufb %xmm11, %xmm7
120
121 movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
122 #imm value of pclmulqdq instruction
123 #will determine which constant to use
124
125 #################################################################
126 # we subtract 256 instead of 128 to save one instruction from the loop
127 sub $256, arg3
128
129 # at this section of the code, there is 64*x+y (0<=y<64) bytes of
130 # buffer. The _fold_64_B_loop will fold 64B at a time
131 # until we have 64+y Bytes of buffer
132
133
134 # fold 64B at a time. This section of the code folds 4 xmm
135 # registers in parallel
136_fold_64_B_loop:
137
138 # update the buffer pointer
139 add $128, arg2 # buf += 64#
140
141 movdqu 16*0(arg2), %xmm9
142 movdqu 16*1(arg2), %xmm12
143 pshufb %xmm11, %xmm9
144 pshufb %xmm11, %xmm12
145 movdqa %xmm0, %xmm8
146 movdqa %xmm1, %xmm13
147 pclmulqdq $0x0 , %xmm10, %xmm0
148 pclmulqdq $0x11, %xmm10, %xmm8
149 pclmulqdq $0x0 , %xmm10, %xmm1
150 pclmulqdq $0x11, %xmm10, %xmm13
151 pxor %xmm9 , %xmm0
152 xorps %xmm8 , %xmm0
153 pxor %xmm12, %xmm1
154 xorps %xmm13, %xmm1
155
156 movdqu 16*2(arg2), %xmm9
157 movdqu 16*3(arg2), %xmm12
158 pshufb %xmm11, %xmm9
159 pshufb %xmm11, %xmm12
160 movdqa %xmm2, %xmm8
161 movdqa %xmm3, %xmm13
162 pclmulqdq $0x0, %xmm10, %xmm2
163 pclmulqdq $0x11, %xmm10, %xmm8
164 pclmulqdq $0x0, %xmm10, %xmm3
165 pclmulqdq $0x11, %xmm10, %xmm13
166 pxor %xmm9 , %xmm2
167 xorps %xmm8 , %xmm2
168 pxor %xmm12, %xmm3
169 xorps %xmm13, %xmm3
170
171 movdqu 16*4(arg2), %xmm9
172 movdqu 16*5(arg2), %xmm12
173 pshufb %xmm11, %xmm9
174 pshufb %xmm11, %xmm12
175 movdqa %xmm4, %xmm8
176 movdqa %xmm5, %xmm13
177 pclmulqdq $0x0, %xmm10, %xmm4
178 pclmulqdq $0x11, %xmm10, %xmm8
179 pclmulqdq $0x0, %xmm10, %xmm5
180 pclmulqdq $0x11, %xmm10, %xmm13
181 pxor %xmm9 , %xmm4
182 xorps %xmm8 , %xmm4
183 pxor %xmm12, %xmm5
184 xorps %xmm13, %xmm5
185
186 movdqu 16*6(arg2), %xmm9
187 movdqu 16*7(arg2), %xmm12
188 pshufb %xmm11, %xmm9
189 pshufb %xmm11, %xmm12
190 movdqa %xmm6 , %xmm8
191 movdqa %xmm7 , %xmm13
192 pclmulqdq $0x0 , %xmm10, %xmm6
193 pclmulqdq $0x11, %xmm10, %xmm8
194 pclmulqdq $0x0 , %xmm10, %xmm7
195 pclmulqdq $0x11, %xmm10, %xmm13
196 pxor %xmm9 , %xmm6
197 xorps %xmm8 , %xmm6
198 pxor %xmm12, %xmm7
199 xorps %xmm13, %xmm7
200
201 sub $128, arg3
202
203 # check if there is another 64B in the buffer to be able to fold
204 jge _fold_64_B_loop
205 ##################################################################
206
207
208 add $128, arg2
209 # at this point, the buffer pointer is pointing at the last y Bytes
210 # of the buffer the 64B of folded data is in 4 of the xmm
211 # registers: xmm0, xmm1, xmm2, xmm3
212
213
214 # fold the 8 xmm registers to 1 xmm register with different constants
215
216 movdqa rk9(%rip), %xmm10
217 movdqa %xmm0, %xmm8
218 pclmulqdq $0x11, %xmm10, %xmm0
219 pclmulqdq $0x0 , %xmm10, %xmm8
220 pxor %xmm8, %xmm7
221 xorps %xmm0, %xmm7
222
223 movdqa rk11(%rip), %xmm10
224 movdqa %xmm1, %xmm8
225 pclmulqdq $0x11, %xmm10, %xmm1
226 pclmulqdq $0x0 , %xmm10, %xmm8
227 pxor %xmm8, %xmm7
228 xorps %xmm1, %xmm7
229
230 movdqa rk13(%rip), %xmm10
231 movdqa %xmm2, %xmm8
232 pclmulqdq $0x11, %xmm10, %xmm2
233 pclmulqdq $0x0 , %xmm10, %xmm8
234 pxor %xmm8, %xmm7
235 pxor %xmm2, %xmm7
236
237 movdqa rk15(%rip), %xmm10
238 movdqa %xmm3, %xmm8
239 pclmulqdq $0x11, %xmm10, %xmm3
240 pclmulqdq $0x0 , %xmm10, %xmm8
241 pxor %xmm8, %xmm7
242 xorps %xmm3, %xmm7
243
244 movdqa rk17(%rip), %xmm10
245 movdqa %xmm4, %xmm8
246 pclmulqdq $0x11, %xmm10, %xmm4
247 pclmulqdq $0x0 , %xmm10, %xmm8
248 pxor %xmm8, %xmm7
249 pxor %xmm4, %xmm7
250
251 movdqa rk19(%rip), %xmm10
252 movdqa %xmm5, %xmm8
253 pclmulqdq $0x11, %xmm10, %xmm5
254 pclmulqdq $0x0 , %xmm10, %xmm8
255 pxor %xmm8, %xmm7
256 xorps %xmm5, %xmm7
257
258 movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259 #imm value of pclmulqdq instruction
260 #will determine which constant to use
261 movdqa %xmm6, %xmm8
262 pclmulqdq $0x11, %xmm10, %xmm6
263 pclmulqdq $0x0 , %xmm10, %xmm8
264 pxor %xmm8, %xmm7
265 pxor %xmm6, %xmm7
266
267
268 # instead of 64, we add 48 to the loop counter to save 1 instruction
269 # from the loop instead of a cmp instruction, we use the negative
270 # flag with the jl instruction
271 add $128-16, arg3
272 jl _final_reduction_for_128
273
274 # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275 # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276 # continue folding 16B at a time
277
278_16B_reduction_loop:
279 movdqa %xmm7, %xmm8
280 pclmulqdq $0x11, %xmm10, %xmm7
281 pclmulqdq $0x0 , %xmm10, %xmm8
282 pxor %xmm8, %xmm7
283 movdqu (arg2), %xmm0
284 pshufb %xmm11, %xmm0
285 pxor %xmm0 , %xmm7
286 add $16, arg2
287 sub $16, arg3
288 # instead of a cmp instruction, we utilize the flags with the
289 # jge instruction equivalent of: cmp arg3, 16-16
290 # check if there is any more 16B in the buffer to be able to fold
291 jge _16B_reduction_loop
292
293 #now we have 16+z bytes left to reduce, where 0<= z < 16.
294 #first, we reduce the data in the xmm7 register
295
296
297_final_reduction_for_128:
298 # check if any more data to fold. If not, compute the CRC of
299 # the final 128 bits
300 add $16, arg3
301 je _128_done
302
303 # here we are getting data that is less than 16 bytes.
304 # since we know that there was data before the pointer, we can
305 # offset the input pointer before the actual point, to receive
306 # exactly 16 bytes. after that the registers need to be adjusted.
307_get_last_two_xmms:
308 movdqa %xmm7, %xmm2
309
310 movdqu -16(arg2, arg3), %xmm1
311 pshufb %xmm11, %xmm1
312
313 # get rid of the extra data that was loaded before
314 # load the shift constant
315 lea pshufb_shf_table+16(%rip), %rax
316 sub arg3, %rax
317 movdqu (%rax), %xmm0
318
319 # shift xmm2 to the left by arg3 bytes
320 pshufb %xmm0, %xmm2
321
322 # shift xmm7 to the right by 16-arg3 bytes
323 pxor mask1(%rip), %xmm0
324 pshufb %xmm0, %xmm7
325 pblendvb %xmm2, %xmm1 #xmm0 is implicit
326
327 # fold 16 Bytes
328 movdqa %xmm1, %xmm2
329 movdqa %xmm7, %xmm8
330 pclmulqdq $0x11, %xmm10, %xmm7
331 pclmulqdq $0x0 , %xmm10, %xmm8
332 pxor %xmm8, %xmm7
333 pxor %xmm2, %xmm7
334
335_128_done:
336 # compute crc of a 128-bit value
337 movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
338 movdqa %xmm7, %xmm0
339
340 #64b fold
341 pclmulqdq $0x1, %xmm10, %xmm7
342 pslldq $8 , %xmm0
343 pxor %xmm0, %xmm7
344
345 #32b fold
346 movdqa %xmm7, %xmm0
347
348 pand mask2(%rip), %xmm0
349
350 psrldq $12, %xmm7
351 pclmulqdq $0x10, %xmm10, %xmm7
352 pxor %xmm0, %xmm7
353
354 #barrett reduction
355_barrett:
356 movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
357 movdqa %xmm7, %xmm0
358 pclmulqdq $0x01, %xmm10, %xmm7
359 pslldq $4, %xmm7
360 pclmulqdq $0x11, %xmm10, %xmm7
361
362 pslldq $4, %xmm7
363 pxor %xmm0, %xmm7
364 pextrd $1, %xmm7, %eax
365
366_cleanup:
367 # scale the result back to 16 bits
368 shr $16, %eax
369 mov %rcx, %rsp
370 ret
371
372########################################################################
373
374.align 16
375_less_than_128:
376
377 # check if there is enough buffer to be able to fold 16B at a time
378 cmp $32, arg3
379 jl _less_than_32
380 movdqa SHUF_MASK(%rip), %xmm11
381
382 # now if there is, load the constants
383 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
384
385 movd arg1_low32, %xmm0 # get the initial crc value
386 pslldq $12, %xmm0 # align it to its correct place
387 movdqu (arg2), %xmm7 # load the plaintext
388 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
389 pxor %xmm0, %xmm7
390
391
392 # update the buffer pointer
393 add $16, arg2
394
395 # update the counter. subtract 32 instead of 16 to save one
396 # instruction from the loop
397 sub $32, arg3
398
399 jmp _16B_reduction_loop
400
401
402.align 16
403_less_than_32:
404 # mov initial crc to the return value. this is necessary for
405 # zero-length buffers.
406 mov arg1_low32, %eax
407 test arg3, arg3
408 je _cleanup
409
410 movdqa SHUF_MASK(%rip), %xmm11
411
412 movd arg1_low32, %xmm0 # get the initial crc value
413 pslldq $12, %xmm0 # align it to its correct place
414
415 cmp $16, arg3
416 je _exact_16_left
417 jl _less_than_16_left
418
419 movdqu (arg2), %xmm7 # load the plaintext
420 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421 pxor %xmm0 , %xmm7 # xor the initial crc value
422 add $16, arg2
423 sub $16, arg3
424 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425 jmp _get_last_two_xmms
426
427
428.align 16
429_less_than_16_left:
430 # use stack space to load data less than 16 bytes, zero-out
431 # the 16B in memory first.
432
433 pxor %xmm1, %xmm1
434 mov %rsp, %r11
435 movdqa %xmm1, (%r11)
436
437 cmp $4, arg3
438 jl _only_less_than_4
439
440 # backup the counter value
441 mov arg3, %r9
442 cmp $8, arg3
443 jl _less_than_8_left
444
445 # load 8 Bytes
446 mov (arg2), %rax
447 mov %rax, (%r11)
448 add $8, %r11
449 sub $8, arg3
450 add $8, arg2
451_less_than_8_left:
452
453 cmp $4, arg3
454 jl _less_than_4_left
455
456 # load 4 Bytes
457 mov (arg2), %eax
458 mov %eax, (%r11)
459 add $4, %r11
460 sub $4, arg3
461 add $4, arg2
462_less_than_4_left:
463
464 cmp $2, arg3
465 jl _less_than_2_left
466
467 # load 2 Bytes
468 mov (arg2), %ax
469 mov %ax, (%r11)
470 add $2, %r11
471 sub $2, arg3
472 add $2, arg2
473_less_than_2_left:
474 cmp $1, arg3
475 jl _zero_left
476
477 # load 1 Byte
478 mov (arg2), %al
479 mov %al, (%r11)
480_zero_left:
481 movdqa (%rsp), %xmm7
482 pshufb %xmm11, %xmm7
483 pxor %xmm0 , %xmm7 # xor the initial crc value
484
485 # shl r9, 4
486 lea pshufb_shf_table+16(%rip), %rax
487 sub %r9, %rax
488 movdqu (%rax), %xmm0
489 pxor mask1(%rip), %xmm0
490
491 pshufb %xmm0, %xmm7
492 jmp _128_done
493
494.align 16
495_exact_16_left:
496 movdqu (arg2), %xmm7
497 pshufb %xmm11, %xmm7
498 pxor %xmm0 , %xmm7 # xor the initial crc value
499
500 jmp _128_done
501
502_only_less_than_4:
503 cmp $3, arg3
504 jl _only_less_than_3
505
506 # load 3 Bytes
507 mov (arg2), %al
508 mov %al, (%r11)
509
510 mov 1(arg2), %al
511 mov %al, 1(%r11)
512
513 mov 2(arg2), %al
514 mov %al, 2(%r11)
515
516 movdqa (%rsp), %xmm7
517 pshufb %xmm11, %xmm7
518 pxor %xmm0 , %xmm7 # xor the initial crc value
519
520 psrldq $5, %xmm7
521
522 jmp _barrett
523_only_less_than_3:
524 cmp $2, arg3
525 jl _only_less_than_2
526
527 # load 2 Bytes
528 mov (arg2), %al
529 mov %al, (%r11)
530
531 mov 1(arg2), %al
532 mov %al, 1(%r11)
533
534 movdqa (%rsp), %xmm7
535 pshufb %xmm11, %xmm7
536 pxor %xmm0 , %xmm7 # xor the initial crc value
537
538 psrldq $6, %xmm7
539
540 jmp _barrett
541_only_less_than_2:
542
543 # load 1 Byte
544 mov (arg2), %al
545 mov %al, (%r11)
546
547 movdqa (%rsp), %xmm7
548 pshufb %xmm11, %xmm7
549 pxor %xmm0 , %xmm7 # xor the initial crc value
550
551 psrldq $7, %xmm7
552
553 jmp _barrett
554
555ENDPROC(crc_t10dif_pcl)
556
557.data
558
559# precomputed constants
560# these constants are precomputed from the poly:
561# 0x8bb70000 (0x8bb7 scaled to 32 bits)
562.align 16
563# Q = 0x18BB70000
564# rk1 = 2^(32*3) mod Q << 32
565# rk2 = 2^(32*5) mod Q << 32
566# rk3 = 2^(32*15) mod Q << 32
567# rk4 = 2^(32*17) mod Q << 32
568# rk5 = 2^(32*3) mod Q << 32
569# rk6 = 2^(32*2) mod Q << 32
570# rk7 = floor(2^64/Q)
571# rk8 = Q
572rk1:
573.quad 0x2d56000000000000
574rk2:
575.quad 0x06df000000000000
576rk3:
577.quad 0x9d9d000000000000
578rk4:
579.quad 0x7cf5000000000000
580rk5:
581.quad 0x2d56000000000000
582rk6:
583.quad 0x1368000000000000
584rk7:
585.quad 0x00000001f65a57f8
586rk8:
587.quad 0x000000018bb70000
588
589rk9:
590.quad 0xceae000000000000
591rk10:
592.quad 0xbfd6000000000000
593rk11:
594.quad 0x1e16000000000000
595rk12:
596.quad 0x713c000000000000
597rk13:
598.quad 0xf7f9000000000000
599rk14:
600.quad 0x80a6000000000000
601rk15:
602.quad 0x044c000000000000
603rk16:
604.quad 0xe658000000000000
605rk17:
606.quad 0xad18000000000000
607rk18:
608.quad 0xa497000000000000
609rk19:
610.quad 0x6ee3000000000000
611rk20:
612.quad 0xe7b5000000000000
613
614
615
616mask1:
617.octa 0x80808080808080808080808080808080
618mask2:
619.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
620
621SHUF_MASK:
622.octa 0x000102030405060708090A0B0C0D0E0F
623
624pshufb_shf_table:
625# use these values for shift constants for the pshufb instruction
626# different alignments result in values as shown:
627# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
628# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
629# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
630# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
631# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
632# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
633# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
634# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
635# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
636# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
637# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
638# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
639# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
640# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
641# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
642.octa 0x8f8e8d8c8b8a89888786858483828100
643.octa 0x000e0d0c0b0a09080706050403020100
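
The header of crct10dif-pcl-asm_64.S above documents the crc_t10dif_pcl() entry point and the 0x8bb7 polynomial from which the rk1-rk20 folding constants are derived. As a reference model only (not part of this patch), a bit-at-a-time C version of the same CRC looks like the sketch below; the PCLMULQDQ-folded routine and the crc_t10dif_generic() fallback are both expected to agree with it for any buffer:

	/*
	 * Minimal bit-at-a-time reference model for CRC-T10DIF
	 * (polynomial 0x8bb7, initial value 0, no byte reflection).
	 * Unoptimized on purpose; useful only as a sanity check.
	 */
	static u16 crc_t10dif_ref(u16 crc, const unsigned char *buf, size_t len)
	{
		size_t i;
		int bit;

		for (i = 0; i < len; i++) {
			crc ^= (u16)buf[i] << 8;	/* feed next byte, MSB first */
			for (bit = 0; bit < 8; bit++)
				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
						     : crc << 1;
		}
		return crc;
	}

The assembly reaches the same 16-bit value much faster by folding 64 bytes per iteration with pclmulqdq against rk3/rk4, collapsing the eight accumulators with rk9-rk20, and finishing with a Barrett reduction using rk7 (floor(2^64/Q)) and rk8 (Q).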
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000000..7845d7fd54c0
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,151 @@
1/*
2 * Cryptographic API.
3 *
4 * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
5 *
6 * Copyright (C) 2013 Intel Corporation
7 * Author: Tim Chen <tim.c.chen@linux.intel.com>
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the Free
11 * Software Foundation; either version 2 of the License, or (at your option)
12 * any later version.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/module.h>
27#include <linux/crc-t10dif.h>
28#include <crypto/internal/hash.h>
29#include <linux/init.h>
30#include <linux/string.h>
31#include <linux/kernel.h>
32#include <asm/i387.h>
33#include <asm/cpufeature.h>
34#include <asm/cpu_device_id.h>
35
36asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
37 size_t len);
38
39struct chksum_desc_ctx {
40 __u16 crc;
41};
42
43/*
44 * Steps through buffer one byte at at time, calculates reflected
45 * crc using table.
46 */
47
48static int chksum_init(struct shash_desc *desc)
49{
50 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
51
52 ctx->crc = 0;
53
54 return 0;
55}
56
57static int chksum_update(struct shash_desc *desc, const u8 *data,
58 unsigned int length)
59{
60 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
61
62 if (irq_fpu_usable()) {
63 kernel_fpu_begin();
64 ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
65 kernel_fpu_end();
66 } else
67 ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
68 return 0;
69}
70
71static int chksum_final(struct shash_desc *desc, u8 *out)
72{
73 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
74
75 *(__u16 *)out = ctx->crc;
76 return 0;
77}
78
79static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
80 u8 *out)
81{
82 if (irq_fpu_usable()) {
83 kernel_fpu_begin();
84 *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
85 kernel_fpu_end();
86 } else
87 *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
88 return 0;
89}
90
91static int chksum_finup(struct shash_desc *desc, const u8 *data,
92 unsigned int len, u8 *out)
93{
94 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
95
96 return __chksum_finup(&ctx->crc, data, len, out);
97}
98
99static int chksum_digest(struct shash_desc *desc, const u8 *data,
100 unsigned int length, u8 *out)
101{
102 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
103
104 return __chksum_finup(&ctx->crc, data, length, out);
105}
106
107static struct shash_alg alg = {
108 .digestsize = CRC_T10DIF_DIGEST_SIZE,
109 .init = chksum_init,
110 .update = chksum_update,
111 .final = chksum_final,
112 .finup = chksum_finup,
113 .digest = chksum_digest,
114 .descsize = sizeof(struct chksum_desc_ctx),
115 .base = {
116 .cra_name = "crct10dif",
117 .cra_driver_name = "crct10dif-pclmul",
118 .cra_priority = 200,
119 .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
120 .cra_module = THIS_MODULE,
121 }
122};
123
124static const struct x86_cpu_id crct10dif_cpu_id[] = {
125 X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
126 {}
127};
128MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
129
130static int __init crct10dif_intel_mod_init(void)
131{
132 if (!x86_match_cpu(crct10dif_cpu_id))
133 return -ENODEV;
134
135 return crypto_register_shash(&alg);
136}
137
138static void __exit crct10dif_intel_mod_fini(void)
139{
140 crypto_unregister_shash(&alg);
141}
142
143module_init(crct10dif_intel_mod_init);
144module_exit(crct10dif_intel_mod_fini);
145
146MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
147MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
148MODULE_LICENSE("GPL");
149
150MODULE_ALIAS("crct10dif");
151MODULE_ALIAS("crct10dif-pclmul");
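
The glue module above registers a "crct10dif" shash with cra_priority 200, so once loaded it is preferred over a lower-priority generic implementation of the same algorithm. A minimal sketch of how a caller would reach it through the crypto API (assuming a sleepable context with data/len provided by the caller; error handling trimmed):

	struct crypto_shash *tfm;
	struct shash_desc *desc;
	__u16 crc;

	tfm = crypto_alloc_shash("crct10dif", 0, 0);	/* highest-priority driver wins */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	desc->tfm = tfm;
	desc->flags = 0;

	crypto_shash_digest(desc, data, len, (u8 *)&crc);	/* init + update + final */

	kfree(desc);
	crypto_free_shash(tfm);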
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
index c09241659971..b4b38bacb404 100644
--- a/arch/x86/include/asm/dma-contiguous.h
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -4,7 +4,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <asm-generic/dma-contiguous.h>
 
 static inline void
 dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 3a16c1483b45..64507f35800c 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -3,18 +3,23 @@
 
 #ifdef __KERNEL__
 
+#include <linux/stringify.h>
 #include <linux/types.h>
 #include <asm/nops.h>
 #include <asm/asm.h>
 
 #define JUMP_LABEL_NOP_SIZE 5
 
-#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+#ifdef CONFIG_X86_64
+# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
+#else
+# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
+#endif
 
 static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("1:"
-		STATIC_KEY_INITIAL_NOP
+		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
 		".pushsection __jump_table, \"aw\" \n\t"
 		_ASM_ALIGN "\n\t"
 		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
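
The jump_label.h hunk above replaces the old "jmp +0" filler (opcode 0xe9 with a zero displacement) with a real 5-byte atomic NOP as the initial code of a disabled static key. A hedged sketch of the consumer side, using the static-key API of this era (do_expensive_tracing() is a hypothetical helper):

	#include <linux/jump_label.h>

	static struct static_key tracing_key = STATIC_KEY_INIT_FALSE;

	void maybe_trace(void)
	{
		/* compiles down to the 5-byte NOP emitted by arch_static_branch() */
		if (static_key_false(&tracing_key))
			do_expensive_tracing();		/* hypothetical */
	}

	void tracing_enable(void)
	{
		/* jump_label code rewrites the NOP into a jmp to the branch body */
		static_key_slow_inc(&tracing_key);
	}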
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
+	u64 bad_mt_xwr;
 
 	/*
 	 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
 	u64 global_ovf_ctrl;
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
+	u64 reserved_bits;
 	u8 version;
 	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
 	 * instruction.
 	 */
 	bool write_fault_to_shadow_pgtable;
+
+	/* set at EPT violation at this point */
+	unsigned long exit_qualification;
+
+	/* pv related host specific info */
+	struct {
+		bool pv_unhalted;
+	} pv;
 };
 
 struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
 extern u32 kvm_max_guest_tsc_khz;
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
-	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
 	EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d9e9e6c7ed32..7d7443283a9d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -100,29 +100,6 @@ static inline void early_quirks(void) { }
100extern void pci_iommu_alloc(void); 100extern void pci_iommu_alloc(void);
101 101
102#ifdef CONFIG_PCI_MSI 102#ifdef CONFIG_PCI_MSI
103/* MSI arch specific hooks */
104static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
105{
106 return x86_msi.setup_msi_irqs(dev, nvec, type);
107}
108
109static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
110{
111 x86_msi.teardown_msi_irqs(dev);
112}
113
114static inline void x86_teardown_msi_irq(unsigned int irq)
115{
116 x86_msi.teardown_msi_irq(irq);
117}
118static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
119{
120 x86_msi.restore_msi_irqs(dev, irq);
121}
122#define arch_setup_msi_irqs x86_setup_msi_irqs
123#define arch_teardown_msi_irqs x86_teardown_msi_irqs
124#define arch_teardown_msi_irq x86_teardown_msi_irq
125#define arch_restore_msi_irqs x86_restore_msi_irqs
126/* implemented in arch/x86/kernel/apic/io_apic. */ 103/* implemented in arch/x86/kernel/apic/io_apic. */
127struct msi_desc; 104struct msi_desc;
128int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 105int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
@@ -130,16 +107,9 @@ void native_teardown_msi_irq(unsigned int irq);
130void native_restore_msi_irqs(struct pci_dev *dev, int irq); 107void native_restore_msi_irqs(struct pci_dev *dev, int irq);
131int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, 108int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
132 unsigned int irq_base, unsigned int irq_offset); 109 unsigned int irq_base, unsigned int irq_offset);
133/* default to the implementation in drivers/lib/msi.c */
134#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
135#define HAVE_DEFAULT_MSI_RESTORE_IRQS
136void default_teardown_msi_irqs(struct pci_dev *dev);
137void default_restore_msi_irqs(struct pci_dev *dev, int irq);
138#else 110#else
139#define native_setup_msi_irqs NULL 111#define native_setup_msi_irqs NULL
140#define native_teardown_msi_irq NULL 112#define native_teardown_msi_irq NULL
141#define default_teardown_msi_irqs NULL
142#define default_restore_msi_irqs NULL
143#endif 113#endif
144 114
145#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 115#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 8d16befdec88..3d1999458709 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
316} 316}
317 317
318static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
319{
320 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
321}
322
323static inline int pte_swp_soft_dirty(pte_t pte)
324{
325 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
326}
327
328static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
329{
330 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
331}
332
333static inline pte_t pte_file_clear_soft_dirty(pte_t pte) 318static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
334{ 319{
335 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); 320 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
446 431
447#ifndef __ASSEMBLY__ 432#ifndef __ASSEMBLY__
448#include <linux/mm_types.h> 433#include <linux/mm_types.h>
434#include <linux/mmdebug.h>
449#include <linux/log2.h> 435#include <linux/log2.h>
450 436
451static inline int pte_none(pte_t pte) 437static inline int pte_none(pte_t pte)
@@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
864{ 850{
865} 851}
866 852
853static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
854{
855 VM_BUG_ON(pte_present(pte));
856 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
857}
858
859static inline int pte_swp_soft_dirty(pte_t pte)
860{
861 VM_BUG_ON(pte_present(pte));
862 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
863}
864
865static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
866{
867 VM_BUG_ON(pte_present(pte));
868 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
869}
870
867#include <asm-generic/pgtable.h> 871#include <asm-generic/pgtable.h>
868#endif /* __ASSEMBLY__ */ 872#endif /* __ASSEMBLY__ */
869 873
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f4843e031131..0ecac257fb26 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -75,6 +75,9 @@
  * with swap entry format. On x86 bits 6 and 7 are *not* involved
  * into swap entry computation, but bit 6 is used for nonlinear
  * file mapping, so we borrow bit 7 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as swap dirty page
+ * mark if and only if the PTE has present bit clear!
  */
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SWP_SOFT_DIRTY	_PAGE_PSE
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cf512003e663..e6d90babc245 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void)
62 62
63static inline void __flush_tlb_one(unsigned long addr) 63static inline void __flush_tlb_one(unsigned long addr)
64{ 64{
65 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
65 __flush_tlb_single(addr); 66 __flush_tlb_single(addr);
66} 67}
67 68
@@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr)
84 85
85#ifndef CONFIG_SMP 86#ifndef CONFIG_SMP
86 87
87#define flush_tlb() __flush_tlb() 88/* "_up" is for UniProcessor.
88#define flush_tlb_all() __flush_tlb_all() 89 *
89#define local_flush_tlb() __flush_tlb() 90 * This is a helper for other header functions. *Not* intended to be called
91 * directly. All global TLB flushes need to either call this, or to bump the
92 * vm statistics themselves.
93 */
94static inline void __flush_tlb_up(void)
95{
96 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
97 __flush_tlb();
98}
99
100static inline void flush_tlb_all(void)
101{
102 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
103 __flush_tlb_all();
104}
105
106static inline void flush_tlb(void)
107{
108 __flush_tlb_up();
109}
110
111static inline void local_flush_tlb(void)
112{
113 __flush_tlb_up();
114}
90 115
91static inline void flush_tlb_mm(struct mm_struct *mm) 116static inline void flush_tlb_mm(struct mm_struct *mm)
92{ 117{
93 if (mm == current->active_mm) 118 if (mm == current->active_mm)
94 __flush_tlb(); 119 __flush_tlb_up();
95} 120}
96 121
97static inline void flush_tlb_page(struct vm_area_struct *vma, 122static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
105 unsigned long start, unsigned long end) 130 unsigned long start, unsigned long end)
106{ 131{
107 if (vma->vm_mm == current->active_mm) 132 if (vma->vm_mm == current->active_mm)
108 __flush_tlb(); 133 __flush_tlb_up();
109} 134}
110 135
111static inline void flush_tlb_mm_range(struct mm_struct *mm, 136static inline void flush_tlb_mm_range(struct mm_struct *mm,
112 unsigned long start, unsigned long end, unsigned long vmflag) 137 unsigned long start, unsigned long end, unsigned long vmflag)
113{ 138{
114 if (mm == current->active_mm) 139 if (mm == current->active_mm)
115 __flush_tlb(); 140 __flush_tlb_up();
116} 141}
117 142
118static inline void native_flush_tlb_others(const struct cpumask *cpumask, 143static inline void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				(1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index ca842f2769ef..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -7,6 +7,7 @@ enum ipi_vector {
 	XEN_CALL_FUNCTION_SINGLE_VECTOR,
 	XEN_SPIN_UNLOCK_VECTOR,
 	XEN_IRQ_WORK_VECTOR,
+	XEN_NMI_VECTOR,
 
 	XEN_NR_IPIS,
 };
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7ea79c5fa1f2..492b29802f57 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = {
 
 #define AVX_XOR_SPEED \
 do { \
-	if (cpu_has_avx) \
+	if (cpu_has_avx && cpu_has_osxsave) \
 		xor_speed(&xor_block_avx); \
 } while (0)
 
 #define AVX_SELECT(FASTEST) \
-	(cpu_has_avx ? &xor_block_avx : FASTEST)
+	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
 
 #else
 
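
The xor_avx.h hunk above stops trusting the AVX CPUID bit alone: cpu_has_osxsave must also be set, because YMM state is only safe to touch after the OS enables it through XCR0. A hedged user-space illustration of the same check (not kernel code; assumes the GCC/Clang <cpuid.h> helper):

	#include <stdbool.h>
	#include <cpuid.h>

	static bool avx_usable(void)
	{
		unsigned int eax, ebx, ecx, edx, xcr0_lo, xcr0_hi;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return false;
		if (!(ecx & (1u << 28)) || !(ecx & (1u << 27)))
			return false;		/* no AVX, or OS left OSXSAVE off */

		/* XGETBV with ECX=0: XCR0 bits 1 and 2 = SSE and AVX state enabled */
		__asm__ volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
		return (xcr0_lo & 0x6) == 0x6;
	}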
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
@@ -106,12 +107,13 @@
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
+	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
 	{ EXIT_REASON_WBINVD,                "WBINVD" }, \
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
-	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
+	{ EXIT_REASON_INVPCID,               "INVPCID" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d4cdfa67509e..ce2d0a2c3e4f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Save MTRR state */
@@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
+	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 69eb2fa25494..376dc7873447 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-					    unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
 	initrd_start = (unsigned long)__va(start);
 	initrd_end = (unsigned long)__va(end);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 63bdb29b2549..b3cd3ebae077 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,6 +12,7 @@
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/pci_ids.h> 14#include <linux/pci_ids.h>
15#include <drm/i915_drm.h>
15#include <asm/pci-direct.h> 16#include <asm/pci-direct.h>
16#include <asm/dma.h> 17#include <asm/dma.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
@@ -216,6 +217,157 @@ static void __init intel_remapping_check(int num, int slot, int func)
216 217
217} 218}
218 219
220/*
221 * Systems with Intel graphics controllers set aside memory exclusively
222 * for gfx driver use. This memory is not marked in the E820 as reserved
223 * or as RAM, and so is subject to overlap from E820 manipulation later
224 * in the boot process. On some systems, MMIO space is allocated on top,
225 * despite the efforts of the "RAM buffer" approach, which simply rounds
226 * memory boundaries up to 64M to try to catch space that may decode
227 * as RAM and so is not suitable for MMIO.
228 *
229 * And yes, so far on current devices the base addr is always under 4G.
230 */
231static u32 __init intel_stolen_base(int num, int slot, int func)
232{
233 u32 base;
234
235 /*
236 * For the PCI IDs in this quirk, the stolen base is always
237 * in 0x5c, aka the BDSM register (yes that's really what
238 * it's called).
239 */
240 base = read_pci_config(num, slot, func, 0x5c);
241 base &= ~((1<<20) - 1);
242
243 return base;
244}
245
246#define KB(x) ((x) * 1024)
247#define MB(x) (KB (KB (x)))
248#define GB(x) (MB (KB (x)))
249
250static size_t __init gen3_stolen_size(int num, int slot, int func)
251{
252 size_t stolen_size;
253 u16 gmch_ctrl;
254
255 gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
256
257 switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
258 case I855_GMCH_GMS_STOLEN_1M:
259 stolen_size = MB(1);
260 break;
261 case I855_GMCH_GMS_STOLEN_4M:
262 stolen_size = MB(4);
263 break;
264 case I855_GMCH_GMS_STOLEN_8M:
265 stolen_size = MB(8);
266 break;
267 case I855_GMCH_GMS_STOLEN_16M:
268 stolen_size = MB(16);
269 break;
270 case I855_GMCH_GMS_STOLEN_32M:
271 stolen_size = MB(32);
272 break;
273 case I915_GMCH_GMS_STOLEN_48M:
274 stolen_size = MB(48);
275 break;
276 case I915_GMCH_GMS_STOLEN_64M:
277 stolen_size = MB(64);
278 break;
279 case G33_GMCH_GMS_STOLEN_128M:
280 stolen_size = MB(128);
281 break;
282 case G33_GMCH_GMS_STOLEN_256M:
283 stolen_size = MB(256);
284 break;
285 case INTEL_GMCH_GMS_STOLEN_96M:
286 stolen_size = MB(96);
287 break;
288 case INTEL_GMCH_GMS_STOLEN_160M:
289 stolen_size = MB(160);
290 break;
291 case INTEL_GMCH_GMS_STOLEN_224M:
292 stolen_size = MB(224);
293 break;
294 case INTEL_GMCH_GMS_STOLEN_352M:
295 stolen_size = MB(352);
296 break;
297 default:
298 stolen_size = 0;
299 break;
300 }
301
302 return stolen_size;
303}
304
305static size_t __init gen6_stolen_size(int num, int slot, int func)
306{
307 u16 gmch_ctrl;
308
309 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
310 gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
311 gmch_ctrl &= SNB_GMCH_GMS_MASK;
312
313 return gmch_ctrl << 25; /* 32 MB units */
314}
315
316typedef size_t (*stolen_size_fn)(int num, int slot, int func);
317
318static struct pci_device_id intel_stolen_ids[] __initdata = {
319 INTEL_I915G_IDS(gen3_stolen_size),
320 INTEL_I915GM_IDS(gen3_stolen_size),
321 INTEL_I945G_IDS(gen3_stolen_size),
322 INTEL_I945GM_IDS(gen3_stolen_size),
323 INTEL_VLV_M_IDS(gen3_stolen_size),
324 INTEL_VLV_D_IDS(gen3_stolen_size),
325 INTEL_PINEVIEW_IDS(gen3_stolen_size),
326 INTEL_I965G_IDS(gen3_stolen_size),
327 INTEL_G33_IDS(gen3_stolen_size),
328 INTEL_I965GM_IDS(gen3_stolen_size),
329 INTEL_GM45_IDS(gen3_stolen_size),
330 INTEL_G45_IDS(gen3_stolen_size),
331 INTEL_IRONLAKE_D_IDS(gen3_stolen_size),
332 INTEL_IRONLAKE_M_IDS(gen3_stolen_size),
333 INTEL_SNB_D_IDS(gen6_stolen_size),
334 INTEL_SNB_M_IDS(gen6_stolen_size),
335 INTEL_IVB_M_IDS(gen6_stolen_size),
336 INTEL_IVB_D_IDS(gen6_stolen_size),
337 INTEL_HSW_D_IDS(gen6_stolen_size),
338 INTEL_HSW_M_IDS(gen6_stolen_size),
339};
340
341static void __init intel_graphics_stolen(int num, int slot, int func)
342{
343 size_t size;
344 int i;
345 u32 start;
346 u16 device, subvendor, subdevice;
347
348 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
349 subvendor = read_pci_config_16(num, slot, func,
350 PCI_SUBSYSTEM_VENDOR_ID);
351 subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID);
352
353 for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
354 if (intel_stolen_ids[i].device == device) {
355 stolen_size_fn stolen_size =
356 (stolen_size_fn)intel_stolen_ids[i].driver_data;
357 size = stolen_size(num, slot, func);
358 start = intel_stolen_base(num, slot, func);
359 if (size && start) {
360 /* Mark this space as reserved */
361 e820_add_region(start, size, E820_RESERVED);
362 sanitize_e820_map(e820.map,
363 ARRAY_SIZE(e820.map),
364 &e820.nr_map);
365 }
366 return;
367 }
368 }
369}
370
219#define QFLAG_APPLY_ONCE 0x1 371#define QFLAG_APPLY_ONCE 0x1
220#define QFLAG_APPLIED 0x2 372#define QFLAG_APPLIED 0x2
221#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 373#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -251,6 +403,8 @@ static struct chipset early_qrk[] __initdata = {
251 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 403 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
252 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, 404 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
253 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 405 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
406 { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
407 QFLAG_APPLY_ONCE, intel_graphics_stolen },
254 {} 408 {}
255}; 409};
256 410
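The stolen-memory quirk above boils down to two decodes: the size field in the GMCH control word and the 1 MB-aligned base read from the BDSM register at config offset 0x5c. As a rough standalone sketch of that arithmetic (the shift/mask values and helper names here are assumptions mirroring the i915 definitions, not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: decode the SNB+ GMCH control word; the GMS field
 * (assumed to be a 5-bit field at shift 3) counts 32 MB units. */
static size_t example_gen6_stolen_size(uint16_t gmch_ctrl)
{
	return (size_t)((gmch_ctrl >> 3) & 0x1f) << 25;
}

/* Illustrative only: the BDSM register keeps the base in its upper
 * bits; everything below 1 MB is masked off, as in the quirk. */
static uint32_t example_stolen_base(uint32_t bdsm)
{
	return bdsm & ~((1u << 20) - 1);
}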
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a3a2dd..f0dcb0ceb6a2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1176,6 +1176,9 @@ ftrace_restore_flags:
1176#else /* ! CONFIG_DYNAMIC_FTRACE */ 1176#else /* ! CONFIG_DYNAMIC_FTRACE */
1177 1177
1178ENTRY(mcount) 1178ENTRY(mcount)
1179 cmpl $__PAGE_OFFSET, %esp
1180 jb ftrace_stub /* Paging not enabled yet? */
1181
1179 cmpl $0, function_trace_stop 1182 cmpl $0, function_trace_stop
1180 jne ftrace_stub 1183 jne ftrace_stub
1181 1184
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 460f5d9ceebb..ee11b7dfbfbb 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -24,18 +24,57 @@ union jump_code_union {
24 } __attribute__((packed)); 24 } __attribute__((packed));
25}; 25};
26 26
27static void bug_at(unsigned char *ip, int line)
28{
29 /*
30 * The location is not an op that we were expecting.
31 * Something went wrong. Crash the box, as something could be
32 * corrupting the kernel.
33 */
34 pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
35 ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
36 BUG();
37}
38
27static void __jump_label_transform(struct jump_entry *entry, 39static void __jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type, 40 enum jump_label_type type,
29 void *(*poker)(void *, const void *, size_t)) 41 void *(*poker)(void *, const void *, size_t),
42 int init)
30{ 43{
31 union jump_code_union code; 44 union jump_code_union code;
45 const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
32 46
33 if (type == JUMP_LABEL_ENABLE) { 47 if (type == JUMP_LABEL_ENABLE) {
48 /*
49 * We are enabling this jump label. If it is not a nop
50 * then something must have gone wrong.
51 */
52 if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0))
53 bug_at((void *)entry->code, __LINE__);
54
34 code.jump = 0xe9; 55 code.jump = 0xe9;
35 code.offset = entry->target - 56 code.offset = entry->target -
36 (entry->code + JUMP_LABEL_NOP_SIZE); 57 (entry->code + JUMP_LABEL_NOP_SIZE);
37 } else 58 } else {
59 /*
60 * We are disabling this jump label. If it is not what
61 * we think it is, then something must have gone wrong.
62 * If this is the first initialization call, then we
63 * are converting the default nop to the ideal nop.
64 */
65 if (init) {
66 const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
67 if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
68 bug_at((void *)entry->code, __LINE__);
69 } else {
70 code.jump = 0xe9;
71 code.offset = entry->target -
72 (entry->code + JUMP_LABEL_NOP_SIZE);
73 if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
74 bug_at((void *)entry->code, __LINE__);
75 }
38 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); 76 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
77 }
39 78
40 /* 79 /*
41 * Make text_poke_bp() a default fallback poker. 80 * Make text_poke_bp() a default fallback poker.
@@ -57,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry,
57{ 96{
58 get_online_cpus(); 97 get_online_cpus();
59 mutex_lock(&text_mutex); 98 mutex_lock(&text_mutex);
60 __jump_label_transform(entry, type, NULL); 99 __jump_label_transform(entry, type, NULL, 0);
61 mutex_unlock(&text_mutex); 100 mutex_unlock(&text_mutex);
62 put_online_cpus(); 101 put_online_cpus();
63} 102}
64 103
104static enum {
105 JL_STATE_START,
106 JL_STATE_NO_UPDATE,
107 JL_STATE_UPDATE,
108} jlstate __initdata_or_module = JL_STATE_START;
109
65__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, 110__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
66 enum jump_label_type type) 111 enum jump_label_type type)
67{ 112{
68 __jump_label_transform(entry, type, text_poke_early); 113 /*
114 * This function is called at boot up and when modules are
115 * first loaded. Check if the default nop, the one that is
116 * inserted at compile time, is the ideal nop. If it is, then
117 * we do not need to update the nop, and we can leave it as is.
118 * If it is not, then we need to update the nop to the ideal nop.
119 */
120 if (jlstate == JL_STATE_START) {
121 const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
122 const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
123
124 if (memcmp(ideal_nop, default_nop, 5) != 0)
125 jlstate = JL_STATE_UPDATE;
126 else
127 jlstate = JL_STATE_NO_UPDATE;
128 }
129 if (jlstate == JL_STATE_UPDATE)
130 __jump_label_transform(entry, type, text_poke_early, 1);
69} 131}
70 132
71#endif 133#endif
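Both the enable path and the verification path above rely on the x86 long-form jump being exactly five bytes: opcode 0xe9 followed by a 32-bit displacement measured from the end of the instruction, which is why the offset is computed as entry->target - (entry->code + JUMP_LABEL_NOP_SIZE). A minimal standalone sketch of that encoding (names are illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

#define JMP_REL32_SIZE 5	/* same length as the 5-byte NOP it replaces */

/* Illustrative only: build the "jmp rel32" that replaces the nop when a
 * jump label is enabled; the displacement is relative to the address of
 * the following instruction. */
static void example_make_jump(uint8_t insn[JMP_REL32_SIZE],
			      uintptr_t code, uintptr_t target)
{
	int32_t rel = (int32_t)(target - (code + JMP_REL32_SIZE));

	insn[0] = 0xe9;			/* JMP rel32 */
	memcpy(&insn[1], &rel, sizeof(rel));
}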
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 884aa4053313..1b10af835c31 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -62,11 +62,6 @@ void __init default_banner(void)
62 pv_info.name); 62 pv_info.name);
63} 63}
64 64
65/* Simple instruction patching code. */
66#define DEF_NATIVE(ops, name, code) \
67 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
68 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
69
70/* Undefined instruction for dealing with missing ops pointers. */ 65/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 66static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 67
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 129}
130 130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64 131#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/* 132/*
172 * Initialize the generic pvclock vsyscall state. This will allocate 133 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a 134 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
181 142
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 143 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183 144
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 145 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 146 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa(i) + (idx*PAGE_SIZE), 147 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 148 PAGE_KERNEL_VVAR);
190 } 149 }
191 150
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0; 151 return 0;
196} 152}
197#endif 153#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 5f24c71accaa..8ce0072cd700 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -107,6 +107,8 @@ struct x86_platform_ops x86_platform = {
107}; 107};
108 108
109EXPORT_SYMBOL_GPL(x86_platform); 109EXPORT_SYMBOL_GPL(x86_platform);
110
111#if defined(CONFIG_PCI_MSI)
110struct x86_msi_ops x86_msi = { 112struct x86_msi_ops x86_msi = {
111 .setup_msi_irqs = native_setup_msi_irqs, 113 .setup_msi_irqs = native_setup_msi_irqs,
112 .compose_msi_msg = native_compose_msi_msg, 114 .compose_msi_msg = native_compose_msi_msg,
@@ -116,6 +118,28 @@ struct x86_msi_ops x86_msi = {
116 .setup_hpet_msi = default_setup_hpet_msi, 118 .setup_hpet_msi = default_setup_hpet_msi,
117}; 119};
118 120
121/* MSI arch specific hooks */
122int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
123{
124 return x86_msi.setup_msi_irqs(dev, nvec, type);
125}
126
127void arch_teardown_msi_irqs(struct pci_dev *dev)
128{
129 x86_msi.teardown_msi_irqs(dev);
130}
131
132void arch_teardown_msi_irq(unsigned int irq)
133{
134 x86_msi.teardown_msi_irq(irq);
135}
136
137void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
138{
139 x86_msi.restore_msi_irqs(dev, irq);
140}
141#endif
142
119struct x86_io_apic_ops x86_io_apic_ops = { 143struct x86_io_apic_ops x86_io_apic_ops = {
120 .init = native_io_apic_init_mappings, 144 .init = native_io_apic_init_mappings,
121 .read = native_io_apic_read, 145 .read = native_io_apic_read,
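Routing the arch_*_msi_* entry points through the x86_msi ops table (and compiling them only under CONFIG_PCI_MSI) lets a paravirtualized platform interpose on MSI setup by swapping the function pointers at boot, without the PCI core having to know. A bare-bones sketch of that indirection pattern (types and names here are illustrative, not the kernel's):

#include <stdio.h>

struct example_msi_ops {
	int (*setup_msi_irqs)(int dev, int nvec, int type);
};

static int native_setup(int dev, int nvec, int type)
{
	printf("native MSI setup for dev %d\n", dev);
	return 0;
}

static struct example_msi_ops example_msi = { .setup_msi_irqs = native_setup };

/* The generic entry point only ever calls through the ops table... */
static int example_arch_setup_msi_irqs(int dev, int nvec, int type)
{
	return example_msi.setup_msi_irqs(dev, nvec, type);
}

/* ...so a platform can interpose by replacing the pointer at boot. */
static int pv_setup(int dev, int nvec, int type)
{
	printf("paravirt MSI setup for dev %d\n", dev);
	return 0;
}

int main(void)
{
	example_arch_setup_msi_irqs(1, 1, 0);
	example_msi.setup_msi_irqs = pv_setup;
	return example_arch_setup_msi_irqs(1, 1, 0);
}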
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 (1 << KVM_FEATURE_CLOCKSOURCE2) | 413 (1 << KVM_FEATURE_CLOCKSOURCE2) |
414 (1 << KVM_FEATURE_ASYNC_PF) | 414 (1 << KVM_FEATURE_ASYNC_PF) |
415 (1 << KVM_FEATURE_PV_EOI) | 415 (1 << KVM_FEATURE_PV_EOI) |
416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
417 (1 << KVM_FEATURE_PV_UNHALT);
417 418
418 if (sched_info_on()) 419 if (sched_info_on())
419 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 420 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
79 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
80} 80}
81 81
82static inline int apic_test_and_set_vector(int vec, void *bitmap)
83{
84 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline int apic_test_and_clear_vector(int vec, void *bitmap)
88{
89 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_test_vector(int vec, void *bitmap) 82static inline int apic_test_vector(int vec, void *bitmap)
93{ 83{
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 84 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
331} 321}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 322EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333 323
334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 324static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
335{ 325{
336 apic->irr_pending = true; 326 apic->irr_pending = true;
337 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 327 apic_set_vector(vec, apic->regs + APIC_IRR);
338} 328}
339 329
340static inline int apic_search_irr(struct kvm_lapic *apic) 330static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
681 if (unlikely(!apic_enabled(apic))) 671 if (unlikely(!apic_enabled(apic)))
682 break; 672 break;
683 673
674 result = 1;
675
684 if (dest_map) 676 if (dest_map)
685 __set_bit(vcpu->vcpu_id, dest_map); 677 __set_bit(vcpu->vcpu_id, dest_map);
686 678
687 if (kvm_x86_ops->deliver_posted_interrupt) { 679 if (kvm_x86_ops->deliver_posted_interrupt)
688 result = 1;
689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 680 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
690 } else { 681 else {
691 result = !apic_test_and_set_irr(vector, apic); 682 apic_set_irr(vector, apic);
692
693 if (!result) {
694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699 683
700 kvm_make_request(KVM_REQ_EVENT, vcpu); 684 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu); 685 kvm_vcpu_kick(vcpu);
702 } 686 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 687 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result); 688 trig_mode, vector, false);
706 break; 689 break;
707 690
708 case APIC_DM_REMRD: 691 case APIC_DM_REMRD:
709 apic_debug("Ignoring delivery mode 3\n"); 692 result = 1;
693 vcpu->arch.pv.pv_unhalted = 1;
694 kvm_make_request(KVM_REQ_EVENT, vcpu);
695 kvm_vcpu_kick(vcpu);
710 break; 696 break;
711 697
712 case APIC_DM_SMI: 698 case APIC_DM_SMI:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..6e2d2c8f230b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133 * PT32_LEVEL_BITS))) - 1)) 133 * PT32_LEVEL_BITS))) - 1))
134 134
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
136 | PT64_NX_MASK) 136 | shadow_x_mask | shadow_nx_mask)
137 137
138#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
139#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
331 return pte & PT_PAGE_SIZE_MASK; 331 return pte & PT_PAGE_SIZE_MASK;
332} 332}
333 333
334static int is_dirty_gpte(unsigned long pte)
335{
336 return pte & PT_DIRTY_MASK;
337}
338
339static int is_rmap_spte(u64 pte) 334static int is_rmap_spte(u64 pte)
340{ 335{
341 return is_shadow_present_pte(pte); 336 return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2052 return __shadow_walk_next(iterator, *iterator->sptep); 2047 return __shadow_walk_next(iterator, *iterator->sptep);
2053} 2048}
2054 2049
2055static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 2050static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
2056{ 2051{
2057 u64 spte; 2052 u64 spte;
2058 2053
2054 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
2055 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2056
2059 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2057 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2060 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2058 shadow_user_mask | shadow_x_mask;
2059
2060 if (accessed)
2061 spte |= shadow_accessed_mask;
2061 2062
2062 mmu_spte_set(sptep, spte); 2063 mmu_spte_set(sptep, spte);
2063} 2064}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574 mmu_free_roots(vcpu); 2575 mmu_free_roots(vcpu);
2575} 2576}
2576 2577
2577static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2578{
2579 int bit7;
2580
2581 bit7 = (gpte >> 7) & 1;
2582 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2583}
2584
2585static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2586 bool no_dirty_log) 2579 bool no_dirty_log)
2587{ 2580{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2594 return gfn_to_pfn_memslot_atomic(slot, gfn); 2587 return gfn_to_pfn_memslot_atomic(slot, gfn);
2595} 2588}
2596 2589
2597static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2598 struct kvm_mmu_page *sp, u64 *spte,
2599 u64 gpte)
2600{
2601 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2602 goto no_present;
2603
2604 if (!is_present_gpte(gpte))
2605 goto no_present;
2606
2607 if (!(gpte & PT_ACCESSED_MASK))
2608 goto no_present;
2609
2610 return false;
2611
2612no_present:
2613 drop_spte(vcpu->kvm, spte);
2614 return true;
2615}
2616
2617static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2590static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2618 struct kvm_mmu_page *sp, 2591 struct kvm_mmu_page *sp,
2619 u64 *start, u64 *end) 2592 u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2710 iterator.level - 1, 2683 iterator.level - 1,
2711 1, ACC_ALL, iterator.sptep); 2684 1, ACC_ALL, iterator.sptep);
2712 2685
2713 link_shadow_page(iterator.sptep, sp); 2686 link_shadow_page(iterator.sptep, sp, true);
2714 } 2687 }
2715 } 2688 }
2716 return emulate; 2689 return emulate;
@@ -2808,7 +2781,7 @@ exit:
2808 return ret; 2781 return ret;
2809} 2782}
2810 2783
2811static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2784static bool page_fault_can_be_fast(u32 error_code)
2812{ 2785{
2813 /* 2786 /*
2814 * Do not fix the mmio spte with invalid generation number which 2787 * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2861 bool ret = false; 2834 bool ret = false;
2862 u64 spte = 0ull; 2835 u64 spte = 0ull;
2863 2836
2864 if (!page_fault_can_be_fast(vcpu, error_code)) 2837 if (!page_fault_can_be_fast(error_code))
2865 return false; 2838 return false;
2866 2839
2867 walk_shadow_page_lockless_begin(vcpu); 2840 walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3209 mmu_sync_roots(vcpu); 3182 mmu_sync_roots(vcpu);
3210 spin_unlock(&vcpu->kvm->mmu_lock); 3183 spin_unlock(&vcpu->kvm->mmu_lock);
3211} 3184}
3185EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3212 3186
3213static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3187static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3214 u32 access, struct x86_exception *exception) 3188 u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3478 ++vcpu->stat.tlb_flush; 3452 ++vcpu->stat.tlb_flush;
3479 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3453 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3480} 3454}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3481 3456
3482static void paging_new_cr3(struct kvm_vcpu *vcpu) 3457static void paging_new_cr3(struct kvm_vcpu *vcpu)
3483{ 3458{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3501 nonpaging_free(vcpu); 3476 nonpaging_free(vcpu);
3502} 3477}
3503 3478
3504static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3505{
3506 unsigned mask;
3507
3508 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3509
3510 mask = (unsigned)~ACC_WRITE_MASK;
3511 /* Allow write access to dirty gptes */
3512 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3513 *access &= mask;
3514}
3515
3516static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3517 unsigned access, int *nr_present) 3480 unsigned access, int *nr_present)
3518{ 3481{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3530 return false; 3493 return false;
3531} 3494}
3532 3495
3533static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3534{
3535 unsigned access;
3536
3537 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3538 access &= ~(gpte >> PT64_NX_SHIFT);
3539
3540 return access;
3541}
3542
3543static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3496static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3544{ 3497{
3545 unsigned index; 3498 unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
3549 return mmu->last_pte_bitmap & (1 << index); 3502 return mmu->last_pte_bitmap & (1 << index);
3550} 3503}
3551 3504
3505#define PTTYPE_EPT 18 /* arbitrary */
3506#define PTTYPE PTTYPE_EPT
3507#include "paging_tmpl.h"
3508#undef PTTYPE
3509
3552#define PTTYPE 64 3510#define PTTYPE 64
3553#include "paging_tmpl.h" 3511#include "paging_tmpl.h"
3554#undef PTTYPE 3512#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3563 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3521 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3564 u64 exb_bit_rsvd = 0; 3522 u64 exb_bit_rsvd = 0;
3565 3523
3524 context->bad_mt_xwr = 0;
3525
3566 if (!context->nx) 3526 if (!context->nx)
3567 exb_bit_rsvd = rsvd_bits(63, 63); 3527 exb_bit_rsvd = rsvd_bits(63, 63);
3568 switch (context->root_level) { 3528 switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3618 } 3578 }
3619} 3579}
3620 3580
3621static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3581static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
3582 struct kvm_mmu *context, bool execonly)
3583{
3584 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3585 int pte;
3586
3587 context->rsvd_bits_mask[0][3] =
3588 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
3589 context->rsvd_bits_mask[0][2] =
3590 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3591 context->rsvd_bits_mask[0][1] =
3592 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3593 context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
3594
3595 /* large page */
3596 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3597 context->rsvd_bits_mask[1][2] =
3598 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
3599 context->rsvd_bits_mask[1][1] =
3600 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3601 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3602
3603 for (pte = 0; pte < 64; pte++) {
3604 int rwx_bits = pte & 7;
3605 int mt = pte >> 3;
3606 if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
3607 rwx_bits == 0x2 || rwx_bits == 0x6 ||
3608 (rwx_bits == 0x4 && !execonly))
3609 context->bad_mt_xwr |= (1ull << pte);
3610 }
3611}
3612
3613static void update_permission_bitmask(struct kvm_vcpu *vcpu,
3614 struct kvm_mmu *mmu, bool ept)
3622{ 3615{
3623 unsigned bit, byte, pfec; 3616 unsigned bit, byte, pfec;
3624 u8 map; 3617 u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
3636 w = bit & ACC_WRITE_MASK; 3629 w = bit & ACC_WRITE_MASK;
3637 u = bit & ACC_USER_MASK; 3630 u = bit & ACC_USER_MASK;
3638 3631
3639 /* Not really needed: !nx will cause pte.nx to fault */ 3632 if (!ept) {
3640 x |= !mmu->nx; 3633 /* Not really needed: !nx will cause pte.nx to fault */
3641 /* Allow supervisor writes if !cr0.wp */ 3634 x |= !mmu->nx;
3642 w |= !is_write_protection(vcpu) && !uf; 3635 /* Allow supervisor writes if !cr0.wp */
3643 /* Disallow supervisor fetches of user code if cr4.smep */ 3636 w |= !is_write_protection(vcpu) && !uf;
3644 x &= !(smep && u && !uf); 3637 /* Disallow supervisor fetches of user code if cr4.smep */
3638 x &= !(smep && u && !uf);
3639 } else
3640 /* Not really needed: no U/S accesses on ept */
3641 u = 1;
3645 3642
3646 fault = (ff && !x) || (uf && !u) || (wf && !w); 3643 fault = (ff && !x) || (uf && !u) || (wf && !w);
3647 map |= fault << bit; 3644 map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3676 context->root_level = level; 3673 context->root_level = level;
3677 3674
3678 reset_rsvds_bits_mask(vcpu, context); 3675 reset_rsvds_bits_mask(vcpu, context);
3679 update_permission_bitmask(vcpu, context); 3676 update_permission_bitmask(vcpu, context, false);
3680 update_last_pte_bitmap(vcpu, context); 3677 update_last_pte_bitmap(vcpu, context);
3681 3678
3682 ASSERT(is_pae(vcpu)); 3679 ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 context->root_level = PT32_ROOT_LEVEL; 3703 context->root_level = PT32_ROOT_LEVEL;
3707 3704
3708 reset_rsvds_bits_mask(vcpu, context); 3705 reset_rsvds_bits_mask(vcpu, context);
3709 update_permission_bitmask(vcpu, context); 3706 update_permission_bitmask(vcpu, context, false);
3710 update_last_pte_bitmap(vcpu, context); 3707 update_last_pte_bitmap(vcpu, context);
3711 3708
3712 context->new_cr3 = paging_new_cr3; 3709 context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3768 context->gva_to_gpa = paging32_gva_to_gpa; 3765 context->gva_to_gpa = paging32_gva_to_gpa;
3769 } 3766 }
3770 3767
3771 update_permission_bitmask(vcpu, context); 3768 update_permission_bitmask(vcpu, context, false);
3772 update_last_pte_bitmap(vcpu, context); 3769 update_last_pte_bitmap(vcpu, context);
3773 3770
3774 return 0; 3771 return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3800} 3797}
3801EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3802 3799
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly)
3802{
3803 ASSERT(vcpu);
3804 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3805
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807
3808 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false;
3819
3820 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826
3803static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3804{ 3828{
3805 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3847 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3871 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3848 } 3872 }
3849 3873
3850 update_permission_bitmask(vcpu, g_context); 3874 update_permission_bitmask(vcpu, g_context, false);
3851 update_last_pte_bitmap(vcpu, g_context); 3875 update_last_pte_bitmap(vcpu, g_context);
3852 3876
3853 return 0; 3877 return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
3923 return true; 3947 return true;
3924 if ((old ^ new) & PT64_BASE_ADDR_MASK) 3948 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3925 return true; 3949 return true;
3926 old ^= PT64_NX_MASK; 3950 old ^= shadow_nx_mask;
3927 new ^= PT64_NX_MASK; 3951 new ^= shadow_nx_mask;
3928 return (old & ~new & PT64_PERM_MASK) != 0; 3952 return (old & ~new & PT64_PERM_MASK) != 0;
3929} 3953}
3930 3954
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4182 switch (er) { 4206 switch (er) {
4183 case EMULATE_DONE: 4207 case EMULATE_DONE:
4184 return 1; 4208 return 1;
4185 case EMULATE_DO_MMIO: 4209 case EMULATE_USER_EXIT:
4186 ++vcpu->stat.mmio_exits; 4210 ++vcpu->stat.mmio_exits;
4187 /* fall through */ 4211 /* fall through */
4188 case EMULATE_FAIL: 4212 case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4390 /* 4414 /*
4391 * The very rare case: if the generation-number is round, 4415 * The very rare case: if the generation-number is round,
4392 * zap all shadow pages. 4416 * zap all shadow pages.
4393 *
4394 * The max value is MMIO_MAX_GEN - 1 since it is not called
4395 * when mark memslot invalid.
4396 */ 4417 */
4397 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { 4418 if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
4398 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); 4419 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4399 kvm_mmu_invalidate_zap_all_pages(kvm); 4420 kvm_mmu_invalidate_zap_all_pages(kvm);
4400 } 4421 }
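The bad_mt_xwr bitmap introduced in reset_rsvds_bits_mask_ept() precomputes, for every combination of an EPT PTE's low six bits (bits 2:0 = X/W/R permissions, bits 5:3 = memory type), whether that combination is a misconfiguration: memory types 2, 3 and 7, write-only (010b) and write-plus-execute-without-read (110b) entries always are, and execute-only (100b) entries are only legal when execonly is set. Later checks then reduce to a single bit test, roughly like this sketch (illustrative, not the patch's code):

#include <stdint.h>
#include <stdbool.h>

/* Illustrative only: consult the precomputed misconfiguration bitmap,
 * indexed by the PTE's low six bits. */
static bool example_ept_misconfig(uint64_t bad_mt_xwr, uint64_t pte)
{
	return (bad_mt_xwr >> (pte & 0x3f)) & 1;
}

For instance, with execonly false every PTE whose permission bits are 100b has its bit set, so a guest execute-only mapping is reported as a misconfiguration rather than silently accepted.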
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly);
74 76
75static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
76{ 78{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
23 * so the code in this file is compiled twice, once per pte size. 23 * so the code in this file is compiled twice, once per pte size.
24 */ 24 */
25 25
26/*
27 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
28 * uses for EPT without A/D paging type.
29 */
30extern u64 __pure __using_nonexistent_pte_bit(void)
31 __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
32
26#if PTTYPE == 64 33#if PTTYPE == 64
27 #define pt_element_t u64 34 #define pt_element_t u64
28 #define guest_walker guest_walker64 35 #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 39 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 40 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 41 #define PT_LEVEL_BITS PT64_LEVEL_BITS
42 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
43 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
44 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
45 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
35 #ifdef CONFIG_X86_64 46 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 47 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg 48 #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 60 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
50 #define PT_LEVEL_BITS PT32_LEVEL_BITS 61 #define PT_LEVEL_BITS PT32_LEVEL_BITS
51 #define PT_MAX_FULL_LEVELS 2 62 #define PT_MAX_FULL_LEVELS 2
63 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
64 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
65 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
66 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
52 #define CMPXCHG cmpxchg 67 #define CMPXCHG cmpxchg
68#elif PTTYPE == PTTYPE_EPT
69 #define pt_element_t u64
70 #define guest_walker guest_walkerEPT
71 #define FNAME(name) ept_##name
72 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
73 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
74 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
75 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
76 #define PT_LEVEL_BITS PT64_LEVEL_BITS
77 #define PT_GUEST_ACCESSED_MASK 0
78 #define PT_GUEST_DIRTY_MASK 0
79 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
80 #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
81 #define CMPXCHG cmpxchg64
82 #define PT_MAX_FULL_LEVELS 4
53#else 83#else
54 #error Invalid PTTYPE value 84 #error Invalid PTTYPE value
55#endif 85#endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
80 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 110 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
81} 111}
82 112
113static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
114{
115 unsigned mask;
116
117 /* dirty bit is not supported, so no need to track it */
118 if (!PT_GUEST_DIRTY_MASK)
119 return;
120
121 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
122
123 mask = (unsigned)~ACC_WRITE_MASK;
124 /* Allow write access to dirty gptes */
125 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
126 PT_WRITABLE_MASK;
127 *access &= mask;
128}
129
130static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
131{
132 int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
133
134 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
135 ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
136}
137
138static inline int FNAME(is_present_gpte)(unsigned long pte)
139{
140#if PTTYPE != PTTYPE_EPT
141 return is_present_gpte(pte);
142#else
143 return pte & 7;
144#endif
145}
146
83static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
84 pt_element_t __user *ptep_user, unsigned index, 148 pt_element_t __user *ptep_user, unsigned index,
85 pt_element_t orig_pte, pt_element_t new_pte) 149 pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
103 return (ret != orig_pte); 167 return (ret != orig_pte);
104} 168}
105 169
170static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
171 struct kvm_mmu_page *sp, u64 *spte,
172 u64 gpte)
173{
174 if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
175 goto no_present;
176
177 if (!FNAME(is_present_gpte)(gpte))
178 goto no_present;
179
180 /* if accessed bit is not supported prefetch non accessed gpte */
181 if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
182 goto no_present;
183
184 return false;
185
186no_present:
187 drop_spte(vcpu->kvm, spte);
188 return true;
189}
190
191static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
192{
193 unsigned access;
194#if PTTYPE == PTTYPE_EPT
195 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
196 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
197 ACC_USER_MASK;
198#else
199 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
200 access &= ~(gpte >> PT64_NX_SHIFT);
201#endif
202
203 return access;
204}
205
106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 206static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
107 struct kvm_mmu *mmu, 207 struct kvm_mmu *mmu,
108 struct guest_walker *walker, 208 struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
114 gfn_t table_gfn; 214 gfn_t table_gfn;
115 int ret; 215 int ret;
116 216
217 /* dirty/accessed bits are not supported, so no need to update them */
218 if (!PT_GUEST_DIRTY_MASK)
219 return 0;
220
117 for (level = walker->max_level; level >= walker->level; --level) { 221 for (level = walker->max_level; level >= walker->level; --level) {
118 pte = orig_pte = walker->ptes[level - 1]; 222 pte = orig_pte = walker->ptes[level - 1];
119 table_gfn = walker->table_gfn[level - 1]; 223 table_gfn = walker->table_gfn[level - 1];
120 ptep_user = walker->ptep_user[level - 1]; 224 ptep_user = walker->ptep_user[level - 1];
121 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 225 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
122 if (!(pte & PT_ACCESSED_MASK)) { 226 if (!(pte & PT_GUEST_ACCESSED_MASK)) {
123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 227 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
124 pte |= PT_ACCESSED_MASK; 228 pte |= PT_GUEST_ACCESSED_MASK;
125 } 229 }
126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { 230 if (level == walker->level && write_fault &&
231 !(pte & PT_GUEST_DIRTY_MASK)) {
127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
128 pte |= PT_DIRTY_MASK; 233 pte |= PT_GUEST_DIRTY_MASK;
129 } 234 }
130 if (pte == orig_pte) 235 if (pte == orig_pte)
131 continue; 236 continue;
@@ -170,7 +275,7 @@ retry_walk:
170 if (walker->level == PT32E_ROOT_LEVEL) { 275 if (walker->level == PT32E_ROOT_LEVEL) {
171 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 276 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
172 trace_kvm_mmu_paging_element(pte, walker->level); 277 trace_kvm_mmu_paging_element(pte, walker->level);
173 if (!is_present_gpte(pte)) 278 if (!FNAME(is_present_gpte)(pte))
174 goto error; 279 goto error;
175 --walker->level; 280 --walker->level;
176 } 281 }
@@ -179,7 +284,7 @@ retry_walk:
179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 284 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 285 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
181 286
182 accessed_dirty = PT_ACCESSED_MASK; 287 accessed_dirty = PT_GUEST_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL; 288 pt_access = pte_access = ACC_ALL;
184 ++walker->level; 289 ++walker->level;
185 290
@@ -215,17 +320,17 @@ retry_walk:
215 320
216 trace_kvm_mmu_paging_element(pte, walker->level); 321 trace_kvm_mmu_paging_element(pte, walker->level);
217 322
218 if (unlikely(!is_present_gpte(pte))) 323 if (unlikely(!FNAME(is_present_gpte)(pte)))
219 goto error; 324 goto error;
220 325
221 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 326 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
222 walker->level))) { 327 walker->level))) {
223 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 328 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
224 goto error; 329 goto error;
225 } 330 }
226 331
227 accessed_dirty &= pte; 332 accessed_dirty &= pte;
228 pte_access = pt_access & gpte_access(vcpu, pte); 333 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
229 334
230 walker->ptes[walker->level - 1] = pte; 335 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte)); 336 } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
248 walker->gfn = real_gpa >> PAGE_SHIFT; 353 walker->gfn = real_gpa >> PAGE_SHIFT;
249 354
250 if (!write_fault) 355 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 356 FNAME(protect_clean_gpte)(&pte_access, pte);
252 else 357 else
253 /* 358 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by 359 * On a write fault, fold the dirty bit into accessed_dirty.
255 * shifting it one place right. 360 * For modes without A/D bits support accessed_dirty will be
361 * always clear.
256 */ 362 */
257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); 363 accessed_dirty &= pte >>
364 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
258 365
259 if (unlikely(!accessed_dirty)) { 366 if (unlikely(!accessed_dirty)) {
260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 367 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
279 walker->fault.vector = PF_VECTOR; 386 walker->fault.vector = PF_VECTOR;
280 walker->fault.error_code_valid = true; 387 walker->fault.error_code_valid = true;
281 walker->fault.error_code = errcode; 388 walker->fault.error_code = errcode;
389
390#if PTTYPE == PTTYPE_EPT
391 /*
 392 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
 393 * misconfiguration needs to be injected. The detection is
394 * done by is_rsvd_bits_set() above.
395 *
396 * We set up the value of exit_qualification to inject:
397 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
398 * [5:3] - Calculated by the page walk of the guest EPT page tables
399 * [7:8] - Derived from [7:8] of real exit_qualification
400 *
401 * The other bits are set to 0.
402 */
403 if (!(errcode & PFERR_RSVD_MASK)) {
404 vcpu->arch.exit_qualification &= 0x187;
405 vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
406 }
407#endif
282 walker->fault.address = addr; 408 walker->fault.address = addr;
283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 409 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
284 410
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
293 access); 419 access);
294} 420}
295 421
422#if PTTYPE != PTTYPE_EPT
296static int FNAME(walk_addr_nested)(struct guest_walker *walker, 423static int FNAME(walk_addr_nested)(struct guest_walker *walker,
297 struct kvm_vcpu *vcpu, gva_t addr, 424 struct kvm_vcpu *vcpu, gva_t addr,
298 u32 access) 425 u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
300 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, 427 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
301 addr, access); 428 addr, access);
302} 429}
430#endif
303 431
304static bool 432static bool
305FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 433FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
309 gfn_t gfn; 437 gfn_t gfn;
310 pfn_t pfn; 438 pfn_t pfn;
311 439
312 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
313 return false; 441 return false;
314 442
315 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 443 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
316 444
317 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
318 pte_access = sp->role.access & gpte_access(vcpu, gpte); 446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
319 protect_clean_gpte(&pte_access, gpte); 447 FNAME(protect_clean_gpte)(&pte_access, gpte);
320 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 448 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
321 no_dirty_log && (pte_access & ACC_WRITE_MASK)); 449 no_dirty_log && (pte_access & ACC_WRITE_MASK));
322 if (is_error_pfn(pfn)) 450 if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
446 goto out_gpte_changed; 574 goto out_gpte_changed;
447 575
448 if (sp) 576 if (sp)
449 link_shadow_page(it.sptep, sp); 577 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
450 } 578 }
451 579
452 for (; 580 for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
466 594
467 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 595 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
468 true, direct_access, it.sptep); 596 true, direct_access, it.sptep);
469 link_shadow_page(it.sptep, sp); 597 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
470 } 598 }
471 599
472 clear_sp_write_flooding_count(it.sptep); 600 clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
727 return gpa; 855 return gpa;
728} 856}
729 857
858#if PTTYPE != PTTYPE_EPT
730static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 859static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
731 u32 access, 860 u32 access,
732 struct x86_exception *exception) 861 struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
745 874
746 return gpa; 875 return gpa;
747} 876}
877#endif
748 878
749/* 879/*
750 * Using the cached information from sp->gfns is safe because: 880 * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
785 sizeof(pt_element_t))) 915 sizeof(pt_element_t)))
786 return -EINVAL; 916 return -EINVAL;
787 917
788 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 918 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
789 vcpu->kvm->tlbs_dirty++; 919 vcpu->kvm->tlbs_dirty++;
790 continue; 920 continue;
791 } 921 }
792 922
793 gfn = gpte_to_gfn(gpte); 923 gfn = gpte_to_gfn(gpte);
794 pte_access = sp->role.access; 924 pte_access = sp->role.access;
795 pte_access &= gpte_access(vcpu, gpte); 925 pte_access &= FNAME(gpte_access)(vcpu, gpte);
796 protect_clean_gpte(&pte_access, gpte); 926 FNAME(protect_clean_gpte)(&pte_access, gpte);
797 927
798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 928 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present)) 929 &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
830#undef gpte_to_gfn 960#undef gpte_to_gfn
831#undef gpte_to_gfn_lvl 961#undef gpte_to_gfn_lvl
832#undef CMPXCHG 962#undef CMPXCHG
963#undef PT_GUEST_ACCESSED_MASK
964#undef PT_GUEST_DIRTY_MASK
965#undef PT_GUEST_DIRTY_SHIFT
966#undef PT_GUEST_ACCESSED_SHIFT
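One detail worth calling out from the EPT variant above: an EPT entry has no dedicated present bit; it is present when any of its read, write or execute bits (bits 0-2) is set, which is why ept_is_present_gpte() reduces to pte & 7, and the access computation maps the EPT W and X bits onto the ACC_* masks with user access always implied. A small sketch of that mapping (the mask values are assumptions mirroring the kvm/mmu.c definitions, not part of the patch):

#include <stdint.h>

#define EPT_R	(1ull << 0)
#define EPT_W	(1ull << 1)
#define EPT_X	(1ull << 2)

/* Illustrative only: EPT flavour of gpte_access(); ACC_EXEC/WRITE/USER
 * are assumed to be bits 0, 1 and 2 as in kvm/mmu.c. */
static unsigned int example_ept_gpte_access(uint64_t gpte)
{
	unsigned int access = 1u << 2;		/* user access always implied */

	if (gpte & EPT_W)
		access |= 1u << 1;		/* ACC_WRITE_MASK analogue */
	if (gpte & EPT_X)
		access |= 1u << 0;		/* ACC_EXEC_MASK analogue */

	return access;
}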
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
160 160
161static void reprogram_counter(struct kvm_pmc *pmc, u32 type, 161static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
162 unsigned config, bool exclude_user, bool exclude_kernel, 162 unsigned config, bool exclude_user, bool exclude_kernel,
163 bool intr) 163 bool intr, bool in_tx, bool in_tx_cp)
164{ 164{
165 struct perf_event *event; 165 struct perf_event *event;
166 struct perf_event_attr attr = { 166 struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
173 .exclude_kernel = exclude_kernel, 173 .exclude_kernel = exclude_kernel,
174 .config = config, 174 .config = config,
175 }; 175 };
176 if (in_tx)
177 attr.config |= HSW_IN_TX;
178 if (in_tx_cp)
179 attr.config |= HSW_IN_TX_CHECKPOINTED;
176 180
177 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); 181 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178 182
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
226 230
227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 231 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228 ARCH_PERFMON_EVENTSEL_INV | 232 ARCH_PERFMON_EVENTSEL_INV |
229 ARCH_PERFMON_EVENTSEL_CMASK))) { 233 ARCH_PERFMON_EVENTSEL_CMASK |
234 HSW_IN_TX |
235 HSW_IN_TX_CHECKPOINTED))) {
230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 236 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231 unit_mask); 237 unit_mask);
232 if (config != PERF_COUNT_HW_MAX) 238 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
239 reprogram_counter(pmc, type, config, 245 reprogram_counter(pmc, type, config,
240 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 246 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 247 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242 eventsel & ARCH_PERFMON_EVENTSEL_INT); 248 eventsel & ARCH_PERFMON_EVENTSEL_INT,
249 (eventsel & HSW_IN_TX),
250 (eventsel & HSW_IN_TX_CHECKPOINTED));
243} 251}
244 252
245static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 253static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
256 arch_events[fixed_pmc_events[idx]].event_type, 264 arch_events[fixed_pmc_events[idx]].event_type,
257 !(en & 0x2), /* exclude user */ 265 !(en & 0x2), /* exclude user */
258 !(en & 0x1), /* exclude kernel */ 266 !(en & 0x1), /* exclude kernel */
259 pmi); 267 pmi, false, false);
260} 268}
261 269
262static inline u8 fixed_en_pmi(u64 ctrl, int idx) 270static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 416 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
409 if (data == pmc->eventsel) 417 if (data == pmc->eventsel)
410 return 0; 418 return 0;
411 if (!(data & 0xffffffff00200000ull)) { 419 if (!(data & pmu->reserved_bits)) {
412 reprogram_gp_counter(pmc, data); 420 reprogram_gp_counter(pmc, data);
413 return 0; 421 return 0;
414 } 422 }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
450 pmu->counter_bitmask[KVM_PMC_GP] = 0; 458 pmu->counter_bitmask[KVM_PMC_GP] = 0;
451 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 459 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
452 pmu->version = 0; 460 pmu->version = 0;
461 pmu->reserved_bits = 0xffffffff00200000ull;
453 462
454 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 463 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
455 if (!entry) 464 if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
478 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 487 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
479 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 488 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
480 pmu->global_ctrl_mask = ~pmu->global_ctrl; 489 pmu->global_ctrl_mask = ~pmu->global_ctrl;
490
491 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
492 if (entry &&
493 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
494 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
495 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
481} 496}
482 497
483void kvm_pmu_init(struct kvm_vcpu *vcpu) 498void kvm_pmu_init(struct kvm_vcpu *vcpu)
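The PMU change above keeps a per-guest reserved_bits mask for the event-select MSRs: HSW_IN_TX (bit 32) and HSW_IN_TX_CHECKPOINTED (bit 33) start out reserved, and exposing HLE/RTM through CPUID clears them from the mask, so a guest write is accepted only when no remaining reserved bit is set. A compact sketch of that check (the bit positions are assumed from perf_event.h; the helper is illustrative, not the patch's code):

#include <stdint.h>
#include <stdbool.h>

#define EXAMPLE_HSW_IN_TX		(1ull << 32)
#define EXAMPLE_HSW_IN_TX_CHECKPOINTED	(1ull << 33)

/* Illustrative only: decide whether a guest eventsel write may be
 * programmed, mirroring the reserved_bits logic added above. */
static bool example_eventsel_allowed(uint64_t data, bool guest_has_tsx)
{
	uint64_t reserved = 0xffffffff00200000ull;	/* default mask */

	if (guest_has_tsx)
		reserved ^= EXAMPLE_HSW_IN_TX | EXAMPLE_HSW_IN_TX_CHECKPOINTED;

	return (data & reserved) == 0;
}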
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..1f1da43ff2a2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
373 * we must keep them pinned while L2 runs. 373 * we must keep them pinned while L2 runs.
374 */ 374 */
375 struct page *apic_access_page; 375 struct page *apic_access_page;
376 u64 msr_ia32_feature_control;
376}; 377};
377 378
378#define POSTED_INTR_ON 0 379#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
711 kvm_release_page_clean(page); 712 kvm_release_page_clean(page);
712} 713}
713 714
715static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
714static u64 construct_eptp(unsigned long root_hpa); 716static u64 construct_eptp(unsigned long root_hpa);
715static void kvm_cpu_vmxon(u64 addr); 717static void kvm_cpu_vmxon(u64 addr);
716static void kvm_cpu_vmxoff(void); 718static void kvm_cpu_vmxoff(void);
717static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
718static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 719static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
719static void vmx_set_segment(struct kvm_vcpu *vcpu, 720static void vmx_set_segment(struct kvm_vcpu *vcpu,
720 struct kvm_segment *var, int seg); 721 struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1039 (vmcs12->secondary_vm_exec_control & bit); 1040 (vmcs12->secondary_vm_exec_control & bit);
1040} 1041}
1041 1042
1042static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, 1043static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1043 struct kvm_vcpu *vcpu)
1044{ 1044{
1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1046} 1046}
1047 1047
1048static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1049{
1050 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1051}
1052
1048static inline bool is_exception(u32 intr_info) 1053static inline bool is_exception(u32 intr_info)
1049{ 1054{
1050 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1055 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2155static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2160static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2156static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2161static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2157static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2162static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2163static u32 nested_vmx_ept_caps;
2158static __init void nested_vmx_setup_ctls_msrs(void) 2164static __init void nested_vmx_setup_ctls_msrs(void)
2159{ 2165{
2160 /* 2166 /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2190 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2196 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2191 * 17 must be 1. 2197 * 17 must be 1.
2192 */ 2198 */
2199 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2200 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2193 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2201 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2194 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2202 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2203 nested_vmx_exit_ctls_high &=
2195#ifdef CONFIG_X86_64 2204#ifdef CONFIG_X86_64
2196 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2205 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2197#else
2198 nested_vmx_exit_ctls_high = 0;
2199#endif 2206#endif
2200 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER);
2201 2210
2202 /* entry controls */ 2211 /* entry controls */
2203 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2205 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2214 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2206 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2215 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2207 nested_vmx_entry_ctls_high &= 2216 nested_vmx_entry_ctls_high &=
2208 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2217#ifdef CONFIG_X86_64
2209 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2218 VM_ENTRY_IA32E_MODE |
2219#endif
2220 VM_ENTRY_LOAD_IA32_PAT;
2221 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2222 VM_ENTRY_LOAD_IA32_EFER);
2210 2223
2211 /* cpu-based controls */ 2224 /* cpu-based controls */
2212 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2225 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2242 SECONDARY_EXEC_WBINVD_EXITING; 2255 SECONDARY_EXEC_WBINVD_EXITING;
2243 2256
2257 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /*
 6264 * Since invept is completely emulated, we support both global
 6265 * and context invalidation independently of what the host CPU
 6266 * supports.
2267 */
2268 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2269 VMX_EPT_EXTENT_CONTEXT_BIT;
2270 } else
2271 nested_vmx_ept_caps = 0;
2272
2244 /* miscellaneous data */ 2273 /* miscellaneous data */
2245 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2274 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2246 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2275 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2282 2311
2283 switch (msr_index) { 2312 switch (msr_index) {
2284 case MSR_IA32_FEATURE_CONTROL: 2313 case MSR_IA32_FEATURE_CONTROL:
2285 *pdata = 0; 2314 if (nested_vmx_allowed(vcpu)) {
2286 break; 2315 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2316 break;
2317 }
2318 return 0;
2287 case MSR_IA32_VMX_BASIC: 2319 case MSR_IA32_VMX_BASIC:
2288 /* 2320 /*
2289 * This MSR reports some information about VMX support. We 2321 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2346 nested_vmx_secondary_ctls_high); 2378 nested_vmx_secondary_ctls_high);
2347 break; 2379 break;
2348 case MSR_IA32_VMX_EPT_VPID_CAP: 2380 case MSR_IA32_VMX_EPT_VPID_CAP:
2349 /* Currently, no nested ept or nested vpid */ 2381 /* Currently, no nested vpid support */
2350 *pdata = 0; 2382 *pdata = nested_vmx_ept_caps;
2351 break; 2383 break;
2352 default: 2384 default:
2353 return 0; 2385 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2356 return 1; 2388 return 1;
2357} 2389}
2358 2390
2359static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2391static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2360{ 2392{
2393 u32 msr_index = msr_info->index;
2394 u64 data = msr_info->data;
2395 bool host_initialized = msr_info->host_initiated;
2396
2361 if (!nested_vmx_allowed(vcpu)) 2397 if (!nested_vmx_allowed(vcpu))
2362 return 0; 2398 return 0;
2363 2399
2364 if (msr_index == MSR_IA32_FEATURE_CONTROL) 2400 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2365 /* TODO: the right thing. */ 2401 if (!host_initialized &&
2402 to_vmx(vcpu)->nested.msr_ia32_feature_control
2403 & FEATURE_CONTROL_LOCKED)
2404 return 0;
2405 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2366 return 1; 2406 return 1;
2407 }
2408
2367 /* 2409 /*
2368 * No need to treat VMX capability MSRs specially: If we don't handle 2410 * No need to treat VMX capability MSRs specially: If we don't handle
2369 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) 2411 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2494 return 1; 2536 return 1;
2495 /* Otherwise falls through */ 2537 /* Otherwise falls through */
2496 default: 2538 default:
2497 if (vmx_set_vmx_msr(vcpu, msr_index, data)) 2539 if (vmx_set_vmx_msr(vcpu, msr_info))
2498 break; 2540 break;
2499 msr = find_msr_entry(vmx, msr_index); 2541 msr = find_msr_entry(vmx, msr_index);
2500 if (msr) { 2542 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
5302 5344
5303 /* Is it a write fault? */ 5345 /* Is it a write fault? */
5304 error_code = exit_qualification & (1U << 1); 5346 error_code = exit_qualification & (1U << 1);
 5347 /* Is it a fetch fault? */
5348 error_code |= (exit_qualification & (1U << 2)) << 2;
5305 /* ept page table is present? */ 5349 /* ept page table is present? */
5306 error_code |= (exit_qualification >> 3) & 0x1; 5350 error_code |= (exit_qualification >> 3) & 0x1;
5307 5351
5352 vcpu->arch.exit_qualification = exit_qualification;
5353
5308 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5354 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5309} 5355}
5310 5356
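For reference, handle_ept_violation above folds three bits of the EPT exit qualification into a page-fault style error code (write, fetch, present). A standalone sketch of just that bit shuffling, using the shift amounts from the hunk; the sample input value is made up.

#include <stdint.h>
#include <stdio.h>

/*
 * Exit qualification bit 1 = write access, bit 2 = instruction fetch,
 * bit 3 = translation was present in the EPT tables.  The result uses
 * the page-fault error-code layout: bit 0 present, bit 1 write, bit 4 fetch.
 */
static uint32_t ept_error_code(uint64_t exit_qualification)
{
	uint32_t error_code;

	error_code  = exit_qualification & (1U << 1);		/* write   -> bit 1 */
	error_code |= (exit_qualification & (1U << 2)) << 2;	/* fetch   -> bit 4 */
	error_code |= (exit_qualification >> 3) & 0x1;		/* present -> bit 0 */
	return error_code;
}

int main(void)
{
	/* write to a present translation: qualification bits 1 and 3 set */
	printf("%#x\n", ept_error_code((1 << 1) | (1 << 3)));	/* prints 0x3 */
	return 0;
}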
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5438 5484
5439 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5485 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5440 5486
5441 if (err == EMULATE_DO_MMIO) { 5487 if (err == EMULATE_USER_EXIT) {
5488 ++vcpu->stat.mmio_exits;
5442 ret = 0; 5489 ret = 0;
5443 goto out; 5490 goto out;
5444 } 5491 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5567 free_loaded_vmcs(&vmx->vmcs01); 5614 free_loaded_vmcs(&vmx->vmcs01);
5568} 5615}
5569 5616
5617/*
5618 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5619 * set the success or error code of an emulated VMX instruction, as specified
5620 * by Vol 2B, VMX Instruction Reference, "Conventions".
5621 */
5622static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5623{
5624 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5625 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5626 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5627}
5628
5629static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5630{
5631 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5632 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5633 X86_EFLAGS_SF | X86_EFLAGS_OF))
5634 | X86_EFLAGS_CF);
5635}
5636
5570static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5637static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5571 u32 vm_instruction_error); 5638 u32 vm_instruction_error)
5639{
5640 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5641 /*
5642 * failValid writes the error number to the current VMCS, which
 5643 * can't be done if there isn't a current VMCS.
5644 */
5645 nested_vmx_failInvalid(vcpu);
5646 return;
5647 }
5648 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5649 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5650 X86_EFLAGS_SF | X86_EFLAGS_OF))
5651 | X86_EFLAGS_ZF);
5652 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5653 /*
5654 * We don't need to force a shadow sync because
5655 * VM_INSTRUCTION_ERROR is not shadowed
5656 */
5657}
5572 5658
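The three helpers moved above implement the RFLAGS convention for emulated VMX instructions: VMsucceed clears all six arithmetic flags, VMfailInvalid leaves only CF set, and VMfailValid leaves only ZF set (plus the error number written to the current VMCS). A small sketch of the net flag arithmetic on a plain integer, using the standard EFLAGS bit values:

#include <stdio.h>

#define X86_EFLAGS_CF 0x0001
#define X86_EFLAGS_PF 0x0004
#define X86_EFLAGS_AF 0x0010
#define X86_EFLAGS_ZF 0x0040
#define X86_EFLAGS_SF 0x0080
#define X86_EFLAGS_OF 0x0800

#define VMX_ARITH_FLAGS (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
			 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static unsigned long vmsucceed(unsigned long rflags)
{
	return rflags & ~VMX_ARITH_FLAGS;			/* all six flags cleared */
}

static unsigned long vmfail_invalid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_CF;	/* only CF set */
}

static unsigned long vmfail_valid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_ZF;	/* only ZF set */
}

int main(void)
{
	unsigned long rflags = 0x8d5;	/* arbitrary starting value */

	printf("succeed:      %#lx\n", vmsucceed(rflags));
	printf("fail invalid: %#lx\n", vmfail_invalid(rflags));
	printf("fail valid:   %#lx\n", vmfail_valid(rflags));
	return 0;
}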
5573/* 5659/*
5574 * Emulate the VMXON instruction. 5660 * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5583 struct kvm_segment cs; 5669 struct kvm_segment cs;
5584 struct vcpu_vmx *vmx = to_vmx(vcpu); 5670 struct vcpu_vmx *vmx = to_vmx(vcpu);
5585 struct vmcs *shadow_vmcs; 5671 struct vmcs *shadow_vmcs;
5672 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5673 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5586 5674
5587 /* The Intel VMX Instruction Reference lists a bunch of bits that 5675 /* The Intel VMX Instruction Reference lists a bunch of bits that
5588 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5676 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5611 skip_emulated_instruction(vcpu); 5699 skip_emulated_instruction(vcpu);
5612 return 1; 5700 return 1;
5613 } 5701 }
5702
5703 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5704 != VMXON_NEEDED_FEATURES) {
5705 kvm_inject_gp(vcpu, 0);
5706 return 1;
5707 }
5708
5614 if (enable_shadow_vmcs) { 5709 if (enable_shadow_vmcs) {
5615 shadow_vmcs = alloc_vmcs(); 5710 shadow_vmcs = alloc_vmcs();
5616 if (!shadow_vmcs) 5711 if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5628 vmx->nested.vmxon = true; 5723 vmx->nested.vmxon = true;
5629 5724
5630 skip_emulated_instruction(vcpu); 5725 skip_emulated_instruction(vcpu);
5726 nested_vmx_succeed(vcpu);
5631 return 1; 5727 return 1;
5632} 5728}
5633 5729
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5712 return 1; 5808 return 1;
5713 free_nested(to_vmx(vcpu)); 5809 free_nested(to_vmx(vcpu));
5714 skip_emulated_instruction(vcpu); 5810 skip_emulated_instruction(vcpu);
5811 nested_vmx_succeed(vcpu);
5715 return 1; 5812 return 1;
5716} 5813}
5717 5814
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5768 return 0; 5865 return 0;
5769} 5866}
5770 5867
5771/*
5772 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5773 * set the success or error code of an emulated VMX instruction, as specified
5774 * by Vol 2B, VMX Instruction Reference, "Conventions".
5775 */
5776static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5777{
5778 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5779 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5780 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5781}
5782
5783static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5784{
5785 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5786 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5787 X86_EFLAGS_SF | X86_EFLAGS_OF))
5788 | X86_EFLAGS_CF);
5789}
5790
5791static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5792 u32 vm_instruction_error)
5793{
5794 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5795 /*
5796 * failValid writes the error number to the current VMCS, which
5797 * can't be done if there isn't a current VMCS.
5798 */
5799 nested_vmx_failInvalid(vcpu);
5800 return;
5801 }
5802 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5803 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5804 X86_EFLAGS_SF | X86_EFLAGS_OF))
5805 | X86_EFLAGS_ZF);
5806 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5807 /*
5808 * We don't need to force a shadow sync because
5809 * VM_INSTRUCTION_ERROR is not shadowed
5810 */
5811}
5812
5813/* Emulate the VMCLEAR instruction */ 5868/* Emulate the VMCLEAR instruction */
5814static int handle_vmclear(struct kvm_vcpu *vcpu) 5869static int handle_vmclear(struct kvm_vcpu *vcpu)
5815{ 5870{
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5972 unsigned long field; 6027 unsigned long field;
5973 u64 field_value; 6028 u64 field_value;
5974 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6029 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5975 unsigned long *fields = (unsigned long *)shadow_read_write_fields; 6030 const unsigned long *fields = shadow_read_write_fields;
5976 int num_fields = max_shadow_read_write_fields; 6031 const int num_fields = max_shadow_read_write_fields;
5977 6032
5978 vmcs_load(shadow_vmcs); 6033 vmcs_load(shadow_vmcs);
5979 6034
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6002 6057
6003static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6058static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6004{ 6059{
6005 unsigned long *fields[] = { 6060 const unsigned long *fields[] = {
6006 (unsigned long *)shadow_read_write_fields, 6061 shadow_read_write_fields,
6007 (unsigned long *)shadow_read_only_fields 6062 shadow_read_only_fields
6008 }; 6063 };
6009 int num_lists = ARRAY_SIZE(fields); 6064 const int max_fields[] = {
6010 int max_fields[] = {
6011 max_shadow_read_write_fields, 6065 max_shadow_read_write_fields,
6012 max_shadow_read_only_fields 6066 max_shadow_read_only_fields
6013 }; 6067 };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6018 6072
6019 vmcs_load(shadow_vmcs); 6073 vmcs_load(shadow_vmcs);
6020 6074
6021 for (q = 0; q < num_lists; q++) { 6075 for (q = 0; q < ARRAY_SIZE(fields); q++) {
6022 for (i = 0; i < max_fields[q]; i++) { 6076 for (i = 0; i < max_fields[q]; i++) {
6023 field = fields[q][i]; 6077 field = fields[q][i];
6024 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6078 vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6248 return 1; 6302 return 1;
6249} 6303}
6250 6304
6305/* Emulate the INVEPT instruction */
6306static int handle_invept(struct kvm_vcpu *vcpu)
6307{
6308 u32 vmx_instruction_info, types;
6309 unsigned long type;
6310 gva_t gva;
6311 struct x86_exception e;
6312 struct {
6313 u64 eptp, gpa;
6314 } operand;
6315 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6316
6317 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6318 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6319 kvm_queue_exception(vcpu, UD_VECTOR);
6320 return 1;
6321 }
6322
6323 if (!nested_vmx_check_permission(vcpu))
6324 return 1;
6325
6326 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6327 kvm_queue_exception(vcpu, UD_VECTOR);
6328 return 1;
6329 }
6330
6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6332 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6333
6334 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6335
6336 if (!(types & (1UL << type))) {
6337 nested_vmx_failValid(vcpu,
6338 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6339 return 1;
6340 }
6341
6342 /* According to the Intel VMX instruction reference, the memory
6343 * operand is read even if it isn't needed (e.g., for type==global)
6344 */
6345 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6346 vmx_instruction_info, &gva))
6347 return 1;
6348 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6349 sizeof(operand), &e)) {
6350 kvm_inject_page_fault(vcpu, &e);
6351 return 1;
6352 }
6353
6354 switch (type) {
6355 case VMX_EPT_EXTENT_CONTEXT:
6356 if ((operand.eptp & eptp_mask) !=
6357 (nested_ept_get_cr3(vcpu) & eptp_mask))
6358 break;
6359 case VMX_EPT_EXTENT_GLOBAL:
6360 kvm_mmu_sync_roots(vcpu);
6361 kvm_mmu_flush_tlb(vcpu);
6362 nested_vmx_succeed(vcpu);
6363 break;
6364 default:
6365 BUG_ON(1);
6366 break;
6367 }
6368
6369 skip_emulated_instruction(vcpu);
6370 return 1;
6371}
6372
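The type check in handle_invept is terse: the supported-extent bits of the EPT/VPID capability word are shifted down so that they line up with the INVEPT type encodings (1 = single-context, 2 = global). A sketch of that test; the shift and bit positions follow the usual vmx.h definitions and are restated here as assumptions, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* assumed encodings, mirroring the Intel SDM / vmx.h definitions */
#define VMX_EPT_EXTENT_CONTEXT		1
#define VMX_EPT_EXTENT_GLOBAL		2
#define VMX_EPT_EXTENT_SHIFT		24
#define VMX_EPT_EXTENT_CONTEXT_BIT	(1ULL << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT	(1ULL << 26)

static int invept_type_supported(uint64_t ept_caps, unsigned long type)
{
	/* bits 25/26 of the caps word become bits 1/2 of "types",
	 * matching the 1/2 encoding of the INVEPT type operand */
	uint32_t types = (ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	return !!(types & (1UL << type));
}

int main(void)
{
	uint64_t caps = VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT;

	printf("global:  %d\n", invept_type_supported(caps, VMX_EPT_EXTENT_GLOBAL));
	printf("context: %d\n", invept_type_supported(caps, VMX_EPT_EXTENT_CONTEXT));
	printf("type 0:  %d\n", invept_type_supported(caps, 0));	/* unsupported */
	return 0;
}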
6251/* 6373/*
6252 * The exit handlers return 1 if the exit was handled fully and guest execution 6374 * The exit handlers return 1 if the exit was handled fully and guest execution
6253 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6375 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6292 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6414 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6293 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6415 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
6294 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6416 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6417 [EXIT_REASON_INVEPT] = handle_invept,
6295}; 6418};
6296 6419
6297static const int kvm_vmx_max_exit_handlers = 6420static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6518 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 6641 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
6519 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 6642 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
6520 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6643 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6644 case EXIT_REASON_INVEPT:
6521 /* 6645 /*
6522 * VMX instructions trap unconditionally. This allows L1 to 6646 * VMX instructions trap unconditionally. This allows L1 to
6523 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6647 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6550 return nested_cpu_has2(vmcs12, 6674 return nested_cpu_has2(vmcs12,
6551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 6675 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
6552 case EXIT_REASON_EPT_VIOLATION: 6676 case EXIT_REASON_EPT_VIOLATION:
6677 /*
6678 * L0 always deals with the EPT violation. If nested EPT is
6679 * used, and the nested mmu code discovers that the address is
6680 * missing in the guest EPT table (EPT12), the EPT violation
6681 * will be injected with nested_ept_inject_page_fault()
6682 */
6683 return 0;
6553 case EXIT_REASON_EPT_MISCONFIG: 6684 case EXIT_REASON_EPT_MISCONFIG:
6685 /*
 6686 * L2 never directly uses L1's EPT, but rather L0's own EPT
 6687 * table (shadow on EPT) or a merged EPT table that L0 built
 6688 * (EPT on EPT). So any problems with the structure of the
 6689 * table are L0's fault.
6690 */
6554 return 0; 6691 return 0;
6555 case EXIT_REASON_PREEMPTION_TIMER: 6692 case EXIT_REASON_PREEMPTION_TIMER:
6556 return vmcs12->pin_based_vm_exec_control & 6693 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6638 6775
6639 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6776 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6640 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6777 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6641 get_vmcs12(vcpu), vcpu)))) { 6778 get_vmcs12(vcpu))))) {
6642 if (vmx_interrupt_allowed(vcpu)) { 6779 if (vmx_interrupt_allowed(vcpu)) {
6643 vmx->soft_vnmi_blocked = 0; 6780 vmx->soft_vnmi_blocked = 0;
6644 } else if (vmx->vnmi_blocked_time > 1000000000LL && 6781 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7326 entry->ecx |= bit(X86_FEATURE_VMX); 7463 entry->ecx |= bit(X86_FEATURE_VMX);
7327} 7464}
7328 7465
7466static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7467 struct x86_exception *fault)
7468{
7469 struct vmcs12 *vmcs12;
7470 nested_vmx_vmexit(vcpu);
7471 vmcs12 = get_vmcs12(vcpu);
7472
7473 if (fault->error_code & PFERR_RSVD_MASK)
7474 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
7475 else
7476 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
7477 vmcs12->exit_qualification = vcpu->arch.exit_qualification;
7478 vmcs12->guest_physical_address = fault->address;
7479}
7480
7481/* Callbacks for nested_ept_init_mmu_context: */
7482
7483static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7484{
7485 /* return the page table to be shadowed - in our case, EPT12 */
7486 return get_vmcs12(vcpu)->ept_pointer;
7487}
7488
7489static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7490{
7491 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7492 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7493
7494 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
7495 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
7496 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7497
7498 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7499
7500 return r;
7501}
7502
7503static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7504{
7505 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7506}
7507
7329/* 7508/*
7330 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7509 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7331 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7510 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7388 vmcs12->guest_interruptibility_info); 7567 vmcs12->guest_interruptibility_info);
7389 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7568 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
7390 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7569 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
7391 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7570 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
7392 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
7393 vmcs12->guest_pending_dbg_exceptions); 7572 vmcs12->guest_pending_dbg_exceptions);
7394 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 7573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7508 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 7687 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
7509 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 7688 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7510 7689
7511 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ 7690 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
7512 vmcs_write32(VM_EXIT_CONTROLS, 7691 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7513 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); 7692 * bits are further modified by vmx_set_efer() below.
7514 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | 7693 */
7694 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
7695
7696 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7697 * emulated by vmx_set_efer(), below.
7698 */
7699 vmcs_write32(VM_ENTRY_CONTROLS,
7700 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7701 ~VM_ENTRY_IA32E_MODE) |
7515 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7702 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
7516 7703
7517 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) 7704 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
7518 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 7705 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
7519 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 7706 vcpu->arch.pat = vmcs12->guest_ia32_pat;
7707 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
7520 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 7708 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
7521 7709
7522 7710
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7538 vmx_flush_tlb(vcpu); 7726 vmx_flush_tlb(vcpu);
7539 } 7727 }
7540 7728
7729 if (nested_cpu_has_ept(vmcs12)) {
7730 kvm_mmu_unload(vcpu);
7731 nested_ept_init_mmu_context(vcpu);
7732 }
7733
7541 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7734 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7542 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7735 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7543 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7736 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7565 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7758 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7566 kvm_mmu_reset_context(vcpu); 7759 kvm_mmu_reset_context(vcpu);
7567 7760
7761 /*
 7762 * L1 may access L2's PDPTRs, so save them to construct vmcs12
7763 */
7764 if (enable_ept) {
7765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
7766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
7767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
7768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
7769 }
7770
7568 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 7771 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
7569 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 7772 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
7570} 7773}
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7887 vmcs12->guest_pending_dbg_exceptions = 8090 vmcs12->guest_pending_dbg_exceptions =
7888 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8091 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7889 8092
8093 /*
8094 * In some cases (usually, nested EPT), L2 is allowed to change its
8095 * own CR3 without exiting. If it has changed it, we must keep it.
8096 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
8097 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
8098 *
8099 * Additionally, restore L2's PDPTR to vmcs12.
8100 */
8101 if (enable_ept) {
8102 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
8103 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
8104 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
8105 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
8106 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8107 }
8108
7890 vmcs12->vm_entry_controls = 8109 vmcs12->vm_entry_controls =
7891 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8110 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7892 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8111 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7948static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8167static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7949 struct vmcs12 *vmcs12) 8168 struct vmcs12 *vmcs12)
7950{ 8169{
8170 struct kvm_segment seg;
8171
7951 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8172 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7952 vcpu->arch.efer = vmcs12->host_ia32_efer; 8173 vcpu->arch.efer = vmcs12->host_ia32_efer;
7953 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8174 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8203 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7983 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8204 kvm_set_cr4(vcpu, vmcs12->host_cr4);
7984 8205
7985 /* shadow page tables on either EPT or shadow page tables */ 8206 if (nested_cpu_has_ept(vmcs12))
8207 nested_ept_uninit_mmu_context(vcpu);
8208
7986 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8209 kvm_set_cr3(vcpu, vmcs12->host_cr3);
7987 kvm_mmu_reset_context(vcpu); 8210 kvm_mmu_reset_context(vcpu);
7988 8211
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8001 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8224 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
8002 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8225 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
8003 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8226 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
8004 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); 8227
8005 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); 8228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
8006 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
8007 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
8008 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
8009 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
8010 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
8011 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
8012 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
8013 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
8014
8015 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
8016 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8229 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
8230 vcpu->arch.pat = vmcs12->host_ia32_pat;
8231 }
8017 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8232 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8018 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8233 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
8019 vmcs12->host_ia32_perf_global_ctrl); 8234 vmcs12->host_ia32_perf_global_ctrl);
8020 8235
8236 /* Set L1 segment info according to Intel SDM
8237 27.5.2 Loading Host Segment and Descriptor-Table Registers */
8238 seg = (struct kvm_segment) {
8239 .base = 0,
8240 .limit = 0xFFFFFFFF,
8241 .selector = vmcs12->host_cs_selector,
8242 .type = 11,
8243 .present = 1,
8244 .s = 1,
8245 .g = 1
8246 };
8247 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
8248 seg.l = 1;
8249 else
8250 seg.db = 1;
8251 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
8252 seg = (struct kvm_segment) {
8253 .base = 0,
8254 .limit = 0xFFFFFFFF,
8255 .type = 3,
8256 .present = 1,
8257 .s = 1,
8258 .db = 1,
8259 .g = 1
8260 };
8261 seg.selector = vmcs12->host_ds_selector;
8262 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
8263 seg.selector = vmcs12->host_es_selector;
8264 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
8265 seg.selector = vmcs12->host_ss_selector;
8266 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
8267 seg.selector = vmcs12->host_fs_selector;
8268 seg.base = vmcs12->host_fs_base;
8269 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
8270 seg.selector = vmcs12->host_gs_selector;
8271 seg.base = vmcs12->host_gs_base;
8272 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
8273 seg = (struct kvm_segment) {
8274 .base = vmcs12->host_tr_base,
8275 .limit = 0x67,
8276 .selector = vmcs12->host_tr_selector,
8277 .type = 11,
8278 .present = 1
8279 };
8280 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
8281
8021 kvm_set_dr(vcpu, 7, 0x400); 8282 kvm_set_dr(vcpu, 7, 0x400);
8022 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8283 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
8023} 8284}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
682 */ 682 */
683 } 683 }
684 684
685 /*
686 * Does the new cr3 value map to physical memory? (Note, we
687 * catch an invalid cr3 even in real-mode, because it would
688 * cause trouble later on when we turn on paging anyway.)
689 *
690 * A real CPU would silently accept an invalid cr3 and would
691 * attempt to use it - with largely undefined (and often hard
692 * to debug) behavior on the guest side.
693 */
694 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
695 return 1;
696 vcpu->arch.cr3 = cr3; 685 vcpu->arch.cr3 = cr3;
697 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
698 vcpu->arch.mmu.new_cr3(vcpu); 687 vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
850#ifdef CONFIG_X86_64 839#ifdef CONFIG_X86_64
851 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 840 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
852#endif 841#endif
853 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 842 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
843 MSR_IA32_FEATURE_CONTROL
854}; 844};
855 845
856static unsigned num_msrs_to_save; 846static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1457#endif 1447#endif
1458} 1448}
1459 1449
1450static void kvm_gen_update_masterclock(struct kvm *kvm)
1451{
1452#ifdef CONFIG_X86_64
1453 int i;
1454 struct kvm_vcpu *vcpu;
1455 struct kvm_arch *ka = &kvm->arch;
1456
1457 spin_lock(&ka->pvclock_gtod_sync_lock);
1458 kvm_make_mclock_inprogress_request(kvm);
1459 /* no guest entries from this point */
1460 pvclock_update_vm_gtod_copy(kvm);
1461
1462 kvm_for_each_vcpu(i, vcpu, kvm)
1463 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1464
1465 /* guest entries allowed */
1466 kvm_for_each_vcpu(i, vcpu, kvm)
1467 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1468
1469 spin_unlock(&ka->pvclock_gtod_sync_lock);
1470#endif
1471}
1472
1460static int kvm_guest_time_update(struct kvm_vcpu *v) 1473static int kvm_guest_time_update(struct kvm_vcpu *v)
1461{ 1474{
1462 unsigned long flags, this_tsc_khz; 1475 unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3806 delta = user_ns.clock - now_ns; 3819 delta = user_ns.clock - now_ns;
3807 local_irq_enable(); 3820 local_irq_enable();
3808 kvm->arch.kvmclock_offset = delta; 3821 kvm->arch.kvmclock_offset = delta;
3822 kvm_gen_update_masterclock(kvm);
3809 break; 3823 break;
3810 } 3824 }
3811 case KVM_GET_CLOCK: { 3825 case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4955static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4969static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4956static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4970static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4957 4971
4972static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
4973 unsigned long *db)
4974{
4975 u32 dr6 = 0;
4976 int i;
4977 u32 enable, rwlen;
4978
4979 enable = dr7;
4980 rwlen = dr7 >> 16;
4981 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
4982 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
4983 dr6 |= (1 << i);
4984 return dr6;
4985}
4986
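kvm_vcpu_check_hw_bp above walks the DR7 layout directly: two enable bits (local/global) per breakpoint in the low byte, and one 4-bit R/W+LEN field per breakpoint starting at bit 16, where type 0 with length 0 means an instruction breakpoint. A standalone sketch of the same matching loop, with illustrative register values only:

#include <stdint.h>
#include <stdio.h>

/* Return a DR6-style hit mask: bit i is set if breakpoint i is enabled,
 * has the requested R/W+LEN type, and matches addr. */
static uint32_t check_hw_bp(unsigned long addr, uint32_t type, uint32_t dr7,
			    const unsigned long *db)
{
	uint32_t dr6 = 0, enable = dr7, rwlen = dr7 >> 16;
	int i;

	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
			dr6 |= (1 << i);
	return dr6;
}

int main(void)
{
	unsigned long db[4] = { 0x401000, 0, 0, 0 };
	uint32_t dr7 = 0x1;	/* L0 set, R/W0 = LEN0 = 0: execute breakpoint */

	/* type 0 = instruction fetch, as used for the guest RIP check */
	printf("hit mask: %#x\n", check_hw_bp(0x401000, 0, dr7, db));	/* 0x1 */
	printf("miss:     %#x\n", check_hw_bp(0x402000, 0, dr7, db));	/* 0 */
	return 0;
}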
4987static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
4988{
4989 struct kvm_run *kvm_run = vcpu->run;
4990
4991 /*
4992 * Use the "raw" value to see if TF was passed to the processor.
4993 * Note that the new value of the flags has not been saved yet.
4994 *
4995 * This is correct even for TF set by the guest, because "the
4996 * processor will not generate this exception after the instruction
4997 * that sets the TF flag".
4998 */
4999 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5000
5001 if (unlikely(rflags & X86_EFLAGS_TF)) {
5002 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5003 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5004 kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5005 kvm_run->debug.arch.exception = DB_VECTOR;
5006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5007 *r = EMULATE_USER_EXIT;
5008 } else {
5009 vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5010 /*
5011 * "Certain debug exceptions may clear bit 0-3. The
5012 * remaining contents of the DR6 register are never
5013 * cleared by the processor".
5014 */
5015 vcpu->arch.dr6 &= ~15;
5016 vcpu->arch.dr6 |= DR6_BS;
5017 kvm_queue_exception(vcpu, DB_VECTOR);
5018 }
5019 }
5020}
5021
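The DR6 update in kvm_vcpu_check_singlestep follows the SDM wording quoted in its comment: a debug exception may clear the low hit bits, the rest of DR6 is left intact, and a completed single-step reports BS. A small sketch of that update, assuming the conventional DR6_BS (bit 14) and X86_EFLAGS_TF (bit 8) values:

#include <stdio.h>

#define X86_EFLAGS_TF	0x100UL		/* trap flag */
#define DR6_BS		(1UL << 14)	/* single-step */

/* Fold a completed single-step into DR6: drop the B0-B3 hit bits,
 * keep everything else, and set BS. */
static unsigned long dr6_after_singlestep(unsigned long dr6)
{
	dr6 &= ~15UL;
	dr6 |= DR6_BS;
	return dr6;
}

int main(void)
{
	unsigned long rflags = 0x346;		/* TF clear: nothing to report */
	unsigned long dr6 = 0xffff0ff3;		/* fixed bits plus stale B0/B1 */

	if (rflags & X86_EFLAGS_TF)
		dr6 = dr6_after_singlestep(dr6);
	printf("TF clear: dr6 = %#lx\n", dr6);	/* unchanged */

	rflags |= X86_EFLAGS_TF;		/* the instruction was single-stepped */
	if (rflags & X86_EFLAGS_TF)
		dr6 = dr6_after_singlestep(dr6);
	printf("TF set:   dr6 = %#lx\n", dr6);	/* 0xffff4ff0 */
	return 0;
}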
5022static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5023{
5024 struct kvm_run *kvm_run = vcpu->run;
5025 unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5026 u32 dr6 = 0;
5027
5028 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5029 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5030 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5031 vcpu->arch.guest_debug_dr7,
5032 vcpu->arch.eff_db);
5033
5034 if (dr6 != 0) {
5035 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5036 kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5037 get_segment_base(vcpu, VCPU_SREG_CS);
5038
5039 kvm_run->debug.arch.exception = DB_VECTOR;
5040 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5041 *r = EMULATE_USER_EXIT;
5042 return true;
5043 }
5044 }
5045
5046 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5047 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5048 vcpu->arch.dr7,
5049 vcpu->arch.db);
5050
5051 if (dr6 != 0) {
5052 vcpu->arch.dr6 &= ~15;
5053 vcpu->arch.dr6 |= dr6;
5054 kvm_queue_exception(vcpu, DB_VECTOR);
5055 *r = EMULATE_DONE;
5056 return true;
5057 }
5058 }
5059
5060 return false;
5061}
5062
4958int x86_emulate_instruction(struct kvm_vcpu *vcpu, 5063int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4959 unsigned long cr2, 5064 unsigned long cr2,
4960 int emulation_type, 5065 int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4975 5080
4976 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 5081 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4977 init_emulate_ctxt(vcpu); 5082 init_emulate_ctxt(vcpu);
5083
5084 /*
5085 * We will reenter on the same instruction since
5086 * we do not set complete_userspace_io. This does not
5087 * handle watchpoints yet, those would be handled in
5088 * the emulate_ops.
5089 */
5090 if (kvm_vcpu_check_breakpoint(vcpu, &r))
5091 return r;
5092
4978 ctxt->interruptibility = 0; 5093 ctxt->interruptibility = 0;
4979 ctxt->have_exception = false; 5094 ctxt->have_exception = false;
4980 ctxt->perm_ok = false; 5095 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
5031 inject_emulated_exception(vcpu); 5146 inject_emulated_exception(vcpu);
5032 r = EMULATE_DONE; 5147 r = EMULATE_DONE;
5033 } else if (vcpu->arch.pio.count) { 5148 } else if (vcpu->arch.pio.count) {
5034 if (!vcpu->arch.pio.in) 5149 if (!vcpu->arch.pio.in) {
5150 /* FIXME: return into emulator if single-stepping. */
5035 vcpu->arch.pio.count = 0; 5151 vcpu->arch.pio.count = 0;
5036 else { 5152 } else {
5037 writeback = false; 5153 writeback = false;
5038 vcpu->arch.complete_userspace_io = complete_emulated_pio; 5154 vcpu->arch.complete_userspace_io = complete_emulated_pio;
5039 } 5155 }
5040 r = EMULATE_DO_MMIO; 5156 r = EMULATE_USER_EXIT;
5041 } else if (vcpu->mmio_needed) { 5157 } else if (vcpu->mmio_needed) {
5042 if (!vcpu->mmio_is_write) 5158 if (!vcpu->mmio_is_write)
5043 writeback = false; 5159 writeback = false;
5044 r = EMULATE_DO_MMIO; 5160 r = EMULATE_USER_EXIT;
5045 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5161 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5046 } else if (r == EMULATION_RESTART) 5162 } else if (r == EMULATION_RESTART)
5047 goto restart; 5163 goto restart;
@@ -5050,10 +5166,12 @@ restart:
5050 5166
5051 if (writeback) { 5167 if (writeback) {
5052 toggle_interruptibility(vcpu, ctxt->interruptibility); 5168 toggle_interruptibility(vcpu, ctxt->interruptibility);
5053 kvm_set_rflags(vcpu, ctxt->eflags);
5054 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 kvm_make_request(KVM_REQ_EVENT, vcpu);
5055 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5170 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5056 kvm_rip_write(vcpu, ctxt->eip); 5171 kvm_rip_write(vcpu, ctxt->eip);
5172 if (r == EMULATE_DONE)
5173 kvm_vcpu_check_singlestep(vcpu, &r);
5174 kvm_set_rflags(vcpu, ctxt->eflags);
5057 } else 5175 } else
5058 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 5176 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5059 5177
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
5347int kvm_arch_init(void *opaque) 5465int kvm_arch_init(void *opaque)
5348{ 5466{
5349 int r; 5467 int r;
5350 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5468 struct kvm_x86_ops *ops = opaque;
5351 5469
5352 if (kvm_x86_ops) { 5470 if (kvm_x86_ops) {
5353 printk(KERN_ERR "kvm: already loaded the other module\n"); 5471 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5495 return 1; 5613 return 1;
5496} 5614}
5497 5615
5616/*
5617 * kvm_pv_kick_cpu_op: Kick a vcpu.
5618 *
5619 * @apicid - apicid of vcpu to be kicked.
5620 */
5621static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5622{
5623 struct kvm_lapic_irq lapic_irq;
5624
5625 lapic_irq.shorthand = 0;
5626 lapic_irq.dest_mode = 0;
5627 lapic_irq.dest_id = apicid;
5628
5629 lapic_irq.delivery_mode = APIC_DM_REMRD;
5630 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5631}
5632
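kvm_pv_kick_cpu_op above is the host half of the new KVM_HC_KICK_CPU hypercall; the guest side issues a vmcall with the hypercall number in RAX and the arguments in RBX/RCX. The sketch below illustrates that ABI from the guest's point of view; the hypercall number and register convention are assumptions based on kvm_para.h, and the call is only meaningful when executed inside a KVM guest that supports PV unhalt.

#include <stdio.h>

#define KVM_HC_KICK_CPU 5	/* assumed hypercall number */

/* Guest-side two-argument hypercall: nr in RAX, args in RBX/RCX,
 * result returned in RAX.  Faults if run outside a KVM guest. */
static long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1), "c"(p2)
		     : "memory");
	return ret;
}

int main(void)
{
	/* kick the halted vcpu with APIC ID 1; the flags argument is 0 for now */
	long ret = kvm_hypercall2(KVM_HC_KICK_CPU, 0, 1);

	printf("hypercall returned %ld\n", ret);
	return 0;
}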
5498int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5633int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5499{ 5634{
5500 unsigned long nr, a0, a1, a2, a3, ret; 5635 unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5528 case KVM_HC_VAPIC_POLL_IRQ: 5663 case KVM_HC_VAPIC_POLL_IRQ:
5529 ret = 0; 5664 ret = 0;
5530 break; 5665 break;
5666 case KVM_HC_KICK_CPU:
5667 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5668 ret = 0;
5669 break;
5531 default: 5670 default:
5532 ret = -KVM_ENOSYS; 5671 ret = -KVM_ENOSYS;
5533 break; 5672 break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5689 kvm_make_request(KVM_REQ_EVENT, vcpu); 5828 kvm_make_request(KVM_REQ_EVENT, vcpu);
5690} 5829}
5691 5830
5692static void kvm_gen_update_masterclock(struct kvm *kvm)
5693{
5694#ifdef CONFIG_X86_64
5695 int i;
5696 struct kvm_vcpu *vcpu;
5697 struct kvm_arch *ka = &kvm->arch;
5698
5699 spin_lock(&ka->pvclock_gtod_sync_lock);
5700 kvm_make_mclock_inprogress_request(kvm);
5701 /* no guest entries from this point */
5702 pvclock_update_vm_gtod_copy(kvm);
5703
5704 kvm_for_each_vcpu(i, vcpu, kvm)
5705 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5706
5707 /* guest entries allowed */
5708 kvm_for_each_vcpu(i, vcpu, kvm)
5709 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5710
5711 spin_unlock(&ka->pvclock_gtod_sync_lock);
5712#endif
5713}
5714
5715static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5831static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5716{ 5832{
5717 u64 eoi_exit_bitmap[4]; 5833 u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5950 kvm_apic_accept_events(vcpu); 6066 kvm_apic_accept_events(vcpu);
5951 switch(vcpu->arch.mp_state) { 6067 switch(vcpu->arch.mp_state) {
5952 case KVM_MP_STATE_HALTED: 6068 case KVM_MP_STATE_HALTED:
6069 vcpu->arch.pv.pv_unhalted = false;
5953 vcpu->arch.mp_state = 6070 vcpu->arch.mp_state =
5954 KVM_MP_STATE_RUNNABLE; 6071 KVM_MP_STATE_RUNNABLE;
5955 case KVM_MP_STATE_RUNNABLE: 6072 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6061 6178
6062 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6179 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
6063 vcpu->mmio_needed = 0; 6180 vcpu->mmio_needed = 0;
6181
6182 /* FIXME: return into emulator if single-stepping. */
6064 if (vcpu->mmio_is_write) 6183 if (vcpu->mmio_is_write)
6065 return 1; 6184 return 1;
6066 vcpu->mmio_read_completed = 1; 6185 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6249 struct kvm_mp_state *mp_state) 6368 struct kvm_mp_state *mp_state)
6250{ 6369{
6251 kvm_apic_accept_events(vcpu); 6370 kvm_apic_accept_events(vcpu);
6252 mp_state->mp_state = vcpu->arch.mp_state; 6371 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6372 vcpu->arch.pv.pv_unhalted)
6373 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6374 else
6375 mp_state->mp_state = vcpu->arch.mp_state;
6376
6253 return 0; 6377 return 0;
6254} 6378}
6255 6379
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6770 BUG_ON(vcpu->kvm == NULL); 6894 BUG_ON(vcpu->kvm == NULL);
6771 kvm = vcpu->kvm; 6895 kvm = vcpu->kvm;
6772 6896
6897 vcpu->arch.pv.pv_unhalted = false;
6773 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6898 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6774 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6899 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6775 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6900 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
7019 return -ENOMEM; 7144 return -ENOMEM;
7020} 7145}
7021 7146
7147void kvm_arch_memslots_updated(struct kvm *kvm)
7148{
7149 /*
7150 * memslots->generation has been incremented.
7151 * mmio generation may have reached its maximum value.
7152 */
7153 kvm_mmu_invalidate_mmio_sptes(kvm);
7154}
7155
7022int kvm_arch_prepare_memory_region(struct kvm *kvm, 7156int kvm_arch_prepare_memory_region(struct kvm *kvm,
7023 struct kvm_memory_slot *memslot, 7157 struct kvm_memory_slot *memslot,
7024 struct kvm_userspace_memory_region *mem, 7158 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7079 */ 7213 */
7080 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7214 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7081 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7215 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7082 /*
7083 * If memory slot is created, or moved, we need to clear all
7084 * mmio sptes.
7085 */
7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7087} 7216}
7088 7217
7089void kvm_arch_flush_shadow_all(struct kvm *kvm) 7218void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7103 !vcpu->arch.apf.halted) 7232 !vcpu->arch.apf.halted)
7104 || !list_empty_careful(&vcpu->async_pf.done) 7233 || !list_empty_careful(&vcpu->async_pf.done)
7105 || kvm_apic_has_events(vcpu) 7234 || kvm_apic_has_events(vcpu)
7235 || vcpu->arch.pv.pv_unhalted
7106 || atomic_read(&vcpu->arch.nmi_queued) || 7236 || atomic_read(&vcpu->arch.nmi_queued) ||
7107 (kvm_arch_interrupt_allowed(vcpu) && 7237 (kvm_arch_interrupt_allowed(vcpu) &&
7108 kvm_cpu_has_interrupt(vcpu)); 7238 kvm_cpu_has_interrupt(vcpu));
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 6a22c19da663..bdf8532494fe 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,8 +7,7 @@
7 * kernel and insert a module (lg.ko) which allows us to run other Linux 7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host, 8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests 9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/virtual/lguest/lguest.c) is called the 10 * (such as the example in tools/lguest/lguest.c) is called the Launcher.
11 * Launcher.
12 * 11 *
13 * Secondly, we only run specially modified Guests, not normal kernels: setting 12 * Secondly, we only run specially modified Guests, not normal kernels: setting
14 * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows 13 * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
@@ -1057,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss,
1057} 1056}
1058 1057
1059/* Let's just say, I wouldn't do debugging under a Guest. */ 1058/* Let's just say, I wouldn't do debugging under a Guest. */
1059static unsigned long lguest_get_debugreg(int regno)
1060{
1061 /* FIXME: Implement */
1062 return 0;
1063}
1064
1060static void lguest_set_debugreg(int regno, unsigned long value) 1065static void lguest_set_debugreg(int regno, unsigned long value)
1061{ 1066{
1062 /* FIXME: Implement */ 1067 /* FIXME: Implement */
@@ -1304,6 +1309,7 @@ __init void lguest_init(void)
1304 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 1309 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
1305 pv_cpu_ops.set_ldt = lguest_set_ldt; 1310 pv_cpu_ops.set_ldt = lguest_set_ldt;
1306 pv_cpu_ops.load_tls = lguest_load_tls; 1311 pv_cpu_ops.load_tls = lguest_load_tls;
1312 pv_cpu_ops.get_debugreg = lguest_get_debugreg;
1307 pv_cpu_ops.set_debugreg = lguest_set_debugreg; 1313 pv_cpu_ops.set_debugreg = lguest_set_debugreg;
1308 pv_cpu_ops.clts = lguest_clts; 1314 pv_cpu_ops.clts = lguest_clts;
1309 pv_cpu_ops.read_cr0 = lguest_read_cr0; 1315 pv_cpu_ops.read_cr0 = lguest_read_cr0;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 7e73e8c69096..9d980d88b747 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
59 return NULL; 59 return NULL;
60} 60}
61 61
62int pmd_huge_support(void)
63{
64 return 0;
65}
62#else 66#else
63 67
64struct page * 68struct page *
@@ -77,6 +81,10 @@ int pud_huge(pud_t pud)
77 return !!(pud_val(pud) & _PAGE_PSE); 81 return !!(pud_val(pud) & _PAGE_PSE);
78} 82}
79 83
84int pmd_huge_support(void)
85{
86 return 1;
87}
80#endif 88#endif
81 89
82/* x86_64 also uses this file */ 90/* x86_64 also uses this file */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 282375f13c7e..ae699b3bbac8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,6 +103,7 @@ static void flush_tlb_func(void *info)
103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
104 return; 104 return;
105 105
106 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
106 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
107 if (f->flush_end == TLB_FLUSH_ALL) 108 if (f->flush_end == TLB_FLUSH_ALL)
108 local_flush_tlb(); 109 local_flush_tlb();
@@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
130 info.flush_start = start; 131 info.flush_start = start;
131 info.flush_end = end; 132 info.flush_end = end;
132 133
134 count_vm_event(NR_TLB_REMOTE_FLUSH);
133 if (is_uv_system()) { 135 if (is_uv_system()) {
134 unsigned int cpu; 136 unsigned int cpu;
135 137
@@ -149,6 +151,7 @@ void flush_tlb_current_task(void)
149 151
150 preempt_disable(); 152 preempt_disable();
151 153
154 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
152 local_flush_tlb(); 155 local_flush_tlb();
153 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
154 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
211 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; 214 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
212 215
213 /* tlb_flushall_shift is on balance point, details in commit log */ 216 /* tlb_flushall_shift is on balance point, details in commit log */
214 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) 217 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
218 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
215 local_flush_tlb(); 219 local_flush_tlb();
216 else { 220 } else {
217 if (has_large_page(mm, start, end)) { 221 if (has_large_page(mm, start, end)) {
218 local_flush_tlb(); 222 local_flush_tlb();
219 goto flush_all; 223 goto flush_all;
220 } 224 }
221 /* flush range by one by one 'invlpg' */ 225 /* flush range by one by one 'invlpg' */
222 for (addr = start; addr < end; addr += PAGE_SIZE) 226 for (addr = start; addr < end; addr += PAGE_SIZE) {
227 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
223 __flush_tlb_single(addr); 228 __flush_tlb_single(addr);
229 }
224 230
225 if (cpumask_any_but(mm_cpumask(mm), 231 if (cpumask_any_but(mm_cpumask(mm),
226 smp_processor_id()) < nr_cpu_ids) 232 smp_processor_id()) < nr_cpu_ids)
@@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
256 262
257static void do_flush_tlb_all(void *info) 263static void do_flush_tlb_all(void *info)
258{ 264{
265 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
259 __flush_tlb_all(); 266 __flush_tlb_all();
260 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 267 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
261 leave_mm(smp_processor_id()); 268 leave_mm(smp_processor_id());
@@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info)
263 270
264void flush_tlb_all(void) 271void flush_tlb_all(void)
265{ 272{
273 count_vm_event(NR_TLB_REMOTE_FLUSH);
266 on_each_cpu(do_flush_tlb_all, NULL, 1); 274 on_each_cpu(do_flush_tlb_all, NULL, 1);
267} 275}
268 276
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 48768df2471a..6890d8498e0b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy)
403 nmi_cpu_shutdown(dummy); 403 nmi_cpu_shutdown(dummy);
404} 404}
405 405
406static int nmi_create_files(struct super_block *sb, struct dentry *root) 406static int nmi_create_files(struct dentry *root)
407{ 407{
408 unsigned int i; 408 unsigned int i;
409 409
@@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
420 continue; 420 continue;
421 421
422 snprintf(buf, sizeof(buf), "%d", i); 422 snprintf(buf, sizeof(buf), "%d", i);
423 dir = oprofilefs_mkdir(sb, root, buf); 423 dir = oprofilefs_mkdir(root, buf);
424 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 424 oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
425 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 425 oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
426 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 426 oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
427 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 427 oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
428 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 428 oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
429 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 429 oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
430 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); 430 oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
431 } 431 }
432 432
433 return 0; 433 return 0;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b2b94438ff05..50d86c0e9ba4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -454,16 +454,16 @@ static void init_ibs(void)
454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); 454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
455} 455}
456 456
457static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 457static int (*create_arch_files)(struct dentry *root);
458 458
459static int setup_ibs_files(struct super_block *sb, struct dentry *root) 459static int setup_ibs_files(struct dentry *root)
460{ 460{
461 struct dentry *dir; 461 struct dentry *dir;
462 int ret = 0; 462 int ret = 0;
463 463
464 /* architecture specific files */ 464 /* architecture specific files */
465 if (create_arch_files) 465 if (create_arch_files)
466 ret = create_arch_files(sb, root); 466 ret = create_arch_files(root);
467 467
468 if (ret) 468 if (ret)
469 return ret; 469 return ret;
@@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
479 ibs_config.max_cnt_op = 250000; 479 ibs_config.max_cnt_op = 250000;
480 480
481 if (ibs_caps & IBS_CAPS_FETCHSAM) { 481 if (ibs_caps & IBS_CAPS_FETCHSAM) {
482 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 482 dir = oprofilefs_mkdir(root, "ibs_fetch");
483 oprofilefs_create_ulong(sb, dir, "enable", 483 oprofilefs_create_ulong(dir, "enable",
484 &ibs_config.fetch_enabled); 484 &ibs_config.fetch_enabled);
485 oprofilefs_create_ulong(sb, dir, "max_count", 485 oprofilefs_create_ulong(dir, "max_count",
486 &ibs_config.max_cnt_fetch); 486 &ibs_config.max_cnt_fetch);
487 oprofilefs_create_ulong(sb, dir, "rand_enable", 487 oprofilefs_create_ulong(dir, "rand_enable",
488 &ibs_config.rand_en); 488 &ibs_config.rand_en);
489 } 489 }
490 490
491 if (ibs_caps & IBS_CAPS_OPSAM) { 491 if (ibs_caps & IBS_CAPS_OPSAM) {
492 dir = oprofilefs_mkdir(sb, root, "ibs_op"); 492 dir = oprofilefs_mkdir(root, "ibs_op");
493 oprofilefs_create_ulong(sb, dir, "enable", 493 oprofilefs_create_ulong(dir, "enable",
494 &ibs_config.op_enabled); 494 &ibs_config.op_enabled);
495 oprofilefs_create_ulong(sb, dir, "max_count", 495 oprofilefs_create_ulong(dir, "max_count",
496 &ibs_config.max_cnt_op); 496 &ibs_config.max_cnt_op);
497 if (ibs_caps & IBS_CAPS_OPCNT) 497 if (ibs_caps & IBS_CAPS_OPCNT)
498 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 498 oprofilefs_create_ulong(dir, "dispatched_ops",
499 &ibs_config.dispatched_ops); 499 &ibs_config.dispatched_ops);
500 if (ibs_caps & IBS_CAPS_BRNTRGT) 500 if (ibs_caps & IBS_CAPS_BRNTRGT)
501 oprofilefs_create_ulong(sb, dir, "branch_target", 501 oprofilefs_create_ulong(dir, "branch_target",
502 &ibs_config.branch_target); 502 &ibs_config.branch_target);
503 } 503 }
504 504
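
Annotation: both oprofile files above are mechanical conversions to the oprofilefs helpers that no longer take a struct super_block; callers and callbacks now pass only the parent dentry. A minimal standalone model of the narrower signatures follows; the struct, helpers and printf output are stand-ins, not the kernel API.

#include <stdio.h>

struct dentry { const char *name; };

/* new-style helper: parent dentry only, no struct super_block */
static struct dentry *mkdir_model(struct dentry *parent, const char *name)
{
	static struct dentry dir;

	dir.name = name;
	printf("mkdir %s under %s\n", name, parent->name);
	return &dir;
}

static void create_ulong_model(struct dentry *dir, const char *name,
                               unsigned long *val)
{
	printf("file %s/%s = %lu\n", dir->name, name, *val);
}

/* the callback pointer now matches the narrower signature as well */
static int (*create_files)(struct dentry *root);

static unsigned long enabled = 1;

static int counter_files_model(struct dentry *root)
{
	struct dentry *dir = mkdir_model(root, "0");

	create_ulong_model(dir, "enabled", &enabled);
	return 0;
}

int main(void)
{
	struct dentry root = { "root" };

	create_files = counter_files_model;
	return create_files(&root);
}
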
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 47fe66fe61f1..3ca5957b7a34 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -20,7 +20,7 @@
20#include <linux/intel_pmic_gpio.h> 20#include <linux/intel_pmic_gpio.h>
21#include <linux/spi/spi.h> 21#include <linux/spi/spi.h>
22#include <linux/i2c.h> 22#include <linux/i2c.h>
23#include <linux/i2c/pca953x.h> 23#include <linux/platform_data/pca953x.h>
24#include <linux/gpio_keys.h> 24#include <linux/gpio_keys.h>
25#include <linux/input.h> 25#include <linux/input.h>
26#include <linux/platform_device.h> 26#include <linux/platform_device.h>
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
index 9d34eddb517f..96eb2bd28832 100644
--- a/arch/x86/um/os-Linux/prctl.c
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -4,7 +4,7 @@
4 */ 4 */
5 5
6#include <sys/ptrace.h> 6#include <sys/ptrace.h>
7#include <linux/ptrace.h> 7#include <asm/ptrace.h>
8 8
9int os_arch_prctl(int pid, int code, unsigned long *addr) 9int os_arch_prctl(int pid, int code, unsigned long *addr)
10{ 10{
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
85 cycle_t ret; 85 cycle_t ret;
86 u64 last; 86 u64 last;
87 u32 version; 87 u32 version;
88 u32 migrate_count;
89 u8 flags; 88 u8 flags;
90 unsigned cpu, cpu1; 89 unsigned cpu, cpu1;
91 90
92 91
93 /* 92 /*
94 * When looping to get a consistent (time-info, tsc) pair, we 93 * Note: hypervisor must guarantee that:
95 * also need to deal with the possibility we can switch vcpus, 94 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
96 * so make sure we always re-fetch time-info for the current vcpu. 95 * 2. that per-CPU pvclock time info is updated if the
96 * underlying CPU changes.
97 * 3. that version is increased whenever underlying CPU
98 * changes.
99 *
97 */ 100 */
98 do { 101 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK; 102 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
104 107
105 pvti = get_pvti(cpu); 108 pvti = get_pvti(cpu);
106 109
107 migrate_count = pvti->migrate_count;
108
109 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 110 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
110 111
111 /* 112 /*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
117 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 118 cpu1 = __getcpu() & VGETCPU_CPU_MASK;
118 } while (unlikely(cpu != cpu1 || 119 } while (unlikely(cpu != cpu1 ||
119 (pvti->pvti.version & 1) || 120 (pvti->pvti.version & 1) ||
120 pvti->pvti.version != version || 121 pvti->pvti.version != version));
121 pvti->migrate_count != migrate_count));
122 122
123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
124 *mode = VCLOCK_NONE; 124 *mode = VCLOCK_NONE;
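
Annotation: the vread_pvclock() change drops the migrate_count re-check and relies on the guarantees spelled out in the new comment: the per-CPU pvclock version must change whenever the backing CPU changes, so the existing version/CPU retry loop is enough. Below is a standalone model of that loop with stand-in getcpu/pvti helpers; it omits the barriers the real code needs around the reads.

#include <stdint.h>
#include <stdio.h>

struct pvti_model {
	volatile uint32_t version;   /* odd while the host is updating it */
	volatile uint64_t tsc;
};

static struct pvti_model pvti0 = { .version = 2, .tsc = 1234 };

static unsigned model_getcpu(void) { return 0; }
static struct pvti_model *model_get_pvti(unsigned cpu) { (void)cpu; return &pvti0; }

static uint64_t read_clock(void)
{
	unsigned cpu, cpu1;
	uint32_t version;
	uint64_t ret;
	struct pvti_model *pvti;

	do {
		cpu = model_getcpu();
		pvti = model_get_pvti(cpu);

		version = pvti->version;      /* __pvclock_read_cycles() returns this */
		ret = pvti->tsc;

		cpu1 = model_getcpu();
	} while (cpu != cpu1 ||
	         (pvti->version & 1) ||      /* update in progress */
	         pvti->version != version);  /* bumped, e.g. after a migration */

	return ret;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)read_clock());
	return 0;
}
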
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2fcaedc0b739..fa6ade76ef3f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void)
427 427
428 if (!xen_initial_domain()) 428 if (!xen_initial_domain())
429 cpuid_leaf1_edx_mask &= 429 cpuid_leaf1_edx_mask &=
430 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 430 ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
431 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
432 431
433 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); 432 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
434 433
@@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
735 addr = (unsigned long)xen_int3; 734 addr = (unsigned long)xen_int3;
736 else if (addr == (unsigned long)stack_segment) 735 else if (addr == (unsigned long)stack_segment)
737 addr = (unsigned long)xen_stack_segment; 736 addr = (unsigned long)xen_stack_segment;
738 else if (addr == (unsigned long)double_fault || 737 else if (addr == (unsigned long)double_fault) {
739 addr == (unsigned long)nmi) {
740 /* Don't need to handle these */ 738 /* Don't need to handle these */
741 return 0; 739 return 0;
742#ifdef CONFIG_X86_MCE 740#ifdef CONFIG_X86_MCE
@@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
747 */ 745 */
748 ; 746 ;
749#endif 747#endif
750 } else { 748 } else if (addr == (unsigned long)nmi)
749 /*
750 * Use the native version as well.
751 */
752 ;
753 else {
751 /* Some other trap using IST? */ 754 /* Some other trap using IST? */
752 if (WARN_ON(val->ist != 0)) 755 if (WARN_ON(val->ist != 0))
753 return 0; 756 return 0;
@@ -1689,7 +1692,6 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
1689 case CPU_UP_PREPARE: 1692 case CPU_UP_PREPARE:
1690 xen_vcpu_setup(cpu); 1693 xen_vcpu_setup(cpu);
1691 if (xen_have_vector_callback) { 1694 if (xen_have_vector_callback) {
1692 xen_init_lock_cpu(cpu);
1693 if (xen_feature(XENFEAT_hvm_safe_pvclock)) 1695 if (xen_feature(XENFEAT_hvm_safe_pvclock))
1694 xen_setup_timer(cpu); 1696 xen_setup_timer(cpu);
1695 } 1697 }
@@ -1710,6 +1712,8 @@ static void __init xen_hvm_guest_init(void)
1710 1712
1711 xen_hvm_init_shared_info(); 1713 xen_hvm_init_shared_info();
1712 1714
1715 xen_panic_handler_init();
1716
1713 if (xen_feature(XENFEAT_hvm_callback_vector)) 1717 if (xen_feature(XENFEAT_hvm_callback_vector))
1714 xen_have_vector_callback = 1; 1718 xen_have_vector_callback = 1;
1715 xen_hvm_smp_init(); 1719 xen_hvm_smp_init();
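
Annotation: in cvt_gate_to_trap() the NMI entry is no longer skipped together with double_fault; it now falls through so the native handler is kept, consistent with the NMI callback registration added in xen/setup.c further down. A standalone model of that address-substitution pattern follows; the stub functions stand in for the real trap entry points.

#include <stdio.h>

static void debug_native(void)   {}
static void xen_debug(void)      {}
static void double_fault_h(void) {}
static void nmi_native(void)     {}

/* returns the handler to install, or 0 when the vector is skipped */
static unsigned long cvt_handler_model(unsigned long addr)
{
	if (addr == (unsigned long)debug_native)
		return (unsigned long)xen_debug;     /* swap for the Xen entry   */
	else if (addr == (unsigned long)double_fault_h)
		return 0;                            /* don't need to handle it  */
	else if (addr == (unsigned long)nmi_native)
		return addr;                         /* keep the native version  */
	return addr;
}

int main(void)
{
	printf("double_fault skipped: %d\n",
	       cvt_handler_model((unsigned long)double_fault_h) == 0);
	printf("nmi kept native: %d\n",
	       cvt_handler_model((unsigned long)nmi_native) ==
	       (unsigned long)nmi_native);
	return 0;
}
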
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae1..0da7f863056f 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags)
47 /* convert from IF type flag */ 47 /* convert from IF type flag */
48 flags = !(flags & X86_EFLAGS_IF); 48 flags = !(flags & X86_EFLAGS_IF);
49 49
50 /* There's a one instruction preempt window here. We need to 50 /* See xen_irq_enable() for why preemption must be disabled. */
51 make sure we don't switch CPUs between getting the vcpu
52 pointer and updating the mask. */
53 preempt_disable(); 51 preempt_disable();
54 vcpu = this_cpu_read(xen_vcpu); 52 vcpu = this_cpu_read(xen_vcpu);
55 vcpu->evtchn_upcall_mask = flags; 53 vcpu->evtchn_upcall_mask = flags;
56 preempt_enable_no_resched();
57
58 /* Doesn't matter if we get preempted here, because any
59 pending event will get dealt with anyway. */
60 54
61 if (flags == 0) { 55 if (flags == 0) {
62 preempt_check_resched();
63 barrier(); /* unmask then check (avoid races) */ 56 barrier(); /* unmask then check (avoid races) */
64 if (unlikely(vcpu->evtchn_upcall_pending)) 57 if (unlikely(vcpu->evtchn_upcall_pending))
65 xen_force_evtchn_callback(); 58 xen_force_evtchn_callback();
66 } 59 preempt_enable();
60 } else
61 preempt_enable_no_resched();
67} 62}
68PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); 63PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
69 64
@@ -82,10 +77,12 @@ static void xen_irq_enable(void)
82{ 77{
83 struct vcpu_info *vcpu; 78 struct vcpu_info *vcpu;
84 79
85 /* We don't need to worry about being preempted here, since 80 /*
86 either a) interrupts are disabled, so no preemption, or b) 81 * We may be preempted as soon as vcpu->evtchn_upcall_mask is
87 the caller is confused and is trying to re-enable interrupts 82 * cleared, so disable preemption to ensure we check for
88 on an indeterminate processor. */ 83 * events on the VCPU we are still running on.
84 */
85 preempt_disable();
89 86
90 vcpu = this_cpu_read(xen_vcpu); 87 vcpu = this_cpu_read(xen_vcpu);
91 vcpu->evtchn_upcall_mask = 0; 88 vcpu->evtchn_upcall_mask = 0;
@@ -96,6 +93,8 @@ static void xen_irq_enable(void)
96 barrier(); /* unmask then check (avoid races) */ 93 barrier(); /* unmask then check (avoid races) */
97 if (unlikely(vcpu->evtchn_upcall_pending)) 94 if (unlikely(vcpu->evtchn_upcall_pending))
98 xen_force_evtchn_callback(); 95 xen_force_evtchn_callback();
96
97 preempt_enable();
99} 98}
100PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); 99PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
101 100
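
Annotation: both xen_restore_fl() and xen_irq_enable() now keep preemption disabled from clearing evtchn_upcall_mask until the pending-event check, so the check runs against the same VCPU whose mask was just cleared. A standalone model of that unmask-then-check pattern; the preempt and per-CPU helpers are no-op stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct vcpu_info_model {
	bool upcall_mask;
	bool upcall_pending;
};

static struct vcpu_info_model this_vcpu = { .upcall_mask = true };

static void preempt_disable_model(void) { /* would bump preempt_count   */ }
static void preempt_enable_model(void)  { /* would drop it and resched  */ }
static void force_callback_model(void)  { puts("delivering pending event"); }

static void irq_enable_model(void)
{
	struct vcpu_info_model *vcpu;

	preempt_disable_model();
	vcpu = &this_vcpu;              /* per-CPU lookup in the real code */
	vcpu->upcall_mask = false;

	/* unmask then check: a compiler barrier sits here in the real code */
	if (vcpu->upcall_pending)
		force_callback_model();

	preempt_enable_model();
}

int main(void)
{
	this_vcpu.upcall_pending = true;
	irq_enable_model();
	return 0;
}
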
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927e..8b901e8d782d 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -161,6 +161,7 @@
161#include <asm/xen/page.h> 161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h> 162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h> 163#include <asm/xen/hypervisor.h>
164#include <xen/balloon.h>
164#include <xen/grant_table.h> 165#include <xen/grant_table.h>
165 166
166#include "multicalls.h" 167#include "multicalls.h"
@@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page,
967 if (kmap_op != NULL) { 968 if (kmap_op != NULL) {
968 if (!PageHighMem(page)) { 969 if (!PageHighMem(page)) {
969 struct multicall_space mcs; 970 struct multicall_space mcs;
970 struct gnttab_unmap_grant_ref *unmap_op; 971 struct gnttab_unmap_and_replace *unmap_op;
972 struct page *scratch_page = get_balloon_scratch_page();
973 unsigned long scratch_page_address = (unsigned long)
974 __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
971 975
972 /* 976 /*
973 * It might be that we queued all the m2p grant table 977 * It might be that we queued all the m2p grant table
@@ -986,25 +990,31 @@ int m2p_remove_override(struct page *page,
986 printk(KERN_WARNING "m2p_remove_override: " 990 printk(KERN_WARNING "m2p_remove_override: "
987 "pfn %lx mfn %lx, failed to modify kernel mappings", 991 "pfn %lx mfn %lx, failed to modify kernel mappings",
988 pfn, mfn); 992 pfn, mfn);
993 put_balloon_scratch_page();
989 return -1; 994 return -1;
990 } 995 }
991 996
992 mcs = xen_mc_entry( 997 xen_mc_batch();
993 sizeof(struct gnttab_unmap_grant_ref)); 998
999 mcs = __xen_mc_entry(
1000 sizeof(struct gnttab_unmap_and_replace));
994 unmap_op = mcs.args; 1001 unmap_op = mcs.args;
995 unmap_op->host_addr = kmap_op->host_addr; 1002 unmap_op->host_addr = kmap_op->host_addr;
1003 unmap_op->new_addr = scratch_page_address;
996 unmap_op->handle = kmap_op->handle; 1004 unmap_op->handle = kmap_op->handle;
997 unmap_op->dev_bus_addr = 0;
998 1005
999 MULTI_grant_table_op(mcs.mc, 1006 MULTI_grant_table_op(mcs.mc,
1000 GNTTABOP_unmap_grant_ref, unmap_op, 1); 1007 GNTTABOP_unmap_and_replace, unmap_op, 1);
1008
1009 mcs = __xen_mc_entry(0);
1010 MULTI_update_va_mapping(mcs.mc, scratch_page_address,
1011 pfn_pte(page_to_pfn(scratch_page),
1012 PAGE_KERNEL_RO), 0);
1001 1013
1002 xen_mc_issue(PARAVIRT_LAZY_MMU); 1014 xen_mc_issue(PARAVIRT_LAZY_MMU);
1003 1015
1004 set_pte_at(&init_mm, address, ptep,
1005 pfn_pte(pfn, PAGE_KERNEL));
1006 __flush_tlb_single(address);
1007 kmap_op->host_addr = 0; 1016 kmap_op->host_addr = 0;
1017 put_balloon_scratch_page();
1008 } 1018 }
1009 } 1019 }
1010 1020
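
Annotation: the !PageHighMem path of m2p_remove_override() now queues a GNTTABOP_unmap_and_replace that redirects the old kernel VA to a balloon scratch page, plus an update_va_mapping that restores the scratch page's own mapping, and issues both in one multicall instead of unmapping and rewriting the PTE by hand. Below is a standalone model of that batch-two-ops-then-issue pattern; the multicall API and addresses are simplified stand-ins.

#include <stdio.h>

enum op_kind { OP_UNMAP_AND_REPLACE, OP_UPDATE_VA_MAPPING };

struct mc_entry_model { enum op_kind kind; unsigned long arg; };

static struct mc_entry_model batch[8];
static int batch_len;

static void mc_batch_model(void) { batch_len = 0; }

static void mc_entry_model(enum op_kind kind, unsigned long arg)
{
	batch[batch_len].kind = kind;
	batch[batch_len].arg = arg;
	batch_len++;
}

static void mc_issue_model(void)
{
	int i;

	for (i = 0; i < batch_len; i++)
		printf("op %d arg %#lx\n", batch[i].kind, batch[i].arg);
}

static void remove_override_model(unsigned long host_addr,
                                  unsigned long scratch_addr)
{
	mc_batch_model();
	/* 1: unmap the grant and point the old VA at the scratch page */
	mc_entry_model(OP_UNMAP_AND_REPLACE, host_addr);
	/* 2: restore the scratch page's own (read-only) mapping */
	mc_entry_model(OP_UPDATE_VA_MAPPING, scratch_addr);
	mc_issue_model();
}

int main(void)
{
	remove_override_model(0xffff880000001000UL, 0xffff880000002000UL);
	return 0;
}
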
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8f3eea6b80c5..09f3059cb00b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -33,6 +33,9 @@
33/* These are code, but not functions. Defined in entry.S */ 33/* These are code, but not functions. Defined in entry.S */
34extern const char xen_hypervisor_callback[]; 34extern const char xen_hypervisor_callback[];
35extern const char xen_failsafe_callback[]; 35extern const char xen_failsafe_callback[];
36#ifdef CONFIG_X86_64
37extern const char nmi[];
38#endif
36extern void xen_sysenter_target(void); 39extern void xen_sysenter_target(void);
37extern void xen_syscall_target(void); 40extern void xen_syscall_target(void);
38extern void xen_syscall32_target(void); 41extern void xen_syscall32_target(void);
@@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk(
215 unsigned long pfn; 218 unsigned long pfn;
216 219
217 /* 220 /*
218 * If the PFNs are currently mapped, the VA mapping also needs 221 * If the PFNs are currently mapped, clear the mappings
219 * to be updated to be 1:1. 222 * (except for the ISA region which must be 1:1 mapped) to
223 * release the refcounts (in Xen) on the original frames.
220 */ 224 */
221 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) 225 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
226 pte_t pte = __pte_ma(0);
227
228 if (pfn < PFN_UP(ISA_END_ADDRESS))
229 pte = mfn_pte(pfn, PAGE_KERNEL_IO);
230
222 (void)HYPERVISOR_update_va_mapping( 231 (void)HYPERVISOR_update_va_mapping(
223 (unsigned long)__va(pfn << PAGE_SHIFT), 232 (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
224 mfn_pte(pfn, PAGE_KERNEL_IO), 0); 233 }
225 234
226 if (start_pfn < nr_pages) 235 if (start_pfn < nr_pages)
227 *released += xen_release_chunk( 236 *released += xen_release_chunk(
@@ -547,7 +556,13 @@ void xen_enable_syscall(void)
547 } 556 }
548#endif /* CONFIG_X86_64 */ 557#endif /* CONFIG_X86_64 */
549} 558}
550 559void __cpuinit xen_enable_nmi(void)
560{
561#ifdef CONFIG_X86_64
562 if (register_callback(CALLBACKTYPE_nmi, nmi))
563 BUG();
564#endif
565}
551void __init xen_arch_setup(void) 566void __init xen_arch_setup(void)
552{ 567{
553 xen_panic_handler_init(); 568 xen_panic_handler_init();
@@ -565,7 +580,7 @@ void __init xen_arch_setup(void)
565 580
566 xen_enable_sysenter(); 581 xen_enable_sysenter();
567 xen_enable_syscall(); 582 xen_enable_syscall();
568 583 xen_enable_nmi();
569#ifdef CONFIG_ACPI 584#ifdef CONFIG_ACPI
570 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 585 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
571 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 586 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
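
Annotation: xen_set_identity_and_release_chunk() now writes an empty PTE for each currently mapped PFN being released, so Xen drops its reference on the frame, and keeps only the ISA region 1:1 mapped; the file also gains the 64-bit NMI callback registration. A standalone model of the per-PFN PTE choice follows, with stand-in constants and a fake PTE encoding.

#include <stdio.h>

#define PAGE_SHIFT      12
#define ISA_END_ADDRESS 0x100000UL
#define PFN_UP(x)       (((x) + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)

static unsigned long pte_for_pfn(unsigned long pfn)
{
	unsigned long pte = 0;                    /* models __pte_ma(0): unmapped */

	if (pfn < PFN_UP(ISA_END_ADDRESS))
		pte = (pfn << PAGE_SHIFT) | 1;    /* keep a 1:1 mapping for ISA  */
	return pte;
}

int main(void)
{
	printf("pfn 0x10   -> %#lx\n", pte_for_pfn(0x10));   /* ISA region: mapped */
	printf("pfn 0x1000 -> %#lx\n", pte_for_pfn(0x1000)); /* cleared, released  */
	return 0;
}
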
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 597655bd72b0..d1e4777b4e75 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -273,12 +273,20 @@ static void __init xen_smp_prepare_boot_cpu(void)
273 BUG_ON(smp_processor_id() != 0); 273 BUG_ON(smp_processor_id() != 0);
274 native_smp_prepare_boot_cpu(); 274 native_smp_prepare_boot_cpu();
275 275
276 /* We've switched to the "real" per-cpu gdt, so make sure the 276 if (xen_pv_domain()) {
277 old memory can be recycled */ 277 /* We've switched to the "real" per-cpu gdt, so make sure the
278 make_lowmem_page_readwrite(xen_initial_gdt); 278 old memory can be recycled */
279 make_lowmem_page_readwrite(xen_initial_gdt);
279 280
280 xen_filter_cpu_maps(); 281 xen_filter_cpu_maps();
281 xen_setup_vcpu_info_placement(); 282 xen_setup_vcpu_info_placement();
283 }
284 /*
285 * The alternative logic (which patches the unlock/lock) runs before
286 * the smp bootup code is activated. Hence we need to set this up
287 * before the core kernel is patched. Otherwise we will have only
288 * modules patched but not core code.
289 */
282 xen_init_spinlocks(); 290 xen_init_spinlocks();
283} 291}
284 292
@@ -573,6 +581,12 @@ static inline int xen_map_vector(int vector)
573 case IRQ_WORK_VECTOR: 581 case IRQ_WORK_VECTOR:
574 xen_vector = XEN_IRQ_WORK_VECTOR; 582 xen_vector = XEN_IRQ_WORK_VECTOR;
575 break; 583 break;
584#ifdef CONFIG_X86_64
585 case NMI_VECTOR:
586 case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
587 xen_vector = XEN_NMI_VECTOR;
588 break;
589#endif
576 default: 590 default:
577 xen_vector = -1; 591 xen_vector = -1;
578 printk(KERN_ERR "xen: vector 0x%x is not implemented\n", 592 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -703,6 +717,15 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
703 WARN_ON(rc); 717 WARN_ON(rc);
704 if (!rc) 718 if (!rc)
705 rc = native_cpu_up(cpu, tidle); 719 rc = native_cpu_up(cpu, tidle);
720
721 /*
722 * We must initialize the slowpath CPU kicker _after_ the native
722 * path has executed. If we initialized it before, none of the
724 * unlocker IPI kicks would reach the booting CPU as the booting
725 * CPU had not set itself 'online' in cpu_online_mask. That mask
726 * is checked when IPIs are sent (on HVM at least).
727 */
728 xen_init_lock_cpu(cpu);
706 return rc; 729 return rc;
707} 730}
708 731
@@ -722,4 +745,5 @@ void __init xen_hvm_smp_init(void)
722 smp_ops.cpu_die = xen_hvm_cpu_die; 745 smp_ops.cpu_die = xen_hvm_cpu_die;
723 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 746 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
724 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 747 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
748 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
725} 749}
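
Annotation: xen_hvm_cpu_up() now sets up the per-CPU spinlock kicker IPI only after native_cpu_up(), for the reason given in the new comment: an earlier kick would target a CPU that has not yet marked itself online, so it would be dropped. A standalone model of that ordering, with stand-in flags instead of cpu_online_mask and real IRQ bindings.

#include <stdbool.h>
#include <stdio.h>

static bool cpu_online[4];
static bool kicker_ready[4];

static int native_cpu_up_model(unsigned cpu)
{
	cpu_online[cpu] = true;     /* the booting CPU sets itself online */
	return 0;
}

static void init_lock_cpu_model(unsigned cpu)
{
	kicker_ready[cpu] = true;   /* unlock-kick IPIs will now be seen */
}

static int hvm_cpu_up_model(unsigned cpu)
{
	int rc = native_cpu_up_model(cpu);

	/* must run after the native path, per the comment in the hunk above */
	init_lock_cpu_model(cpu);
	return rc;
}

int main(void)
{
	hvm_cpu_up_model(1);
	printf("cpu1 online=%d kicker=%d\n", cpu_online[1], kicker_ready[1]);
	return 0;
}
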
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 0438b9324a72..253f63fceea1 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -81,7 +81,6 @@ static inline void spin_time_accum_blocked(u64 start)
81 spinlock_stats.time_blocked += delta; 81 spinlock_stats.time_blocked += delta;
82} 82}
83#else /* !CONFIG_XEN_DEBUG_FS */ 83#else /* !CONFIG_XEN_DEBUG_FS */
84#define TIMEOUT (1 << 10)
85static inline void add_stats(enum xen_contention_stat var, u32 val) 84static inline void add_stats(enum xen_contention_stat var, u32 val)
86{ 85{
87} 86}
@@ -96,23 +95,6 @@ static inline void spin_time_accum_blocked(u64 start)
96} 95}
97#endif /* CONFIG_XEN_DEBUG_FS */ 96#endif /* CONFIG_XEN_DEBUG_FS */
98 97
99/*
100 * Size struct xen_spinlock so it's the same as arch_spinlock_t.
101 */
102#if NR_CPUS < 256
103typedef u8 xen_spinners_t;
104# define inc_spinners(xl) \
105 asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
106# define dec_spinners(xl) \
107 asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
108#else
109typedef u16 xen_spinners_t;
110# define inc_spinners(xl) \
111 asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
112# define dec_spinners(xl) \
113 asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
114#endif
115
116struct xen_lock_waiting { 98struct xen_lock_waiting {
117 struct arch_spinlock *lock; 99 struct arch_spinlock *lock;
118 __ticket_t want; 100 __ticket_t want;
@@ -123,6 +105,7 @@ static DEFINE_PER_CPU(char *, irq_name);
123static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); 105static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
124static cpumask_t waiting_cpus; 106static cpumask_t waiting_cpus;
125 107
108static bool xen_pvspin = true;
126static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) 109static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
127{ 110{
128 int irq = __this_cpu_read(lock_kicker_irq); 111 int irq = __this_cpu_read(lock_kicker_irq);
@@ -241,16 +224,12 @@ void xen_init_lock_cpu(int cpu)
241 int irq; 224 int irq;
242 char *name; 225 char *name;
243 226
227 if (!xen_pvspin)
228 return;
229
244 WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", 230 WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
245 cpu, per_cpu(lock_kicker_irq, cpu)); 231 cpu, per_cpu(lock_kicker_irq, cpu));
246 232
247 /*
248 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
249 * (xen: disable PV spinlocks on HVM)
250 */
251 if (xen_hvm_domain())
252 return;
253
254 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); 233 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
255 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, 234 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
256 cpu, 235 cpu,
@@ -270,11 +249,7 @@ void xen_init_lock_cpu(int cpu)
270 249
271void xen_uninit_lock_cpu(int cpu) 250void xen_uninit_lock_cpu(int cpu)
272{ 251{
273 /* 252 if (!xen_pvspin)
274 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
275 * (xen: disable PV spinlocks on HVM)
276 */
277 if (xen_hvm_domain())
278 return; 253 return;
279 254
280 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); 255 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
@@ -283,16 +258,9 @@ void xen_uninit_lock_cpu(int cpu)
283 per_cpu(irq_name, cpu) = NULL; 258 per_cpu(irq_name, cpu) = NULL;
284} 259}
285 260
286static bool xen_pvspin __initdata = true;
287 261
288void __init xen_init_spinlocks(void) 262void __init xen_init_spinlocks(void)
289{ 263{
290 /*
291 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
292 * (xen: disable PV spinlocks on HVM)
293 */
294 if (xen_hvm_domain())
295 return;
296 264
297 if (!xen_pvspin) { 265 if (!xen_pvspin) {
298 printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); 266 printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
@@ -323,6 +291,9 @@ static int __init xen_spinlock_debugfs(void)
323 if (d_xen == NULL) 291 if (d_xen == NULL)
324 return -ENOMEM; 292 return -ENOMEM;
325 293
294 if (!xen_pvspin)
295 return 0;
296
326 d_spin_debug = debugfs_create_dir("spinlocks", d_xen); 297 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
327 298
328 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); 299 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);