Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/camellia_glue.c | 64
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S | 643
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c | 151
-rw-r--r--  arch/x86/include/asm/dma-contiguous.h | 1
-rw-r--r--  arch/x86/include/asm/jump_label.h | 9
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/pci.h | 30
-rw-r--r--  arch/x86/include/asm/pgtable.h | 34
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 37
-rw-r--r--  arch/x86/include/asm/vmx.h | 2
-rw-r--r--  arch/x86/include/asm/xen/events.h | 1
-rw-r--r--  arch/x86/include/asm/xor_avx.h | 4
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 2
-rw-r--r--  arch/x86/kernel/devicetree.c | 3
-rw-r--r--  arch/x86/kernel/early-quirks.c | 154
-rw-r--r--  arch/x86/kernel/entry_32.S | 3
-rw-r--r--  arch/x86/kernel/jump_label.c | 70
-rw-r--r--  arch/x86/kernel/paravirt.c | 5
-rw-r--r--  arch/x86/kernel/pvclock.c | 44
-rw-r--r--  arch/x86/kernel/x86_init.c | 24
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 38
-rw-r--r--  arch/x86/kvm/mmu.c | 181
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 178
-rw-r--r--  arch/x86/kvm/pmu.c | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 441
-rw-r--r--  arch/x86/kvm/x86.c | 224
-rw-r--r--  arch/x86/lguest/boot.c | 10
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 8
-rw-r--r--  arch/x86/mm/tlb.c | 14
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 18
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 24
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 2
-rw-r--r--  arch/x86/um/os-Linux/prctl.c | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 16
-rw-r--r--  arch/x86/xen/enlighten.c | 16
-rw-r--r--  arch/x86/xen/irq.c | 25
-rw-r--r--  arch/x86/xen/p2m.c | 26
-rw-r--r--  arch/x86/xen/setup.c | 29
-rw-r--r--  arch/x86/xen/smp.c | 34
-rw-r--r--  arch/x86/xen/spinlock.c | 45
47 files changed, 2137 insertions, 532 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c0ed72c02a2..30c40f08a3d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2032,7 +2032,6 @@ menu "Bus options (PCI etc.)"
 config PCI
 	bool "PCI support"
 	default y
-	select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
 	---help---
 	  Find out whether you have a PCI motherboard. PCI is the name of a
 	  bus system, i.e. the way the CPU talks to the other stuff inside
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 6c63c358a7e6..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -81,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 5cb86ccd4acb..c171dcbf192d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 }
 
 /* camellia sboxes */
-const u64 camellia_sp10011110[256] = {
+__visible const u64 camellia_sp10011110[256] = {
 	0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
 	0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
 	0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
@@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = {
 	0x9e00009e9e9e9e00ULL,
 };
 
-const u64 camellia_sp22000222[256] = {
+__visible const u64 camellia_sp22000222[256] = {
 	0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
 	0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
 	0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
@@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = {
 	0x3d3d0000003d3d3dULL,
 };
 
-const u64 camellia_sp03303033[256] = {
+__visible const u64 camellia_sp03303033[256] = {
 	0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
 	0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
 	0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
@@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = {
 	0x004f4f004f004f4fULL,
 };
 
-const u64 camellia_sp00444404[256] = {
+__visible const u64 camellia_sp00444404[256] = {
 	0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
 	0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
 	0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
@@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = {
 	0x00009e9e9e9e009eULL,
 };
 
-const u64 camellia_sp02220222[256] = {
+__visible const u64 camellia_sp02220222[256] = {
 	0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
 	0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
 	0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
@@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = {
 	0x003d3d3d003d3d3dULL,
 };
 
-const u64 camellia_sp30333033[256] = {
+__visible const u64 camellia_sp30333033[256] = {
 	0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
 	0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
 	0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
@@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = {
 	0x4f004f4f4f004f4fULL,
 };
 
-const u64 camellia_sp44044404[256] = {
+__visible const u64 camellia_sp44044404[256] = {
 	0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
 	0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
 	0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
@@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = {
 	0x9e9e009e9e9e009eULL,
 };
 
-const u64 camellia_sp11101110[256] = {
+__visible const u64 camellia_sp11101110[256] = {
 	0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
 	0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
 	0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
@@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
 	/* modified for FLinv(kl2) */
-	dw = (subRL[1] & subRL[9]) >> 32,
-		subRL[1] ^= rol32(dw, 1);
+	dw = (subRL[1] & subRL[9]) >> 32;
+	subRL[1] ^= rol32(dw, 1);
 
 	/* round 8 */
 	subRL[11] ^= subRL[1];
@@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
 	/* modified for FLinv(kl4) */
-	dw = (subRL[1] & subRL[17]) >> 32,
-		subRL[1] ^= rol32(dw, 1);
+	dw = (subRL[1] & subRL[17]) >> 32;
+	subRL[1] ^= rol32(dw, 1);
 
 	/* round 14 */
 	subRL[19] ^= subRL[1];
@@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	} else {
 		subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
 		/* modified for FLinv(kl6) */
-		dw = (subRL[1] & subRL[25]) >> 32,
-			subRL[1] ^= rol32(dw, 1);
+		dw = (subRL[1] & subRL[25]) >> 32;
+		subRL[1] ^= rol32(dw, 1);
 
 		/* round 20 */
 		subRL[27] ^= subRL[1];
@@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 		kw4 ^= (kw4 & ~subRL[24]) << 32;
 		/* modified for FL(kl5) */
-		dw = (kw4 & subRL[24]) >> 32,
-			kw4 ^= rol32(dw, 1);
+		dw = (kw4 & subRL[24]) >> 32;
+		kw4 ^= rol32(dw, 1);
 	}
 
 	/* round 17 */
@@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	kw4 ^= (kw4 & ~subRL[16]) << 32;
 	/* modified for FL(kl3) */
-	dw = (kw4 & subRL[16]) >> 32,
-		kw4 ^= rol32(dw, 1);
+	dw = (kw4 & subRL[16]) >> 32;
+	kw4 ^= rol32(dw, 1);
 
 	/* round 11 */
 	subRL[14] ^= kw4;
@@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 
 	kw4 ^= (kw4 & ~subRL[8]) << 32;
 	/* modified for FL(kl1) */
-	dw = (kw4 & subRL[8]) >> 32,
-		kw4 ^= rol32(dw, 1);
+	dw = (kw4 & subRL[8]) >> 32;
+	kw4 ^= rol32(dw, 1);
 
 	/* round 5 */
 	subRL[6] ^= kw4;
@@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]);		/* round 5 */
 
 	tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
-	dw = tl & (subRL[8] >> 32),			/* FL(kl1) */
-		tr = subRL[10] ^ rol32(dw, 1);
+	dw = tl & (subRL[8] >> 32);			/* FL(kl1) */
+	tr = subRL[10] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(7, subRL[6] ^ tt);		/* round 6 */
@@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(9, subRL[9]);			/* FLinv(kl2) */
 
 	tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
-	dw = tl & (subRL[9] >> 32),			/* FLinv(kl2) */
-		tr = subRL[7] ^ rol32(dw, 1);
+	dw = tl & (subRL[9] >> 32);			/* FLinv(kl2) */
+	tr = subRL[7] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(10, subRL[11] ^ tt);		/* round 7 */
@@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]);	/* round 11 */
 
 	tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
-	dw = tl & (subRL[16] >> 32),			/* FL(kl3) */
-		tr = subRL[18] ^ rol32(dw, 1);
+	dw = tl & (subRL[16] >> 32);			/* FL(kl3) */
+	tr = subRL[18] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(15, subRL[14] ^ tt);		/* round 12 */
@@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 	SET_SUBKEY_LR(17, subRL[17]);			/* FLinv(kl4) */
 
 	tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
-	dw = tl & (subRL[17] >> 32),			/* FLinv(kl4) */
-		tr = subRL[15] ^ rol32(dw, 1);
+	dw = tl & (subRL[17] >> 32);			/* FLinv(kl4) */
+	tr = subRL[15] ^ rol32(dw, 1);
 	tt = (tr | ((u64)tl << 32));
 
 	SET_SUBKEY_LR(18, subRL[19] ^ tt);		/* round 13 */
@@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 		SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]);	/* kw3 */
 	} else {
 		tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
-		dw = tl & (subRL[24] >> 32),			/* FL(kl5) */
-			tr = subRL[26] ^ rol32(dw, 1);
+		dw = tl & (subRL[24] >> 32);			/* FL(kl5) */
+		tr = subRL[26] ^ rol32(dw, 1);
 		tt = (tr | ((u64)tl << 32));
 
 		SET_SUBKEY_LR(23, subRL[22] ^ tt);		/* round 18 */
@@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
 		SET_SUBKEY_LR(25, subRL[25]);			/* FLinv(kl6) */
 
 		tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
-		dw = tl & (subRL[25] >> 32),			/* FLinv(kl6) */
-			tr = subRL[23] ^ rol32(dw, 1);
+		dw = tl & (subRL[25] >> 32);			/* FLinv(kl6) */
+		tr = subRL[23] ^ rol32(dw, 1);
 		tt = (tr | ((u64)tl << 32));
 
 		SET_SUBKEY_LR(26, subRL[27] ^ tt);		/* round 19 */
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..35e97569d05f
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,643 @@
1########################################################################
2# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
3#
4# Copyright (c) 2013, Intel Corporation
5#
6# Authors:
7# Erdinc Ozturk <erdinc.ozturk@intel.com>
8# Vinodh Gopal <vinodh.gopal@intel.com>
9# James Guilford <james.guilford@intel.com>
10# Tim Chen <tim.c.chen@linux.intel.com>
11#
12# This software is available to you under a choice of one of two
13# licenses. You may choose to be licensed under the terms of the GNU
14# General Public License (GPL) Version 2, available from the file
15# COPYING in the main directory of this source tree, or the
16# OpenIB.org BSD license below:
17#
18# Redistribution and use in source and binary forms, with or without
19# modification, are permitted provided that the following conditions are
20# met:
21#
22# * Redistributions of source code must retain the above copyright
23# notice, this list of conditions and the following disclaimer.
24#
25# * Redistributions in binary form must reproduce the above copyright
26# notice, this list of conditions and the following disclaimer in the
27# documentation and/or other materials provided with the
28# distribution.
29#
30# * Neither the name of the Intel Corporation nor the names of its
31# contributors may be used to endorse or promote products derived from
32# this software without specific prior written permission.
33#
34#
35# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
36# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
39# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46########################################################################
47# Function API:
48# UINT16 crc_t10dif_pcl(
49# UINT16 init_crc, //initial CRC value, 16 bits
50# const unsigned char *buf, //buffer pointer to calculate CRC on
51# UINT64 len //buffer length in bytes (64-bit data)
52# );
53#
54# Reference paper titled "Fast CRC Computation for Generic
55# Polynomials Using PCLMULQDQ Instruction"
56# URL: http://www.intel.com/content/dam/www/public/us/en/documents
57# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
58#
59#
60
61#include <linux/linkage.h>
62
63.text
64
65#define arg1 %rdi
66#define arg2 %rsi
67#define arg3 %rdx
68
69#define arg1_low32 %edi
70
71ENTRY(crc_t10dif_pcl)
72.align 16
73
74 # adjust the 16-bit initial_crc value, scale it to 32 bits
75 shl $16, arg1_low32
76
77 # Allocate Stack Space
78 mov %rsp, %rcx
79 sub $16*2, %rsp
80 # align stack to 16 byte boundary
81 and $~(0x10 - 1), %rsp
82
83 # check if smaller than 256
84 cmp $256, arg3
85
86 # for sizes less than 128, we can't fold 64B at a time...
87 jl _less_than_128
88
89
90 # load the initial crc value
91 movd arg1_low32, %xmm10 # initial crc
92
93 # crc value does not need to be byte-reflected, but it needs
94 # to be moved to the high part of the register.
95 # because data will be byte-reflected and will align with
96 # initial crc at correct place.
97 pslldq $12, %xmm10
98
99 movdqa SHUF_MASK(%rip), %xmm11
100 # receive the initial 64B data, xor the initial crc value
101 movdqu 16*0(arg2), %xmm0
102 movdqu 16*1(arg2), %xmm1
103 movdqu 16*2(arg2), %xmm2
104 movdqu 16*3(arg2), %xmm3
105 movdqu 16*4(arg2), %xmm4
106 movdqu 16*5(arg2), %xmm5
107 movdqu 16*6(arg2), %xmm6
108 movdqu 16*7(arg2), %xmm7
109
110 pshufb %xmm11, %xmm0
111 # XOR the initial_crc value
112 pxor %xmm10, %xmm0
113 pshufb %xmm11, %xmm1
114 pshufb %xmm11, %xmm2
115 pshufb %xmm11, %xmm3
116 pshufb %xmm11, %xmm4
117 pshufb %xmm11, %xmm5
118 pshufb %xmm11, %xmm6
119 pshufb %xmm11, %xmm7
120
121 movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
122 #imm value of pclmulqdq instruction
123 #will determine which constant to use
124
125 #################################################################
126 # we subtract 256 instead of 128 to save one instruction from the loop
127 sub $256, arg3
128
129 # at this section of the code, there is 64*x+y (0<=y<64) bytes of
130 # buffer. The _fold_64_B_loop will fold 64B at a time
131 # until we have 64+y Bytes of buffer
132
133
134 # fold 64B at a time. This section of the code folds 4 xmm
135 # registers in parallel
136_fold_64_B_loop:
137
138 # update the buffer pointer
139 add $128, arg2 # buf += 64#
140
141 movdqu 16*0(arg2), %xmm9
142 movdqu 16*1(arg2), %xmm12
143 pshufb %xmm11, %xmm9
144 pshufb %xmm11, %xmm12
145 movdqa %xmm0, %xmm8
146 movdqa %xmm1, %xmm13
147 pclmulqdq $0x0 , %xmm10, %xmm0
148 pclmulqdq $0x11, %xmm10, %xmm8
149 pclmulqdq $0x0 , %xmm10, %xmm1
150 pclmulqdq $0x11, %xmm10, %xmm13
151 pxor %xmm9 , %xmm0
152 xorps %xmm8 , %xmm0
153 pxor %xmm12, %xmm1
154 xorps %xmm13, %xmm1
155
156 movdqu 16*2(arg2), %xmm9
157 movdqu 16*3(arg2), %xmm12
158 pshufb %xmm11, %xmm9
159 pshufb %xmm11, %xmm12
160 movdqa %xmm2, %xmm8
161 movdqa %xmm3, %xmm13
162 pclmulqdq $0x0, %xmm10, %xmm2
163 pclmulqdq $0x11, %xmm10, %xmm8
164 pclmulqdq $0x0, %xmm10, %xmm3
165 pclmulqdq $0x11, %xmm10, %xmm13
166 pxor %xmm9 , %xmm2
167 xorps %xmm8 , %xmm2
168 pxor %xmm12, %xmm3
169 xorps %xmm13, %xmm3
170
171 movdqu 16*4(arg2), %xmm9
172 movdqu 16*5(arg2), %xmm12
173 pshufb %xmm11, %xmm9
174 pshufb %xmm11, %xmm12
175 movdqa %xmm4, %xmm8
176 movdqa %xmm5, %xmm13
177 pclmulqdq $0x0, %xmm10, %xmm4
178 pclmulqdq $0x11, %xmm10, %xmm8
179 pclmulqdq $0x0, %xmm10, %xmm5
180 pclmulqdq $0x11, %xmm10, %xmm13
181 pxor %xmm9 , %xmm4
182 xorps %xmm8 , %xmm4
183 pxor %xmm12, %xmm5
184 xorps %xmm13, %xmm5
185
186 movdqu 16*6(arg2), %xmm9
187 movdqu 16*7(arg2), %xmm12
188 pshufb %xmm11, %xmm9
189 pshufb %xmm11, %xmm12
190 movdqa %xmm6 , %xmm8
191 movdqa %xmm7 , %xmm13
192 pclmulqdq $0x0 , %xmm10, %xmm6
193 pclmulqdq $0x11, %xmm10, %xmm8
194 pclmulqdq $0x0 , %xmm10, %xmm7
195 pclmulqdq $0x11, %xmm10, %xmm13
196 pxor %xmm9 , %xmm6
197 xorps %xmm8 , %xmm6
198 pxor %xmm12, %xmm7
199 xorps %xmm13, %xmm7
200
201 sub $128, arg3
202
203 # check if there is another 64B in the buffer to be able to fold
204 jge _fold_64_B_loop
205 ##################################################################
206
207
208 add $128, arg2
209 # at this point, the buffer pointer is pointing at the last y Bytes
210 # of the buffer the 64B of folded data is in 4 of the xmm
211 # registers: xmm0, xmm1, xmm2, xmm3
212
213
214 # fold the 8 xmm registers to 1 xmm register with different constants
215
216 movdqa rk9(%rip), %xmm10
217 movdqa %xmm0, %xmm8
218 pclmulqdq $0x11, %xmm10, %xmm0
219 pclmulqdq $0x0 , %xmm10, %xmm8
220 pxor %xmm8, %xmm7
221 xorps %xmm0, %xmm7
222
223 movdqa rk11(%rip), %xmm10
224 movdqa %xmm1, %xmm8
225 pclmulqdq $0x11, %xmm10, %xmm1
226 pclmulqdq $0x0 , %xmm10, %xmm8
227 pxor %xmm8, %xmm7
228 xorps %xmm1, %xmm7
229
230 movdqa rk13(%rip), %xmm10
231 movdqa %xmm2, %xmm8
232 pclmulqdq $0x11, %xmm10, %xmm2
233 pclmulqdq $0x0 , %xmm10, %xmm8
234 pxor %xmm8, %xmm7
235 pxor %xmm2, %xmm7
236
237 movdqa rk15(%rip), %xmm10
238 movdqa %xmm3, %xmm8
239 pclmulqdq $0x11, %xmm10, %xmm3
240 pclmulqdq $0x0 , %xmm10, %xmm8
241 pxor %xmm8, %xmm7
242 xorps %xmm3, %xmm7
243
244 movdqa rk17(%rip), %xmm10
245 movdqa %xmm4, %xmm8
246 pclmulqdq $0x11, %xmm10, %xmm4
247 pclmulqdq $0x0 , %xmm10, %xmm8
248 pxor %xmm8, %xmm7
249 pxor %xmm4, %xmm7
250
251 movdqa rk19(%rip), %xmm10
252 movdqa %xmm5, %xmm8
253 pclmulqdq $0x11, %xmm10, %xmm5
254 pclmulqdq $0x0 , %xmm10, %xmm8
255 pxor %xmm8, %xmm7
256 xorps %xmm5, %xmm7
257
258 movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259 #imm value of pclmulqdq instruction
260 #will determine which constant to use
261 movdqa %xmm6, %xmm8
262 pclmulqdq $0x11, %xmm10, %xmm6
263 pclmulqdq $0x0 , %xmm10, %xmm8
264 pxor %xmm8, %xmm7
265 pxor %xmm6, %xmm7
266
267
268 # instead of 64, we add 48 to the loop counter to save 1 instruction
269 # from the loop instead of a cmp instruction, we use the negative
270 # flag with the jl instruction
271 add $128-16, arg3
272 jl _final_reduction_for_128
273
274 # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275 # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276 # continue folding 16B at a time
277
278_16B_reduction_loop:
279 movdqa %xmm7, %xmm8
280 pclmulqdq $0x11, %xmm10, %xmm7
281 pclmulqdq $0x0 , %xmm10, %xmm8
282 pxor %xmm8, %xmm7
283 movdqu (arg2), %xmm0
284 pshufb %xmm11, %xmm0
285 pxor %xmm0 , %xmm7
286 add $16, arg2
287 sub $16, arg3
288 # instead of a cmp instruction, we utilize the flags with the
289 # jge instruction equivalent of: cmp arg3, 16-16
290 # check if there is any more 16B in the buffer to be able to fold
291 jge _16B_reduction_loop
292
293 #now we have 16+z bytes left to reduce, where 0<= z < 16.
294 #first, we reduce the data in the xmm7 register
295
296
297_final_reduction_for_128:
298 # check if any more data to fold. If not, compute the CRC of
299 # the final 128 bits
300 add $16, arg3
301 je _128_done
302
303 # here we are getting data that is less than 16 bytes.
304 # since we know that there was data before the pointer, we can
305 # offset the input pointer before the actual point, to receive
306 # exactly 16 bytes. after that the registers need to be adjusted.
307_get_last_two_xmms:
308 movdqa %xmm7, %xmm2
309
310 movdqu -16(arg2, arg3), %xmm1
311 pshufb %xmm11, %xmm1
312
313 # get rid of the extra data that was loaded before
314 # load the shift constant
315 lea pshufb_shf_table+16(%rip), %rax
316 sub arg3, %rax
317 movdqu (%rax), %xmm0
318
319 # shift xmm2 to the left by arg3 bytes
320 pshufb %xmm0, %xmm2
321
322 # shift xmm7 to the right by 16-arg3 bytes
323 pxor mask1(%rip), %xmm0
324 pshufb %xmm0, %xmm7
325 pblendvb %xmm2, %xmm1 #xmm0 is implicit
326
327 # fold 16 Bytes
328 movdqa %xmm1, %xmm2
329 movdqa %xmm7, %xmm8
330 pclmulqdq $0x11, %xmm10, %xmm7
331 pclmulqdq $0x0 , %xmm10, %xmm8
332 pxor %xmm8, %xmm7
333 pxor %xmm2, %xmm7
334
335_128_done:
336 # compute crc of a 128-bit value
337 movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
338 movdqa %xmm7, %xmm0
339
340 #64b fold
341 pclmulqdq $0x1, %xmm10, %xmm7
342 pslldq $8 , %xmm0
343 pxor %xmm0, %xmm7
344
345 #32b fold
346 movdqa %xmm7, %xmm0
347
348 pand mask2(%rip), %xmm0
349
350 psrldq $12, %xmm7
351 pclmulqdq $0x10, %xmm10, %xmm7
352 pxor %xmm0, %xmm7
353
354 #barrett reduction
355_barrett:
356 movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
357 movdqa %xmm7, %xmm0
358 pclmulqdq $0x01, %xmm10, %xmm7
359 pslldq $4, %xmm7
360 pclmulqdq $0x11, %xmm10, %xmm7
361
362 pslldq $4, %xmm7
363 pxor %xmm0, %xmm7
364 pextrd $1, %xmm7, %eax
365
366_cleanup:
367 # scale the result back to 16 bits
368 shr $16, %eax
369 mov %rcx, %rsp
370 ret
371
372########################################################################
373
374.align 16
375_less_than_128:
376
377 # check if there is enough buffer to be able to fold 16B at a time
378 cmp $32, arg3
379 jl _less_than_32
380 movdqa SHUF_MASK(%rip), %xmm11
381
382 # now if there is, load the constants
383 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
384
385 movd arg1_low32, %xmm0 # get the initial crc value
386 pslldq $12, %xmm0 # align it to its correct place
387 movdqu (arg2), %xmm7 # load the plaintext
388 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
389 pxor %xmm0, %xmm7
390
391
392 # update the buffer pointer
393 add $16, arg2
394
395 # update the counter. subtract 32 instead of 16 to save one
396 # instruction from the loop
397 sub $32, arg3
398
399 jmp _16B_reduction_loop
400
401
402.align 16
403_less_than_32:
404 # mov initial crc to the return value. this is necessary for
405 # zero-length buffers.
406 mov arg1_low32, %eax
407 test arg3, arg3
408 je _cleanup
409
410 movdqa SHUF_MASK(%rip), %xmm11
411
412 movd arg1_low32, %xmm0 # get the initial crc value
413 pslldq $12, %xmm0 # align it to its correct place
414
415 cmp $16, arg3
416 je _exact_16_left
417 jl _less_than_16_left
418
419 movdqu (arg2), %xmm7 # load the plaintext
420 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421 pxor %xmm0 , %xmm7 # xor the initial crc value
422 add $16, arg2
423 sub $16, arg3
424 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425 jmp _get_last_two_xmms
426
427
428.align 16
429_less_than_16_left:
430 # use stack space to load data less than 16 bytes, zero-out
431 # the 16B in memory first.
432
433 pxor %xmm1, %xmm1
434 mov %rsp, %r11
435 movdqa %xmm1, (%r11)
436
437 cmp $4, arg3
438 jl _only_less_than_4
439
440 # backup the counter value
441 mov arg3, %r9
442 cmp $8, arg3
443 jl _less_than_8_left
444
445 # load 8 Bytes
446 mov (arg2), %rax
447 mov %rax, (%r11)
448 add $8, %r11
449 sub $8, arg3
450 add $8, arg2
451_less_than_8_left:
452
453 cmp $4, arg3
454 jl _less_than_4_left
455
456 # load 4 Bytes
457 mov (arg2), %eax
458 mov %eax, (%r11)
459 add $4, %r11
460 sub $4, arg3
461 add $4, arg2
462_less_than_4_left:
463
464 cmp $2, arg3
465 jl _less_than_2_left
466
467 # load 2 Bytes
468 mov (arg2), %ax
469 mov %ax, (%r11)
470 add $2, %r11
471 sub $2, arg3
472 add $2, arg2
473_less_than_2_left:
474 cmp $1, arg3
475 jl _zero_left
476
477 # load 1 Byte
478 mov (arg2), %al
479 mov %al, (%r11)
480_zero_left:
481 movdqa (%rsp), %xmm7
482 pshufb %xmm11, %xmm7
483 pxor %xmm0 , %xmm7 # xor the initial crc value
484
485 # shl r9, 4
486 lea pshufb_shf_table+16(%rip), %rax
487 sub %r9, %rax
488 movdqu (%rax), %xmm0
489 pxor mask1(%rip), %xmm0
490
491 pshufb %xmm0, %xmm7
492 jmp _128_done
493
494.align 16
495_exact_16_left:
496 movdqu (arg2), %xmm7
497 pshufb %xmm11, %xmm7
498 pxor %xmm0 , %xmm7 # xor the initial crc value
499
500 jmp _128_done
501
502_only_less_than_4:
503 cmp $3, arg3
504 jl _only_less_than_3
505
506 # load 3 Bytes
507 mov (arg2), %al
508 mov %al, (%r11)
509
510 mov 1(arg2), %al
511 mov %al, 1(%r11)
512
513 mov 2(arg2), %al
514 mov %al, 2(%r11)
515
516 movdqa (%rsp), %xmm7
517 pshufb %xmm11, %xmm7
518 pxor %xmm0 , %xmm7 # xor the initial crc value
519
520 psrldq $5, %xmm7
521
522 jmp _barrett
523_only_less_than_3:
524 cmp $2, arg3
525 jl _only_less_than_2
526
527 # load 2 Bytes
528 mov (arg2), %al
529 mov %al, (%r11)
530
531 mov 1(arg2), %al
532 mov %al, 1(%r11)
533
534 movdqa (%rsp), %xmm7
535 pshufb %xmm11, %xmm7
536 pxor %xmm0 , %xmm7 # xor the initial crc value
537
538 psrldq $6, %xmm7
539
540 jmp _barrett
541_only_less_than_2:
542
543 # load 1 Byte
544 mov (arg2), %al
545 mov %al, (%r11)
546
547 movdqa (%rsp), %xmm7
548 pshufb %xmm11, %xmm7
549 pxor %xmm0 , %xmm7 # xor the initial crc value
550
551 psrldq $7, %xmm7
552
553 jmp _barrett
554
555ENDPROC(crc_t10dif_pcl)
556
557.data
558
559# precomputed constants
560# these constants are precomputed from the poly:
561# 0x8bb70000 (0x8bb7 scaled to 32 bits)
562.align 16
563# Q = 0x18BB70000
564# rk1 = 2^(32*3) mod Q << 32
565# rk2 = 2^(32*5) mod Q << 32
566# rk3 = 2^(32*15) mod Q << 32
567# rk4 = 2^(32*17) mod Q << 32
568# rk5 = 2^(32*3) mod Q << 32
569# rk6 = 2^(32*2) mod Q << 32
570# rk7 = floor(2^64/Q)
571# rk8 = Q
572rk1:
573.quad 0x2d56000000000000
574rk2:
575.quad 0x06df000000000000
576rk3:
577.quad 0x9d9d000000000000
578rk4:
579.quad 0x7cf5000000000000
580rk5:
581.quad 0x2d56000000000000
582rk6:
583.quad 0x1368000000000000
584rk7:
585.quad 0x00000001f65a57f8
586rk8:
587.quad 0x000000018bb70000
588
589rk9:
590.quad 0xceae000000000000
591rk10:
592.quad 0xbfd6000000000000
593rk11:
594.quad 0x1e16000000000000
595rk12:
596.quad 0x713c000000000000
597rk13:
598.quad 0xf7f9000000000000
599rk14:
600.quad 0x80a6000000000000
601rk15:
602.quad 0x044c000000000000
603rk16:
604.quad 0xe658000000000000
605rk17:
606.quad 0xad18000000000000
607rk18:
608.quad 0xa497000000000000
609rk19:
610.quad 0x6ee3000000000000
611rk20:
612.quad 0xe7b5000000000000
613
614
615
616mask1:
617.octa 0x80808080808080808080808080808080
618mask2:
619.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
620
621SHUF_MASK:
622.octa 0x000102030405060708090A0B0C0D0E0F
623
624pshufb_shf_table:
625# use these values for shift constants for the pshufb instruction
626# different alignments result in values as shown:
627# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
628# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
629# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
630# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
631# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
632# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
633# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
634# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
635# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
636# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
637# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
638# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
639# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
640# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
641# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
642.octa 0x8f8e8d8c8b8a89888786858483828100
643.octa 0x000e0d0c0b0a09080706050403020100
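
The header of crct10dif-pcl-asm_64.S above documents the crc_t10dif_pcl() entry point and the 0x8bb7 polynomial from which the rk1-rk20 folding constants are derived. As a reference model only (not part of this patch), a bit-at-a-time C version of the same CRC looks like the sketch below; the PCLMULQDQ-folded routine and the crc_t10dif_generic() fallback are both expected to agree with it for any buffer:

	/*
	 * Minimal bit-at-a-time reference model for CRC-T10DIF
	 * (polynomial 0x8bb7, initial value 0, no byte reflection).
	 * Unoptimized on purpose; useful only as a sanity check.
	 */
	static u16 crc_t10dif_ref(u16 crc, const unsigned char *buf, size_t len)
	{
		size_t i;
		int bit;

		for (i = 0; i < len; i++) {
			crc ^= (u16)buf[i] << 8;	/* feed next byte, MSB first */
			for (bit = 0; bit < 8; bit++)
				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
						     : crc << 1;
		}
		return crc;
	}

The assembly reaches the same 16-bit value much faster by folding 64 bytes per iteration with pclmulqdq against rk3/rk4, collapsing the eight accumulators with rk9-rk20, and finishing with a Barrett reduction using rk7 (floor(2^64/Q)) and rk8 (Q).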
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000000..7845d7fd54c0
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,151 @@
1/*
2 * Cryptographic API.
3 *
4 * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
5 *
6 * Copyright (C) 2013 Intel Corporation
7 * Author: Tim Chen <tim.c.chen@linux.intel.com>
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the Free
11 * Software Foundation; either version 2 of the License, or (at your option)
12 * any later version.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/module.h>
27#include <linux/crc-t10dif.h>
28#include <crypto/internal/hash.h>
29#include <linux/init.h>
30#include <linux/string.h>
31#include <linux/kernel.h>
32#include <asm/i387.h>
33#include <asm/cpufeature.h>
34#include <asm/cpu_device_id.h>
35
36asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
37 size_t len);
38
39struct chksum_desc_ctx {
40 __u16 crc;
41};
42
43/*
44 * Steps through buffer one byte at at time, calculates reflected
45 * crc using table.
46 */
47
48static int chksum_init(struct shash_desc *desc)
49{
50 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
51
52 ctx->crc = 0;
53
54 return 0;
55}
56
57static int chksum_update(struct shash_desc *desc, const u8 *data,
58 unsigned int length)
59{
60 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
61
62 if (irq_fpu_usable()) {
63 kernel_fpu_begin();
64 ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
65 kernel_fpu_end();
66 } else
67 ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
68 return 0;
69}
70
71static int chksum_final(struct shash_desc *desc, u8 *out)
72{
73 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
74
75 *(__u16 *)out = ctx->crc;
76 return 0;
77}
78
79static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
80 u8 *out)
81{
82 if (irq_fpu_usable()) {
83 kernel_fpu_begin();
84 *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
85 kernel_fpu_end();
86 } else
87 *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
88 return 0;
89}
90
91static int chksum_finup(struct shash_desc *desc, const u8 *data,
92 unsigned int len, u8 *out)
93{
94 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
95
96 return __chksum_finup(&ctx->crc, data, len, out);
97}
98
99static int chksum_digest(struct shash_desc *desc, const u8 *data,
100 unsigned int length, u8 *out)
101{
102 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
103
104 return __chksum_finup(&ctx->crc, data, length, out);
105}
106
107static struct shash_alg alg = {
108 .digestsize = CRC_T10DIF_DIGEST_SIZE,
109 .init = chksum_init,
110 .update = chksum_update,
111 .final = chksum_final,
112 .finup = chksum_finup,
113 .digest = chksum_digest,
114 .descsize = sizeof(struct chksum_desc_ctx),
115 .base = {
116 .cra_name = "crct10dif",
117 .cra_driver_name = "crct10dif-pclmul",
118 .cra_priority = 200,
119 .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
120 .cra_module = THIS_MODULE,
121 }
122};
123
124static const struct x86_cpu_id crct10dif_cpu_id[] = {
125 X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
126 {}
127};
128MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
129
130static int __init crct10dif_intel_mod_init(void)
131{
132 if (!x86_match_cpu(crct10dif_cpu_id))
133 return -ENODEV;
134
135 return crypto_register_shash(&alg);
136}
137
138static void __exit crct10dif_intel_mod_fini(void)
139{
140 crypto_unregister_shash(&alg);
141}
142
143module_init(crct10dif_intel_mod_init);
144module_exit(crct10dif_intel_mod_fini);
145
146MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
147MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
148MODULE_LICENSE("GPL");
149
150MODULE_ALIAS("crct10dif");
151MODULE_ALIAS("crct10dif-pclmul");
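
The glue module above registers a "crct10dif" shash with cra_priority 200, so once loaded it is preferred over a lower-priority generic implementation of the same algorithm. A minimal sketch of how a caller would reach it through the crypto API (assuming a sleepable context with data/len provided by the caller; error handling trimmed):

	struct crypto_shash *tfm;
	struct shash_desc *desc;
	__u16 crc;

	tfm = crypto_alloc_shash("crct10dif", 0, 0);	/* highest-priority driver wins */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	desc->tfm = tfm;
	desc->flags = 0;

	crypto_shash_digest(desc, data, len, (u8 *)&crc);	/* init + update + final */

	kfree(desc);
	crypto_free_shash(tfm);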
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
index c09241659971..b4b38bacb404 100644
--- a/arch/x86/include/asm/dma-contiguous.h
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -4,7 +4,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <asm-generic/dma-contiguous.h>
 
 static inline void
 dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 3a16c1483b45..64507f35800c 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -3,18 +3,23 @@
 
 #ifdef __KERNEL__
 
+#include <linux/stringify.h>
 #include <linux/types.h>
 #include <asm/nops.h>
 #include <asm/asm.h>
 
 #define JUMP_LABEL_NOP_SIZE 5
 
-#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+#ifdef CONFIG_X86_64
+# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
+#else
+# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
+#endif
 
 static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("1:"
-		STATIC_KEY_INITIAL_NOP
+		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
 		".pushsection __jump_table, \"aw\" \n\t"
 		_ASM_ALIGN "\n\t"
 		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
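
The jump_label.h hunk above replaces the old "jmp +0" filler (opcode 0xe9 with a zero displacement) with a real 5-byte atomic NOP as the initial code of a disabled static key. A hedged sketch of the consumer side, using the static-key API of this era (do_expensive_tracing() is a hypothetical helper):

	#include <linux/jump_label.h>

	static struct static_key tracing_key = STATIC_KEY_INIT_FALSE;

	void maybe_trace(void)
	{
		/* compiles down to the 5-byte NOP emitted by arch_static_branch() */
		if (static_key_false(&tracing_key))
			do_expensive_tracing();		/* hypothetical */
	}

	void tracing_enable(void)
	{
		/* jump_label code rewrites the NOP into a jmp to the branch body */
		static_key_slow_inc(&tracing_key);
	}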
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
+	u64 bad_mt_xwr;
 
 	/*
 	 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
 	u64 global_ovf_ctrl;
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
+	u64 reserved_bits;
 	u8 version;
 	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
 	 * instruction.
 	 */
 	bool write_fault_to_shadow_pgtable;
+
+	/* set at EPT violation at this point */
+	unsigned long exit_qualification;
+
+	/* pv related host specific info */
+	struct {
+		bool pv_unhalted;
+	} pv;
 };
 
 struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
 extern u32 kvm_max_guest_tsc_khz;
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
-	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
 	EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d9e9e6c7ed32..7d7443283a9d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -100,29 +100,6 @@ static inline void early_quirks(void) { }
100extern void pci_iommu_alloc(void); 100extern void pci_iommu_alloc(void);
101 101
102#ifdef CONFIG_PCI_MSI 102#ifdef CONFIG_PCI_MSI
103/* MSI arch specific hooks */
104static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
105{
106 return x86_msi.setup_msi_irqs(dev, nvec, type);
107}
108
109static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
110{
111 x86_msi.teardown_msi_irqs(dev);
112}
113
114static inline void x86_teardown_msi_irq(unsigned int irq)
115{
116 x86_msi.teardown_msi_irq(irq);
117}
118static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
119{
120 x86_msi.restore_msi_irqs(dev, irq);
121}
122#define arch_setup_msi_irqs x86_setup_msi_irqs
123#define arch_teardown_msi_irqs x86_teardown_msi_irqs
124#define arch_teardown_msi_irq x86_teardown_msi_irq
125#define arch_restore_msi_irqs x86_restore_msi_irqs
126/* implemented in arch/x86/kernel/apic/io_apic. */ 103/* implemented in arch/x86/kernel/apic/io_apic. */
127struct msi_desc; 104struct msi_desc;
128int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 105int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
@@ -130,16 +107,9 @@ void native_teardown_msi_irq(unsigned int irq);
130void native_restore_msi_irqs(struct pci_dev *dev, int irq); 107void native_restore_msi_irqs(struct pci_dev *dev, int irq);
131int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, 108int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
132 unsigned int irq_base, unsigned int irq_offset); 109 unsigned int irq_base, unsigned int irq_offset);
133/* default to the implementation in drivers/lib/msi.c */
134#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
135#define HAVE_DEFAULT_MSI_RESTORE_IRQS
136void default_teardown_msi_irqs(struct pci_dev *dev);
137void default_restore_msi_irqs(struct pci_dev *dev, int irq);
138#else 110#else
139#define native_setup_msi_irqs NULL 111#define native_setup_msi_irqs NULL
140#define native_teardown_msi_irq NULL 112#define native_teardown_msi_irq NULL
141#define default_teardown_msi_irqs NULL
142#define default_restore_msi_irqs NULL
143#endif 113#endif
144 114
145#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 115#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 8d16befdec88..3d1999458709 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 315 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
316} 316}
317 317
318static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
319{
320 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
321}
322
323static inline int pte_swp_soft_dirty(pte_t pte)
324{
325 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
326}
327
328static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
329{
330 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
331}
332
333static inline pte_t pte_file_clear_soft_dirty(pte_t pte) 318static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
334{ 319{
335 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); 320 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
446 431
447#ifndef __ASSEMBLY__ 432#ifndef __ASSEMBLY__
448#include <linux/mm_types.h> 433#include <linux/mm_types.h>
434#include <linux/mmdebug.h>
449#include <linux/log2.h> 435#include <linux/log2.h>
450 436
451static inline int pte_none(pte_t pte) 437static inline int pte_none(pte_t pte)
@@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
864{ 850{
865} 851}
866 852
853static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
854{
855 VM_BUG_ON(pte_present(pte));
856 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
857}
858
859static inline int pte_swp_soft_dirty(pte_t pte)
860{
861 VM_BUG_ON(pte_present(pte));
862 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
863}
864
865static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
866{
867 VM_BUG_ON(pte_present(pte));
868 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
869}
870
867#include <asm-generic/pgtable.h> 871#include <asm-generic/pgtable.h>
868#endif /* __ASSEMBLY__ */ 872#endif /* __ASSEMBLY__ */
869 873
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f4843e031131..0ecac257fb26 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -75,6 +75,9 @@
  * with swap entry format. On x86 bits 6 and 7 are *not* involved
  * into swap entry computation, but bit 6 is used for nonlinear
  * file mapping, so we borrow bit 7 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as swap dirty page
+ * mark if and only if the PTE has present bit clear!
  */
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SWP_SOFT_DIRTY	_PAGE_PSE
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cf512003e663..e6d90babc245 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void)
62 62
63static inline void __flush_tlb_one(unsigned long addr) 63static inline void __flush_tlb_one(unsigned long addr)
64{ 64{
65 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
65 __flush_tlb_single(addr); 66 __flush_tlb_single(addr);
66} 67}
67 68
@@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr)
84 85
85#ifndef CONFIG_SMP 86#ifndef CONFIG_SMP
86 87
87#define flush_tlb() __flush_tlb() 88/* "_up" is for UniProcessor.
88#define flush_tlb_all() __flush_tlb_all() 89 *
89#define local_flush_tlb() __flush_tlb() 90 * This is a helper for other header functions. *Not* intended to be called
91 * directly. All global TLB flushes need to either call this, or to bump the
92 * vm statistics themselves.
93 */
94static inline void __flush_tlb_up(void)
95{
96 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
97 __flush_tlb();
98}
99
100static inline void flush_tlb_all(void)
101{
102 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
103 __flush_tlb_all();
104}
105
106static inline void flush_tlb(void)
107{
108 __flush_tlb_up();
109}
110
111static inline void local_flush_tlb(void)
112{
113 __flush_tlb_up();
114}
90 115
91static inline void flush_tlb_mm(struct mm_struct *mm) 116static inline void flush_tlb_mm(struct mm_struct *mm)
92{ 117{
93 if (mm == current->active_mm) 118 if (mm == current->active_mm)
94 __flush_tlb(); 119 __flush_tlb_up();
95} 120}
96 121
97static inline void flush_tlb_page(struct vm_area_struct *vma, 122static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
105 unsigned long start, unsigned long end) 130 unsigned long start, unsigned long end)
106{ 131{
107 if (vma->vm_mm == current->active_mm) 132 if (vma->vm_mm == current->active_mm)
108 __flush_tlb(); 133 __flush_tlb_up();
109} 134}
110 135
111static inline void flush_tlb_mm_range(struct mm_struct *mm, 136static inline void flush_tlb_mm_range(struct mm_struct *mm,
112 unsigned long start, unsigned long end, unsigned long vmflag) 137 unsigned long start, unsigned long end, unsigned long vmflag)
113{ 138{
114 if (mm == current->active_mm) 139 if (mm == current->active_mm)
115 __flush_tlb(); 140 __flush_tlb_up();
116} 141}
117 142
118static inline void native_flush_tlb_others(const struct cpumask *cpumask, 143static inline void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				(1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index ca842f2769ef..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -7,6 +7,7 @@ enum ipi_vector {
 	XEN_CALL_FUNCTION_SINGLE_VECTOR,
 	XEN_SPIN_UNLOCK_VECTOR,
 	XEN_IRQ_WORK_VECTOR,
+	XEN_NMI_VECTOR,
 
 	XEN_NR_IPIS,
 };
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7ea79c5fa1f2..492b29802f57 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = {
 
 #define AVX_XOR_SPEED \
 do { \
-	if (cpu_has_avx) \
+	if (cpu_has_avx && cpu_has_osxsave) \
 		xor_speed(&xor_block_avx); \
 } while (0)
 
 #define AVX_SELECT(FASTEST) \
-	(cpu_has_avx ? &xor_block_avx : FASTEST)
+	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
 
 #else
 
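
The xor_avx.h hunk above stops trusting the AVX CPUID bit alone: cpu_has_osxsave must also be set, because YMM state is only safe to touch after the OS enables it through XCR0. A hedged user-space illustration of the same check (not kernel code; assumes the GCC/Clang <cpuid.h> helper):

	#include <stdbool.h>
	#include <cpuid.h>

	static bool avx_usable(void)
	{
		unsigned int eax, ebx, ecx, edx, xcr0_lo, xcr0_hi;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return false;
		if (!(ecx & (1u << 28)) || !(ecx & (1u << 27)))
			return false;		/* no AVX, or OS left OSXSAVE off */

		/* XGETBV with ECX=0: XCR0 bits 1 and 2 = SSE and AVX state enabled */
		__asm__ volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
		return (xcr0_lo & 0x6) == 0x6;
	}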
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
@@ -106,12 +107,13 @@
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
+	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
 	{ EXIT_REASON_WBINVD,                "WBINVD" }, \
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
-	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
+	{ EXIT_REASON_INVPCID,               "INVPCID" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d4cdfa67509e..ce2d0a2c3e4f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Save MTRR state */
@@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
+	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 69eb2fa25494..376dc7873447 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-					    unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
 	initrd_start = (unsigned long)__va(start);
 	initrd_end = (unsigned long)__va(end);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 63bdb29b2549..b3cd3ebae077 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,6 +12,7 @@
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/pci_ids.h> 14#include <linux/pci_ids.h>
15#include <drm/i915_drm.h>
15#include <asm/pci-direct.h> 16#include <asm/pci-direct.h>
16#include <asm/dma.h> 17#include <asm/dma.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
@@ -216,6 +217,157 @@ static void __init intel_remapping_check(int num, int slot, int func)
216 217
217} 218}
218 219
220/*
221 * Systems with Intel graphics controllers set aside memory exclusively
222 * for gfx driver use. This memory is not marked in the E820 as reserved
223 * or as RAM, and so is subject to overlap from E820 manipulation later
224 * in the boot process. On some systems, MMIO space is allocated on top,
225 * despite the efforts of the "RAM buffer" approach, which simply rounds
226 * memory boundaries up to 64M to try to catch space that may decode
227 * as RAM and so is not suitable for MMIO.
228 *
229 * And yes, so far on current devices the base addr is always under 4G.
230 */
231static u32 __init intel_stolen_base(int num, int slot, int func)
232{
233 u32 base;
234
235 /*
236 * For the PCI IDs in this quirk, the stolen base is always
237 * in 0x5c, aka the BDSM register (yes that's really what
238 * it's called).
239 */
240 base = read_pci_config(num, slot, func, 0x5c);
241 base &= ~((1<<20) - 1);
242
243 return base;
244}
245
246#define KB(x) ((x) * 1024)
247#define MB(x) (KB (KB (x)))
248#define GB(x) (MB (KB (x)))
249
250static size_t __init gen3_stolen_size(int num, int slot, int func)
251{
252 size_t stolen_size;
253 u16 gmch_ctrl;
254
255 gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
256
257 switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
258 case I855_GMCH_GMS_STOLEN_1M:
259 stolen_size = MB(1);
260 break;
261 case I855_GMCH_GMS_STOLEN_4M:
262 stolen_size = MB(4);
263 break;
264 case I855_GMCH_GMS_STOLEN_8M:
265 stolen_size = MB(8);
266 break;
267 case I855_GMCH_GMS_STOLEN_16M:
268 stolen_size = MB(16);
269 break;
270 case I855_GMCH_GMS_STOLEN_32M:
271 stolen_size = MB(32);
272 break;
273 case I915_GMCH_GMS_STOLEN_48M:
274 stolen_size = MB(48);
275 break;
276 case I915_GMCH_GMS_STOLEN_64M:
277 stolen_size = MB(64);
278 break;
279 case G33_GMCH_GMS_STOLEN_128M:
280 stolen_size = MB(128);
281 break;
282 case G33_GMCH_GMS_STOLEN_256M:
283 stolen_size = MB(256);
284 break;
285 case INTEL_GMCH_GMS_STOLEN_96M:
286 stolen_size = MB(96);
287 break;
288 case INTEL_GMCH_GMS_STOLEN_160M:
289 stolen_size = MB(160);
290 break;
291 case INTEL_GMCH_GMS_STOLEN_224M:
292 stolen_size = MB(224);
293 break;
294 case INTEL_GMCH_GMS_STOLEN_352M:
295 stolen_size = MB(352);
296 break;
297 default:
298 stolen_size = 0;
299 break;
300 }
301
302 return stolen_size;
303}
304
305static size_t __init gen6_stolen_size(int num, int slot, int func)
306{
307 u16 gmch_ctrl;
308
309 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
310 gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
311 gmch_ctrl &= SNB_GMCH_GMS_MASK;
312
313 return gmch_ctrl << 25; /* 32 MB units */
314}
315
316typedef size_t (*stolen_size_fn)(int num, int slot, int func);
317
318static struct pci_device_id intel_stolen_ids[] __initdata = {
319 INTEL_I915G_IDS(gen3_stolen_size),
320 INTEL_I915GM_IDS(gen3_stolen_size),
321 INTEL_I945G_IDS(gen3_stolen_size),
322 INTEL_I945GM_IDS(gen3_stolen_size),
323 INTEL_VLV_M_IDS(gen3_stolen_size),
324 INTEL_VLV_D_IDS(gen3_stolen_size),
325 INTEL_PINEVIEW_IDS(gen3_stolen_size),
326 INTEL_I965G_IDS(gen3_stolen_size),
327 INTEL_G33_IDS(gen3_stolen_size),
328 INTEL_I965GM_IDS(gen3_stolen_size),
329 INTEL_GM45_IDS(gen3_stolen_size),
330 INTEL_G45_IDS(gen3_stolen_size),
331 INTEL_IRONLAKE_D_IDS(gen3_stolen_size),
332 INTEL_IRONLAKE_M_IDS(gen3_stolen_size),
333 INTEL_SNB_D_IDS(gen6_stolen_size),
334 INTEL_SNB_M_IDS(gen6_stolen_size),
335 INTEL_IVB_M_IDS(gen6_stolen_size),
336 INTEL_IVB_D_IDS(gen6_stolen_size),
337 INTEL_HSW_D_IDS(gen6_stolen_size),
338 INTEL_HSW_M_IDS(gen6_stolen_size),
339};
340
341static void __init intel_graphics_stolen(int num, int slot, int func)
342{
343 size_t size;
344 int i;
345 u32 start;
346 u16 device, subvendor, subdevice;
347
348 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
349 subvendor = read_pci_config_16(num, slot, func,
350 PCI_SUBSYSTEM_VENDOR_ID);
351 subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID);
352
353 for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
354 if (intel_stolen_ids[i].device == device) {
355 stolen_size_fn stolen_size =
356 (stolen_size_fn)intel_stolen_ids[i].driver_data;
357 size = stolen_size(num, slot, func);
358 start = intel_stolen_base(num, slot, func);
359 if (size && start) {
360 /* Mark this space as reserved */
361 e820_add_region(start, size, E820_RESERVED);
362 sanitize_e820_map(e820.map,
363 ARRAY_SIZE(e820.map),
364 &e820.nr_map);
365 }
366 return;
367 }
368 }
369}
370
219#define QFLAG_APPLY_ONCE 0x1 371#define QFLAG_APPLY_ONCE 0x1
220#define QFLAG_APPLIED 0x2 372#define QFLAG_APPLIED 0x2
221#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 373#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -251,6 +403,8 @@ static struct chipset early_qrk[] __initdata = {
251 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 403 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
252 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, 404 { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
253 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, 405 PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
406 { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
407 QFLAG_APPLY_ONCE, intel_graphics_stolen },
254 {} 408 {}
255}; 409};
256 410
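The stolen-memory quirk above boils down to two decodes: the size field in the GMCH control word and the 1 MB-aligned base read from the BDSM register at config offset 0x5c. As a rough standalone sketch of that arithmetic (the shift/mask values and helper names here are assumptions mirroring the i915 definitions, not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: decode the SNB+ GMCH control word; the GMS field
 * (assumed to be a 5-bit field at shift 3) counts 32 MB units. */
static size_t example_gen6_stolen_size(uint16_t gmch_ctrl)
{
	return (size_t)((gmch_ctrl >> 3) & 0x1f) << 25;
}

/* Illustrative only: the BDSM register keeps the base in its upper
 * bits; everything below 1 MB is masked off, as in the quirk. */
static uint32_t example_stolen_base(uint32_t bdsm)
{
	return bdsm & ~((1u << 20) - 1);
}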
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a3a2dd..f0dcb0ceb6a2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1176,6 +1176,9 @@ ftrace_restore_flags:
1176#else /* ! CONFIG_DYNAMIC_FTRACE */ 1176#else /* ! CONFIG_DYNAMIC_FTRACE */
1177 1177
1178ENTRY(mcount) 1178ENTRY(mcount)
1179 cmpl $__PAGE_OFFSET, %esp
1180 jb ftrace_stub /* Paging not enabled yet? */
1181
1179 cmpl $0, function_trace_stop 1182 cmpl $0, function_trace_stop
1180 jne ftrace_stub 1183 jne ftrace_stub
1181 1184
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 460f5d9ceebb..ee11b7dfbfbb 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -24,18 +24,57 @@ union jump_code_union {
24 } __attribute__((packed)); 24 } __attribute__((packed));
25}; 25};
26 26
27static void bug_at(unsigned char *ip, int line)
28{
29 /*
30 * The location is not an op that we were expecting.
31 * Something went wrong. Crash the box, as something could be
32 * corrupting the kernel.
33 */
34 pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
35 ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
36 BUG();
37}
38
27static void __jump_label_transform(struct jump_entry *entry, 39static void __jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type, 40 enum jump_label_type type,
29 void *(*poker)(void *, const void *, size_t)) 41 void *(*poker)(void *, const void *, size_t),
42 int init)
30{ 43{
31 union jump_code_union code; 44 union jump_code_union code;
45 const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
32 46
33 if (type == JUMP_LABEL_ENABLE) { 47 if (type == JUMP_LABEL_ENABLE) {
48 /*
49 * We are enabling this jump label. If it is not a nop
50 * then something must have gone wrong.
51 */
52 if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0))
53 bug_at((void *)entry->code, __LINE__);
54
34 code.jump = 0xe9; 55 code.jump = 0xe9;
35 code.offset = entry->target - 56 code.offset = entry->target -
36 (entry->code + JUMP_LABEL_NOP_SIZE); 57 (entry->code + JUMP_LABEL_NOP_SIZE);
37 } else 58 } else {
59 /*
60 * We are disabling this jump label. If it is not what
61 * we think it is, then something must have gone wrong.
62 * If this is the first initialization call, then we
63 * are converting the default nop to the ideal nop.
64 */
65 if (init) {
66 const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
67 if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
68 bug_at((void *)entry->code, __LINE__);
69 } else {
70 code.jump = 0xe9;
71 code.offset = entry->target -
72 (entry->code + JUMP_LABEL_NOP_SIZE);
73 if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
74 bug_at((void *)entry->code, __LINE__);
75 }
38 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); 76 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
77 }
39 78
40 /* 79 /*
41 * Make text_poke_bp() a default fallback poker. 80 * Make text_poke_bp() a default fallback poker.
@@ -57,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry,
57{ 96{
58 get_online_cpus(); 97 get_online_cpus();
59 mutex_lock(&text_mutex); 98 mutex_lock(&text_mutex);
60 __jump_label_transform(entry, type, NULL); 99 __jump_label_transform(entry, type, NULL, 0);
61 mutex_unlock(&text_mutex); 100 mutex_unlock(&text_mutex);
62 put_online_cpus(); 101 put_online_cpus();
63} 102}
64 103
104static enum {
105 JL_STATE_START,
106 JL_STATE_NO_UPDATE,
107 JL_STATE_UPDATE,
108} jlstate __initdata_or_module = JL_STATE_START;
109
65__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, 110__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
66 enum jump_label_type type) 111 enum jump_label_type type)
67{ 112{
68 __jump_label_transform(entry, type, text_poke_early); 113 /*
114 * This function is called at boot up and when modules are
115 * first loaded. Check if the default nop, the one that is
116 * inserted at compile time, is the ideal nop. If it is, then
117 * we do not need to update the nop, and we can leave it as is.
118 * If it is not, then we need to update the nop to the ideal nop.
119 */
120 if (jlstate == JL_STATE_START) {
121 const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
122 const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
123
124 if (memcmp(ideal_nop, default_nop, 5) != 0)
125 jlstate = JL_STATE_UPDATE;
126 else
127 jlstate = JL_STATE_NO_UPDATE;
128 }
129 if (jlstate == JL_STATE_UPDATE)
130 __jump_label_transform(entry, type, text_poke_early, 1);
69} 131}
70 132
71#endif 133#endif
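Both the enable path and the verification path above rely on the x86 long-form jump being exactly five bytes: opcode 0xe9 followed by a 32-bit displacement measured from the end of the instruction, which is why the offset is computed as entry->target - (entry->code + JUMP_LABEL_NOP_SIZE). A minimal standalone sketch of that encoding (names are illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

#define JMP_REL32_SIZE 5	/* same length as the 5-byte NOP it replaces */

/* Illustrative only: build the "jmp rel32" that replaces the nop when a
 * jump label is enabled; the displacement is relative to the address of
 * the following instruction. */
static void example_make_jump(uint8_t insn[JMP_REL32_SIZE],
			      uintptr_t code, uintptr_t target)
{
	int32_t rel = (int32_t)(target - (code + JMP_REL32_SIZE));

	insn[0] = 0xe9;			/* JMP rel32 */
	memcpy(&insn[1], &rel, sizeof(rel));
}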
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 884aa4053313..1b10af835c31 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -62,11 +62,6 @@ void __init default_banner(void)
62 pv_info.name); 62 pv_info.name);
63} 63}
64 64
65/* Simple instruction patching code. */
66#define DEF_NATIVE(ops, name, code) \
67 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
68 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
69
70/* Undefined instruction for dealing with missing ops pointers. */ 65/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 66static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 67
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 129}
130 130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64 131#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/* 132/*
172 * Initialize the generic pvclock vsyscall state. This will allocate 133 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a 134 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
181 142
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 143 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183 144
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 145 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 146 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa(i) + (idx*PAGE_SIZE), 147 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 148 PAGE_KERNEL_VVAR);
190 } 149 }
191 150
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0; 151 return 0;
196} 152}
197#endif 153#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 5f24c71accaa..8ce0072cd700 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -107,6 +107,8 @@ struct x86_platform_ops x86_platform = {
107}; 107};
108 108
109EXPORT_SYMBOL_GPL(x86_platform); 109EXPORT_SYMBOL_GPL(x86_platform);
110
111#if defined(CONFIG_PCI_MSI)
110struct x86_msi_ops x86_msi = { 112struct x86_msi_ops x86_msi = {
111 .setup_msi_irqs = native_setup_msi_irqs, 113 .setup_msi_irqs = native_setup_msi_irqs,
112 .compose_msi_msg = native_compose_msi_msg, 114 .compose_msi_msg = native_compose_msi_msg,
@@ -116,6 +118,28 @@ struct x86_msi_ops x86_msi = {
116 .setup_hpet_msi = default_setup_hpet_msi, 118 .setup_hpet_msi = default_setup_hpet_msi,
117}; 119};
118 120
121/* MSI arch specific hooks */
122int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
123{
124 return x86_msi.setup_msi_irqs(dev, nvec, type);
125}
126
127void arch_teardown_msi_irqs(struct pci_dev *dev)
128{
129 x86_msi.teardown_msi_irqs(dev);
130}
131
132void arch_teardown_msi_irq(unsigned int irq)
133{
134 x86_msi.teardown_msi_irq(irq);
135}
136
137void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
138{
139 x86_msi.restore_msi_irqs(dev, irq);
140}
141#endif
142
119struct x86_io_apic_ops x86_io_apic_ops = { 143struct x86_io_apic_ops x86_io_apic_ops = {
120 .init = native_io_apic_init_mappings, 144 .init = native_io_apic_init_mappings,
121 .read = native_io_apic_read, 145 .read = native_io_apic_read,
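Routing the arch_*_msi_* entry points through the x86_msi ops table (and compiling them only under CONFIG_PCI_MSI) lets a paravirtualized platform interpose on MSI setup by swapping the function pointers at boot, without the PCI core having to know. A bare-bones sketch of that indirection pattern (types and names here are illustrative, not the kernel's):

#include <stdio.h>

struct example_msi_ops {
	int (*setup_msi_irqs)(int dev, int nvec, int type);
};

static int native_setup(int dev, int nvec, int type)
{
	printf("native MSI setup for dev %d\n", dev);
	return 0;
}

static struct example_msi_ops example_msi = { .setup_msi_irqs = native_setup };

/* The generic entry point only ever calls through the ops table... */
static int example_arch_setup_msi_irqs(int dev, int nvec, int type)
{
	return example_msi.setup_msi_irqs(dev, nvec, type);
}

/* ...so a platform can interpose by replacing the pointer at boot. */
static int pv_setup(int dev, int nvec, int type)
{
	printf("paravirt MSI setup for dev %d\n", dev);
	return 0;
}

int main(void)
{
	example_arch_setup_msi_irqs(1, 1, 0);
	example_msi.setup_msi_irqs = pv_setup;
	return example_arch_setup_msi_irqs(1, 1, 0);
}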
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 (1 << KVM_FEATURE_CLOCKSOURCE2) | 413 (1 << KVM_FEATURE_CLOCKSOURCE2) |
414 (1 << KVM_FEATURE_ASYNC_PF) | 414 (1 << KVM_FEATURE_ASYNC_PF) |
415 (1 << KVM_FEATURE_PV_EOI) | 415 (1 << KVM_FEATURE_PV_EOI) |
416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
417 (1 << KVM_FEATURE_PV_UNHALT);
417 418
418 if (sched_info_on()) 419 if (sched_info_on())
419 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 420 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
79 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
80} 80}
81 81
82static inline int apic_test_and_set_vector(int vec, void *bitmap)
83{
84 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline int apic_test_and_clear_vector(int vec, void *bitmap)
88{
89 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_test_vector(int vec, void *bitmap) 82static inline int apic_test_vector(int vec, void *bitmap)
93{ 83{
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 84 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
331} 321}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 322EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333 323
334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 324static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
335{ 325{
336 apic->irr_pending = true; 326 apic->irr_pending = true;
337 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 327 apic_set_vector(vec, apic->regs + APIC_IRR);
338} 328}
339 329
340static inline int apic_search_irr(struct kvm_lapic *apic) 330static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
681 if (unlikely(!apic_enabled(apic))) 671 if (unlikely(!apic_enabled(apic)))
682 break; 672 break;
683 673
674 result = 1;
675
684 if (dest_map) 676 if (dest_map)
685 __set_bit(vcpu->vcpu_id, dest_map); 677 __set_bit(vcpu->vcpu_id, dest_map);
686 678
687 if (kvm_x86_ops->deliver_posted_interrupt) { 679 if (kvm_x86_ops->deliver_posted_interrupt)
688 result = 1;
689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 680 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
690 } else { 681 else {
691 result = !apic_test_and_set_irr(vector, apic); 682 apic_set_irr(vector, apic);
692
693 if (!result) {
694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699 683
700 kvm_make_request(KVM_REQ_EVENT, vcpu); 684 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu); 685 kvm_vcpu_kick(vcpu);
702 } 686 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 687 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result); 688 trig_mode, vector, false);
706 break; 689 break;
707 690
708 case APIC_DM_REMRD: 691 case APIC_DM_REMRD:
709 apic_debug("Ignoring delivery mode 3\n"); 692 result = 1;
693 vcpu->arch.pv.pv_unhalted = 1;
694 kvm_make_request(KVM_REQ_EVENT, vcpu);
695 kvm_vcpu_kick(vcpu);
710 break; 696 break;
711 697
712 case APIC_DM_SMI: 698 case APIC_DM_SMI:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..6e2d2c8f230b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133 * PT32_LEVEL_BITS))) - 1)) 133 * PT32_LEVEL_BITS))) - 1))
134 134
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
136 | PT64_NX_MASK) 136 | shadow_x_mask | shadow_nx_mask)
137 137
138#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
139#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
331 return pte & PT_PAGE_SIZE_MASK; 331 return pte & PT_PAGE_SIZE_MASK;
332} 332}
333 333
334static int is_dirty_gpte(unsigned long pte)
335{
336 return pte & PT_DIRTY_MASK;
337}
338
339static int is_rmap_spte(u64 pte) 334static int is_rmap_spte(u64 pte)
340{ 335{
341 return is_shadow_present_pte(pte); 336 return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2052 return __shadow_walk_next(iterator, *iterator->sptep); 2047 return __shadow_walk_next(iterator, *iterator->sptep);
2053} 2048}
2054 2049
2055static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 2050static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
2056{ 2051{
2057 u64 spte; 2052 u64 spte;
2058 2053
2054 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
2055 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2056
2059 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2057 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2060 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2058 shadow_user_mask | shadow_x_mask;
2059
2060 if (accessed)
2061 spte |= shadow_accessed_mask;
2061 2062
2062 mmu_spte_set(sptep, spte); 2063 mmu_spte_set(sptep, spte);
2063} 2064}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574 mmu_free_roots(vcpu); 2575 mmu_free_roots(vcpu);
2575} 2576}
2576 2577
2577static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2578{
2579 int bit7;
2580
2581 bit7 = (gpte >> 7) & 1;
2582 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2583}
2584
2585static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2586 bool no_dirty_log) 2579 bool no_dirty_log)
2587{ 2580{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2594 return gfn_to_pfn_memslot_atomic(slot, gfn); 2587 return gfn_to_pfn_memslot_atomic(slot, gfn);
2595} 2588}
2596 2589
2597static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2598 struct kvm_mmu_page *sp, u64 *spte,
2599 u64 gpte)
2600{
2601 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2602 goto no_present;
2603
2604 if (!is_present_gpte(gpte))
2605 goto no_present;
2606
2607 if (!(gpte & PT_ACCESSED_MASK))
2608 goto no_present;
2609
2610 return false;
2611
2612no_present:
2613 drop_spte(vcpu->kvm, spte);
2614 return true;
2615}
2616
2617static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2590static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2618 struct kvm_mmu_page *sp, 2591 struct kvm_mmu_page *sp,
2619 u64 *start, u64 *end) 2592 u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2710 iterator.level - 1, 2683 iterator.level - 1,
2711 1, ACC_ALL, iterator.sptep); 2684 1, ACC_ALL, iterator.sptep);
2712 2685
2713 link_shadow_page(iterator.sptep, sp); 2686 link_shadow_page(iterator.sptep, sp, true);
2714 } 2687 }
2715 } 2688 }
2716 return emulate; 2689 return emulate;
@@ -2808,7 +2781,7 @@ exit:
2808 return ret; 2781 return ret;
2809} 2782}
2810 2783
2811static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2784static bool page_fault_can_be_fast(u32 error_code)
2812{ 2785{
2813 /* 2786 /*
2814 * Do not fix the mmio spte with invalid generation number which 2787 * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2861 bool ret = false; 2834 bool ret = false;
2862 u64 spte = 0ull; 2835 u64 spte = 0ull;
2863 2836
2864 if (!page_fault_can_be_fast(vcpu, error_code)) 2837 if (!page_fault_can_be_fast(error_code))
2865 return false; 2838 return false;
2866 2839
2867 walk_shadow_page_lockless_begin(vcpu); 2840 walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3209 mmu_sync_roots(vcpu); 3182 mmu_sync_roots(vcpu);
3210 spin_unlock(&vcpu->kvm->mmu_lock); 3183 spin_unlock(&vcpu->kvm->mmu_lock);
3211} 3184}
3185EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3212 3186
3213static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3187static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3214 u32 access, struct x86_exception *exception) 3188 u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3478 ++vcpu->stat.tlb_flush; 3452 ++vcpu->stat.tlb_flush;
3479 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3453 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3480} 3454}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3481 3456
3482static void paging_new_cr3(struct kvm_vcpu *vcpu) 3457static void paging_new_cr3(struct kvm_vcpu *vcpu)
3483{ 3458{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3501 nonpaging_free(vcpu); 3476 nonpaging_free(vcpu);
3502} 3477}
3503 3478
3504static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3505{
3506 unsigned mask;
3507
3508 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3509
3510 mask = (unsigned)~ACC_WRITE_MASK;
3511 /* Allow write access to dirty gptes */
3512 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3513 *access &= mask;
3514}
3515
3516static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3517 unsigned access, int *nr_present) 3480 unsigned access, int *nr_present)
3518{ 3481{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3530 return false; 3493 return false;
3531} 3494}
3532 3495
3533static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3534{
3535 unsigned access;
3536
3537 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3538 access &= ~(gpte >> PT64_NX_SHIFT);
3539
3540 return access;
3541}
3542
3543static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3496static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3544{ 3497{
3545 unsigned index; 3498 unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
3549 return mmu->last_pte_bitmap & (1 << index); 3502 return mmu->last_pte_bitmap & (1 << index);
3550} 3503}
3551 3504
3505#define PTTYPE_EPT 18 /* arbitrary */
3506#define PTTYPE PTTYPE_EPT
3507#include "paging_tmpl.h"
3508#undef PTTYPE
3509
3552#define PTTYPE 64 3510#define PTTYPE 64
3553#include "paging_tmpl.h" 3511#include "paging_tmpl.h"
3554#undef PTTYPE 3512#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3563 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3521 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3564 u64 exb_bit_rsvd = 0; 3522 u64 exb_bit_rsvd = 0;
3565 3523
3524 context->bad_mt_xwr = 0;
3525
3566 if (!context->nx) 3526 if (!context->nx)
3567 exb_bit_rsvd = rsvd_bits(63, 63); 3527 exb_bit_rsvd = rsvd_bits(63, 63);
3568 switch (context->root_level) { 3528 switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3618 } 3578 }
3619} 3579}
3620 3580
3621static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3581static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
3582 struct kvm_mmu *context, bool execonly)
3583{
3584 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3585 int pte;
3586
3587 context->rsvd_bits_mask[0][3] =
3588 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
3589 context->rsvd_bits_mask[0][2] =
3590 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3591 context->rsvd_bits_mask[0][1] =
3592 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3593 context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
3594
3595 /* large page */
3596 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3597 context->rsvd_bits_mask[1][2] =
3598 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
3599 context->rsvd_bits_mask[1][1] =
3600 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3601 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3602
3603 for (pte = 0; pte < 64; pte++) {
3604 int rwx_bits = pte & 7;
3605 int mt = pte >> 3;
3606 if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
3607 rwx_bits == 0x2 || rwx_bits == 0x6 ||
3608 (rwx_bits == 0x4 && !execonly))
3609 context->bad_mt_xwr |= (1ull << pte);
3610 }
3611}
3612
3613static void update_permission_bitmask(struct kvm_vcpu *vcpu,
3614 struct kvm_mmu *mmu, bool ept)
3622{ 3615{
3623 unsigned bit, byte, pfec; 3616 unsigned bit, byte, pfec;
3624 u8 map; 3617 u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
3636 w = bit & ACC_WRITE_MASK; 3629 w = bit & ACC_WRITE_MASK;
3637 u = bit & ACC_USER_MASK; 3630 u = bit & ACC_USER_MASK;
3638 3631
3639 /* Not really needed: !nx will cause pte.nx to fault */ 3632 if (!ept) {
3640 x |= !mmu->nx; 3633 /* Not really needed: !nx will cause pte.nx to fault */
3641 /* Allow supervisor writes if !cr0.wp */ 3634 x |= !mmu->nx;
3642 w |= !is_write_protection(vcpu) && !uf; 3635 /* Allow supervisor writes if !cr0.wp */
3643 /* Disallow supervisor fetches of user code if cr4.smep */ 3636 w |= !is_write_protection(vcpu) && !uf;
3644 x &= !(smep && u && !uf); 3637 /* Disallow supervisor fetches of user code if cr4.smep */
3638 x &= !(smep && u && !uf);
3639 } else
3640 /* Not really needed: no U/S accesses on ept */
3641 u = 1;
3645 3642
3646 fault = (ff && !x) || (uf && !u) || (wf && !w); 3643 fault = (ff && !x) || (uf && !u) || (wf && !w);
3647 map |= fault << bit; 3644 map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3676 context->root_level = level; 3673 context->root_level = level;
3677 3674
3678 reset_rsvds_bits_mask(vcpu, context); 3675 reset_rsvds_bits_mask(vcpu, context);
3679 update_permission_bitmask(vcpu, context); 3676 update_permission_bitmask(vcpu, context, false);
3680 update_last_pte_bitmap(vcpu, context); 3677 update_last_pte_bitmap(vcpu, context);
3681 3678
3682 ASSERT(is_pae(vcpu)); 3679 ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 context->root_level = PT32_ROOT_LEVEL; 3703 context->root_level = PT32_ROOT_LEVEL;
3707 3704
3708 reset_rsvds_bits_mask(vcpu, context); 3705 reset_rsvds_bits_mask(vcpu, context);
3709 update_permission_bitmask(vcpu, context); 3706 update_permission_bitmask(vcpu, context, false);
3710 update_last_pte_bitmap(vcpu, context); 3707 update_last_pte_bitmap(vcpu, context);
3711 3708
3712 context->new_cr3 = paging_new_cr3; 3709 context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3768 context->gva_to_gpa = paging32_gva_to_gpa; 3765 context->gva_to_gpa = paging32_gva_to_gpa;
3769 } 3766 }
3770 3767
3771 update_permission_bitmask(vcpu, context); 3768 update_permission_bitmask(vcpu, context, false);
3772 update_last_pte_bitmap(vcpu, context); 3769 update_last_pte_bitmap(vcpu, context);
3773 3770
3774 return 0; 3771 return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3800} 3797}
3801EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3802 3799
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly)
3802{
3803 ASSERT(vcpu);
3804 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3805
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807
3808 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false;
3819
3820 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826
3803static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3804{ 3828{
3805 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3847 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3871 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3848 } 3872 }
3849 3873
3850 update_permission_bitmask(vcpu, g_context); 3874 update_permission_bitmask(vcpu, g_context, false);
3851 update_last_pte_bitmap(vcpu, g_context); 3875 update_last_pte_bitmap(vcpu, g_context);
3852 3876
3853 return 0; 3877 return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
3923 return true; 3947 return true;
3924 if ((old ^ new) & PT64_BASE_ADDR_MASK) 3948 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3925 return true; 3949 return true;
3926 old ^= PT64_NX_MASK; 3950 old ^= shadow_nx_mask;
3927 new ^= PT64_NX_MASK; 3951 new ^= shadow_nx_mask;
3928 return (old & ~new & PT64_PERM_MASK) != 0; 3952 return (old & ~new & PT64_PERM_MASK) != 0;
3929} 3953}
3930 3954
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4182 switch (er) { 4206 switch (er) {
4183 case EMULATE_DONE: 4207 case EMULATE_DONE:
4184 return 1; 4208 return 1;
4185 case EMULATE_DO_MMIO: 4209 case EMULATE_USER_EXIT:
4186 ++vcpu->stat.mmio_exits; 4210 ++vcpu->stat.mmio_exits;
4187 /* fall through */ 4211 /* fall through */
4188 case EMULATE_FAIL: 4212 case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4390 /* 4414 /*
4391 * The very rare case: if the generation-number is round, 4415 * The very rare case: if the generation-number is round,
4392 * zap all shadow pages. 4416 * zap all shadow pages.
4393 *
4394 * The max value is MMIO_MAX_GEN - 1 since it is not called
4395 * when mark memslot invalid.
4396 */ 4417 */
4397 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { 4418 if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
4398 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); 4419 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4399 kvm_mmu_invalidate_zap_all_pages(kvm); 4420 kvm_mmu_invalidate_zap_all_pages(kvm);
4400 } 4421 }
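The bad_mt_xwr bitmap introduced in reset_rsvds_bits_mask_ept() precomputes, for every combination of an EPT PTE's low six bits (bits 2:0 = X/W/R permissions, bits 5:3 = memory type), whether that combination is a misconfiguration: memory types 2, 3 and 7, write-only (010b) and write-plus-execute-without-read (110b) entries always are, and execute-only (100b) entries are only legal when execonly is set. Later checks then reduce to a single bit test, roughly like this sketch (illustrative, not the patch's code):

#include <stdint.h>
#include <stdbool.h>

/* Illustrative only: consult the precomputed misconfiguration bitmap,
 * indexed by the PTE's low six bits. */
static bool example_ept_misconfig(uint64_t bad_mt_xwr, uint64_t pte)
{
	return (bad_mt_xwr >> (pte & 0x3f)) & 1;
}

For instance, with execonly false every PTE whose permission bits are 100b has its bit set, so a guest execute-only mapping is reported as a misconfiguration rather than silently accepted.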
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly);
74 76
75static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
76{ 78{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
23 * so the code in this file is compiled twice, once per pte size. 23 * so the code in this file is compiled twice, once per pte size.
24 */ 24 */
25 25
26/*
27 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
28 * uses for EPT without A/D paging type.
29 */
30extern u64 __pure __using_nonexistent_pte_bit(void)
31 __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
32
26#if PTTYPE == 64 33#if PTTYPE == 64
27 #define pt_element_t u64 34 #define pt_element_t u64
28 #define guest_walker guest_walker64 35 #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 39 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 40 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 41 #define PT_LEVEL_BITS PT64_LEVEL_BITS
42 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
43 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
44 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
45 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
35 #ifdef CONFIG_X86_64 46 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 47 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg 48 #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 60 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
50 #define PT_LEVEL_BITS PT32_LEVEL_BITS 61 #define PT_LEVEL_BITS PT32_LEVEL_BITS
51 #define PT_MAX_FULL_LEVELS 2 62 #define PT_MAX_FULL_LEVELS 2
63 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
64 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
65 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
66 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
52 #define CMPXCHG cmpxchg 67 #define CMPXCHG cmpxchg
68#elif PTTYPE == PTTYPE_EPT
69 #define pt_element_t u64
70 #define guest_walker guest_walkerEPT
71 #define FNAME(name) ept_##name
72 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
73 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
74 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
75 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
76 #define PT_LEVEL_BITS PT64_LEVEL_BITS
77 #define PT_GUEST_ACCESSED_MASK 0
78 #define PT_GUEST_DIRTY_MASK 0
79 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
80 #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
81 #define CMPXCHG cmpxchg64
82 #define PT_MAX_FULL_LEVELS 4
53#else 83#else
54 #error Invalid PTTYPE value 84 #error Invalid PTTYPE value
55#endif 85#endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
80 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 110 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
81} 111}
82 112
113static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
114{
115 unsigned mask;
116
117 /* dirty bit is not supported, so no need to track it */
118 if (!PT_GUEST_DIRTY_MASK)
119 return;
120
121 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
122
123 mask = (unsigned)~ACC_WRITE_MASK;
124 /* Allow write access to dirty gptes */
125 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
126 PT_WRITABLE_MASK;
127 *access &= mask;
128}
129
130static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
131{
132 int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
133
134 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
135 ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
136}
137
138static inline int FNAME(is_present_gpte)(unsigned long pte)
139{
140#if PTTYPE != PTTYPE_EPT
141 return is_present_gpte(pte);
142#else
143 return pte & 7;
144#endif
145}
146
83static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
84 pt_element_t __user *ptep_user, unsigned index, 148 pt_element_t __user *ptep_user, unsigned index,
85 pt_element_t orig_pte, pt_element_t new_pte) 149 pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
103 return (ret != orig_pte); 167 return (ret != orig_pte);
104} 168}
105 169
170static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
171 struct kvm_mmu_page *sp, u64 *spte,
172 u64 gpte)
173{
174 if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
175 goto no_present;
176
177 if (!FNAME(is_present_gpte)(gpte))
178 goto no_present;
179
180 /* if accessed bit is not supported prefetch non accessed gpte */
181 if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
182 goto no_present;
183
184 return false;
185
186no_present:
187 drop_spte(vcpu->kvm, spte);
188 return true;
189}
190
191static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
192{
193 unsigned access;
194#if PTTYPE == PTTYPE_EPT
195 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
196 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
197 ACC_USER_MASK;
198#else
199 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
200 access &= ~(gpte >> PT64_NX_SHIFT);
201#endif
202
203 return access;
204}
205
106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 206static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
107 struct kvm_mmu *mmu, 207 struct kvm_mmu *mmu,
108 struct guest_walker *walker, 208 struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
114 gfn_t table_gfn; 214 gfn_t table_gfn;
115 int ret; 215 int ret;
116 216
217 /* dirty/accessed bits are not supported, so no need to update them */
218 if (!PT_GUEST_DIRTY_MASK)
219 return 0;
220
117 for (level = walker->max_level; level >= walker->level; --level) { 221 for (level = walker->max_level; level >= walker->level; --level) {
118 pte = orig_pte = walker->ptes[level - 1]; 222 pte = orig_pte = walker->ptes[level - 1];
119 table_gfn = walker->table_gfn[level - 1]; 223 table_gfn = walker->table_gfn[level - 1];
120 ptep_user = walker->ptep_user[level - 1]; 224 ptep_user = walker->ptep_user[level - 1];
121 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 225 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
122 if (!(pte & PT_ACCESSED_MASK)) { 226 if (!(pte & PT_GUEST_ACCESSED_MASK)) {
123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 227 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
124 pte |= PT_ACCESSED_MASK; 228 pte |= PT_GUEST_ACCESSED_MASK;
125 } 229 }
126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { 230 if (level == walker->level && write_fault &&
231 !(pte & PT_GUEST_DIRTY_MASK)) {
127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
128 pte |= PT_DIRTY_MASK; 233 pte |= PT_GUEST_DIRTY_MASK;
129 } 234 }
130 if (pte == orig_pte) 235 if (pte == orig_pte)
131 continue; 236 continue;
@@ -170,7 +275,7 @@ retry_walk:
170 if (walker->level == PT32E_ROOT_LEVEL) { 275 if (walker->level == PT32E_ROOT_LEVEL) {
171 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 276 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
172 trace_kvm_mmu_paging_element(pte, walker->level); 277 trace_kvm_mmu_paging_element(pte, walker->level);
173 if (!is_present_gpte(pte)) 278 if (!FNAME(is_present_gpte)(pte))
174 goto error; 279 goto error;
175 --walker->level; 280 --walker->level;
176 } 281 }
@@ -179,7 +284,7 @@ retry_walk:
179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 284 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 285 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
181 286
182 accessed_dirty = PT_ACCESSED_MASK; 287 accessed_dirty = PT_GUEST_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL; 288 pt_access = pte_access = ACC_ALL;
184 ++walker->level; 289 ++walker->level;
185 290
@@ -215,17 +320,17 @@ retry_walk:
215 320
216 trace_kvm_mmu_paging_element(pte, walker->level); 321 trace_kvm_mmu_paging_element(pte, walker->level);
217 322
218 if (unlikely(!is_present_gpte(pte))) 323 if (unlikely(!FNAME(is_present_gpte)(pte)))
219 goto error; 324 goto error;
220 325
221 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 326 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
222 walker->level))) { 327 walker->level))) {
223 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 328 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
224 goto error; 329 goto error;
225 } 330 }
226 331
227 accessed_dirty &= pte; 332 accessed_dirty &= pte;
228 pte_access = pt_access & gpte_access(vcpu, pte); 333 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
229 334
230 walker->ptes[walker->level - 1] = pte; 335 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte)); 336 } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
248 walker->gfn = real_gpa >> PAGE_SHIFT; 353 walker->gfn = real_gpa >> PAGE_SHIFT;
249 354
250 if (!write_fault) 355 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 356 FNAME(protect_clean_gpte)(&pte_access, pte);
252 else 357 else
253 /* 358 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by 359 * On a write fault, fold the dirty bit into accessed_dirty.
255 * shifting it one place right. 360 * For modes without A/D bits support accessed_dirty will be
361 * always clear.
256 */ 362 */
257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); 363 accessed_dirty &= pte >>
364 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
258 365
259 if (unlikely(!accessed_dirty)) { 366 if (unlikely(!accessed_dirty)) {
260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 367 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
279 walker->fault.vector = PF_VECTOR; 386 walker->fault.vector = PF_VECTOR;
280 walker->fault.error_code_valid = true; 387 walker->fault.error_code_valid = true;
281 walker->fault.error_code = errcode; 388 walker->fault.error_code = errcode;
389
390#if PTTYPE == PTTYPE_EPT
391 /*
 392 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
 393 * misconfiguration needs to be injected. The detection is
394 * done by is_rsvd_bits_set() above.
395 *
396 * We set up the value of exit_qualification to inject:
397 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
398 * [5:3] - Calculated by the page walk of the guest EPT page tables
399 * [7:8] - Derived from [7:8] of real exit_qualification
400 *
401 * The other bits are set to 0.
402 */
403 if (!(errcode & PFERR_RSVD_MASK)) {
404 vcpu->arch.exit_qualification &= 0x187;
405 vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
406 }
407#endif
282 walker->fault.address = addr; 408 walker->fault.address = addr;
283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 409 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
284 410
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
293 access); 419 access);
294} 420}
295 421
422#if PTTYPE != PTTYPE_EPT
296static int FNAME(walk_addr_nested)(struct guest_walker *walker, 423static int FNAME(walk_addr_nested)(struct guest_walker *walker,
297 struct kvm_vcpu *vcpu, gva_t addr, 424 struct kvm_vcpu *vcpu, gva_t addr,
298 u32 access) 425 u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
300 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, 427 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
301 addr, access); 428 addr, access);
302} 429}
430#endif
303 431
304static bool 432static bool
305FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 433FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
309 gfn_t gfn; 437 gfn_t gfn;
310 pfn_t pfn; 438 pfn_t pfn;
311 439
312 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
313 return false; 441 return false;
314 442
315 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 443 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
316 444
317 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
318 pte_access = sp->role.access & gpte_access(vcpu, gpte); 446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
319 protect_clean_gpte(&pte_access, gpte); 447 FNAME(protect_clean_gpte)(&pte_access, gpte);
320 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 448 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
321 no_dirty_log && (pte_access & ACC_WRITE_MASK)); 449 no_dirty_log && (pte_access & ACC_WRITE_MASK));
322 if (is_error_pfn(pfn)) 450 if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
446 goto out_gpte_changed; 574 goto out_gpte_changed;
447 575
448 if (sp) 576 if (sp)
449 link_shadow_page(it.sptep, sp); 577 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
450 } 578 }
451 579
452 for (; 580 for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
466 594
467 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 595 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
468 true, direct_access, it.sptep); 596 true, direct_access, it.sptep);
469 link_shadow_page(it.sptep, sp); 597 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
470 } 598 }
471 599
472 clear_sp_write_flooding_count(it.sptep); 600 clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
727 return gpa; 855 return gpa;
728} 856}
729 857
858#if PTTYPE != PTTYPE_EPT
730static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 859static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
731 u32 access, 860 u32 access,
732 struct x86_exception *exception) 861 struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
745 874
746 return gpa; 875 return gpa;
747} 876}
877#endif
748 878
749/* 879/*
750 * Using the cached information from sp->gfns is safe because: 880 * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
785 sizeof(pt_element_t))) 915 sizeof(pt_element_t)))
786 return -EINVAL; 916 return -EINVAL;
787 917
788 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 918 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
789 vcpu->kvm->tlbs_dirty++; 919 vcpu->kvm->tlbs_dirty++;
790 continue; 920 continue;
791 } 921 }
792 922
793 gfn = gpte_to_gfn(gpte); 923 gfn = gpte_to_gfn(gpte);
794 pte_access = sp->role.access; 924 pte_access = sp->role.access;
795 pte_access &= gpte_access(vcpu, gpte); 925 pte_access &= FNAME(gpte_access)(vcpu, gpte);
796 protect_clean_gpte(&pte_access, gpte); 926 FNAME(protect_clean_gpte)(&pte_access, gpte);
797 927
798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 928 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present)) 929 &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
830#undef gpte_to_gfn 960#undef gpte_to_gfn
831#undef gpte_to_gfn_lvl 961#undef gpte_to_gfn_lvl
832#undef CMPXCHG 962#undef CMPXCHG
963#undef PT_GUEST_ACCESSED_MASK
964#undef PT_GUEST_DIRTY_MASK
965#undef PT_GUEST_DIRTY_SHIFT
966#undef PT_GUEST_ACCESSED_SHIFT
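One detail worth calling out from the EPT variant above: an EPT entry has no dedicated present bit; it is present when any of its read, write or execute bits (bits 0-2) is set, which is why ept_is_present_gpte() reduces to pte & 7, and the access computation maps the EPT W and X bits onto the ACC_* masks with user access always implied. A small sketch of that mapping (the mask values are assumptions mirroring the kvm/mmu.c definitions, not part of the patch):

#include <stdint.h>

#define EPT_R	(1ull << 0)
#define EPT_W	(1ull << 1)
#define EPT_X	(1ull << 2)

/* Illustrative only: EPT flavour of gpte_access(); ACC_EXEC/WRITE/USER
 * are assumed to be bits 0, 1 and 2 as in kvm/mmu.c. */
static unsigned int example_ept_gpte_access(uint64_t gpte)
{
	unsigned int access = 1u << 2;		/* user access always implied */

	if (gpte & EPT_W)
		access |= 1u << 1;		/* ACC_WRITE_MASK analogue */
	if (gpte & EPT_X)
		access |= 1u << 0;		/* ACC_EXEC_MASK analogue */

	return access;
}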
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
160 160
161static void reprogram_counter(struct kvm_pmc *pmc, u32 type, 161static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
162 unsigned config, bool exclude_user, bool exclude_kernel, 162 unsigned config, bool exclude_user, bool exclude_kernel,
163 bool intr) 163 bool intr, bool in_tx, bool in_tx_cp)
164{ 164{
165 struct perf_event *event; 165 struct perf_event *event;
166 struct perf_event_attr attr = { 166 struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
173 .exclude_kernel = exclude_kernel, 173 .exclude_kernel = exclude_kernel,
174 .config = config, 174 .config = config,
175 }; 175 };
176 if (in_tx)
177 attr.config |= HSW_IN_TX;
178 if (in_tx_cp)
179 attr.config |= HSW_IN_TX_CHECKPOINTED;
176 180
177 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); 181 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178 182
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
226 230
227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 231 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228 ARCH_PERFMON_EVENTSEL_INV | 232 ARCH_PERFMON_EVENTSEL_INV |
229 ARCH_PERFMON_EVENTSEL_CMASK))) { 233 ARCH_PERFMON_EVENTSEL_CMASK |
234 HSW_IN_TX |
235 HSW_IN_TX_CHECKPOINTED))) {
230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 236 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231 unit_mask); 237 unit_mask);
232 if (config != PERF_COUNT_HW_MAX) 238 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
239 reprogram_counter(pmc, type, config, 245 reprogram_counter(pmc, type, config,
240 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 246 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 247 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242 eventsel & ARCH_PERFMON_EVENTSEL_INT); 248 eventsel & ARCH_PERFMON_EVENTSEL_INT,
249 (eventsel & HSW_IN_TX),
250 (eventsel & HSW_IN_TX_CHECKPOINTED));
243} 251}
244 252
245static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 253static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
256 arch_events[fixed_pmc_events[idx]].event_type, 264 arch_events[fixed_pmc_events[idx]].event_type,
257 !(en & 0x2), /* exclude user */ 265 !(en & 0x2), /* exclude user */
258 !(en & 0x1), /* exclude kernel */ 266 !(en & 0x1), /* exclude kernel */
259 pmi); 267 pmi, false, false);
260} 268}
261 269
262static inline u8 fixed_en_pmi(u64 ctrl, int idx) 270static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 416 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
409 if (data == pmc->eventsel) 417 if (data == pmc->eventsel)
410 return 0; 418 return 0;
411 if (!(data & 0xffffffff00200000ull)) { 419 if (!(data & pmu->reserved_bits)) {
412 reprogram_gp_counter(pmc, data); 420 reprogram_gp_counter(pmc, data);
413 return 0; 421 return 0;
414 } 422 }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
450 pmu->counter_bitmask[KVM_PMC_GP] = 0; 458 pmu->counter_bitmask[KVM_PMC_GP] = 0;
451 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 459 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
452 pmu->version = 0; 460 pmu->version = 0;
461 pmu->reserved_bits = 0xffffffff00200000ull;
453 462
454 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 463 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
455 if (!entry) 464 if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
478 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 487 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
479 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 488 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
480 pmu->global_ctrl_mask = ~pmu->global_ctrl; 489 pmu->global_ctrl_mask = ~pmu->global_ctrl;
490
491 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
492 if (entry &&
493 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
494 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
495 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
481} 496}
482 497
483void kvm_pmu_init(struct kvm_vcpu *vcpu) 498void kvm_pmu_init(struct kvm_vcpu *vcpu)
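The PMU change above keeps a per-guest reserved_bits mask for the event-select MSRs: HSW_IN_TX (bit 32) and HSW_IN_TX_CHECKPOINTED (bit 33) start out reserved, and exposing HLE/RTM through CPUID clears them from the mask, so a guest write is accepted only when no remaining reserved bit is set. A compact sketch of that check (the bit positions are assumed from perf_event.h; the helper is illustrative, not the patch's code):

#include <stdint.h>
#include <stdbool.h>

#define EXAMPLE_HSW_IN_TX		(1ull << 32)
#define EXAMPLE_HSW_IN_TX_CHECKPOINTED	(1ull << 33)

/* Illustrative only: decide whether a guest eventsel write may be
 * programmed, mirroring the reserved_bits logic added above. */
static bool example_eventsel_allowed(uint64_t data, bool guest_has_tsx)
{
	uint64_t reserved = 0xffffffff00200000ull;	/* default mask */

	if (guest_has_tsx)
		reserved ^= EXAMPLE_HSW_IN_TX | EXAMPLE_HSW_IN_TX_CHECKPOINTED;

	return (data & reserved) == 0;
}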
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..1f1da43ff2a2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
373 * we must keep them pinned while L2 runs. 373 * we must keep them pinned while L2 runs.
374 */ 374 */
375 struct page *apic_access_page; 375 struct page *apic_access_page;
376 u64 msr_ia32_feature_control;
376}; 377};
377 378
378#define POSTED_INTR_ON 0 379#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
711 kvm_release_page_clean(page); 712 kvm_release_page_clean(page);
712} 713}
713 714
715static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
714static u64 construct_eptp(unsigned long root_hpa); 716static u64 construct_eptp(unsigned long root_hpa);
715static void kvm_cpu_vmxon(u64 addr); 717static void kvm_cpu_vmxon(u64 addr);
716static void kvm_cpu_vmxoff(void); 718static void kvm_cpu_vmxoff(void);
717static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
718static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 719static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
719static void vmx_set_segment(struct kvm_vcpu *vcpu, 720static void vmx_set_segment(struct kvm_vcpu *vcpu,
720 struct kvm_segment *var, int seg); 721 struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1039 (vmcs12->secondary_vm_exec_control & bit); 1040 (vmcs12->secondary_vm_exec_control & bit);
1040} 1041}
1041 1042
1042static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, 1043static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1043 struct kvm_vcpu *vcpu)
1044{ 1044{
1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1046} 1046}
1047 1047
1048static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1049{
1050 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1051}
1052
1048static inline bool is_exception(u32 intr_info) 1053static inline bool is_exception(u32 intr_info)
1049{ 1054{
1050 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1055 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2155static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2160static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2156static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2161static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2157static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2162static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2163static u32 nested_vmx_ept_caps;
2158static __init void nested_vmx_setup_ctls_msrs(void) 2164static __init void nested_vmx_setup_ctls_msrs(void)
2159{ 2165{
2160 /* 2166 /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2190 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2196 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2191 * 17 must be 1. 2197 * 17 must be 1.
2192 */ 2198 */
2199 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2200 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2193 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2201 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2194 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2202 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2203 nested_vmx_exit_ctls_high &=
2195#ifdef CONFIG_X86_64 2204#ifdef CONFIG_X86_64
2196 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2205 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2197#else
2198 nested_vmx_exit_ctls_high = 0;
2199#endif 2206#endif
2200 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER);
2201 2210
2202 /* entry controls */ 2211 /* entry controls */
2203 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2205 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2214 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2206 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2215 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2207 nested_vmx_entry_ctls_high &= 2216 nested_vmx_entry_ctls_high &=
2208 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2217#ifdef CONFIG_X86_64
2209 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2218 VM_ENTRY_IA32E_MODE |
2219#endif
2220 VM_ENTRY_LOAD_IA32_PAT;
2221 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2222 VM_ENTRY_LOAD_IA32_EFER);
2210 2223
2211 /* cpu-based controls */ 2224 /* cpu-based controls */
2212 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2225 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2242 SECONDARY_EXEC_WBINVD_EXITING; 2255 SECONDARY_EXEC_WBINVD_EXITING;
2243 2256
2257 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /*
 6264 * Since invept is completely emulated, we support both global
 6265 * and context invalidation independently of what the host CPU
 6266 * supports.
2267 */
2268 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2269 VMX_EPT_EXTENT_CONTEXT_BIT;
2270 } else
2271 nested_vmx_ept_caps = 0;
2272
2244 /* miscellaneous data */ 2273 /* miscellaneous data */
2245 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2274 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2246 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2275 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2282 2311
2283 switch (msr_index) { 2312 switch (msr_index) {
2284 case MSR_IA32_FEATURE_CONTROL: 2313 case MSR_IA32_FEATURE_CONTROL:
2285 *pdata = 0; 2314 if (nested_vmx_allowed(vcpu)) {
2286 break; 2315 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2316 break;
2317 }
2318 return 0;
2287 case MSR_IA32_VMX_BASIC: 2319 case MSR_IA32_VMX_BASIC:
2288 /* 2320 /*
2289 * This MSR reports some information about VMX support. We 2321 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2346 nested_vmx_secondary_ctls_high); 2378 nested_vmx_secondary_ctls_high);
2347 break; 2379 break;
2348 case MSR_IA32_VMX_EPT_VPID_CAP: 2380 case MSR_IA32_VMX_EPT_VPID_CAP:
2349 /* Currently, no nested ept or nested vpid */ 2381 /* Currently, no nested vpid support */
2350 *pdata = 0; 2382 *pdata = nested_vmx_ept_caps;
2351 break; 2383 break;
2352 default: 2384 default:
2353 return 0; 2385 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2356 return 1; 2388 return 1;
2357} 2389}
2358 2390
2359static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2391static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2360{ 2392{
2393 u32 msr_index = msr_info->index;
2394 u64 data = msr_info->data;
2395 bool host_initialized = msr_info->host_initiated;
2396
2361 if (!nested_vmx_allowed(vcpu)) 2397 if (!nested_vmx_allowed(vcpu))
2362 return 0; 2398 return 0;
2363 2399
2364 if (msr_index == MSR_IA32_FEATURE_CONTROL) 2400 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2365 /* TODO: the right thing. */ 2401 if (!host_initialized &&
2402 to_vmx(vcpu)->nested.msr_ia32_feature_control
2403 & FEATURE_CONTROL_LOCKED)
2404 return 0;
2405 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2366 return 1; 2406 return 1;
2407 }
2408
2367 /* 2409 /*
2368 * No need to treat VMX capability MSRs specially: If we don't handle 2410 * No need to treat VMX capability MSRs specially: If we don't handle
2369 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) 2411 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2494 return 1; 2536 return 1;
2495 /* Otherwise falls through */ 2537 /* Otherwise falls through */
2496 default: 2538 default:
2497 if (vmx_set_vmx_msr(vcpu, msr_index, data)) 2539 if (vmx_set_vmx_msr(vcpu, msr_info))
2498 break; 2540 break;
2499 msr = find_msr_entry(vmx, msr_index); 2541 msr = find_msr_entry(vmx, msr_index);
2500 if (msr) { 2542 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
5302 5344
5303 /* Is it a write fault? */ 5345 /* Is it a write fault? */
5304 error_code = exit_qualification & (1U << 1); 5346 error_code = exit_qualification & (1U << 1);
 5347 /* Is it a fetch fault? */
5348 error_code |= (exit_qualification & (1U << 2)) << 2;
5305 /* ept page table is present? */ 5349 /* ept page table is present? */
5306 error_code |= (exit_qualification >> 3) & 0x1; 5350 error_code |= (exit_qualification >> 3) & 0x1;
5307 5351
5352 vcpu->arch.exit_qualification = exit_qualification;
5353
5308 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5354 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5309} 5355}
5310 5356
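For reference, handle_ept_violation above folds three bits of the EPT exit qualification into a page-fault style error code (write, fetch, present). A standalone sketch of just that bit shuffling, using the shift amounts from the hunk; the sample input value is made up.

#include <stdint.h>
#include <stdio.h>

/*
 * Exit qualification bit 1 = write access, bit 2 = instruction fetch,
 * bit 3 = translation was present in the EPT tables.  The result uses
 * the page-fault error-code layout: bit 0 present, bit 1 write, bit 4 fetch.
 */
static uint32_t ept_error_code(uint64_t exit_qualification)
{
	uint32_t error_code;

	error_code  = exit_qualification & (1U << 1);		/* write   -> bit 1 */
	error_code |= (exit_qualification & (1U << 2)) << 2;	/* fetch   -> bit 4 */
	error_code |= (exit_qualification >> 3) & 0x1;		/* present -> bit 0 */
	return error_code;
}

int main(void)
{
	/* write to a present translation: qualification bits 1 and 3 set */
	printf("%#x\n", ept_error_code((1 << 1) | (1 << 3)));	/* prints 0x3 */
	return 0;
}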
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5438 5484
5439 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5485 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5440 5486
5441 if (err == EMULATE_DO_MMIO) { 5487 if (err == EMULATE_USER_EXIT) {
5488 ++vcpu->stat.mmio_exits;
5442 ret = 0; 5489 ret = 0;
5443 goto out; 5490 goto out;
5444 } 5491 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5567 free_loaded_vmcs(&vmx->vmcs01); 5614 free_loaded_vmcs(&vmx->vmcs01);
5568} 5615}
5569 5616
5617/*
5618 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5619 * set the success or error code of an emulated VMX instruction, as specified
5620 * by Vol 2B, VMX Instruction Reference, "Conventions".
5621 */
5622static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5623{
5624 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5625 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5626 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5627}
5628
5629static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5630{
5631 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5632 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5633 X86_EFLAGS_SF | X86_EFLAGS_OF))
5634 | X86_EFLAGS_CF);
5635}
5636
5570static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5637static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5571 u32 vm_instruction_error); 5638 u32 vm_instruction_error)
5639{
5640 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5641 /*
5642 * failValid writes the error number to the current VMCS, which
 5643 * can't be done if there isn't a current VMCS.
5644 */
5645 nested_vmx_failInvalid(vcpu);
5646 return;
5647 }
5648 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5649 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5650 X86_EFLAGS_SF | X86_EFLAGS_OF))
5651 | X86_EFLAGS_ZF);
5652 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5653 /*
5654 * We don't need to force a shadow sync because
5655 * VM_INSTRUCTION_ERROR is not shadowed
5656 */
5657}
5572 5658
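The three helpers moved above implement the RFLAGS convention for emulated VMX instructions: VMsucceed clears all six arithmetic flags, VMfailInvalid leaves only CF set, and VMfailValid leaves only ZF set (plus the error number written to the current VMCS). A small sketch of the net flag arithmetic on a plain integer, using the standard EFLAGS bit values:

#include <stdio.h>

#define X86_EFLAGS_CF 0x0001
#define X86_EFLAGS_PF 0x0004
#define X86_EFLAGS_AF 0x0010
#define X86_EFLAGS_ZF 0x0040
#define X86_EFLAGS_SF 0x0080
#define X86_EFLAGS_OF 0x0800

#define VMX_ARITH_FLAGS (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
			 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static unsigned long vmsucceed(unsigned long rflags)
{
	return rflags & ~VMX_ARITH_FLAGS;			/* all six flags cleared */
}

static unsigned long vmfail_invalid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_CF;	/* only CF set */
}

static unsigned long vmfail_valid(unsigned long rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_ZF;	/* only ZF set */
}

int main(void)
{
	unsigned long rflags = 0x8d5;	/* arbitrary starting value */

	printf("succeed:      %#lx\n", vmsucceed(rflags));
	printf("fail invalid: %#lx\n", vmfail_invalid(rflags));
	printf("fail valid:   %#lx\n", vmfail_valid(rflags));
	return 0;
}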
5573/* 5659/*
5574 * Emulate the VMXON instruction. 5660 * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5583 struct kvm_segment cs; 5669 struct kvm_segment cs;
5584 struct vcpu_vmx *vmx = to_vmx(vcpu); 5670 struct vcpu_vmx *vmx = to_vmx(vcpu);
5585 struct vmcs *shadow_vmcs; 5671 struct vmcs *shadow_vmcs;
5672 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5673 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5586 5674
5587 /* The Intel VMX Instruction Reference lists a bunch of bits that 5675 /* The Intel VMX Instruction Reference lists a bunch of bits that
5588 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5676 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5611 skip_emulated_instruction(vcpu); 5699 skip_emulated_instruction(vcpu);
5612 return 1; 5700 return 1;
5613 } 5701 }
5702
5703 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5704 != VMXON_NEEDED_FEATURES) {
5705 kvm_inject_gp(vcpu, 0);
5706 return 1;
5707 }
5708
5614 if (enable_shadow_vmcs) { 5709 if (enable_shadow_vmcs) {
5615 shadow_vmcs = alloc_vmcs(); 5710 shadow_vmcs = alloc_vmcs();
5616 if (!shadow_vmcs) 5711 if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5628 vmx->nested.vmxon = true; 5723 vmx->nested.vmxon = true;
5629 5724
5630 skip_emulated_instruction(vcpu); 5725 skip_emulated_instruction(vcpu);
5726 nested_vmx_succeed(vcpu);
5631 return 1; 5727 return 1;
5632} 5728}
5633 5729
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5712 return 1; 5808 return 1;
5713 free_nested(to_vmx(vcpu)); 5809 free_nested(to_vmx(vcpu));
5714 skip_emulated_instruction(vcpu); 5810 skip_emulated_instruction(vcpu);
5811 nested_vmx_succeed(vcpu);
5715 return 1; 5812 return 1;
5716} 5813}
5717 5814
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5768 return 0; 5865 return 0;
5769} 5866}
5770 5867
5771/*
5772 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5773 * set the success or error code of an emulated VMX instruction, as specified
5774 * by Vol 2B, VMX Instruction Reference, "Conventions".
5775 */
5776static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5777{
5778 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5779 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5780 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5781}
5782
5783static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5784{
5785 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5786 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5787 X86_EFLAGS_SF | X86_EFLAGS_OF))
5788 | X86_EFLAGS_CF);
5789}
5790
5791static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5792 u32 vm_instruction_error)
5793{
5794 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5795 /*
5796 * failValid writes the error number to the current VMCS, which
5797 * can't be done if there isn't a current VMCS.
5798 */
5799 nested_vmx_failInvalid(vcpu);
5800 return;
5801 }
5802 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5803 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5804 X86_EFLAGS_SF | X86_EFLAGS_OF))
5805 | X86_EFLAGS_ZF);
5806 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5807 /*
5808 * We don't need to force a shadow sync because
5809 * VM_INSTRUCTION_ERROR is not shadowed
5810 */
5811}
5812
5813/* Emulate the VMCLEAR instruction */ 5868/* Emulate the VMCLEAR instruction */
5814static int handle_vmclear(struct kvm_vcpu *vcpu) 5869static int handle_vmclear(struct kvm_vcpu *vcpu)
5815{ 5870{
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5972 unsigned long field; 6027 unsigned long field;
5973 u64 field_value; 6028 u64 field_value;
5974 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6029 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5975 unsigned long *fields = (unsigned long *)shadow_read_write_fields; 6030 const unsigned long *fields = shadow_read_write_fields;
5976 int num_fields = max_shadow_read_write_fields; 6031 const int num_fields = max_shadow_read_write_fields;
5977 6032
5978 vmcs_load(shadow_vmcs); 6033 vmcs_load(shadow_vmcs);
5979 6034
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6002 6057
6003static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6058static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6004{ 6059{
6005 unsigned long *fields[] = { 6060 const unsigned long *fields[] = {
6006 (unsigned long *)shadow_read_write_fields, 6061 shadow_read_write_fields,
6007 (unsigned long *)shadow_read_only_fields 6062 shadow_read_only_fields
6008 }; 6063 };
6009 int num_lists = ARRAY_SIZE(fields); 6064 const int max_fields[] = {
6010 int max_fields[] = {
6011 max_shadow_read_write_fields, 6065 max_shadow_read_write_fields,
6012 max_shadow_read_only_fields 6066 max_shadow_read_only_fields
6013 }; 6067 };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6018 6072
6019 vmcs_load(shadow_vmcs); 6073 vmcs_load(shadow_vmcs);
6020 6074
6021 for (q = 0; q < num_lists; q++) { 6075 for (q = 0; q < ARRAY_SIZE(fields); q++) {
6022 for (i = 0; i < max_fields[q]; i++) { 6076 for (i = 0; i < max_fields[q]; i++) {
6023 field = fields[q][i]; 6077 field = fields[q][i];
6024 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6078 vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6248 return 1; 6302 return 1;
6249} 6303}
6250 6304
6305/* Emulate the INVEPT instruction */
6306static int handle_invept(struct kvm_vcpu *vcpu)
6307{
6308 u32 vmx_instruction_info, types;
6309 unsigned long type;
6310 gva_t gva;
6311 struct x86_exception e;
6312 struct {
6313 u64 eptp, gpa;
6314 } operand;
6315 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6316
6317 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6318 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6319 kvm_queue_exception(vcpu, UD_VECTOR);
6320 return 1;
6321 }
6322
6323 if (!nested_vmx_check_permission(vcpu))
6324 return 1;
6325
6326 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6327 kvm_queue_exception(vcpu, UD_VECTOR);
6328 return 1;
6329 }
6330
6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6332 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6333
6334 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6335
6336 if (!(types & (1UL << type))) {
6337 nested_vmx_failValid(vcpu,
6338 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6339 return 1;
6340 }
6341
6342 /* According to the Intel VMX instruction reference, the memory
6343 * operand is read even if it isn't needed (e.g., for type==global)
6344 */
6345 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6346 vmx_instruction_info, &gva))
6347 return 1;
6348 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6349 sizeof(operand), &e)) {
6350 kvm_inject_page_fault(vcpu, &e);
6351 return 1;
6352 }
6353
6354 switch (type) {
6355 case VMX_EPT_EXTENT_CONTEXT:
6356 if ((operand.eptp & eptp_mask) !=
6357 (nested_ept_get_cr3(vcpu) & eptp_mask))
6358 break;
6359 case VMX_EPT_EXTENT_GLOBAL:
6360 kvm_mmu_sync_roots(vcpu);
6361 kvm_mmu_flush_tlb(vcpu);
6362 nested_vmx_succeed(vcpu);
6363 break;
6364 default:
6365 BUG_ON(1);
6366 break;
6367 }
6368
6369 skip_emulated_instruction(vcpu);
6370 return 1;
6371}
6372
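The type check in handle_invept is terse: the supported-extent bits of the EPT/VPID capability word are shifted down so that they line up with the INVEPT type encodings (1 = single-context, 2 = global). A sketch of that test; the shift and bit positions follow the usual vmx.h definitions and are restated here as assumptions, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* assumed encodings, mirroring the Intel SDM / vmx.h definitions */
#define VMX_EPT_EXTENT_CONTEXT		1
#define VMX_EPT_EXTENT_GLOBAL		2
#define VMX_EPT_EXTENT_SHIFT		24
#define VMX_EPT_EXTENT_CONTEXT_BIT	(1ULL << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT	(1ULL << 26)

static int invept_type_supported(uint64_t ept_caps, unsigned long type)
{
	/* bits 25/26 of the caps word become bits 1/2 of "types",
	 * matching the 1/2 encoding of the INVEPT type operand */
	uint32_t types = (ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	return !!(types & (1UL << type));
}

int main(void)
{
	uint64_t caps = VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT;

	printf("global:  %d\n", invept_type_supported(caps, VMX_EPT_EXTENT_GLOBAL));
	printf("context: %d\n", invept_type_supported(caps, VMX_EPT_EXTENT_CONTEXT));
	printf("type 0:  %d\n", invept_type_supported(caps, 0));	/* unsupported */
	return 0;
}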
6251/* 6373/*
6252 * The exit handlers return 1 if the exit was handled fully and guest execution 6374 * The exit handlers return 1 if the exit was handled fully and guest execution
6253 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6375 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6292 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6414 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6293 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6415 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
6294 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6416 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6417 [EXIT_REASON_INVEPT] = handle_invept,
6295}; 6418};
6296 6419
6297static const int kvm_vmx_max_exit_handlers = 6420static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6518 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 6641 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
6519 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 6642 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
6520 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6643 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6644 case EXIT_REASON_INVEPT:
6521 /* 6645 /*
6522 * VMX instructions trap unconditionally. This allows L1 to 6646 * VMX instructions trap unconditionally. This allows L1 to
6523 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6647 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6550 return nested_cpu_has2(vmcs12, 6674 return nested_cpu_has2(vmcs12,
6551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 6675 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
6552 case EXIT_REASON_EPT_VIOLATION: 6676 case EXIT_REASON_EPT_VIOLATION:
6677 /*
6678 * L0 always deals with the EPT violation. If nested EPT is
6679 * used, and the nested mmu code discovers that the address is
6680 * missing in the guest EPT table (EPT12), the EPT violation
6681 * will be injected with nested_ept_inject_page_fault()
6682 */
6683 return 0;
6553 case EXIT_REASON_EPT_MISCONFIG: 6684 case EXIT_REASON_EPT_MISCONFIG:
6685 /*
 6686 * L2 never directly uses L1's EPT, but rather L0's own EPT
 6687 * table (shadow on EPT) or a merged EPT table that L0 built
 6688 * (EPT on EPT). So any problems with the structure of the
 6689 * table are L0's fault.
6690 */
6554 return 0; 6691 return 0;
6555 case EXIT_REASON_PREEMPTION_TIMER: 6692 case EXIT_REASON_PREEMPTION_TIMER:
6556 return vmcs12->pin_based_vm_exec_control & 6693 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6638 6775
6639 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6776 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6640 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6777 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6641 get_vmcs12(vcpu), vcpu)))) { 6778 get_vmcs12(vcpu))))) {
6642 if (vmx_interrupt_allowed(vcpu)) { 6779 if (vmx_interrupt_allowed(vcpu)) {
6643 vmx->soft_vnmi_blocked = 0; 6780 vmx->soft_vnmi_blocked = 0;
6644 } else if (vmx->vnmi_blocked_time > 1000000000LL && 6781 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7326 entry->ecx |= bit(X86_FEATURE_VMX); 7463 entry->ecx |= bit(X86_FEATURE_VMX);
7327} 7464}
7328 7465
7466static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7467 struct x86_exception *fault)
7468{
7469 struct vmcs12 *vmcs12;
7470 nested_vmx_vmexit(vcpu);
7471 vmcs12 = get_vmcs12(vcpu);
7472
7473 if (fault->error_code & PFERR_RSVD_MASK)
7474 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
7475 else
7476 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
7477 vmcs12->exit_qualification = vcpu->arch.exit_qualification;
7478 vmcs12->guest_physical_address = fault->address;
7479}
7480
7481/* Callbacks for nested_ept_init_mmu_context: */
7482
7483static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7484{
7485 /* return the page table to be shadowed - in our case, EPT12 */
7486 return get_vmcs12(vcpu)->ept_pointer;
7487}
7488
7489static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7490{
7491 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7492 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7493
7494 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
7495 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
7496 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7497
7498 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7499
7500 return r;
7501}
7502
7503static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7504{
7505 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7506}
7507
7329/* 7508/*
7330 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7509 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7331 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7510 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7388 vmcs12->guest_interruptibility_info); 7567 vmcs12->guest_interruptibility_info);
7389 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7568 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
7390 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7569 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
7391 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7570 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
7392 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
7393 vmcs12->guest_pending_dbg_exceptions); 7572 vmcs12->guest_pending_dbg_exceptions);
7394 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 7573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7508 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 7687 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
7509 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 7688 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7510 7689
7511 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ 7690 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
7512 vmcs_write32(VM_EXIT_CONTROLS, 7691 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7513 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); 7692 * bits are further modified by vmx_set_efer() below.
7514 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | 7693 */
7694 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
7695
7696 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7697 * emulated by vmx_set_efer(), below.
7698 */
7699 vmcs_write32(VM_ENTRY_CONTROLS,
7700 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7701 ~VM_ENTRY_IA32E_MODE) |
7515 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7702 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
7516 7703
7517 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) 7704 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
7518 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 7705 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
7519 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 7706 vcpu->arch.pat = vmcs12->guest_ia32_pat;
7707 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
7520 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 7708 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
7521 7709
7522 7710
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7538 vmx_flush_tlb(vcpu); 7726 vmx_flush_tlb(vcpu);
7539 } 7727 }
7540 7728
7729 if (nested_cpu_has_ept(vmcs12)) {
7730 kvm_mmu_unload(vcpu);
7731 nested_ept_init_mmu_context(vcpu);
7732 }
7733
7541 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7734 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7542 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7735 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7543 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7736 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7565 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7758 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7566 kvm_mmu_reset_context(vcpu); 7759 kvm_mmu_reset_context(vcpu);
7567 7760
7761 /*
 7762 * L1 may access L2's PDPTRs, so save them to construct vmcs12
7763 */
7764 if (enable_ept) {
7765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
7766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
7767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
7768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
7769 }
7770
7568 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 7771 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
7569 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 7772 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
7570} 7773}
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7887 vmcs12->guest_pending_dbg_exceptions = 8090 vmcs12->guest_pending_dbg_exceptions =
7888 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8091 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7889 8092
8093 /*
8094 * In some cases (usually, nested EPT), L2 is allowed to change its
8095 * own CR3 without exiting. If it has changed it, we must keep it.
8096 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
8097 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
8098 *
8099 * Additionally, restore L2's PDPTR to vmcs12.
8100 */
8101 if (enable_ept) {
8102 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
8103 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
8104 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
8105 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
8106 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8107 }
8108
7890 vmcs12->vm_entry_controls = 8109 vmcs12->vm_entry_controls =
7891 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8110 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7892 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8111 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7948static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8167static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7949 struct vmcs12 *vmcs12) 8168 struct vmcs12 *vmcs12)
7950{ 8169{
8170 struct kvm_segment seg;
8171
7951 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8172 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7952 vcpu->arch.efer = vmcs12->host_ia32_efer; 8173 vcpu->arch.efer = vmcs12->host_ia32_efer;
7953 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8174 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8203 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7983 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8204 kvm_set_cr4(vcpu, vmcs12->host_cr4);
7984 8205
7985 /* shadow page tables on either EPT or shadow page tables */ 8206 if (nested_cpu_has_ept(vmcs12))
8207 nested_ept_uninit_mmu_context(vcpu);
8208
7986 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8209 kvm_set_cr3(vcpu, vmcs12->host_cr3);
7987 kvm_mmu_reset_context(vcpu); 8210 kvm_mmu_reset_context(vcpu);
7988 8211
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8001 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8224 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
8002 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8225 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
8003 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8226 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
8004 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); 8227
8005 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); 8228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
8006 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
8007 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
8008 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
8009 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
8010 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
8011 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
8012 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
8013 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
8014
8015 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
8016 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8229 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
8230 vcpu->arch.pat = vmcs12->host_ia32_pat;
8231 }
8017 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8232 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8018 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8233 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
8019 vmcs12->host_ia32_perf_global_ctrl); 8234 vmcs12->host_ia32_perf_global_ctrl);
8020 8235
8236 /* Set L1 segment info according to Intel SDM
8237 27.5.2 Loading Host Segment and Descriptor-Table Registers */
8238 seg = (struct kvm_segment) {
8239 .base = 0,
8240 .limit = 0xFFFFFFFF,
8241 .selector = vmcs12->host_cs_selector,
8242 .type = 11,
8243 .present = 1,
8244 .s = 1,
8245 .g = 1
8246 };
8247 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
8248 seg.l = 1;
8249 else
8250 seg.db = 1;
8251 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
8252 seg = (struct kvm_segment) {
8253 .base = 0,
8254 .limit = 0xFFFFFFFF,
8255 .type = 3,
8256 .present = 1,
8257 .s = 1,
8258 .db = 1,
8259 .g = 1
8260 };
8261 seg.selector = vmcs12->host_ds_selector;
8262 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
8263 seg.selector = vmcs12->host_es_selector;
8264 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
8265 seg.selector = vmcs12->host_ss_selector;
8266 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
8267 seg.selector = vmcs12->host_fs_selector;
8268 seg.base = vmcs12->host_fs_base;
8269 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
8270 seg.selector = vmcs12->host_gs_selector;
8271 seg.base = vmcs12->host_gs_base;
8272 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
8273 seg = (struct kvm_segment) {
8274 .base = vmcs12->host_tr_base,
8275 .limit = 0x67,
8276 .selector = vmcs12->host_tr_selector,
8277 .type = 11,
8278 .present = 1
8279 };
8280 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
8281
8021 kvm_set_dr(vcpu, 7, 0x400); 8282 kvm_set_dr(vcpu, 7, 0x400);
8022 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8283 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
8023} 8284}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
682 */ 682 */
683 } 683 }
684 684
685 /*
686 * Does the new cr3 value map to physical memory? (Note, we
687 * catch an invalid cr3 even in real-mode, because it would
688 * cause trouble later on when we turn on paging anyway.)
689 *
690 * A real CPU would silently accept an invalid cr3 and would
691 * attempt to use it - with largely undefined (and often hard
692 * to debug) behavior on the guest side.
693 */
694 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
695 return 1;
696 vcpu->arch.cr3 = cr3; 685 vcpu->arch.cr3 = cr3;
697 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
698 vcpu->arch.mmu.new_cr3(vcpu); 687 vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
850#ifdef CONFIG_X86_64 839#ifdef CONFIG_X86_64
851 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 840 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
852#endif 841#endif
853 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 842 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
843 MSR_IA32_FEATURE_CONTROL
854}; 844};
855 845
856static unsigned num_msrs_to_save; 846static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1457#endif 1447#endif
1458} 1448}
1459 1449
1450static void kvm_gen_update_masterclock(struct kvm *kvm)
1451{
1452#ifdef CONFIG_X86_64
1453 int i;
1454 struct kvm_vcpu *vcpu;
1455 struct kvm_arch *ka = &kvm->arch;
1456
1457 spin_lock(&ka->pvclock_gtod_sync_lock);
1458 kvm_make_mclock_inprogress_request(kvm);
1459 /* no guest entries from this point */
1460 pvclock_update_vm_gtod_copy(kvm);
1461
1462 kvm_for_each_vcpu(i, vcpu, kvm)
1463 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1464
1465 /* guest entries allowed */
1466 kvm_for_each_vcpu(i, vcpu, kvm)
1467 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1468
1469 spin_unlock(&ka->pvclock_gtod_sync_lock);
1470#endif
1471}
1472
1460static int kvm_guest_time_update(struct kvm_vcpu *v) 1473static int kvm_guest_time_update(struct kvm_vcpu *v)
1461{ 1474{
1462 unsigned long flags, this_tsc_khz; 1475 unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3806 delta = user_ns.clock - now_ns; 3819 delta = user_ns.clock - now_ns;
3807 local_irq_enable(); 3820 local_irq_enable();
3808 kvm->arch.kvmclock_offset = delta; 3821 kvm->arch.kvmclock_offset = delta;
3822 kvm_gen_update_masterclock(kvm);
3809 break; 3823 break;
3810 } 3824 }
3811 case KVM_GET_CLOCK: { 3825 case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4955static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4969static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4956static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4970static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4957 4971
4972static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
4973 unsigned long *db)
4974{
4975 u32 dr6 = 0;
4976 int i;
4977 u32 enable, rwlen;
4978
4979 enable = dr7;
4980 rwlen = dr7 >> 16;
4981 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
4982 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
4983 dr6 |= (1 << i);
4984 return dr6;
4985}
4986
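kvm_vcpu_check_hw_bp above walks the DR7 layout directly: two enable bits (local/global) per breakpoint in the low byte, and one 4-bit R/W+LEN field per breakpoint starting at bit 16, where type 0 with length 0 means an instruction breakpoint. A standalone sketch of the same matching loop, with illustrative register values only:

#include <stdint.h>
#include <stdio.h>

/* Return a DR6-style hit mask: bit i is set if breakpoint i is enabled,
 * has the requested R/W+LEN type, and matches addr. */
static uint32_t check_hw_bp(unsigned long addr, uint32_t type, uint32_t dr7,
			    const unsigned long *db)
{
	uint32_t dr6 = 0, enable = dr7, rwlen = dr7 >> 16;
	int i;

	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
			dr6 |= (1 << i);
	return dr6;
}

int main(void)
{
	unsigned long db[4] = { 0x401000, 0, 0, 0 };
	uint32_t dr7 = 0x1;	/* L0 set, R/W0 = LEN0 = 0: execute breakpoint */

	/* type 0 = instruction fetch, as used for the guest RIP check */
	printf("hit mask: %#x\n", check_hw_bp(0x401000, 0, dr7, db));	/* 0x1 */
	printf("miss:     %#x\n", check_hw_bp(0x402000, 0, dr7, db));	/* 0 */
	return 0;
}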
4987static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
4988{
4989 struct kvm_run *kvm_run = vcpu->run;
4990
4991 /*
4992 * Use the "raw" value to see if TF was passed to the processor.
4993 * Note that the new value of the flags has not been saved yet.
4994 *
4995 * This is correct even for TF set by the guest, because "the
4996 * processor will not generate this exception after the instruction
4997 * that sets the TF flag".
4998 */
4999 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5000
5001 if (unlikely(rflags & X86_EFLAGS_TF)) {
5002 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5003 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5004 kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5005 kvm_run->debug.arch.exception = DB_VECTOR;
5006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5007 *r = EMULATE_USER_EXIT;
5008 } else {
5009 vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5010 /*
5011 * "Certain debug exceptions may clear bit 0-3. The
5012 * remaining contents of the DR6 register are never
5013 * cleared by the processor".
5014 */
5015 vcpu->arch.dr6 &= ~15;
5016 vcpu->arch.dr6 |= DR6_BS;
5017 kvm_queue_exception(vcpu, DB_VECTOR);
5018 }
5019 }
5020}
5021
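The DR6 update in kvm_vcpu_check_singlestep follows the SDM wording quoted in its comment: a debug exception may clear the low hit bits, the rest of DR6 is left intact, and a completed single-step reports BS. A small sketch of that update, assuming the conventional DR6_BS (bit 14) and X86_EFLAGS_TF (bit 8) values:

#include <stdio.h>

#define X86_EFLAGS_TF	0x100UL		/* trap flag */
#define DR6_BS		(1UL << 14)	/* single-step */

/* Fold a completed single-step into DR6: drop the B0-B3 hit bits,
 * keep everything else, and set BS. */
static unsigned long dr6_after_singlestep(unsigned long dr6)
{
	dr6 &= ~15UL;
	dr6 |= DR6_BS;
	return dr6;
}

int main(void)
{
	unsigned long rflags = 0x346;		/* TF clear: nothing to report */
	unsigned long dr6 = 0xffff0ff3;		/* fixed bits plus stale B0/B1 */

	if (rflags & X86_EFLAGS_TF)
		dr6 = dr6_after_singlestep(dr6);
	printf("TF clear: dr6 = %#lx\n", dr6);	/* unchanged */

	rflags |= X86_EFLAGS_TF;		/* the instruction was single-stepped */
	if (rflags & X86_EFLAGS_TF)
		dr6 = dr6_after_singlestep(dr6);
	printf("TF set:   dr6 = %#lx\n", dr6);	/* 0xffff4ff0 */
	return 0;
}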
5022static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5023{
5024 struct kvm_run *kvm_run = vcpu->run;
5025 unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5026 u32 dr6 = 0;
5027
5028 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5029 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5030 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5031 vcpu->arch.guest_debug_dr7,
5032 vcpu->arch.eff_db);
5033
5034 if (dr6 != 0) {
5035 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5036 kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5037 get_segment_base(vcpu, VCPU_SREG_CS);
5038
5039 kvm_run->debug.arch.exception = DB_VECTOR;
5040 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5041 *r = EMULATE_USER_EXIT;
5042 return true;
5043 }
5044 }
5045
5046 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5047 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5048 vcpu->arch.dr7,
5049 vcpu->arch.db);
5050
5051 if (dr6 != 0) {
5052 vcpu->arch.dr6 &= ~15;
5053 vcpu->arch.dr6 |= dr6;
5054 kvm_queue_exception(vcpu, DB_VECTOR);
5055 *r = EMULATE_DONE;
5056 return true;
5057 }
5058 }
5059
5060 return false;
5061}
5062
4958int x86_emulate_instruction(struct kvm_vcpu *vcpu, 5063int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4959 unsigned long cr2, 5064 unsigned long cr2,
4960 int emulation_type, 5065 int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4975 5080
4976 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 5081 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4977 init_emulate_ctxt(vcpu); 5082 init_emulate_ctxt(vcpu);
5083
5084 /*
5085 * We will reenter on the same instruction since
5086 * we do not set complete_userspace_io. This does not
5087 * handle watchpoints yet, those would be handled in
5088 * the emulate_ops.
5089 */
5090 if (kvm_vcpu_check_breakpoint(vcpu, &r))
5091 return r;
5092
4978 ctxt->interruptibility = 0; 5093 ctxt->interruptibility = 0;
4979 ctxt->have_exception = false; 5094 ctxt->have_exception = false;
4980 ctxt->perm_ok = false; 5095 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
5031 inject_emulated_exception(vcpu); 5146 inject_emulated_exception(vcpu);
5032 r = EMULATE_DONE; 5147 r = EMULATE_DONE;
5033 } else if (vcpu->arch.pio.count) { 5148 } else if (vcpu->arch.pio.count) {
5034 if (!vcpu->arch.pio.in) 5149 if (!vcpu->arch.pio.in) {
5150 /* FIXME: return into emulator if single-stepping. */
5035 vcpu->arch.pio.count = 0; 5151 vcpu->arch.pio.count = 0;
5036 else { 5152 } else {
5037 writeback = false; 5153 writeback = false;
5038 vcpu->arch.complete_userspace_io = complete_emulated_pio; 5154 vcpu->arch.complete_userspace_io = complete_emulated_pio;
5039 } 5155 }
5040 r = EMULATE_DO_MMIO; 5156 r = EMULATE_USER_EXIT;
5041 } else if (vcpu->mmio_needed) { 5157 } else if (vcpu->mmio_needed) {
5042 if (!vcpu->mmio_is_write) 5158 if (!vcpu->mmio_is_write)
5043 writeback = false; 5159 writeback = false;
5044 r = EMULATE_DO_MMIO; 5160 r = EMULATE_USER_EXIT;
5045 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5161 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5046 } else if (r == EMULATION_RESTART) 5162 } else if (r == EMULATION_RESTART)
5047 goto restart; 5163 goto restart;
@@ -5050,10 +5166,12 @@ restart:
5050 5166
5051 if (writeback) { 5167 if (writeback) {
5052 toggle_interruptibility(vcpu, ctxt->interruptibility); 5168 toggle_interruptibility(vcpu, ctxt->interruptibility);
5053 kvm_set_rflags(vcpu, ctxt->eflags);
5054 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 kvm_make_request(KVM_REQ_EVENT, vcpu);
5055 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5170 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5056 kvm_rip_write(vcpu, ctxt->eip); 5171 kvm_rip_write(vcpu, ctxt->eip);
5172 if (r == EMULATE_DONE)
5173 kvm_vcpu_check_singlestep(vcpu, &r);
5174 kvm_set_rflags(vcpu, ctxt->eflags);
5057 } else 5175 } else
5058 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 5176 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5059 5177
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
5347int kvm_arch_init(void *opaque) 5465int kvm_arch_init(void *opaque)
5348{ 5466{
5349 int r; 5467 int r;
5350 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5468 struct kvm_x86_ops *ops = opaque;
5351 5469
5352 if (kvm_x86_ops) { 5470 if (kvm_x86_ops) {
5353 printk(KERN_ERR "kvm: already loaded the other module\n"); 5471 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5495 return 1; 5613 return 1;
5496} 5614}
5497 5615
5616/*
5617 * kvm_pv_kick_cpu_op: Kick a vcpu.
5618 *
5619 * @apicid - apicid of vcpu to be kicked.
5620 */
5621static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5622{
5623 struct kvm_lapic_irq lapic_irq;
5624
5625 lapic_irq.shorthand = 0;
5626 lapic_irq.dest_mode = 0;
5627 lapic_irq.dest_id = apicid;
5628
5629 lapic_irq.delivery_mode = APIC_DM_REMRD;
5630 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5631}
5632
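kvm_pv_kick_cpu_op above is the host half of the new KVM_HC_KICK_CPU hypercall; the guest side issues a vmcall with the hypercall number in RAX and the arguments in RBX/RCX. The sketch below illustrates that ABI from the guest's point of view; the hypercall number and register convention are assumptions based on kvm_para.h, and the call is only meaningful when executed inside a KVM guest that supports PV unhalt.

#include <stdio.h>

#define KVM_HC_KICK_CPU 5	/* assumed hypercall number */

/* Guest-side two-argument hypercall: nr in RAX, args in RBX/RCX,
 * result returned in RAX.  Faults if run outside a KVM guest. */
static long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1), "c"(p2)
		     : "memory");
	return ret;
}

int main(void)
{
	/* kick the halted vcpu with APIC ID 1; the flags argument is 0 for now */
	long ret = kvm_hypercall2(KVM_HC_KICK_CPU, 0, 1);

	printf("hypercall returned %ld\n", ret);
	return 0;
}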
5498int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5633int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5499{ 5634{
5500 unsigned long nr, a0, a1, a2, a3, ret; 5635 unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5528 case KVM_HC_VAPIC_POLL_IRQ: 5663 case KVM_HC_VAPIC_POLL_IRQ:
5529 ret = 0; 5664 ret = 0;
5530 break; 5665 break;
5666 case KVM_HC_KICK_CPU:
5667 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5668 ret = 0;
5669 break;
5531 default: 5670 default:
5532 ret = -KVM_ENOSYS; 5671 ret = -KVM_ENOSYS;
5533 break; 5672 break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5689 kvm_make_request(KVM_REQ_EVENT, vcpu); 5828 kvm_make_request(KVM_REQ_EVENT, vcpu);
5690} 5829}
5691 5830
5692static void kvm_gen_update_masterclock(struct kvm *kvm)
5693{
5694#ifdef CONFIG_X86_64
5695 int i;
5696 struct kvm_vcpu *vcpu;
5697 struct kvm_arch *ka = &kvm->arch;
5698
5699 spin_lock(&ka->pvclock_gtod_sync_lock);
5700 kvm_make_mclock_inprogress_request(kvm);
5701 /* no guest entries from this point */
5702 pvclock_update_vm_gtod_copy(kvm);
5703
5704 kvm_for_each_vcpu(i, vcpu, kvm)
5705 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5706
5707 /* guest entries allowed */
5708 kvm_for_each_vcpu(i, vcpu, kvm)
5709 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5710
5711 spin_unlock(&ka->pvclock_gtod_sync_lock);
5712#endif
5713}
5714
5715static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5831static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5716{ 5832{
5717 u64 eoi_exit_bitmap[4]; 5833 u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5950 kvm_apic_accept_events(vcpu); 6066 kvm_apic_accept_events(vcpu);
5951 switch(vcpu->arch.mp_state) { 6067 switch(vcpu->arch.mp_state) {
5952 case KVM_MP_STATE_HALTED: 6068 case KVM_MP_STATE_HALTED:
6069 vcpu->arch.pv.pv_unhalted = false;
5953 vcpu->arch.mp_state = 6070 vcpu->arch.mp_state =
5954 KVM_MP_STATE_RUNNABLE; 6071 KVM_MP_STATE_RUNNABLE;
5955 case KVM_MP_STATE_RUNNABLE: 6072 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6061 6178
6062 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6179 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
6063 vcpu->mmio_needed = 0; 6180 vcpu->mmio_needed = 0;
6181
6182 /* FIXME: return into emulator if single-stepping. */
6064 if (vcpu->mmio_is_write) 6183 if (vcpu->mmio_is_write)
6065 return 1; 6184 return 1;
6066 vcpu->mmio_read_completed = 1; 6185 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6249 struct kvm_mp_state *mp_state) 6368 struct kvm_mp_state *mp_state)
6250{ 6369{
6251 kvm_apic_accept_events(vcpu); 6370 kvm_apic_accept_events(vcpu);
6252 mp_state->mp_state = vcpu->arch.mp_state; 6371 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6372 vcpu->arch.pv.pv_unhalted)
6373 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6374 else
6375 mp_state->mp_state = vcpu->arch.mp_state;
6376
6253 return 0; 6377 return 0;
6254} 6378}
6255 6379
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6770 BUG_ON(vcpu->kvm == NULL); 6894 BUG_ON(vcpu->kvm == NULL);
6771 kvm = vcpu->kvm; 6895 kvm = vcpu->kvm;
6772 6896
6897 vcpu->arch.pv.pv_unhalted = false;
6773 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6898 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6774 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6899 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6775 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6900 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
7019 return -ENOMEM; 7144 return -ENOMEM;
7020} 7145}
7021 7146
7147void kvm_arch_memslots_updated(struct kvm *kvm)
7148{
7149 /*
7150 * memslots->generation has been incremented.
7151 * mmio generation may have reached its maximum value.
7152 */
7153 kvm_mmu_invalidate_mmio_sptes(kvm);
7154}
7155
7022int kvm_arch_prepare_memory_region(struct kvm *kvm, 7156int kvm_arch_prepare_memory_region(struct kvm *kvm,
7023 struct kvm_memory_slot *memslot, 7157 struct kvm_memory_slot *memslot,
7024 struct kvm_userspace_memory_region *mem, 7158 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7079 */ 7213 */
7080 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7214 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7081 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7215 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7082 /*
7083 * If memory slot is created, or moved, we need to clear all
7084 * mmio sptes.
7085 */
7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7087} 7216}
7088 7217
7089void kvm_arch_flush_shadow_all(struct kvm *kvm) 7218void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7103 !vcpu->arch.apf.halted) 7232 !vcpu->arch.apf.halted)
7104 || !list_empty_careful(&vcpu->async_pf.done) 7233 || !list_empty_careful(&vcpu->async_pf.done)
7105 || kvm_apic_has_events(vcpu) 7234 || kvm_apic_has_events(vcpu)
7235 || vcpu->arch.pv.pv_unhalted
7106 || atomic_read(&vcpu->arch.nmi_queued) || 7236 || atomic_read(&vcpu->arch.nmi_queued) ||
7107 (kvm_arch_interrupt_allowed(vcpu) && 7237 (kvm_arch_interrupt_allowed(vcpu) &&
7108 kvm_cpu_has_interrupt(vcpu)); 7238 kvm_cpu_has_interrupt(vcpu));
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 6a22c19da663..bdf8532494fe 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,8 +7,7 @@
7 * kernel and insert a module (lg.ko) which allows us to run other Linux 7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host, 8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests 9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/virtual/lguest/lguest.c) is called the 10 * (such as the example in tools/lguest/lguest.c) is called the Launcher.
11 * Launcher.
12 * 11 *
13 * Secondly, we only run specially modified Guests, not normal kernels: setting 12 * Secondly, we only run specially modified Guests, not normal kernels: setting
14 * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows 13 * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
@@ -1057,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss,
1057} 1056}
1058 1057
1059/* Let's just say, I wouldn't do debugging under a Guest. */ 1058/* Let's just say, I wouldn't do debugging under a Guest. */
1059static unsigned long lguest_get_debugreg(int regno)
1060{
1061 /* FIXME: Implement */
1062 return 0;
1063}
1064
1060static void lguest_set_debugreg(int regno, unsigned long value) 1065static void lguest_set_debugreg(int regno, unsigned long value)
1061{ 1066{
1062 /* FIXME: Implement */ 1067 /* FIXME: Implement */
@@ -1304,6 +1309,7 @@ __init void lguest_init(void)
1304 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 1309 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
1305 pv_cpu_ops.set_ldt = lguest_set_ldt; 1310 pv_cpu_ops.set_ldt = lguest_set_ldt;
1306 pv_cpu_ops.load_tls = lguest_load_tls; 1311 pv_cpu_ops.load_tls = lguest_load_tls;
1312 pv_cpu_ops.get_debugreg = lguest_get_debugreg;
1307 pv_cpu_ops.set_debugreg = lguest_set_debugreg; 1313 pv_cpu_ops.set_debugreg = lguest_set_debugreg;
1308 pv_cpu_ops.clts = lguest_clts; 1314 pv_cpu_ops.clts = lguest_clts;
1309 pv_cpu_ops.read_cr0 = lguest_read_cr0; 1315 pv_cpu_ops.read_cr0 = lguest_read_cr0;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 7e73e8c69096..9d980d88b747 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
59 return NULL; 59 return NULL;
60} 60}
61 61
62int pmd_huge_support(void)
63{
64 return 0;
65}
62#else 66#else
63 67
64struct page * 68struct page *
@@ -77,6 +81,10 @@ int pud_huge(pud_t pud)
77 return !!(pud_val(pud) & _PAGE_PSE); 81 return !!(pud_val(pud) & _PAGE_PSE);
78} 82}
79 83
84int pmd_huge_support(void)
85{
86 return 1;
87}
80#endif 88#endif
81 89
82/* x86_64 also uses this file */ 90/* x86_64 also uses this file */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 282375f13c7e..ae699b3bbac8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,6 +103,7 @@ static void flush_tlb_func(void *info)
103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
104 return; 104 return;
105 105
106 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
106 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
107 if (f->flush_end == TLB_FLUSH_ALL) 108 if (f->flush_end == TLB_FLUSH_ALL)
108 local_flush_tlb(); 109 local_flush_tlb();
@@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
130 info.flush_start = start; 131 info.flush_start = start;
131 info.flush_end = end; 132 info.flush_end = end;
132 133
134 count_vm_event(NR_TLB_REMOTE_FLUSH);
133 if (is_uv_system()) { 135 if (is_uv_system()) {
134 unsigned int cpu; 136 unsigned int cpu;
135 137
@@ -149,6 +151,7 @@ void flush_tlb_current_task(void)
149 151
150 preempt_disable(); 152 preempt_disable();
151 153
154 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
152 local_flush_tlb(); 155 local_flush_tlb();
153 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
154 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
211 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; 214 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
212 215
213 /* tlb_flushall_shift is on balance point, details in commit log */ 216 /* tlb_flushall_shift is on balance point, details in commit log */
214 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) 217 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
218 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
215 local_flush_tlb(); 219 local_flush_tlb();
216 else { 220 } else {
217 if (has_large_page(mm, start, end)) { 221 if (has_large_page(mm, start, end)) {
218 local_flush_tlb(); 222 local_flush_tlb();
219 goto flush_all; 223 goto flush_all;
220 } 224 }
221 /* flush range by one by one 'invlpg' */ 225 /* flush range by one by one 'invlpg' */
222 for (addr = start; addr < end; addr += PAGE_SIZE) 226 for (addr = start; addr < end; addr += PAGE_SIZE) {
227 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
223 __flush_tlb_single(addr); 228 __flush_tlb_single(addr);
229 }
224 230
225 if (cpumask_any_but(mm_cpumask(mm), 231 if (cpumask_any_but(mm_cpumask(mm),
226 smp_processor_id()) < nr_cpu_ids) 232 smp_processor_id()) < nr_cpu_ids)
@@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
256 262
257static void do_flush_tlb_all(void *info) 263static void do_flush_tlb_all(void *info)
258{ 264{
265 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
259 __flush_tlb_all(); 266 __flush_tlb_all();
260 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 267 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
261 leave_mm(smp_processor_id()); 268 leave_mm(smp_processor_id());
@@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info)
263 270
264void flush_tlb_all(void) 271void flush_tlb_all(void)
265{ 272{
273 count_vm_event(NR_TLB_REMOTE_FLUSH);
266 on_each_cpu(do_flush_tlb_all, NULL, 1); 274 on_each_cpu(do_flush_tlb_all, NULL, 1);
267} 275}
268 276
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 48768df2471a..6890d8498e0b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy)
403 nmi_cpu_shutdown(dummy); 403 nmi_cpu_shutdown(dummy);
404} 404}
405 405
406static int nmi_create_files(struct super_block *sb, struct dentry *root) 406static int nmi_create_files(struct dentry *root)
407{ 407{
408 unsigned int i; 408 unsigned int i;
409 409
@@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
420 continue; 420 continue;
421 421
422 snprintf(buf, sizeof(buf), "%d", i); 422 snprintf(buf, sizeof(buf), "%d", i);
423 dir = oprofilefs_mkdir(sb, root, buf); 423 dir = oprofilefs_mkdir(root, buf);
424 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 424 oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
425 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 425 oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
426 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 426 oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
427 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 427 oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
428 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 428 oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
429 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 429 oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
430 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); 430 oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
431 } 431 }
432 432
433 return 0; 433 return 0;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b2b94438ff05..50d86c0e9ba4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -454,16 +454,16 @@ static void init_ibs(void)
454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); 454 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
455} 455}
456 456
457static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 457static int (*create_arch_files)(struct dentry *root);
458 458
459static int setup_ibs_files(struct super_block *sb, struct dentry *root) 459static int setup_ibs_files(struct dentry *root)
460{ 460{
461 struct dentry *dir; 461 struct dentry *dir;
462 int ret = 0; 462 int ret = 0;
463 463
464 /* architecture specific files */ 464 /* architecture specific files */
465 if (create_arch_files) 465 if (create_arch_files)
466 ret = create_arch_files(sb, root); 466 ret = create_arch_files(root);
467 467
468 if (ret) 468 if (ret)
469 return ret; 469 return ret;
@@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
479 ibs_config.max_cnt_op = 250000; 479 ibs_config.max_cnt_op = 250000;
480 480
481 if (ibs_caps & IBS_CAPS_FETCHSAM) { 481 if (ibs_caps & IBS_CAPS_FETCHSAM) {
482 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 482 dir = oprofilefs_mkdir(root, "ibs_fetch");
483 oprofilefs_create_ulong(sb, dir, "enable", 483 oprofilefs_create_ulong(dir, "enable",
484 &ibs_config.fetch_enabled); 484 &ibs_config.fetch_enabled);
485 oprofilefs_create_ulong(sb, dir, "max_count", 485 oprofilefs_create_ulong(dir, "max_count",
486 &ibs_config.max_cnt_fetch); 486 &ibs_config.max_cnt_fetch);
487 oprofilefs_create_ulong(sb, dir, "rand_enable", 487 oprofilefs_create_ulong(dir, "rand_enable",
488 &ibs_config.rand_en); 488 &ibs_config.rand_en);
489 } 489 }
490 490
491 if (ibs_caps & IBS_CAPS_OPSAM) { 491 if (ibs_caps & IBS_CAPS_OPSAM) {
492 dir = oprofilefs_mkdir(sb, root, "ibs_op"); 492 dir = oprofilefs_mkdir(root, "ibs_op");
493 oprofilefs_create_ulong(sb, dir, "enable", 493 oprofilefs_create_ulong(dir, "enable",
494 &ibs_config.op_enabled); 494 &ibs_config.op_enabled);
495 oprofilefs_create_ulong(sb, dir, "max_count", 495 oprofilefs_create_ulong(dir, "max_count",
496 &ibs_config.max_cnt_op); 496 &ibs_config.max_cnt_op);
497 if (ibs_caps & IBS_CAPS_OPCNT) 497 if (ibs_caps & IBS_CAPS_OPCNT)
498 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 498 oprofilefs_create_ulong(dir, "dispatched_ops",
499 &ibs_config.dispatched_ops); 499 &ibs_config.dispatched_ops);
500 if (ibs_caps & IBS_CAPS_BRNTRGT) 500 if (ibs_caps & IBS_CAPS_BRNTRGT)
501 oprofilefs_create_ulong(sb, dir, "branch_target", 501 oprofilefs_create_ulong(dir, "branch_target",
502 &ibs_config.branch_target); 502 &ibs_config.branch_target);
503 } 503 }
504 504
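
Annotation: both oprofile files above are mechanical conversions to the oprofilefs helpers that no longer take a struct super_block; callers and callbacks now pass only the parent dentry. A minimal standalone model of the narrower signatures follows; the struct, helpers and printf output are stand-ins, not the kernel API.

#include <stdio.h>

struct dentry { const char *name; };

/* new-style helper: parent dentry only, no struct super_block */
static struct dentry *mkdir_model(struct dentry *parent, const char *name)
{
	static struct dentry dir;

	dir.name = name;
	printf("mkdir %s under %s\n", name, parent->name);
	return &dir;
}

static void create_ulong_model(struct dentry *dir, const char *name,
                               unsigned long *val)
{
	printf("file %s/%s = %lu\n", dir->name, name, *val);
}

/* the callback pointer now matches the narrower signature as well */
static int (*create_files)(struct dentry *root);

static unsigned long enabled = 1;

static int counter_files_model(struct dentry *root)
{
	struct dentry *dir = mkdir_model(root, "0");

	create_ulong_model(dir, "enabled", &enabled);
	return 0;
}

int main(void)
{
	struct dentry root = { "root" };

	create_files = counter_files_model;
	return create_files(&root);
}
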
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 47fe66fe61f1..3ca5957b7a34 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -20,7 +20,7 @@
20#include <linux/intel_pmic_gpio.h> 20#include <linux/intel_pmic_gpio.h>
21#include <linux/spi/spi.h> 21#include <linux/spi/spi.h>
22#include <linux/i2c.h> 22#include <linux/i2c.h>
23#include <linux/i2c/pca953x.h> 23#include <linux/platform_data/pca953x.h>
24#include <linux/gpio_keys.h> 24#include <linux/gpio_keys.h>
25#include <linux/input.h> 25#include <linux/input.h>
26#include <linux/platform_device.h> 26#include <linux/platform_device.h>
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
index 9d34eddb517f..96eb2bd28832 100644
--- a/arch/x86/um/os-Linux/prctl.c
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -4,7 +4,7 @@
4 */ 4 */
5 5
6#include <sys/ptrace.h> 6#include <sys/ptrace.h>
7#include <linux/ptrace.h> 7#include <asm/ptrace.h>
8 8
9int os_arch_prctl(int pid, int code, unsigned long *addr) 9int os_arch_prctl(int pid, int code, unsigned long *addr)
10{ 10{
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
85 cycle_t ret; 85 cycle_t ret;
86 u64 last; 86 u64 last;
87 u32 version; 87 u32 version;
88 u32 migrate_count;
89 u8 flags; 88 u8 flags;
90 unsigned cpu, cpu1; 89 unsigned cpu, cpu1;
91 90
92 91
93 /* 92 /*
94 * When looping to get a consistent (time-info, tsc) pair, we 93 * Note: hypervisor must guarantee that:
95 * also need to deal with the possibility we can switch vcpus, 94 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
96 * so make sure we always re-fetch time-info for the current vcpu. 95 * 2. that per-CPU pvclock time info is updated if the
96 * underlying CPU changes.
97 * 3. that version is increased whenever underlying CPU
98 * changes.
99 *
97 */ 100 */
98 do { 101 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK; 102 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
104 107
105 pvti = get_pvti(cpu); 108 pvti = get_pvti(cpu);
106 109
107 migrate_count = pvti->migrate_count;
108
109 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 110 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
110 111
111 /* 112 /*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
117 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 118 cpu1 = __getcpu() & VGETCPU_CPU_MASK;
118 } while (unlikely(cpu != cpu1 || 119 } while (unlikely(cpu != cpu1 ||
119 (pvti->pvti.version & 1) || 120 (pvti->pvti.version & 1) ||
120 pvti->pvti.version != version || 121 pvti->pvti.version != version));
121 pvti->migrate_count != migrate_count));
122 122
123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
124 *mode = VCLOCK_NONE; 124 *mode = VCLOCK_NONE;
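
Annotation: the vread_pvclock() change drops the migrate_count re-check and relies on the guarantees spelled out in the new comment: the per-CPU pvclock version must change whenever the backing CPU changes, so the existing version/CPU retry loop is enough. Below is a standalone model of that loop with stand-in getcpu/pvti helpers; it omits the barriers the real code needs around the reads.

#include <stdint.h>
#include <stdio.h>

struct pvti_model {
	volatile uint32_t version;   /* odd while the host is updating it */
	volatile uint64_t tsc;
};

static struct pvti_model pvti0 = { .version = 2, .tsc = 1234 };

static unsigned model_getcpu(void) { return 0; }
static struct pvti_model *model_get_pvti(unsigned cpu) { (void)cpu; return &pvti0; }

static uint64_t read_clock(void)
{
	unsigned cpu, cpu1;
	uint32_t version;
	uint64_t ret;
	struct pvti_model *pvti;

	do {
		cpu = model_getcpu();
		pvti = model_get_pvti(cpu);

		version = pvti->version;      /* __pvclock_read_cycles() returns this */
		ret = pvti->tsc;

		cpu1 = model_getcpu();
	} while (cpu != cpu1 ||
	         (pvti->version & 1) ||      /* update in progress */
	         pvti->version != version);  /* bumped, e.g. after a migration */

	return ret;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)read_clock());
	return 0;
}
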
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2fcaedc0b739..fa6ade76ef3f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void)
427 427
428 if (!xen_initial_domain()) 428 if (!xen_initial_domain())
429 cpuid_leaf1_edx_mask &= 429 cpuid_leaf1_edx_mask &=
430 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 430 ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
431 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
432 431
433 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); 432 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
434 433
@@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
735 addr = (unsigned long)xen_int3; 734 addr = (unsigned long)xen_int3;
736 else if (addr == (unsigned long)stack_segment) 735 else if (addr == (unsigned long)stack_segment)
737 addr = (unsigned long)xen_stack_segment; 736 addr = (unsigned long)xen_stack_segment;
738 else if (addr == (unsigned long)double_fault || 737 else if (addr == (unsigned long)double_fault) {
739 addr == (unsigned long)nmi) {
740 /* Don't need to handle these */ 738 /* Don't need to handle these */
741 return 0; 739 return 0;
742#ifdef CONFIG_X86_MCE 740#ifdef CONFIG_X86_MCE
@@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
747 */ 745 */
748 ; 746 ;
749#endif 747#endif
750 } else { 748 } else if (addr == (unsigned long)nmi)
749 /*
750 * Use the native version as well.
751 */
752 ;
753 else {
751 /* Some other trap using IST? */ 754 /* Some other trap using IST? */
752 if (WARN_ON(val->ist != 0)) 755 if (WARN_ON(val->ist != 0))
753 return 0; 756 return 0;
@@ -1689,7 +1692,6 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
1689 case CPU_UP_PREPARE: 1692 case CPU_UP_PREPARE:
1690 xen_vcpu_setup(cpu); 1693 xen_vcpu_setup(cpu);
1691 if (xen_have_vector_callback) { 1694 if (xen_have_vector_callback) {
1692 xen_init_lock_cpu(cpu);
1693 if (xen_feature(XENFEAT_hvm_safe_pvclock)) 1695 if (xen_feature(XENFEAT_hvm_safe_pvclock))
1694 xen_setup_timer(cpu); 1696 xen_setup_timer(cpu);
1695 } 1697 }
@@ -1710,6 +1712,8 @@ static void __init xen_hvm_guest_init(void)
1710 1712
1711 xen_hvm_init_shared_info(); 1713 xen_hvm_init_shared_info();
1712 1714
1715 xen_panic_handler_init();
1716
1713 if (xen_feature(XENFEAT_hvm_callback_vector)) 1717 if (xen_feature(XENFEAT_hvm_callback_vector))
1714 xen_have_vector_callback = 1; 1718 xen_have_vector_callback = 1;
1715 xen_hvm_smp_init(); 1719 xen_hvm_smp_init();
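
Annotation: in cvt_gate_to_trap() the NMI entry is no longer skipped together with double_fault; it now falls through so the native handler is kept, consistent with the NMI callback registration added in xen/setup.c further down. A standalone model of that address-substitution pattern follows; the stub functions stand in for the real trap entry points.

#include <stdio.h>

static void debug_native(void)   {}
static void xen_debug(void)      {}
static void double_fault_h(void) {}
static void nmi_native(void)     {}

/* returns the handler to install, or 0 when the vector is skipped */
static unsigned long cvt_handler_model(unsigned long addr)
{
	if (addr == (unsigned long)debug_native)
		return (unsigned long)xen_debug;     /* swap for the Xen entry   */
	else if (addr == (unsigned long)double_fault_h)
		return 0;                            /* don't need to handle it  */
	else if (addr == (unsigned long)nmi_native)
		return addr;                         /* keep the native version  */
	return addr;
}

int main(void)
{
	printf("double_fault skipped: %d\n",
	       cvt_handler_model((unsigned long)double_fault_h) == 0);
	printf("nmi kept native: %d\n",
	       cvt_handler_model((unsigned long)nmi_native) ==
	       (unsigned long)nmi_native);
	return 0;
}
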
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae1..0da7f863056f 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags)
47 /* convert from IF type flag */ 47 /* convert from IF type flag */
48 flags = !(flags & X86_EFLAGS_IF); 48 flags = !(flags & X86_EFLAGS_IF);
49 49
50 /* There's a one instruction preempt window here. We need to 50 /* See xen_irq_enable() for why preemption must be disabled. */
51 make sure we don't switch CPUs between getting the vcpu
52 pointer and updating the mask. */
53 preempt_disable(); 51 preempt_disable();
54 vcpu = this_cpu_read(xen_vcpu); 52 vcpu = this_cpu_read(xen_vcpu);
55 vcpu->evtchn_upcall_mask = flags; 53 vcpu->evtchn_upcall_mask = flags;
56 preempt_enable_no_resched();
57
58 /* Doesn't matter if we get preempted here, because any
59 pending event will get dealt with anyway. */
60 54
61 if (flags == 0) { 55 if (flags == 0) {
62 preempt_check_resched();
63 barrier(); /* unmask then check (avoid races) */ 56 barrier(); /* unmask then check (avoid races) */
64 if (unlikely(vcpu->evtchn_upcall_pending)) 57 if (unlikely(vcpu->evtchn_upcall_pending))
65 xen_force_evtchn_callback(); 58 xen_force_evtchn_callback();
66 } 59 preempt_enable();
60 } else
61 preempt_enable_no_resched();
67} 62}
68PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); 63PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
69 64
@@ -82,10 +77,12 @@ static void xen_irq_enable(void)
82{ 77{
83 struct vcpu_info *vcpu; 78 struct vcpu_info *vcpu;
84 79
85 /* We don't need to worry about being preempted here, since 80 /*
86 either a) interrupts are disabled, so no preemption, or b) 81 * We may be preempted as soon as vcpu->evtchn_upcall_mask is
87 the caller is confused and is trying to re-enable interrupts 82 * cleared, so disable preemption to ensure we check for
88 on an indeterminate processor. */ 83 * events on the VCPU we are still running on.
84 */
85 preempt_disable();
89 86
90 vcpu = this_cpu_read(xen_vcpu); 87 vcpu = this_cpu_read(xen_vcpu);
91 vcpu->evtchn_upcall_mask = 0; 88 vcpu->evtchn_upcall_mask = 0;
@@ -96,6 +93,8 @@ static void xen_irq_enable(void)
96 barrier(); /* unmask then check (avoid races) */ 93 barrier(); /* unmask then check (avoid races) */
97 if (unlikely(vcpu->evtchn_upcall_pending)) 94 if (unlikely(vcpu->evtchn_upcall_pending))
98 xen_force_evtchn_callback(); 95 xen_force_evtchn_callback();
96
97 preempt_enable();
99} 98}
100PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); 99PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
101 100
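
Annotation: both xen_restore_fl() and xen_irq_enable() now keep preemption disabled from clearing evtchn_upcall_mask until the pending-event check, so the check runs against the same VCPU whose mask was just cleared. A standalone model of that unmask-then-check pattern; the preempt and per-CPU helpers are no-op stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct vcpu_info_model {
	bool upcall_mask;
	bool upcall_pending;
};

static struct vcpu_info_model this_vcpu = { .upcall_mask = true };

static void preempt_disable_model(void) { /* would bump preempt_count   */ }
static void preempt_enable_model(void)  { /* would drop it and resched  */ }
static void force_callback_model(void)  { puts("delivering pending event"); }

static void irq_enable_model(void)
{
	struct vcpu_info_model *vcpu;

	preempt_disable_model();
	vcpu = &this_vcpu;              /* per-CPU lookup in the real code */
	vcpu->upcall_mask = false;

	/* unmask then check: a compiler barrier sits here in the real code */
	if (vcpu->upcall_pending)
		force_callback_model();

	preempt_enable_model();
}

int main(void)
{
	this_vcpu.upcall_pending = true;
	irq_enable_model();
	return 0;
}
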
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927e..8b901e8d782d 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -161,6 +161,7 @@
161#include <asm/xen/page.h> 161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h> 162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h> 163#include <asm/xen/hypervisor.h>
164#include <xen/balloon.h>
164#include <xen/grant_table.h> 165#include <xen/grant_table.h>
165 166
166#include "multicalls.h" 167#include "multicalls.h"
@@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page,
967 if (kmap_op != NULL) { 968 if (kmap_op != NULL) {
968 if (!PageHighMem(page)) { 969 if (!PageHighMem(page)) {
969 struct multicall_space mcs; 970 struct multicall_space mcs;
970 struct gnttab_unmap_grant_ref *unmap_op; 971 struct gnttab_unmap_and_replace *unmap_op;
972 struct page *scratch_page = get_balloon_scratch_page();
973 unsigned long scratch_page_address = (unsigned long)
974 __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
971 975
972 /* 976 /*
973 * It might be that we queued all the m2p grant table 977 * It might be that we queued all the m2p grant table
@@ -986,25 +990,31 @@ int m2p_remove_override(struct page *page,
986 printk(KERN_WARNING "m2p_remove_override: " 990 printk(KERN_WARNING "m2p_remove_override: "
987 "pfn %lx mfn %lx, failed to modify kernel mappings", 991 "pfn %lx mfn %lx, failed to modify kernel mappings",
988 pfn, mfn); 992 pfn, mfn);
993 put_balloon_scratch_page();
989 return -1; 994 return -1;
990 } 995 }
991 996
992 mcs = xen_mc_entry( 997 xen_mc_batch();
993 sizeof(struct gnttab_unmap_grant_ref)); 998
999 mcs = __xen_mc_entry(
1000 sizeof(struct gnttab_unmap_and_replace));
994 unmap_op = mcs.args; 1001 unmap_op = mcs.args;
995 unmap_op->host_addr = kmap_op->host_addr; 1002 unmap_op->host_addr = kmap_op->host_addr;
1003 unmap_op->new_addr = scratch_page_address;
996 unmap_op->handle = kmap_op->handle; 1004 unmap_op->handle = kmap_op->handle;
997 unmap_op->dev_bus_addr = 0;
998 1005
999 MULTI_grant_table_op(mcs.mc, 1006 MULTI_grant_table_op(mcs.mc,
1000 GNTTABOP_unmap_grant_ref, unmap_op, 1); 1007 GNTTABOP_unmap_and_replace, unmap_op, 1);
1008
1009 mcs = __xen_mc_entry(0);
1010 MULTI_update_va_mapping(mcs.mc, scratch_page_address,
1011 pfn_pte(page_to_pfn(scratch_page),
1012 PAGE_KERNEL_RO), 0);
1001 1013
1002 xen_mc_issue(PARAVIRT_LAZY_MMU); 1014 xen_mc_issue(PARAVIRT_LAZY_MMU);
1003 1015
1004 set_pte_at(&init_mm, address, ptep,
1005 pfn_pte(pfn, PAGE_KERNEL));
1006 __flush_tlb_single(address);
1007 kmap_op->host_addr = 0; 1016 kmap_op->host_addr = 0;
1017 put_balloon_scratch_page();
1008 } 1018 }
1009 } 1019 }
1010 1020
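
Annotation: the !PageHighMem path of m2p_remove_override() now queues a GNTTABOP_unmap_and_replace that redirects the old kernel VA to a balloon scratch page, plus an update_va_mapping that restores the scratch page's own mapping, and issues both in one multicall instead of unmapping and rewriting the PTE by hand. Below is a standalone model of that batch-two-ops-then-issue pattern; the multicall API and addresses are simplified stand-ins.

#include <stdio.h>

enum op_kind { OP_UNMAP_AND_REPLACE, OP_UPDATE_VA_MAPPING };

struct mc_entry_model { enum op_kind kind; unsigned long arg; };

static struct mc_entry_model batch[8];
static int batch_len;

static void mc_batch_model(void) { batch_len = 0; }

static void mc_entry_model(enum op_kind kind, unsigned long arg)
{
	batch[batch_len].kind = kind;
	batch[batch_len].arg = arg;
	batch_len++;
}

static void mc_issue_model(void)
{
	int i;

	for (i = 0; i < batch_len; i++)
		printf("op %d arg %#lx\n", batch[i].kind, batch[i].arg);
}

static void remove_override_model(unsigned long host_addr,
                                  unsigned long scratch_addr)
{
	mc_batch_model();
	/* 1: unmap the grant and point the old VA at the scratch page */
	mc_entry_model(OP_UNMAP_AND_REPLACE, host_addr);
	/* 2: restore the scratch page's own (read-only) mapping */
	mc_entry_model(OP_UPDATE_VA_MAPPING, scratch_addr);
	mc_issue_model();
}

int main(void)
{
	remove_override_model(0xffff880000001000UL, 0xffff880000002000UL);
	return 0;
}
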
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8f3eea6b80c5..09f3059cb00b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -33,6 +33,9 @@
33/* These are code, but not functions. Defined in entry.S */ 33/* These are code, but not functions. Defined in entry.S */
34extern const char xen_hypervisor_callback[]; 34extern const char xen_hypervisor_callback[];
35extern const char xen_failsafe_callback[]; 35extern const char xen_failsafe_callback[];
36#ifdef CONFIG_X86_64
37extern const char nmi[];
38#endif
36extern void xen_sysenter_target(void); 39extern void xen_sysenter_target(void);
37extern void xen_syscall_target(void); 40extern void xen_syscall_target(void);
38extern void xen_syscall32_target(void); 41extern void xen_syscall32_target(void);
@@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk(
215 unsigned long pfn; 218 unsigned long pfn;
216 219
217 /* 220 /*
218 * If the PFNs are currently mapped, the VA mapping also needs 221 * If the PFNs are currently mapped, clear the mappings
219 * to be updated to be 1:1. 222 * (except for the ISA region which must be 1:1 mapped) to
223 * release the refcounts (in Xen) on the original frames.
220 */ 224 */
221 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) 225 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
226 pte_t pte = __pte_ma(0);
227
228 if (pfn < PFN_UP(ISA_END_ADDRESS))
229 pte = mfn_pte(pfn, PAGE_KERNEL_IO);
230
222 (void)HYPERVISOR_update_va_mapping( 231 (void)HYPERVISOR_update_va_mapping(
223 (unsigned long)__va(pfn << PAGE_SHIFT), 232 (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
224 mfn_pte(pfn, PAGE_KERNEL_IO), 0); 233 }
225 234
226 if (start_pfn < nr_pages) 235 if (start_pfn < nr_pages)
227 *released += xen_release_chunk( 236 *released += xen_release_chunk(
@@ -547,7 +556,13 @@ void xen_enable_syscall(void)
547 } 556 }
548#endif /* CONFIG_X86_64 */ 557#endif /* CONFIG_X86_64 */
549} 558}
550 559void __cpuinit xen_enable_nmi(void)
560{
561#ifdef CONFIG_X86_64
562 if (register_callback(CALLBACKTYPE_nmi, nmi))
563 BUG();
564#endif
565}
551void __init xen_arch_setup(void) 566void __init xen_arch_setup(void)
552{ 567{
553 xen_panic_handler_init(); 568 xen_panic_handler_init();
@@ -565,7 +580,7 @@ void __init xen_arch_setup(void)
565 580
566 xen_enable_sysenter(); 581 xen_enable_sysenter();
567 xen_enable_syscall(); 582 xen_enable_syscall();
568 583 xen_enable_nmi();
569#ifdef CONFIG_ACPI 584#ifdef CONFIG_ACPI
570 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 585 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
571 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 586 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
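
Annotation: xen_set_identity_and_release_chunk() now writes an empty PTE for each currently mapped PFN being released, so Xen drops its reference on the frame, and keeps only the ISA region 1:1 mapped; the file also gains the 64-bit NMI callback registration. A standalone model of the per-PFN PTE choice follows, with stand-in constants and a fake PTE encoding.

#include <stdio.h>

#define PAGE_SHIFT      12
#define ISA_END_ADDRESS 0x100000UL
#define PFN_UP(x)       (((x) + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)

static unsigned long pte_for_pfn(unsigned long pfn)
{
	unsigned long pte = 0;                    /* models __pte_ma(0): unmapped */

	if (pfn < PFN_UP(ISA_END_ADDRESS))
		pte = (pfn << PAGE_SHIFT) | 1;    /* keep a 1:1 mapping for ISA  */
	return pte;
}

int main(void)
{
	printf("pfn 0x10   -> %#lx\n", pte_for_pfn(0x10));   /* ISA region: mapped */
	printf("pfn 0x1000 -> %#lx\n", pte_for_pfn(0x1000)); /* cleared, released  */
	return 0;
}
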
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 597655bd72b0..d1e4777b4e75 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -273,12 +273,20 @@ static void __init xen_smp_prepare_boot_cpu(void)
273 BUG_ON(smp_processor_id() != 0); 273 BUG_ON(smp_processor_id() != 0);
274 native_smp_prepare_boot_cpu(); 274 native_smp_prepare_boot_cpu();
275 275
276 /* We've switched to the "real" per-cpu gdt, so make sure the 276 if (xen_pv_domain()) {
277 old memory can be recycled */ 277 /* We've switched to the "real" per-cpu gdt, so make sure the
278 make_lowmem_page_readwrite(xen_initial_gdt); 278 old memory can be recycled */
279 make_lowmem_page_readwrite(xen_initial_gdt);
279 280
280 xen_filter_cpu_maps(); 281 xen_filter_cpu_maps();
281 xen_setup_vcpu_info_placement(); 282 xen_setup_vcpu_info_placement();
283 }
284 /*
285 * The alternative logic (which patches the unlock/lock) runs before
286 * the smp bootup code is activated. Hence we need to set this up
287 * before the core kernel is patched. Otherwise we will have only
288 * modules patched but not core code.
289 */
282 xen_init_spinlocks(); 290 xen_init_spinlocks();
283} 291}
284 292
@@ -573,6 +581,12 @@ static inline int xen_map_vector(int vector)
573 case IRQ_WORK_VECTOR: 581 case IRQ_WORK_VECTOR:
574 xen_vector = XEN_IRQ_WORK_VECTOR; 582 xen_vector = XEN_IRQ_WORK_VECTOR;
575 break; 583 break;
584#ifdef CONFIG_X86_64
585 case NMI_VECTOR:
586 case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
587 xen_vector = XEN_NMI_VECTOR;
588 break;
589#endif
576 default: 590 default:
577 xen_vector = -1; 591 xen_vector = -1;
578 printk(KERN_ERR "xen: vector 0x%x is not implemented\n", 592 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -703,6 +717,15 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
703 WARN_ON(rc); 717 WARN_ON(rc);
704 if (!rc) 718 if (!rc)
705 rc = native_cpu_up(cpu, tidle); 719 rc = native_cpu_up(cpu, tidle);
720
721 /*
722 * We must initialize the slowpath CPU kicker _after_ the native
722 * path has executed. If we initialized it before, none of the
724 * unlocker IPI kicks would reach the booting CPU as the booting
725 * CPU had not set itself 'online' in cpu_online_mask. That mask
726 * is checked when IPIs are sent (on HVM at least).
727 */
728 xen_init_lock_cpu(cpu);
706 return rc; 729 return rc;
707} 730}
708 731
@@ -722,4 +745,5 @@ void __init xen_hvm_smp_init(void)
722 smp_ops.cpu_die = xen_hvm_cpu_die; 745 smp_ops.cpu_die = xen_hvm_cpu_die;
723 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 746 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
724 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 747 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
748 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
725} 749}
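
Annotation: xen_hvm_cpu_up() now sets up the per-CPU spinlock kicker IPI only after native_cpu_up(), for the reason given in the new comment: an earlier kick would target a CPU that has not yet marked itself online, so it would be dropped. A standalone model of that ordering, with stand-in flags instead of cpu_online_mask and real IRQ bindings.

#include <stdbool.h>
#include <stdio.h>

static bool cpu_online[4];
static bool kicker_ready[4];

static int native_cpu_up_model(unsigned cpu)
{
	cpu_online[cpu] = true;     /* the booting CPU sets itself online */
	return 0;
}

static void init_lock_cpu_model(unsigned cpu)
{
	kicker_ready[cpu] = true;   /* unlock-kick IPIs will now be seen */
}

static int hvm_cpu_up_model(unsigned cpu)
{
	int rc = native_cpu_up_model(cpu);

	/* must run after the native path, per the comment in the hunk above */
	init_lock_cpu_model(cpu);
	return rc;
}

int main(void)
{
	hvm_cpu_up_model(1);
	printf("cpu1 online=%d kicker=%d\n", cpu_online[1], kicker_ready[1]);
	return 0;
}
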
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 0438b9324a72..253f63fceea1 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -81,7 +81,6 @@ static inline void spin_time_accum_blocked(u64 start)
81 spinlock_stats.time_blocked += delta; 81 spinlock_stats.time_blocked += delta;
82} 82}
83#else /* !CONFIG_XEN_DEBUG_FS */ 83#else /* !CONFIG_XEN_DEBUG_FS */
84#define TIMEOUT (1 << 10)
85static inline void add_stats(enum xen_contention_stat var, u32 val) 84static inline void add_stats(enum xen_contention_stat var, u32 val)
86{ 85{
87} 86}
@@ -96,23 +95,6 @@ static inline void spin_time_accum_blocked(u64 start)
96} 95}
97#endif /* CONFIG_XEN_DEBUG_FS */ 96#endif /* CONFIG_XEN_DEBUG_FS */
98 97
99/*
100 * Size struct xen_spinlock so it's the same as arch_spinlock_t.
101 */
102#if NR_CPUS < 256
103typedef u8 xen_spinners_t;
104# define inc_spinners(xl) \
105 asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
106# define dec_spinners(xl) \
107 asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
108#else
109typedef u16 xen_spinners_t;
110# define inc_spinners(xl) \
111 asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
112# define dec_spinners(xl) \
113 asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
114#endif
115
116struct xen_lock_waiting { 98struct xen_lock_waiting {
117 struct arch_spinlock *lock; 99 struct arch_spinlock *lock;
118 __ticket_t want; 100 __ticket_t want;
@@ -123,6 +105,7 @@ static DEFINE_PER_CPU(char *, irq_name);
123static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); 105static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
124static cpumask_t waiting_cpus; 106static cpumask_t waiting_cpus;
125 107
108static bool xen_pvspin = true;
126static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) 109static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
127{ 110{
128 int irq = __this_cpu_read(lock_kicker_irq); 111 int irq = __this_cpu_read(lock_kicker_irq);
@@ -241,16 +224,12 @@ void xen_init_lock_cpu(int cpu)
241 int irq; 224 int irq;
242 char *name; 225 char *name;
243 226
227 if (!xen_pvspin)
228 return;
229
244 WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", 230 WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
245 cpu, per_cpu(lock_kicker_irq, cpu)); 231 cpu, per_cpu(lock_kicker_irq, cpu));
246 232
247 /*
248 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
249 * (xen: disable PV spinlocks on HVM)
250 */
251 if (xen_hvm_domain())
252 return;
253
254 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); 233 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
255 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, 234 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
256 cpu, 235 cpu,
@@ -270,11 +249,7 @@ void xen_init_lock_cpu(int cpu)
270 249
271void xen_uninit_lock_cpu(int cpu) 250void xen_uninit_lock_cpu(int cpu)
272{ 251{
273 /* 252 if (!xen_pvspin)
274 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
275 * (xen: disable PV spinlocks on HVM)
276 */
277 if (xen_hvm_domain())
278 return; 253 return;
279 254
280 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); 255 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
@@ -283,16 +258,9 @@ void xen_uninit_lock_cpu(int cpu)
283 per_cpu(irq_name, cpu) = NULL; 258 per_cpu(irq_name, cpu) = NULL;
284} 259}
285 260
286static bool xen_pvspin __initdata = true;
287 261
288void __init xen_init_spinlocks(void) 262void __init xen_init_spinlocks(void)
289{ 263{
290 /*
291 * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
292 * (xen: disable PV spinlocks on HVM)
293 */
294 if (xen_hvm_domain())
295 return;
296 264
297 if (!xen_pvspin) { 265 if (!xen_pvspin) {
298 printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); 266 printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
@@ -323,6 +291,9 @@ static int __init xen_spinlock_debugfs(void)
323 if (d_xen == NULL) 291 if (d_xen == NULL)
324 return -ENOMEM; 292 return -ENOMEM;
325 293
294 if (!xen_pvspin)
295 return 0;
296
326 d_spin_debug = debugfs_create_dir("spinlocks", d_xen); 297 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
327 298
328 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); 299 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);