Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/.gitignore | 3
-rw-r--r--  arch/x86/Kconfig | 15
-rw-r--r--  arch/x86/boot/compressed/mkpiggy.c | 2
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 4
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 115
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 130
-rw-r--r--  arch/x86/include/asm/acpi.h | 2
-rw-r--r--  arch/x86/include/asm/cache.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 7
-rw-r--r--  arch/x86/include/asm/inst.h | 96
-rw-r--r--  arch/x86/include/asm/intel_scu_ipc.h | 55
-rw-r--r--  arch/x86/include/asm/kvm.h | 17
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 46
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 80
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 13
-rw-r--r--  arch/x86/include/asm/mce.h | 8
-rw-r--r--  arch/x86/include/asm/msr-index.h | 8
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 4
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 3
-rw-r--r--  arch/x86/include/asm/pvclock-abi.h | 4
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/rdc321x_defs.h | 12
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 5
-rw-r--r--  arch/x86/include/asm/suspend_32.h | 2
-rw-r--r--  arch/x86/include/asm/suspend_64.h | 2
-rw-r--r--  arch/x86/include/asm/svm.h | 9
-rw-r--r--  arch/x86/include/asm/thread_info.h | 4
-rw-r--r--  arch/x86/include/asm/topology.h | 26
-rw-r--r--  arch/x86/include/asm/vmx.h | 12
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 19
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 16
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 20
-rw-r--r--  arch/x86/kernel/apic/apic.c | 41
-rw-r--r--  arch/x86/kernel/cpu/common.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 138
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 89
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 28
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 41
-rw-r--r--  arch/x86/kernel/cpuid.c | 2
-rw-r--r--  arch/x86/kernel/init_task.c | 2
-rw-r--r--  arch/x86/kernel/kvmclock.c | 56
-rw-r--r--  arch/x86/kernel/microcode_core.c | 1
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 2
-rw-r--r--  arch/x86/kernel/pvclock.c | 37
-rw-r--r--  arch/x86/kernel/setup.c | 11
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 12
-rw-r--r--  arch/x86/kernel/smpboot.c | 28
-rw-r--r--  arch/x86/kernel/tboot.c | 1
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 4
-rw-r--r--  arch/x86/kvm/emulate.c | 1247
-rw-r--r--  arch/x86/kvm/i8259.c | 53
-rw-r--r--  arch/x86/kvm/irq.h | 1
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 230
-rw-r--r--  arch/x86/kvm/mmutrace.h | 84
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 46
-rw-r--r--  arch/x86/kvm/svm.c | 1040
-rw-r--r--  arch/x86/kvm/timer.c | 3
-rw-r--r--  arch/x86/kvm/trace.h | 165
-rw-r--r--  arch/x86/kvm/vmx.c | 378
-rw-r--r--  arch/x86/kvm/x86.c | 1599
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  arch/x86/lguest/boot.c | 1
-rw-r--r--  arch/x86/mm/numa.c | 6
-rw-r--r--  arch/x86/mm/numa_64.c | 9
-rw-r--r--  arch/x86/mm/pat.c | 10
-rw-r--r--  arch/x86/mm/pat_internal.h | 6
-rw-r--r--  arch/x86/mm/pat_rbtree.c | 7
-rw-r--r--  arch/x86/mm/pf_in.c | 2
-rw-r--r--  arch/x86/mm/pgtable_32.c | 1
-rw-r--r--  arch/x86/pci/Makefile | 2
-rw-r--r--  arch/x86/pci/acpi.c | 8
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 101
-rw-r--r--  arch/x86/pci/common.c | 2
-rw-r--r--  arch/x86/pci/direct.c | 16
-rw-r--r--  arch/x86/pci/i386.c | 2
-rw-r--r--  arch/x86/pci/irq.c | 9
-rw-r--r--  arch/x86/pci/legacy.c | 42
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 17
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 8
-rw-r--r--  arch/x86/pci/numaq_32.c | 8
-rw-r--r--  arch/x86/pci/pcbios.c | 8
-rw-r--r--  arch/x86/power/cpu.c | 4
-rw-r--r--  arch/x86/xen/suspend.c | 4
91 files changed, 4139 insertions, 2271 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
new file mode 100644
index 000000000000..028079065af6
--- /dev/null
+++ b/arch/x86/.gitignore
@@ -0,0 +1,3 @@
+boot/compressed/vmlinux
+tools/test_get_len
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a2d3a5fbeeda..dcb0593b4a66 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,9 @@ config SBUS
109config NEED_DMA_MAP_STATE 109config NEED_DMA_MAP_STATE
110 def_bool (X86_64 || DMAR || DMA_API_DEBUG) 110 def_bool (X86_64 || DMAR || DMA_API_DEBUG)
111 111
112config NEED_SG_DMA_LENGTH
113 def_bool y
114
112config GENERIC_ISA_DMA 115config GENERIC_ISA_DMA
113 def_bool y 116 def_bool y
114 117
@@ -1703,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1703 def_bool X86_64 1706 def_bool X86_64
1704 depends on NUMA 1707 depends on NUMA
1705 1708
1709config USE_PERCPU_NUMA_NODE_ID
1710 def_bool X86_64
1711 depends on NUMA
1712
1706menu "Power management and ACPI options" 1713menu "Power management and ACPI options"
1707 1714
1708config ARCH_HIBERNATION_HEADER 1715config ARCH_HIBERNATION_HEADER
@@ -1923,6 +1930,14 @@ config PCI_MMCONFIG
1923 bool "Support mmconfig PCI config space access" 1930 bool "Support mmconfig PCI config space access"
1924 depends on X86_64 && PCI && ACPI 1931 depends on X86_64 && PCI && ACPI
1925 1932
1933config PCI_CNB20LE_QUIRK
1934 bool "Read CNB20LE Host Bridge Windows"
1935 depends on PCI
1936 help
1937 Read the PCI windows out of the CNB20LE host bridge. This allows
1938 PCI hotplug to work on systems with the CNB20LE chipset which do
1939 not have ACPI.
1940
1926config DMAR 1941config DMAR
1927 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1942 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1928 depends on PCI_MSI && ACPI && EXPERIMENTAL 1943 depends on PCI_MSI && ACPI && EXPERIMENTAL
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index bcbd36c41432..5c228129d175 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ 77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ 78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79 79
80 printf(".section \".rodata.compressed\",\"a\",@progbits\n"); 80 printf(".section \".rodata..compressed\",\"a\",@progbits\n");
81 printf(".globl z_input_len\n"); 81 printf(".globl z_input_len\n");
82 printf("z_input_len = %lu\n", ilen); 82 printf("z_input_len = %lu\n", ilen);
83 printf(".globl z_output_len\n"); 83 printf(".globl z_output_len\n");
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index a6f1a59a5b0c..5ddabceee124 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -26,8 +26,8 @@ SECTIONS
26 HEAD_TEXT 26 HEAD_TEXT
27 _ehead = . ; 27 _ehead = . ;
28 } 28 }
29 .rodata.compressed : { 29 .rodata..compressed : {
30 *(.rodata.compressed) 30 *(.rodata..compressed)
31 } 31 }
32 .text : { 32 .text : {
33 _text = .; /* Text */ 33 _text = .; /* Text */
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 20bb0e1ac681..ff16756a51c1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,9 @@
32#define IN IN1 32#define IN IN1
33#define KEY %xmm2 33#define KEY %xmm2
34#define IV %xmm3 34#define IV %xmm3
35#define BSWAP_MASK %xmm10
36#define CTR %xmm11
37#define INC %xmm12
35 38
36#define KEYP %rdi 39#define KEYP %rdi
37#define OUTP %rsi 40#define OUTP %rsi
@@ -42,6 +45,7 @@
42#define T1 %r10 45#define T1 %r10
43#define TKEYP T1 46#define TKEYP T1
44#define T2 %r11 47#define T2 %r11
48#define TCTR_LOW T2
45 49
46_key_expansion_128: 50_key_expansion_128:
47_key_expansion_256a: 51_key_expansion_256a:
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
724 movups IV, (IVP) 728 movups IV, (IVP)
725.Lcbc_dec_just_ret: 729.Lcbc_dec_just_ret:
726 ret 730 ret
731
732.align 16
733.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
735
736/*
737 * _aesni_inc_init: internal ABI
738 * setup registers used by _aesni_inc
739 * input:
740 * IV
741 * output:
742 * CTR: == IV, in little endian
743 * TCTR_LOW: == lower qword of CTR
744 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask
746 */
747_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR
750 PSHUFB_XMM BSWAP_MASK CTR
751 mov $1, TCTR_LOW
752 MOVQ_R64_XMM TCTR_LOW INC
753 MOVQ_R64_XMM CTR TCTR_LOW
754 ret
755
756/*
757 * _aesni_inc: internal ABI
758 * Increase IV by 1, IV is in big endian
759 * input:
760 * IV
761 * CTR: == IV, in little endian
762 * TCTR_LOW: == lower qword of CTR
763 * INC: == 1, in little endian
764 * BSWAP_MASK == endian swapping mask
765 * output:
766 * IV: Increase by 1
767 * changed:
768 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR
770 */
771_aesni_inc:
772 paddq INC, CTR
773 add $1, TCTR_LOW
774 jnc .Linc_low
775 pslldq $8, INC
776 paddq INC, CTR
777 psrldq $8, INC
778.Linc_low:
779 movaps CTR, IV
780 PSHUFB_XMM BSWAP_MASK IV
781 ret
782
783/*
784 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
785 * size_t len, u8 *iv)
786 */
787ENTRY(aesni_ctr_enc)
788 cmp $16, LEN
789 jb .Lctr_enc_just_ret
790 mov 480(KEYP), KLEN
791 movups (IVP), IV
792 call _aesni_inc_init
793 cmp $64, LEN
794 jb .Lctr_enc_loop1
795.align 4
796.Lctr_enc_loop4:
797 movaps IV, STATE1
798 call _aesni_inc
799 movups (INP), IN1
800 movaps IV, STATE2
801 call _aesni_inc
802 movups 0x10(INP), IN2
803 movaps IV, STATE3
804 call _aesni_inc
805 movups 0x20(INP), IN3
806 movaps IV, STATE4
807 call _aesni_inc
808 movups 0x30(INP), IN4
809 call _aesni_enc4
810 pxor IN1, STATE1
811 movups STATE1, (OUTP)
812 pxor IN2, STATE2
813 movups STATE2, 0x10(OUTP)
814 pxor IN3, STATE3
815 movups STATE3, 0x20(OUTP)
816 pxor IN4, STATE4
817 movups STATE4, 0x30(OUTP)
818 sub $64, LEN
819 add $64, INP
820 add $64, OUTP
821 cmp $64, LEN
822 jge .Lctr_enc_loop4
823 cmp $16, LEN
824 jb .Lctr_enc_ret
825.align 4
826.Lctr_enc_loop1:
827 movaps IV, STATE
828 call _aesni_inc
829 movups (INP), IN
830 call _aesni_enc1
831 pxor IN, STATE
832 movups STATE, (OUTP)
833 sub $16, LEN
834 add $16, INP
835 add $16, OUTP
836 cmp $16, LEN
837 jge .Lctr_enc_loop1
838.Lctr_enc_ret:
839 movups IV, (IVP)
840.Lctr_enc_just_ret:
841 ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 49c552c060e9..2cb3dcc4490a 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -18,6 +18,7 @@
18#include <crypto/algapi.h> 18#include <crypto/algapi.h>
19#include <crypto/aes.h> 19#include <crypto/aes.h>
20#include <crypto/cryptd.h> 20#include <crypto/cryptd.h>
21#include <crypto/ctr.h>
21#include <asm/i387.h> 22#include <asm/i387.h>
22#include <asm/aes.h> 23#include <asm/aes.h>
23 24
@@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
58 const u8 *in, unsigned int len, u8 *iv); 59 const u8 *in, unsigned int len, u8 *iv);
59asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
60 const u8 *in, unsigned int len, u8 *iv); 61 const u8 *in, unsigned int len, u8 *iv);
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv);
61 64
62static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
63{ 66{
@@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = {
321 }, 324 },
322}; 325};
323 326
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk)
329{
330 u8 *ctrblk = walk->iv;
331 u8 keystream[AES_BLOCK_SIZE];
332 u8 *src = walk->src.virt.addr;
333 u8 *dst = walk->dst.virt.addr;
334 unsigned int nbytes = walk->nbytes;
335
336 aesni_enc(ctx, keystream, ctrblk);
337 crypto_xor(keystream, src, nbytes);
338 memcpy(dst, keystream, nbytes);
339 crypto_inc(ctrblk, AES_BLOCK_SIZE);
340}
341
342static int ctr_crypt(struct blkcipher_desc *desc,
343 struct scatterlist *dst, struct scatterlist *src,
344 unsigned int nbytes)
345{
346 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
347 struct blkcipher_walk walk;
348 int err;
349
350 blkcipher_walk_init(&walk, dst, src, nbytes);
351 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
352 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
353
354 kernel_fpu_begin();
355 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
356 aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
357 nbytes & AES_BLOCK_MASK, walk.iv);
358 nbytes &= AES_BLOCK_SIZE - 1;
359 err = blkcipher_walk_done(desc, &walk, nbytes);
360 }
361 if (walk.nbytes) {
362 ctr_crypt_final(ctx, &walk);
363 err = blkcipher_walk_done(desc, &walk, 0);
364 }
365 kernel_fpu_end();
366
367 return err;
368}
369
370static struct crypto_alg blk_ctr_alg = {
371 .cra_name = "__ctr-aes-aesni",
372 .cra_driver_name = "__driver-ctr-aes-aesni",
373 .cra_priority = 0,
374 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
375 .cra_blocksize = 1,
376 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
377 .cra_alignmask = 0,
378 .cra_type = &crypto_blkcipher_type,
379 .cra_module = THIS_MODULE,
380 .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
381 .cra_u = {
382 .blkcipher = {
383 .min_keysize = AES_MIN_KEY_SIZE,
384 .max_keysize = AES_MAX_KEY_SIZE,
385 .ivsize = AES_BLOCK_SIZE,
386 .setkey = aes_set_key,
387 .encrypt = ctr_crypt,
388 .decrypt = ctr_crypt,
389 },
390 },
391};
392
324static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
325 unsigned int key_len) 394 unsigned int key_len)
326{ 395{
@@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = {
467 }, 536 },
468}; 537};
469 538
470#ifdef HAS_CTR
471static int ablk_ctr_init(struct crypto_tfm *tfm) 539static int ablk_ctr_init(struct crypto_tfm *tfm)
472{ 540{
473 struct cryptd_ablkcipher *cryptd_tfm; 541 struct cryptd_ablkcipher *cryptd_tfm;
474 542
475 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", 543 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
476 0, 0);
477 if (IS_ERR(cryptd_tfm)) 544 if (IS_ERR(cryptd_tfm))
478 return PTR_ERR(cryptd_tfm); 545 return PTR_ERR(cryptd_tfm);
479 ablk_init_common(tfm, cryptd_tfm); 546 ablk_init_common(tfm, cryptd_tfm);
@@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = {
500 .ivsize = AES_BLOCK_SIZE, 567 .ivsize = AES_BLOCK_SIZE,
501 .setkey = ablk_set_key, 568 .setkey = ablk_set_key,
502 .encrypt = ablk_encrypt, 569 .encrypt = ablk_encrypt,
503 .decrypt = ablk_decrypt, 570 .decrypt = ablk_encrypt,
504 .geniv = "chainiv", 571 .geniv = "chainiv",
505 }, 572 },
506 }, 573 },
507}; 574};
575
576#ifdef HAS_CTR
577static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
578{
579 struct cryptd_ablkcipher *cryptd_tfm;
580
581 cryptd_tfm = cryptd_alloc_ablkcipher(
582 "rfc3686(__driver-ctr-aes-aesni)", 0, 0);
583 if (IS_ERR(cryptd_tfm))
584 return PTR_ERR(cryptd_tfm);
585 ablk_init_common(tfm, cryptd_tfm);
586 return 0;
587}
588
589static struct crypto_alg ablk_rfc3686_ctr_alg = {
590 .cra_name = "rfc3686(ctr(aes))",
591 .cra_driver_name = "rfc3686-ctr-aes-aesni",
592 .cra_priority = 400,
593 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
594 .cra_blocksize = 1,
595 .cra_ctxsize = sizeof(struct async_aes_ctx),
596 .cra_alignmask = 0,
597 .cra_type = &crypto_ablkcipher_type,
598 .cra_module = THIS_MODULE,
599 .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
600 .cra_init = ablk_rfc3686_ctr_init,
601 .cra_exit = ablk_exit,
602 .cra_u = {
603 .ablkcipher = {
604 .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
605 .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
606 .ivsize = CTR_RFC3686_IV_SIZE,
607 .setkey = ablk_set_key,
608 .encrypt = ablk_encrypt,
609 .decrypt = ablk_decrypt,
610 .geniv = "seqiv",
611 },
612 },
613};
508#endif 614#endif
509 615
510#ifdef HAS_LRW 616#ifdef HAS_LRW
@@ -640,13 +746,17 @@ static int __init aesni_init(void)
640 goto blk_ecb_err; 746 goto blk_ecb_err;
641 if ((err = crypto_register_alg(&blk_cbc_alg))) 747 if ((err = crypto_register_alg(&blk_cbc_alg)))
642 goto blk_cbc_err; 748 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
643 if ((err = crypto_register_alg(&ablk_ecb_alg))) 751 if ((err = crypto_register_alg(&ablk_ecb_alg)))
644 goto ablk_ecb_err; 752 goto ablk_ecb_err;
645 if ((err = crypto_register_alg(&ablk_cbc_alg))) 753 if ((err = crypto_register_alg(&ablk_cbc_alg)))
646 goto ablk_cbc_err; 754 goto ablk_cbc_err;
647#ifdef HAS_CTR
648 if ((err = crypto_register_alg(&ablk_ctr_alg))) 755 if ((err = crypto_register_alg(&ablk_ctr_alg)))
649 goto ablk_ctr_err; 756 goto ablk_ctr_err;
757#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err;
650#endif 760#endif
651#ifdef HAS_LRW 761#ifdef HAS_LRW
652 if ((err = crypto_register_alg(&ablk_lrw_alg))) 762 if ((err = crypto_register_alg(&ablk_lrw_alg)))
@@ -675,13 +785,17 @@ ablk_pcbc_err:
675ablk_lrw_err: 785ablk_lrw_err:
676#endif 786#endif
677#ifdef HAS_CTR 787#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err:
790#endif
678 crypto_unregister_alg(&ablk_ctr_alg); 791 crypto_unregister_alg(&ablk_ctr_alg);
679ablk_ctr_err: 792ablk_ctr_err:
680#endif
681 crypto_unregister_alg(&ablk_cbc_alg); 793 crypto_unregister_alg(&ablk_cbc_alg);
682ablk_cbc_err: 794ablk_cbc_err:
683 crypto_unregister_alg(&ablk_ecb_alg); 795 crypto_unregister_alg(&ablk_ecb_alg);
684ablk_ecb_err: 796ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
685 crypto_unregister_alg(&blk_cbc_alg); 799 crypto_unregister_alg(&blk_cbc_alg);
686blk_cbc_err: 800blk_cbc_err:
687 crypto_unregister_alg(&blk_ecb_alg); 801 crypto_unregister_alg(&blk_ecb_alg);
@@ -705,10 +819,12 @@ static void __exit aesni_exit(void)
705 crypto_unregister_alg(&ablk_lrw_alg); 819 crypto_unregister_alg(&ablk_lrw_alg);
706#endif 820#endif
707#ifdef HAS_CTR 821#ifdef HAS_CTR
708 crypto_unregister_alg(&ablk_ctr_alg); 822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
709#endif 823#endif
824 crypto_unregister_alg(&ablk_ctr_alg);
710 crypto_unregister_alg(&ablk_cbc_alg); 825 crypto_unregister_alg(&ablk_cbc_alg);
711 crypto_unregister_alg(&ablk_ecb_alg); 826 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
712 crypto_unregister_alg(&blk_cbc_alg); 828 crypto_unregister_alg(&blk_cbc_alg);
713 crypto_unregister_alg(&blk_ecb_alg); 829 crypto_unregister_alg(&blk_ecb_alg);
714 crypto_unregister_alg(&__aesni_alg); 830 crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 56f462cf22d2..aa2c39d968fc 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -85,7 +85,6 @@ extern int acpi_ioapic;
85extern int acpi_noirq; 85extern int acpi_noirq;
86extern int acpi_strict; 86extern int acpi_strict;
87extern int acpi_disabled; 87extern int acpi_disabled;
88extern int acpi_ht;
89extern int acpi_pci_disabled; 88extern int acpi_pci_disabled;
90extern int acpi_skip_timer_override; 89extern int acpi_skip_timer_override;
91extern int acpi_use_timer_override; 90extern int acpi_use_timer_override;
@@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
97static inline void disable_acpi(void) 96static inline void disable_acpi(void)
98{ 97{
99 acpi_disabled = 1; 98 acpi_disabled = 1;
100 acpi_ht = 0;
101 acpi_pci_disabled = 1; 99 acpi_pci_disabled = 1;
102 acpi_noirq = 1; 100 acpi_noirq = 1;
103} 101}
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 2f9047cfaaca..48f99f15452e 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -7,7 +7,7 @@
7#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) 7#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
8#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 8#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data..read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT 12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) 13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index dca9c545f44e..468145914389 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -332,6 +332,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
332#endif 332#endif
333} 333}
334 334
335#if __GNUC__ >= 4
335#define static_cpu_has(bit) \ 336#define static_cpu_has(bit) \
336( \ 337( \
337 __builtin_constant_p(boot_cpu_has(bit)) ? \ 338 __builtin_constant_p(boot_cpu_has(bit)) ? \
@@ -340,6 +341,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
340 __static_cpu_has(bit) : \ 341 __static_cpu_has(bit) : \
341 boot_cpu_has(bit) \ 342 boot_cpu_has(bit) \
342) 343)
344#else
345/*
346 * gcc 3.x is too stupid to do the static test; fall back to dynamic.
347 */
348#define static_cpu_has(bit) boot_cpu_has(bit)
349#endif
343 350
344#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ 351#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
345 352
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 14cf526091f9..280bf7fb6aba 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -7,7 +7,66 @@
7 7
8#ifdef __ASSEMBLY__ 8#ifdef __ASSEMBLY__
9 9
10#define REG_NUM_INVALID 100
11
12#define REG_TYPE_R64 0
13#define REG_TYPE_XMM 1
14#define REG_TYPE_INVALID 100
15
16 .macro R64_NUM opd r64
17 \opd = REG_NUM_INVALID
18 .ifc \r64,%rax
19 \opd = 0
20 .endif
21 .ifc \r64,%rcx
22 \opd = 1
23 .endif
24 .ifc \r64,%rdx
25 \opd = 2
26 .endif
27 .ifc \r64,%rbx
28 \opd = 3
29 .endif
30 .ifc \r64,%rsp
31 \opd = 4
32 .endif
33 .ifc \r64,%rbp
34 \opd = 5
35 .endif
36 .ifc \r64,%rsi
37 \opd = 6
38 .endif
39 .ifc \r64,%rdi
40 \opd = 7
41 .endif
42 .ifc \r64,%r8
43 \opd = 8
44 .endif
45 .ifc \r64,%r9
46 \opd = 9
47 .endif
48 .ifc \r64,%r10
49 \opd = 10
50 .endif
51 .ifc \r64,%r11
52 \opd = 11
53 .endif
54 .ifc \r64,%r12
55 \opd = 12
56 .endif
57 .ifc \r64,%r13
58 \opd = 13
59 .endif
60 .ifc \r64,%r14
61 \opd = 14
62 .endif
63 .ifc \r64,%r15
64 \opd = 15
65 .endif
66 .endm
67
10 .macro XMM_NUM opd xmm 68 .macro XMM_NUM opd xmm
69 \opd = REG_NUM_INVALID
11 .ifc \xmm,%xmm0 70 .ifc \xmm,%xmm0
12 \opd = 0 71 \opd = 0
13 .endif 72 .endif
@@ -58,13 +117,25 @@
58 .endif 117 .endif
59 .endm 118 .endm
60 119
120 .macro REG_TYPE type reg
121 R64_NUM reg_type_r64 \reg
122 XMM_NUM reg_type_xmm \reg
123 .if reg_type_r64 <> REG_NUM_INVALID
124 \type = REG_TYPE_R64
125 .elseif reg_type_xmm <> REG_NUM_INVALID
126 \type = REG_TYPE_XMM
127 .else
128 \type = REG_TYPE_INVALID
129 .endif
130 .endm
131
61 .macro PFX_OPD_SIZE 132 .macro PFX_OPD_SIZE
62 .byte 0x66 133 .byte 0x66
63 .endm 134 .endm
64 135
65 .macro PFX_REX opd1 opd2 136 .macro PFX_REX opd1 opd2 W=0
66 .if (\opd1 | \opd2) & 8 137 .if ((\opd1 | \opd2) & 8) || \W
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) 138 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
68 .endif 139 .endif
69 .endm 140 .endm
70 141
@@ -145,6 +216,25 @@
145 .byte 0x0f, 0x38, 0xdf 216 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 217 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm 218 .endm
219
220 .macro MOVQ_R64_XMM opd1 opd2
221 REG_TYPE movq_r64_xmm_opd1_type \opd1
222 .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
223 XMM_NUM movq_r64_xmm_opd1 \opd1
224 R64_NUM movq_r64_xmm_opd2 \opd2
225 .else
226 R64_NUM movq_r64_xmm_opd1 \opd1
227 XMM_NUM movq_r64_xmm_opd2 \opd2
228 .endif
229 PFX_OPD_SIZE
230 PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
231 .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
232 .byte 0x0f, 0x7e
233 .else
234 .byte 0x0f, 0x6e
235 .endif
236 MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
237 .endm
148#endif 238#endif
149 239
150#endif 240#endif
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 000000000000..4470c9ad4a3e
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_INTEL_SCU_IPC_H_
+#define _ASM_X86_INTEL_SCU_IPC_H_
+
+/* Read single register */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data);
+
+/* Read two sequential registers */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data);
+
+/* Read four sequential registers */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data);
+
+/* Read a vector */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
+
+/* Write single register */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data);
+
+/* Write two sequential registers */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data);
+
+/* Write four sequential registers */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data);
+
+/* Write a vector */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
+
+/* Update single register based on the mask */
+int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
+
+/*
+ * Indirect register read
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *data);
+
+/*
+ * Indirect register write
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 data);
+
+/* Issue commands to the SCU with or without data */
+int intel_scu_ipc_simple_command(int cmd, int sub);
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+		u32 *out, int outlen);
+/* I2C control api */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
+
+/* Update FW version */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
+
+#endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16c..ff90055c7f0b 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS
24 25
25/* Architectural interrupt line count. */ 26/* Architectural interrupt line count. */
26#define KVM_NR_INTERRUPTS 256 27#define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ 258/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 259#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 260#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
261#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
262
263/* Interrupt shadow states */
264#define KVM_X86_SHADOW_INT_MOV_SS 0x01
265#define KVM_X86_SHADOW_INT_STI 0x02
260 266
261/* for KVM_GET/SET_VCPU_EVENTS */ 267/* for KVM_GET/SET_VCPU_EVENTS */
262struct kvm_vcpu_events { 268struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
271 __u8 injected; 277 __u8 injected;
272 __u8 nr; 278 __u8 nr;
273 __u8 soft; 279 __u8 soft;
274 __u8 pad; 280 __u8 shadow;
275 } interrupt; 281 } interrupt;
276 struct { 282 struct {
277 __u8 injected; 283 __u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
284 __u32 reserved[10]; 290 __u32 reserved[10];
285}; 291};
286 292
293/* for KVM_GET/SET_DEBUGREGS */
294struct kvm_debugregs {
295 __u64 db[4];
296 __u64 dr6;
297 __u64 dr7;
298 __u64 flags;
299 __u64 reserved[9];
300};
301
287#endif /* _ASM_X86_KVM_H */ 302#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13ba..0b2729bf2070 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
11#ifndef _ASM_X86_KVM_X86_EMULATE_H 11#ifndef _ASM_X86_KVM_X86_EMULATE_H
12#define _ASM_X86_KVM_X86_EMULATE_H 12#define _ASM_X86_KVM_X86_EMULATE_H
13 13
14#include <asm/desc_defs.h>
15
14struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
15 17
16/* 18/*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 65 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 66
65 /* 67 /*
68 * write_std: Write bytes of standard (non-emulated/special) memory.
69 * Used for descriptor writing.
70 * @addr: [IN ] Linear address to which to write.
71 * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
72 * @bytes: [IN ] Number of bytes to write to memory.
73 */
74 int (*write_std)(unsigned long addr, void *val,
75 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
76 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory. 77 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch. 78 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read. 79 * @addr: [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
109 unsigned int bytes, 120 unsigned int bytes,
110 struct kvm_vcpu *vcpu); 121 struct kvm_vcpu *vcpu);
111 122
123 int (*pio_in_emulated)(int size, unsigned short port, void *val,
124 unsigned int count, struct kvm_vcpu *vcpu);
125
126 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
127 unsigned int count, struct kvm_vcpu *vcpu);
128
129 bool (*get_cached_descriptor)(struct desc_struct *desc,
130 int seg, struct kvm_vcpu *vcpu);
131 void (*set_cached_descriptor)(struct desc_struct *desc,
132 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
112}; 140};
113 141
114/* Type, address-of, and value of an instruction's operand. */ 142/* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
124 unsigned long end; 152 unsigned long end;
125}; 153};
126 154
155struct read_cache {
156 u8 data[1024];
157 unsigned long pos;
158 unsigned long end;
159};
160
127struct decode_cache { 161struct decode_cache {
128 u8 twobyte; 162 u8 twobyte;
129 u8 b; 163 u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
139 u8 seg_override; 173 u8 seg_override;
140 unsigned int d; 174 unsigned int d;
141 unsigned long regs[NR_VCPU_REGS]; 175 unsigned long regs[NR_VCPU_REGS];
142 unsigned long eip, eip_orig; 176 unsigned long eip;
143 /* modrm */ 177 /* modrm */
144 u8 modrm; 178 u8 modrm;
145 u8 modrm_mod; 179 u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
151 void *modrm_ptr; 185 void *modrm_ptr;
152 unsigned long modrm_val; 186 unsigned long modrm_val;
153 struct fetch_cache fetch; 187 struct fetch_cache fetch;
188 struct read_cache io_read;
154}; 189};
155 190
156#define X86_SHADOW_INT_MOV_SS 1
157#define X86_SHADOW_INT_STI 2
158
159struct x86_emulate_ctxt { 191struct x86_emulate_ctxt {
160 /* Register state before/after emulation. */ 192 /* Register state before/after emulation. */
161 struct kvm_vcpu *vcpu; 193 struct kvm_vcpu *vcpu;
162 194
163 unsigned long eflags; 195 unsigned long eflags;
196 unsigned long eip; /* eip before instruction emulation */
164 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 197 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
165 int mode; 198 int mode;
166 u32 cs_base; 199 u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
168 /* interruptibility state, as a result of execution of STI or MOV SS */ 201 /* interruptibility state, as a result of execution of STI or MOV SS */
169 int interruptibility; 202 int interruptibility;
170 203
204 bool restart; /* restart string instruction after writeback */
171 /* decode cache */ 205 /* decode cache */
172 struct decode_cache decode; 206 struct decode_cache decode;
173}; 207};
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
194 struct x86_emulate_ops *ops); 228 struct x86_emulate_ops *ops);
195int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 229int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
196 struct x86_emulate_ops *ops); 230 struct x86_emulate_ops *ops);
231int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
232 struct x86_emulate_ops *ops,
233 u16 tss_selector, int reason,
234 bool has_error_code, u32 error_code);
197 235
198#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 236#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37d..76f5483cffec 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,15 +171,15 @@ struct kvm_pte_chain {
171union kvm_mmu_page_role { 171union kvm_mmu_page_role {
172 unsigned word; 172 unsigned word;
173 struct { 173 struct {
174 unsigned glevels:4;
175 unsigned level:4; 174 unsigned level:4;
175 unsigned cr4_pae:1;
176 unsigned quadrant:2; 176 unsigned quadrant:2;
177 unsigned pad_for_nice_hex_output:6; 177 unsigned pad_for_nice_hex_output:6;
178 unsigned direct:1; 178 unsigned direct:1;
179 unsigned access:3; 179 unsigned access:3;
180 unsigned invalid:1; 180 unsigned invalid:1;
181 unsigned cr4_pge:1;
182 unsigned nxe:1; 181 unsigned nxe:1;
182 unsigned cr0_wp:1;
183 }; 183 };
184}; 184};
185 185
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
187 struct list_head link; 187 struct list_head link;
188 struct hlist_node hash_link; 188 struct hlist_node hash_link;
189 189
190 struct list_head oos_link;
191
192 /* 190 /*
193 * The following two entries are used to key the shadow page in the 191 * The following two entries are used to key the shadow page in the
194 * hash table. 192 * hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
204 * in this shadow page. 202 * in this shadow page.
205 */ 203 */
206 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 204 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
207 int multimapped; /* More than one parent_pte? */ 205 bool multimapped; /* More than one parent_pte? */
208 int root_count; /* Currently serving as active root */
209 bool unsync; 206 bool unsync;
207 int root_count; /* Currently serving as active root */
210 unsigned int unsync_children; 208 unsigned int unsync_children;
211 union { 209 union {
212 u64 *parent_pte; /* !multimapped */ 210 u64 *parent_pte; /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
224 222
225struct kvm_pio_request { 223struct kvm_pio_request {
226 unsigned long count; 224 unsigned long count;
227 int cur_count;
228 gva_t guest_gva;
229 int in; 225 int in;
230 int port; 226 int port;
231 int size; 227 int size;
232 int string;
233 int down;
234 int rep;
235}; 228};
236 229
237/* 230/*
@@ -320,6 +313,7 @@ struct kvm_vcpu_arch {
320 struct kvm_queued_exception { 313 struct kvm_queued_exception {
321 bool pending; 314 bool pending;
322 bool has_error_code; 315 bool has_error_code;
316 bool reinject;
323 u8 nr; 317 u8 nr;
324 u32 error_code; 318 u32 error_code;
325 } exception; 319 } exception;
@@ -362,8 +356,8 @@ struct kvm_vcpu_arch {
362 u64 *mce_banks; 356 u64 *mce_banks;
363 357
364 /* used for guest single stepping over the given code position */ 358 /* used for guest single stepping over the given code position */
365 u16 singlestep_cs;
366 unsigned long singlestep_rip; 359 unsigned long singlestep_rip;
360
367 /* fields used by HYPER-V emulation */ 361 /* fields used by HYPER-V emulation */
368 u64 hv_vapic; 362 u64 hv_vapic;
369}; 363};
@@ -389,6 +383,7 @@ struct kvm_arch {
389 unsigned int n_free_mmu_pages; 383 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 384 unsigned int n_requested_mmu_pages;
391 unsigned int n_alloc_mmu_pages; 385 unsigned int n_alloc_mmu_pages;
386 atomic_t invlpg_counter;
392 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 387 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
393 /* 388 /*
394 * Hash table of struct kvm_mmu_page. 389 * Hash table of struct kvm_mmu_page.
@@ -461,11 +456,6 @@ struct kvm_vcpu_stat {
461 u32 nmi_injections; 456 u32 nmi_injections;
462}; 457};
463 458
464struct descriptor_table {
465 u16 limit;
466 unsigned long base;
467} __attribute__((packed));
468
469struct kvm_x86_ops { 459struct kvm_x86_ops {
470 int (*cpu_has_kvm_support)(void); /* __init */ 460 int (*cpu_has_kvm_support)(void); /* __init */
471 int (*disabled_by_bios)(void); /* __init */ 461 int (*disabled_by_bios)(void); /* __init */
@@ -503,12 +493,11 @@ struct kvm_x86_ops {
503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 493 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
504 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 494 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
505 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 495 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
506 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 496 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 497 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 498 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 499 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); 500 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 501 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 502 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 503 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -527,7 +516,8 @@ struct kvm_x86_ops {
527 void (*set_irq)(struct kvm_vcpu *vcpu); 516 void (*set_irq)(struct kvm_vcpu *vcpu);
528 void (*set_nmi)(struct kvm_vcpu *vcpu); 517 void (*set_nmi)(struct kvm_vcpu *vcpu);
529 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
530 bool has_error_code, u32 error_code); 519 bool has_error_code, u32 error_code,
520 bool reinject);
531 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
532 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
533 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 523 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -541,6 +531,8 @@ struct kvm_x86_ops {
541 int (*get_lpage_level)(void); 531 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void); 532 bool (*rdtscp_supported)(void);
543 533
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535
544 const struct trace_print_flags *exit_reasons_str; 536 const struct trace_print_flags *exit_reasons_str;
545}; 537};
546 538
@@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
587void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
588void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
589void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
590void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
591 unsigned long *rflags);
592 582
593unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
594void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
595 unsigned long *rflags);
596void kvm_enable_efer_bits(u64); 583void kvm_enable_efer_bits(u64);
597int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 584int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
598int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 585int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
599 586
600struct x86_emulate_ctxt; 587struct x86_emulate_ctxt;
601 588
602int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, 589int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
603 int size, unsigned port);
604int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
605 int size, unsigned long count, int down,
606 gva_t address, int rep, unsigned port);
607void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 590void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
608int kvm_emulate_halt(struct kvm_vcpu *vcpu); 591int kvm_emulate_halt(struct kvm_vcpu *vcpu);
609int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
618 601
619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code);
620 604
621void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
622void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
623void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
624void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
625unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
626void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
627void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
634 620
635void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 621void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
636void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 622void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
623void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
624void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
637void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 625void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
638 u32 error_code); 626 u32 error_code);
639bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 627bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
@@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr,
649 unsigned int bytes, 637 unsigned int bytes,
650 struct kvm_vcpu *vcpu); 638 struct kvm_vcpu *vcpu);
651 639
652unsigned long segment_base(u16 selector);
653
654void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
655void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
656 const u8 *new, int bytes, 642 const u8 *new, int bytes,
@@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
675void kvm_enable_tdp(void); 661void kvm_enable_tdp(void);
676void kvm_disable_tdp(void); 662void kvm_disable_tdp(void);
677 663
678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
679int complete_pio(struct kvm_vcpu *vcpu); 664int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu); 665bool kvm_check_iopl(struct kvm_vcpu *vcpu);
681 666
@@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel)
724 asm("lldt %0" : : "rm"(sel)); 709 asm("lldt %0" : : "rm"(sel));
725} 710}
726 711
727static inline void kvm_get_idt(struct descriptor_table *table)
728{
729 asm("sidt %0" : "=m"(*table));
730}
731
732static inline void kvm_get_gdt(struct descriptor_table *table)
733{
734 asm("sgdt %0" : "=m"(*table));
735}
736
737static inline unsigned long kvm_read_tr_base(void)
738{
739 u16 tr;
740 asm("str %0" : "=g"(tr));
741 return segment_base(tr);
742}
743
744#ifdef CONFIG_X86_64 712#ifdef CONFIG_X86_64
745static inline unsigned long read_msr(unsigned long msr) 713static inline unsigned long read_msr(unsigned long msr)
746{ 714{
@@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
826void kvm_define_shared_msr(unsigned index, u32 msr); 794void kvm_define_shared_msr(unsigned index, u32 msr);
827void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 795void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
828 796
797bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
798
829#endif /* _ASM_X86_KVM_HOST_H */ 799#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d7..05eba5e9a8e8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,10 +16,23 @@
16#define KVM_FEATURE_CLOCKSOURCE 0 16#define KVM_FEATURE_CLOCKSOURCE 0
17#define KVM_FEATURE_NOP_IO_DELAY 1 17#define KVM_FEATURE_NOP_IO_DELAY 1
18#define KVM_FEATURE_MMU_OP 2 18#define KVM_FEATURE_MMU_OP 2
19/* This indicates that the new set of kvmclock msrs
20 * are available. The use of 0x11 and 0x12 is deprecated
21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3
23
24/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored.
26 */
27#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
19 28
20#define MSR_KVM_WALL_CLOCK 0x11 29#define MSR_KVM_WALL_CLOCK 0x11
21#define MSR_KVM_SYSTEM_TIME 0x12 30#define MSR_KVM_SYSTEM_TIME 0x12
22 31
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
35
23#define KVM_MAX_MMU_OP_BATCH 32 36#define KVM_MAX_MMU_OP_BATCH 32
24 37
25/* Operations for KVM_HC_MMU_OP */ 38/* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c3fdd631ed3..f32a4301c4d4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
225static inline void mcheck_intel_therm_init(void) { } 225static inline void mcheck_intel_therm_init(void) { }
226#endif 226#endif
227 227
228/*
229 * Used by APEI to report memory error via /dev/mcelog
230 */
231
232struct cper_sec_mem_err;
233extern void apei_mce_report_mem_error(int corrected,
234 struct cper_sec_mem_err *mem_err);
235
228#endif /* __KERNEL__ */ 236#endif /* __KERNEL__ */
229#endif /* _ASM_X86_MCE_H */ 237#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f9..8c7ae4318629 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -110,6 +110,7 @@
110#define MSR_AMD64_PATCH_LOADER 0xc0010020 110#define MSR_AMD64_PATCH_LOADER 0xc0010020
111#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 111#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
112#define MSR_AMD64_OSVW_STATUS 0xc0010141 112#define MSR_AMD64_OSVW_STATUS 0xc0010141
113#define MSR_AMD64_DC_CFG 0xc0011022
113#define MSR_AMD64_IBSFETCHCTL 0xc0011030 114#define MSR_AMD64_IBSFETCHCTL 0xc0011030
114#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 115#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
115#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 116#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
@@ -202,8 +203,9 @@
202#define MSR_IA32_EBL_CR_POWERON 0x0000002a 203#define MSR_IA32_EBL_CR_POWERON 0x0000002a
203#define MSR_IA32_FEATURE_CONTROL 0x0000003a 204#define MSR_IA32_FEATURE_CONTROL 0x0000003a
204 205
205#define FEATURE_CONTROL_LOCKED (1<<0) 206#define FEATURE_CONTROL_LOCKED (1<<0)
206#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) 207#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
208#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
207 209
208#define MSR_IA32_APICBASE 0x0000001b 210#define MSR_IA32_APICBASE 0x0000001b
209#define MSR_IA32_APICBASE_BSP (1<<8) 211#define MSR_IA32_APICBASE_BSP (1<<8)
@@ -235,6 +237,8 @@
235 237
236#define MSR_IA32_MISC_ENABLE 0x000001a0 238#define MSR_IA32_MISC_ENABLE 0x000001a0
237 239
240#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
241
238/* MISC_ENABLE bits: architectural */ 242/* MISC_ENABLE bits: architectural */
239#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 243#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
240#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 244#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6d..cd2a31dc5fb8 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -53,6 +53,8 @@ extern int pcibios_last_bus;
53extern struct pci_bus *pci_root_bus; 53extern struct pci_bus *pci_root_bus;
54extern struct pci_ops pci_root_ops; 54extern struct pci_ops pci_root_ops;
55 55
56void pcibios_scan_specific_bus(int busn);
57
56/* pci-irq.c */ 58/* pci-irq.c */
57 59
58struct irq_info { 60struct irq_info {
@@ -83,7 +85,7 @@ struct irq_routing_table {
83 85
84extern unsigned int pcibios_irq_mask; 86extern unsigned int pcibios_irq_mask;
85 87
86extern spinlock_t pci_config_lock; 88extern raw_spinlock_t pci_config_lock;
87 89
88extern int (*pcibios_enable_irq)(struct pci_dev *dev); 90extern int (*pcibios_enable_irq)(struct pci_dev *dev);
89extern void (*pcibios_disable_irq)(struct pci_dev *dev); 91extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b05400a542ff..64a8ebff06fc 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -89,7 +89,8 @@
89 P4_CCCR_ENABLE) 89 P4_CCCR_ENABLE)
90 90
91/* HT mask */ 91/* HT mask */
92#define P4_CCCR_MASK_HT (P4_CCCR_MASK | P4_CCCR_THREAD_ANY) 92#define P4_CCCR_MASK_HT \
93 (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
93 94
94#define P4_GEN_ESCR_EMASK(class, name, bit) \ 95#define P4_GEN_ESCR_EMASK(class, name, bit) \
95 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) 96 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f2626..35f2d1948ada 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
29 u64 system_time; 29 u64 system_time;
30 u32 tsc_to_system_mul; 30 u32 tsc_to_system_mul;
31 s8 tsc_shift; 31 s8 tsc_shift;
32 u8 pad[3]; 32 u8 flags;
33 u8 pad[2];
33} __attribute__((__packed__)); /* 32 bytes */ 34} __attribute__((__packed__)); /* 32 bytes */
34 35
35struct pvclock_wall_clock { 36struct pvclock_wall_clock {
@@ -38,5 +39,6 @@ struct pvclock_wall_clock {
38 u32 nsec; 39 u32 nsec;
39} __attribute__((__packed__)); 40} __attribute__((__packed__));
40 41
42#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
41#endif /* __ASSEMBLY__ */ 43#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H */ 44#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8ce..cd02f324aa6b 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9void pvclock_set_flags(u8 flags);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 10unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
10void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 11void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
11 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d0..000000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#define PFX "rdc321x: "
-
-/* General purpose configuration and data registers */
-#define RDC3210_CFGREG_ADDR	0x0CF8
-#define RDC3210_CFGREG_DATA	0x0CFC
-
-#define RDC321X_GPIO_CTRL_REG1	0x48
-#define RDC321X_GPIO_CTRL_REG2	0x84
-#define RDC321X_GPIO_DATA_REG1	0x4c
-#define RDC321X_GPIO_DATA_REG2	0x88
-
-#define RDC321X_MAX_GPIO	58
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 75af592677ec..fb0b1874396f 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
1#ifndef _ASM_X86_SCATTERLIST_H 1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H 2#define _ASM_X86_SCATTERLIST_H
3 3
4#define ISA_DMA_THRESHOLD (0x00ffffff)
5
6#include <asm-generic/scatterlist.h> 4#include <asm-generic/scatterlist.h>
7 5
6#define ISA_DMA_THRESHOLD (0x00ffffff)
7#define ARCH_HAS_SG_CHAIN
8
8#endif /* _ASM_X86_SCATTERLIST_H */ 9#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 48dcfa62ea07..fd921c3a6841 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -15,6 +15,8 @@ static inline int arch_prepare_suspend(void) { return 0; }
15struct saved_context { 15struct saved_context {
16 u16 es, fs, gs, ss; 16 u16 es, fs, gs, ss;
17 unsigned long cr0, cr2, cr3, cr4; 17 unsigned long cr0, cr2, cr3, cr4;
18 u64 misc_enable;
19 bool misc_enable_saved;
18 struct desc_ptr gdt; 20 struct desc_ptr gdt;
19 struct desc_ptr idt; 21 struct desc_ptr idt;
20 u16 ldt; 22 u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 06284f42b759..8d942afae681 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -27,6 +27,8 @@ struct saved_context {
27 u16 ds, es, fs, gs, ss; 27 u16 ds, es, fs, gs, ss;
28 unsigned long gs_base, gs_kernel_base, fs_base; 28 unsigned long gs_base, gs_kernel_base, fs_base;
29 unsigned long cr0, cr2, cr3, cr4, cr8; 29 unsigned long cr0, cr2, cr3, cr4, cr8;
30 u64 misc_enable;
31 bool misc_enable_saved;
30 unsigned long efer; 32 unsigned long efer;
31 u16 gdt_pad; 33 u16 gdt_pad;
32 u16 gdt_limit; 34 u16 gdt_limit;
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4c..0e831059ac5a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 81 u32 event_inj_err;
82 u64 nested_cr3; 82 u64 nested_cr3;
83 u64 lbr_ctl; 83 u64 lbr_ctl;
84 u8 reserved_5[832]; 84 u64 reserved_5;
85 u64 next_rip;
86 u8 reserved_6[816];
85}; 87};
86 88
87 89
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
115#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) 117#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
116#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) 118#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
117 119
120#define SVM_VM_CR_VALID_MASK 0x001fULL
121#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
122#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
123
118struct __attribute__ ((__packed__)) vmcb_seg { 124struct __attribute__ ((__packed__)) vmcb_seg {
119 u16 selector; 125 u16 selector;
120 u16 attrib; 126 u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
238 244
239#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 245#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
240#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
241 248
242#define SVM_EXIT_READ_CR0 0x000 249#define SVM_EXIT_READ_CR0 0x000
243#define SVM_EXIT_READ_CR3 0x003 250#define SVM_EXIT_READ_CR3 0x003
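
The new TS_HAS_ERROR_CODE shift sits in the same EXITINFO2 qualifier as the existing IRET/JMP reason bits. A minimal sketch of how an intercept handler might test them; only the shift definitions come from the header above, the helper names are invented:

#include <linux/types.h>
#include <asm/svm.h>

/* Did the intercepted task switch come with an error code to deliver? */
static inline bool ts_has_error_code(u64 exit_info_2)
{
	return exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE);
}

/* Was the switch caused by IRET or by a far JMP? */
static inline bool ts_reason_is_iret(u64 exit_info_2)
{
	return exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET);
}

static inline bool ts_reason_is_jmp(u64 exit_info_2)
{
	return exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP);
}
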
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 62ba9400cc43..f0b6e5dbc5a0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -239,8 +239,8 @@ static inline struct thread_info *current_thread_info(void)
239#define TS_USEDFPU 0x0001 /* FPU was used by this task 239#define TS_USEDFPU 0x0001 /* FPU was used by this task
240 this quantum (SMP) */ 240 this quantum (SMP) */
241#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ 241#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
242#define TS_POLLING 0x0004 /* true if in idle loop 242#define TS_POLLING 0x0004 /* idle task polling need_resched,
243 and not sleeping */ 243 skip sending interrupt */
244#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ 244#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
245 245
246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) 246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c5087d796587..21899cc31e52 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -53,33 +53,29 @@
53extern int cpu_to_node_map[]; 53extern int cpu_to_node_map[];
54 54
55/* Returns the number of the node containing CPU 'cpu' */ 55/* Returns the number of the node containing CPU 'cpu' */
56static inline int cpu_to_node(int cpu) 56static inline int __cpu_to_node(int cpu)
57{ 57{
58 return cpu_to_node_map[cpu]; 58 return cpu_to_node_map[cpu];
59} 59}
60#define early_cpu_to_node(cpu) cpu_to_node(cpu) 60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
61 62
62#else /* CONFIG_X86_64 */ 63#else /* CONFIG_X86_64 */
63 64
64/* Mappings between logical cpu number and node number */ 65/* Mappings between logical cpu number and node number */
65DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
66 67
67/* Returns the number of the current Node. */
68DECLARE_PER_CPU(int, node_number);
69#define numa_node_id() percpu_read(node_number)
70
71#ifdef CONFIG_DEBUG_PER_CPU_MAPS 68#ifdef CONFIG_DEBUG_PER_CPU_MAPS
72extern int cpu_to_node(int cpu); 69/*
70 * override generic percpu implementation of cpu_to_node
71 */
72extern int __cpu_to_node(int cpu);
73#define cpu_to_node __cpu_to_node
74
73extern int early_cpu_to_node(int cpu); 75extern int early_cpu_to_node(int cpu);
74 76
75#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 77#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
76 78
77/* Returns the number of the node containing CPU 'cpu' */
78static inline int cpu_to_node(int cpu)
79{
80 return per_cpu(x86_cpu_to_node_map, cpu);
81}
82
83/* Same function but used if called before per_cpu areas are setup */ 79/* Same function but used if called before per_cpu areas are setup */
84static inline int early_cpu_to_node(int cpu) 80static inline int early_cpu_to_node(int cpu)
85{ 81{
@@ -170,6 +166,10 @@ static inline int numa_node_id(void)
170{ 166{
171 return 0; 167 return 0;
172} 168}
169/*
170 * indicate override:
171 */
172#define numa_node_id numa_node_id
173 173
174static inline int early_cpu_to_node(int cpu) 174static inline int early_cpu_to_node(int cpu)
175{ 175{
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740ec..9e6779f7cf2d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
25 * 25 *
26 */ 26 */
27 27
28#include <linux/types.h>
29
28/* 30/*
29 * Definitions of Primary Processor-Based VM-Execution Controls. 31 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */ 32 */
@@ -120,6 +122,8 @@ enum vmcs_field {
120 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 122 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
121 GUEST_IA32_PAT = 0x00002804, 123 GUEST_IA32_PAT = 0x00002804,
122 GUEST_IA32_PAT_HIGH = 0x00002805, 124 GUEST_IA32_PAT_HIGH = 0x00002805,
125 GUEST_IA32_EFER = 0x00002806,
126 GUEST_IA32_EFER_HIGH = 0x00002807,
123 GUEST_PDPTR0 = 0x0000280a, 127 GUEST_PDPTR0 = 0x0000280a,
124 GUEST_PDPTR0_HIGH = 0x0000280b, 128 GUEST_PDPTR0_HIGH = 0x0000280b,
125 GUEST_PDPTR1 = 0x0000280c, 129 GUEST_PDPTR1 = 0x0000280c,
@@ -130,6 +134,8 @@ enum vmcs_field {
130 GUEST_PDPTR3_HIGH = 0x00002811, 134 GUEST_PDPTR3_HIGH = 0x00002811,
131 HOST_IA32_PAT = 0x00002c00, 135 HOST_IA32_PAT = 0x00002c00,
132 HOST_IA32_PAT_HIGH = 0x00002c01, 136 HOST_IA32_PAT_HIGH = 0x00002c01,
137 HOST_IA32_EFER = 0x00002c02,
138 HOST_IA32_EFER_HIGH = 0x00002c03,
133 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 139 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
134 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 140 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
135 EXCEPTION_BITMAP = 0x00004004, 141 EXCEPTION_BITMAP = 0x00004004,
@@ -394,6 +400,10 @@ enum vmcs_field {
394#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" 400#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
395#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" 401#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
396 402
397 403struct vmx_msr_entry {
404 u32 index;
405 u32 reserved;
406 u64 value;
407} __aligned(16);
398 408
399#endif 409#endif
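
The new vmx_msr_entry matches the 16-byte slot layout used by the VM-entry/VM-exit MSR-load and MSR-store areas. A hedged sketch of filling such a table; the autoload_table wrapper and helper are assumptions for illustration, and programming the table's address and count into the VMCS is not shown:

#include <linux/types.h>
#include <linux/errno.h>
#include <asm/vmx.h>

#define NR_AUTOLOAD_MSRS	4	/* illustrative size only */

/* Hypothetical wrapper around an MSR-load area made of vmx_msr_entry slots. */
struct autoload_table {
	unsigned int nr;
	struct vmx_msr_entry entry[NR_AUTOLOAD_MSRS];
};

static int autoload_add(struct autoload_table *t, u32 msr, u64 value)
{
	if (t->nr >= NR_AUTOLOAD_MSRS)
		return -ENOSPC;

	t->entry[t->nr].index = msr;
	t->entry[t->nr].reserved = 0;	/* reserved field must stay zero */
	t->entry[t->nr].value = value;
	t->nr++;

	return 0;
}
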
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 488be461a380..60cc4058ed5f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
63int acpi_noirq; /* skip ACPI IRQ initialization */ 63int acpi_noirq; /* skip ACPI IRQ initialization */
64int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ 64int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
65EXPORT_SYMBOL(acpi_pci_disabled); 65EXPORT_SYMBOL(acpi_pci_disabled);
66int acpi_ht __initdata = 1; /* enable HT */
67 66
68int acpi_lapic; 67int acpi_lapic;
69int acpi_ioapic; 68int acpi_ioapic;
@@ -1501,9 +1500,8 @@ void __init acpi_boot_table_init(void)
1501 1500
1502 /* 1501 /*
1503 * If acpi_disabled, bail out 1502 * If acpi_disabled, bail out
1504 * One exception: acpi=ht continues far enough to enumerate LAPICs
1505 */ 1503 */
1506 if (acpi_disabled && !acpi_ht) 1504 if (acpi_disabled)
1507 return; 1505 return;
1508 1506
1509 /* 1507 /*
@@ -1534,9 +1532,8 @@ int __init early_acpi_boot_init(void)
1534{ 1532{
1535 /* 1533 /*
1536 * If acpi_disabled, bail out 1534 * If acpi_disabled, bail out
1537 * One exception: acpi=ht continues far enough to enumerate LAPICs
1538 */ 1535 */
1539 if (acpi_disabled && !acpi_ht) 1536 if (acpi_disabled)
1540 return 1; 1537 return 1;
1541 1538
1542 /* 1539 /*
@@ -1554,9 +1551,8 @@ int __init acpi_boot_init(void)
1554 1551
1555 /* 1552 /*
1556 * If acpi_disabled, bail out 1553 * If acpi_disabled, bail out
1557 * One exception: acpi=ht continues far enough to enumerate LAPICs
1558 */ 1554 */
1559 if (acpi_disabled && !acpi_ht) 1555 if (acpi_disabled)
1560 return 1; 1556 return 1;
1561 1557
1562 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); 1558 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1591,21 +1587,12 @@ static int __init parse_acpi(char *arg)
1591 /* acpi=force to over-ride black-list */ 1587 /* acpi=force to over-ride black-list */
1592 else if (strcmp(arg, "force") == 0) { 1588 else if (strcmp(arg, "force") == 0) {
1593 acpi_force = 1; 1589 acpi_force = 1;
1594 acpi_ht = 1;
1595 acpi_disabled = 0; 1590 acpi_disabled = 0;
1596 } 1591 }
1597 /* acpi=strict disables out-of-spec workarounds */ 1592 /* acpi=strict disables out-of-spec workarounds */
1598 else if (strcmp(arg, "strict") == 0) { 1593 else if (strcmp(arg, "strict") == 0) {
1599 acpi_strict = 1; 1594 acpi_strict = 1;
1600 } 1595 }
1601 /* Limit ACPI just to boot-time to enable HT */
1602 else if (strcmp(arg, "ht") == 0) {
1603 if (!acpi_force) {
1604 printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
1605 disable_acpi();
1606 }
1607 acpi_ht = 1;
1608 }
1609 /* acpi=rsdt use RSDT instead of XSDT */ 1596 /* acpi=rsdt use RSDT instead of XSDT */
1610 else if (strcmp(arg, "rsdt") == 0) { 1597 else if (strcmp(arg, "rsdt") == 0) {
1611 acpi_rsdt_forced = 1; 1598 acpi_rsdt_forced = 1;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e557..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
167 str = strchr(str, ','); 165 str = strchr(str, ',');
168 if (str != NULL) 166 if (str != NULL)
169 str += strspn(str, ", \t"); 167 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b0593..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
1 .section .text.page_aligned 1 .section .text..page_aligned
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page_types.h> 4#include <asm/page_types.h>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fa5a1474cd18..0d20286d78c6 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1487,6 +1487,7 @@ static int __attach_device(struct device *dev,
1487 struct protection_domain *domain) 1487 struct protection_domain *domain)
1488{ 1488{
1489 struct iommu_dev_data *dev_data, *alias_data; 1489 struct iommu_dev_data *dev_data, *alias_data;
1490 int ret;
1490 1491
1491 dev_data = get_dev_data(dev); 1492 dev_data = get_dev_data(dev);
1492 alias_data = get_dev_data(dev_data->alias); 1493 alias_data = get_dev_data(dev_data->alias);
@@ -1498,13 +1499,14 @@ static int __attach_device(struct device *dev,
1498 spin_lock(&domain->lock); 1499 spin_lock(&domain->lock);
1499 1500
1500 /* Some sanity checks */ 1501 /* Some sanity checks */
1502 ret = -EBUSY;
1501 if (alias_data->domain != NULL && 1503 if (alias_data->domain != NULL &&
1502 alias_data->domain != domain) 1504 alias_data->domain != domain)
1503 return -EBUSY; 1505 goto out_unlock;
1504 1506
1505 if (dev_data->domain != NULL && 1507 if (dev_data->domain != NULL &&
1506 dev_data->domain != domain) 1508 dev_data->domain != domain)
1507 return -EBUSY; 1509 goto out_unlock;
1508 1510
1509 /* Do real assignment */ 1511 /* Do real assignment */
1510 if (dev_data->alias != dev) { 1512 if (dev_data->alias != dev) {
@@ -1520,10 +1522,14 @@ static int __attach_device(struct device *dev,
1520 1522
1521 atomic_inc(&dev_data->bind); 1523 atomic_inc(&dev_data->bind);
1522 1524
1525 ret = 0;
1526
1527out_unlock:
1528
1523 /* ready */ 1529 /* ready */
1524 spin_unlock(&domain->lock); 1530 spin_unlock(&domain->lock);
1525 1531
1526 return 0; 1532 return ret;
1527} 1533}
1528 1534
1529/* 1535/*
@@ -2324,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
2324 2330
2325 iommu_detected = 1; 2331 iommu_detected = 1;
2326 swiotlb = 0; 2332 swiotlb = 0;
2327#ifdef CONFIG_GART_IOMMU
2328 gart_iommu_aperture_disabled = 1;
2329 gart_iommu_aperture = 0;
2330#endif
2331 2333
2332 /* Make the driver finally visible to the drivers */ 2334 /* Make the driver finally visible to the drivers */
2333 dma_ops = &amd_iommu_dma_ops; 2335 dma_ops = &amd_iommu_dma_ops;
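
The __attach_device() hunk replaces early `return -EBUSY` statements, which would have left &domain->lock held, with a single unlock-and-return path. The same shape in isolation (hypothetical function, not the driver code):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

/* Never return with the lock held; funnel every failure through one exit. */
static int attach_locked(spinlock_t *lock, bool busy_a, bool busy_b)
{
	int ret;

	spin_lock(lock);

	ret = -EBUSY;			/* default result for the sanity checks */
	if (busy_a)
		goto out_unlock;
	if (busy_b)
		goto out_unlock;

	/* ... do the real assignment here ... */
	ret = 0;

out_unlock:
	spin_unlock(lock);
	return ret;
}
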
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3bacb4d0844c..3cc63e2b8dd4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -287,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
287{ 287{
288 u8 *ret; 288 u8 *ret;
289 289
290 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) 290 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
291 pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
292 address);
293 pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
291 return NULL; 294 return NULL;
295 }
292 296
293 ret = ioremap_nocache(address, MMIO_REGION_LENGTH); 297 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
294 if (ret != NULL) 298 if (ret != NULL)
@@ -1314,7 +1318,7 @@ static int __init amd_iommu_init(void)
1314 ret = amd_iommu_init_dma_ops(); 1318 ret = amd_iommu_init_dma_ops();
1315 1319
1316 if (ret) 1320 if (ret)
1317 goto free; 1321 goto free_disable;
1318 1322
1319 amd_iommu_init_api(); 1323 amd_iommu_init_api();
1320 1324
@@ -1332,9 +1336,10 @@ static int __init amd_iommu_init(void)
1332out: 1336out:
1333 return ret; 1337 return ret;
1334 1338
1335free: 1339free_disable:
1336 disable_iommus(); 1340 disable_iommus();
1337 1341
1342free:
1338 amd_iommu_uninit_devices(); 1343 amd_iommu_uninit_devices();
1339 1344
1340 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1345 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
@@ -1353,6 +1358,15 @@ free:
1353 1358
1354 free_unity_maps(); 1359 free_unity_maps();
1355 1360
1361#ifdef CONFIG_GART_IOMMU
1362 /*
1363 * We failed to initialize the AMD IOMMU - try fallback to GART
1364 * if possible.
1365 */
1366 gart_iommu_init();
1367
1368#endif
1369
1356 goto out; 1370 goto out;
1357} 1371}
1358 1372
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 192cd7ee35cc..0bcc5aeda998 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
51#include <asm/smp.h> 51#include <asm/smp.h>
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h> 53#include <asm/kvm_para.h>
54#include <asm/tsc.h>
54 55
55unsigned int num_processors; 56unsigned int num_processors;
56 57
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
1151 */ 1152 */
1152void __cpuinit setup_local_APIC(void) 1153void __cpuinit setup_local_APIC(void)
1153{ 1154{
1154 unsigned int value; 1155 unsigned int value, queued;
1155 int i, j; 1156 int i, j, acked = 0;
1157 unsigned long long tsc = 0, ntsc;
1158 long long max_loops = cpu_khz;
1159
1160 if (cpu_has_tsc)
1161 rdtscll(tsc);
1156 1162
1157 if (disable_apic) { 1163 if (disable_apic) {
1158 arch_disable_smp_support(); 1164 arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
1204 * the interrupt. Hence a vector might get locked. It was noticed 1210 * the interrupt. Hence a vector might get locked. It was noticed
1205 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 1211 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
1206 */ 1212 */
1207 for (i = APIC_ISR_NR - 1; i >= 0; i--) { 1213 do {
1208 value = apic_read(APIC_ISR + i*0x10); 1214 queued = 0;
1209 for (j = 31; j >= 0; j--) { 1215 for (i = APIC_ISR_NR - 1; i >= 0; i--)
1210 if (value & (1<<j)) 1216 queued |= apic_read(APIC_IRR + i*0x10);
1211 ack_APIC_irq(); 1217
1218 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
1219 value = apic_read(APIC_ISR + i*0x10);
1220 for (j = 31; j >= 0; j--) {
1221 if (value & (1<<j)) {
1222 ack_APIC_irq();
1223 acked++;
1224 }
1225 }
1212 } 1226 }
1213 } 1227 if (acked > 256) {
1228 printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
1229 acked);
1230 break;
1231 }
1232 if (cpu_has_tsc) {
1233 rdtscll(ntsc);
1234 max_loops = (cpu_khz << 10) - (ntsc - tsc);
1235 } else
1236 max_loops--;
1237 } while (queued && max_loops > 0);
1238 WARN_ON(max_loops <= 0);
1214 1239
1215 /* 1240 /*
1216 * Now that we are all set up, enable the APIC 1241 * Now that we are all set up, enable the APIC
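
The reworked setup_local_APIC() drain loop keeps acknowledging stale in-service vectors until the IRR reads back empty, bounded both by an EOI count and by a TSC cycle budget. A condensed, self-contained sketch of that control flow; the accessors are stubs standing in for apic_read(), ack_APIC_irq() and rdtscll():

#include <stdint.h>
#include <stdio.h>

#define ISR_BANKS 8	/* APIC_ISR_NR: eight 32-bit ISR/IRR registers */

/* Stubs standing in for the real register and TSC accessors. */
static uint32_t read_irr(int bank) { (void)bank; return 0; }
static uint32_t read_isr(int bank) { (void)bank; return 0; }
static void ack_irq(void) { }
static uint64_t read_tsc(void) { static uint64_t t; return t += 1000; }

/* Acknowledge stale in-service vectors until the IRR is clean or we give up. */
static void drain_pending(uint64_t budget_cycles)
{
	uint64_t start = read_tsc();
	int acked = 0;

	for (;;) {
		uint32_t queued = 0;
		int i, j;

		for (i = ISR_BANKS - 1; i >= 0; i--)
			queued |= read_irr(i);

		for (i = ISR_BANKS - 1; i >= 0; i--) {
			uint32_t isr = read_isr(i);

			for (j = 31; j >= 0; j--) {
				if (isr & (1u << j)) {
					ack_irq();
					acked++;
				}
			}
		}

		if (!queued)
			return;		/* nothing left pending: done */

		if (acked > 256 || read_tsc() - start > budget_cycles) {
			fprintf(stderr, "still pending after %d EOIs\n", acked);
			return;		/* bail out rather than loop forever */
		}
	}
}
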
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc83a002786e..68e4a6f2211e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void)
1121 oist = &per_cpu(orig_ist, cpu); 1121 oist = &per_cpu(orig_ist, cpu);
1122 1122
1123#ifdef CONFIG_NUMA 1123#ifdef CONFIG_NUMA
1124 if (cpu != 0 && percpu_read(node_number) == 0 && 1124 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1125 cpu_to_node(cpu) != NUMA_NO_NODE) 1125 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1126 percpu_write(node_number, cpu_to_node(cpu)); 1126 set_numa_node(early_cpu_to_node(cpu));
1127#endif 1127#endif
1128 1128
1129 me = current; 1129 me = current;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6f3dc8fbbfdc..7ec2123838e6 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1497 * simply keep the boost-disable flag in sync with the current global 1497 * simply keep the boost-disable flag in sync with the current global
1498 * state. 1498 * state.
1499 */ 1499 */
1500static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action, 1500static int cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu) 1501 void *hcpu)
1502{ 1502{
1503 unsigned cpu = (long)hcpu; 1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi; 1504 u32 lo, hi;
@@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
1528 return NOTIFY_OK; 1528 return NOTIFY_OK;
1529} 1529}
1530 1530
1531static struct notifier_block __cpuinitdata cpb_nb = { 1531static struct notifier_block cpb_nb = {
1532 .notifier_call = cpb_notify, 1532 .notifier_call = cpb_notify,
1533}; 1533};
1534 1534
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
8 8
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..745b54f9be89
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
1/*
2 * Bridge between MCE and APEI
3 *
 4 * On some machines, corrected memory errors are reported via the APEI
 5 * generic hardware error source (GHES) instead of a corrected Machine
 6 * Check. These corrected memory errors can be reported to user space
 7 * through /dev/mcelog by faking a corrected Machine Check, so that the
 8 * faulty memory page can be offlined by /sbin/mcelog once the error
 9 * count for that page exceeds the threshold.

10 *
11 * For fatal MCE, save MCE record into persistent storage via ERST, so
12 * that the MCE record can be logged after reboot via ERST.
13 *
14 * Copyright 2010 Intel Corp.
15 * Author: Huang Ying <ying.huang@intel.com>
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License version
19 * 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */
30
31#include <linux/kernel.h>
32#include <linux/acpi.h>
33#include <linux/cper.h>
34#include <acpi/apei.h>
35#include <asm/mce.h>
36
37#include "mce-internal.h"
38
39void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
40{
41 struct mce m;
42
43 /* Only corrected MC is reported */
44 if (!corrected)
45 return;
46
47 mce_setup(&m);
48 m.bank = 1;
49 /* Fake a memory read corrected error with unknown channel */
50 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
51 m.addr = mem_err->physical_addr;
52 mce_log(&m);
53 mce_notify_irq();
54}
55EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
56
57#define CPER_CREATOR_MCE \
58 UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
59 0x64, 0x90, 0xb8, 0x9d)
60#define CPER_SECTION_TYPE_MCE \
61 UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
62 0x04, 0x4a, 0x38, 0xfc)
63
64/*
65 * CPER specification (in UEFI specification 2.3 appendix N) requires
66 * byte-packed.
67 */
68struct cper_mce_record {
69 struct cper_record_header hdr;
70 struct cper_section_descriptor sec_hdr;
71 struct mce mce;
72} __packed;
73
74int apei_write_mce(struct mce *m)
75{
76 struct cper_mce_record rcd;
77
78 memset(&rcd, 0, sizeof(rcd));
79 memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
80 rcd.hdr.revision = CPER_RECORD_REV;
81 rcd.hdr.signature_end = CPER_SIG_END;
82 rcd.hdr.section_count = 1;
83 rcd.hdr.error_severity = CPER_SER_FATAL;
84 /* timestamp, platform_id, partition_id are all invalid */
85 rcd.hdr.validation_bits = 0;
86 rcd.hdr.record_length = sizeof(rcd);
87 rcd.hdr.creator_id = CPER_CREATOR_MCE;
88 rcd.hdr.notification_type = CPER_NOTIFY_MCE;
89 rcd.hdr.record_id = cper_next_record_id();
90 rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
91
92 rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
93 rcd.sec_hdr.section_length = sizeof(rcd.mce);
94 rcd.sec_hdr.revision = CPER_SEC_REV;
95 /* fru_id and fru_text is invalid */
96 rcd.sec_hdr.validation_bits = 0;
97 rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
98 rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
99 rcd.sec_hdr.section_severity = CPER_SER_FATAL;
100
101 memcpy(&rcd.mce, m, sizeof(*m));
102
103 return erst_write(&rcd.hdr);
104}
105
106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{
108 struct cper_mce_record rcd;
109 ssize_t len;
110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd));
112 if (len <= 0)
113 return len;
 114 /* Cannot skip other records in storage via ERST unless we clear them */
115 else if (len != sizeof(rcd) ||
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
117 if (printk_ratelimit())
118 pr_warning(
119 "MCE-APEI: Can not skip the unknown record in ERST");
120 return -EIO;
121 }
122
123 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id;
125
126 return sizeof(*m);
127}
128
129/* Check whether there is record in ERST */
130int apei_check_mce(void)
131{
132 return erst_get_record_count();
133}
134
135int apei_clear_mce(u64 record_id)
136{
137 return erst_clear(record_id);
138}
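
The new file only provides the ERST plumbing; a consumer is expected to check, read, and then clear records on the next boot. A sketch of that flow using the apei_* declarations added to mce-internal.h below; drain_persisted_mce() is hypothetical, and the real consumer is the /dev/mcelog read path patched further down:

#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/mce.h>

#include "mce-internal.h"

/* Drain MCE records persisted across a crash via APEI/ERST. */
static void drain_persisted_mce(void)
{
	struct mce m;
	u64 record_id;
	ssize_t len;

	if (apei_check_mce() <= 0)
		return;				/* nothing stored */

	for (;;) {
		len = apei_read_mce(&m, &record_id);
		if (len <= 0)
			break;			/* error or no more records */

		mce_log(&m);			/* feed it into the normal log */
		apei_clear_mce(record_id);	/* avoid re-reading it next boot */
	}
}
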
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id);
34int apei_check_mce(void);
35int apei_clear_mce(u64 record_id);
36#else
37static inline int apei_write_mce(struct mce *m)
38{
39 return -EINVAL;
40}
41static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
42{
43 return 0;
44}
45static inline int apei_check_mce(void)
46{
47 return 0;
48}
49static inline int apei_clear_mce(u64 record_id)
50{
51 return -EINVAL;
52}
53#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a355ddcc64b..18cc42562250 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
39 40
40#include <asm/processor.h> 41#include <asm/processor.h>
41#include <asm/hw_irq.h> 42#include <asm/hw_irq.h>
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
169 entry = rcu_dereference_check_mce(mcelog.next); 170 entry = rcu_dereference_check_mce(mcelog.next);
170 for (;;) { 171 for (;;) {
171 /* 172 /*
173 * If edac_mce is enabled, it will check the error type
174 * and will process it, if it is a known error.
175 * Otherwise, the error will be sent through mcelog
176 * interface
177 */
178 if (edac_mce_parse(mce))
179 return;
180
181 /*
172 * When the buffer fills up discard new entries. 182 * When the buffer fills up discard new entries.
173 * Assume that the earlier errors are the more 183 * Assume that the earlier errors are the more
174 * interesting ones: 184 * interesting ones:
@@ -264,7 +274,7 @@ static void wait_for_panic(void)
264 274
265static void mce_panic(char *msg, struct mce *final, char *exp) 275static void mce_panic(char *msg, struct mce *final, char *exp)
266{ 276{
267 int i; 277 int i, apei_err = 0;
268 278
269 if (!fake_panic) { 279 if (!fake_panic) {
270 /* 280 /*
@@ -287,8 +297,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
287 struct mce *m = &mcelog.entry[i]; 297 struct mce *m = &mcelog.entry[i];
288 if (!(m->status & MCI_STATUS_VAL)) 298 if (!(m->status & MCI_STATUS_VAL))
289 continue; 299 continue;
290 if (!(m->status & MCI_STATUS_UC)) 300 if (!(m->status & MCI_STATUS_UC)) {
291 print_mce(m); 301 print_mce(m);
302 if (!apei_err)
303 apei_err = apei_write_mce(m);
304 }
292 } 305 }
293 /* Now print uncorrected but with the final one last */ 306 /* Now print uncorrected but with the final one last */
294 for (i = 0; i < MCE_LOG_LEN; i++) { 307 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +310,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
297 continue; 310 continue;
298 if (!(m->status & MCI_STATUS_UC)) 311 if (!(m->status & MCI_STATUS_UC))
299 continue; 312 continue;
300 if (!final || memcmp(m, final, sizeof(struct mce))) 313 if (!final || memcmp(m, final, sizeof(struct mce))) {
301 print_mce(m); 314 print_mce(m);
315 if (!apei_err)
316 apei_err = apei_write_mce(m);
317 }
302 } 318 }
303 if (final) 319 if (final) {
304 print_mce(final); 320 print_mce(final);
321 if (!apei_err)
322 apei_err = apei_write_mce(final);
323 }
305 if (cpu_missing) 324 if (cpu_missing)
306 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 325 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
307 print_mce_tail(); 326 print_mce_tail();
@@ -1493,6 +1512,43 @@ static void collect_tscs(void *data)
1493 rdtscll(cpu_tsc[smp_processor_id()]); 1512 rdtscll(cpu_tsc[smp_processor_id()]);
1494} 1513}
1495 1514
1515static int mce_apei_read_done;
1516
1517/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1518static int __mce_read_apei(char __user **ubuf, size_t usize)
1519{
1520 int rc;
1521 u64 record_id;
1522 struct mce m;
1523
1524 if (usize < sizeof(struct mce))
1525 return -EINVAL;
1526
1527 rc = apei_read_mce(&m, &record_id);
1528 /* Error or no more MCE record */
1529 if (rc <= 0) {
1530 mce_apei_read_done = 1;
1531 return rc;
1532 }
1533 rc = -EFAULT;
1534 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1535 return rc;
1536 /*
 1537 * In fact, we should have cleared the record after it has
 1538 * been flushed to disk or sent over the network by
 1539 * /sbin/mcelog, but we have no interface to support that now,
1540 * so just clear it to avoid duplication.
1541 */
1542 rc = apei_clear_mce(record_id);
1543 if (rc) {
1544 mce_apei_read_done = 1;
1545 return rc;
1546 }
1547 *ubuf += sizeof(struct mce);
1548
1549 return 0;
1550}
1551
1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1552static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1497 loff_t *off) 1553 loff_t *off)
1498{ 1554{
@@ -1506,15 +1562,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1506 return -ENOMEM; 1562 return -ENOMEM;
1507 1563
1508 mutex_lock(&mce_read_mutex); 1564 mutex_lock(&mce_read_mutex);
1565
1566 if (!mce_apei_read_done) {
1567 err = __mce_read_apei(&buf, usize);
1568 if (err || buf != ubuf)
1569 goto out;
1570 }
1571
1509 next = rcu_dereference_check_mce(mcelog.next); 1572 next = rcu_dereference_check_mce(mcelog.next);
1510 1573
1511 /* Only supports full reads right now */ 1574 /* Only supports full reads right now */
1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1575 err = -EINVAL;
1513 mutex_unlock(&mce_read_mutex); 1576 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1514 kfree(cpu_tsc); 1577 goto out;
1515
1516 return -EINVAL;
1517 }
1518 1578
1519 err = 0; 1579 err = 0;
1520 prev = 0; 1580 prev = 0;
@@ -1562,10 +1622,15 @@ timeout:
1562 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1622 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1563 } 1623 }
1564 } 1624 }
1625
1626 if (err)
1627 err = -EFAULT;
1628
1629out:
1565 mutex_unlock(&mce_read_mutex); 1630 mutex_unlock(&mce_read_mutex);
1566 kfree(cpu_tsc); 1631 kfree(cpu_tsc);
1567 1632
1568 return err ? -EFAULT : buf - ubuf; 1633 return err ? err : buf - ubuf;
1569} 1634}
1570 1635
1571static unsigned int mce_poll(struct file *file, poll_table *wait) 1636static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1638,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1573 poll_wait(file, &mce_wait, wait); 1638 poll_wait(file, &mce_wait, wait);
1574 if (rcu_dereference_check_mce(mcelog.next)) 1639 if (rcu_dereference_check_mce(mcelog.next))
1575 return POLLIN | POLLRDNORM; 1640 return POLLIN | POLLRDNORM;
1641 if (!mce_apei_read_done && apei_check_mce())
1642 return POLLIN | POLLRDNORM;
1576 return 0; 1643 return 0;
1577} 1644}
1578 1645
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..e1a0a3bf9716 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
190 mutex_unlock(&therm_cpu_lock); 190 mutex_unlock(&therm_cpu_lock);
191 break; 191 break;
192 } 192 }
193 return err ? NOTIFY_BAD : NOTIFY_OK; 193 return notifier_from_errno(err);
194} 194}
195 195
196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = 196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
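
This hunk, like the cpuid.c and msr.c hunks later in the diff, switches hotplug callbacks to notifier_from_errno(), which encodes the errno into the notifier return value so the caller can recover it with notifier_to_errno() instead of seeing a bare NOTIFY_BAD. A small sketch of the pattern with an invented callback:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/errno.h>

/* Hypothetical hotplug callback: pass the errno through the return value. */
static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	int err = 0;

	switch (action) {
	case CPU_UP_PREPARE:
		err = -ENOMEM;		/* pretend the per-cpu allocation failed */
		break;
	default:
		break;
	}

	/* notifier_to_errno() on the caller's side yields -ENOMEM again. */
	return notifier_from_errno(err);
}
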
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fd4db0db3708..5db5b7d65a18 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
106 106
107 int n_events; 107 int n_events;
108 int n_added; 108 int n_added;
109 int n_txn;
109 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 110 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
110 u64 tags[X86_PMC_IDX_MAX]; 111 u64 tags[X86_PMC_IDX_MAX];
111 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 112 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
983out: 984out:
984 cpuc->n_events = n; 985 cpuc->n_events = n;
985 cpuc->n_added += n - n0; 986 cpuc->n_added += n - n0;
987 cpuc->n_txn += n - n0;
986 988
987 return 0; 989 return 0;
988} 990}
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
1089 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1091 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1090 int i; 1092 int i;
1091 1093
1094 /*
1095 * If we're called during a txn, we don't need to do anything.
1096 * The events never got scheduled and ->cancel_txn will truncate
1097 * the event_list.
1098 */
1099 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
1100 return;
1101
1092 x86_pmu_stop(event); 1102 x86_pmu_stop(event);
1093 1103
1094 for (i = 0; i < cpuc->n_events; i++) { 1104 for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1379 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1389 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1380 1390
1381 cpuc->group_flag |= PERF_EVENT_TXN_STARTED; 1391 cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
1392 cpuc->n_txn = 0;
1382} 1393}
1383 1394
1384/* 1395/*
@@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1391 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1402 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1392 1403
1393 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; 1404 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
1405 /*
1406 * Truncate the collected events.
1407 */
1408 cpuc->n_added -= cpuc->n_txn;
1409 cpuc->n_events -= cpuc->n_txn;
1394} 1410}
1395 1411
1396/* 1412/*
@@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1419 */ 1435 */
1420 memcpy(cpuc->assign, assign, n*sizeof(int)); 1436 memcpy(cpuc->assign, assign, n*sizeof(int));
1421 1437
1438 /*
1439 * Clear out the txn count so that ->cancel_txn() which gets
1440 * run after ->commit_txn() doesn't undo things.
1441 */
1442 cpuc->n_txn = 0;
1443
1422 return 0; 1444 return 0;
1423} 1445}
1424 1446
@@ -1717,7 +1739,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
1717 */ 1739 */
1718 regs->bp = rewind_frame_pointer(skip + 1); 1740 regs->bp = rewind_frame_pointer(skip + 1);
1719 regs->cs = __KERNEL_CS; 1741 regs->cs = __KERNEL_CS;
1720 local_save_flags(regs->flags); 1742 /*
1743 * We abuse bit 3 to pass exact information, see perf_misc_flags
1744 * and the comment with PERF_EFLAGS_EXACT.
1745 */
1746 regs->flags = 0;
1721} 1747}
1722 1748
1723unsigned long perf_instruction_pointer(struct pt_regs *regs) 1749unsigned long perf_instruction_pointer(struct pt_regs *regs)
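
The added n_txn counter records how many events were collected inside the current start_txn()/commit_txn() window so that cancel_txn() can roll n_added and n_events back, while commit_txn() zeroes it so a trailing cancel_txn() is harmless. A toy userspace model of just that bookkeeping (not the kernel's cpu_hw_events):

#include <assert.h>

struct cpu_events {
	int n_events;	/* events currently scheduled */
	int n_added;	/* added since the last PMU reprogram */
	int n_txn;	/* added inside the open transaction */
	int in_txn;
};

static void start_txn(struct cpu_events *c)  { c->in_txn = 1; c->n_txn = 0; }

static void add_event(struct cpu_events *c)
{
	c->n_events++;
	c->n_added++;
	if (c->in_txn)
		c->n_txn++;
}

static void cancel_txn(struct cpu_events *c)
{
	/* Throw away everything collected since start_txn(). */
	c->n_added -= c->n_txn;
	c->n_events -= c->n_txn;
	c->in_txn = 0;
}

static void commit_txn(struct cpu_events *c)
{
	/* Keep the events; clear n_txn so a later cancel_txn() is a no-op. */
	c->n_txn = 0;
	c->in_txn = 0;
}

int main(void)
{
	struct cpu_events c = { .n_events = 2, .n_added = 0 };

	start_txn(&c);
	add_event(&c);
	add_event(&c);
	cancel_txn(&c);			/* group failed to schedule */
	assert(c.n_events == 2 && c.n_added == 0);

	start_txn(&c);
	add_event(&c);
	commit_txn(&c);
	cancel_txn(&c);			/* core may still call this; harmless */
	assert(c.n_events == 3 && c.n_added == 1);
	return 0;
}
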
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 424fc8de68e4..ae85d69644d1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -465,15 +465,21 @@ out:
465 return rc; 465 return rc;
466} 466}
467 467
468static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 468static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
469{ 469{
470 unsigned long dummy; 470 int overflow = 0;
471 u32 low, high;
471 472
472 rdmsrl(hwc->config_base + hwc->idx, dummy); 473 rdmsr(hwc->config_base + hwc->idx, low, high);
473 if (dummy & P4_CCCR_OVF) { 474
475 /* we need to check high bit for unflagged overflows */
476 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
477 overflow = 1;
474 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 478 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
475 ((u64)dummy) & ~P4_CCCR_OVF); 479 ((u64)low) & ~P4_CCCR_OVF);
476 } 480 }
481
482 return overflow;
477} 483}
478 484
479static inline void p4_pmu_disable_event(struct perf_event *event) 485static inline void p4_pmu_disable_event(struct perf_event *event)
@@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
584 590
585 WARN_ON_ONCE(hwc->idx != idx); 591 WARN_ON_ONCE(hwc->idx != idx);
586 592
587 /* 593 /* it might be unflagged overflow */
588 * FIXME: Redundant call, actually not needed 594 handled = p4_pmu_clear_cccr_ovf(hwc);
589 * but just to check if we're screwed
590 */
591 p4_pmu_clear_cccr_ovf(hwc);
592 595
593 val = x86_perf_event_update(event); 596 val = x86_perf_event_update(event);
594 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 597 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
595 continue; 598 continue;
596 599
597 /* 600 /* event overflow for sure */
598 * event overflow 601 data.period = event->hw.last_period;
599 */
600 handled = 1;
601 data.period = event->hw.last_period;
602 602
603 if (!x86_perf_event_set_period(event)) 603 if (!x86_perf_event_set_period(event))
604 continue; 604 continue;
@@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
670 670
671/* 671/*
672 * ESCR address hashing is tricky, ESCRs are not sequential 672 * ESCR address hashing is tricky, ESCRs are not sequential
673 * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and 673 * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
674 * the metric between any ESCRs is laid in range [0xa0,0xe1] 674 * the metric between any ESCRs is laid in range [0xa0,0xe1]
675 * 675 *
676 * so we make ~70% filled hashtable 676 * so we make ~70% filled hashtable
@@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr)
735{ 735{
736 unsigned int idx = P4_ESCR_MSR_IDX(addr); 736 unsigned int idx = P4_ESCR_MSR_IDX(addr);
737 737
738 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || 738 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
739 !p4_escr_table[idx])) { 739 !p4_escr_table[idx] ||
740 p4_escr_table[idx] != addr)) {
740 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); 741 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
741 return -1; 742 return -1;
742 } 743 }
@@ -762,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
762{ 763{
763 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 764 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
764 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; 765 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
765 int cpu = raw_smp_processor_id(); 766 int cpu = smp_processor_id();
766 struct hw_perf_event *hwc; 767 struct hw_perf_event *hwc;
767 struct p4_event_bind *bind; 768 struct p4_event_bind *bind;
768 unsigned int i, thread, num; 769 unsigned int i, thread, num;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900fe..1b7b31ab7d86 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
170 cpuid_device_destroy(cpu); 170 cpuid_device_destroy(cpu);
171 break; 171 break;
172 } 172 }
173 return err ? NOTIFY_BAD : NOTIFY_OK; 173 return notifier_from_errno(err);
174} 174}
175 175
176static struct notifier_block __refdata cpuid_class_cpu_notifier = 176static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3a54dcb9cd0e..43e9ccf44947 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task);
34/* 34/*
35 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 35 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
36 * no more per-task TSS's. The TSS size is kept cacheline-aligned 36 * no more per-task TSS's. The TSS size is kept cacheline-aligned
37 * so they are allowed to end up in the .data.cacheline_aligned 37 * so they are allowed to end up in the .data..cacheline_aligned
38 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
39 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
40 */ 40 */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4f..eb9b76c716c2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
29#define KVM_SCALE 22 29#define KVM_SCALE 22
30 30
31static int kvmclock = 1; 31static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
32 34
33static int parse_no_kvmclock(char *arg) 35static int parse_no_kvmclock(char *arg)
34{ 36{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
54 56
55 low = (int)__pa_symbol(&wall_clock); 57 low = (int)__pa_symbol(&wall_clock);
56 high = ((u64)__pa_symbol(&wall_clock) >> 32); 58 high = ((u64)__pa_symbol(&wall_clock) >> 32);
57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 59
60 native_write_msr(msr_kvm_wall_clock, low, high);
58 61
59 vcpu_time = &get_cpu_var(hv_clock); 62 vcpu_time = &get_cpu_var(hv_clock);
60 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 63 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
132 cpu, high, low, txt); 135 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 136
137 return native_write_msr_safe(msr_kvm_system_time, low, high);
134} 138}
135 139
136#ifdef CONFIG_X86_LOCAL_APIC 140#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
165#ifdef CONFIG_KEXEC 169#ifdef CONFIG_KEXEC
166static void kvm_crash_shutdown(struct pt_regs *regs) 170static void kvm_crash_shutdown(struct pt_regs *regs)
167{ 171{
168 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 172 native_write_msr(msr_kvm_system_time, 0, 0);
169 native_machine_crash_shutdown(regs); 173 native_machine_crash_shutdown(regs);
170} 174}
171#endif 175#endif
172 176
173static void kvm_shutdown(void) 177static void kvm_shutdown(void)
174{ 178{
175 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 179 native_write_msr(msr_kvm_system_time, 0, 0);
176 native_machine_shutdown(); 180 native_machine_shutdown();
177} 181}
178 182
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
181 if (!kvm_para_available()) 185 if (!kvm_para_available())
182 return; 186 return;
183 187
184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 188 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
185 if (kvm_register_clock("boot clock")) 189 msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
186 return; 190 msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
187 pv_time_ops.sched_clock = kvm_clock_read; 191 } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
188 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 192 return;
189 x86_platform.get_wallclock = kvm_get_wallclock; 193
190 x86_platform.set_wallclock = kvm_set_wallclock; 194 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
195 msr_kvm_system_time, msr_kvm_wall_clock);
196
197 if (kvm_register_clock("boot clock"))
198 return;
199 pv_time_ops.sched_clock = kvm_clock_read;
200 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
201 x86_platform.get_wallclock = kvm_get_wallclock;
202 x86_platform.set_wallclock = kvm_set_wallclock;
191#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
192 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
194#endif 206#endif
195#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
197#endif 209#endif
198 machine_ops.shutdown = kvm_shutdown; 210 machine_ops.shutdown = kvm_shutdown;
199#ifdef CONFIG_KEXEC 211#ifdef CONFIG_KEXEC
200 machine_ops.crash_shutdown = kvm_crash_shutdown; 212 machine_ops.crash_shutdown = kvm_crash_shutdown;
201#endif 213#endif
202 kvm_get_preset_lpj(); 214 kvm_get_preset_lpj();
203 clocksource_register(&kvm_clock); 215 clocksource_register(&kvm_clock);
204 pv_info.paravirt_enabled = 1; 216 pv_info.paravirt_enabled = 1;
205 pv_info.name = "KVM"; 217 pv_info.name = "KVM";
206 } 218
219 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
220 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
207} 221}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2cd8c544e41a..fa6551d36c10 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
260} 260}
261 261
262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); 262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
263MODULE_ALIAS("devname:cpu/microcode");
263#else 264#else
264#define microcode_dev_init() 0 265#define microcode_dev_init() 0
265#define microcode_dev_exit() do { } while (0) 266#define microcode_dev_exit() do { } while (0)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47c..7bf2dc4c8f70 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
230 msr_device_destroy(cpu); 230 msr_device_destroy(cpu);
231 break; 231 break;
232 } 232 }
233 return err ? NOTIFY_BAD : NOTIFY_OK; 233 return notifier_from_errno(err);
234} 234}
235 235
236static struct notifier_block __refdata msr_class_cpu_notifier = { 236static struct notifier_block __refdata msr_class_cpu_notifier = {
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20e..a5bc528d4328 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
31 .free_coherent = swiotlb_free_coherent, 31 .free_coherent = swiotlb_free_coherent,
32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
33 .sync_single_for_device = swiotlb_sync_single_for_device, 33 .sync_single_for_device = swiotlb_sync_single_for_device,
34 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
35 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
36 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 34 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
37 .sync_sg_for_device = swiotlb_sync_sg_for_device, 35 .sync_sg_for_device = swiotlb_sync_sg_for_device,
38 .map_sg = swiotlb_map_sg_attrs, 36 .map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..239427ca02af 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
31 u32 tsc_to_nsec_mul; 31 u32 tsc_to_nsec_mul;
32 int tsc_shift; 32 int tsc_shift;
33 u32 version; 33 u32 version;
34 u8 flags;
34}; 35};
35 36
37static u8 valid_flags __read_mostly = 0;
38
39void pvclock_set_flags(u8 flags)
40{
41 valid_flags = flags;
42}
43
36/* 44/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result. 46 * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
91 dst->system_timestamp = src->system_time; 99 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul; 100 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift; 101 dst->tsc_shift = src->tsc_shift;
102 dst->flags = src->flags;
94 rmb(); /* test version after fetching data */ 103 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version)); 104 } while ((src->version & 1) || (dst->version != src->version));
96 105
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
109 return pv_tsc_khz; 118 return pv_tsc_khz;
110} 119}
111 120
121static atomic64_t last_value = ATOMIC64_INIT(0);
122
112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
113{ 124{
114 struct pvclock_shadow_time shadow; 125 struct pvclock_shadow_time shadow;
115 unsigned version; 126 unsigned version;
116 cycle_t ret, offset; 127 cycle_t ret, offset;
128 u64 last;
117 129
118 do { 130 do {
119 version = pvclock_get_time_values(&shadow, src); 131 version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
123 barrier(); 135 barrier();
124 } while (version != src->version); 136 } while (version != src->version);
125 137
138 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
139 (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
140 return ret;
141
142 /*
143 * Assumption here is that last_value, a global accumulator, always goes
144 * forward. If we are less than that, we should not be much smaller.
 145 * We assume we stay within an acceptable error margin, so the correction
 146 * does not sacrifice accuracy.
 147 *
 148 * For reads: the global value may have changed between test and return,
 149 * but that only means someone else poked the clock at a later time.
 150 * We just need to make sure we are not seeing a backwards event.
 151 *
 152 * For updates: last_value = ret is not enough, since two vcpus could be
 153 * updating at the same time, and one of them could be slightly behind,
 154 * breaking the assumption that last_value always moves forward.
155 */
156 last = atomic64_read(&last_value);
157 do {
158 if (ret < last)
159 return last;
160 last = atomic64_cmpxchg(&last_value, last, ret);
161 } while (unlikely(last != ret));
162
126 return ret; 163 return ret;
127} 164}
128 165
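
When host and guest do not both agree on PVCLOCK_TSC_STABLE_BIT, the new code clamps every reading against a global atomic64 so the paravirt clock can never appear to jump backwards across vcpus. The clamp in isolation, reusing the same atomic64 primitives shown in the hunk (the function name is invented):

#include <linux/types.h>
#include <asm/atomic.h>

static atomic64_t last_seen = ATOMIC64_INIT(0);

/*
 * Never let the returned time move backwards: publish our reading only
 * if it is newer than the global maximum, otherwise return that maximum.
 */
static u64 monotonic_clamp(u64 now)
{
	u64 last = atomic64_read(&last_seen);

	do {
		if (now < last)
			return last;	/* someone already saw a later time */
		last = atomic64_cmpxchg(&last_seen, last, now);
	} while (last != now);		/* raced with another vcpu: retry */

	return now;
}
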
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e8029896309a..b4ae4acbd031 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), 676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 }, 677 },
678 }, 678 },
679 /*
680 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
681 * match on the product name.
682 */
683 {
684 .callback = dmi_low_memory_corruption,
685 .ident = "Phoenix BIOS",
686 .matches = {
687 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
688 },
689 },
679#endif 690#endif
680 {} 691 {}
681}; 692};
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e70..de3b63ae3da2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,12 +21,6 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
26#else
27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
28#endif
29
30DEFINE_PER_CPU(int, cpu_number); 24DEFINE_PER_CPU(int, cpu_number);
31EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
32 26
@@ -247,7 +241,7 @@ void __init setup_per_cpu_areas(void)
247#endif 241#endif
248#endif 242#endif
249 /* 243 /*
250 * Up to this point, the boot CPU has been using .data.init 244 * Up to this point, the boot CPU has been using .init.data
251 * area. Reload any changed state for the boot CPU. 245 * area. Reload any changed state for the boot CPU.
252 */ 246 */
253 if (cpu == boot_cpu_id) 247 if (cpu == boot_cpu_id)
@@ -265,10 +259,10 @@ void __init setup_per_cpu_areas(void)
265 259
266#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 260#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
267 /* 261 /*
268 * make sure boot cpu node_number is right, when boot cpu is on the 262 * make sure boot cpu numa_node is right, when boot cpu is on the
269 * node that doesn't have mem installed 263 * node that doesn't have mem installed
270 */ 264 */
271 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); 265 set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
272#endif 266#endif
273 267
274 /* Setup node to cpumask map */ 268 /* Setup node to cpumask map */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a0..c4f33b2e77d6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
686static void __cpuinit announce_cpu(int cpu, int apicid) 686static void __cpuinit announce_cpu(int cpu, int apicid)
687{ 687{
688 static int current_node = -1; 688 static int current_node = -1;
689 int node = cpu_to_node(cpu); 689 int node = early_cpu_to_node(cpu);
690 690
691 if (system_state == SYSTEM_BOOTING) { 691 if (system_state == SYSTEM_BOOTING) {
692 if (node != current_node) { 692 if (node != current_node) {
@@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void)
1215 if (!num_processors) 1215 if (!num_processors)
1216 num_processors = 1; 1216 num_processors = 1;
1217 1217
1218 if (setup_possible_cpus == -1) 1218 i = setup_max_cpus ?: 1;
1219 possible = num_processors + disabled_cpus; 1219 if (setup_possible_cpus == -1) {
1220 else 1220 possible = num_processors;
1221#ifdef CONFIG_HOTPLUG_CPU
1222 if (setup_max_cpus)
1223 possible += disabled_cpus;
1224#else
1225 if (possible > i)
1226 possible = i;
1227#endif
1228 } else
1221 possible = setup_possible_cpus; 1229 possible = setup_possible_cpus;
1222 1230
1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1231 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void)
1230 possible = nr_cpu_ids; 1238 possible = nr_cpu_ids;
1231 } 1239 }
1232 1240
1241#ifdef CONFIG_HOTPLUG_CPU
1242 if (!setup_max_cpus)
1243#endif
1244 if (possible > i) {
1245 printk(KERN_WARNING
1246 "%d Processors exceeds max_cpus limit of %u\n",
1247 possible, setup_max_cpus);
1248 possible = i;
1249 }
1250
1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1251 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1234 possible, max_t(int, possible - num_processors, 0)); 1252 possible, max_t(int, possible - num_processors, 0));
1235 1253
1236 for (i = 0; i < possible; i++) 1254 for (i = 0; i < possible; i++)
1237 set_cpu_possible(i, true); 1255 set_cpu_possible(i, true);
1256 for (; i < NR_CPUS; i++)
1257 set_cpu_possible(i, false);
1238 1258
1239 nr_cpu_ids = possible; 1259 nr_cpu_ids = possible;
1240} 1260}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index cc2c60474fd0..c2f1b26141e2 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
46 46
47/* Global pointer to shared data; NULL means no measured launch. */ 47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly; 48struct tboot *tboot __read_mostly;
49EXPORT_SYMBOL(tboot);
49 50
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ 51/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1 52#define AP_WAIT_TIMEOUT 1
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2cc249718c46..d0bb52296fa3 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
97 HEAD_TEXT 97 HEAD_TEXT
98#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
99 . = ALIGN(PAGE_SIZE); 99 . = ALIGN(PAGE_SIZE);
100 *(.text.page_aligned) 100 *(.text..page_aligned)
101#endif 101#endif
102 . = ALIGN(8); 102 . = ALIGN(8);
103 _stext = .; 103 _stext = .;
@@ -305,7 +305,7 @@ SECTIONS
305 . = ALIGN(PAGE_SIZE); 305 . = ALIGN(PAGE_SIZE);
306 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 306 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
307 __bss_start = .; 307 __bss_start = .;
308 *(.bss.page_aligned) 308 *(.bss..page_aligned)
309 *(.bss) 309 *(.bss)
310 . = ALIGN(4); 310 . = ALIGN(4);
311 __bss_stop = .; 311 __bss_stop = .;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac0827..5ac0bb465ed6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "x86.h" 35#include "x86.h"
36#include "tss.h"
36 37
37/* 38/*
38 * Opcode effective-address decode tables. 39 * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
50#define DstReg (2<<1) /* Register operand. */ 51#define DstReg (2<<1) /* Register operand. */
51#define DstMem (3<<1) /* Memory operand. */ 52#define DstMem (3<<1) /* Memory operand. */
52#define DstAcc (4<<1) /* Destination Accumulator */ 53#define DstAcc (4<<1) /* Destination Accumulator */
54#define DstDI (5<<1) /* Destination is in ES:(E)DI */
55#define DstMem64 (6<<1) /* 64bit memory operand */
53#define DstMask (7<<1) 56#define DstMask (7<<1)
54/* Source operand type. */ 57/* Source operand type. */
55#define SrcNone (0<<4) /* No source operand. */ 58#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
63#define SrcOne (7<<4) /* Implied '1' */ 66#define SrcOne (7<<4) /* Implied '1' */
64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 68#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
66#define SrcMask (0xf<<4) 70#define SrcMask (0xf<<4)
67/* Generic ModRM decode. */ 71/* Generic ModRM decode. */
68#define ModRM (1<<8) 72#define ModRM (1<<8)
@@ -85,6 +89,9 @@
85#define Src2ImmByte (2<<29) 89#define Src2ImmByte (2<<29)
86#define Src2One (3<<29) 90#define Src2One (3<<29)
87#define Src2Imm16 (4<<29) 91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
88#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
89 96
90enum { 97enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
147 0, 0, 0, 0, 154 0, 0, 0, 0,
148 /* 0x68 - 0x6F */ 155 /* 0x68 - 0x6F */
149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
150 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
151 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
152 /* 0x70 - 0x77 */ 159 /* 0x70 - 0x77 */
153 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
154 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
173 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
175 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
176 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
177 ByteOp | ImplicitOps | String, ImplicitOps | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
178 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
179 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
180 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
181 ByteOp | ImplicitOps | String, ImplicitOps | String, 188 ByteOp | DstDI | String, DstDI | String,
182 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
183 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
184 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
204 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 0xE0 - 0xE7 */ 212 /* 0xE0 - 0xE7 */
206 0, 0, 0, 0, 213 0, 0, 0, 0,
207 ByteOp | SrcImmUByte, SrcImmUByte, 214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
208 ByteOp | SrcImmUByte, SrcImmUByte, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
209 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
210 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
214 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
215 0, 0, 0, 0, 222 0, 0, 0, 0,
216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, 223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
343 [Group5*8] = 350 [Group5*8] =
344 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
345 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
347 [Group7*8] = 355 [Group7*8] =
348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
349 SrcNone | ModRM | DstMem | Mov, 0, 357 SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, 361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, 362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] = 363 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, 364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
357}; 365};
358 366
359static u32 group2_table[] = { 367static u32 group2_table[] = {
360 [Group7*8] = 368 [Group7*8] =
361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, 369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
362 SrcNone | ModRM | DstMem | Mov, 0, 370 SrcNone | ModRM | DstMem | Mov, 0,
363 SrcMem16 | ModRM | Mov, 0, 371 SrcMem16 | ModRM | Mov | Priv, 0,
364 [Group9*8] = 372 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0, 373 0, 0, 0, 0, 0, 0, 0, 0,
366}; 374};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
562#define insn_fetch(_type, _size, _eip) \ 570#define insn_fetch(_type, _size, _eip) \
563({ unsigned long _x; \ 571({ unsigned long _x; \
564 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ 572 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
565 if (rc != 0) \ 573 if (rc != X86EMUL_CONTINUE) \
566 goto done; \ 574 goto done; \
567 (_eip) += (_size); \ 575 (_eip) += (_size); \
568 (_type)_x; \ 576 (_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
638 646
639static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
640 struct x86_emulate_ops *ops, 648 struct x86_emulate_ops *ops,
641 unsigned long linear, u8 *dest) 649 unsigned long eip, u8 *dest)
642{ 650{
643 struct fetch_cache *fc = &ctxt->decode.fetch; 651 struct fetch_cache *fc = &ctxt->decode.fetch;
644 int rc; 652 int rc;
645 int size; 653 int size, cur_size;
646 654
647 if (linear < fc->start || linear >= fc->end) { 655 if (eip == fc->end) {
648 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 656 cur_size = fc->end - fc->start;
649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); 657 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
650 if (rc) 658 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
659 size, ctxt->vcpu, NULL);
660 if (rc != X86EMUL_CONTINUE)
651 return rc; 661 return rc;
652 fc->start = linear; 662 fc->end += size;
653 fc->end = linear + size;
654 } 663 }
655 *dest = fc->data[linear - fc->start]; 664 *dest = fc->data[eip - fc->start];
656 return 0; 665 return X86EMUL_CONTINUE;
657} 666}
658 667
659static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 668static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
660 struct x86_emulate_ops *ops, 669 struct x86_emulate_ops *ops,
661 unsigned long eip, void *dest, unsigned size) 670 unsigned long eip, void *dest, unsigned size)
662{ 671{
663 int rc = 0; 672 int rc;
664 673
665 /* x86 instructions are limited to 15 bytes. */ 674 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15) 675 if (eip + size - ctxt->eip > 15)
667 return X86EMUL_UNHANDLEABLE; 676 return X86EMUL_UNHANDLEABLE;
668 eip += ctxt->cs_base;
669 while (size--) { 677 while (size--) {
670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 678 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
671 if (rc) 679 if (rc != X86EMUL_CONTINUE)
672 return rc; 680 return rc;
673 } 681 }
674 return 0; 682 return X86EMUL_CONTINUE;
675} 683}
676 684
677/* 685/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
702 *address = 0; 710 *address = 0;
703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 711 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
704 ctxt->vcpu, NULL); 712 ctxt->vcpu, NULL);
705 if (rc) 713 if (rc != X86EMUL_CONTINUE)
706 return rc; 714 return rc;
707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 715 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
708 ctxt->vcpu, NULL); 716 ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
782 struct decode_cache *c = &ctxt->decode; 790 struct decode_cache *c = &ctxt->decode;
783 u8 sib; 791 u8 sib;
784 int index_reg = 0, base_reg = 0, scale; 792 int index_reg = 0, base_reg = 0, scale;
785 int rc = 0; 793 int rc = X86EMUL_CONTINUE;
786 794
787 if (c->rex_prefix) { 795 if (c->rex_prefix) {
788 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 796 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
895 struct x86_emulate_ops *ops) 903 struct x86_emulate_ops *ops)
896{ 904{
897 struct decode_cache *c = &ctxt->decode; 905 struct decode_cache *c = &ctxt->decode;
898 int rc = 0; 906 int rc = X86EMUL_CONTINUE;
899 907
900 switch (c->ad_bytes) { 908 switch (c->ad_bytes) {
901 case 2: 909 case 2:
@@ -916,14 +924,18 @@ int
916x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 924x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
917{ 925{
918 struct decode_cache *c = &ctxt->decode; 926 struct decode_cache *c = &ctxt->decode;
919 int rc = 0; 927 int rc = X86EMUL_CONTINUE;
920 int mode = ctxt->mode; 928 int mode = ctxt->mode;
921 int def_op_bytes, def_ad_bytes, group; 929 int def_op_bytes, def_ad_bytes, group;
922 930
923 /* Shadow copy of register state. Committed on successful emulation. */
924 931
932 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart);
934
935 /* Shadow copy of register state. Committed on successful emulation. */
925 memset(c, 0, sizeof(struct decode_cache)); 936 memset(c, 0, sizeof(struct decode_cache));
926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); 937 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip;
927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
929 941
@@ -1015,11 +1027,6 @@ done_prefixes:
1015 } 1027 }
1016 } 1028 }
1017 1029
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
1023 if (c->d & Group) { 1030 if (c->d & Group) {
1024 group = c->d & GroupMask; 1031 group = c->d & GroupMask;
1025 c->modrm = insn_fetch(u8, 1, c->eip); 1032 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
1046 rc = decode_modrm(ctxt, ops); 1053 rc = decode_modrm(ctxt, ops);
1047 else if (c->d & MemAbs) 1054 else if (c->d & MemAbs)
1048 rc = decode_abs(ctxt, ops); 1055 rc = decode_abs(ctxt, ops);
1049 if (rc) 1056 if (rc != X86EMUL_CONTINUE)
1050 goto done; 1057 goto done;
1051 1058
1052 if (!c->has_seg_override) 1059 if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
1057 1064
1058 if (c->ad_bytes != 8) 1065 if (c->ad_bytes != 8)
1059 c->modrm_ea = (u32)c->modrm_ea; 1066 c->modrm_ea = (u32)c->modrm_ea;
1067
1068 if (c->rip_relative)
1069 c->modrm_ea += c->eip;
1070
1060 /* 1071 /*
1061 * Decode and fetch the source operand: register, memory 1072 * Decode and fetch the source operand: register, memory
1062 * or immediate. 1073 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
1091 break; 1102 break;
1092 } 1103 }
1093 c->src.type = OP_MEM; 1104 c->src.type = OP_MEM;
1105 c->src.ptr = (unsigned long *)c->modrm_ea;
1106 c->src.val = 0;
1094 break; 1107 break;
1095 case SrcImm: 1108 case SrcImm:
1096 case SrcImmU: 1109 case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
1139 c->src.bytes = 1; 1152 c->src.bytes = 1;
1140 c->src.val = 1; 1153 c->src.val = 1;
1141 break; 1154 break;
1155 case SrcSI:
1156 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c),
1160 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0;
1162 break;
1142 } 1163 }
1143 1164
1144 /* 1165 /*
@@ -1168,6 +1189,12 @@ done_prefixes:
1168 c->src2.bytes = 1; 1189 c->src2.bytes = 1;
1169 c->src2.val = 1; 1190 c->src2.val = 1;
1170 break; 1191 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1171 } 1198 }
1172 1199
1173 /* Decode and fetch the destination operand: register or memory. */ 1200 /* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
1180 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 1207 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1181 break; 1208 break;
1182 case DstMem: 1209 case DstMem:
1210 case DstMem64:
1183 if ((c->d & ModRM) && c->modrm_mod == 3) { 1211 if ((c->d & ModRM) && c->modrm_mod == 3) {
1184 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1212 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1185 c->dst.type = OP_REG; 1213 c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
1188 break; 1216 break;
1189 } 1217 }
1190 c->dst.type = OP_MEM; 1218 c->dst.type = OP_MEM;
1219 c->dst.ptr = (unsigned long *)c->modrm_ea;
1220 if ((c->d & DstMask) == DstMem64)
1221 c->dst.bytes = 8;
1222 else
1223 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1224 c->dst.val = 0;
1225 if (c->d & BitOp) {
1226 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1227
1228 c->dst.ptr = (void *)c->dst.ptr +
1229 (c->src.val & mask) / 8;
1230 }
1191 break; 1231 break;
1192 case DstAcc: 1232 case DstAcc:
1193 c->dst.type = OP_REG; 1233 c->dst.type = OP_REG;
1194 c->dst.bytes = c->op_bytes; 1234 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1195 c->dst.ptr = &c->regs[VCPU_REGS_RAX]; 1235 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1196 switch (c->op_bytes) { 1236 switch (c->dst.bytes) {
1197 case 1: 1237 case 1:
1198 c->dst.val = *(u8 *)c->dst.ptr; 1238 c->dst.val = *(u8 *)c->dst.ptr;
1199 break; 1239 break;
@@ -1203,18 +1243,248 @@ done_prefixes:
1203 case 4: 1243 case 4:
1204 c->dst.val = *(u32 *)c->dst.ptr; 1244 c->dst.val = *(u32 *)c->dst.ptr;
1205 break; 1245 break;
1246 case 8:
1247 c->dst.val = *(u64 *)c->dst.ptr;
1248 break;
1206 } 1249 }
1207 c->dst.orig_val = c->dst.val; 1250 c->dst.orig_val = c->dst.val;
1208 break; 1251 break;
1252 case DstDI:
1253 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt),
1257 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0;
1259 break;
1209 } 1260 }
1210 1261
1211 if (c->rip_relative)
1212 c->modrm_ea += c->eip;
1213
1214done: 1262done:
1215 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1216} 1264}
1217 1265
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port,
1269 void *dest)
1270{
1271 struct read_cache *rc = &ctxt->decode.io_read;
1272
1273 if (rc->pos == rc->end) { /* refill pio read ahead */
1274 struct decode_cache *c = &ctxt->decode;
1275 unsigned int in_page, n;
1276 unsigned int count = c->rep_prefix ?
1277 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
1278 in_page = (ctxt->eflags & EFLG_DF) ?
1279 offset_in_page(c->regs[VCPU_REGS_RDI]) :
1280 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
1281 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1282 count);
1283 if (n == 0)
1284 n = 1;
1285 rc->pos = rc->end = 0;
1286 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
1287 return 0;
1288 rc->end = n * size;
1289 }
1290
1291 memcpy(dest, rc->data + rc->pos, size);
1292 rc->pos += size;
1293 return 1;
1294}
1295
1296static u32 desc_limit_scaled(struct desc_struct *desc)
1297{
1298 u32 limit = get_desc_limit(desc);
1299
1300 return desc->g ? (limit << 12) | 0xfff : limit;
1301}
1302
1303static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1304 struct x86_emulate_ops *ops,
1305 u16 selector, struct desc_ptr *dt)
1306{
1307 if (selector & 1 << 2) {
1308 struct desc_struct desc;
1309 memset (dt, 0, sizeof *dt);
1310 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
1311 return;
1312
1313 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1314 dt->address = get_desc_base(&desc);
1315 } else
1316 ops->get_gdt(dt, ctxt->vcpu);
1317}
1318
1319/* allowed just for 8 bytes segments */
1320static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1321 struct x86_emulate_ops *ops,
1322 u16 selector, struct desc_struct *desc)
1323{
1324 struct desc_ptr dt;
1325 u16 index = selector >> 3;
1326 int ret;
1327 u32 err;
1328 ulong addr;
1329
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331
1332 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT;
1335 }
1336 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1340
1341 return ret;
1342}
1343
1344/* allowed just for 8 bytes segments */
1345static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1346 struct x86_emulate_ops *ops,
1347 u16 selector, struct desc_struct *desc)
1348{
1349 struct desc_ptr dt;
1350 u16 index = selector >> 3;
1351 u32 err;
1352 ulong addr;
1353 int ret;
1354
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356
1357 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT;
1360 }
1361
1362 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1366
1367 return ret;
1368}
1369
1370static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1371 struct x86_emulate_ops *ops,
1372 u16 selector, int seg)
1373{
1374 struct desc_struct seg_desc;
1375 u8 dpl, rpl, cpl;
1376 unsigned err_vec = GP_VECTOR;
1377 u32 err_code = 0;
1378 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1379 int ret;
1380
1381 memset(&seg_desc, 0, sizeof seg_desc);
1382
1383 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1384 || ctxt->mode == X86EMUL_MODE_REAL) {
1385 /* set real mode segment descriptor */
1386 set_desc_base(&seg_desc, selector << 4);
1387 set_desc_limit(&seg_desc, 0xffff);
1388 seg_desc.type = 3;
1389 seg_desc.p = 1;
1390 seg_desc.s = 1;
1391 goto load;
1392 }
1393
1394 /* NULL selector is not valid for TR, CS and SS */
1395 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
1396 && null_selector)
1397 goto exception;
1398
1399 /* TR should be in GDT only */
1400 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
1401 goto exception;
1402
1403 if (null_selector) /* for NULL selector skip all following checks */
1404 goto load;
1405
1406 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
1407 if (ret != X86EMUL_CONTINUE)
1408 return ret;
1409
1410 err_code = selector & 0xfffc;
1411 err_vec = GP_VECTOR;
1412
1413 /* can't load system descriptor into segment selector */
1414 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1415 goto exception;
1416
1417 if (!seg_desc.p) {
1418 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
1419 goto exception;
1420 }
1421
1422 rpl = selector & 3;
1423 dpl = seg_desc.dpl;
1424 cpl = ops->cpl(ctxt->vcpu);
1425
1426 switch (seg) {
1427 case VCPU_SREG_SS:
1428 /*
1429 * segment is not a writable data segment or segment
1430 * selector's RPL != CPL or descriptor's DPL != CPL
1431 */
1432 if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
1433 goto exception;
1434 break;
1435 case VCPU_SREG_CS:
1436 if (!(seg_desc.type & 8))
1437 goto exception;
1438
1439 if (seg_desc.type & 4) {
1440 /* conforming */
1441 if (dpl > cpl)
1442 goto exception;
1443 } else {
1444 /* nonconforming */
1445 if (rpl > cpl || dpl != cpl)
1446 goto exception;
1447 }
1448 /* CS(RPL) <- CPL */
1449 selector = (selector & 0xfffc) | cpl;
1450 break;
1451 case VCPU_SREG_TR:
1452 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1453 goto exception;
1454 break;
1455 case VCPU_SREG_LDTR:
1456 if (seg_desc.s || seg_desc.type != 2)
1457 goto exception;
1458 break;
1459 default: /* DS, ES, FS, or GS */
1460 /*
1461 * segment is not a data or readable code segment or
1462 * ((segment is a data or nonconforming code segment)
1463 * and (both RPL and CPL > DPL))
1464 */
1465 if ((seg_desc.type & 0xa) == 0x8 ||
1466 (((seg_desc.type & 0xc) != 0xc) &&
1467 (rpl > dpl && cpl > dpl)))
1468 goto exception;
1469 break;
1470 }
1471
1472 if (seg_desc.s) {
1473 /* mark segment as accessed */
1474 seg_desc.type |= 1;
1475 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
1476 if (ret != X86EMUL_CONTINUE)
1477 return ret;
1478 }
1479load:
1480 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE;
1483exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
1485 return X86EMUL_PROPAGATE_FAULT;
1486}
1487
1218static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1219{ 1489{
1220 struct decode_cache *c = &ctxt->decode; 1490 struct decode_cache *c = &ctxt->decode;
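
A side note on the selector arithmetic used by the new descriptor helpers above: read_segment_descriptor() takes selector >> 3 as the table index, get_descriptor_table_ptr() tests bit 2 to choose the LDT over the GDT, and load_segment_descriptor() reads the low two bits as the RPL. A small stand-alone sketch (plain user-space C; the names and the sample selector are illustrative, not part of the patch):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t sel = 0x002b;			/* sample selector */
		uint16_t index = sel >> 3;		/* descriptor index: 5 */
		bool uses_ldt = sel & (1 << 2);		/* table indicator: 0 -> GDT */
		uint8_t rpl = sel & 3;			/* requested privilege level: 3 */

		printf("index=%u ldt=%d rpl=%u\n",
		       (unsigned)index, (int)uses_ldt, (unsigned)rpl);
		return 0;
	}
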
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1251 int rc; 1521 int rc;
1252 unsigned long val, change_mask; 1522 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1523 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); 1524 int cpl = ops->cpl(ctxt->vcpu);
1255 1525
1256 rc = emulate_pop(ctxt, ops, &val, len); 1526 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE) 1527 if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1306 int rc; 1576 int rc;
1307 1577
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1578 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0) 1579 if (rc != X86EMUL_CONTINUE)
1310 return rc; 1580 return rc;
1311 1581
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); 1582 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
1313 return rc; 1583 return rc;
1314} 1584}
1315 1585
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops) 1602 struct x86_emulate_ops *ops)
1333{ 1603{
1334 struct decode_cache *c = &ctxt->decode; 1604 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0; 1605 int rc = X86EMUL_CONTINUE;
1336 int reg = VCPU_REGS_RDI; 1606 int reg = VCPU_REGS_RDI;
1337 1607
1338 while (reg >= VCPU_REGS_RAX) { 1608 while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1343 } 1613 }
1344 1614
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1615 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0) 1616 if (rc != X86EMUL_CONTINUE)
1347 break; 1617 break;
1348 --reg; 1618 --reg;
1349 } 1619 }
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops) 1624 struct x86_emulate_ops *ops)
1355{ 1625{
1356 struct decode_cache *c = &ctxt->decode; 1626 struct decode_cache *c = &ctxt->decode;
1357 int rc;
1358 1627
1359 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1628 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1360 if (rc != 0)
1361 return rc;
1362 return 0;
1363} 1629}
1364 1630
1365static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1631static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1395 struct x86_emulate_ops *ops) 1661 struct x86_emulate_ops *ops)
1396{ 1662{
1397 struct decode_cache *c = &ctxt->decode; 1663 struct decode_cache *c = &ctxt->decode;
1398 int rc = 0;
1399 1664
1400 switch (c->modrm_reg) { 1665 switch (c->modrm_reg) {
1401 case 0 ... 1: /* test */ 1666 case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1408 emulate_1op("neg", c->dst, ctxt->eflags); 1673 emulate_1op("neg", c->dst, ctxt->eflags);
1409 break; 1674 break;
1410 default: 1675 default:
1411 DPRINTF("Cannot emulate %02x\n", c->b); 1676 return 0;
1412 rc = X86EMUL_UNHANDLEABLE;
1413 break;
1414 } 1677 }
1415 return rc; 1678 return 1;
1416} 1679}
1417 1680
1418static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1681static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1442 emulate_push(ctxt); 1705 emulate_push(ctxt);
1443 break; 1706 break;
1444 } 1707 }
1445 return 0; 1708 return X86EMUL_CONTINUE;
1446} 1709}
1447 1710
1448static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1711static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1449 struct x86_emulate_ops *ops, 1712 struct x86_emulate_ops *ops)
1450 unsigned long memop)
1451{ 1713{
1452 struct decode_cache *c = &ctxt->decode; 1714 struct decode_cache *c = &ctxt->decode;
1453 u64 old, new; 1715 u64 old = c->dst.orig_val;
1454 int rc;
1455
1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1457 if (rc != X86EMUL_CONTINUE)
1458 return rc;
1459 1716
1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1717 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1461 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1718 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1463 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1720 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1464 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1721 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1465 ctxt->eflags &= ~EFLG_ZF; 1722 ctxt->eflags &= ~EFLG_ZF;
1466
1467 } else { 1723 } else {
1468 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1724 c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1469 (u32) c->regs[VCPU_REGS_RBX]; 1725 (u32) c->regs[VCPU_REGS_RBX];
1470 1726
1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1472 if (rc != X86EMUL_CONTINUE)
1473 return rc;
1474 ctxt->eflags |= EFLG_ZF; 1727 ctxt->eflags |= EFLG_ZF;
1475 } 1728 }
1476 return 0; 1729 return X86EMUL_CONTINUE;
1477} 1730}
1478 1731
1479static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1732static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1484 unsigned long cs; 1737 unsigned long cs;
1485 1738
1486 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1739 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1487 if (rc) 1740 if (rc != X86EMUL_CONTINUE)
1488 return rc; 1741 return rc;
1489 if (c->op_bytes == 4) 1742 if (c->op_bytes == 4)
1490 c->eip = (u32)c->eip; 1743 c->eip = (u32)c->eip;
1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1744 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1492 if (rc) 1745 if (rc != X86EMUL_CONTINUE)
1493 return rc; 1746 return rc;
1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); 1747 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1495 return rc; 1748 return rc;
1496} 1749}
1497 1750
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1544 default: 1797 default:
1545 break; 1798 break;
1546 } 1799 }
1547 return 0; 1800 return X86EMUL_CONTINUE;
1548} 1801}
1549 1802
1550static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1598 u64 msr_data; 1851 u64 msr_data;
1599 1852
1600 /* syscall is not available in real mode */ 1853 /* syscall is not available in real mode */
1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) 1854 if (ctxt->mode == X86EMUL_MODE_REAL ||
1602 return X86EMUL_UNHANDLEABLE; 1855 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1857 return X86EMUL_PROPAGATE_FAULT;
1858 }
1603 1859
1604 setup_syscalls_segments(ctxt, &cs, &ss); 1860 setup_syscalls_segments(ctxt, &cs, &ss);
1605 1861
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1649 /* inject #GP if in real mode */ 1905 /* inject #GP if in real mode */
1650 if (ctxt->mode == X86EMUL_MODE_REAL) { 1906 if (ctxt->mode == X86EMUL_MODE_REAL) {
1651 kvm_inject_gp(ctxt->vcpu, 0); 1907 kvm_inject_gp(ctxt->vcpu, 0);
1652 return X86EMUL_UNHANDLEABLE; 1908 return X86EMUL_PROPAGATE_FAULT;
1653 } 1909 }
1654 1910
1655 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1911 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1656 * Therefore, we inject an #UD. 1912 * Therefore, we inject an #UD.
1657 */ 1913 */
1658 if (ctxt->mode == X86EMUL_MODE_PROT64) 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1659 return X86EMUL_UNHANDLEABLE; 1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1916 return X86EMUL_PROPAGATE_FAULT;
1917 }
1660 1918
1661 setup_syscalls_segments(ctxt, &cs, &ss); 1919 setup_syscalls_segments(ctxt, &cs, &ss);
1662 1920
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1711 if (ctxt->mode == X86EMUL_MODE_REAL || 1969 if (ctxt->mode == X86EMUL_MODE_REAL ||
1712 ctxt->mode == X86EMUL_MODE_VM86) { 1970 ctxt->mode == X86EMUL_MODE_VM86) {
1713 kvm_inject_gp(ctxt->vcpu, 0); 1971 kvm_inject_gp(ctxt->vcpu, 0);
1714 return X86EMUL_UNHANDLEABLE; 1972 return X86EMUL_PROPAGATE_FAULT;
1715 } 1973 }
1716 1974
1717 setup_syscalls_segments(ctxt, &cs, &ss); 1975 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1756 return X86EMUL_CONTINUE; 2014 return X86EMUL_CONTINUE;
1757} 2015}
1758 2016
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) 2017static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2018 struct x86_emulate_ops *ops)
1760{ 2019{
1761 int iopl; 2020 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1764 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true; 2024 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; 2026 return ops->cpl(ctxt->vcpu) > iopl;
1768} 2027}
1769 2028
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops, 2060 struct x86_emulate_ops *ops,
1802 u16 port, u16 len) 2061 u16 port, u16 len)
1803{ 2062{
1804 if (emulator_bad_iopl(ctxt)) 2063 if (emulator_bad_iopl(ctxt, ops))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false; 2065 return false;
1807 return true; 2066 return true;
1808} 2067}
1809 2068
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss)
2083{
2084 struct decode_cache *c = &ctxt->decode;
2085
2086 tss->ip = c->eip;
2087 tss->flag = ctxt->eflags;
2088 tss->ax = c->regs[VCPU_REGS_RAX];
2089 tss->cx = c->regs[VCPU_REGS_RCX];
2090 tss->dx = c->regs[VCPU_REGS_RDX];
2091 tss->bx = c->regs[VCPU_REGS_RBX];
2092 tss->sp = c->regs[VCPU_REGS_RSP];
2093 tss->bp = c->regs[VCPU_REGS_RBP];
2094 tss->si = c->regs[VCPU_REGS_RSI];
2095 tss->di = c->regs[VCPU_REGS_RDI];
2096
2097 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2098 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2099 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2100 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2101 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2102}
2103
2104static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2105 struct x86_emulate_ops *ops,
2106 struct tss_segment_16 *tss)
2107{
2108 struct decode_cache *c = &ctxt->decode;
2109 int ret;
2110
2111 c->eip = tss->ip;
2112 ctxt->eflags = tss->flag | 2;
2113 c->regs[VCPU_REGS_RAX] = tss->ax;
2114 c->regs[VCPU_REGS_RCX] = tss->cx;
2115 c->regs[VCPU_REGS_RDX] = tss->dx;
2116 c->regs[VCPU_REGS_RBX] = tss->bx;
2117 c->regs[VCPU_REGS_RSP] = tss->sp;
2118 c->regs[VCPU_REGS_RBP] = tss->bp;
2119 c->regs[VCPU_REGS_RSI] = tss->si;
2120 c->regs[VCPU_REGS_RDI] = tss->di;
2121
2122 /*
2123 * SDM says that segment selectors are loaded before segment
2124 * descriptors
2125 */
2126 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
2127 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2128 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2129 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2130 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2131
2132 /*
2133 * Now load segment descriptors. If a fault happens at this stage
2134 * it is handled in the context of the new task
2135 */
2136 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
2137 if (ret != X86EMUL_CONTINUE)
2138 return ret;
2139 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2140 if (ret != X86EMUL_CONTINUE)
2141 return ret;
2142 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2143 if (ret != X86EMUL_CONTINUE)
2144 return ret;
2145 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2146 if (ret != X86EMUL_CONTINUE)
2147 return ret;
2148 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2149 if (ret != X86EMUL_CONTINUE)
2150 return ret;
2151
2152 return X86EMUL_CONTINUE;
2153}
2154
2155static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2156 struct x86_emulate_ops *ops,
2157 u16 tss_selector, u16 old_tss_sel,
2158 ulong old_tss_base, struct desc_struct *new_desc)
2159{
2160 struct tss_segment_16 tss_seg;
2161 int ret;
2162 u32 err, new_tss_base = get_desc_base(new_desc);
2163
2164 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2165 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2169 return ret;
2170 }
2171
2172 save_state_to_tss16(ctxt, ops, &tss_seg);
2173
2174 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2175 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2179 return ret;
2180 }
2181
2182 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2183 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2187 return ret;
2188 }
2189
2190 if (old_tss_sel != 0xffff) {
2191 tss_seg.prev_task_link = old_tss_sel;
2192
2193 ret = ops->write_std(new_tss_base,
2194 &tss_seg.prev_task_link,
2195 sizeof tss_seg.prev_task_link,
2196 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2200 return ret;
2201 }
2202 }
2203
2204 return load_state_from_tss16(ctxt, ops, &tss_seg);
2205}
2206
2207static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2208 struct x86_emulate_ops *ops,
2209 struct tss_segment_32 *tss)
2210{
2211 struct decode_cache *c = &ctxt->decode;
2212
2213 tss->cr3 = ops->get_cr(3, ctxt->vcpu);
2214 tss->eip = c->eip;
2215 tss->eflags = ctxt->eflags;
2216 tss->eax = c->regs[VCPU_REGS_RAX];
2217 tss->ecx = c->regs[VCPU_REGS_RCX];
2218 tss->edx = c->regs[VCPU_REGS_RDX];
2219 tss->ebx = c->regs[VCPU_REGS_RBX];
2220 tss->esp = c->regs[VCPU_REGS_RSP];
2221 tss->ebp = c->regs[VCPU_REGS_RBP];
2222 tss->esi = c->regs[VCPU_REGS_RSI];
2223 tss->edi = c->regs[VCPU_REGS_RDI];
2224
2225 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2226 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2227 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2228 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2229 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
2230 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
2231 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2232}
2233
2234static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2235 struct x86_emulate_ops *ops,
2236 struct tss_segment_32 *tss)
2237{
2238 struct decode_cache *c = &ctxt->decode;
2239 int ret;
2240
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu);
2242 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax;
2245 c->regs[VCPU_REGS_RCX] = tss->ecx;
2246 c->regs[VCPU_REGS_RDX] = tss->edx;
2247 c->regs[VCPU_REGS_RBX] = tss->ebx;
2248 c->regs[VCPU_REGS_RSP] = tss->esp;
2249 c->regs[VCPU_REGS_RBP] = tss->ebp;
2250 c->regs[VCPU_REGS_RSI] = tss->esi;
2251 c->regs[VCPU_REGS_RDI] = tss->edi;
2252
2253 /*
2254 * SDM says that segment selectors are loaded before segment
2255 * descriptors
2256 */
2257 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
2258 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2259 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2260 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2261 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2262 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
2263 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
2264
2265 /*
2266 * Now load segment descriptors. If a fault happens at this stage
2267 * it is handled in the context of the new task
2268 */
2269 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
2270 if (ret != X86EMUL_CONTINUE)
2271 return ret;
2272 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2273 if (ret != X86EMUL_CONTINUE)
2274 return ret;
2275 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2276 if (ret != X86EMUL_CONTINUE)
2277 return ret;
2278 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2279 if (ret != X86EMUL_CONTINUE)
2280 return ret;
2281 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2282 if (ret != X86EMUL_CONTINUE)
2283 return ret;
2284 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
2285 if (ret != X86EMUL_CONTINUE)
2286 return ret;
2287 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
2288 if (ret != X86EMUL_CONTINUE)
2289 return ret;
2290
2291 return X86EMUL_CONTINUE;
2292}
2293
2294static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2295 struct x86_emulate_ops *ops,
2296 u16 tss_selector, u16 old_tss_sel,
2297 ulong old_tss_base, struct desc_struct *new_desc)
2298{
2299 struct tss_segment_32 tss_seg;
2300 int ret;
2301 u32 err, new_tss_base = get_desc_base(new_desc);
2302
2303 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2304 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2308 return ret;
2309 }
2310
2311 save_state_to_tss32(ctxt, ops, &tss_seg);
2312
2313 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2314 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2318 return ret;
2319 }
2320
2321 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2322 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2326 return ret;
2327 }
2328
2329 if (old_tss_sel != 0xffff) {
2330 tss_seg.prev_task_link = old_tss_sel;
2331
2332 ret = ops->write_std(new_tss_base,
2333 &tss_seg.prev_task_link,
2334 sizeof tss_seg.prev_task_link,
2335 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2339 return ret;
2340 }
2341 }
2342
2343 return load_state_from_tss32(ctxt, ops, &tss_seg);
2344}
2345
2346static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2347 struct x86_emulate_ops *ops,
2348 u16 tss_selector, int reason,
2349 bool has_error_code, u32 error_code)
2350{
2351 struct desc_struct curr_tss_desc, next_tss_desc;
2352 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
2356 u32 desc_limit;
2357
2358 /* FIXME: old_tss_base == ~0 ? */
2359
2360 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
2361 if (ret != X86EMUL_CONTINUE)
2362 return ret;
2363 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
2364 if (ret != X86EMUL_CONTINUE)
2365 return ret;
2366
2367 /* FIXME: check that next_tss_desc is tss */
2368
2369 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0);
2373 return X86EMUL_PROPAGATE_FAULT;
2374 }
2375 }
2376
2377 desc_limit = desc_limit_scaled(&next_tss_desc);
2378 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT;
2384 }
2385
2386 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2387 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2388 write_segment_descriptor(ctxt, ops, old_tss_sel,
2389 &curr_tss_desc);
2390 }
2391
2392 if (reason == TASK_SWITCH_IRET)
2393 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2394
2395 /* set back link to prev task only if NT bit is set in eflags
2396 note that old_tss_sel is not used after this point */
2397 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2398 old_tss_sel = 0xffff;
2399
2400 if (next_tss_desc.type & 8)
2401 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
2402 old_tss_base, &next_tss_desc);
2403 else
2404 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
2405 old_tss_base, &next_tss_desc);
2406 if (ret != X86EMUL_CONTINUE)
2407 return ret;
2408
2409 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
2410 ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
2411
2412 if (reason != TASK_SWITCH_IRET) {
2413 next_tss_desc.type |= (1 << 1); /* set busy flag */
2414 write_segment_descriptor(ctxt, ops, tss_selector,
2415 &next_tss_desc);
2416 }
2417
2418 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2419 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
2420 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2421
2422 if (has_error_code) {
2423 struct decode_cache *c = &ctxt->decode;
2424
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt);
2429 }
2430
2431 return ret;
2432}
2433
2434int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2435 struct x86_emulate_ops *ops,
2436 u16 tss_selector, int reason,
2437 bool has_error_code, u32 error_code)
2438{
2439 struct decode_cache *c = &ctxt->decode;
2440 int rc;
2441
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE;
2446
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code);
2449
2450 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops);
2454 }
2455
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2457}
2458
2459static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2460 int reg, struct operand *op)
2461{
2462 struct decode_cache *c = &ctxt->decode;
2463 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2464
2465 register_address_increment(c, &c->regs[reg], df * op->bytes);
2466 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
2467}
2468
1810int 2469int
1811x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2470x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1812{ 2471{
1813 unsigned long memop = 0;
1814 u64 msr_data; 2472 u64 msr_data;
1815 unsigned long saved_eip = 0;
1816 struct decode_cache *c = &ctxt->decode; 2473 struct decode_cache *c = &ctxt->decode;
1817 unsigned int port; 2474 int rc = X86EMUL_CONTINUE;
1818 int io_dir_in; 2475 int saved_dst_type = c->dst.type;
1819 int rc = 0;
1820 2476
1821 ctxt->interruptibility = 0; 2477 ctxt->interruptibility = 0;
1822 2478
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1826 */ 2482 */
1827 2483
1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1829 saved_eip = c->eip; 2485
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2488 goto done;
2489 }
1830 2490
1831 /* LOCK prefix is allowed only with some instructions */ 2491 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) { 2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done; 2494 goto done;
1835 } 2495 }
1836 2496
1837 /* Privileged instruction can be executed only in CPL=0 */ 2497 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { 2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0); 2499 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done; 2500 goto done;
1841 } 2501 }
1842 2502
1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1844 memop = c->modrm_ea;
1845
1846 if (c->rep_prefix && (c->d & String)) { 2503 if (c->rep_prefix && (c->d & String)) {
2504 ctxt->restart = true;
1847 /* All REP prefixes have the same first termination condition */ 2505 /* All REP prefixes have the same first termination condition */
1848 if (c->regs[VCPU_REGS_RCX] == 0) { 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done:
2508 ctxt->restart = false;
1849 kvm_rip_write(ctxt->vcpu, c->eip); 2509 kvm_rip_write(ctxt->vcpu, c->eip);
1850 goto done; 2510 goto done;
1851 } 2511 }
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1857 * - if REPNE/REPNZ and ZF = 1 then done 2517 * - if REPNE/REPNZ and ZF = 1 then done
1858 */ 2518 */
1859 if ((c->b == 0xa6) || (c->b == 0xa7) || 2519 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1860 (c->b == 0xae) || (c->b == 0xaf)) { 2520 (c->b == 0xae) || (c->b == 0xaf)) {
1861 if ((c->rep_prefix == REPE_PREFIX) && 2521 if ((c->rep_prefix == REPE_PREFIX) &&
1862 ((ctxt->eflags & EFLG_ZF) == 0)) { 2522 ((ctxt->eflags & EFLG_ZF) == 0))
1863 kvm_rip_write(ctxt->vcpu, c->eip); 2523 goto string_done;
1864 goto done;
1865 }
1866 if ((c->rep_prefix == REPNE_PREFIX) && 2524 if ((c->rep_prefix == REPNE_PREFIX) &&
1867 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 2525 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
1868 kvm_rip_write(ctxt->vcpu, c->eip); 2526 goto string_done;
1869 goto done;
1870 }
1871 } 2527 }
1872 c->regs[VCPU_REGS_RCX]--; 2528 c->eip = ctxt->eip;
1873 c->eip = kvm_rip_read(ctxt->vcpu);
1874 } 2529 }
1875 2530
1876 if (c->src.type == OP_MEM) { 2531 if (c->src.type == OP_MEM) {
1877 c->src.ptr = (unsigned long *)memop;
1878 c->src.val = 0;
1879 rc = ops->read_emulated((unsigned long)c->src.ptr, 2532 rc = ops->read_emulated((unsigned long)c->src.ptr,
1880 &c->src.val, 2533 &c->src.val,
1881 c->src.bytes, 2534 c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1885 c->src.orig_val = c->src.val; 2538 c->src.orig_val = c->src.val;
1886 } 2539 }
1887 2540
2541 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr,
2543 &c->src2.val,
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE)
2547 goto done;
2548 }
2549
1888 if ((c->d & DstMask) == ImplicitOps) 2550 if ((c->d & DstMask) == ImplicitOps)
1889 goto special_insn; 2551 goto special_insn;
1890 2552
1891 2553
1892 if (c->dst.type == OP_MEM) { 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
1893 c->dst.ptr = (unsigned long *)memop; 2555 /* optimisation - avoid slow emulated read if Mov */
1894 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
1895 c->dst.val = 0; 2557 c->dst.bytes, ctxt->vcpu);
1896 if (c->d & BitOp) { 2558 if (rc != X86EMUL_CONTINUE)
1897 unsigned long mask = ~(c->dst.bytes * 8 - 1); 2559 goto done;
1898
1899 c->dst.ptr = (void *)c->dst.ptr +
1900 (c->src.val & mask) / 8;
1901 }
1902 if (!(c->d & Mov)) {
1903 /* optimisation - avoid slow emulated read */
1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1905 &c->dst.val,
1906 c->dst.bytes,
1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1911 } 2560 }
1912 c->dst.orig_val = c->dst.val; 2561 c->dst.orig_val = c->dst.val;
1913 2562
@@ -1926,7 +2575,7 @@ special_insn:
1926 break; 2575 break;
1927 case 0x07: /* pop es */ 2576 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0) 2578 if (rc != X86EMUL_CONTINUE)
1930 goto done; 2579 goto done;
1931 break; 2580 break;
1932 case 0x08 ... 0x0d: 2581 case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
1945 break; 2594 break;
1946 case 0x17: /* pop ss */ 2595 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0) 2597 if (rc != X86EMUL_CONTINUE)
1949 goto done; 2598 goto done;
1950 break; 2599 break;
1951 case 0x18 ... 0x1d: 2600 case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
1957 break; 2606 break;
1958 case 0x1f: /* pop ds */ 2607 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0) 2609 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2610 goto done;
1962 break; 2611 break;
1963 case 0x20 ... 0x25: 2612 case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
1988 case 0x58 ... 0x5f: /* pop reg */ 2637 case 0x58 ... 0x5f: /* pop reg */
1989 pop_instruction: 2638 pop_instruction:
1990 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 2639 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1991 if (rc != 0) 2640 if (rc != X86EMUL_CONTINUE)
1992 goto done; 2641 goto done;
1993 break; 2642 break;
1994 case 0x60: /* pusha */ 2643 case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
1996 break; 2645 break;
1997 case 0x61: /* popa */ 2646 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops); 2647 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0) 2648 if (rc != X86EMUL_CONTINUE)
2000 goto done; 2649 goto done;
2001 break; 2650 break;
2002 case 0x63: /* movsxd */ 2651 case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
2010 break; 2659 break;
2011 case 0x6c: /* insb */ 2660 case 0x6c: /* insb */
2012 case 0x6d: /* insw/insd */ 2661 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u);
2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2664 c->dst.bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0); 2665 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done; 2666 goto done;
2017 } 2667 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu, 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2019 1, 2669 c->regs[VCPU_REGS_RDX], &c->dst.val))
2020 (c->d & ByteOp) ? 1 : c->op_bytes, 2670 goto done; /* IO is needed, skip writeback */
2021 c->rep_prefix ? 2671 break;
2022 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
2023 (ctxt->eflags & EFLG_DF),
2024 register_address(c, es_base(ctxt),
2025 c->regs[VCPU_REGS_RDI]),
2026 c->rep_prefix,
2027 c->regs[VCPU_REGS_RDX]) == 0) {
2028 c->eip = saved_eip;
2029 return -1;
2030 }
2031 return 0;
2032 case 0x6e: /* outsb */ 2672 case 0x6e: /* outsb */
2033 case 0x6f: /* outsw/outsd */ 2673 case 0x6f: /* outsw/outsd */
2674 c->src.bytes = min(c->src.bytes, 4u);
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2676 c->src.bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0); 2677 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done; 2678 goto done;
2038 } 2679 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu, 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2040 0, 2681 &c->src.val, 1, ctxt->vcpu);
2041 (c->d & ByteOp) ? 1 : c->op_bytes, 2682
2042 c->rep_prefix ? 2683 c->dst.type = OP_NONE; /* nothing to writeback */
2043 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 2684 break;
2044 (ctxt->eflags & EFLG_DF),
2045 register_address(c,
2046 seg_override_base(ctxt, c),
2047 c->regs[VCPU_REGS_RSI]),
2048 c->rep_prefix,
2049 c->regs[VCPU_REGS_RDX]) == 0) {
2050 c->eip = saved_eip;
2051 return -1;
2052 }
2053 return 0;
2054 case 0x70 ... 0x7f: /* jcc (short) */ 2685 case 0x70 ... 0x7f: /* jcc (short) */
2055 if (test_cc(c->b, ctxt->eflags)) 2686 if (test_cc(c->b, ctxt->eflags))
2056 jmp_rel(c, c->src.val); 2687 jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
2107 case 0x8c: { /* mov r/m, sreg */ 2738 case 0x8c: { /* mov r/m, sreg */
2108 struct kvm_segment segreg; 2739 struct kvm_segment segreg;
2109 2740
2110 if (c->modrm_reg <= 5) 2741 if (c->modrm_reg <= VCPU_SREG_GS)
2111 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2112 else { 2743 else {
2113 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", 2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2114 c->modrm); 2745 goto done;
2115 goto cannot_emulate;
2116 } 2746 }
2117 c->dst.val = segreg.selector; 2747 c->dst.val = segreg.selector;
2118 break; 2748 break;
@@ -2132,16 +2762,16 @@ special_insn:
2132 } 2762 }
2133 2763
2134 if (c->modrm_reg == VCPU_SREG_SS) 2764 if (c->modrm_reg == VCPU_SREG_SS)
2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); 2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
2136 2766
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2138 2768
2139 c->dst.type = OP_NONE; /* Disable writeback. */ 2769 c->dst.type = OP_NONE; /* Disable writeback. */
2140 break; 2770 break;
2141 } 2771 }
2142 case 0x8f: /* pop (sole member of Grp1a) */ 2772 case 0x8f: /* pop (sole member of Grp1a) */
2143 rc = emulate_grp1a(ctxt, ops); 2773 rc = emulate_grp1a(ctxt, ops);
2144 if (rc != 0) 2774 if (rc != X86EMUL_CONTINUE)
2145 goto done; 2775 goto done;
2146 break; 2776 break;
2147 case 0x90: /* nop / xchg r8,rax */ 2777 case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
2175 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2176 break; 2806 break;
2177 case 0xa4 ... 0xa5: /* movs */ 2807 case 0xa4 ... 0xa5: /* movs */
2178 c->dst.type = OP_MEM; 2808 goto mov;
2179 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2180 c->dst.ptr = (unsigned long *)register_address(c,
2181 es_base(ctxt),
2182 c->regs[VCPU_REGS_RDI]);
2183 rc = ops->read_emulated(register_address(c,
2184 seg_override_base(ctxt, c),
2185 c->regs[VCPU_REGS_RSI]),
2186 &c->dst.val,
2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2189 goto done;
2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2192 : c->dst.bytes);
2193 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2194 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2195 : c->dst.bytes);
2196 break;
2197 case 0xa6 ... 0xa7: /* cmps */ 2809 case 0xa6 ... 0xa7: /* cmps */
2198 c->src.type = OP_NONE; /* Disable writeback. */
2199 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2200 c->src.ptr = (unsigned long *)register_address(c,
2201 seg_override_base(ctxt, c),
2202 c->regs[VCPU_REGS_RSI]);
2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2204 &c->src.val,
2205 c->src.bytes,
2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2208 goto done;
2209
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2810 c->dst.type = OP_NONE; /* Disable writeback. */
2211 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2212 c->dst.ptr = (unsigned long *)register_address(c,
2213 es_base(ctxt),
2214 c->regs[VCPU_REGS_RDI]);
2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2216 &c->dst.val,
2217 c->dst.bytes,
2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2220 goto done;
2221
2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2223 2812 goto cmp;
2224 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2225
2226 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2227 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
2228 : c->src.bytes);
2229 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2230 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2231 : c->dst.bytes);
2232
2233 break;
2234 case 0xaa ... 0xab: /* stos */ 2813 case 0xaa ... 0xab: /* stos */
2235 c->dst.type = OP_MEM;
2236 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2237 c->dst.ptr = (unsigned long *)register_address(c,
2238 es_base(ctxt),
2239 c->regs[VCPU_REGS_RDI]);
2240 c->dst.val = c->regs[VCPU_REGS_RAX]; 2814 c->dst.val = c->regs[VCPU_REGS_RAX];
2241 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2242 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2243 : c->dst.bytes);
2244 break; 2815 break;
2245 case 0xac ... 0xad: /* lods */ 2816 case 0xac ... 0xad: /* lods */
2246 c->dst.type = OP_REG; 2817 goto mov;
2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2249 rc = ops->read_emulated(register_address(c,
2250 seg_override_base(ctxt, c),
2251 c->regs[VCPU_REGS_RSI]),
2252 &c->dst.val,
2253 c->dst.bytes,
2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2256 goto done;
2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2259 : c->dst.bytes);
2260 break;
2261 case 0xae ... 0xaf: /* scas */ 2818 case 0xae ... 0xaf: /* scas */
2262 DPRINTF("Urk! I don't handle SCAS.\n"); 2819 DPRINTF("Urk! I don't handle SCAS.\n");
2263 goto cannot_emulate; 2820 goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
2277 break; 2834 break;
2278 case 0xcb: /* ret far */ 2835 case 0xcb: /* ret far */
2279 rc = emulate_ret_far(ctxt, ops); 2836 rc = emulate_ret_far(ctxt, ops);
2280 if (rc) 2837 if (rc != X86EMUL_CONTINUE)
2281 goto done; 2838 goto done;
2282 break; 2839 break;
2283 case 0xd0 ... 0xd1: /* Grp2 */ 2840 case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
2290 break; 2847 break;
2291 case 0xe4: /* inb */ 2848 case 0xe4: /* inb */
2292 case 0xe5: /* in */ 2849 case 0xe5: /* in */
2293 port = c->src.val; 2850 goto do_io_in;
2294 io_dir_in = 1;
2295 goto do_io;
2296 case 0xe6: /* outb */ 2851 case 0xe6: /* outb */
2297 case 0xe7: /* out */ 2852 case 0xe7: /* out */
2298 port = c->src.val; 2853 goto do_io_out;
2299 io_dir_in = 0;
2300 goto do_io;
2301 case 0xe8: /* call (near) */ { 2854 case 0xe8: /* call (near) */ {
2302 long int rel = c->src.val; 2855 long int rel = c->src.val;
2303 c->src.val = (unsigned long) c->eip; 2856 c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
2308 case 0xe9: /* jmp rel */ 2861 case 0xe9: /* jmp rel */
2309 goto jmp; 2862 goto jmp;
2310 case 0xea: /* jmp far */ 2863 case 0xea: /* jmp far */
2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 2864 jump_far:
2312 VCPU_SREG_CS)) 2865 if (load_segment_descriptor(ctxt, ops, c->src2.val,
2866 VCPU_SREG_CS))
2313 goto done; 2867 goto done;
2314 2868
2315 c->eip = c->src.val; 2869 c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
2321 break; 2875 break;
2322 case 0xec: /* in al,dx */ 2876 case 0xec: /* in al,dx */
2323 case 0xed: /* in (e/r)ax,dx */ 2877 case 0xed: /* in (e/r)ax,dx */
2324 port = c->regs[VCPU_REGS_RDX]; 2878 c->src.val = c->regs[VCPU_REGS_RDX];
2325 io_dir_in = 1; 2879 do_io_in:
2326 goto do_io; 2880 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0);
2883 goto done;
2884 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val))
2887 goto done; /* IO is needed */
2888 break;
2327 case 0xee: /* out al,dx */ 2889 case 0xee: /* out al,dx */
2328 case 0xef: /* out (e/r)ax,dx */ 2890 case 0xef: /* out (e/r)ax,dx */
2329 port = c->regs[VCPU_REGS_RDX]; 2891 c->src.val = c->regs[VCPU_REGS_RDX];
2330 io_dir_in = 0; 2892 do_io_out:
2331 do_io: 2893 c->dst.bytes = min(c->dst.bytes, 4u);
2332 if (!emulator_io_permited(ctxt, ops, port, 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0); 2895 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done; 2896 goto done;
2336 } 2897 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2338 (c->d & ByteOp) ? 1 : c->op_bytes, 2899 ctxt->vcpu);
2339 port) != 0) { 2900 c->dst.type = OP_NONE; /* Disable writeback. */
2340 c->eip = saved_eip;
2341 goto cannot_emulate;
2342 }
2343 break; 2901 break;
2344 case 0xf4: /* hlt */ 2902 case 0xf4: /* hlt */
2345 ctxt->vcpu->arch.halt_request = 1; 2903 ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
2350 c->dst.type = OP_NONE; /* Disable writeback. */ 2908 c->dst.type = OP_NONE; /* Disable writeback. */
2351 break; 2909 break;
2352 case 0xf6 ... 0xf7: /* Grp3 */ 2910 case 0xf6 ... 0xf7: /* Grp3 */
2353 rc = emulate_grp3(ctxt, ops); 2911 if (!emulate_grp3(ctxt, ops))
2354 if (rc != 0) 2912 goto cannot_emulate;
2355 goto done;
2356 break; 2913 break;
2357 case 0xf8: /* clc */ 2914 case 0xf8: /* clc */
2358 ctxt->eflags &= ~EFLG_CF; 2915 ctxt->eflags &= ~EFLG_CF;
2359 c->dst.type = OP_NONE; /* Disable writeback. */ 2916 c->dst.type = OP_NONE; /* Disable writeback. */
2360 break; 2917 break;
2361 case 0xfa: /* cli */ 2918 case 0xfa: /* cli */
2362 if (emulator_bad_iopl(ctxt)) 2919 if (emulator_bad_iopl(ctxt, ops))
2363 kvm_inject_gp(ctxt->vcpu, 0); 2920 kvm_inject_gp(ctxt->vcpu, 0);
2364 else { 2921 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF; 2922 ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
2367 } 2924 }
2368 break; 2925 break;
2369 case 0xfb: /* sti */ 2926 case 0xfb: /* sti */
2370 if (emulator_bad_iopl(ctxt)) 2927 if (emulator_bad_iopl(ctxt, ops))
2371 kvm_inject_gp(ctxt->vcpu, 0); 2928 kvm_inject_gp(ctxt->vcpu, 0);
2372 else { 2929 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF; 2931 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */ 2932 c->dst.type = OP_NONE; /* Disable writeback. */
2376 } 2933 }
@@ -2383,28 +2940,55 @@ special_insn:
2383 ctxt->eflags |= EFLG_DF; 2940 ctxt->eflags |= EFLG_DF;
2384 c->dst.type = OP_NONE; /* Disable writeback. */ 2941 c->dst.type = OP_NONE; /* Disable writeback. */
2385 break; 2942 break;
2386 case 0xfe ... 0xff: /* Grp4/Grp5 */ 2943 case 0xfe: /* Grp4 */
2944 grp45:
2387 rc = emulate_grp45(ctxt, ops); 2945 rc = emulate_grp45(ctxt, ops);
2388 if (rc != 0) 2946 if (rc != X86EMUL_CONTINUE)
2389 goto done; 2947 goto done;
2390 break; 2948 break;
2949 case 0xff: /* Grp5 */
2950 if (c->modrm_reg == 5)
2951 goto jump_far;
2952 goto grp45;
2391 } 2953 }
2392 2954
2393writeback: 2955writeback:
2394 rc = writeback(ctxt, ops); 2956 rc = writeback(ctxt, ops);
2395 if (rc != 0) 2957 if (rc != X86EMUL_CONTINUE)
2396 goto done; 2958 goto done;
2397 2959
2960 /*
2961 * restore dst type in case the decoding will be reused
2962 * (happens for string instructions)
2963 */
2964 c->dst.type = saved_dst_type;
2965
2966 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
2968 &c->src);
2969
2970 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
2972
2973 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read;
2975 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
2976 /*
2977 * Re-enter the guest when the pio read-ahead buffer is empty or,
2978 * if it is not used, after every 1024 iterations.
2979 */
2980 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
2981 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false;
2983 }
2984
2398 /* Commit shadow register state. */ 2985 /* Commit shadow register state. */
2399 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2400 kvm_rip_write(ctxt->vcpu, c->eip); 2987 kvm_rip_write(ctxt->vcpu, c->eip);
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags);
2401 2989
2402done: 2990done:
2403 if (rc == X86EMUL_UNHANDLEABLE) { 2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2404 c->eip = saved_eip;
2405 return -1;
2406 }
2407 return 0;
2408 2992
2409twobyte_insn: 2993twobyte_insn:
2410 switch (c->b) { 2994 switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
2418 goto cannot_emulate; 3002 goto cannot_emulate;
2419 3003
2420 rc = kvm_fix_hypercall(ctxt->vcpu); 3004 rc = kvm_fix_hypercall(ctxt->vcpu);
2421 if (rc) 3005 if (rc != X86EMUL_CONTINUE)
2422 goto done; 3006 goto done;
2423 3007
2424 /* Let the processor re-execute the fixed hypercall */ 3008 /* Let the processor re-execute the fixed hypercall */
2425 c->eip = kvm_rip_read(ctxt->vcpu); 3009 c->eip = ctxt->eip;
2426 /* Disable writeback. */ 3010 /* Disable writeback. */
2427 c->dst.type = OP_NONE; 3011 c->dst.type = OP_NONE;
2428 break; 3012 break;
2429 case 2: /* lgdt */ 3013 case 2: /* lgdt */
2430 rc = read_descriptor(ctxt, ops, c->src.ptr, 3014 rc = read_descriptor(ctxt, ops, c->src.ptr,
2431 &size, &address, c->op_bytes); 3015 &size, &address, c->op_bytes);
2432 if (rc) 3016 if (rc != X86EMUL_CONTINUE)
2433 goto done; 3017 goto done;
2434 realmode_lgdt(ctxt->vcpu, size, address); 3018 realmode_lgdt(ctxt->vcpu, size, address);
2435 /* Disable writeback. */ 3019 /* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
2440 switch (c->modrm_rm) { 3024 switch (c->modrm_rm) {
2441 case 1: 3025 case 1:
2442 rc = kvm_fix_hypercall(ctxt->vcpu); 3026 rc = kvm_fix_hypercall(ctxt->vcpu);
2443 if (rc) 3027 if (rc != X86EMUL_CONTINUE)
2444 goto done; 3028 goto done;
2445 break; 3029 break;
2446 default: 3030 default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
2450 rc = read_descriptor(ctxt, ops, c->src.ptr, 3034 rc = read_descriptor(ctxt, ops, c->src.ptr,
2451 &size, &address, 3035 &size, &address,
2452 c->op_bytes); 3036 c->op_bytes);
2453 if (rc) 3037 if (rc != X86EMUL_CONTINUE)
2454 goto done; 3038 goto done;
2455 realmode_lidt(ctxt->vcpu, size, address); 3039 realmode_lidt(ctxt->vcpu, size, address);
2456 } 3040 }
@@ -2459,15 +3043,18 @@ twobyte_insn:
2459 break; 3043 break;
2460 case 4: /* smsw */ 3044 case 4: /* smsw */
2461 c->dst.bytes = 2; 3045 c->dst.bytes = 2;
2462 c->dst.val = realmode_get_cr(ctxt->vcpu, 0); 3046 c->dst.val = ops->get_cr(0, ctxt->vcpu);
2463 break; 3047 break;
2464 case 6: /* lmsw */ 3048 case 6: /* lmsw */
2465 realmode_lmsw(ctxt->vcpu, (u16)c->src.val, 3049 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
2466 &ctxt->eflags); 3050 (c->src.val & 0x0f), ctxt->vcpu);
2467 c->dst.type = OP_NONE; 3051 c->dst.type = OP_NONE;
2468 break; 3052 break;
3053 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3055 goto done;
2469 case 7: /* invlpg*/ 3056 case 7: /* invlpg*/
2470 emulate_invlpg(ctxt->vcpu, memop); 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
2471 /* Disable writeback. */ 3058 /* Disable writeback. */
2472 c->dst.type = OP_NONE; 3059 c->dst.type = OP_NONE;
2473 break; 3060 break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
2493 c->dst.type = OP_NONE; 3080 c->dst.type = OP_NONE;
2494 break; 3081 break;
2495 case 0x20: /* mov cr, reg */ 3082 case 0x20: /* mov cr, reg */
2496 if (c->modrm_mod != 3) 3083 switch (c->modrm_reg) {
2497 goto cannot_emulate; 3084 case 1:
2498 c->regs[c->modrm_rm] = 3085 case 5 ... 7:
2499 realmode_get_cr(ctxt->vcpu, c->modrm_reg); 3086 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3088 goto done;
3089 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
2500 c->dst.type = OP_NONE; /* no writeback */ 3091 c->dst.type = OP_NONE; /* no writeback */
2501 break; 3092 break;
2502 case 0x21: /* mov from dr to reg */ 3093 case 0x21: /* mov from dr to reg */
2503 if (c->modrm_mod != 3) 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2504 goto cannot_emulate; 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2505 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2506 if (rc) 3097 goto done;
2507 goto cannot_emulate; 3098 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
2508 c->dst.type = OP_NONE; /* no writeback */ 3100 c->dst.type = OP_NONE; /* no writeback */
2509 break; 3101 break;
2510 case 0x22: /* mov reg, cr */ 3102 case 0x22: /* mov reg, cr */
2511 if (c->modrm_mod != 3) 3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
2512 goto cannot_emulate;
2513 realmode_set_cr(ctxt->vcpu,
2514 c->modrm_reg, c->modrm_val, &ctxt->eflags);
2515 c->dst.type = OP_NONE; 3104 c->dst.type = OP_NONE;
2516 break; 3105 break;
2517 case 0x23: /* mov from reg to dr */ 3106 case 0x23: /* mov from reg to dr */
2518 if (c->modrm_mod != 3) 3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2519 goto cannot_emulate; 3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2520 rc = emulator_set_dr(ctxt, c->modrm_reg, 3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2521 c->regs[c->modrm_rm]); 3110 goto done;
2522 if (rc) 3111 }
2523 goto cannot_emulate; 3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
2524 c->dst.type = OP_NONE; /* no writeback */ 3113 c->dst.type = OP_NONE; /* no writeback */
2525 break; 3114 break;
2526 case 0x30: 3115 case 0x30:
2527 /* wrmsr */ 3116 /* wrmsr */
2528 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3117 msr_data = (u32)c->regs[VCPU_REGS_RAX]
2529 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
2530 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
2531 if (rc) {
2532 kvm_inject_gp(ctxt->vcpu, 0); 3120 kvm_inject_gp(ctxt->vcpu, 0);
2533 c->eip = kvm_rip_read(ctxt->vcpu); 3121 goto done;
2534 } 3122 }
2535 rc = X86EMUL_CONTINUE; 3123 rc = X86EMUL_CONTINUE;
2536 c->dst.type = OP_NONE; 3124 c->dst.type = OP_NONE;
2537 break; 3125 break;
2538 case 0x32: 3126 case 0x32:
2539 /* rdmsr */ 3127 /* rdmsr */
2540 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
2541 if (rc) {
2542 kvm_inject_gp(ctxt->vcpu, 0); 3129 kvm_inject_gp(ctxt->vcpu, 0);
2543 c->eip = kvm_rip_read(ctxt->vcpu); 3130 goto done;
2544 } else { 3131 } else {
2545 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
2546 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3133 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
2577 break; 3164 break;
2578 case 0xa1: /* pop fs */ 3165 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0) 3167 if (rc != X86EMUL_CONTINUE)
2581 goto done; 3168 goto done;
2582 break; 3169 break;
2583 case 0xa3: 3170 case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
2596 break; 3183 break;
2597 case 0xa9: /* pop gs */ 3184 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0) 3186 if (rc != X86EMUL_CONTINUE)
2600 goto done; 3187 goto done;
2601 break; 3188 break;
2602 case 0xab: 3189 case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
2668 (u64) c->src.val; 3255 (u64) c->src.val;
2669 break; 3256 break;
2670 case 0xc7: /* Grp9 (cmpxchg8b) */ 3257 case 0xc7: /* Grp9 (cmpxchg8b) */
2671 rc = emulate_grp9(ctxt, ops, memop); 3258 rc = emulate_grp9(ctxt, ops);
2672 if (rc != 0) 3259 if (rc != X86EMUL_CONTINUE)
2673 goto done; 3260 goto done;
2674 c->dst.type = OP_NONE;
2675 break; 3261 break;
2676 } 3262 }
2677 goto writeback; 3263 goto writeback;
2678 3264
2679cannot_emulate: 3265cannot_emulate:
2680 DPRINTF("Cannot emulate %02x\n", c->b); 3266 DPRINTF("Cannot emulate %02x\n", c->b);
2681 c->eip = saved_eip;
2682 return -1; 3267 return -1;
2683} 3268}
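
A note on the rep-prefix handling introduced above: the writeback tail now decrements RCX once per completed string iteration and drops back into the guest either when the pio read-ahead cache is exhausted or, when no cache is in use, every 1024 iterations so pending events can still be injected. Below is a minimal standalone sketch of that loop-control heuristic, not taken from the patch; emu_ctxt and read_cache are hypothetical stand-ins for the emulator's decode context.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the emulator's pio read-ahead cache and context. */
struct read_cache {
	unsigned int pos;	/* next buffered element to hand out */
	unsigned int end;	/* number of buffered elements (0 = cache unused) */
};

struct emu_ctxt {
	uint64_t rcx;		/* rep iteration counter (guest RCX) */
	bool restart;		/* keep emulating the string instruction? */
	struct read_cache io_read;
};

/* One pass of the writeback tail for a rep-prefixed string instruction. */
static void rep_writeback(struct emu_ctxt *c)
{
	c->rcx--;	/* one iteration completed */

	/*
	 * Re-enter the guest when the pio read-ahead buffer is empty or,
	 * if no buffer is in use, after every 1024 iterations.
	 */
	if ((c->io_read.end == 0 && !(c->rcx & 0x3ff)) ||
	    (c->io_read.end != 0 && c->io_read.end == c->io_read.pos))
		c->restart = false;
}

int main(void)
{
	struct emu_ctxt c = { .rcx = 4096, .restart = true };
	unsigned long iters = 0;

	while (c.restart) {
		rep_writeback(&c);
		iters++;
	}
	/* With the read-ahead cache unused, the guest is re-entered after 1024 loops. */
	printf("re-entered guest after %lu iterations, rcx=%llu\n",
	       iters, (unsigned long long)c.rcx);
	return 0;
}
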
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9f..93825ff3338f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
33#include <linux/kvm_host.h> 33#include <linux/kvm_host.h>
34#include "trace.h" 34#include "trace.h"
35 35
36static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock)
38{
39 raw_spin_lock(&s->lock);
40}
41
42static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock)
44{
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->wakeup_needed = false;
49
50 raw_spin_unlock(&s->lock);
51
52 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu;
54 if (vcpu)
55 kvm_vcpu_kick(vcpu);
56 }
57}
58
36static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 59static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
37{ 60{
38 s->isr &= ~(1 << irq); 61 s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
45 * Other interrupt may be delivered to PIC while lock is dropped but 68 * Other interrupt may be delivered to PIC while lock is dropped but
46 * it should be safe since PIC state is already updated at this stage. 69 * it should be safe since PIC state is already updated at this stage.
47 */ 70 */
48 raw_spin_unlock(&s->pics_state->lock); 71 pic_unlock(s->pics_state);
49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 72 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock); 73 pic_lock(s->pics_state);
51} 74}
52 75
53void kvm_pic_clear_isr_ack(struct kvm *kvm) 76void kvm_pic_clear_isr_ack(struct kvm *kvm)
54{ 77{
55 struct kvm_pic *s = pic_irqchip(kvm); 78 struct kvm_pic *s = pic_irqchip(kvm);
56 79
57 raw_spin_lock(&s->lock); 80 pic_lock(s);
58 s->pics[0].isr_ack = 0xff; 81 s->pics[0].isr_ack = 0xff;
59 s->pics[1].isr_ack = 0xff; 82 s->pics[1].isr_ack = 0xff;
60 raw_spin_unlock(&s->lock); 83 pic_unlock(s);
61} 84}
62 85
63/* 86/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
158 181
159void kvm_pic_update_irq(struct kvm_pic *s) 182void kvm_pic_update_irq(struct kvm_pic *s)
160{ 183{
161 raw_spin_lock(&s->lock); 184 pic_lock(s);
162 pic_update_irq(s); 185 pic_update_irq(s);
163 raw_spin_unlock(&s->lock); 186 pic_unlock(s);
164} 187}
165 188
166int kvm_pic_set_irq(void *opaque, int irq, int level) 189int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
168 struct kvm_pic *s = opaque; 191 struct kvm_pic *s = opaque;
169 int ret = -1; 192 int ret = -1;
170 193
171 raw_spin_lock(&s->lock); 194 pic_lock(s);
172 if (irq >= 0 && irq < PIC_NUM_PINS) { 195 if (irq >= 0 && irq < PIC_NUM_PINS) {
173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 196 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
174 pic_update_irq(s); 197 pic_update_irq(s);
175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 198 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
176 s->pics[irq >> 3].imr, ret == 0); 199 s->pics[irq >> 3].imr, ret == 0);
177 } 200 }
178 raw_spin_unlock(&s->lock); 201 pic_unlock(s);
179 202
180 return ret; 203 return ret;
181} 204}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
205 int irq, irq2, intno; 228 int irq, irq2, intno;
206 struct kvm_pic *s = pic_irqchip(kvm); 229 struct kvm_pic *s = pic_irqchip(kvm);
207 230
208 raw_spin_lock(&s->lock); 231 pic_lock(s);
209 irq = pic_get_irq(&s->pics[0]); 232 irq = pic_get_irq(&s->pics[0]);
210 if (irq >= 0) { 233 if (irq >= 0) {
211 pic_intack(&s->pics[0], irq); 234 pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
230 intno = s->pics[0].irq_base + irq; 253 intno = s->pics[0].irq_base + irq;
231 } 254 }
232 pic_update_irq(s); 255 pic_update_irq(s);
233 raw_spin_unlock(&s->lock); 256 pic_unlock(s);
234 257
235 return intno; 258 return intno;
236} 259}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
444 printk(KERN_ERR "PIC: non byte write\n"); 467 printk(KERN_ERR "PIC: non byte write\n");
445 return 0; 468 return 0;
446 } 469 }
447 raw_spin_lock(&s->lock); 470 pic_lock(s);
448 switch (addr) { 471 switch (addr) {
449 case 0x20: 472 case 0x20:
450 case 0x21: 473 case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
457 elcr_ioport_write(&s->pics[addr & 1], addr, data); 480 elcr_ioport_write(&s->pics[addr & 1], addr, data);
458 break; 481 break;
459 } 482 }
460 raw_spin_unlock(&s->lock); 483 pic_unlock(s);
461 return 0; 484 return 0;
462} 485}
463 486
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
474 printk(KERN_ERR "PIC: non byte read\n"); 497 printk(KERN_ERR "PIC: non byte read\n");
475 return 0; 498 return 0;
476 } 499 }
477 raw_spin_lock(&s->lock); 500 pic_lock(s);
478 switch (addr) { 501 switch (addr) {
479 case 0x20: 502 case 0x20:
480 case 0x21: 503 case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
488 break; 511 break;
489 } 512 }
490 *(unsigned char *)val = data; 513 *(unsigned char *)val = data;
491 raw_spin_unlock(&s->lock); 514 pic_unlock(s);
492 return 0; 515 return 0;
493} 516}
494 517
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
505 s->output = level; 528 s->output = level;
506 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 529 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
507 s->pics[0].isr_ack &= ~(1 << irq); 530 s->pics[0].isr_ack &= ~(1 << irq);
508 kvm_vcpu_kick(vcpu); 531 s->wakeup_needed = true;
509 } 532 }
510} 533}
511 534
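
The i8259 change above wraps the raw spinlock in pic_lock()/pic_unlock() and records a wakeup_needed flag so that kvm_vcpu_kick() runs only after the PIC lock has been dropped. A simplified user-space sketch of that deferred-wakeup pattern follows; it is not KVM code, and a pthread mutex plus a stub kick stand in for the kernel primitives.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct kvm_pic: a lock plus a deferred-wakeup flag. */
struct pic {
	pthread_mutex_t lock;
	bool wakeup_needed;
};

static void kick_vcpu(void)
{
	/* Placeholder for kvm_vcpu_kick(): must not run under pic->lock. */
	puts("kick");
}

static void pic_lock(struct pic *s)
{
	pthread_mutex_lock(&s->lock);
}

static void pic_unlock(struct pic *s)
{
	bool wakeup = s->wakeup_needed;

	s->wakeup_needed = false;
	pthread_mutex_unlock(&s->lock);

	/* The potentially expensive wakeup happens outside the critical section. */
	if (wakeup)
		kick_vcpu();
}

int main(void)
{
	struct pic s = { .wakeup_needed = false };

	pthread_mutex_init(&s.lock, NULL);
	pic_lock(&s);
	s.wakeup_needed = true;	/* e.g. pic_irq_request() raised the output line */
	pic_unlock(&s);		/* kick delivered only after the lock is released */
	return 0;
}
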
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754d..cd1f362f413d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 raw_spinlock_t lock; 65 raw_spinlock_t lock;
66 bool wakeup_needed;
66 unsigned pending_acks; 67 unsigned pending_acks;
67 struct kvm *kvm; 68 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda54..64bc6ea78d90 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *); 13 bool (*is_periodic)(struct kvm_timer *);
14}; 14};
15 15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); 16enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa2..a6f695d76928 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223} 217}
224EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
225 219
226static int is_write_protection(struct kvm_vcpu *vcpu) 220static bool is_write_protection(struct kvm_vcpu *vcpu)
227{ 221{
228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 222 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
229} 223}
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1201static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1202{ 1190{
1203 WARN_ON(!sp->unsync); 1191 WARN_ON(!sp->unsync);
1192 trace_kvm_mmu_sync_page(sp);
1204 sp->unsync = 0; 1193 sp->unsync = 0;
1205 --kvm->stat.mmu_unsync; 1194 --kvm->stat.mmu_unsync;
1206} 1195}
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1198
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1200{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1202 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1203 return 1;
1215 } 1204 }
1216 1205
1217 trace_kvm_mmu_sync_page(sp);
1218 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1206 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1219 kvm_flush_remote_tlbs(vcpu->kvm); 1207 kvm_flush_remote_tlbs(vcpu->kvm);
1220 kvm_unlink_unsync_page(vcpu->kvm, sp); 1208 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1573 r = 0; 1563 r = 0;
1574 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1575 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1576 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1577 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1578 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1579 sp->role.word); 1570 sp->role.word);
1580 r = 1; 1571 r = 1;
1581 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1582 n = bucket->first; 1573 goto restart;
1583 } 1574 }
1584 return r; 1575 return r;
1585} 1576}
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1593 1584
1594 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1595 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1596 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1597 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1598 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1599 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1600 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1601 if (kvm_mmu_zap_page(kvm, sp)) 1593 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first; 1594 goto restart;
1603 } 1595 }
1604 } 1596 }
1605} 1597}
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1626 } 1618 }
1627} 1619}
1628 1620
1629struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1630{
1631 struct page *page;
1632
1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1634
1635 if (gpa == UNMAPPED_GVA)
1636 return NULL;
1637
1638 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1639
1640 return page;
1641}
1642
1643/* 1621/*
1644 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1645 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1752 struct kvm_mmu_page *s; 1730 struct kvm_mmu_page *s;
1753 struct hlist_node *node, *n; 1731 struct hlist_node *node, *n;
1754 1732
1755 trace_kvm_mmu_unsync_page(sp);
1756 index = kvm_page_table_hashfn(sp->gfn); 1733 index = kvm_page_table_hashfn(sp->gfn);
1757 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1758 /* don't unsync if pagetable is shadowed with multiple roles */ 1735 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 if (s->role.word != sp->role.word) 1739 if (s->role.word != sp->role.word)
1763 return 1; 1740 return 1;
1764 } 1741 }
1742 trace_kvm_mmu_unsync_page(sp);
1765 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1766 sp->unsync = 1; 1744 sp->unsync = 1;
1767 1745
1768 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1769 1747
1770 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1771 return 0; 1749 return 0;
@@ -1837,6 +1815,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1837 1815
1838 spte |= PT_WRITABLE_MASK; 1816 spte |= PT_WRITABLE_MASK;
1839 1817
1818 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1819 spte &= ~PT_USER_MASK;
1820
1840 /* 1821 /*
1841 * Optimization: for pte sync, if spte was writable the hash 1822 * Optimization: for pte sync, if spte was writable the hash
1842 * lookup is unnecessary (and expensive). Write protection 1823 * lookup is unnecessary (and expensive). Write protection
@@ -1892,6 +1873,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1892 1873
1893 child = page_header(pte & PT64_BASE_ADDR_MASK); 1874 child = page_header(pte & PT64_BASE_ADDR_MASK);
1894 mmu_page_remove_parent_pte(child, sptep); 1875 mmu_page_remove_parent_pte(child, sptep);
1876 __set_spte(sptep, shadow_trap_nonpresent_pte);
1877 kvm_flush_remote_tlbs(vcpu->kvm);
1895 } else if (pfn != spte_to_pfn(*sptep)) { 1878 } else if (pfn != spte_to_pfn(*sptep)) {
1896 pgprintk("hfn old %lx new %lx\n", 1879 pgprintk("hfn old %lx new %lx\n",
1897 spte_to_pfn(*sptep), pfn); 1880 spte_to_pfn(*sptep), pfn);
@@ -2081,21 +2064,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2081 hpa_t root = vcpu->arch.mmu.root_hpa; 2064 hpa_t root = vcpu->arch.mmu.root_hpa;
2082 2065
2083 ASSERT(!VALID_PAGE(root)); 2066 ASSERT(!VALID_PAGE(root));
2084 if (tdp_enabled)
2085 direct = 1;
2086 if (mmu_check_root(vcpu, root_gfn)) 2067 if (mmu_check_root(vcpu, root_gfn))
2087 return 1; 2068 return 1;
2069 if (tdp_enabled) {
2070 direct = 1;
2071 root_gfn = 0;
2072 }
2073 spin_lock(&vcpu->kvm->mmu_lock);
2088 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2074 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2089 PT64_ROOT_LEVEL, direct, 2075 PT64_ROOT_LEVEL, direct,
2090 ACC_ALL, NULL); 2076 ACC_ALL, NULL);
2091 root = __pa(sp->spt); 2077 root = __pa(sp->spt);
2092 ++sp->root_count; 2078 ++sp->root_count;
2079 spin_unlock(&vcpu->kvm->mmu_lock);
2093 vcpu->arch.mmu.root_hpa = root; 2080 vcpu->arch.mmu.root_hpa = root;
2094 return 0; 2081 return 0;
2095 } 2082 }
2096 direct = !is_paging(vcpu); 2083 direct = !is_paging(vcpu);
2097 if (tdp_enabled)
2098 direct = 1;
2099 for (i = 0; i < 4; ++i) { 2084 for (i = 0; i < 4; ++i) {
2100 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2085 hpa_t root = vcpu->arch.mmu.pae_root[i];
2101 2086
@@ -2111,11 +2096,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2111 root_gfn = 0; 2096 root_gfn = 0;
2112 if (mmu_check_root(vcpu, root_gfn)) 2097 if (mmu_check_root(vcpu, root_gfn))
2113 return 1; 2098 return 1;
2099 if (tdp_enabled) {
2100 direct = 1;
2101 root_gfn = i << 30;
2102 }
2103 spin_lock(&vcpu->kvm->mmu_lock);
2114 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2104 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2115 PT32_ROOT_LEVEL, direct, 2105 PT32_ROOT_LEVEL, direct,
2116 ACC_ALL, NULL); 2106 ACC_ALL, NULL);
2117 root = __pa(sp->spt); 2107 root = __pa(sp->spt);
2118 ++sp->root_count; 2108 ++sp->root_count;
2109 spin_unlock(&vcpu->kvm->mmu_lock);
2110
2119 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2111 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2120 } 2112 }
2121 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2113 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2291,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2299 /* no rsvd bits for 2 level 4K page table entries */ 2291 /* no rsvd bits for 2 level 4K page table entries */
2300 context->rsvd_bits_mask[0][1] = 0; 2292 context->rsvd_bits_mask[0][1] = 0;
2301 context->rsvd_bits_mask[0][0] = 0; 2293 context->rsvd_bits_mask[0][0] = 0;
2294 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2295
2296 if (!is_pse(vcpu)) {
2297 context->rsvd_bits_mask[1][1] = 0;
2298 break;
2299 }
2300
2302 if (is_cpuid_PSE36()) 2301 if (is_cpuid_PSE36())
2303 /* 36bits PSE 4MB page */ 2302 /* 36bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2303 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2305 else 2304 else
2306 /* 32 bits PSE 4MB page */ 2305 /* 32 bits PSE 4MB page */
2307 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2306 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2308 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2309 break; 2307 break;
2310 case PT32E_ROOT_LEVEL: 2308 case PT32E_ROOT_LEVEL:
2311 context->rsvd_bits_mask[0][2] = 2309 context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2316,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2318 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2316 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2319 rsvd_bits(maxphyaddr, 62) | 2317 rsvd_bits(maxphyaddr, 62) |
2320 rsvd_bits(13, 20); /* large page */ 2318 rsvd_bits(13, 20); /* large page */
2321 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2319 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2322 break; 2320 break;
2323 case PT64_ROOT_LEVEL: 2321 case PT64_ROOT_LEVEL:
2324 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2322 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2334,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2336 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2334 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2337 rsvd_bits(maxphyaddr, 51) | 2335 rsvd_bits(maxphyaddr, 51) |
2338 rsvd_bits(13, 20); /* large page */ 2336 rsvd_bits(13, 20); /* large page */
2339 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2337 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2340 break; 2338 break;
2341 } 2339 }
2342} 2340}
@@ -2438,7 +2436,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2438 else 2436 else
2439 r = paging32_init_context(vcpu); 2437 r = paging32_init_context(vcpu);
2440 2438
2441 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2439 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2440 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2442 2441
2443 return r; 2442 return r;
2444} 2443}
@@ -2478,7 +2477,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2478 goto out; 2477 goto out;
2479 spin_lock(&vcpu->kvm->mmu_lock); 2478 spin_lock(&vcpu->kvm->mmu_lock);
2480 kvm_mmu_free_some_pages(vcpu); 2479 kvm_mmu_free_some_pages(vcpu);
2480 spin_unlock(&vcpu->kvm->mmu_lock);
2481 r = mmu_alloc_roots(vcpu); 2481 r = mmu_alloc_roots(vcpu);
2482 spin_lock(&vcpu->kvm->mmu_lock);
2482 mmu_sync_roots(vcpu); 2483 mmu_sync_roots(vcpu);
2483 spin_unlock(&vcpu->kvm->mmu_lock); 2484 spin_unlock(&vcpu->kvm->mmu_lock);
2484 if (r) 2485 if (r)
@@ -2527,7 +2528,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2527 } 2528 }
2528 2529
2529 ++vcpu->kvm->stat.mmu_pte_updated; 2530 ++vcpu->kvm->stat.mmu_pte_updated;
2530 if (sp->role.glevels == PT32_ROOT_LEVEL) 2531 if (!sp->role.cr4_pae)
2531 paging32_update_pte(vcpu, sp, spte, new); 2532 paging32_update_pte(vcpu, sp, spte, new);
2532 else 2533 else
2533 paging64_update_pte(vcpu, sp, spte, new); 2534 paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2563,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2562} 2563}
2563 2564
2564static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2565static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2565 const u8 *new, int bytes) 2566 u64 gpte)
2566{ 2567{
2567 gfn_t gfn; 2568 gfn_t gfn;
2568 int r;
2569 u64 gpte = 0;
2570 pfn_t pfn; 2569 pfn_t pfn;
2571 2570
2572 if (bytes != 4 && bytes != 8)
2573 return;
2574
2575 /*
2576 * Assume that the pte write on a page table of the same type
2577 * as the current vcpu paging mode. This is nearly always true
2578 * (might be false while changing modes). Note it is verified later
2579 * by update_pte().
2580 */
2581 if (is_pae(vcpu)) {
2582 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2583 if ((bytes == 4) && (gpa % 4 == 0)) {
2584 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2585 if (r)
2586 return;
2587 memcpy((void *)&gpte + (gpa % 8), new, 4);
2588 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2589 memcpy((void *)&gpte, new, 8);
2590 }
2591 } else {
2592 if ((bytes == 4) && (gpa % 4 == 0))
2593 memcpy((void *)&gpte, new, 4);
2594 }
2595 if (!is_present_gpte(gpte)) 2571 if (!is_present_gpte(gpte))
2596 return; 2572 return;
2597 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2573 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2616,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2640 int flooded = 0; 2616 int flooded = 0;
2641 int npte; 2617 int npte;
2642 int r; 2618 int r;
2619 int invlpg_counter;
2643 2620
2644 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2621 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2645 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2622
2623 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2624
2625 /*
2626 * Assume that the pte write on a page table of the same type
2627 * as the current vcpu paging mode. This is nearly always true
2628 * (might be false while changing modes). Note it is verified later
2629 * by update_pte().
2630 */
2631 if ((is_pae(vcpu) && bytes == 4) || !new) {
2632 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2633 if (is_pae(vcpu)) {
2634 gpa &= ~(gpa_t)7;
2635 bytes = 8;
2636 }
2637 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2638 if (r)
2639 gentry = 0;
2640 new = (const u8 *)&gentry;
2641 }
2642
2643 switch (bytes) {
2644 case 4:
2645 gentry = *(const u32 *)new;
2646 break;
2647 case 8:
2648 gentry = *(const u64 *)new;
2649 break;
2650 default:
2651 gentry = 0;
2652 break;
2653 }
2654
2655 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2646 spin_lock(&vcpu->kvm->mmu_lock); 2656 spin_lock(&vcpu->kvm->mmu_lock);
2657 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2658 gentry = 0;
2647 kvm_mmu_access_page(vcpu, gfn); 2659 kvm_mmu_access_page(vcpu, gfn);
2648 kvm_mmu_free_some_pages(vcpu); 2660 kvm_mmu_free_some_pages(vcpu);
2649 ++vcpu->kvm->stat.mmu_pte_write; 2661 ++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2674,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2662 } 2674 }
2663 index = kvm_page_table_hashfn(gfn); 2675 index = kvm_page_table_hashfn(gfn);
2664 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2676 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2677
2678restart:
2665 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2679 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2666 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2680 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2667 continue; 2681 continue;
2668 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2682 pte_size = sp->role.cr4_pae ? 8 : 4;
2669 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2683 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2670 misaligned |= bytes < 4; 2684 misaligned |= bytes < 4;
2671 if (misaligned || flooded) { 2685 if (misaligned || flooded) {
@@ -2682,14 +2696,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2682 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2696 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2683 gpa, bytes, sp->role.word); 2697 gpa, bytes, sp->role.word);
2684 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2698 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2685 n = bucket->first; 2699 goto restart;
2686 ++vcpu->kvm->stat.mmu_flooded; 2700 ++vcpu->kvm->stat.mmu_flooded;
2687 continue; 2701 continue;
2688 } 2702 }
2689 page_offset = offset; 2703 page_offset = offset;
2690 level = sp->role.level; 2704 level = sp->role.level;
2691 npte = 1; 2705 npte = 1;
2692 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2706 if (!sp->role.cr4_pae) {
2693 page_offset <<= 1; /* 32->64 */ 2707 page_offset <<= 1; /* 32->64 */
2694 /* 2708 /*
2695 * A 32-bit pde maps 4MB while the shadow pdes map 2709 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2721,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2707 continue; 2721 continue;
2708 } 2722 }
2709 spte = &sp->spt[page_offset / sizeof(*spte)]; 2723 spte = &sp->spt[page_offset / sizeof(*spte)];
2710 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2711 gentry = 0;
2712 r = kvm_read_guest_atomic(vcpu->kvm,
2713 gpa & ~(u64)(pte_size - 1),
2714 &gentry, pte_size);
2715 new = (const void *)&gentry;
2716 if (r < 0)
2717 new = NULL;
2718 }
2719 while (npte--) { 2724 while (npte--) {
2720 entry = *spte; 2725 entry = *spte;
2721 mmu_pte_write_zap_pte(vcpu, sp, spte); 2726 mmu_pte_write_zap_pte(vcpu, sp, spte);
2722 if (new) 2727 if (gentry)
2723 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2728 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2729 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2725 ++spte; 2730 ++spte;
2726 } 2731 }
@@ -2900,22 +2905,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2900 struct kvm_mmu_page *sp, *node; 2905 struct kvm_mmu_page *sp, *node;
2901 2906
2902 spin_lock(&kvm->mmu_lock); 2907 spin_lock(&kvm->mmu_lock);
2908restart:
2903 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2909 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2904 if (kvm_mmu_zap_page(kvm, sp)) 2910 if (kvm_mmu_zap_page(kvm, sp))
2905 node = container_of(kvm->arch.active_mmu_pages.next, 2911 goto restart;
2906 struct kvm_mmu_page, link); 2912
2907 spin_unlock(&kvm->mmu_lock); 2913 spin_unlock(&kvm->mmu_lock);
2908 2914
2909 kvm_flush_remote_tlbs(kvm); 2915 kvm_flush_remote_tlbs(kvm);
2910} 2916}
2911 2917
2912static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2918static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
2913{ 2919{
2914 struct kvm_mmu_page *page; 2920 struct kvm_mmu_page *page;
2915 2921
2916 page = container_of(kvm->arch.active_mmu_pages.prev, 2922 page = container_of(kvm->arch.active_mmu_pages.prev,
2917 struct kvm_mmu_page, link); 2923 struct kvm_mmu_page, link);
2918 kvm_mmu_zap_page(kvm, page); 2924 return kvm_mmu_zap_page(kvm, page) + 1;
2919} 2925}
2920 2926
2921static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) 2927static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2933,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2927 spin_lock(&kvm_lock); 2933 spin_lock(&kvm_lock);
2928 2934
2929 list_for_each_entry(kvm, &vm_list, vm_list) { 2935 list_for_each_entry(kvm, &vm_list, vm_list) {
2930 int npages, idx; 2936 int npages, idx, freed_pages;
2931 2937
2932 idx = srcu_read_lock(&kvm->srcu); 2938 idx = srcu_read_lock(&kvm->srcu);
2933 spin_lock(&kvm->mmu_lock); 2939 spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2941,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2935 kvm->arch.n_free_mmu_pages; 2941 kvm->arch.n_free_mmu_pages;
2936 cache_count += npages; 2942 cache_count += npages;
2937 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 2943 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2938 kvm_mmu_remove_one_alloc_mmu_page(kvm); 2944 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
2939 cache_count--; 2945 cache_count -= freed_pages;
2940 kvm_freed = kvm; 2946 kvm_freed = kvm;
2941 } 2947 }
2942 nr_to_scan--; 2948 nr_to_scan--;
@@ -3011,7 +3017,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3011 unsigned int nr_pages = 0; 3017 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots; 3018 struct kvm_memslots *slots;
3013 3019
3014 slots = rcu_dereference(kvm->memslots); 3020 slots = kvm_memslots(kvm);
3021
3015 for (i = 0; i < slots->nmemslots; i++) 3022 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages; 3023 nr_pages += slots->memslots[i].npages;
3017 3024
@@ -3174,8 +3181,7 @@ static gva_t canonicalize(gva_t gva)
3174} 3181}
3175 3182
3176 3183
3177typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3184typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3178 u64 *sptep);
3179 3185
3180static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3186static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3181 inspect_spte_fn fn) 3187 inspect_spte_fn fn)
@@ -3191,7 +3197,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3191 child = page_header(ent & PT64_BASE_ADDR_MASK); 3197 child = page_header(ent & PT64_BASE_ADDR_MASK);
3192 __mmu_spte_walk(kvm, child, fn); 3198 __mmu_spte_walk(kvm, child, fn);
3193 } else 3199 } else
3194 fn(kvm, sp, &sp->spt[i]); 3200 fn(kvm, &sp->spt[i]);
3195 } 3201 }
3196 } 3202 }
3197} 3203}
@@ -3282,11 +3288,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3282 3288
3283static int count_rmaps(struct kvm_vcpu *vcpu) 3289static int count_rmaps(struct kvm_vcpu *vcpu)
3284{ 3290{
3291 struct kvm *kvm = vcpu->kvm;
3292 struct kvm_memslots *slots;
3285 int nmaps = 0; 3293 int nmaps = 0;
3286 int i, j, k, idx; 3294 int i, j, k, idx;
3287 3295
3288 idx = srcu_read_lock(&kvm->srcu); 3296 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots); 3297 slots = kvm_memslots(kvm);
3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3298 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3291 struct kvm_memory_slot *m = &slots->memslots[i]; 3299 struct kvm_memory_slot *m = &slots->memslots[i];
3292 struct kvm_rmap_desc *d; 3300 struct kvm_rmap_desc *d;
@@ -3315,7 +3323,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3315 return nmaps; 3323 return nmaps;
3316} 3324}
3317 3325
3318void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3326void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3319{ 3327{
3320 unsigned long *rmapp; 3328 unsigned long *rmapp;
3321 struct kvm_mmu_page *rev_sp; 3329 struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3339,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3331 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3339 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3332 audit_msg, gfn); 3340 audit_msg, gfn);
3333 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3341 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3334 audit_msg, sptep - rev_sp->spt, 3342 audit_msg, (long int)(sptep - rev_sp->spt),
3335 rev_sp->gfn); 3343 rev_sp->gfn);
3336 dump_stack(); 3344 dump_stack();
3337 return; 3345 return;
3338 } 3346 }
3339 3347
3340 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3348 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3341 is_large_pte(*sptep)); 3349 rev_sp->role.level);
3342 if (!*rmapp) { 3350 if (!*rmapp) {
3343 if (!printk_ratelimit()) 3351 if (!printk_ratelimit())
3344 return; 3352 return;
@@ -3373,7 +3381,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3373 continue; 3381 continue;
3374 if (!(ent & PT_WRITABLE_MASK)) 3382 if (!(ent & PT_WRITABLE_MASK))
3375 continue; 3383 continue;
3376 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3384 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3377 } 3385 }
3378 } 3386 }
3379 return; 3387 return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a9..42f07b1bfbc9 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,14 +6,12 @@
6 6
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11 9
12#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \ 11 __field(__u64, gfn) \
14 __field(__u32, role) \ 12 __field(__u32, role) \
15 __field(__u32, root_count) \ 13 __field(__u32, root_count) \
16 __field(__u32, unsync) 14 __field(bool, unsync)
17 15
18#define KVM_MMU_PAGE_ASSIGN(sp) \ 16#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->gfn = sp->gfn; \ 17 __entry->gfn = sp->gfn; \
@@ -30,14 +28,14 @@
30 \ 28 \
31 role.word = __entry->role; \ 29 role.word = __entry->role; \
32 \ 30 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ 31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", \ 32 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \ 33 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \
36 role.quadrant, \ 35 role.quadrant, \
37 role.direct ? " direct" : "", \ 36 role.direct ? " direct" : "", \
38 access_str[role.access], \ 37 access_str[role.access], \
39 role.invalid ? " invalid" : "", \ 38 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \ 39 role.nxe ? "" : "!", \
42 __entry->root_count, \ 40 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 41 __entry->unsync ? "unsync" : "sync", 0); \
@@ -94,15 +92,15 @@ TRACE_EVENT(
94 TP_printk("pte %llx level %u", __entry->pte, __entry->level) 92 TP_printk("pte %llx level %u", __entry->pte, __entry->level)
95); 93);
96 94
97/* We set a pte accessed bit */ 95DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
98TRACE_EVENT( 96
99 kvm_mmu_set_accessed_bit,
100 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 97 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
98
101 TP_ARGS(table_gfn, index, size), 99 TP_ARGS(table_gfn, index, size),
102 100
103 TP_STRUCT__entry( 101 TP_STRUCT__entry(
104 __field(__u64, gpa) 102 __field(__u64, gpa)
105 ), 103 ),
106 104
107 TP_fast_assign( 105 TP_fast_assign(
108 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 106 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -112,22 +110,20 @@ TRACE_EVENT(
112 TP_printk("gpa %llx", __entry->gpa) 110 TP_printk("gpa %llx", __entry->gpa)
113); 111);
114 112
115/* We set a pte dirty bit */ 113/* We set a pte accessed bit */
116TRACE_EVENT( 114DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
117 kvm_mmu_set_dirty_bit, 115
118 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 116 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
119 TP_ARGS(table_gfn, index, size),
120 117
121 TP_STRUCT__entry( 118 TP_ARGS(table_gfn, index, size)
122 __field(__u64, gpa) 119);
123 ),
124 120
125 TP_fast_assign( 121/* We set a pte dirty bit */
126 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 122DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
127 + index * size;
128 ),
129 123
130 TP_printk("gpa %llx", __entry->gpa) 124 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
125
126 TP_ARGS(table_gfn, index, size)
131); 127);
132 128
133TRACE_EVENT( 129TRACE_EVENT(
@@ -166,55 +162,45 @@ TRACE_EVENT(
166 __entry->created ? "new" : "existing") 162 __entry->created ? "new" : "existing")
167); 163);
168 164
169TRACE_EVENT( 165DECLARE_EVENT_CLASS(kvm_mmu_page_class,
170 kvm_mmu_sync_page, 166
171 TP_PROTO(struct kvm_mmu_page *sp), 167 TP_PROTO(struct kvm_mmu_page *sp),
172 TP_ARGS(sp), 168 TP_ARGS(sp),
173 169
174 TP_STRUCT__entry( 170 TP_STRUCT__entry(
175 KVM_MMU_PAGE_FIELDS 171 KVM_MMU_PAGE_FIELDS
176 ), 172 ),
177 173
178 TP_fast_assign( 174 TP_fast_assign(
179 KVM_MMU_PAGE_ASSIGN(sp) 175 KVM_MMU_PAGE_ASSIGN(sp)
180 ), 176 ),
181 177
182 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 178 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
183); 179);
184 180
185TRACE_EVENT( 181DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
186 kvm_mmu_unsync_page,
187 TP_PROTO(struct kvm_mmu_page *sp), 182 TP_PROTO(struct kvm_mmu_page *sp),
188 TP_ARGS(sp),
189
190 TP_STRUCT__entry(
191 KVM_MMU_PAGE_FIELDS
192 ),
193 183
194 TP_fast_assign( 184 TP_ARGS(sp)
195 KVM_MMU_PAGE_ASSIGN(sp)
196 ),
197
198 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
199); 185);
200 186
201TRACE_EVENT( 187DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
202 kvm_mmu_zap_page,
203 TP_PROTO(struct kvm_mmu_page *sp), 188 TP_PROTO(struct kvm_mmu_page *sp),
204 TP_ARGS(sp),
205 189
206 TP_STRUCT__entry( 190 TP_ARGS(sp)
207 KVM_MMU_PAGE_FIELDS 191);
208 ),
209 192
210 TP_fast_assign( 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
211 KVM_MMU_PAGE_ASSIGN(sp) 194 TP_PROTO(struct kvm_mmu_page *sp),
212 ),
213 195
214 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 196 TP_ARGS(sp)
215); 197);
216
217#endif /* _TRACE_KVMMMU_H */ 198#endif /* _TRACE_KVMMMU_H */
218 199
200#undef TRACE_INCLUDE_PATH
201#define TRACE_INCLUDE_PATH .
202#undef TRACE_INCLUDE_FILE
203#define TRACE_INCLUDE_FILE mmutrace
204
219/* This part must be outside protection */ 205/* This part must be outside protection */
220#include <trace/define_trace.h> 206#include <trace/define_trace.h>
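
The mmutrace.h change above folds the three identical TRACE_EVENT definitions (sync_page, unsync_page, zap_page) into one DECLARE_EVENT_CLASS plus three one-line DEFINE_EVENTs, does the same for the accessed/dirty-bit events, and moves the TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE defines past the header guard so define_trace.h still sees them on re-inclusion. A minimal sketch of the same pattern for a hypothetical event follows; the "myevt" names are made up, and the usual TRACE_SYSTEM and trailing define_trace.h boilerplate is omitted.

/* Sketch only: shared event class, reused by two cheap event definitions. */
DECLARE_EVENT_CLASS(myevt_class,

        TP_PROTO(unsigned long gfn),

        TP_ARGS(gfn),

        TP_STRUCT__entry(
                __field(unsigned long, gfn)
        ),

        TP_fast_assign(
                __entry->gfn = gfn;
        ),

        TP_printk("gfn %lx", __entry->gfn)
);

/* Each event now costs two lines instead of a full TRACE_EVENT body. */
DEFINE_EVENT(myevt_class, myevt_first,
        TP_PROTO(unsigned long gfn),
        TP_ARGS(gfn)
);

DEFINE_EVENT(myevt_class, myevt_second,
        TP_PROTO(unsigned long gfn),
        TP_ARGS(gfn)
);
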
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6a..89d66ca4d87c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
170 goto access_error; 170 goto access_error;
171 171
172#if PTTYPE == 64 172#if PTTYPE == 64
173 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) 173 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 174 goto access_error;
175#endif 175#endif
176 176
@@ -190,10 +190,10 @@ walk:
190 190
191 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 191 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
192 ((walker->level == PT_DIRECTORY_LEVEL) && 192 ((walker->level == PT_DIRECTORY_LEVEL) &&
193 (pte & PT_PAGE_SIZE_MASK) && 193 is_large_pte(pte) &&
194 (PTTYPE == 64 || is_pse(vcpu))) || 194 (PTTYPE == 64 || is_pse(vcpu))) ||
195 ((walker->level == PT_PDPE_LEVEL) && 195 ((walker->level == PT_PDPE_LEVEL) &&
196 (pte & PT_PAGE_SIZE_MASK) && 196 is_large_pte(pte) &&
197 is_long_mode(vcpu))) { 197 is_long_mode(vcpu))) {
198 int lvl = walker->level; 198 int lvl = walker->level;
199 199
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
258 pt_element_t gpte; 258 pt_element_t gpte;
259 unsigned pte_access; 259 unsigned pte_access;
260 pfn_t pfn; 260 pfn_t pfn;
261 u64 new_spte;
261 262
262 gpte = *(const pt_element_t *)pte; 263 gpte = *(const pt_element_t *)pte;
263 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
264 if (!is_present_gpte(gpte)) 265 if (!is_present_gpte(gpte)) {
265 __set_spte(spte, shadow_notrap_nonpresent_pte); 266 if (page->unsync)
267 new_spte = shadow_trap_nonpresent_pte;
268 else
269 new_spte = shadow_notrap_nonpresent_pte;
270 __set_spte(spte, new_spte);
271 }
266 return; 272 return;
267 } 273 }
268 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
458{ 464{
459 struct kvm_shadow_walk_iterator iterator; 465 struct kvm_shadow_walk_iterator iterator;
466 gpa_t pte_gpa = -1;
460 int level; 467 int level;
461 u64 *sptep; 468 u64 *sptep;
462 int need_flush = 0; 469 int need_flush = 0;
@@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 474 level = iterator.level;
468 sptep = iterator.sptep; 475 sptep = iterator.sptep;
469 476
470 if (level == PT_PAGE_TABLE_LEVEL || 477 if (is_last_spte(*sptep, level)) {
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 479 int offset, shift;
480
481 shift = PAGE_SHIFT -
482 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
483 offset = sp->role.quadrant << shift;
484
485 pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
486 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
473 487
474 if (is_shadow_present_pte(*sptep)) { 488 if (is_shadow_present_pte(*sptep)) {
475 rmap_remove(vcpu->kvm, sptep); 489 rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 501
488 if (need_flush) 502 if (need_flush)
489 kvm_flush_remote_tlbs(vcpu->kvm); 503 kvm_flush_remote_tlbs(vcpu->kvm);
504
505 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
506
490 spin_unlock(&vcpu->kvm->mmu_lock); 507 spin_unlock(&vcpu->kvm->mmu_lock);
508
509 if (pte_gpa == -1)
510 return;
511
512 if (mmu_topup_memory_caches(vcpu))
513 return;
514 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
491} 515}
492 516
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 517static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
551{ 575{
552 int i, offset, nr_present; 576 int i, offset, nr_present;
553 bool reset_host_protection; 577 bool reset_host_protection;
578 gpa_t first_pte_gpa;
554 579
555 offset = nr_present = 0; 580 offset = nr_present = 0;
556 581
557 if (PTTYPE == 32) 582 if (PTTYPE == 32)
558 offset = sp->role.quadrant << PT64_LEVEL_BITS; 583 offset = sp->role.quadrant << PT64_LEVEL_BITS;
559 584
585 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
586
560 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 587 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
561 unsigned pte_access; 588 unsigned pte_access;
562 pt_element_t gpte; 589 pt_element_t gpte;
@@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
566 if (!is_shadow_present_pte(sp->spt[i])) 593 if (!is_shadow_present_pte(sp->spt[i]))
567 continue; 594 continue;
568 595
569 pte_gpa = gfn_to_gpa(sp->gfn); 596 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
570 pte_gpa += (i+offset) * sizeof(pt_element_t);
571 597
572 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 598 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
573 sizeof(pt_element_t))) 599 sizeof(pt_element_t)))
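
Both the invlpg and sync_page hunks above derive the guest PTE's physical address from the shadow page itself: the page's gfn gives the base, role.quadrant contributes a fixed offset (shifted by PT64_LEVEL_BITS in the 32-bit sync_page case, or by the level-dependent shift in invlpg), and the index of the spte within sp->spt selects the entry. A standalone sketch of that arithmetic, assuming 8-byte guest PTEs (PTTYPE == 64) and 4 KiB pages; guest_pte_gpa() and its arguments are illustrative names, not kernel API.

#include <stdint.h>
#include <stdio.h>

#define MY_PAGE_SHIFT   12
#define PTE_SIZE        sizeof(uint64_t)        /* pt_element_t for PTTYPE == 64 */

/* GPA of the guest PTE that a given shadow PTE slot mirrors. */
static uint64_t guest_pte_gpa(uint64_t sp_gfn, unsigned quadrant_offset,
                              unsigned spte_index)
{
        uint64_t first_pte_gpa = (sp_gfn << MY_PAGE_SHIFT) +
                                 quadrant_offset * PTE_SIZE;

        return first_pte_gpa + spte_index * PTE_SIZE;
}

int main(void)
{
        /* Shadow page backing guest frame 0x1234, no quadrant split,
         * spte index 5 -> guest PTE lives at 0x1234000 + 5 * 8. */
        printf("pte gpa = %#llx\n",
               (unsigned long long)guest_pte_gpa(0x1234, 0, 5));
        return 0;
}
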
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 737361fcd503..ce438e0fdd26 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,7 @@
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30 30
31#include <asm/tlbflush.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
32 33
33#include <asm/virtext.h> 34#include <asm/virtext.h>
@@ -44,10 +45,11 @@ MODULE_LICENSE("GPL");
44#define SEG_TYPE_LDT 2 45#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 46#define SEG_TYPE_BUSY_TSS16 3
46 47
47#define SVM_FEATURE_NPT (1 << 0) 48#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1) 49#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_FEATURE_SVML (1 << 2) 50#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 51#define SVM_FEATURE_NRIP (1 << 3)
52#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
51 53
52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 54#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 55#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -55,6 +57,8 @@ MODULE_LICENSE("GPL");
55 57
56#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 58#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
57 59
60static bool erratum_383_found __read_mostly;
61
58static const u32 host_save_user_msrs[] = { 62static const u32 host_save_user_msrs[] = {
59#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
60 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -70,6 +74,7 @@ struct kvm_vcpu;
70struct nested_state { 74struct nested_state {
71 struct vmcb *hsave; 75 struct vmcb *hsave;
72 u64 hsave_msr; 76 u64 hsave_msr;
77 u64 vm_cr_msr;
73 u64 vmcb; 78 u64 vmcb;
74 79
75 /* These are the merged vectors */ 80 /* These are the merged vectors */
@@ -77,6 +82,7 @@ struct nested_state {
77 82
78 /* gpa pointers to the real vectors */ 83 /* gpa pointers to the real vectors */
79 u64 vmcb_msrpm; 84 u64 vmcb_msrpm;
85 u64 vmcb_iopm;
80 86
81 /* A VMEXIT is required but not yet emulated */ 87 /* A VMEXIT is required but not yet emulated */
82 bool exit_required; 88 bool exit_required;
@@ -91,6 +97,9 @@ struct nested_state {
91 97
92}; 98};
93 99
100#define MSRPM_OFFSETS 16
101static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
102
94struct vcpu_svm { 103struct vcpu_svm {
95 struct kvm_vcpu vcpu; 104 struct kvm_vcpu vcpu;
96 struct vmcb *vmcb; 105 struct vmcb *vmcb;
@@ -110,13 +119,39 @@ struct vcpu_svm {
110 struct nested_state nested; 119 struct nested_state nested;
111 120
112 bool nmi_singlestep; 121 bool nmi_singlestep;
122
123 unsigned int3_injected;
124 unsigned long int3_rip;
125};
126
127#define MSR_INVALID 0xffffffffU
128
129static struct svm_direct_access_msrs {
130 u32 index; /* Index of the MSR */
131 bool always; /* True if intercept is always on */
132} direct_access_msrs[] = {
133 { .index = MSR_K6_STAR, .always = true },
134 { .index = MSR_IA32_SYSENTER_CS, .always = true },
135#ifdef CONFIG_X86_64
136 { .index = MSR_GS_BASE, .always = true },
137 { .index = MSR_FS_BASE, .always = true },
138 { .index = MSR_KERNEL_GS_BASE, .always = true },
139 { .index = MSR_LSTAR, .always = true },
140 { .index = MSR_CSTAR, .always = true },
141 { .index = MSR_SYSCALL_MASK, .always = true },
142#endif
143 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
144 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
145 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
146 { .index = MSR_IA32_LASTINTTOIP, .always = false },
147 { .index = MSR_INVALID, .always = false },
113}; 148};
114 149
115/* enable NPT for AMD64 and X86 with PAE */ 150/* enable NPT for AMD64 and X86 with PAE */
116#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 151#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
117static bool npt_enabled = true; 152static bool npt_enabled = true;
118#else 153#else
119static bool npt_enabled = false; 154static bool npt_enabled;
120#endif 155#endif
121static int npt = 1; 156static int npt = 1;
122 157
@@ -129,6 +164,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
129static void svm_complete_interrupts(struct vcpu_svm *svm); 164static void svm_complete_interrupts(struct vcpu_svm *svm);
130 165
131static int nested_svm_exit_handled(struct vcpu_svm *svm); 166static int nested_svm_exit_handled(struct vcpu_svm *svm);
167static int nested_svm_intercept(struct vcpu_svm *svm);
132static int nested_svm_vmexit(struct vcpu_svm *svm); 168static int nested_svm_vmexit(struct vcpu_svm *svm);
133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 169static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
134 bool has_error_code, u32 error_code); 170 bool has_error_code, u32 error_code);
@@ -163,8 +199,8 @@ static unsigned long iopm_base;
163struct kvm_ldttss_desc { 199struct kvm_ldttss_desc {
164 u16 limit0; 200 u16 limit0;
165 u16 base0; 201 u16 base0;
166 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 202 unsigned base1:8, type:5, dpl:2, p:1;
167 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; 203 unsigned limit1:4, zero0:3, g:1, base2:8;
168 u32 base3; 204 u32 base3;
169 u32 zero1; 205 u32 zero1;
170} __attribute__((packed)); 206} __attribute__((packed));
@@ -194,6 +230,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
194#define MSRS_RANGE_SIZE 2048 230#define MSRS_RANGE_SIZE 2048
195#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 231#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
196 232
233static u32 svm_msrpm_offset(u32 msr)
234{
235 u32 offset;
236 int i;
237
238 for (i = 0; i < NUM_MSR_MAPS; i++) {
239 if (msr < msrpm_ranges[i] ||
240 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
241 continue;
242
243 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
244 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
245
246 /* Now we have the u8 offset - but need the u32 offset */
247 return offset / 4;
248 }
249
250 /* MSR not in any range */
251 return MSR_INVALID;
252}
253
197#define MAX_INST_SIZE 15 254#define MAX_INST_SIZE 15
198 255
199static inline u32 svm_has(u32 feat) 256static inline u32 svm_has(u32 feat)
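
svm_msrpm_offset() above turns an MSR number into a 32-bit-word index into the MSR permission map: three ranges (0x0, 0xc0000000, 0xc0010000), each covering MSRS_IN_RANGE MSRs in MSRS_RANGE_SIZE bytes, i.e. two bits per MSR and four MSRs per byte. A user-space sketch of the same calculation with the constants copied in; msrpm_offset() here is only an illustration, not the kernel function.

#include <stdint.h>
#include <stdio.h>

#define MSR_INVALID     0xffffffffU
#define MSRS_RANGE_SIZE 2048                            /* bytes per range */
#define MSRS_IN_RANGE   (MSRS_RANGE_SIZE * 8 / 2)       /* 2 bits per MSR  */

static const uint32_t msrpm_ranges[] = { 0, 0xc0000000, 0xc0010000 };

static uint32_t msrpm_offset(uint32_t msr)
{
        unsigned int i;

        for (i = 0; i < 3; i++) {
                uint32_t offset;

                if (msr < msrpm_ranges[i] ||
                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
                        continue;

                offset  = (msr - msrpm_ranges[i]) / 4;  /* byte within range */
                offset += i * MSRS_RANGE_SIZE;          /* byte within map   */

                return offset / 4;                      /* u32 word index    */
        }

        return MSR_INVALID;     /* MSR not covered by any range */
}

int main(void)
{
        /* MSR_IA32_SYSENTER_CS (0x174): byte 0x5d of range 0 -> word 23. */
        printf("offset = %u\n", (unsigned)msrpm_offset(0x174));
        return 0;
}
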
@@ -213,7 +270,7 @@ static inline void stgi(void)
213 270
214static inline void invlpga(unsigned long addr, u32 asid) 271static inline void invlpga(unsigned long addr, u32 asid)
215{ 272{
216 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 273 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
217} 274}
218 275
219static inline void force_new_asid(struct kvm_vcpu *vcpu) 276static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +292,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
235 vcpu->arch.efer = efer; 292 vcpu->arch.efer = efer;
236} 293}
237 294
238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
239 bool has_error_code, u32 error_code)
240{
241 struct vcpu_svm *svm = to_svm(vcpu);
242
243 /* If we are within a nested VM we'd better #VMEXIT and let the
244 guest handle the exception */
245 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
246 return;
247
248 svm->vmcb->control.event_inj = nr
249 | SVM_EVTINJ_VALID
250 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
251 | SVM_EVTINJ_TYPE_EXEPT;
252 svm->vmcb->control.event_inj_err = error_code;
253}
254
255static int is_external_interrupt(u32 info) 295static int is_external_interrupt(u32 info)
256{ 296{
257 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 297 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +304,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
264 u32 ret = 0; 304 u32 ret = 0;
265 305
266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 306 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
267 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; 307 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
268 return ret & mask; 308 return ret & mask;
269} 309}
270 310
@@ -283,6 +323,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
283{ 323{
284 struct vcpu_svm *svm = to_svm(vcpu); 324 struct vcpu_svm *svm = to_svm(vcpu);
285 325
326 if (svm->vmcb->control.next_rip != 0)
327 svm->next_rip = svm->vmcb->control.next_rip;
328
286 if (!svm->next_rip) { 329 if (!svm->next_rip) {
287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 330 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
288 EMULATE_DONE) 331 EMULATE_DONE)
@@ -297,6 +340,68 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
297 svm_set_interrupt_shadow(vcpu, 0); 340 svm_set_interrupt_shadow(vcpu, 0);
298} 341}
299 342
343static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
344 bool has_error_code, u32 error_code,
345 bool reinject)
346{
347 struct vcpu_svm *svm = to_svm(vcpu);
348
349 /*
350 * If we are within a nested VM we'd better #VMEXIT and let the guest
351 * handle the exception
352 */
353 if (!reinject &&
354 nested_svm_check_exception(svm, nr, has_error_code, error_code))
355 return;
356
357 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
358 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
359
360 /*
361 * For guest debugging where we have to reinject #BP if some
362 * INT3 is guest-owned:
363 * Emulate nRIP by moving RIP forward. Will fail if injection
364 * raises a fault that is not intercepted. Still better than
365 * failing in all cases.
366 */
367 skip_emulated_instruction(&svm->vcpu);
368 rip = kvm_rip_read(&svm->vcpu);
369 svm->int3_rip = rip + svm->vmcb->save.cs.base;
370 svm->int3_injected = rip - old_rip;
371 }
372
373 svm->vmcb->control.event_inj = nr
374 | SVM_EVTINJ_VALID
375 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
376 | SVM_EVTINJ_TYPE_EXEPT;
377 svm->vmcb->control.event_inj_err = error_code;
378}
379
380static void svm_init_erratum_383(void)
381{
382 u32 low, high;
383 int err;
384 u64 val;
385
386 /* Only Fam10h is affected */
387 if (boot_cpu_data.x86 != 0x10)
388 return;
389
390 /* Use _safe variants to not break nested virtualization */
391 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
392 if (err)
393 return;
394
395 val |= (1ULL << 47);
396
397 low = lower_32_bits(val);
398 high = upper_32_bits(val);
399
400 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
401
402 erratum_383_found = true;
403}
404
300static int has_svm(void) 405static int has_svm(void)
301{ 406{
302 const char *msg; 407 const char *msg;
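
svm_init_erratum_383() above is a read-modify-write of MSR_AMD64_DC_CFG using the _safe accessors, so a faulting RDMSR/WRMSR (for instance under a hypervisor that hides the MSR) is tolerated instead of crashing the host; the value is split into 32-bit halves because WRMSR takes the payload as EDX:EAX. A trivial sketch of the bit-set and split, with lo32()/hi32() standing in for the kernel's lower_32_bits()/upper_32_bits().

#include <stdint.h>
#include <stdio.h>

static inline uint32_t lo32(uint64_t v) { return (uint32_t)v; }
static inline uint32_t hi32(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
        uint64_t val = 0x0000000000001000ULL;   /* pretend current MSR value */

        val |= 1ULL << 47;                      /* erratum 383 workaround bit */

        /* A WRMSR would be given the value as two 32-bit halves. */
        printf("low = %#x, high = %#x\n", (unsigned)lo32(val), (unsigned)hi32(val));
        return 0;
}
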
@@ -319,7 +424,7 @@ static int svm_hardware_enable(void *garbage)
319 424
320 struct svm_cpu_data *sd; 425 struct svm_cpu_data *sd;
321 uint64_t efer; 426 uint64_t efer;
322 struct descriptor_table gdt_descr; 427 struct desc_ptr gdt_descr;
323 struct desc_struct *gdt; 428 struct desc_struct *gdt;
324 int me = raw_smp_processor_id(); 429 int me = raw_smp_processor_id();
325 430
@@ -344,14 +449,16 @@ static int svm_hardware_enable(void *garbage)
344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 449 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
345 sd->next_asid = sd->max_asid + 1; 450 sd->next_asid = sd->max_asid + 1;
346 451
347 kvm_get_gdt(&gdt_descr); 452 native_store_gdt(&gdt_descr);
348 gdt = (struct desc_struct *)gdt_descr.base; 453 gdt = (struct desc_struct *)gdt_descr.address;
349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 454 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
350 455
351 wrmsrl(MSR_EFER, efer | EFER_SVME); 456 wrmsrl(MSR_EFER, efer | EFER_SVME);
352 457
353 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 458 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
354 459
460 svm_init_erratum_383();
461
355 return 0; 462 return 0;
356} 463}
357 464
@@ -391,42 +498,98 @@ err_1:
391 498
392} 499}
393 500
501static bool valid_msr_intercept(u32 index)
502{
503 int i;
504
505 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
506 if (direct_access_msrs[i].index == index)
507 return true;
508
509 return false;
510}
511
394static void set_msr_interception(u32 *msrpm, unsigned msr, 512static void set_msr_interception(u32 *msrpm, unsigned msr,
395 int read, int write) 513 int read, int write)
396{ 514{
515 u8 bit_read, bit_write;
516 unsigned long tmp;
517 u32 offset;
518
519 /*
520 * If this warning triggers extend the direct_access_msrs list at the
521 * beginning of the file
522 */
523 WARN_ON(!valid_msr_intercept(msr));
524
525 offset = svm_msrpm_offset(msr);
526 bit_read = 2 * (msr & 0x0f);
527 bit_write = 2 * (msr & 0x0f) + 1;
528 tmp = msrpm[offset];
529
530 BUG_ON(offset == MSR_INVALID);
531
532 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
533 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
534
535 msrpm[offset] = tmp;
536}
537
538static void svm_vcpu_init_msrpm(u32 *msrpm)
539{
397 int i; 540 int i;
398 541
399 for (i = 0; i < NUM_MSR_MAPS; i++) { 542 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
400 if (msr >= msrpm_ranges[i] && 543
401 msr < msrpm_ranges[i] + MSRS_IN_RANGE) { 544 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
402 u32 msr_offset = (i * MSRS_IN_RANGE + msr - 545 if (!direct_access_msrs[i].always)
403 msrpm_ranges[i]) * 2; 546 continue;
404 547
405 u32 *base = msrpm + (msr_offset / 32); 548 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
406 u32 msr_shift = msr_offset % 32; 549 }
407 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 550}
408 *base = (*base & ~(0x3 << msr_shift)) | 551
409 (mask << msr_shift); 552static void add_msr_offset(u32 offset)
553{
554 int i;
555
556 for (i = 0; i < MSRPM_OFFSETS; ++i) {
557
558 /* Offset already in list? */
559 if (msrpm_offsets[i] == offset)
410 return; 560 return;
411 } 561
562 /* Slot used by another offset? */
563 if (msrpm_offsets[i] != MSR_INVALID)
564 continue;
565
566 /* Add offset to list */
567 msrpm_offsets[i] = offset;
568
569 return;
412 } 570 }
571
572 /*
573 * If this BUG triggers the msrpm_offsets table has an overflow. Just
574 * increase MSRPM_OFFSETS in this case.
575 */
413 BUG(); 576 BUG();
414} 577}
415 578
416static void svm_vcpu_init_msrpm(u32 *msrpm) 579static void init_msrpm_offsets(void)
417{ 580{
418 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 581 int i;
419 582
420#ifdef CONFIG_X86_64 583 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
421 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); 584
422 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); 585 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
423 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); 586 u32 offset;
424 set_msr_interception(msrpm, MSR_LSTAR, 1, 1); 587
425 set_msr_interception(msrpm, MSR_CSTAR, 1, 1); 588 offset = svm_msrpm_offset(direct_access_msrs[i].index);
426 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); 589 BUG_ON(offset == MSR_INVALID);
427#endif 590
428 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 591 add_msr_offset(offset);
429 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 592 }
430} 593}
431 594
432static void svm_enable_lbrv(struct vcpu_svm *svm) 595static void svm_enable_lbrv(struct vcpu_svm *svm)
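
In set_msr_interception() above, one 32-bit word of the permission map covers 16 consecutive MSRs: bit 2*(msr & 0xf) gates reads, the next bit gates writes, and a clear bit means the access is passed straight through to the guest. A small sketch of just that bit arithmetic, detached from the vcpu structures; msr_perm_word() is an illustrative helper, not kernel code.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Update the two permission bits for one MSR inside its bitmap word.
 * read/write == true means "allow direct access" (clear the bit). */
static uint32_t msr_perm_word(uint32_t word, uint32_t msr,
                              bool read, bool write)
{
        unsigned bit_read  = 2 * (msr & 0x0f);
        unsigned bit_write = bit_read + 1;

        word = read  ? word & ~(1u << bit_read)  : word | (1u << bit_read);
        word = write ? word & ~(1u << bit_write) : word | (1u << bit_write);

        return word;
}

int main(void)
{
        /* Start from "intercept everything" (all ones), then open up
         * MSR 0x174 for both reads and writes: bits 8 and 9 get cleared. */
        uint32_t word = msr_perm_word(0xffffffffu, 0x174, true, true);

        printf("word = %#x\n", (unsigned)word);         /* 0xfffffcff */
        return 0;
}
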
@@ -467,6 +630,8 @@ static __init int svm_hardware_setup(void)
467 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 630 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
468 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 631 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
469 632
633 init_msrpm_offsets();
634
470 if (boot_cpu_has(X86_FEATURE_NX)) 635 if (boot_cpu_has(X86_FEATURE_NX))
471 kvm_enable_efer_bits(EFER_NX); 636 kvm_enable_efer_bits(EFER_NX);
472 637
@@ -523,7 +688,7 @@ static void init_seg(struct vmcb_seg *seg)
523{ 688{
524 seg->selector = 0; 689 seg->selector = 0;
525 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 690 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
526 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 691 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
527 seg->limit = 0xffff; 692 seg->limit = 0xffff;
528 seg->base = 0; 693 seg->base = 0;
529} 694}
@@ -543,16 +708,16 @@ static void init_vmcb(struct vcpu_svm *svm)
543 708
544 svm->vcpu.fpu_active = 1; 709 svm->vcpu.fpu_active = 1;
545 710
546 control->intercept_cr_read = INTERCEPT_CR0_MASK | 711 control->intercept_cr_read = INTERCEPT_CR0_MASK |
547 INTERCEPT_CR3_MASK | 712 INTERCEPT_CR3_MASK |
548 INTERCEPT_CR4_MASK; 713 INTERCEPT_CR4_MASK;
549 714
550 control->intercept_cr_write = INTERCEPT_CR0_MASK | 715 control->intercept_cr_write = INTERCEPT_CR0_MASK |
551 INTERCEPT_CR3_MASK | 716 INTERCEPT_CR3_MASK |
552 INTERCEPT_CR4_MASK | 717 INTERCEPT_CR4_MASK |
553 INTERCEPT_CR8_MASK; 718 INTERCEPT_CR8_MASK;
554 719
555 control->intercept_dr_read = INTERCEPT_DR0_MASK | 720 control->intercept_dr_read = INTERCEPT_DR0_MASK |
556 INTERCEPT_DR1_MASK | 721 INTERCEPT_DR1_MASK |
557 INTERCEPT_DR2_MASK | 722 INTERCEPT_DR2_MASK |
558 INTERCEPT_DR3_MASK | 723 INTERCEPT_DR3_MASK |
@@ -561,7 +726,7 @@ static void init_vmcb(struct vcpu_svm *svm)
561 INTERCEPT_DR6_MASK | 726 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 727 INTERCEPT_DR7_MASK;
563 728
564 control->intercept_dr_write = INTERCEPT_DR0_MASK | 729 control->intercept_dr_write = INTERCEPT_DR0_MASK |
565 INTERCEPT_DR1_MASK | 730 INTERCEPT_DR1_MASK |
566 INTERCEPT_DR2_MASK | 731 INTERCEPT_DR2_MASK |
567 INTERCEPT_DR3_MASK | 732 INTERCEPT_DR3_MASK |
@@ -575,7 +740,7 @@ static void init_vmcb(struct vcpu_svm *svm)
575 (1 << MC_VECTOR); 740 (1 << MC_VECTOR);
576 741
577 742
578 control->intercept = (1ULL << INTERCEPT_INTR) | 743 control->intercept = (1ULL << INTERCEPT_INTR) |
579 (1ULL << INTERCEPT_NMI) | 744 (1ULL << INTERCEPT_NMI) |
580 (1ULL << INTERCEPT_SMI) | 745 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) | 746 (1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +801,8 @@ static void init_vmcb(struct vcpu_svm *svm)
636 save->rip = 0x0000fff0; 801 save->rip = 0x0000fff0;
637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 802 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
638 803
639 /* This is the guest-visible cr0 value. 804 /*
805 * This is the guest-visible cr0 value.
640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
641 */ 807 */
642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -729,6 +895,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
729 svm_vcpu_init_msrpm(svm->msrpm); 895 svm_vcpu_init_msrpm(svm->msrpm);
730 896
731 svm->nested.msrpm = page_address(nested_msrpm_pages); 897 svm->nested.msrpm = page_address(nested_msrpm_pages);
898 svm_vcpu_init_msrpm(svm->nested.msrpm);
732 899
733 svm->vmcb = page_address(page); 900 svm->vmcb = page_address(page);
734 clear_page(svm->vmcb); 901 clear_page(svm->vmcb);
@@ -882,7 +1049,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
882 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1049 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
883 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 1050 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
884 1051
885 /* AMD's VMCB does not have an explicit unusable field, so emulate it 1052 /*
1053 * AMD's VMCB does not have an explicit unusable field, so emulate it
886 * for cross vendor migration purposes by "not present" 1054 * for cross vendor migration purposes by "not present"
887 */ 1055 */
888 var->unusable = !var->present || (var->type == 0); 1056 var->unusable = !var->present || (var->type == 0);
@@ -918,7 +1086,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
918 var->type |= 0x1; 1086 var->type |= 0x1;
919 break; 1087 break;
920 case VCPU_SREG_SS: 1088 case VCPU_SREG_SS:
921 /* On AMD CPUs sometimes the DB bit in the segment 1089 /*
1090 * On AMD CPUs sometimes the DB bit in the segment
922 * descriptor is left as 1, although the whole segment has 1091 * descriptor is left as 1, although the whole segment has
923 * been made unusable. Clear it here to pass an Intel VMX 1092 * been made unusable. Clear it here to pass an Intel VMX
924 * entry check when cross vendor migrating. 1093 * entry check when cross vendor migrating.
@@ -936,36 +1105,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
936 return save->cpl; 1105 return save->cpl;
937} 1106}
938 1107
939static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1108static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
940{ 1109{
941 struct vcpu_svm *svm = to_svm(vcpu); 1110 struct vcpu_svm *svm = to_svm(vcpu);
942 1111
943 dt->limit = svm->vmcb->save.idtr.limit; 1112 dt->size = svm->vmcb->save.idtr.limit;
944 dt->base = svm->vmcb->save.idtr.base; 1113 dt->address = svm->vmcb->save.idtr.base;
945} 1114}
946 1115
947static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1116static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
948{ 1117{
949 struct vcpu_svm *svm = to_svm(vcpu); 1118 struct vcpu_svm *svm = to_svm(vcpu);
950 1119
951 svm->vmcb->save.idtr.limit = dt->limit; 1120 svm->vmcb->save.idtr.limit = dt->size;
952 svm->vmcb->save.idtr.base = dt->base ; 1121 svm->vmcb->save.idtr.base = dt->address ;
953} 1122}
954 1123
955static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1124static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
956{ 1125{
957 struct vcpu_svm *svm = to_svm(vcpu); 1126 struct vcpu_svm *svm = to_svm(vcpu);
958 1127
959 dt->limit = svm->vmcb->save.gdtr.limit; 1128 dt->size = svm->vmcb->save.gdtr.limit;
960 dt->base = svm->vmcb->save.gdtr.base; 1129 dt->address = svm->vmcb->save.gdtr.base;
961} 1130}
962 1131
963static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1132static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
964{ 1133{
965 struct vcpu_svm *svm = to_svm(vcpu); 1134 struct vcpu_svm *svm = to_svm(vcpu);
966 1135
967 svm->vmcb->save.gdtr.limit = dt->limit; 1136 svm->vmcb->save.gdtr.limit = dt->size;
968 svm->vmcb->save.gdtr.base = dt->base ; 1137 svm->vmcb->save.gdtr.base = dt->address ;
969} 1138}
970 1139
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1140static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -978,6 +1147,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
978 1147
979static void update_cr0_intercept(struct vcpu_svm *svm) 1148static void update_cr0_intercept(struct vcpu_svm *svm)
980{ 1149{
1150 struct vmcb *vmcb = svm->vmcb;
981 ulong gcr0 = svm->vcpu.arch.cr0; 1151 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0; 1152 u64 *hcr0 = &svm->vmcb->save.cr0;
983 1153
@@ -989,11 +1159,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
989 1159
990 1160
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1161 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1162 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1163 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1164 if (is_nested(svm)) {
1165 struct vmcb *hsave = svm->nested.hsave;
1166
1167 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1168 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1169 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1170 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1171 }
994 } else { 1172 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1173 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1174 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1175 if (is_nested(svm)) {
1176 struct vmcb *hsave = svm->nested.hsave;
1177
1178 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1179 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1180 }
997 } 1181 }
998} 1182}
999 1183
@@ -1001,6 +1185,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1001{ 1185{
1002 struct vcpu_svm *svm = to_svm(vcpu); 1186 struct vcpu_svm *svm = to_svm(vcpu);
1003 1187
1188 if (is_nested(svm)) {
1189 /*
1190 * We are here because we run in nested mode, the host kvm
1191 * intercepts cr0 writes but the l1 hypervisor does not.
1192 * But the L1 hypervisor may intercept selective cr0 writes.
1193 * This needs to be checked here.
1194 */
1195 unsigned long old, new;
1196
1197 /* Remove bits that would trigger a real cr0 write intercept */
1198 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1199 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1200
1201 if (old == new) {
1202 /* cr0 write with ts and mp unchanged */
1203 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1204 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1205 return;
1206 }
1207 }
1208
1004#ifdef CONFIG_X86_64 1209#ifdef CONFIG_X86_64
1005 if (vcpu->arch.efer & EFER_LME) { 1210 if (vcpu->arch.efer & EFER_LME) {
1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1211 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1134,70 +1339,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1134 svm->vmcb->control.asid = sd->next_asid++; 1339 svm->vmcb->control.asid = sd->next_asid++;
1135} 1340}
1136 1341
1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) 1342static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1138{ 1343{
1139 struct vcpu_svm *svm = to_svm(vcpu); 1344 struct vcpu_svm *svm = to_svm(vcpu);
1140 1345
1141 switch (dr) { 1346 svm->vmcb->save.dr7 = value;
1142 case 0 ... 3:
1143 *dest = vcpu->arch.db[dr];
1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1149 case 6:
1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1151 *dest = vcpu->arch.dr6;
1152 else
1153 *dest = svm->vmcb->save.dr6;
1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1159 case 7:
1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1161 *dest = vcpu->arch.dr7;
1162 else
1163 *dest = svm->vmcb->save.dr7;
1164 break;
1165 }
1166
1167 return EMULATE_DONE;
1168}
1169
1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1171{
1172 struct vcpu_svm *svm = to_svm(vcpu);
1173
1174 switch (dr) {
1175 case 0 ... 3:
1176 vcpu->arch.db[dr] = value;
1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1178 vcpu->arch.eff_db[dr] = value;
1179 break;
1180 case 4:
1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1182 return EMULATE_FAIL; /* will re-inject UD */
1183 /* fall through */
1184 case 6:
1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1191 case 7:
1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1196 }
1197 break;
1198 }
1199
1200 return EMULATE_DONE;
1201} 1347}
1202 1348
1203static int pf_interception(struct vcpu_svm *svm) 1349static int pf_interception(struct vcpu_svm *svm)
@@ -1234,7 +1380,7 @@ static int db_interception(struct vcpu_svm *svm)
1234 } 1380 }
1235 1381
1236 if (svm->vcpu.guest_debug & 1382 if (svm->vcpu.guest_debug &
1237 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ 1383 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1238 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1384 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1239 kvm_run->debug.arch.pc = 1385 kvm_run->debug.arch.pc =
1240 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1386 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1268,7 +1414,22 @@ static int ud_interception(struct vcpu_svm *svm)
1268static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1414static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1269{ 1415{
1270 struct vcpu_svm *svm = to_svm(vcpu); 1416 struct vcpu_svm *svm = to_svm(vcpu);
1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1417 u32 excp;
1418
1419 if (is_nested(svm)) {
1420 u32 h_excp, n_excp;
1421
1422 h_excp = svm->nested.hsave->control.intercept_exceptions;
1423 n_excp = svm->nested.intercept_exceptions;
1424 h_excp &= ~(1 << NM_VECTOR);
1425 excp = h_excp | n_excp;
1426 } else {
1427 excp = svm->vmcb->control.intercept_exceptions;
1428 excp &= ~(1 << NM_VECTOR);
1429 }
1430
1431 svm->vmcb->control.intercept_exceptions = excp;
1432
1272 svm->vcpu.fpu_active = 1; 1433 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm); 1434 update_cr0_intercept(svm);
1274} 1435}
@@ -1279,8 +1440,59 @@ static int nm_interception(struct vcpu_svm *svm)
1279 return 1; 1440 return 1;
1280} 1441}
1281 1442
1282static int mc_interception(struct vcpu_svm *svm) 1443static bool is_erratum_383(void)
1283{ 1444{
1445 int err, i;
1446 u64 value;
1447
1448 if (!erratum_383_found)
1449 return false;
1450
1451 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1452 if (err)
1453 return false;
1454
1455 /* Bit 62 may or may not be set for this mce */
1456 value &= ~(1ULL << 62);
1457
1458 if (value != 0xb600000000010015ULL)
1459 return false;
1460
1461 /* Clear MCi_STATUS registers */
1462 for (i = 0; i < 6; ++i)
1463 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1464
1465 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1466 if (!err) {
1467 u32 low, high;
1468
1469 value &= ~(1ULL << 2);
1470 low = lower_32_bits(value);
1471 high = upper_32_bits(value);
1472
1473 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1474 }
1475
1476 /* Flush tlb to evict multi-match entries */
1477 __flush_tlb_all();
1478
1479 return true;
1480}
1481
1482static void svm_handle_mce(struct vcpu_svm *svm)
1483{
1484 if (is_erratum_383()) {
1485 /*
1486 * Erratum 383 triggered. Guest state is corrupt so kill the
1487 * guest.
1488 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
1492
1493 return;
1494 }
1495
1284 /* 1496 /*
1285 * On an #MC intercept the MCE handler is not called automatically in 1497 * On an #MC intercept the MCE handler is not called automatically in
1286 * the host. So do it by hand here. 1498 * the host. So do it by hand here.
@@ -1289,6 +1501,11 @@ static int mc_interception(struct vcpu_svm *svm)
1289 "int $0x12\n"); 1501 "int $0x12\n");
1290 /* not sure if we ever come back to this point */ 1502 /* not sure if we ever come back to this point */
1291 1503
1504 return;
1505}
1506
1507static int mc_interception(struct vcpu_svm *svm)
1508{
1292 return 1; 1509 return 1;
1293} 1510}
1294 1511
@@ -1309,29 +1526,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
1309 1526
1310static int io_interception(struct vcpu_svm *svm) 1527static int io_interception(struct vcpu_svm *svm)
1311{ 1528{
1529 struct kvm_vcpu *vcpu = &svm->vcpu;
1312 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1530 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1313 int size, in, string; 1531 int size, in, string;
1314 unsigned port; 1532 unsigned port;
1315 1533
1316 ++svm->vcpu.stat.io_exits; 1534 ++svm->vcpu.stat.io_exits;
1317
1318 svm->next_rip = svm->vmcb->control.exit_info_2;
1319
1320 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1535 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1321
1322 if (string) {
1323 if (emulate_instruction(&svm->vcpu,
1324 0, 0, 0) == EMULATE_DO_MMIO)
1325 return 0;
1326 return 1;
1327 }
1328
1329 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
1539
1330 port = io_info >> 16; 1540 port = io_info >> 16;
1331 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1332 1542 svm->next_rip = svm->vmcb->control.exit_info_2;
1333 skip_emulated_instruction(&svm->vcpu); 1543 skip_emulated_instruction(&svm->vcpu);
1334 return kvm_emulate_pio(&svm->vcpu, in, size, port); 1544
1545 return kvm_fast_pio_out(vcpu, size, port);
1335} 1546}
1336 1547
1337static int nmi_interception(struct vcpu_svm *svm) 1548static int nmi_interception(struct vcpu_svm *svm)
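
The reworked io_interception() above decodes everything from exit_info_1: the string/in bits send the access to the instruction emulator, otherwise the port number sits in bits 16..31 and the operand size is pulled out of the SVM_IOIO_SIZE field before handing off to kvm_fast_pio_out(). A sketch of that decode with the bit positions written out; the literal mask values follow the SVM architecture's IOIO exit-information layout and should be read as illustrative here, not as the kernel's defines.

#include <stdint.h>
#include <stdio.h>

/* Bit layout of EXITINFO1 for an IOIO intercept (illustrative values). */
#define IOIO_TYPE_IN    (1u << 0)       /* 1 = IN, 0 = OUT */
#define IOIO_STRING     (1u << 2)       /* INS/OUTS */
#define IOIO_SIZE_MASK  (7u << 4)       /* operand size in bytes */
#define IOIO_SIZE_SHIFT 4

int main(void)
{
        uint32_t io_info = 0x00e10010;  /* hypothetical: 1-byte OUT to port 0xe1 */

        unsigned in     = !!(io_info & IOIO_TYPE_IN);
        unsigned string = !!(io_info & IOIO_STRING);
        unsigned size   = (io_info & IOIO_SIZE_MASK) >> IOIO_SIZE_SHIFT;
        unsigned port   = io_info >> 16;

        printf("in=%u string=%u size=%u port=%#x\n", in, string, size, port);
        return 0;
}
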
@@ -1384,6 +1595,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1384static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1595static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1385 bool has_error_code, u32 error_code) 1596 bool has_error_code, u32 error_code)
1386{ 1597{
1598 int vmexit;
1599
1387 if (!is_nested(svm)) 1600 if (!is_nested(svm))
1388 return 0; 1601 return 0;
1389 1602
@@ -1392,21 +1605,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1392 svm->vmcb->control.exit_info_1 = error_code; 1605 svm->vmcb->control.exit_info_1 = error_code;
1393 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 1606 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1394 1607
1395 return nested_svm_exit_handled(svm); 1608 vmexit = nested_svm_intercept(svm);
1609 if (vmexit == NESTED_EXIT_DONE)
1610 svm->nested.exit_required = true;
1611
1612 return vmexit;
1396} 1613}
1397 1614
1398static inline int nested_svm_intr(struct vcpu_svm *svm) 1615/* This function returns true if it is safe to enable the irq window */
1616static inline bool nested_svm_intr(struct vcpu_svm *svm)
1399{ 1617{
1400 if (!is_nested(svm)) 1618 if (!is_nested(svm))
1401 return 0; 1619 return true;
1402 1620
1403 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1621 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1404 return 0; 1622 return true;
1405 1623
1406 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1624 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1407 return 0; 1625 return false;
1408 1626
1409 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1627 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1628 svm->vmcb->control.exit_info_1 = 0;
1629 svm->vmcb->control.exit_info_2 = 0;
1410 1630
1411 if (svm->nested.intercept & 1ULL) { 1631 if (svm->nested.intercept & 1ULL) {
1412 /* 1632 /*
@@ -1417,21 +1637,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1417 */ 1637 */
1418 svm->nested.exit_required = true; 1638 svm->nested.exit_required = true;
1419 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 1639 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1420 return 1; 1640 return false;
1421 } 1641 }
1422 1642
1423 return 0; 1643 return true;
1424} 1644}
1425 1645
1426static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) 1646/* This function returns true if it is safe to enable the nmi window */
1647static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1648{
1649 if (!is_nested(svm))
1650 return true;
1651
1652 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1653 return true;
1654
1655 svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1656 svm->nested.exit_required = true;
1657
1658 return false;
1659}
1660
1661static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1427{ 1662{
1428 struct page *page; 1663 struct page *page;
1429 1664
1665 might_sleep();
1666
1430 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1667 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1431 if (is_error_page(page)) 1668 if (is_error_page(page))
1432 goto error; 1669 goto error;
1433 1670
1434 return kmap_atomic(page, idx); 1671 *_page = page;
1672
1673 return kmap(page);
1435 1674
1436error: 1675error:
1437 kvm_release_page_clean(page); 1676 kvm_release_page_clean(page);
@@ -1440,61 +1679,55 @@ error:
1440 return NULL; 1679 return NULL;
1441} 1680}
1442 1681
1443static void nested_svm_unmap(void *addr, enum km_type idx) 1682static void nested_svm_unmap(struct page *page)
1444{ 1683{
1445 struct page *page; 1684 kunmap(page);
1685 kvm_release_page_dirty(page);
1686}
1446 1687
1447 if (!addr) 1688static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1448 return; 1689{
1690 unsigned port;
1691 u8 val, bit;
1692 u64 gpa;
1449 1693
1450 page = kmap_atomic_to_page(addr); 1694 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1695 return NESTED_EXIT_HOST;
1451 1696
1452 kunmap_atomic(addr, idx); 1697 port = svm->vmcb->control.exit_info_1 >> 16;
1453 kvm_release_page_dirty(page); 1698 gpa = svm->nested.vmcb_iopm + (port / 8);
1699 bit = port % 8;
1700 val = 0;
1701
1702 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1703 val &= (1 << bit);
1704
1705 return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1454} 1706}
1455 1707
1456static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1708static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1457{ 1709{
1458 u32 param = svm->vmcb->control.exit_info_1 & 1; 1710 u32 offset, msr, value;
1459 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1711 int write, mask;
1460 bool ret = false;
1461 u32 t0, t1;
1462 u8 *msrpm;
1463 1712
1464 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 1713 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1465 return false; 1714 return NESTED_EXIT_HOST;
1466 1715
1467 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1716 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1717 offset = svm_msrpm_offset(msr);
1718 write = svm->vmcb->control.exit_info_1 & 1;
1719 mask = 1 << ((2 * (msr & 0xf)) + write);
1468 1720
1469 if (!msrpm) 1721 if (offset == MSR_INVALID)
1470 goto out; 1722 return NESTED_EXIT_DONE;
1471 1723
1472 switch (msr) { 1724 /* Offset is in 32 bit units but need in 8 bit units */
1473 case 0 ... 0x1fff: 1725 offset *= 4;
1474 t0 = (msr * 2) % 8;
1475 t1 = msr / 8;
1476 break;
1477 case 0xc0000000 ... 0xc0001fff:
1478 t0 = (8192 + msr - 0xc0000000) * 2;
1479 t1 = (t0 / 8);
1480 t0 %= 8;
1481 break;
1482 case 0xc0010000 ... 0xc0011fff:
1483 t0 = (16384 + msr - 0xc0010000) * 2;
1484 t1 = (t0 / 8);
1485 t0 %= 8;
1486 break;
1487 default:
1488 ret = true;
1489 goto out;
1490 }
1491
1492 ret = msrpm[t1] & ((1 << param) << t0);
1493 1726
1494out: 1727 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1495 nested_svm_unmap(msrpm, KM_USER0); 1728 return NESTED_EXIT_DONE;
1496 1729
1497 return ret; 1730 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1498} 1731}
1499 1732
1500static int nested_svm_exit_special(struct vcpu_svm *svm) 1733static int nested_svm_exit_special(struct vcpu_svm *svm)
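
nested_svm_intercept_ioio() above consults the L1 guest's I/O permission map: one bit per port, so the byte fetched from vmcb_iopm is port/8 and the bit within it is port%8, and a set bit means the exit has to be reflected to L1 as a #VMEXIT. A minimal sketch of that lookup against an in-memory bitmap; a plain array stands in for kvm_read_guest(), and the map size here is only big enough for the 64K ports (the real IOPM allocation is somewhat larger).

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define IOPM_BYTES      (64 * 1024 / 8)         /* one bit per I/O port */

static bool iopm_intercepts(const uint8_t *iopm, unsigned port)
{
        return iopm[port / 8] & (1u << (port % 8));
}

int main(void)
{
        static uint8_t iopm[IOPM_BYTES];

        memset(iopm, 0, sizeof(iopm));
        iopm[0x60 / 8] |= 1u << (0x60 % 8);     /* L1 intercepts port 0x60 */

        printf("port 0x60 -> %s\n",
               iopm_intercepts(iopm, 0x60) ? "vmexit" : "host");
        printf("port 0x64 -> %s\n",
               iopm_intercepts(iopm, 0x64) ? "vmexit" : "host");
        return 0;
}
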
@@ -1504,17 +1737,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1504 switch (exit_code) { 1737 switch (exit_code) {
1505 case SVM_EXIT_INTR: 1738 case SVM_EXIT_INTR:
1506 case SVM_EXIT_NMI: 1739 case SVM_EXIT_NMI:
1740 case SVM_EXIT_EXCP_BASE + MC_VECTOR:
1507 return NESTED_EXIT_HOST; 1741 return NESTED_EXIT_HOST;
1508 /* For now we are always handling NPFs when using them */
1509 case SVM_EXIT_NPF: 1742 case SVM_EXIT_NPF:
1743 /* For now we are always handling NPFs when using them */
1510 if (npt_enabled) 1744 if (npt_enabled)
1511 return NESTED_EXIT_HOST; 1745 return NESTED_EXIT_HOST;
1512 break; 1746 break;
1513 /* When we're shadowing, trap PFs */
1514 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1747 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1748 /* When we're shadowing, trap PFs */
1515 if (!npt_enabled) 1749 if (!npt_enabled)
1516 return NESTED_EXIT_HOST; 1750 return NESTED_EXIT_HOST;
1517 break; 1751 break;
1752 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1753 nm_interception(svm);
1754 break;
1518 default: 1755 default:
1519 break; 1756 break;
1520 } 1757 }
@@ -1525,7 +1762,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1525/* 1762/*
1526 * If this function returns true, this #vmexit was already handled 1763 * If this function returns true, this #vmexit was already handled
1527 */ 1764 */
1528static int nested_svm_exit_handled(struct vcpu_svm *svm) 1765static int nested_svm_intercept(struct vcpu_svm *svm)
1529{ 1766{
1530 u32 exit_code = svm->vmcb->control.exit_code; 1767 u32 exit_code = svm->vmcb->control.exit_code;
1531 int vmexit = NESTED_EXIT_HOST; 1768 int vmexit = NESTED_EXIT_HOST;
@@ -1534,6 +1771,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1534 case SVM_EXIT_MSR: 1771 case SVM_EXIT_MSR:
1535 vmexit = nested_svm_exit_handled_msr(svm); 1772 vmexit = nested_svm_exit_handled_msr(svm);
1536 break; 1773 break;
1774 case SVM_EXIT_IOIO:
1775 vmexit = nested_svm_intercept_ioio(svm);
1776 break;
1537 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1777 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1538 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1778 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1539 if (svm->nested.intercept_cr_read & cr_bits) 1779 if (svm->nested.intercept_cr_read & cr_bits)
@@ -1564,6 +1804,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1564 vmexit = NESTED_EXIT_DONE; 1804 vmexit = NESTED_EXIT_DONE;
1565 break; 1805 break;
1566 } 1806 }
1807 case SVM_EXIT_ERR: {
1808 vmexit = NESTED_EXIT_DONE;
1809 break;
1810 }
1567 default: { 1811 default: {
1568 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1812 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1569 if (svm->nested.intercept & exit_bits) 1813 if (svm->nested.intercept & exit_bits)
@@ -1571,9 +1815,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1571 } 1815 }
1572 } 1816 }
1573 1817
1574 if (vmexit == NESTED_EXIT_DONE) { 1818 return vmexit;
1819}
1820
1821static int nested_svm_exit_handled(struct vcpu_svm *svm)
1822{
1823 int vmexit;
1824
1825 vmexit = nested_svm_intercept(svm);
1826
1827 if (vmexit == NESTED_EXIT_DONE)
1575 nested_svm_vmexit(svm); 1828 nested_svm_vmexit(svm);
1576 }
1577 1829
1578 return vmexit; 1830 return vmexit;
1579} 1831}
@@ -1615,6 +1867,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1615 struct vmcb *nested_vmcb; 1867 struct vmcb *nested_vmcb;
1616 struct vmcb *hsave = svm->nested.hsave; 1868 struct vmcb *hsave = svm->nested.hsave;
1617 struct vmcb *vmcb = svm->vmcb; 1869 struct vmcb *vmcb = svm->vmcb;
1870 struct page *page;
1618 1871
1619 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 1872 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1620 vmcb->control.exit_info_1, 1873 vmcb->control.exit_info_1,
@@ -1622,10 +1875,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1622 vmcb->control.exit_int_info, 1875 vmcb->control.exit_int_info,
1623 vmcb->control.exit_int_info_err); 1876 vmcb->control.exit_int_info_err);
1624 1877
1625 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1878 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
1626 if (!nested_vmcb) 1879 if (!nested_vmcb)
1627 return 1; 1880 return 1;
1628 1881
1882 /* Exit nested SVM mode */
1883 svm->nested.vmcb = 0;
1884
1629 /* Give the current vmcb to the guest */ 1885 /* Give the current vmcb to the guest */
1630 disable_gif(svm); 1886 disable_gif(svm);
1631 1887
@@ -1635,9 +1891,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1635 nested_vmcb->save.ds = vmcb->save.ds; 1891 nested_vmcb->save.ds = vmcb->save.ds;
1636 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1892 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1637 nested_vmcb->save.idtr = vmcb->save.idtr; 1893 nested_vmcb->save.idtr = vmcb->save.idtr;
1638 if (npt_enabled) 1894 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1639 nested_vmcb->save.cr3 = vmcb->save.cr3; 1895 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1640 nested_vmcb->save.cr2 = vmcb->save.cr2; 1896 nested_vmcb->save.cr2 = vmcb->save.cr2;
1897 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1641 nested_vmcb->save.rflags = vmcb->save.rflags; 1898 nested_vmcb->save.rflags = vmcb->save.rflags;
1642 nested_vmcb->save.rip = vmcb->save.rip; 1899 nested_vmcb->save.rip = vmcb->save.rip;
1643 nested_vmcb->save.rsp = vmcb->save.rsp; 1900 nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1709,10 +1966,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1709 svm->vmcb->save.cpl = 0; 1966 svm->vmcb->save.cpl = 0;
1710 svm->vmcb->control.exit_int_info = 0; 1967 svm->vmcb->control.exit_int_info = 0;
1711 1968
1712 /* Exit nested SVM mode */ 1969 nested_svm_unmap(page);
1713 svm->nested.vmcb = 0;
1714
1715 nested_svm_unmap(nested_vmcb, KM_USER0);
1716 1970
1717 kvm_mmu_reset_context(&svm->vcpu); 1971 kvm_mmu_reset_context(&svm->vcpu);
1718 kvm_mmu_load(&svm->vcpu); 1972 kvm_mmu_load(&svm->vcpu);
@@ -1722,19 +1976,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1722 1976
1723static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 1977static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1724{ 1978{
1725 u32 *nested_msrpm; 1979 /*
1980 * This function merges the msr permission bitmaps of kvm and the
                                                                1981 * nested vmcb. It is optimized in that it only merges the parts where
1982 * the kvm msr permission bitmap may contain zero bits
1983 */
1726 int i; 1984 int i;
1727 1985
1728 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1986 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1729 if (!nested_msrpm) 1987 return true;
1730 return false;
1731 1988
1732 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1989 for (i = 0; i < MSRPM_OFFSETS; i++) {
1733 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1990 u32 value, p;
1991 u64 offset;
1734 1992
1735 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 1993 if (msrpm_offsets[i] == 0xffffffff)
1994 break;
1995
1996 p = msrpm_offsets[i];
1997 offset = svm->nested.vmcb_msrpm + (p * 4);
1736 1998
1737 nested_svm_unmap(nested_msrpm, KM_USER0); 1999 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
2000 return false;
2001
2002 svm->nested.msrpm[p] = svm->msrpm[p] | value;
2003 }
2004
2005 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1738 2006
1739 return true; 2007 return true;
1740} 2008}
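
The rewritten nested_svm_vmrun_msrpm() above no longer maps and ORs the whole nested MSR permission bitmap; it walks a table of word offsets (terminated by 0xffffffff) at which KVM's own bitmap can contain zero bits, reads 4 bytes of the guest's bitmap at each offset via kvm_read_guest(), and ORs them into the merged copy. Below is a minimal user-space sketch of that sparse merge; the contents of msrpm_offsets, the read_guest_u32() helper and the table sizes are illustrative stand-ins, not values from the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MSRPM_U32S     2048        /* 8 KB bitmap = 2048 u32 words (assumed) */
#define MSRPM_OFFSETS  16          /* size of the offset table in this sketch */

static uint32_t host_msrpm[MSRPM_U32S];    /* KVM's own permission bitmap   */
static uint32_t guest_msrpm[MSRPM_U32S];   /* stands in for guest memory    */
static uint32_t merged_msrpm[MSRPM_U32S];  /* bitmap handed to the hardware */

/* word offsets where host_msrpm may contain zero bits; 0xffffffff ends the list */
static uint32_t msrpm_offsets[MSRPM_OFFSETS] = { 0, 1, 192, 0xffffffff };

/* stand-in for kvm_read_guest(): fetch one u32 of the nested bitmap */
static int read_guest_u32(uint64_t offset_bytes, uint32_t *value)
{
        if (offset_bytes + 4 > sizeof(guest_msrpm))
                return -1;
        memcpy(value, (char *)guest_msrpm + offset_bytes, 4);
        return 0;
}

static int merge_msrpm(void)
{
        int i;

        memcpy(merged_msrpm, host_msrpm, sizeof(merged_msrpm));

        for (i = 0; i < MSRPM_OFFSETS; i++) {
                uint32_t value, p = msrpm_offsets[i];

                if (p == 0xffffffff)
                        break;                           /* end of the sparse list */
                if (read_guest_u32((uint64_t)p * 4, &value))
                        return -1;                       /* guest bitmap unreadable */
                merged_msrpm[p] = host_msrpm[p] | value; /* intercept if either side does */
        }
        return 0;
}

int main(void)
{
        guest_msrpm[1] = 0x3;                    /* guest intercepts some MSR pair */
        if (merge_msrpm() == 0)
                printf("merged word 1 = %#x\n", (unsigned)merged_msrpm[1]);
        return 0;
}
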
@@ -1744,26 +2012,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1744 struct vmcb *nested_vmcb; 2012 struct vmcb *nested_vmcb;
1745 struct vmcb *hsave = svm->nested.hsave; 2013 struct vmcb *hsave = svm->nested.hsave;
1746 struct vmcb *vmcb = svm->vmcb; 2014 struct vmcb *vmcb = svm->vmcb;
2015 struct page *page;
2016 u64 vmcb_gpa;
2017
2018 vmcb_gpa = svm->vmcb->save.rax;
1747 2019
1748 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2020 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1749 if (!nested_vmcb) 2021 if (!nested_vmcb)
1750 return false; 2022 return false;
1751 2023
1752 /* nested_vmcb is our indicator if nested SVM is activated */ 2024 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
1753 svm->nested.vmcb = svm->vmcb->save.rax;
1754
1755 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1756 nested_vmcb->save.rip, 2025 nested_vmcb->save.rip,
1757 nested_vmcb->control.int_ctl, 2026 nested_vmcb->control.int_ctl,
1758 nested_vmcb->control.event_inj, 2027 nested_vmcb->control.event_inj,
1759 nested_vmcb->control.nested_ctl); 2028 nested_vmcb->control.nested_ctl);
1760 2029
2030 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
2031 nested_vmcb->control.intercept_cr_write,
2032 nested_vmcb->control.intercept_exceptions,
2033 nested_vmcb->control.intercept);
2034
1761 /* Clear internal status */ 2035 /* Clear internal status */
1762 kvm_clear_exception_queue(&svm->vcpu); 2036 kvm_clear_exception_queue(&svm->vcpu);
1763 kvm_clear_interrupt_queue(&svm->vcpu); 2037 kvm_clear_interrupt_queue(&svm->vcpu);
1764 2038
1765 /* Save the old vmcb, so we don't need to pick what we save, but 2039 /*
1766 can restore everything when a VMEXIT occurs */ 2040 * Save the old vmcb, so we don't need to pick what we save, but can
2041 * restore everything when a VMEXIT occurs
2042 */
1767 hsave->save.es = vmcb->save.es; 2043 hsave->save.es = vmcb->save.es;
1768 hsave->save.cs = vmcb->save.cs; 2044 hsave->save.cs = vmcb->save.cs;
1769 hsave->save.ss = vmcb->save.ss; 2045 hsave->save.ss = vmcb->save.ss;
@@ -1803,14 +2079,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1803 if (npt_enabled) { 2079 if (npt_enabled) {
1804 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1805 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1806 } else { 2082 } else
1807 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1808 kvm_mmu_reset_context(&svm->vcpu); 2084
1809 } 2085 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu);
2087
1810 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 2088 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1811 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 2089 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1812 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 2090 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1813 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 2091 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2092
1814 /* In case we don't even reach vcpu_run, the fields are not updated */ 2093 /* In case we don't even reach vcpu_run, the fields are not updated */
1815 svm->vmcb->save.rax = nested_vmcb->save.rax; 2094 svm->vmcb->save.rax = nested_vmcb->save.rax;
1816 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 2095 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1819,22 +2098,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1819 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 2098 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1820 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 2099 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1821 2100
1822 /* We don't want a nested guest to be more powerful than the guest, 2101 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
1823 so all intercepts are ORed */ 2102 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
1824 svm->vmcb->control.intercept_cr_read |=
1825 nested_vmcb->control.intercept_cr_read;
1826 svm->vmcb->control.intercept_cr_write |=
1827 nested_vmcb->control.intercept_cr_write;
1828 svm->vmcb->control.intercept_dr_read |=
1829 nested_vmcb->control.intercept_dr_read;
1830 svm->vmcb->control.intercept_dr_write |=
1831 nested_vmcb->control.intercept_dr_write;
1832 svm->vmcb->control.intercept_exceptions |=
1833 nested_vmcb->control.intercept_exceptions;
1834
1835 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1836
1837 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1838 2103
1839 /* cache intercepts */ 2104 /* cache intercepts */
1840 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2105 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1851,13 +2116,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1851 else 2116 else
1852 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 2117 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1853 2118
2119 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2120 /* We only want the cr8 intercept bits of the guest */
2121 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
2122 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2123 }
2124
2125 /* We don't want to see VMMCALLs from a nested guest */
2126 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
2127
2128 /*
2129 * We don't want a nested guest to be more powerful than the guest, so
2130 * all intercepts are ORed
2131 */
2132 svm->vmcb->control.intercept_cr_read |=
2133 nested_vmcb->control.intercept_cr_read;
2134 svm->vmcb->control.intercept_cr_write |=
2135 nested_vmcb->control.intercept_cr_write;
2136 svm->vmcb->control.intercept_dr_read |=
2137 nested_vmcb->control.intercept_dr_read;
2138 svm->vmcb->control.intercept_dr_write |=
2139 nested_vmcb->control.intercept_dr_write;
2140 svm->vmcb->control.intercept_exceptions |=
2141 nested_vmcb->control.intercept_exceptions;
2142
2143 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2144
2145 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
1854 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2146 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1855 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 2147 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1856 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 2148 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1857 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 2149 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1858 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 2150 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1859 2151
1860 nested_svm_unmap(nested_vmcb, KM_USER0); 2152 nested_svm_unmap(page);
2153
2154 /* nested_vmcb is our indicator if nested SVM is activated */
2155 svm->nested.vmcb = vmcb_gpa;
1861 2156
1862 enable_gif(svm); 2157 enable_gif(svm);
1863 2158
@@ -1883,6 +2178,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1883static int vmload_interception(struct vcpu_svm *svm) 2178static int vmload_interception(struct vcpu_svm *svm)
1884{ 2179{
1885 struct vmcb *nested_vmcb; 2180 struct vmcb *nested_vmcb;
2181 struct page *page;
1886 2182
1887 if (nested_svm_check_permissions(svm)) 2183 if (nested_svm_check_permissions(svm))
1888 return 1; 2184 return 1;
@@ -1890,12 +2186,12 @@ static int vmload_interception(struct vcpu_svm *svm)
1890 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2186 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1891 skip_emulated_instruction(&svm->vcpu); 2187 skip_emulated_instruction(&svm->vcpu);
1892 2188
1893 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2189 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1894 if (!nested_vmcb) 2190 if (!nested_vmcb)
1895 return 1; 2191 return 1;
1896 2192
1897 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2193 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1898 nested_svm_unmap(nested_vmcb, KM_USER0); 2194 nested_svm_unmap(page);
1899 2195
1900 return 1; 2196 return 1;
1901} 2197}
@@ -1903,6 +2199,7 @@ static int vmload_interception(struct vcpu_svm *svm)
1903static int vmsave_interception(struct vcpu_svm *svm) 2199static int vmsave_interception(struct vcpu_svm *svm)
1904{ 2200{
1905 struct vmcb *nested_vmcb; 2201 struct vmcb *nested_vmcb;
2202 struct page *page;
1906 2203
1907 if (nested_svm_check_permissions(svm)) 2204 if (nested_svm_check_permissions(svm))
1908 return 1; 2205 return 1;
@@ -1910,12 +2207,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
1910 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2207 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1911 skip_emulated_instruction(&svm->vcpu); 2208 skip_emulated_instruction(&svm->vcpu);
1912 2209
1913 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2210 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1914 if (!nested_vmcb) 2211 if (!nested_vmcb)
1915 return 1; 2212 return 1;
1916 2213
1917 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2214 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1918 nested_svm_unmap(nested_vmcb, KM_USER0); 2215 nested_svm_unmap(page);
1919 2216
1920 return 1; 2217 return 1;
1921} 2218}
@@ -2018,6 +2315,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
2018 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2315 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2019 uint32_t idt_v = 2316 uint32_t idt_v =
2020 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2317 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2318 bool has_error_code = false;
2319 u32 error_code = 0;
2021 2320
2022 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2321 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2023 2322
@@ -2038,6 +2337,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
2038 svm->vcpu.arch.nmi_injected = false; 2337 svm->vcpu.arch.nmi_injected = false;
2039 break; 2338 break;
2040 case SVM_EXITINTINFO_TYPE_EXEPT: 2339 case SVM_EXITINTINFO_TYPE_EXEPT:
2340 if (svm->vmcb->control.exit_info_2 &
2341 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2342 has_error_code = true;
2343 error_code =
2344 (u32)svm->vmcb->control.exit_info_2;
2345 }
2041 kvm_clear_exception_queue(&svm->vcpu); 2346 kvm_clear_exception_queue(&svm->vcpu);
2042 break; 2347 break;
2043 case SVM_EXITINTINFO_TYPE_INTR: 2348 case SVM_EXITINTINFO_TYPE_INTR:
@@ -2054,7 +2359,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
2054 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2359 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2055 skip_emulated_instruction(&svm->vcpu); 2360 skip_emulated_instruction(&svm->vcpu);
2056 2361
2057 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2362 if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2363 has_error_code, error_code) == EMULATE_FAIL) {
2364 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2365 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2366 svm->vcpu.run->internal.ndata = 0;
2367 return 0;
2368 }
2369 return 1;
2058} 2370}
2059 2371
2060static int cpuid_interception(struct vcpu_svm *svm) 2372static int cpuid_interception(struct vcpu_svm *svm)
@@ -2145,9 +2457,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2145 case MSR_IA32_SYSENTER_ESP: 2457 case MSR_IA32_SYSENTER_ESP:
2146 *data = svm->sysenter_esp; 2458 *data = svm->sysenter_esp;
2147 break; 2459 break;
2148 /* Nobody will change the following 5 values in the VMCB so 2460 /*
2149 we can safely return them on rdmsr. They will always be 0 2461 * Nobody will change the following 5 values in the VMCB so we can
2150 until LBRV is implemented. */ 2462 * safely return them on rdmsr. They will always be 0 until LBRV is
2463 * implemented.
2464 */
2151 case MSR_IA32_DEBUGCTLMSR: 2465 case MSR_IA32_DEBUGCTLMSR:
2152 *data = svm->vmcb->save.dbgctl; 2466 *data = svm->vmcb->save.dbgctl;
2153 break; 2467 break;
@@ -2167,7 +2481,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2167 *data = svm->nested.hsave_msr; 2481 *data = svm->nested.hsave_msr;
2168 break; 2482 break;
2169 case MSR_VM_CR: 2483 case MSR_VM_CR:
2170 *data = 0; 2484 *data = svm->nested.vm_cr_msr;
2171 break; 2485 break;
2172 case MSR_IA32_UCODE_REV: 2486 case MSR_IA32_UCODE_REV:
2173 *data = 0x01000065; 2487 *data = 0x01000065;
@@ -2197,6 +2511,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2197 return 1; 2511 return 1;
2198} 2512}
2199 2513
2514static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2515{
2516 struct vcpu_svm *svm = to_svm(vcpu);
2517 int svm_dis, chg_mask;
2518
2519 if (data & ~SVM_VM_CR_VALID_MASK)
2520 return 1;
2521
2522 chg_mask = SVM_VM_CR_VALID_MASK;
2523
2524 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2525 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2526
2527 svm->nested.vm_cr_msr &= ~chg_mask;
2528 svm->nested.vm_cr_msr |= (data & chg_mask);
2529
2530 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2531
2532 /* check for svm_disable while efer.svme is set */
2533 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2534 return 1;
2535
2536 return 0;
2537}
2538
2200static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 2539static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2201{ 2540{
2202 struct vcpu_svm *svm = to_svm(vcpu); 2541 struct vcpu_svm *svm = to_svm(vcpu);
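
The new svm_set_vm_cr() above rejects writes that touch bits outside SVM_VM_CR_VALID_MASK, freezes the LOCK and SVMDIS bits once the lock bit is set, and refuses a write that would leave SVMDIS set while EFER.SVME is on. A stand-alone sketch of that change-mask logic follows; the bit positions and the failure return convention are assumptions made for the example, the real masks live in <asm/svm.h>.

#include <stdint.h>
#include <stdio.h>

/* illustrative bit layout; the real masks come from <asm/svm.h> */
#define VM_CR_DPD        (1u << 0)
#define VM_CR_SVM_LOCK   (1u << 3)
#define VM_CR_SVM_DIS    (1u << 4)
#define VM_CR_VALID_MASK (VM_CR_DPD | VM_CR_SVM_LOCK | VM_CR_SVM_DIS)

#define EFER_SVME        (1u << 12)

static uint32_t vm_cr;                  /* emulated MSR value       */
static uint64_t efer = EFER_SVME;       /* guest EFER with SVME set */

/* returns 0 on success, 1 to signal a #GP-style failure to the caller */
static int set_vm_cr(uint64_t data)
{
        uint32_t chg_mask = VM_CR_VALID_MASK;

        if (data & ~(uint64_t)VM_CR_VALID_MASK)
                return 1;               /* reserved bits set */

        /* once locked, the LOCK and SVMDIS bits become read-only */
        if (vm_cr & VM_CR_SVM_LOCK)
                chg_mask &= ~(VM_CR_SVM_LOCK | VM_CR_SVM_DIS);

        vm_cr = (vm_cr & ~chg_mask) | ((uint32_t)data & chg_mask);

        /* disabling SVM while EFER.SVME is set is refused */
        if ((vm_cr & VM_CR_SVM_DIS) && (efer & EFER_SVME))
                return 1;

        return 0;
}

int main(void)
{
        printf("write SVM_DIS while SVME is set -> %d\n", set_vm_cr(VM_CR_SVM_DIS));
        return 0;
}
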
@@ -2263,6 +2602,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2263 svm->nested.hsave_msr = data; 2602 svm->nested.hsave_msr = data;
2264 break; 2603 break;
2265 case MSR_VM_CR: 2604 case MSR_VM_CR:
2605 return svm_set_vm_cr(vcpu, data);
2266 case MSR_VM_IGNNE: 2606 case MSR_VM_IGNNE:
2267 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2607 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2268 break; 2608 break;
@@ -2326,16 +2666,16 @@ static int pause_interception(struct vcpu_svm *svm)
2326} 2666}
2327 2667
2328static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 2668static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2329 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2669 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2330 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2670 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2331 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2671 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2332 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2672 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2673 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2674 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2675 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2676 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2337 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2677 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2338 [SVM_EXIT_READ_DR0] = emulate_on_interception, 2678 [SVM_EXIT_READ_DR0] = emulate_on_interception,
2339 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2679 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2340 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2680 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2341 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2681 [SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2354,15 +2694,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2694 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2695 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2356 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2696 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2357 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2697 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2358 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2698 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
2359 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2699 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2360 [SVM_EXIT_INTR] = intr_interception, 2700 [SVM_EXIT_INTR] = intr_interception,
2361 [SVM_EXIT_NMI] = nmi_interception, 2701 [SVM_EXIT_NMI] = nmi_interception,
2362 [SVM_EXIT_SMI] = nop_on_interception, 2702 [SVM_EXIT_SMI] = nop_on_interception,
2363 [SVM_EXIT_INIT] = nop_on_interception, 2703 [SVM_EXIT_INIT] = nop_on_interception,
2364 [SVM_EXIT_VINTR] = interrupt_window_interception, 2704 [SVM_EXIT_VINTR] = interrupt_window_interception,
2365 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2366 [SVM_EXIT_CPUID] = cpuid_interception, 2705 [SVM_EXIT_CPUID] = cpuid_interception,
2367 [SVM_EXIT_IRET] = iret_interception, 2706 [SVM_EXIT_IRET] = iret_interception,
2368 [SVM_EXIT_INVD] = emulate_on_interception, 2707 [SVM_EXIT_INVD] = emulate_on_interception,
@@ -2370,7 +2709,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2370 [SVM_EXIT_HLT] = halt_interception, 2709 [SVM_EXIT_HLT] = halt_interception,
2371 [SVM_EXIT_INVLPG] = invlpg_interception, 2710 [SVM_EXIT_INVLPG] = invlpg_interception,
2372 [SVM_EXIT_INVLPGA] = invlpga_interception, 2711 [SVM_EXIT_INVLPGA] = invlpga_interception,
2373 [SVM_EXIT_IOIO] = io_interception, 2712 [SVM_EXIT_IOIO] = io_interception,
2374 [SVM_EXIT_MSR] = msr_interception, 2713 [SVM_EXIT_MSR] = msr_interception,
2375 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2714 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2376 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2715 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2393,7 +2732,12 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2393 struct kvm_run *kvm_run = vcpu->run; 2732 struct kvm_run *kvm_run = vcpu->run;
2394 u32 exit_code = svm->vmcb->control.exit_code; 2733 u32 exit_code = svm->vmcb->control.exit_code;
2395 2734
2396 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2735 trace_kvm_exit(exit_code, vcpu);
2736
2737 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2738 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2739 if (npt_enabled)
2740 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2397 2741
2398 if (unlikely(svm->nested.exit_required)) { 2742 if (unlikely(svm->nested.exit_required)) {
2399 nested_svm_vmexit(svm); 2743 nested_svm_vmexit(svm);
@@ -2422,11 +2766,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2422 2766
2423 svm_complete_interrupts(svm); 2767 svm_complete_interrupts(svm);
2424 2768
2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2429
2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2769 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2432 kvm_run->fail_entry.hardware_entry_failure_reason 2771 kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2511,6 +2850,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2511{ 2850{
2512 struct vcpu_svm *svm = to_svm(vcpu); 2851 struct vcpu_svm *svm = to_svm(vcpu);
2513 2852
2853 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2854 return;
2855
2514 if (irr == -1) 2856 if (irr == -1)
2515 return; 2857 return;
2516 2858
@@ -2522,8 +2864,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2522{ 2864{
2523 struct vcpu_svm *svm = to_svm(vcpu); 2865 struct vcpu_svm *svm = to_svm(vcpu);
2524 struct vmcb *vmcb = svm->vmcb; 2866 struct vmcb *vmcb = svm->vmcb;
2525 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2867 int ret;
2526 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2868 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2869 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2870 ret = ret && gif_set(svm) && nested_svm_nmi(svm);
2871
2872 return ret;
2527} 2873}
2528 2874
2529static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 2875static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2568,13 +2914,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2568{ 2914{
2569 struct vcpu_svm *svm = to_svm(vcpu); 2915 struct vcpu_svm *svm = to_svm(vcpu);
2570 2916
2571 nested_svm_intr(svm); 2917 /*
2572 2918 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
2573 /* In case GIF=0 we can't rely on the CPU to tell us when 2919 * 1, because that's a separate STGI/VMRUN intercept. The next time we
2574 * GIF becomes 1, because that's a separate STGI/VMRUN intercept. 2920 * get that intercept, this function will be called again though and
2575 * The next time we get that intercept, this function will be 2921 * we'll get the vintr intercept.
2576 * called again though and we'll get the vintr intercept. */ 2922 */
2577 if (gif_set(svm)) { 2923 if (gif_set(svm) && nested_svm_intr(svm)) {
2578 svm_set_vintr(svm); 2924 svm_set_vintr(svm);
2579 svm_inject_irq(svm, 0x0); 2925 svm_inject_irq(svm, 0x0);
2580 } 2926 }
@@ -2588,9 +2934,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2588 == HF_NMI_MASK) 2934 == HF_NMI_MASK)
2589 return; /* IRET will cause a vm exit */ 2935 return; /* IRET will cause a vm exit */
2590 2936
2591 /* Something prevents NMI from been injected. Single step over 2937 /*
2592 possible problem (IRET or exception injection or interrupt 2938 * Something prevents NMI from being injected. Single step over possible
2593 shadow) */ 2939 * problem (IRET or exception injection or interrupt shadow)
2940 */
2594 svm->nmi_singlestep = true; 2941 svm->nmi_singlestep = true;
2595 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2942 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2596 update_db_intercept(vcpu); 2943 update_db_intercept(vcpu);
@@ -2614,6 +2961,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2614{ 2961{
2615 struct vcpu_svm *svm = to_svm(vcpu); 2962 struct vcpu_svm *svm = to_svm(vcpu);
2616 2963
2964 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2965 return;
2966
2617 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2967 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2618 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2968 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2619 kvm_set_cr8(vcpu, cr8); 2969 kvm_set_cr8(vcpu, cr8);
@@ -2625,6 +2975,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2625 struct vcpu_svm *svm = to_svm(vcpu); 2975 struct vcpu_svm *svm = to_svm(vcpu);
2626 u64 cr8; 2976 u64 cr8;
2627 2977
2978 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2979 return;
2980
2628 cr8 = kvm_get_cr8(vcpu); 2981 cr8 = kvm_get_cr8(vcpu);
2629 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2982 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2630 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2983 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2635,6 +2988,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2635 u8 vector; 2988 u8 vector;
2636 int type; 2989 int type;
2637 u32 exitintinfo = svm->vmcb->control.exit_int_info; 2990 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2991 unsigned int3_injected = svm->int3_injected;
2992
2993 svm->int3_injected = 0;
2638 2994
2639 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 2995 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2640 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 2996 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2654,18 +3010,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2654 svm->vcpu.arch.nmi_injected = true; 3010 svm->vcpu.arch.nmi_injected = true;
2655 break; 3011 break;
2656 case SVM_EXITINTINFO_TYPE_EXEPT: 3012 case SVM_EXITINTINFO_TYPE_EXEPT:
2657 /* In case of software exception do not reinject an exception 3013 /*
2658 vector, but re-execute and instruction instead */ 3014 * In case of software exceptions, do not reinject the vector,
2659 if (is_nested(svm)) 3015 * but re-execute the instruction instead. Rewind RIP first
2660 break; 3016 * if we emulated INT3 before.
2661 if (kvm_exception_is_soft(vector)) 3017 */
3018 if (kvm_exception_is_soft(vector)) {
3019 if (vector == BP_VECTOR && int3_injected &&
3020 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3021 kvm_rip_write(&svm->vcpu,
3022 kvm_rip_read(&svm->vcpu) -
3023 int3_injected);
2662 break; 3024 break;
3025 }
2663 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 3026 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2664 u32 err = svm->vmcb->control.exit_int_info_err; 3027 u32 err = svm->vmcb->control.exit_int_info_err;
2665 kvm_queue_exception_e(&svm->vcpu, vector, err); 3028 kvm_requeue_exception_e(&svm->vcpu, vector, err);
2666 3029
2667 } else 3030 } else
2668 kvm_queue_exception(&svm->vcpu, vector); 3031 kvm_requeue_exception(&svm->vcpu, vector);
2669 break; 3032 break;
2670 case SVM_EXITINTINFO_TYPE_INTR: 3033 case SVM_EXITINTINFO_TYPE_INTR:
2671 kvm_queue_interrupt(&svm->vcpu, vector, false); 3034 kvm_queue_interrupt(&svm->vcpu, vector, false);
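
The reinjection path above records how many bytes of INT3 were emulated (svm->int3_injected) and, when a software #BP comes back undelivered, rewinds RIP by that length, provided the guest is still at the recorded linear RIP, so the breakpoint is re-executed instead of re-injected. Below is a reduced sketch of that decision, with the vcpu state flattened to plain fields and a flat code segment assumed in place of kvm_is_linear_rip().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BP_VECTOR 3

struct vcpu {
        uint64_t rip;            /* current guest RIP                      */
        uint64_t int3_rip;       /* linear RIP recorded right after the
                                    injected INT3 (flat cs.base assumed)   */
        unsigned int3_injected;  /* length of the emulated INT3, 0 if none */
};

static bool is_soft_exception(unsigned vector)
{
        return vector == BP_VECTOR;     /* #BP is the case handled here */
}

/* called on vmexit when exit_int_info reports an undelivered exception */
static void complete_soft_exception(struct vcpu *v, unsigned vector)
{
        unsigned len = v->int3_injected;

        v->int3_injected = 0;

        if (!is_soft_exception(vector))
                return;                 /* hard exceptions would be requeued */

        /* only rewind if the guest did not move RIP behind our back */
        if (vector == BP_VECTOR && len && v->rip == v->int3_rip)
                v->rip -= len;          /* re-execute the INT3 on the next entry */
}

int main(void)
{
        struct vcpu v = { .rip = 0x1003, .int3_rip = 0x1003, .int3_injected = 1 };

        complete_soft_exception(&v, BP_VECTOR);
        printf("rip after rewind: %#llx\n", (unsigned long long)v.rip);
        return 0;
}
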
@@ -2688,6 +3051,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2688 u16 gs_selector; 3051 u16 gs_selector;
2689 u16 ldt_selector; 3052 u16 ldt_selector;
2690 3053
3054 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3055 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3056 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3057
2691 /* 3058 /*
2692 * A vmexit emulation is required before the vcpu can be executed 3059 * A vmexit emulation is required before the vcpu can be executed
2693 * again. 3060 * again.
@@ -2695,10 +3062,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2695 if (unlikely(svm->nested.exit_required)) 3062 if (unlikely(svm->nested.exit_required))
2696 return; 3063 return;
2697 3064
2698 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2699 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2700 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2701
2702 pre_svm_run(svm); 3065 pre_svm_run(svm);
2703 3066
2704 sync_lapic_to_cr8(vcpu); 3067 sync_lapic_to_cr8(vcpu);
@@ -2811,6 +3174,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2811 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3174 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
2812 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3175 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
2813 } 3176 }
3177
3178 /*
3179 * We need to handle MC intercepts here before the vcpu has a chance to
3180 * change the physical cpu
3181 */
3182 if (unlikely(svm->vmcb->control.exit_code ==
3183 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3184 svm_handle_mce(svm);
2814} 3185}
2815 3186
2816#undef R 3187#undef R
@@ -2879,25 +3250,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{ 3250{
2880} 3251}
2881 3252
3253static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3254{
3255 switch (func) {
3256 case 0x8000000A:
3257 entry->eax = 1; /* SVM revision 1 */
3258 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
3259 ASID emulation to nested SVM */
3260 entry->ecx = 0; /* Reserved */
3261 entry->edx = 0; /* Do not support any additional features */
3262
3263 break;
3264 }
3265}
3266
2882static const struct trace_print_flags svm_exit_reasons_str[] = { 3267static const struct trace_print_flags svm_exit_reasons_str[] = {
2883 { SVM_EXIT_READ_CR0, "read_cr0" }, 3268 { SVM_EXIT_READ_CR0, "read_cr0" },
2884 { SVM_EXIT_READ_CR3, "read_cr3" }, 3269 { SVM_EXIT_READ_CR3, "read_cr3" },
2885 { SVM_EXIT_READ_CR4, "read_cr4" }, 3270 { SVM_EXIT_READ_CR4, "read_cr4" },
2886 { SVM_EXIT_READ_CR8, "read_cr8" }, 3271 { SVM_EXIT_READ_CR8, "read_cr8" },
2887 { SVM_EXIT_WRITE_CR0, "write_cr0" }, 3272 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2888 { SVM_EXIT_WRITE_CR3, "write_cr3" }, 3273 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2889 { SVM_EXIT_WRITE_CR4, "write_cr4" }, 3274 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2890 { SVM_EXIT_WRITE_CR8, "write_cr8" }, 3275 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2891 { SVM_EXIT_READ_DR0, "read_dr0" }, 3276 { SVM_EXIT_READ_DR0, "read_dr0" },
2892 { SVM_EXIT_READ_DR1, "read_dr1" }, 3277 { SVM_EXIT_READ_DR1, "read_dr1" },
2893 { SVM_EXIT_READ_DR2, "read_dr2" }, 3278 { SVM_EXIT_READ_DR2, "read_dr2" },
2894 { SVM_EXIT_READ_DR3, "read_dr3" }, 3279 { SVM_EXIT_READ_DR3, "read_dr3" },
2895 { SVM_EXIT_WRITE_DR0, "write_dr0" }, 3280 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2896 { SVM_EXIT_WRITE_DR1, "write_dr1" }, 3281 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2897 { SVM_EXIT_WRITE_DR2, "write_dr2" }, 3282 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2898 { SVM_EXIT_WRITE_DR3, "write_dr3" }, 3283 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2899 { SVM_EXIT_WRITE_DR5, "write_dr5" }, 3284 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2900 { SVM_EXIT_WRITE_DR7, "write_dr7" }, 3285 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2901 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, 3286 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2902 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, 3287 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2903 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, 3288 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
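
The new svm_set_supported_cpuid() hook above advertises CPUID leaf 0x8000000A to the guest as SVM revision 1 with 8 ASIDs and no optional features. For comparison, the same leaf can be queried directly on an AMD host with GCC/Clang's <cpuid.h>; this prints whatever the hardware reports, not what KVM exposes.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* leaf 0x8000000A: SVM revision and feature bits (AMD only) */
        if (!__get_cpuid(0x8000000A, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0x8000000A is not available on this CPU\n");
                return 1;
        }
        printf("SVM revision:       %u\n", eax & 0xff);
        printf("number of ASIDs:    %u\n", ebx);
        printf("feature bits (edx): %#x\n", edx);
        return 0;
}
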
@@ -2946,8 +3331,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{ 3331{
2947 struct vcpu_svm *svm = to_svm(vcpu); 3332 struct vcpu_svm *svm = to_svm(vcpu);
2948 3333
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3334 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
3335 if (is_nested(svm))
3336 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3337 update_cr0_intercept(svm);
2951} 3338}
2952 3339
2953static struct kvm_x86_ops svm_x86_ops = { 3340static struct kvm_x86_ops svm_x86_ops = {
@@ -2986,8 +3373,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2986 .set_idt = svm_set_idt, 3373 .set_idt = svm_set_idt,
2987 .get_gdt = svm_get_gdt, 3374 .get_gdt = svm_get_gdt,
2988 .set_gdt = svm_set_gdt, 3375 .set_gdt = svm_set_gdt,
2989 .get_dr = svm_get_dr, 3376 .set_dr7 = svm_set_dr7,
2990 .set_dr = svm_set_dr,
2991 .cache_reg = svm_cache_reg, 3377 .cache_reg = svm_cache_reg,
2992 .get_rflags = svm_get_rflags, 3378 .get_rflags = svm_get_rflags,
2993 .set_rflags = svm_set_rflags, 3379 .set_rflags = svm_set_rflags,
@@ -3023,12 +3409,14 @@ static struct kvm_x86_ops svm_x86_ops = {
3023 .cpuid_update = svm_cpuid_update, 3409 .cpuid_update = svm_cpuid_update,
3024 3410
3025 .rdtscp_supported = svm_rdtscp_supported, 3411 .rdtscp_supported = svm_rdtscp_supported,
3412
3413 .set_supported_cpuid = svm_set_supported_cpuid,
3026}; 3414};
3027 3415
3028static int __init svm_init(void) 3416static int __init svm_init(void)
3029{ 3417{
3030 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), 3418 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
3031 THIS_MODULE); 3419 __alignof__(struct vcpu_svm), THIS_MODULE);
3032} 3420}
3033 3421
3034static void __exit svm_exit(void) 3422static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066c..4ddadb1a5ffe 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
12 /* 12 /*
13 * There is a race window between reading and incrementing, but we do 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially losing timer events in the !reinject 14 * not care about potentially losing timer events in the !reinject
15 * case anyway. 15 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
16 * in vcpu_enter_guest.
16 */ 17 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending); 19 atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f044..a6544b8e7c0f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
5 5
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10 8
11/* 9/*
12 * Tracepoint for guest mode entry. 10 * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
184 * Tracepoint for kvm guest exit: 182 * Tracepoint for kvm guest exit:
185 */ 183 */
186TRACE_EVENT(kvm_exit, 184TRACE_EVENT(kvm_exit,
187 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), 185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
188 TP_ARGS(exit_reason, guest_rip), 186 TP_ARGS(exit_reason, vcpu),
189 187
190 TP_STRUCT__entry( 188 TP_STRUCT__entry(
191 __field( unsigned int, exit_reason ) 189 __field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
194 192
195 TP_fast_assign( 193 TP_fast_assign(
196 __entry->exit_reason = exit_reason; 194 __entry->exit_reason = exit_reason;
197 __entry->guest_rip = guest_rip; 195 __entry->guest_rip = kvm_rip_read(vcpu);
198 ), 196 ),
199 197
200 TP_printk("reason %s rip 0x%lx", 198 TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
221 TP_printk("irq %u", __entry->irq) 219 TP_printk("irq %u", __entry->irq)
222); 220);
223 221
222#define EXS(x) { x##_VECTOR, "#" #x }
223
224#define kvm_trace_sym_exc \
225 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
226 EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
227 EXS(MF), EXS(MC)
228
229/*
230 * Tracepoint for kvm interrupt injection:
231 */
232TRACE_EVENT(kvm_inj_exception,
233 TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
234 TP_ARGS(exception, has_error, error_code),
235
236 TP_STRUCT__entry(
237 __field( u8, exception )
238 __field( u8, has_error )
239 __field( u32, error_code )
240 ),
241
242 TP_fast_assign(
243 __entry->exception = exception;
244 __entry->has_error = has_error;
245 __entry->error_code = error_code;
246 ),
247
248 TP_printk("%s (0x%x)",
249 __print_symbolic(__entry->exception, kvm_trace_sym_exc),
250 /* FIXME: don't print error_code if not present */
251 __entry->has_error ? __entry->error_code : 0)
252);
253
224/* 254/*
225 * Tracepoint for page fault. 255 * Tracepoint for page fault.
226 */ 256 */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
413 ), 443 ),
414 444
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " 445 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n", 446 "event_inj: 0x%08x npt: %s",
417 __entry->rip, __entry->vmcb, __entry->nested_rip, 447 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj, 448 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off") 449 __entry->npt ? "on" : "off")
420); 450);
421 451
452TRACE_EVENT(kvm_nested_intercepts,
453 TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
454 TP_ARGS(cr_read, cr_write, exceptions, intercept),
455
456 TP_STRUCT__entry(
457 __field( __u16, cr_read )
458 __field( __u16, cr_write )
459 __field( __u32, exceptions )
460 __field( __u64, intercept )
461 ),
462
463 TP_fast_assign(
464 __entry->cr_read = cr_read;
465 __entry->cr_write = cr_write;
466 __entry->exceptions = exceptions;
467 __entry->intercept = intercept;
468 ),
469
470 TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
471 __entry->cr_read, __entry->cr_write, __entry->exceptions,
472 __entry->intercept)
473);
422/* 474/*
423 * Tracepoint for #VMEXIT while nested 475 * Tracepoint for #VMEXIT while nested
424 */ 476 */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
447 __entry->exit_int_info_err = exit_int_info_err; 499 __entry->exit_int_info_err = exit_int_info_err;
448 ), 500 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 501 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 502 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
451 __entry->rip, 503 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code, 504 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str), 505 kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
482 ), 534 ),
483 535
484 TP_printk("reason: %s ext_inf1: 0x%016llx " 536 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 537 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
486 ftrace_print_symbols_seq(p, __entry->exit_code, 538 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str), 539 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2, 540 __entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
504 __entry->rip = rip 556 __entry->rip = rip
505 ), 557 ),
506 558
507 TP_printk("rip: 0x%016llx\n", __entry->rip) 559 TP_printk("rip: 0x%016llx", __entry->rip)
508); 560);
509 561
510/* 562/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
526 __entry->address = address; 578 __entry->address = address;
527 ), 579 ),
528 580
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", 581 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
530 __entry->rip, __entry->asid, __entry->address) 582 __entry->rip, __entry->asid, __entry->address)
531); 583);
532 584
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
547 __entry->slb = slb; 599 __entry->slb = slb;
548 ), 600 ),
549 601
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n", 602 TP_printk("rip: 0x%016llx slb: 0x%08x",
551 __entry->rip, __entry->slb) 603 __entry->rip, __entry->slb)
552); 604);
553 605
606#define __print_insn(insn, ilen) ({ \
607 int i; \
608 const char *ret = p->buffer + p->len; \
609 \
610 for (i = 0; i < ilen; ++i) \
611 trace_seq_printf(p, " %02x", insn[i]); \
612 trace_seq_printf(p, "%c", 0); \
613 ret; \
614 })
615
616#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
617#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
618#define KVM_EMUL_INSN_F_CS_D (1 << 2)
619#define KVM_EMUL_INSN_F_CS_L (1 << 3)
620
621#define kvm_trace_symbol_emul_flags \
622 { 0, "real" }, \
623 { KVM_EMUL_INSN_F_CR0_PE \
624 | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
625 { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
626 { KVM_EMUL_INSN_F_CR0_PE \
627 | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
628 { KVM_EMUL_INSN_F_CR0_PE \
629 | KVM_EMUL_INSN_F_CS_L, "prot64" }
630
631#define kei_decode_mode(mode) ({ \
632 u8 flags = 0xff; \
633 switch (mode) { \
634 case X86EMUL_MODE_REAL: \
635 flags = 0; \
636 break; \
637 case X86EMUL_MODE_VM86: \
638 flags = KVM_EMUL_INSN_F_EFL_VM; \
639 break; \
640 case X86EMUL_MODE_PROT16: \
641 flags = KVM_EMUL_INSN_F_CR0_PE; \
642 break; \
643 case X86EMUL_MODE_PROT32: \
644 flags = KVM_EMUL_INSN_F_CR0_PE \
645 | KVM_EMUL_INSN_F_CS_D; \
646 break; \
647 case X86EMUL_MODE_PROT64: \
648 flags = KVM_EMUL_INSN_F_CR0_PE \
649 | KVM_EMUL_INSN_F_CS_L; \
650 break; \
651 } \
652 flags; \
653 })
654
655TRACE_EVENT(kvm_emulate_insn,
656 TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
657 TP_ARGS(vcpu, failed),
658
659 TP_STRUCT__entry(
660 __field( __u64, rip )
661 __field( __u32, csbase )
662 __field( __u8, len )
663 __array( __u8, insn, 15 )
664 __field( __u8, flags )
665 __field( __u8, failed )
666 ),
667
668 TP_fast_assign(
669 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
670 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
671 __entry->len = vcpu->arch.emulate_ctxt.decode.eip
672 - vcpu->arch.emulate_ctxt.decode.fetch.start;
673 memcpy(__entry->insn,
674 vcpu->arch.emulate_ctxt.decode.fetch.data,
675 15);
676 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
677 __entry->failed = failed;
678 ),
679
680 TP_printk("%x:%llx:%s (%s)%s",
681 __entry->csbase, __entry->rip,
682 __print_insn(__entry->insn, __entry->len),
683 __print_symbolic(__entry->flags,
684 kvm_trace_symbol_emul_flags),
685 __entry->failed ? " failed" : ""
686 )
687 );
688
689#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
690#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
691
554#endif /* _TRACE_KVM_H */ 692#endif /* _TRACE_KVM_H */
555 693
694#undef TRACE_INCLUDE_PATH
695#define TRACE_INCLUDE_PATH arch/x86/kvm
696#undef TRACE_INCLUDE_FILE
697#define TRACE_INCLUDE_FILE trace
698
556/* This part must be outside protection */ 699/* This part must be outside protection */
557#include <trace/define_trace.h> 700#include <trace/define_trace.h>
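
kei_decode_mode() in the trace header above collapses the emulator's X86EMUL_MODE_* values into a small flag byte (CR0.PE, EFLAGS.VM, CS.D, CS.L) that the tracepoint then prints symbolically. Below is a user-space sketch of that mapping; the mode constants and the symbolic names are redefined locally for illustration only.

#include <stdint.h>
#include <stdio.h>

/* flag bits as defined in the trace header */
#define INSN_F_CR0_PE (1 << 0)
#define INSN_F_EFL_VM (1 << 1)
#define INSN_F_CS_D   (1 << 2)
#define INSN_F_CS_L   (1 << 3)

/* local stand-ins for the emulator's X86EMUL_MODE_* constants */
enum emul_mode { MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

static uint8_t decode_mode(enum emul_mode mode)
{
        switch (mode) {
        case MODE_REAL:   return 0;
        case MODE_VM86:   return INSN_F_EFL_VM;
        case MODE_PROT16: return INSN_F_CR0_PE;
        case MODE_PROT32: return INSN_F_CR0_PE | INSN_F_CS_D;
        case MODE_PROT64: return INSN_F_CR0_PE | INSN_F_CS_L;
        }
        return 0xff;                            /* unknown mode */
}

static const char *mode_name(uint8_t flags)
{
        switch (flags) {
        case 0:                             return "real";
        case INSN_F_EFL_VM:                 /* as decode_mode() produces it */
        case INSN_F_CR0_PE | INSN_F_EFL_VM: return "vm16";
        case INSN_F_CR0_PE:                 return "prot16";
        case INSN_F_CR0_PE | INSN_F_CS_D:   return "prot32";
        case INSN_F_CR0_PE | INSN_F_CS_L:   return "prot64";
        default:                            return "?";
        }
}

int main(void)
{
        enum emul_mode m;

        for (m = MODE_REAL; m <= MODE_PROT64; m++)
                printf("mode %d -> %s\n", (int)m, mode_name(decode_mode(m)));
        return 0;
}
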
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index edca080407a5..859a01a07dbf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/tboot.h>
30#include "kvm_cache_regs.h" 31#include "kvm_cache_regs.h"
31#include "x86.h" 32#include "x86.h"
32 33
@@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO);
98static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 99static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
99module_param(ple_window, int, S_IRUGO); 100module_param(ple_window, int, S_IRUGO);
100 101
102#define NR_AUTOLOAD_MSRS 1
103
101struct vmcs { 104struct vmcs {
102 u32 revision_id; 105 u32 revision_id;
103 u32 abort; 106 u32 abort;
@@ -125,6 +128,11 @@ struct vcpu_vmx {
125 u64 msr_guest_kernel_gs_base; 128 u64 msr_guest_kernel_gs_base;
126#endif 129#endif
127 struct vmcs *vmcs; 130 struct vmcs *vmcs;
131 struct msr_autoload {
132 unsigned nr;
133 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
134 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
135 } msr_autoload;
128 struct { 136 struct {
129 int loaded; 137 int loaded;
130 u16 fs_sel, gs_sel, ldt_sel; 138 u16 fs_sel, gs_sel, ldt_sel;
@@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = {
234}; 242};
235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
236 244
237static inline int is_page_fault(u32 intr_info) 245static inline bool is_page_fault(u32 intr_info)
238{ 246{
239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 247 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
240 INTR_INFO_VALID_MASK)) == 248 INTR_INFO_VALID_MASK)) ==
241 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 249 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
242} 250}
243 251
244static inline int is_no_device(u32 intr_info) 252static inline bool is_no_device(u32 intr_info)
245{ 253{
246 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 254 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
247 INTR_INFO_VALID_MASK)) == 255 INTR_INFO_VALID_MASK)) ==
248 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 256 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
249} 257}
250 258
251static inline int is_invalid_opcode(u32 intr_info) 259static inline bool is_invalid_opcode(u32 intr_info)
252{ 260{
253 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 261 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
254 INTR_INFO_VALID_MASK)) == 262 INTR_INFO_VALID_MASK)) ==
255 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 263 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
256} 264}
257 265
258static inline int is_external_interrupt(u32 intr_info) 266static inline bool is_external_interrupt(u32 intr_info)
259{ 267{
260 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 268 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
261 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 269 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
262} 270}
263 271
264static inline int is_machine_check(u32 intr_info) 272static inline bool is_machine_check(u32 intr_info)
265{ 273{
266 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 274 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
267 INTR_INFO_VALID_MASK)) == 275 INTR_INFO_VALID_MASK)) ==
268 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 276 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
269} 277}
270 278
271static inline int cpu_has_vmx_msr_bitmap(void) 279static inline bool cpu_has_vmx_msr_bitmap(void)
272{ 280{
273 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 281 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
274} 282}
275 283
276static inline int cpu_has_vmx_tpr_shadow(void) 284static inline bool cpu_has_vmx_tpr_shadow(void)
277{ 285{
278 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 286 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
279} 287}
280 288
281static inline int vm_need_tpr_shadow(struct kvm *kvm) 289static inline bool vm_need_tpr_shadow(struct kvm *kvm)
282{ 290{
283 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 291 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
284} 292}
285 293
286static inline int cpu_has_secondary_exec_ctrls(void) 294static inline bool cpu_has_secondary_exec_ctrls(void)
287{ 295{
288 return vmcs_config.cpu_based_exec_ctrl & 296 return vmcs_config.cpu_based_exec_ctrl &
289 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 297 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
303 311
304static inline bool cpu_has_vmx_ept_execute_only(void) 312static inline bool cpu_has_vmx_ept_execute_only(void)
305{ 313{
306 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); 314 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
307} 315}
308 316
309static inline bool cpu_has_vmx_eptp_uncacheable(void) 317static inline bool cpu_has_vmx_eptp_uncacheable(void)
310{ 318{
311 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); 319 return vmx_capability.ept & VMX_EPTP_UC_BIT;
312} 320}
313 321
314static inline bool cpu_has_vmx_eptp_writeback(void) 322static inline bool cpu_has_vmx_eptp_writeback(void)
315{ 323{
316 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); 324 return vmx_capability.ept & VMX_EPTP_WB_BIT;
317} 325}
318 326
319static inline bool cpu_has_vmx_ept_2m_page(void) 327static inline bool cpu_has_vmx_ept_2m_page(void)
320{ 328{
321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 329 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
322} 330}
323 331
324static inline bool cpu_has_vmx_ept_1g_page(void) 332static inline bool cpu_has_vmx_ept_1g_page(void)
325{ 333{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); 334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
327} 335}
328 336
329static inline int cpu_has_vmx_invept_individual_addr(void) 337static inline bool cpu_has_vmx_invept_individual_addr(void)
330{ 338{
331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
332} 340}
333 341
334static inline int cpu_has_vmx_invept_context(void) 342static inline bool cpu_has_vmx_invept_context(void)
335{ 343{
336 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); 344 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
337} 345}
338 346
339static inline int cpu_has_vmx_invept_global(void) 347static inline bool cpu_has_vmx_invept_global(void)
340{ 348{
341 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); 349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
342} 350}
343 351
344static inline int cpu_has_vmx_ept(void) 352static inline bool cpu_has_vmx_ept(void)
345{ 353{
346 return vmcs_config.cpu_based_2nd_exec_ctrl & 354 return vmcs_config.cpu_based_2nd_exec_ctrl &
347 SECONDARY_EXEC_ENABLE_EPT; 355 SECONDARY_EXEC_ENABLE_EPT;
348} 356}
349 357
350static inline int cpu_has_vmx_unrestricted_guest(void) 358static inline bool cpu_has_vmx_unrestricted_guest(void)
351{ 359{
352 return vmcs_config.cpu_based_2nd_exec_ctrl & 360 return vmcs_config.cpu_based_2nd_exec_ctrl &
353 SECONDARY_EXEC_UNRESTRICTED_GUEST; 361 SECONDARY_EXEC_UNRESTRICTED_GUEST;
354} 362}
355 363
356static inline int cpu_has_vmx_ple(void) 364static inline bool cpu_has_vmx_ple(void)
357{ 365{
358 return vmcs_config.cpu_based_2nd_exec_ctrl & 366 return vmcs_config.cpu_based_2nd_exec_ctrl &
359 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 367 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
360} 368}
361 369
362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 370static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
363{ 371{
364 return flexpriority_enabled && irqchip_in_kernel(kvm); 372 return flexpriority_enabled && irqchip_in_kernel(kvm);
365} 373}
366 374
367static inline int cpu_has_vmx_vpid(void) 375static inline bool cpu_has_vmx_vpid(void)
368{ 376{
369 return vmcs_config.cpu_based_2nd_exec_ctrl & 377 return vmcs_config.cpu_based_2nd_exec_ctrl &
370 SECONDARY_EXEC_ENABLE_VPID; 378 SECONDARY_EXEC_ENABLE_VPID;
371} 379}
372 380
373static inline int cpu_has_vmx_rdtscp(void) 381static inline bool cpu_has_vmx_rdtscp(void)
374{ 382{
375 return vmcs_config.cpu_based_2nd_exec_ctrl & 383 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP; 384 SECONDARY_EXEC_RDTSCP;
377} 385}
378 386
379static inline int cpu_has_virtual_nmis(void) 387static inline bool cpu_has_virtual_nmis(void)
380{ 388{
381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
382} 390}
@@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
595 vmcs_write32(EXCEPTION_BITMAP, eb); 603 vmcs_write32(EXCEPTION_BITMAP, eb);
596} 604}
597 605
606static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
607{
608 unsigned i;
609 struct msr_autoload *m = &vmx->msr_autoload;
610
611 for (i = 0; i < m->nr; ++i)
612 if (m->guest[i].index == msr)
613 break;
614
615 if (i == m->nr)
616 return;
617 --m->nr;
618 m->guest[i] = m->guest[m->nr];
619 m->host[i] = m->host[m->nr];
620 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
621 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
622}
623
624static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
625 u64 guest_val, u64 host_val)
626{
627 unsigned i;
628 struct msr_autoload *m = &vmx->msr_autoload;
629
630 for (i = 0; i < m->nr; ++i)
631 if (m->guest[i].index == msr)
632 break;
633
634 if (i == m->nr) {
635 ++m->nr;
636 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
637 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
638 }
639
640 m->guest[i].index = msr;
641 m->guest[i].value = guest_val;
642 m->host[i].index = msr;
643 m->host[i].value = host_val;
644}
645
598static void reload_tss(void) 646static void reload_tss(void)
599{ 647{
600 /* 648 /*
601 * VT restores TR but not its size. Useless. 649 * VT restores TR but not its size. Useless.
602 */ 650 */
603 struct descriptor_table gdt; 651 struct desc_ptr gdt;
604 struct desc_struct *descs; 652 struct desc_struct *descs;
605 653
606 kvm_get_gdt(&gdt); 654 native_store_gdt(&gdt);
607 descs = (void *)gdt.base; 655 descs = (void *)gdt.address;
608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 656 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
609 load_TR_desc(); 657 load_TR_desc();
610} 658}
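
The add_atomic_switch_msr()/clear_atomic_switch_msr() helpers added above keep the guest and host autoload arrays in step with the VMCS entry/exit MSR-load counts: adding either updates an existing slot or appends one and bumps both counts, removing swaps the last slot into the hole and decrements them. Below is a stand-alone sketch of that list discipline, with the VMCS writes replaced by a printout; the capacity here is arbitrary (the patch uses NR_AUTOLOAD_MSRS of 1, for EFER only).

#include <stdint.h>
#include <stdio.h>

#define NR_AUTOLOAD_MSRS 8      /* capacity chosen for the sketch; the patch uses 1 */

struct msr_entry { uint32_t index; uint64_t value; };

struct msr_autoload {
        unsigned nr;
        struct msr_entry guest[NR_AUTOLOAD_MSRS];
        struct msr_entry host[NR_AUTOLOAD_MSRS];
};

/* stands in for writing VM_ENTRY/VM_EXIT_MSR_LOAD_COUNT into the VMCS */
static void sync_counts(const struct msr_autoload *m)
{
        printf("msr-load count is now %u\n", m->nr);
}

static void clear_switch_msr(struct msr_autoload *m, uint32_t msr)
{
        unsigned i;

        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;
        if (i == m->nr)
                return;                         /* not in the list */

        --m->nr;                                /* swap the last slot into the hole */
        m->guest[i] = m->guest[m->nr];
        m->host[i]  = m->host[m->nr];
        sync_counts(m);
}

static void add_switch_msr(struct msr_autoload *m, uint32_t msr,
                           uint64_t guest_val, uint64_t host_val)
{
        unsigned i;

        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;                  /* already present: just update it */
        if (i == m->nr) {
                ++m->nr;                        /* append a new slot; capacity is not
                                                   checked here, mirroring the patch */
                sync_counts(m);
        }
        m->guest[i].index = msr;  m->guest[i].value = guest_val;
        m->host[i].index  = msr;  m->host[i].value  = host_val;
}

int main(void)
{
        struct msr_autoload m = { 0 };

        add_switch_msr(&m, 0xC0000080u /* EFER */, 0x500, 0xD01);
        clear_switch_msr(&m, 0xC0000080u);
        return 0;
}
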
@@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
631 guest_efer |= host_efer & ignore_bits; 679 guest_efer |= host_efer & ignore_bits;
632 vmx->guest_msrs[efer_offset].data = guest_efer; 680 vmx->guest_msrs[efer_offset].data = guest_efer;
633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 681 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
682
683 clear_atomic_switch_msr(vmx, MSR_EFER);
684 /* On ept, can't emulate nx, and must switch nx atomically */
685 if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
686 guest_efer = vmx->vcpu.arch.efer;
687 if (!(guest_efer & EFER_LMA))
688 guest_efer &= ~EFER_LME;
689 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
690 return false;
691 }
692
634 return true; 693 return true;
635} 694}
636 695
696static unsigned long segment_base(u16 selector)
697{
698 struct desc_ptr gdt;
699 struct desc_struct *d;
700 unsigned long table_base;
701 unsigned long v;
702
703 if (!(selector & ~3))
704 return 0;
705
706 native_store_gdt(&gdt);
707 table_base = gdt.address;
708
709 if (selector & 4) { /* from ldt */
710 u16 ldt_selector = kvm_read_ldt();
711
712 if (!(ldt_selector & ~3))
713 return 0;
714
715 table_base = segment_base(ldt_selector);
716 }
717 d = (struct desc_struct *)(table_base + (selector & ~7));
718 v = get_desc_base(d);
719#ifdef CONFIG_X86_64
720 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
721 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
722#endif
723 return v;
724}
725
726static inline unsigned long kvm_read_tr_base(void)
727{
728 u16 tr;
729 asm("str %0" : "=g"(tr));
730 return segment_base(tr);
731}
732
637static void vmx_save_host_state(struct kvm_vcpu *vcpu) 733static void vmx_save_host_state(struct kvm_vcpu *vcpu)
638{ 734{
639 struct vcpu_vmx *vmx = to_vmx(vcpu); 735 struct vcpu_vmx *vmx = to_vmx(vcpu);
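
The segment_base() helper added above splits a selector into RPL (bits 1:0), table indicator (bit 2) and descriptor index (bits 15:3), reads the descriptor from the GDT or, for TI=1, from an LDT whose own base is resolved first, and assembles the base from the descriptor's three base fields. Below is a toy user-space sketch of the selector decomposition and the legacy 32-bit descriptor layout; the tables are plain arrays, and the 64-bit system-descriptor base3 handling is omitted.

#include <stdint.h>
#include <stdio.h>

/* 8-byte legacy descriptor; the base is split across three fields */
struct descriptor {
        uint16_t limit0;
        uint16_t base0;
        uint8_t  base1;
        uint8_t  type_attr;
        uint8_t  limit1_flags;
        uint8_t  base2;
};

static uint32_t desc_base(const struct descriptor *d)
{
        return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

/* toy tables standing in for the GDT and a single LDT */
static struct descriptor toy_gdt[8];
static struct descriptor toy_ldt[8];

static uint32_t toy_segment_base(uint16_t selector)
{
        unsigned rpl   = selector & 0x3;        /* requested privilege level */
        unsigned ti    = (selector >> 2) & 1;   /* 0 = GDT, 1 = LDT          */
        unsigned index = selector >> 3;         /* descriptor index          */
        const struct descriptor *table = ti ? toy_ldt : toy_gdt;

        (void)rpl;                              /* not needed for the base   */
        if (!(selector & ~3))
                return 0;                       /* null selector             */
        if (index >= 8)
                return 0;                       /* out of range for the toy tables */
        return desc_base(&table[index]);
}

int main(void)
{
        toy_gdt[2].base0 = 0x3000;              /* descriptor with base 0x3000 */
        printf("base of selector 0x10 = %#x\n", (unsigned)toy_segment_base(0x10));
        return 0;
}
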
@@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 } 854 }
759 855
760 if (vcpu->cpu != cpu) { 856 if (vcpu->cpu != cpu) {
761 struct descriptor_table dt; 857 struct desc_ptr dt;
762 unsigned long sysenter_esp; 858 unsigned long sysenter_esp;
763 859
764 vcpu->cpu = cpu; 860 vcpu->cpu = cpu;
@@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
767 * processors. 863 * processors.
768 */ 864 */
769 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 865 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
770 kvm_get_gdt(&dt); 866 native_store_gdt(&dt);
771 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 867 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
772 868
773 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 869 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
774 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 870 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
846 int ret = 0; 942 int ret = 0;
847 943
848 if (interruptibility & GUEST_INTR_STATE_STI) 944 if (interruptibility & GUEST_INTR_STATE_STI)
849 ret |= X86_SHADOW_INT_STI; 945 ret |= KVM_X86_SHADOW_INT_STI;
850 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 946 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
851 ret |= X86_SHADOW_INT_MOV_SS; 947 ret |= KVM_X86_SHADOW_INT_MOV_SS;
852 948
853 return ret & mask; 949 return ret & mask;
854} 950}
@@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
860 956
861 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 957 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
862 958
863 if (mask & X86_SHADOW_INT_MOV_SS) 959 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
864 interruptibility |= GUEST_INTR_STATE_MOV_SS; 960 interruptibility |= GUEST_INTR_STATE_MOV_SS;
865 if (mask & X86_SHADOW_INT_STI) 961 else if (mask & KVM_X86_SHADOW_INT_STI)
866 interruptibility |= GUEST_INTR_STATE_STI; 962 interruptibility |= GUEST_INTR_STATE_STI;
867 963
868 if ((interruptibility != interruptibility_old)) 964 if ((interruptibility != interruptibility_old))
@@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
882} 978}
883 979
884static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 980static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
885 bool has_error_code, u32 error_code) 981 bool has_error_code, u32 error_code,
982 bool reinject)
886{ 983{
887 struct vcpu_vmx *vmx = to_vmx(vcpu); 984 struct vcpu_vmx *vmx = to_vmx(vcpu);
888 u32 intr_info = nr | INTR_INFO_VALID_MASK; 985 u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
1176 u64 msr; 1273 u64 msr;
1177 1274
1178 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1275 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1179 return (msr & (FEATURE_CONTROL_LOCKED | 1276 if (msr & FEATURE_CONTROL_LOCKED) {
1180 FEATURE_CONTROL_VMXON_ENABLED)) 1277 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1181 == FEATURE_CONTROL_LOCKED; 1278 && tboot_enabled())
1279 return 1;
1280 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1281 && !tboot_enabled())
1282 return 1;
1283 }
1284
1285 return 0;
1182 /* locked but not enabled */ 1286 /* locked but not enabled */
1183} 1287}
1184 1288
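vmx_disabled_by_bios() now distinguishes the two VMXON enable bits in IA32_FEATURE_CONTROL: when the host was launched through tboot it checks the inside-SMX bit, otherwise the outside-SMX bit, and hardware_enable() below builds the matching set of bits to program before locking. A sketch of the same predicate over a raw MSR value; the bit positions used here (0 = lock, 1 = enable in SMX, 2 = enable outside SMX) are an assumption taken from the SDM, since the diff only uses the symbolic names:

	#include <stdbool.h>
	#include <stdint.h>

	/* IA32_FEATURE_CONTROL bit positions (assumed, per the Intel SDM). */
	#define FC_LOCKED		(1ULL << 0)
	#define FC_VMXON_INSIDE_SMX	(1ULL << 1)
	#define FC_VMXON_OUTSIDE_SMX	(1ULL << 2)

	/* VMX counts as BIOS-disabled only when the MSR is locked without the
	 * enable bit that matches the current launch environment. */
	static bool vmx_disabled(uint64_t msr, bool tboot)
	{
		if (!(msr & FC_LOCKED))
			return false;	/* unlocked: hardware_enable() may still set it */
		return tboot ? !(msr & FC_VMXON_INSIDE_SMX)
			     : !(msr & FC_VMXON_OUTSIDE_SMX);
	}

	/* The bits hardware_enable() expects (and, if missing, writes and locks). */
	static uint64_t vmx_required_bits(bool tboot)
	{
		uint64_t bits = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

		if (tboot)
			bits |= FC_VMXON_INSIDE_SMX;
		return bits;
	}

	int main(void)
	{
		uint64_t msr = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

		/* Locked with only the non-SMX bit: usable unless booted via tboot. */
		return (!vmx_disabled(msr, false) &&
			(msr & vmx_required_bits(false)) == vmx_required_bits(false)) ? 0 : 1;
	}

When any of the required bits is missing, hardware_enable() writes old | test_bits, which both enables VMXON for the current environment and sets the lock bit in a single WRMSR.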
@@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage)
1186{ 1290{
1187 int cpu = raw_smp_processor_id(); 1291 int cpu = raw_smp_processor_id();
1188 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1292 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1189 u64 old; 1293 u64 old, test_bits;
1190 1294
1191 if (read_cr4() & X86_CR4_VMXE) 1295 if (read_cr4() & X86_CR4_VMXE)
1192 return -EBUSY; 1296 return -EBUSY;
1193 1297
1194 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1298 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1195 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1299 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1196 if ((old & (FEATURE_CONTROL_LOCKED | 1300
1197 FEATURE_CONTROL_VMXON_ENABLED)) 1301 test_bits = FEATURE_CONTROL_LOCKED;
1198 != (FEATURE_CONTROL_LOCKED | 1302 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1199 FEATURE_CONTROL_VMXON_ENABLED)) 1303 if (tboot_enabled())
1304 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1305
1306 if ((old & test_bits) != test_bits) {
1200 /* enable and lock */ 1307 /* enable and lock */
1201 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1202 FEATURE_CONTROL_LOCKED | 1309 }
1203 FEATURE_CONTROL_VMXON_ENABLED);
1204 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1205 asm volatile (ASM_VMX_VMXON_RAX 1311 asm volatile (ASM_VMX_VMXON_RAX
1206 : : "a"(&phys_addr), "m"(phys_addr) 1312 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1521 struct kvm_memslots *slots; 1627 struct kvm_memslots *slots;
1522 gfn_t base_gfn; 1628 gfn_t base_gfn;
1523 1629
1524 slots = rcu_dereference(kvm->memslots); 1630 slots = kvm_memslots(kvm);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn + 1631 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3; 1632 kvm->memslots->memslots[0].npages - 3;
1527 return base_gfn << PAGE_SHIFT; 1633 return base_gfn << PAGE_SHIFT;
@@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1649 vmcs_write32(VM_ENTRY_CONTROLS, 1755 vmcs_write32(VM_ENTRY_CONTROLS,
1650 vmcs_read32(VM_ENTRY_CONTROLS) 1756 vmcs_read32(VM_ENTRY_CONTROLS)
1651 & ~VM_ENTRY_IA32E_MODE); 1757 & ~VM_ENTRY_IA32E_MODE);
1758 vmx_set_efer(vcpu, vcpu->arch.efer);
1652} 1759}
1653 1760
1654#endif 1761#endif
@@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1934 *l = (ar >> 13) & 1; 2041 *l = (ar >> 13) & 1;
1935} 2042}
1936 2043
1937static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2044static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1938{ 2045{
1939 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 2046 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
1940 dt->base = vmcs_readl(GUEST_IDTR_BASE); 2047 dt->address = vmcs_readl(GUEST_IDTR_BASE);
1941} 2048}
1942 2049
1943static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2050static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1944{ 2051{
1945 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 2052 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
1946 vmcs_writel(GUEST_IDTR_BASE, dt->base); 2053 vmcs_writel(GUEST_IDTR_BASE, dt->address);
1947} 2054}
1948 2055
1949static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2056static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1950{ 2057{
1951 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 2058 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
1952 dt->base = vmcs_readl(GUEST_GDTR_BASE); 2059 dt->address = vmcs_readl(GUEST_GDTR_BASE);
1953} 2060}
1954 2061
1955static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2062static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1956{ 2063{
1957 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 2064 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
1958 vmcs_writel(GUEST_GDTR_BASE, dt->base); 2065 vmcs_writel(GUEST_GDTR_BASE, dt->address);
1959} 2066}
1960 2067
1961static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 2068static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2296 spin_unlock(&vmx_vpid_lock); 2403 spin_unlock(&vmx_vpid_lock);
2297} 2404}
2298 2405
2406static void free_vpid(struct vcpu_vmx *vmx)
2407{
2408 if (!enable_vpid)
2409 return;
2410 spin_lock(&vmx_vpid_lock);
2411 if (vmx->vpid != 0)
2412 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2413 spin_unlock(&vmx_vpid_lock);
2414}
2415
2299static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 2416static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2300{ 2417{
2301 int f = sizeof(unsigned long); 2418 int f = sizeof(unsigned long);
@@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2334 u32 junk; 2451 u32 junk;
2335 u64 host_pat, tsc_this, tsc_base; 2452 u64 host_pat, tsc_this, tsc_base;
2336 unsigned long a; 2453 unsigned long a;
2337 struct descriptor_table dt; 2454 struct desc_ptr dt;
2338 int i; 2455 int i;
2339 unsigned long kvm_vmx_return; 2456 unsigned long kvm_vmx_return;
2340 u32 exec_control; 2457 u32 exec_control;
@@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2415 2532
2416 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2533 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2417 2534
2418 kvm_get_idt(&dt); 2535 native_store_idt(&dt);
2419 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2536 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2420 2537
2421 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2538 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2422 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 2539 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2423 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2540 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2424 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 2541 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2542 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2425 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 2543 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2544 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2426 2545
2427 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); 2546 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2428 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); 2547 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2947,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
2947 int size, in, string; 3066 int size, in, string;
2948 unsigned port; 3067 unsigned port;
2949 3068
2950 ++vcpu->stat.io_exits;
2951 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3069 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2952 string = (exit_qualification & 16) != 0; 3070 string = (exit_qualification & 16) != 0;
3071 in = (exit_qualification & 8) != 0;
2953 3072
2954 if (string) { 3073 ++vcpu->stat.io_exits;
2955 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2956 return 0;
2957 return 1;
2958 }
2959 3074
2960 size = (exit_qualification & 7) + 1; 3075 if (string || in)
2961 in = (exit_qualification & 8) != 0; 3076 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
2962 port = exit_qualification >> 16;
2963 3077
3078 port = exit_qualification >> 16;
3079 size = (exit_qualification & 7) + 1;
2964 skip_emulated_instruction(vcpu); 3080 skip_emulated_instruction(vcpu);
2965 return kvm_emulate_pio(vcpu, in, size, port); 3081
3082 return kvm_fast_pio_out(vcpu, size, port);
2966} 3083}
2967 3084
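The rewritten handle_io() keeps only the OUT fast path (kvm_fast_pio_out) in vmx.c and hands string operations and IN back to the instruction emulator. The exit-qualification decode it relies on is simple enough to exercise on its own; a sketch using the same bit tests as the code above (size in bits 2:0 plus one, direction in bit 3, string flag in bit 4, port in bits 31:16):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct io_exit {
		unsigned int size;	/* access width in bytes: 1, 2 or 4 */
		bool in;		/* IN/INS vs. OUT/OUTS */
		bool string;		/* INS/OUTS */
		unsigned short port;
	};

	/* Same field extraction as handle_io() above. */
	static struct io_exit decode_io_exit(uint64_t exit_qualification)
	{
		struct io_exit io = {
			.size	= (unsigned int)(exit_qualification & 7) + 1,
			.in	= (exit_qualification & 8) != 0,
			.string	= (exit_qualification & 16) != 0,
			.port	= (unsigned short)(exit_qualification >> 16),
		};
		return io;
	}

	int main(void)
	{
		/* e.g. "outb %al, $0x3f8": one byte, OUT, port 0x3f8 */
		struct io_exit io = decode_io_exit(0x03f80000);

		printf("port %#x size %u %s%s\n", io.port, io.size,
		       io.in ? "in" : "out", io.string ? " (string)" : "");
		return 0;
	}

Only a non-string OUT reaches kvm_fast_pio_out(); anything else now goes through emulate_instruction() and, from there, the pio_in/pio_out emulator callbacks added in x86.c further below.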
2968static void 3085static void
@@ -3053,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3053 return 0; 3170 return 0;
3054} 3171}
3055 3172
3056static int check_dr_alias(struct kvm_vcpu *vcpu)
3057{
3058 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3059 kvm_queue_exception(vcpu, UD_VECTOR);
3060 return -1;
3061 }
3062 return 0;
3063}
3064
3065static int handle_dr(struct kvm_vcpu *vcpu) 3173static int handle_dr(struct kvm_vcpu *vcpu)
3066{ 3174{
3067 unsigned long exit_qualification; 3175 unsigned long exit_qualification;
3068 unsigned long val;
3069 int dr, reg; 3176 int dr, reg;
3070 3177
3071 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 3178 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3100,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3100 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 3207 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3101 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 3208 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3102 if (exit_qualification & TYPE_MOV_FROM_DR) { 3209 if (exit_qualification & TYPE_MOV_FROM_DR) {
3103 switch (dr) { 3210 unsigned long val;
3104 case 0 ... 3: 3211 if (!kvm_get_dr(vcpu, dr, &val))
3105 val = vcpu->arch.db[dr]; 3212 kvm_register_write(vcpu, reg, val);
3106 break; 3213 } else
3107 case 4: 3214 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3108 if (check_dr_alias(vcpu) < 0)
3109 return 1;
3110 /* fall through */
3111 case 6:
3112 val = vcpu->arch.dr6;
3113 break;
3114 case 5:
3115 if (check_dr_alias(vcpu) < 0)
3116 return 1;
3117 /* fall through */
3118 default: /* 7 */
3119 val = vcpu->arch.dr7;
3120 break;
3121 }
3122 kvm_register_write(vcpu, reg, val);
3123 } else {
3124 val = vcpu->arch.regs[reg];
3125 switch (dr) {
3126 case 0 ... 3:
3127 vcpu->arch.db[dr] = val;
3128 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3129 vcpu->arch.eff_db[dr] = val;
3130 break;
3131 case 4:
3132 if (check_dr_alias(vcpu) < 0)
3133 return 1;
3134 /* fall through */
3135 case 6:
3136 if (val & 0xffffffff00000000ULL) {
3137 kvm_inject_gp(vcpu, 0);
3138 return 1;
3139 }
3140 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3141 break;
3142 case 5:
3143 if (check_dr_alias(vcpu) < 0)
3144 return 1;
3145 /* fall through */
3146 default: /* 7 */
3147 if (val & 0xffffffff00000000ULL) {
3148 kvm_inject_gp(vcpu, 0);
3149 return 1;
3150 }
3151 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3152 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
3153 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3154 vcpu->arch.switch_db_regs =
3155 (val & DR7_BP_EN_MASK);
3156 }
3157 break;
3158 }
3159 }
3160 skip_emulated_instruction(vcpu); 3215 skip_emulated_instruction(vcpu);
3161 return 1; 3216 return 1;
3162} 3217}
3163 3218
3219static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3220{
3221 vmcs_writel(GUEST_DR7, val);
3222}
3223
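handle_dr() now only decodes the exit qualification and defers the actual register access to the new kvm_get_dr()/kvm_set_dr() helpers in x86.c (shown further down). A sketch of that decode step; the exact bit positions (debug register in bits 2:0, direction in bit 4, general-purpose register in bits 11:8) are an assumption from the SDM, the diff itself only uses the DEBUG_REG_ACCESS_* macros:

	#include <stdbool.h>
	#include <stdint.h>

	struct dr_exit {
		unsigned int dr;	/* debug register number, 0-7 */
		bool mov_from_dr;	/* true: MOV DRx,reg exits; false: MOV reg,DRx */
		unsigned int gp_reg;	/* general-purpose register operand */
	};

	/* Assumed field layout of the MOV-DR exit qualification (Intel SDM);
	 * the kernel code uses DEBUG_REG_ACCESS_NUM/REG and TYPE_MOV_FROM_DR. */
	static struct dr_exit decode_dr_exit(uint64_t qual)
	{
		struct dr_exit d = {
			.dr		= (unsigned int)(qual & 7),
			.mov_from_dr	= (qual & (1 << 4)) != 0,
			.gp_reg		= (unsigned int)((qual >> 8) & 0xf),
		};
		return d;
	}

	int main(void)
	{
		/* Hypothetical qualification for "mov %dr0, %rbx". */
		struct dr_exit d = decode_dr_exit(0x310);

		return (d.mov_from_dr && d.dr == 0 && d.gp_reg == 3) ? 0 : 1;
	}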
3164static int handle_cpuid(struct kvm_vcpu *vcpu) 3224static int handle_cpuid(struct kvm_vcpu *vcpu)
3165{ 3225{
3166 kvm_emulate_cpuid(vcpu); 3226 kvm_emulate_cpuid(vcpu);
@@ -3292,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3292{ 3352{
3293 struct vcpu_vmx *vmx = to_vmx(vcpu); 3353 struct vcpu_vmx *vmx = to_vmx(vcpu);
3294 unsigned long exit_qualification; 3354 unsigned long exit_qualification;
3355 bool has_error_code = false;
3356 u32 error_code = 0;
3295 u16 tss_selector; 3357 u16 tss_selector;
3296 int reason, type, idt_v; 3358 int reason, type, idt_v;
3297 3359
@@ -3314,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3314 kvm_clear_interrupt_queue(vcpu); 3376 kvm_clear_interrupt_queue(vcpu);
3315 break; 3377 break;
3316 case INTR_TYPE_HARD_EXCEPTION: 3378 case INTR_TYPE_HARD_EXCEPTION:
3379 if (vmx->idt_vectoring_info &
3380 VECTORING_INFO_DELIVER_CODE_MASK) {
3381 has_error_code = true;
3382 error_code =
3383 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3384 }
3385 /* fall through */
3317 case INTR_TYPE_SOFT_EXCEPTION: 3386 case INTR_TYPE_SOFT_EXCEPTION:
3318 kvm_clear_exception_queue(vcpu); 3387 kvm_clear_exception_queue(vcpu);
3319 break; 3388 break;
@@ -3328,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3328 type != INTR_TYPE_NMI_INTR)) 3397 type != INTR_TYPE_NMI_INTR))
3329 skip_emulated_instruction(vcpu); 3398 skip_emulated_instruction(vcpu);
3330 3399
3331 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3400 if (kvm_task_switch(vcpu, tss_selector, reason,
3401 has_error_code, error_code) == EMULATE_FAIL) {
3402 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3403 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3404 vcpu->run->internal.ndata = 0;
3332 return 0; 3405 return 0;
3406 }
3333 3407
3334 /* clear all local breakpoint enable flags */ 3408 /* clear all local breakpoint enable flags */
3335 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 3409 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3574,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3574 u32 exit_reason = vmx->exit_reason; 3648 u32 exit_reason = vmx->exit_reason;
3575 u32 vectoring_info = vmx->idt_vectoring_info; 3649 u32 vectoring_info = vmx->idt_vectoring_info;
3576 3650
3577 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3651 trace_kvm_exit(exit_reason, vcpu);
3578 3652
3579 /* If guest state is invalid, start emulating */ 3653 /* If guest state is invalid, start emulating */
3580 if (vmx->emulation_required && emulate_invalid_guest_state) 3654 if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3923,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3923{ 3997{
3924 struct vcpu_vmx *vmx = to_vmx(vcpu); 3998 struct vcpu_vmx *vmx = to_vmx(vcpu);
3925 3999
3926 spin_lock(&vmx_vpid_lock); 4000 free_vpid(vmx);
3927 if (vmx->vpid != 0)
3928 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3929 spin_unlock(&vmx_vpid_lock);
3930 vmx_free_vmcs(vcpu); 4001 vmx_free_vmcs(vcpu);
3931 kfree(vmx->guest_msrs); 4002 kfree(vmx->guest_msrs);
3932 kvm_vcpu_uninit(vcpu); 4003 kvm_vcpu_uninit(vcpu);
@@ -3988,6 +4059,7 @@ free_msrs:
3988uninit_vcpu: 4059uninit_vcpu:
3989 kvm_vcpu_uninit(&vmx->vcpu); 4060 kvm_vcpu_uninit(&vmx->vcpu);
3990free_vcpu: 4061free_vcpu:
4062 free_vpid(vmx);
3991 kmem_cache_free(kvm_vcpu_cache, vmx); 4063 kmem_cache_free(kvm_vcpu_cache, vmx);
3992 return ERR_PTR(err); 4064 return ERR_PTR(err);
3993} 4065}
@@ -4118,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4118 } 4190 }
4119} 4191}
4120 4192
4193static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4194{
4195}
4196
4121static struct kvm_x86_ops vmx_x86_ops = { 4197static struct kvm_x86_ops vmx_x86_ops = {
4122 .cpu_has_kvm_support = cpu_has_kvm_support, 4198 .cpu_has_kvm_support = cpu_has_kvm_support,
4123 .disabled_by_bios = vmx_disabled_by_bios, 4199 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4154,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4154 .set_idt = vmx_set_idt, 4230 .set_idt = vmx_set_idt,
4155 .get_gdt = vmx_get_gdt, 4231 .get_gdt = vmx_get_gdt,
4156 .set_gdt = vmx_set_gdt, 4232 .set_gdt = vmx_set_gdt,
4233 .set_dr7 = vmx_set_dr7,
4157 .cache_reg = vmx_cache_reg, 4234 .cache_reg = vmx_cache_reg,
4158 .get_rflags = vmx_get_rflags, 4235 .get_rflags = vmx_get_rflags,
4159 .set_rflags = vmx_set_rflags, 4236 .set_rflags = vmx_set_rflags,
@@ -4189,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4189 .cpuid_update = vmx_cpuid_update, 4266 .cpuid_update = vmx_cpuid_update,
4190 4267
4191 .rdtscp_supported = vmx_rdtscp_supported, 4268 .rdtscp_supported = vmx_rdtscp_supported,
4269
4270 .set_supported_cpuid = vmx_set_supported_cpuid,
4192}; 4271};
4193 4272
4194static int __init vmx_init(void) 4273static int __init vmx_init(void)
@@ -4236,7 +4315,8 @@ static int __init vmx_init(void)
4236 4315
4237 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 4316 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
4238 4317
4239 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 4318 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
4319 __alignof__(struct vcpu_vmx), THIS_MODULE);
4240 if (r) 4320 if (r)
4241 goto out3; 4321 goto out3;
4242 4322
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd9bc8fb81ab..05d571f6f196 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h> 43#include <linux/perf_event.h>
44#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
45#undef TRACE_INCLUDE_FILE 45
46#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
47#include "trace.h" 47#include "trace.h"
48 48
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
224 kvm_on_user_return(&smsr->urn); 224 kvm_on_user_return(&smsr->urn);
225} 225}
226 226
227unsigned long segment_base(u16 selector)
228{
229 struct descriptor_table gdt;
230 struct desc_struct *d;
231 unsigned long table_base;
232 unsigned long v;
233
234 if (selector == 0)
235 return 0;
236
237 kvm_get_gdt(&gdt);
238 table_base = gdt.base;
239
240 if (selector & 4) { /* from ldt */
241 u16 ldt_selector = kvm_read_ldt();
242
243 table_base = segment_base(ldt_selector);
244 }
245 d = (struct desc_struct *)(table_base + (selector & ~7));
246 v = get_desc_base(d);
247#ifdef CONFIG_X86_64
248 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
249 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
250#endif
251 return v;
252}
253EXPORT_SYMBOL_GPL(segment_base);
254
255u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
256{ 228{
257 if (irqchip_in_kernel(vcpu->kvm)) 229 if (irqchip_in_kernel(vcpu->kvm))
@@ -293,7 +265,8 @@ static int exception_class(int vector)
293} 265}
294 266
295static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 267static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
296 unsigned nr, bool has_error, u32 error_code) 268 unsigned nr, bool has_error, u32 error_code,
269 bool reinject)
297{ 270{
298 u32 prev_nr; 271 u32 prev_nr;
299 int class1, class2; 272 int class1, class2;
@@ -304,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
304 vcpu->arch.exception.has_error_code = has_error; 277 vcpu->arch.exception.has_error_code = has_error;
305 vcpu->arch.exception.nr = nr; 278 vcpu->arch.exception.nr = nr;
306 vcpu->arch.exception.error_code = error_code; 279 vcpu->arch.exception.error_code = error_code;
280 vcpu->arch.exception.reinject = reinject;
307 return; 281 return;
308 } 282 }
309 283
@@ -332,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
332 306
333void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 307void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
334{ 308{
335 kvm_multiple_exception(vcpu, nr, false, 0); 309 kvm_multiple_exception(vcpu, nr, false, 0, false);
336} 310}
337EXPORT_SYMBOL_GPL(kvm_queue_exception); 311EXPORT_SYMBOL_GPL(kvm_queue_exception);
338 312
313void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
314{
315 kvm_multiple_exception(vcpu, nr, false, 0, true);
316}
317EXPORT_SYMBOL_GPL(kvm_requeue_exception);
318
339void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 319void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
340 u32 error_code) 320 u32 error_code)
341{ 321{
@@ -352,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
352 332
353void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 333void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
354{ 334{
355 kvm_multiple_exception(vcpu, nr, true, error_code); 335 kvm_multiple_exception(vcpu, nr, true, error_code, false);
356} 336}
357EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 337EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
358 338
339void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
340{
341 kvm_multiple_exception(vcpu, nr, true, error_code, true);
342}
343EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
344
359/* 345/*
360 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 346 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
361 * a #GP and return false. 347 * a #GP and return false.
@@ -476,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
476 } 462 }
477 463
478 kvm_x86_ops->set_cr0(vcpu, cr0); 464 kvm_x86_ops->set_cr0(vcpu, cr0);
479 vcpu->arch.cr0 = cr0;
480 465
481 kvm_mmu_reset_context(vcpu); 466 kvm_mmu_reset_context(vcpu);
482 return; 467 return;
@@ -485,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
485 470
486void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
487{ 472{
488 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); 473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
489} 474}
490EXPORT_SYMBOL_GPL(kvm_lmsw); 475EXPORT_SYMBOL_GPL(kvm_lmsw);
491 476
@@ -517,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
517 } 502 }
518 kvm_x86_ops->set_cr4(vcpu, cr4); 503 kvm_x86_ops->set_cr4(vcpu, cr4);
519 vcpu->arch.cr4 = cr4; 504 vcpu->arch.cr4 = cr4;
520 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
521 kvm_mmu_reset_context(vcpu); 505 kvm_mmu_reset_context(vcpu);
522} 506}
523EXPORT_SYMBOL_GPL(kvm_set_cr4); 507EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -592,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
592} 576}
593EXPORT_SYMBOL_GPL(kvm_get_cr8); 577EXPORT_SYMBOL_GPL(kvm_get_cr8);
594 578
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{
581 switch (dr) {
582 case 0 ... 3:
583 vcpu->arch.db[dr] = val;
584 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
585 vcpu->arch.eff_db[dr] = val;
586 break;
587 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
589 kvm_queue_exception(vcpu, UD_VECTOR);
590 return 1;
591 }
592 /* fall through */
593 case 6:
594 if (val & 0xffffffff00000000ULL) {
595 kvm_inject_gp(vcpu, 0);
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break;
600 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
602 kvm_queue_exception(vcpu, UD_VECTOR);
603 return 1;
604 }
605 /* fall through */
606 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) {
608 kvm_inject_gp(vcpu, 0);
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
614 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
615 }
616 break;
617 }
618
619 return 0;
620}
621EXPORT_SYMBOL_GPL(kvm_set_dr);
622
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{
625 switch (dr) {
626 case 0 ... 3:
627 *val = vcpu->arch.db[dr];
628 break;
629 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1;
633 }
634 /* fall through */
635 case 6:
636 *val = vcpu->arch.dr6;
637 break;
638 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1;
642 }
643 /* fall through */
644 default: /* 7 */
645 *val = vcpu->arch.dr7;
646 break;
647 }
648
649 return 0;
650}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652
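kvm_get_dr()/kvm_set_dr() centralise the debug-register rules that both vmx.c and the emulator now share: DR4/DR5 raise #UD when CR4.DE is set and otherwise alias DR6/DR7, and writes with any of bits 63:32 set take a #GP. A compact sketch of just that classification step, with the injected exceptions turned into return codes:

	#include <stdint.h>

	enum dr_access { DR_OK, DR_UD, DR_GP };

	/*
	 * Classify a debug-register write the way kvm_set_dr() does:
	 * - DR4/DR5 are #UD when CR4.DE is set, otherwise they alias DR6/DR7;
	 * - DR6/DR7 writes with bits 63:32 set take #GP.
	 * *effective receives the register actually written (0-3, 6 or 7).
	 */
	static enum dr_access classify_dr_write(int dr, uint64_t val, int cr4_de,
						int *effective)
	{
		if (dr == 4 || dr == 5) {
			if (cr4_de)
				return DR_UD;
			dr += 2;		/* alias: DR4 -> DR6, DR5 -> DR7 */
		}
		if ((dr == 6 || dr == 7) && (val & 0xffffffff00000000ULL))
			return DR_GP;
		*effective = dr;
		return DR_OK;
	}

	int main(void)
	{
		int eff;

		/* A DR5 write with CR4.DE clear lands in DR7. */
		return (classify_dr_write(5, 0x400, 0, &eff) == DR_OK && eff == 7) ? 0 : 1;
	}

The real helpers additionally fold in DR6_FIXED_1/DR7_FIXED_1 and push DR7 down to the hardware through the new set_dr7 hook when no userspace hardware breakpoints are active.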
595static inline u32 bit(int bitno) 653static inline u32 bit(int bitno)
596{ 654{
597 return 1 << (bitno & 31); 655 return 1 << (bitno & 31);
@@ -606,9 +664,10 @@ static inline u32 bit(int bitno)
606 * kvm-specific. Those are put in the beginning of the list. 664 * kvm-specific. Those are put in the beginning of the list.
607 */ 665 */
608 666
609#define KVM_SAVE_MSRS_BEGIN 5 667#define KVM_SAVE_MSRS_BEGIN 7
610static u32 msrs_to_save[] = { 668static u32 msrs_to_save[] = {
611 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 669 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
670 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
612 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
613 HV_X64_MSR_APIC_ASSIST_PAGE, 672 HV_X64_MSR_APIC_ASSIST_PAGE,
614 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -625,48 +684,42 @@ static u32 emulated_msrs[] = {
625 MSR_IA32_MISC_ENABLE, 684 MSR_IA32_MISC_ENABLE,
626}; 685};
627 686
628static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 687static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
629{ 688{
630 if (efer & efer_reserved_bits) { 689 if (efer & efer_reserved_bits)
631 kvm_inject_gp(vcpu, 0); 690 return 1;
632 return;
633 }
634 691
635 if (is_paging(vcpu) 692 if (is_paging(vcpu)
636 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { 693 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
637 kvm_inject_gp(vcpu, 0); 694 return 1;
638 return;
639 }
640 695
641 if (efer & EFER_FFXSR) { 696 if (efer & EFER_FFXSR) {
642 struct kvm_cpuid_entry2 *feat; 697 struct kvm_cpuid_entry2 *feat;
643 698
644 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 699 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
645 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 700 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
646 kvm_inject_gp(vcpu, 0); 701 return 1;
647 return;
648 }
649 } 702 }
650 703
651 if (efer & EFER_SVME) { 704 if (efer & EFER_SVME) {
652 struct kvm_cpuid_entry2 *feat; 705 struct kvm_cpuid_entry2 *feat;
653 706
654 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 707 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
655 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 708 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
656 kvm_inject_gp(vcpu, 0); 709 return 1;
657 return;
658 }
659 } 710 }
660 711
661 kvm_x86_ops->set_efer(vcpu, efer);
662
663 efer &= ~EFER_LMA; 712 efer &= ~EFER_LMA;
664 efer |= vcpu->arch.efer & EFER_LMA; 713 efer |= vcpu->arch.efer & EFER_LMA;
665 714
715 kvm_x86_ops->set_efer(vcpu, efer);
716
666 vcpu->arch.efer = efer; 717 vcpu->arch.efer = efer;
667 718
668 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
669 kvm_mmu_reset_context(vcpu); 720 kvm_mmu_reset_context(vcpu);
721
722 return 0;
670} 723}
671 724
672void kvm_enable_efer_bits(u64 mask) 725void kvm_enable_efer_bits(u64 mask)
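set_efer() no longer injects #GP itself; it returns non-zero and lets kvm_set_msr_common() report the failure, which makes the WRMSR fault in the caller. The checks are: no reserved bits, no toggling EFER.LME while paging is enabled, and FFXSR/SVME only when the corresponding guest CPUID bits are present. A reduced sketch of the same validation with the CPUID lookups folded into booleans (the EFER_* bit values are the architectural ones, assumed rather than shown in this hunk):

	#include <stdbool.h>
	#include <stdint.h>

	/* Architectural EFER bits (assumed values, not shown in this hunk). */
	#define EFER_LME	(1ULL << 8)
	#define EFER_SVME	(1ULL << 12)
	#define EFER_FFXSR	(1ULL << 14)

	/* Returns 0 if the write is acceptable, 1 if it should fault (#GP),
	 * mirroring the new set_efer() return convention. */
	static int check_efer_write(uint64_t old_efer, uint64_t new_efer,
				    uint64_t reserved_bits, bool paging,
				    bool guest_has_ffxsr, bool guest_has_svm)
	{
		if (new_efer & reserved_bits)
			return 1;
		if (paging && ((old_efer ^ new_efer) & EFER_LME))
			return 1;		/* can't flip LME while paging */
		if ((new_efer & EFER_FFXSR) && !guest_has_ffxsr)
			return 1;
		if ((new_efer & EFER_SVME) && !guest_has_svm)
			return 1;
		return 0;
	}

	int main(void)
	{
		/* Setting LME with paging already enabled must be rejected. */
		return check_efer_write(0, EFER_LME, 0, true, false, false) == 1 ? 0 : 1;
	}

Note the reordering in the real function: kvm_x86_ops->set_efer() now runs after LMA has been folded back in, so the vendor code sees the value that will actually be stored.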
@@ -696,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
696 749
697static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 750static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
698{ 751{
699 static int version; 752 int version;
753 int r;
700 struct pvclock_wall_clock wc; 754 struct pvclock_wall_clock wc;
701 struct timespec boot; 755 struct timespec boot;
702 756
703 if (!wall_clock) 757 if (!wall_clock)
704 return; 758 return;
705 759
706 version++; 760 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
761 if (r)
762 return;
763
764 if (version & 1)
765 ++version; /* first time write, random junk */
766
767 ++version;
707 768
708 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 769 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
709 770
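kvm_write_wall_clock() now reads the guest's current version field back instead of keeping a host-side static counter, forces it even if it starts out odd, and then bumps it around the update so the guest only ever sees an odd value while the structure is in flux. A user-space sketch of the matching reader loop, assuming the usual three-word pvclock_wall_clock layout (version, sec, nsec):

	#include <stdint.h>

	struct pvclock_wall_clock {
		uint32_t version;
		uint32_t sec;
		uint32_t nsec;
	};

	/*
	 * Seqlock-style reader matching the writer above: the host leaves
	 * "version" odd while sec/nsec are being updated and even once they are
	 * stable, so a consistent snapshot is one taken under an unchanged even
	 * version.
	 */
	static void read_wall_clock(const volatile struct pvclock_wall_clock *wc,
				    uint32_t *sec, uint32_t *nsec)
	{
		uint32_t before, after;

		do {
			before = wc->version;
			__sync_synchronize();		/* read barrier */
			*sec = wc->sec;
			*nsec = wc->nsec;
			__sync_synchronize();
			after = wc->version;
		} while ((before & 1) || before != after);
	}

	int main(void)
	{
		static struct pvclock_wall_clock wc = { .version = 2, .sec = 100, .nsec = 5 };
		uint32_t s, ns;

		read_wall_clock(&wc, &s, &ns);
		return (s == 100 && ns == 5) ? 0 : 1;
	}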
@@ -796,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
796 vcpu->hv_clock.system_time = ts.tv_nsec + 857 vcpu->hv_clock.system_time = ts.tv_nsec +
797 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 858 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
798 859
860 vcpu->hv_clock.flags = 0;
861
799 /* 862 /*
800 * The interface expects us to write an even number signaling that the 863 * The interface expects us to write an even number signaling that the
801 * update is finished. Since the guest won't see the intermediate 864 * update is finished. Since the guest won't see the intermediate
@@ -1087,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1087{ 1150{
1088 switch (msr) { 1151 switch (msr) {
1089 case MSR_EFER: 1152 case MSR_EFER:
1090 set_efer(vcpu, data); 1153 return set_efer(vcpu, data);
1091 break;
1092 case MSR_K7_HWCR: 1154 case MSR_K7_HWCR:
1093 data &= ~(u64)0x40; /* ignore flush filter disable */ 1155 data &= ~(u64)0x40; /* ignore flush filter disable */
1156 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1094 if (data != 0) { 1157 if (data != 0) {
1095 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1158 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1096 data); 1159 data);
@@ -1133,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1133 case MSR_IA32_MISC_ENABLE: 1196 case MSR_IA32_MISC_ENABLE:
1134 vcpu->arch.ia32_misc_enable_msr = data; 1197 vcpu->arch.ia32_misc_enable_msr = data;
1135 break; 1198 break;
1199 case MSR_KVM_WALL_CLOCK_NEW:
1136 case MSR_KVM_WALL_CLOCK: 1200 case MSR_KVM_WALL_CLOCK:
1137 vcpu->kvm->arch.wall_clock = data; 1201 vcpu->kvm->arch.wall_clock = data;
1138 kvm_write_wall_clock(vcpu->kvm, data); 1202 kvm_write_wall_clock(vcpu->kvm, data);
1139 break; 1203 break;
1204 case MSR_KVM_SYSTEM_TIME_NEW:
1140 case MSR_KVM_SYSTEM_TIME: { 1205 case MSR_KVM_SYSTEM_TIME: {
1141 if (vcpu->arch.time_page) { 1206 if (vcpu->arch.time_page) {
1142 kvm_release_page_dirty(vcpu->arch.time_page); 1207 kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1408,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1408 data = vcpu->arch.efer; 1473 data = vcpu->arch.efer;
1409 break; 1474 break;
1410 case MSR_KVM_WALL_CLOCK: 1475 case MSR_KVM_WALL_CLOCK:
1476 case MSR_KVM_WALL_CLOCK_NEW:
1411 data = vcpu->kvm->arch.wall_clock; 1477 data = vcpu->kvm->arch.wall_clock;
1412 break; 1478 break;
1413 case MSR_KVM_SYSTEM_TIME: 1479 case MSR_KVM_SYSTEM_TIME:
1480 case MSR_KVM_SYSTEM_TIME_NEW:
1414 data = vcpu->arch.time; 1481 data = vcpu->arch.time;
1415 break; 1482 break;
1416 case MSR_IA32_P5_MC_ADDR: 1483 case MSR_IA32_P5_MC_ADDR:
@@ -1549,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1549 case KVM_CAP_HYPERV_VAPIC: 1616 case KVM_CAP_HYPERV_VAPIC:
1550 case KVM_CAP_HYPERV_SPIN: 1617 case KVM_CAP_HYPERV_SPIN:
1551 case KVM_CAP_PCI_SEGMENT: 1618 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS:
1552 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1620 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1553 r = 1; 1621 r = 1;
1554 break; 1622 break;
@@ -1769,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1769{ 1837{
1770 int r; 1838 int r;
1771 1839
1840 vcpu_load(vcpu);
1772 r = -E2BIG; 1841 r = -E2BIG;
1773 if (cpuid->nent < vcpu->arch.cpuid_nent) 1842 if (cpuid->nent < vcpu->arch.cpuid_nent)
1774 goto out; 1843 goto out;
@@ -1780,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1780 1849
1781out: 1850out:
1782 cpuid->nent = vcpu->arch.cpuid_nent; 1851 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1783 return r; 1853 return r;
1784} 1854}
1785 1855
@@ -1910,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1910 } 1980 }
1911 break; 1981 break;
1912 } 1982 }
1983 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature;
1986 entry->eax = 0;
1987 entry->ebx = sigptr[0];
1988 entry->ecx = sigptr[1];
1989 entry->edx = sigptr[2];
1990 break;
1991 }
1992 case KVM_CPUID_FEATURES:
1993 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
1994 (1 << KVM_FEATURE_NOP_IO_DELAY) |
1995 (1 << KVM_FEATURE_CLOCKSOURCE2) |
1996 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
1997 entry->ebx = 0;
1998 entry->ecx = 0;
1999 entry->edx = 0;
2000 break;
1913 case 0x80000000: 2001 case 0x80000000:
1914 entry->eax = min(entry->eax, 0x8000001a); 2002 entry->eax = min(entry->eax, 0x8000001a);
1915 break; 2003 break;
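The new KVM_CPUID_SIGNATURE and KVM_CPUID_FEATURES entries let a guest identify the hypervisor and its kvmclock capabilities from CPUID alone. A guest-side sketch; the leaf numbers 0x40000000 and 0x40000001 are the conventional KVM values and are an assumption here, since the hunk only names the constants:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
	{
		__asm__ volatile("cpuid"
				 : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
				 : "a"(leaf), "c"(0));
	}

	int main(void)
	{
		uint32_t a, b, c, d;
		char sig[13];

		/* Assumed leaf numbers: KVM_CPUID_SIGNATURE = 0x40000000,
		 * KVM_CPUID_FEATURES = 0x40000001. */
		cpuid(0x40000000, &a, &b, &c, &d);
		memcpy(sig + 0, &b, 4);
		memcpy(sig + 4, &c, 4);
		memcpy(sig + 8, &d, 4);
		sig[12] = '\0';

		if (strcmp(sig, "KVMKVMKVM") != 0) {
			puts("not running on KVM");
			return 1;
		}

		/* eax holds the KVM_FEATURE_* bits advertised above (clocksource,
		 * nop-io-delay, the new clocksource2 MSRs, clocksource-stable). */
		cpuid(0x40000001, &a, &b, &c, &d);
		printf("KVM feature bits: %#x\n", a);
		return 0;
	}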
@@ -1918,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1918 entry->ecx &= kvm_supported_word6_x86_features; 2006 entry->ecx &= kvm_supported_word6_x86_features;
1919 break; 2007 break;
1920 } 2008 }
2009
2010 kvm_x86_ops->set_supported_cpuid(function, entry);
2011
1921 put_cpu(); 2012 put_cpu();
1922} 2013}
1923 2014
@@ -1953,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1953 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 2044 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1954 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2045 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1955 &nent, cpuid->nent); 2046 &nent, cpuid->nent);
2047
2048
2049
2050 r = -E2BIG;
2051 if (nent >= cpuid->nent)
2052 goto out_free;
2053
2054 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2055 cpuid->nent);
2056
2057 r = -E2BIG;
2058 if (nent >= cpuid->nent)
2059 goto out_free;
2060
2061 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2062 cpuid->nent);
2063
1956 r = -E2BIG; 2064 r = -E2BIG;
1957 if (nent >= cpuid->nent) 2065 if (nent >= cpuid->nent)
1958 goto out_free; 2066 goto out_free;
@@ -2032,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2032 int r; 2140 int r;
2033 unsigned bank_num = mcg_cap & 0xff, bank; 2141 unsigned bank_num = mcg_cap & 0xff, bank;
2034 2142
2143 vcpu_load(vcpu);
2035 r = -EINVAL; 2144 r = -EINVAL;
2036 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2037 goto out; 2146 goto out;
@@ -2046,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2046 for (bank = 0; bank < bank_num; bank++) 2155 for (bank = 0; bank < bank_num; bank++)
2047 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2048out: 2157out:
2158 vcpu_put(vcpu);
2049 return r; 2159 return r;
2050} 2160}
2051 2161
@@ -2105,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2105{ 2215{
2106 vcpu_load(vcpu); 2216 vcpu_load(vcpu);
2107 2217
2108 events->exception.injected = vcpu->arch.exception.pending; 2218 events->exception.injected =
2219 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2109 events->exception.nr = vcpu->arch.exception.nr; 2221 events->exception.nr = vcpu->arch.exception.nr;
2110 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2222 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2111 events->exception.error_code = vcpu->arch.exception.error_code; 2223 events->exception.error_code = vcpu->arch.exception.error_code;
2112 2224
2113 events->interrupt.injected = vcpu->arch.interrupt.pending; 2225 events->interrupt.injected =
2226 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2114 events->interrupt.nr = vcpu->arch.interrupt.nr; 2227 events->interrupt.nr = vcpu->arch.interrupt.nr;
2115 events->interrupt.soft = vcpu->arch.interrupt.soft; 2228 events->interrupt.soft = 0;
2229 events->interrupt.shadow =
2230 kvm_x86_ops->get_interrupt_shadow(vcpu,
2231 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2116 2232
2117 events->nmi.injected = vcpu->arch.nmi_injected; 2233 events->nmi.injected = vcpu->arch.nmi_injected;
2118 events->nmi.pending = vcpu->arch.nmi_pending; 2234 events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2121,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2121 events->sipi_vector = vcpu->arch.sipi_vector; 2237 events->sipi_vector = vcpu->arch.sipi_vector;
2122 2238
2123 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2124 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW);
2125 2242
2126 vcpu_put(vcpu); 2243 vcpu_put(vcpu);
2127} 2244}
@@ -2130,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2130 struct kvm_vcpu_events *events) 2247 struct kvm_vcpu_events *events)
2131{ 2248{
2132 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2249 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2133 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 2250 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2251 | KVM_VCPUEVENT_VALID_SHADOW))
2134 return -EINVAL; 2252 return -EINVAL;
2135 2253
2136 vcpu_load(vcpu); 2254 vcpu_load(vcpu);
@@ -2145,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2145 vcpu->arch.interrupt.soft = events->interrupt.soft; 2263 vcpu->arch.interrupt.soft = events->interrupt.soft;
2146 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2264 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2147 kvm_pic_clear_isr_ack(vcpu->kvm); 2265 kvm_pic_clear_isr_ack(vcpu->kvm);
2266 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2267 kvm_x86_ops->set_interrupt_shadow(vcpu,
2268 events->interrupt.shadow);
2148 2269
2149 vcpu->arch.nmi_injected = events->nmi.injected; 2270 vcpu->arch.nmi_injected = events->nmi.injected;
2150 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2271 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2159,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2159 return 0; 2280 return 0;
2160} 2281}
2161 2282
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs)
2285{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294}
2295
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2297 struct kvm_debugregs *dbgregs)
2298{
2299 if (dbgregs->flags)
2300 return -EINVAL;
2301
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7;
2307
2308 vcpu_put(vcpu);
2309
2310 return 0;
2311}
2312
2162long kvm_arch_vcpu_ioctl(struct file *filp, 2313long kvm_arch_vcpu_ioctl(struct file *filp,
2163 unsigned int ioctl, unsigned long arg) 2314 unsigned int ioctl, unsigned long arg)
2164{ 2315{
@@ -2313,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2313 r = -EFAULT; 2464 r = -EFAULT;
2314 if (copy_from_user(&mce, argp, sizeof mce)) 2465 if (copy_from_user(&mce, argp, sizeof mce))
2315 goto out; 2466 goto out;
2467 vcpu_load(vcpu);
2316 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2317 break; 2470 break;
2318 } 2471 }
2319 case KVM_GET_VCPU_EVENTS: { 2472 case KVM_GET_VCPU_EVENTS: {
@@ -2337,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2337 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2490 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2338 break; 2491 break;
2339 } 2492 }
2493 case KVM_GET_DEBUGREGS: {
2494 struct kvm_debugregs dbgregs;
2495
2496 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2497
2498 r = -EFAULT;
2499 if (copy_to_user(argp, &dbgregs,
2500 sizeof(struct kvm_debugregs)))
2501 break;
2502 r = 0;
2503 break;
2504 }
2505 case KVM_SET_DEBUGREGS: {
2506 struct kvm_debugregs dbgregs;
2507
2508 r = -EFAULT;
2509 if (copy_from_user(&dbgregs, argp,
2510 sizeof(struct kvm_debugregs)))
2511 break;
2512
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break;
2515 }
2340 default: 2516 default:
2341 r = -EINVAL; 2517 r = -EINVAL;
2342 } 2518 }
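KVM_GET_DEBUGREGS and KVM_SET_DEBUGREGS expose db[0..3], dr6, dr7 and a flags word to userspace in one struct, with flags required to be zero on the set path. A hedged userspace sketch, assuming a <linux/kvm.h> new enough to carry the ioctl numbers and struct kvm_debugregs, and a vcpu fd obtained from KVM_CREATE_VCPU:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Dump and then clear a vcpu's debug registers through the new ioctls.
	 * vcpu_fd must come from KVM_CREATE_VCPU; error handling is minimal. */
	static int reset_guest_debugregs(int vcpu_fd)
	{
		struct kvm_debugregs dbg;

		if (ioctl(vcpu_fd, KVM_GET_DEBUGREGS, &dbg) < 0) {
			perror("KVM_GET_DEBUGREGS");
			return -1;
		}
		printf("dr6=%#llx dr7=%#llx\n",
		       (unsigned long long)dbg.dr6, (unsigned long long)dbg.dr7);

		memset(&dbg, 0, sizeof(dbg));	/* flags must stay zero on set */
		if (ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &dbg) < 0) {
			perror("KVM_SET_DEBUGREGS");
			return -1;
		}
		return 0;
	}

Wired into an existing launcher right after vcpu creation, this clears any stale breakpoint state before the first KVM_RUN.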
@@ -2390,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2390 struct kvm_mem_alias *alias; 2566 struct kvm_mem_alias *alias;
2391 struct kvm_mem_aliases *aliases; 2567 struct kvm_mem_aliases *aliases;
2392 2568
2393 aliases = rcu_dereference(kvm->arch.aliases); 2569 aliases = kvm_aliases(kvm);
2394 2570
2395 for (i = 0; i < aliases->naliases; ++i) { 2571 for (i = 0; i < aliases->naliases; ++i) {
2396 alias = &aliases->aliases[i]; 2572 alias = &aliases->aliases[i];
@@ -2409,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2409 struct kvm_mem_alias *alias; 2585 struct kvm_mem_alias *alias;
2410 struct kvm_mem_aliases *aliases; 2586 struct kvm_mem_aliases *aliases;
2411 2587
2412 aliases = rcu_dereference(kvm->arch.aliases); 2588 aliases = kvm_aliases(kvm);
2413 2589
2414 for (i = 0; i < aliases->naliases; ++i) { 2590 for (i = 0; i < aliases->naliases; ++i) {
2415 alias = &aliases->aliases[i]; 2591 alias = &aliases->aliases[i];
@@ -2804,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
2804 r = -EFAULT; 2980 r = -EFAULT;
2805 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2981 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2806 goto out; 2982 goto out;
2983 r = -ENXIO;
2807 if (irqchip_in_kernel(kvm)) { 2984 if (irqchip_in_kernel(kvm)) {
2808 __s32 status; 2985 __s32 status;
2809 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2986 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2810 irq_event.irq, irq_event.level); 2987 irq_event.irq, irq_event.level);
2811 if (ioctl == KVM_IRQ_LINE_STATUS) { 2988 if (ioctl == KVM_IRQ_LINE_STATUS) {
2989 r = -EFAULT;
2812 irq_event.status = status; 2990 irq_event.status = status;
2813 if (copy_to_user(argp, &irq_event, 2991 if (copy_to_user(argp, &irq_event,
2814 sizeof irq_event)) 2992 sizeof irq_event))
@@ -3024,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3024 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3202 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3025} 3203}
3026 3204
3205static void kvm_set_segment(struct kvm_vcpu *vcpu,
3206 struct kvm_segment *var, int seg)
3207{
3208 kvm_x86_ops->set_segment(vcpu, var, seg);
3209}
3210
3211void kvm_get_segment(struct kvm_vcpu *vcpu,
3212 struct kvm_segment *var, int seg)
3213{
3214 kvm_x86_ops->get_segment(vcpu, var, seg);
3215}
3216
3027gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3217gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3028{ 3218{
3029 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3219 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3104,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3104 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3294 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3105} 3295}
3106 3296
3107static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3297static int kvm_write_guest_virt_system(gva_t addr, void *val,
3108 struct kvm_vcpu *vcpu, u32 *error) 3298 unsigned int bytes,
3299 struct kvm_vcpu *vcpu,
3300 u32 *error)
3109{ 3301{
3110 void *data = val; 3302 void *data = val;
3111 int r = X86EMUL_CONTINUE; 3303 int r = X86EMUL_CONTINUE;
3112 3304
3113 while (bytes) { 3305 while (bytes) {
3114 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); 3306 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3307 PFERR_WRITE_MASK, error);
3115 unsigned offset = addr & (PAGE_SIZE-1); 3308 unsigned offset = addr & (PAGE_SIZE-1);
3116 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3309 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3117 int ret; 3310 int ret;
@@ -3134,7 +3327,6 @@ out:
3134 return r; 3327 return r;
3135} 3328}
3136 3329
3137
3138static int emulator_read_emulated(unsigned long addr, 3330static int emulator_read_emulated(unsigned long addr,
3139 void *val, 3331 void *val,
3140 unsigned int bytes, 3332 unsigned int bytes,
@@ -3237,9 +3429,9 @@ mmio:
3237} 3429}
3238 3430
3239int emulator_write_emulated(unsigned long addr, 3431int emulator_write_emulated(unsigned long addr,
3240 const void *val, 3432 const void *val,
3241 unsigned int bytes, 3433 unsigned int bytes,
3242 struct kvm_vcpu *vcpu) 3434 struct kvm_vcpu *vcpu)
3243{ 3435{
3244 /* Crossing a page boundary? */ 3436 /* Crossing a page boundary? */
3245 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3437 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3257,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr,
3257} 3449}
3258EXPORT_SYMBOL_GPL(emulator_write_emulated); 3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3259 3451
3452#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3454
3455#ifdef CONFIG_X86_64
3456# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3457#else
3458# define CMPXCHG64(ptr, old, new) \
3459 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3460#endif
3461
3260static int emulator_cmpxchg_emulated(unsigned long addr, 3462static int emulator_cmpxchg_emulated(unsigned long addr,
3261 const void *old, 3463 const void *old,
3262 const void *new, 3464 const void *new,
3263 unsigned int bytes, 3465 unsigned int bytes,
3264 struct kvm_vcpu *vcpu) 3466 struct kvm_vcpu *vcpu)
3265{ 3467{
3266 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3468 gpa_t gpa;
3267#ifndef CONFIG_X86_64 3469 struct page *page;
3268 /* guests cmpxchg8b have to be emulated atomically */ 3470 char *kaddr;
3269 if (bytes == 8) { 3471 bool exchanged;
3270 gpa_t gpa;
3271 struct page *page;
3272 char *kaddr;
3273 u64 val;
3274 3472
3275 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3473 /* guests cmpxchg8b have to be emulated atomically */
3474 if (bytes > 8 || (bytes & (bytes - 1)))
3475 goto emul_write;
3276 3476
3277 if (gpa == UNMAPPED_GVA || 3477 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3278 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3279 goto emul_write;
3280 3478
3281 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3479 if (gpa == UNMAPPED_GVA ||
3282 goto emul_write; 3480 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3481 goto emul_write;
3283 3482
3284 val = *(u64 *)new; 3483 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3484 goto emul_write;
3285 3485
3286 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3287 3487
3288 kaddr = kmap_atomic(page, KM_USER0); 3488 kaddr = kmap_atomic(page, KM_USER0);
3289 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 3489 kaddr += offset_in_page(gpa);
3290 kunmap_atomic(kaddr, KM_USER0); 3490 switch (bytes) {
3291 kvm_release_page_dirty(page); 3491 case 1:
3492 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3493 break;
3494 case 2:
3495 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3496 break;
3497 case 4:
3498 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3499 break;
3500 case 8:
3501 exchanged = CMPXCHG64(kaddr, old, new);
3502 break;
3503 default:
3504 BUG();
3292 } 3505 }
3506 kunmap_atomic(kaddr, KM_USER0);
3507 kvm_release_page_dirty(page);
3508
3509 if (!exchanged)
3510 return X86EMUL_CMPXCHG_FAILED;
3511
3512 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3513
3514 return X86EMUL_CONTINUE;
3515
3293emul_write: 3516emul_write:
3294#endif 3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3295 3518
3296 return emulator_write_emulated(addr, new, bytes, vcpu); 3519 return emulator_write_emulated(addr, new, bytes, vcpu);
3297} 3520}
3298 3521
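emulator_cmpxchg_emulated() now performs a genuine atomic compare-and-exchange on the mapped guest page, dispatching on the operand size (1, 2, 4 or 8 bytes, rejected unless it is a power of two) and falling back to a plain emulated write only for unmapped GPAs, page-crossing accesses or the local APIC page. The size dispatch translates directly to compiler builtins in user space; a sketch using the GCC/Clang __sync builtins in place of the kernel's cmpxchg():

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * Compare-and-exchange "bytes" bytes at kaddr: succeeds only if the
	 * current value equals *old, in which case *new_val is stored
	 * atomically. Mirrors the CMPXCHG_TYPE()/CMPXCHG64() dispatch above.
	 */
	static bool cmpxchg_sized(void *kaddr, const void *old, const void *new_val,
				  unsigned int bytes)
	{
		if (bytes > 8 || (bytes & (bytes - 1)))
			return false;		/* only power-of-two sizes up to 8 */

		switch (bytes) {
		case 1:
			return __sync_bool_compare_and_swap((uint8_t *)kaddr,
					*(const uint8_t *)old, *(const uint8_t *)new_val);
		case 2:
			return __sync_bool_compare_and_swap((uint16_t *)kaddr,
					*(const uint16_t *)old, *(const uint16_t *)new_val);
		case 4:
			return __sync_bool_compare_and_swap((uint32_t *)kaddr,
					*(const uint32_t *)old, *(const uint32_t *)new_val);
		default:
			return __sync_bool_compare_and_swap((uint64_t *)kaddr,
					*(const uint64_t *)old, *(const uint64_t *)new_val);
		}
	}

	int main(void)
	{
		uint32_t word = 0x1234, old = 0x1234, new_val = 0x5678;

		return (cmpxchg_sized(&word, &old, &new_val, 4) && word == 0x5678) ? 0 : 1;
	}

On failure the emulator now gets X86EMUL_CMPXCHG_FAILED back instead of the instruction silently degrading into a store.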
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3523{
3524 /* TODO: String I/O for in kernel device */
3525 int r;
3526
3527 if (vcpu->arch.pio.in)
3528 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3529 vcpu->arch.pio.size, pd);
3530 else
3531 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3532 vcpu->arch.pio.port, vcpu->arch.pio.size,
3533 pd);
3534 return r;
3535}
3536
3537
3538static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3539 unsigned int count, struct kvm_vcpu *vcpu)
3540{
3541 if (vcpu->arch.pio.count)
3542 goto data_avail;
3543
3544 trace_kvm_pio(1, port, size, 1);
3545
3546 vcpu->arch.pio.port = port;
3547 vcpu->arch.pio.in = 1;
3548 vcpu->arch.pio.count = count;
3549 vcpu->arch.pio.size = size;
3550
3551 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3552 data_avail:
3553 memcpy(val, vcpu->arch.pio_data, size * count);
3554 vcpu->arch.pio.count = 0;
3555 return 1;
3556 }
3557
3558 vcpu->run->exit_reason = KVM_EXIT_IO;
3559 vcpu->run->io.direction = KVM_EXIT_IO_IN;
3560 vcpu->run->io.size = size;
3561 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3562 vcpu->run->io.count = count;
3563 vcpu->run->io.port = port;
3564
3565 return 0;
3566}
3567
3568static int emulator_pio_out_emulated(int size, unsigned short port,
3569 const void *val, unsigned int count,
3570 struct kvm_vcpu *vcpu)
3571{
3572 trace_kvm_pio(0, port, size, 1);
3573
3574 vcpu->arch.pio.port = port;
3575 vcpu->arch.pio.in = 0;
3576 vcpu->arch.pio.count = count;
3577 vcpu->arch.pio.size = size;
3578
3579 memcpy(vcpu->arch.pio_data, val, size * count);
3580
3581 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3582 vcpu->arch.pio.count = 0;
3583 return 1;
3584 }
3585
3586 vcpu->run->exit_reason = KVM_EXIT_IO;
3587 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3588 vcpu->run->io.size = size;
3589 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3590 vcpu->run->io.count = count;
3591 vcpu->run->io.port = port;
3592
3593 return 0;
3594}
3595
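When no in-kernel device claims the port, emulator_pio_in_emulated()/emulator_pio_out_emulated() fill in vcpu->run->io and exit to userspace with KVM_EXIT_IO; the bytes travel through the shared kvm_run mapping at io.data_offset. A sketch of the matching userspace loop, assuming an mmap'ed kvm_run as in any KVM launcher (the port 0x3f8 serial-console handling is purely illustrative):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <linux/kvm.h>

	/* Handle one KVM_EXIT_IO exit: the in-flight bytes live in the shared
	 * kvm_run mapping at io.data_offset, io.count accesses of io.size bytes. */
	static void handle_io_exit(struct kvm_run *run)
	{
		uint8_t *data = (uint8_t *)run + run->io.data_offset;
		uint32_t i;

		for (i = 0; i < run->io.count; i++, data += run->io.size) {
			if (run->io.direction == KVM_EXIT_IO_OUT &&
			    run->io.port == 0x3f8 && run->io.size == 1)
				putchar(*data);			/* toy serial console */
			else if (run->io.direction == KVM_EXIT_IO_IN)
				memset(data, 0xff, run->io.size);	/* float the bus */
		}
	}

On the next KVM_RUN the emulator resumes and, for an IN, picks the bytes up from the same page via the data_avail path in emulator_pio_in_emulated() above.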
3299static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3596static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3300{ 3597{
3301 return kvm_x86_ops->get_segment_base(vcpu, seg); 3598 return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3316,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
3316 3613
3317int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3318{ 3615{
3319 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); 3616 return kvm_get_dr(ctxt->vcpu, dr, dest);
3320} 3617}
3321 3618
3322int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3323{ 3620{
3324 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3325 3622
3326 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); 3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3327} 3624}
3328 3625
3329void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3344,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3344} 3641}
3345EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3346 3643
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{
3646 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3647}
3648
3649static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3650{
3651 unsigned long value;
3652
3653 switch (cr) {
3654 case 0:
3655 value = kvm_read_cr0(vcpu);
3656 break;
3657 case 2:
3658 value = vcpu->arch.cr2;
3659 break;
3660 case 3:
3661 value = vcpu->arch.cr3;
3662 break;
3663 case 4:
3664 value = kvm_read_cr4(vcpu);
3665 break;
3666 case 8:
3667 value = kvm_get_cr8(vcpu);
3668 break;
3669 default:
3670 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3671 return 0;
3672 }
3673
3674 return value;
3675}
3676
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{
3679 switch (cr) {
3680 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break;
3683 case 2:
3684 vcpu->arch.cr2 = val;
3685 break;
3686 case 3:
3687 kvm_set_cr3(vcpu, val);
3688 break;
3689 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break;
3692 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL);
3694 break;
3695 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3697 }
3698}
3699
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3701{
3702 return kvm_x86_ops->get_cpl(vcpu);
3703}
3704
3705static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3706{
3707 kvm_x86_ops->get_gdt(vcpu, dt);
3708}
3709
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu)
3712{
3713 struct kvm_segment var;
3714
3715 kvm_get_segment(vcpu, &var, seg);
3716
3717 if (var.unusable)
3718 return false;
3719
3720 if (var.g)
3721 var.limit >>= 12;
3722 set_desc_limit(desc, var.limit);
3723 set_desc_base(desc, (unsigned long)var.base);
3724 desc->type = var.type;
3725 desc->s = var.s;
3726 desc->dpl = var.dpl;
3727 desc->p = var.present;
3728 desc->avl = var.avl;
3729 desc->l = var.l;
3730 desc->d = var.db;
3731 desc->g = var.g;
3732
3733 return true;
3734}
3735
3736static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3737 struct kvm_vcpu *vcpu)
3738{
3739 struct kvm_segment var;
3740
3741 /* needed to preserve selector */
3742 kvm_get_segment(vcpu, &var, seg);
3743
3744 var.base = get_desc_base(desc);
3745 var.limit = get_desc_limit(desc);
3746 if (desc->g)
3747 var.limit = (var.limit << 12) | 0xfff;
3748 var.type = desc->type;
3749 var.present = desc->p;
3750 var.dpl = desc->dpl;
3751 var.db = desc->d;
3752 var.s = desc->s;
3753 var.l = desc->l;
3754 var.g = desc->g;
3755 var.avl = desc->avl;
3756 var.present = desc->p;
3757 var.unusable = !var.present;
3758 var.padding = 0;
3759
3760 kvm_set_segment(vcpu, &var, seg);
3761 return;
3762}
3763
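The two cached-descriptor callbacks translate between struct kvm_segment, which carries a byte-granular 32-bit limit, and the architectural descriptor, which has only a 20-bit limit plus the granularity bit. A small sketch of that round trip; note that the emulator code simply preserves whatever G bit it is handed, so the size-based choice below is only for illustration:

	#include <stdint.h>
	#include <stdio.h>

	/* Descriptor -> byte limit: with G set, the 20-bit limit counts 4K pages. */
	static uint32_t desc_limit_to_bytes(uint32_t limit20, int g)
	{
		return g ? (limit20 << 12) | 0xfff : limit20;
	}

	/* Byte limit -> descriptor limit: page-granular when it no longer fits
	 * in 20 bits, dropping the low 12 bits just as the G-bit path above does. */
	static uint32_t byte_limit_to_desc(uint32_t bytes, int *g)
	{
		if (bytes > 0xfffff) {
			*g = 1;
			return bytes >> 12;
		}
		*g = 0;
		return bytes;
	}

	int main(void)
	{
		int g;
		uint32_t limit = byte_limit_to_desc(0xffffffff, &g);	/* flat 4 GiB segment */

		printf("desc limit=%#x g=%d -> %#x bytes\n",
		       limit, g, desc_limit_to_bytes(limit, g));
		return 0;
	}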
3764static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3765{
3766 struct kvm_segment kvm_seg;
3767
3768 kvm_get_segment(vcpu, &kvm_seg, seg);
3769 return kvm_seg.selector;
3770}
3771
3772static void emulator_set_segment_selector(u16 sel, int seg,
3773 struct kvm_vcpu *vcpu)
3774{
3775 struct kvm_segment kvm_seg;
3776
3777 kvm_get_segment(vcpu, &kvm_seg, seg);
3778 kvm_seg.selector = sel;
3779 kvm_set_segment(vcpu, &kvm_seg, seg);
3780}
3781
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3347static struct x86_emulate_ops emulate_ops = { 3787static struct x86_emulate_ops emulate_ops = {
3348 .read_std = kvm_read_guest_virt_system, 3788 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system,
3349 .fetch = kvm_fetch_guest_virt, 3790 .fetch = kvm_fetch_guest_virt,
3350 .read_emulated = emulator_read_emulated, 3791 .read_emulated = emulator_read_emulated,
3351 .write_emulated = emulator_write_emulated, 3792 .write_emulated = emulator_write_emulated,
3352 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3793 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3794 .pio_in_emulated = emulator_pio_in_emulated,
3795 .pio_out_emulated = emulator_pio_out_emulated,
3796 .get_cached_descriptor = emulator_get_cached_descriptor,
3797 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector,
3800 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags,
3353}; 3805};
3354 3806
3355static void cache_all_regs(struct kvm_vcpu *vcpu) 3807static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3380,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3380 cache_all_regs(vcpu); 3832 cache_all_regs(vcpu);
3381 3833
3382 vcpu->mmio_is_write = 0; 3834 vcpu->mmio_is_write = 0;
3383 vcpu->arch.pio.string = 0;
3384 3835
3385 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3386 int cs_db, cs_l; 3837 int cs_db, cs_l;
3387 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3388 3839
3389 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3840 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3390 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3841 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3842 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3391 vcpu->arch.emulate_ctxt.mode = 3843 vcpu->arch.emulate_ctxt.mode =
3392 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3844 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3393 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3845 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3396,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3396 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3397 3849
3398 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu);
3399 3852
3400 /* Only allow emulation of specific instructions on #UD 3853 /* Only allow emulation of specific instructions on #UD
3401 * (namely VMMCALL, sysenter, sysexit, syscall) */ 3854 * (namely VMMCALL, sysenter, sysexit, syscall) */
@@ -3428,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3428 ++vcpu->stat.insn_emulation; 3881 ++vcpu->stat.insn_emulation;
3429 if (r) { 3882 if (r) {
3430 ++vcpu->stat.insn_emulation_fail; 3883 ++vcpu->stat.insn_emulation_fail;
3884 trace_kvm_emulate_insn_failed(vcpu);
3431 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3432 return EMULATE_DONE; 3886 return EMULATE_DONE;
3433 return EMULATE_FAIL; 3887 return EMULATE_FAIL;
@@ -3439,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3439 return EMULATE_DONE; 3893 return EMULATE_DONE;
3440 } 3894 }
3441 3895
3896restart:
3442 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3443 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3444 3899
3445 if (r == 0) 3900 if (r == 0)
3446 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3447 3902
3448 if (vcpu->arch.pio.string) 3903 if (vcpu->arch.pio.count) {
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3449 return EMULATE_DO_MMIO; 3906 return EMULATE_DO_MMIO;
3907 }
3450 3908
3451 if ((r || vcpu->mmio_is_write) && run) { 3909 if (r || vcpu->mmio_is_write) {
3452 run->exit_reason = KVM_EXIT_MMIO; 3910 run->exit_reason = KVM_EXIT_MMIO;
3453 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3911 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3454 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3912 memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3458,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3458 3916
3459 if (r) { 3917 if (r) {
3460 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3461 return EMULATE_DONE; 3919 goto done;
3462 if (!vcpu->mmio_needed) { 3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3463 kvm_report_emulation_failure(vcpu, "mmio"); 3923 kvm_report_emulation_failure(vcpu, "mmio");
3464 return EMULATE_FAIL; 3924 return EMULATE_FAIL;
3465 } 3925 }
3466 return EMULATE_DO_MMIO; 3926 return EMULATE_DO_MMIO;
3467 } 3927 }
3468 3928
3469 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3470
3471 if (vcpu->mmio_is_write) { 3929 if (vcpu->mmio_is_write) {
3472 vcpu->mmio_needed = 0; 3930 vcpu->mmio_needed = 0;
3473 return EMULATE_DO_MMIO; 3931 return EMULATE_DO_MMIO;
3474 } 3932 }
3475 3933
3476 return EMULATE_DONE; 3934done:
3477} 3935 if (vcpu->arch.exception.pending)
3478EXPORT_SYMBOL_GPL(emulate_instruction); 3936 vcpu->arch.emulate_ctxt.restart = false;
3479
3480static int pio_copy_data(struct kvm_vcpu *vcpu)
3481{
3482 void *p = vcpu->arch.pio_data;
3483 gva_t q = vcpu->arch.pio.guest_gva;
3484 unsigned bytes;
3485 int ret;
3486 u32 error_code;
3487
3488 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3489 if (vcpu->arch.pio.in)
3490 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3491 else
3492 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3493
3494 if (ret == X86EMUL_PROPAGATE_FAULT)
3495 kvm_inject_page_fault(vcpu, q, error_code);
3496
3497 return ret;
3498}
3499
3500int complete_pio(struct kvm_vcpu *vcpu)
3501{
3502 struct kvm_pio_request *io = &vcpu->arch.pio;
3503 long delta;
3504 int r;
3505 unsigned long val;
3506
3507 if (!io->string) {
3508 if (io->in) {
3509 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3510 memcpy(&val, vcpu->arch.pio_data, io->size);
3511 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3512 }
3513 } else {
3514 if (io->in) {
3515 r = pio_copy_data(vcpu);
3516 if (r)
3517 goto out;
3518 }
3519
3520 delta = 1;
3521 if (io->rep) {
3522 delta *= io->cur_count;
3523 /*
3524 * The size of the register should really depend on
3525 * current address size.
3526 */
3527 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3528 val -= delta;
3529 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3530 }
3531 if (io->down)
3532 delta = -delta;
3533 delta *= io->size;
3534 if (io->in) {
3535 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3536 val += delta;
3537 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3538 } else {
3539 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3540 val += delta;
3541 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3542 }
3543 }
3544out:
3545 io->count -= io->cur_count;
3546 io->cur_count = 0;
3547
3548 return 0;
3549}
3550
3551static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3552{
3553 /* TODO: string I/O for in-kernel devices */
3554 int r;
3555
3556 if (vcpu->arch.pio.in)
3557 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3558 vcpu->arch.pio.size, pd);
3559 else
3560 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3561 vcpu->arch.pio.port, vcpu->arch.pio.size,
3562 pd);
3563 return r;
3564}
3565 3937
3566static int pio_string_write(struct kvm_vcpu *vcpu) 3938 if (vcpu->arch.emulate_ctxt.restart)
3567{ 3939 goto restart;
3568 struct kvm_pio_request *io = &vcpu->arch.pio;
3569 void *pd = vcpu->arch.pio_data;
3570 int i, r = 0;
3571 3940
3572 for (i = 0; i < io->cur_count; i++) { 3941 return EMULATE_DONE;
3573 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3574 io->port, io->size, pd)) {
3575 r = -EOPNOTSUPP;
3576 break;
3577 }
3578 pd += io->size;
3579 }
3580 return r;
3581}
3582
3583int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3584{
3585 unsigned long val;
3586
3587 trace_kvm_pio(!in, port, size, 1);
3588
3589 vcpu->run->exit_reason = KVM_EXIT_IO;
3590 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3591 vcpu->run->io.size = vcpu->arch.pio.size = size;
3592 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3593 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3594 vcpu->run->io.port = vcpu->arch.pio.port = port;
3595 vcpu->arch.pio.in = in;
3596 vcpu->arch.pio.string = 0;
3597 vcpu->arch.pio.down = 0;
3598 vcpu->arch.pio.rep = 0;
3599
3600 if (!vcpu->arch.pio.in) {
3601 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3602 memcpy(vcpu->arch.pio_data, &val, 4);
3603 }
3604
3605 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3606 complete_pio(vcpu);
3607 return 1;
3608 }
3609 return 0;
3610} 3942}
3611EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3943EXPORT_SYMBOL_GPL(emulate_instruction);
3612 3944
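The restart handling in emulate_instruction() above boils down to the loop sketched here: a pending exception clears emulate_ctxt.restart, otherwise a set restart flag (a string instruction or other multi-pass operation) jumps back to the restart: label; MMIO and PIO cases return to userspace before the loop is re-entered. Outline only, not code from the patch:

	do {
		r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
		/* EMULATE_DO_MMIO / PIO exits return before looping */
		if (vcpu->arch.exception.pending)
			vcpu->arch.emulate_ctxt.restart = false;
	} while (vcpu->arch.emulate_ctxt.restart);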
3613int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3945int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3614 int size, unsigned long count, int down,
3615 gva_t address, int rep, unsigned port)
3616{ 3946{
3617 unsigned now, in_page; 3947 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3618 int ret = 0; 3948 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3619 3949 /* do not return to emulator after return from userspace */
3620 trace_kvm_pio(!in, port, size, count); 3950 vcpu->arch.pio.count = 0;
3621
3622 vcpu->run->exit_reason = KVM_EXIT_IO;
3623 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3624 vcpu->run->io.size = vcpu->arch.pio.size = size;
3625 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3626 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3627 vcpu->run->io.port = vcpu->arch.pio.port = port;
3628 vcpu->arch.pio.in = in;
3629 vcpu->arch.pio.string = 1;
3630 vcpu->arch.pio.down = down;
3631 vcpu->arch.pio.rep = rep;
3632
3633 if (!count) {
3634 kvm_x86_ops->skip_emulated_instruction(vcpu);
3635 return 1;
3636 }
3637
3638 if (!down)
3639 in_page = PAGE_SIZE - offset_in_page(address);
3640 else
3641 in_page = offset_in_page(address) + size;
3642 now = min(count, (unsigned long)in_page / size);
3643 if (!now)
3644 now = 1;
3645 if (down) {
3646 /*
3647 * String I/O in reverse. Yuck. Kill the guest, fix later.
3648 */
3649 pr_unimpl(vcpu, "guest string pio down\n");
3650 kvm_inject_gp(vcpu, 0);
3651 return 1;
3652 }
3653 vcpu->run->io.count = now;
3654 vcpu->arch.pio.cur_count = now;
3655
3656 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3657 kvm_x86_ops->skip_emulated_instruction(vcpu);
3658
3659 vcpu->arch.pio.guest_gva = address;
3660
3661 if (!vcpu->arch.pio.in) {
3662 /* string PIO write */
3663 ret = pio_copy_data(vcpu);
3664 if (ret == X86EMUL_PROPAGATE_FAULT)
3665 return 1;
3666 if (ret == 0 && !pio_string_write(vcpu)) {
3667 complete_pio(vcpu);
3668 if (vcpu->arch.pio.count == 0)
3669 ret = 1;
3670 }
3671 }
3672 /* no string PIO read support yet */
3673
3674 return ret; 3951 return ret;
3675} 3952}
3676EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3953EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3677 3954
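kvm_fast_pio_out() above issues a single non-string OUT through emulator_pio_out_emulated() and clears pio.count so the vcpu does not re-enter the emulator after userspace completes the exit. A hypothetical exit-handler fragment, with string/in/size/port assumed to be decoded from the exit (not shown in this patch):

	if (!string && !in) {
		ret = kvm_fast_pio_out(vcpu, size, port);
		if (ret)
			kvm_x86_ops->skip_emulated_instruction(vcpu);
		return ret;
	}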
3678static void bounce_off(void *info) 3955static void bounce_off(void *info)
3679{ 3956{
@@ -3996,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3996 return emulator_write_emulated(rip, instruction, 3, vcpu); 4273 return emulator_write_emulated(rip, instruction, 3, vcpu);
3997} 4274}
3998 4275
3999static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4000{
4001 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4002}
4003
4004void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4005{ 4277{
4006 struct descriptor_table dt = { limit, base }; 4278 struct desc_ptr dt = { limit, base };
4007 4279
4008 kvm_x86_ops->set_gdt(vcpu, &dt); 4280 kvm_x86_ops->set_gdt(vcpu, &dt);
4009} 4281}
4010 4282
4011void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4283void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4012{ 4284{
4013 struct descriptor_table dt = { limit, base }; 4285 struct desc_ptr dt = { limit, base };
4014 4286
4015 kvm_x86_ops->set_idt(vcpu, &dt); 4287 kvm_x86_ops->set_idt(vcpu, &dt);
4016} 4288}
4017 4289
4018void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
4019 unsigned long *rflags)
4020{
4021 kvm_lmsw(vcpu, msw);
4022 *rflags = kvm_get_rflags(vcpu);
4023}
4024
4025unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
4026{
4027 unsigned long value;
4028
4029 switch (cr) {
4030 case 0:
4031 value = kvm_read_cr0(vcpu);
4032 break;
4033 case 2:
4034 value = vcpu->arch.cr2;
4035 break;
4036 case 3:
4037 value = vcpu->arch.cr3;
4038 break;
4039 case 4:
4040 value = kvm_read_cr4(vcpu);
4041 break;
4042 case 8:
4043 value = kvm_get_cr8(vcpu);
4044 break;
4045 default:
4046 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4047 return 0;
4048 }
4049
4050 return value;
4051}
4052
4053void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
4054 unsigned long *rflags)
4055{
4056 switch (cr) {
4057 case 0:
4058 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4059 *rflags = kvm_get_rflags(vcpu);
4060 break;
4061 case 2:
4062 vcpu->arch.cr2 = val;
4063 break;
4064 case 3:
4065 kvm_set_cr3(vcpu, val);
4066 break;
4067 case 4:
4068 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4069 break;
4070 case 8:
4071 kvm_set_cr8(vcpu, val & 0xfUL);
4072 break;
4073 default:
4074 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4075 }
4076}
4077
4078static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4290static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4079{ 4291{
4080 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4292 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4138,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4138{ 4350{
4139 struct kvm_cpuid_entry2 *best; 4351 struct kvm_cpuid_entry2 *best;
4140 4352
4353 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4354 if (!best || best->eax < 0x80000008)
4355 goto not_found;
4141 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4356 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4142 if (best) 4357 if (best)
4143 return best->eax & 0xff; 4358 return best->eax & 0xff;
4359not_found:
4144 return 36; 4360 return 36;
4145} 4361}
4146 4362
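The extra check above guards against a guest CPUID table whose maximum extended leaf is below 0x80000008: MAXPHYADDR is read as CPUID.80000008H:EAX[7:0] (best->eax & 0xff) only when leaf 0x80000000 reports at least 0x80000008, and otherwise falls back to the architectural default of 36 bits.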
@@ -4254,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4254{ 4470{
4255 /* try to reinject previous events if any */ 4471 /* try to reinject previous events if any */
4256 if (vcpu->arch.exception.pending) { 4472 if (vcpu->arch.exception.pending) {
4473 trace_kvm_inj_exception(vcpu->arch.exception.nr,
4474 vcpu->arch.exception.has_error_code,
4475 vcpu->arch.exception.error_code);
4257 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4476 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4258 vcpu->arch.exception.has_error_code, 4477 vcpu->arch.exception.has_error_code,
4259 vcpu->arch.exception.error_code); 4478 vcpu->arch.exception.error_code,
4479 vcpu->arch.exception.reinject);
4260 return; 4480 return;
4261 } 4481 }
4262 4482
@@ -4486,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4486 } 4706 }
4487 4707
4488 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4708 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4489 post_kvm_run_save(vcpu);
4490 4709
4491 vapic_exit(vcpu); 4710 vapic_exit(vcpu);
4492 4711
@@ -4514,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4514 if (!irqchip_in_kernel(vcpu->kvm)) 4733 if (!irqchip_in_kernel(vcpu->kvm))
4515 kvm_set_cr8(vcpu, kvm_run->cr8); 4734 kvm_set_cr8(vcpu, kvm_run->cr8);
4516 4735
4517 if (vcpu->arch.pio.cur_count) { 4736 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4518 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4737 vcpu->arch.emulate_ctxt.restart) {
4519 r = complete_pio(vcpu); 4738 if (vcpu->mmio_needed) {
4520 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4739 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4521 if (r) 4740 vcpu->mmio_read_completed = 1;
4522 goto out; 4741 vcpu->mmio_needed = 0;
4523 } 4742 }
4524 if (vcpu->mmio_needed) {
4525 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4526 vcpu->mmio_read_completed = 1;
4527 vcpu->mmio_needed = 0;
4528
4529 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4530 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4531 EMULTYPE_NO_DECODE);
4532 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4533 if (r == EMULATE_DO_MMIO) { 4746 if (r == EMULATE_DO_MMIO) {
4534 /*
4535 * Read-modify-write. Back to userspace.
4536 */
4537 r = 0; 4747 r = 0;
4538 goto out; 4748 goto out;
4539 } 4749 }
@@ -4545,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4545 r = __vcpu_run(vcpu); 4755 r = __vcpu_run(vcpu);
4546 4756
4547out: 4757out:
4758 post_kvm_run_save(vcpu);
4548 if (vcpu->sigset_active) 4759 if (vcpu->sigset_active)
4549 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4550 4761
@@ -4616,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4616 return 0; 4827 return 0;
4617} 4828}
4618 4829
4619void kvm_get_segment(struct kvm_vcpu *vcpu,
4620 struct kvm_segment *var, int seg)
4621{
4622 kvm_x86_ops->get_segment(vcpu, var, seg);
4623}
4624
4625void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4830void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4626{ 4831{
4627 struct kvm_segment cs; 4832 struct kvm_segment cs;
@@ -4635,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4635int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4840int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4636 struct kvm_sregs *sregs) 4841 struct kvm_sregs *sregs)
4637{ 4842{
4638 struct descriptor_table dt; 4843 struct desc_ptr dt;
4639 4844
4640 vcpu_load(vcpu); 4845 vcpu_load(vcpu);
4641 4846
@@ -4650,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4650 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4855 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4651 4856
4652 kvm_x86_ops->get_idt(vcpu, &dt); 4857 kvm_x86_ops->get_idt(vcpu, &dt);
4653 sregs->idt.limit = dt.limit; 4858 sregs->idt.limit = dt.size;
4654 sregs->idt.base = dt.base; 4859 sregs->idt.base = dt.address;
4655 kvm_x86_ops->get_gdt(vcpu, &dt); 4860 kvm_x86_ops->get_gdt(vcpu, &dt);
4656 sregs->gdt.limit = dt.limit; 4861 sregs->gdt.limit = dt.size;
4657 sregs->gdt.base = dt.base; 4862 sregs->gdt.base = dt.address;
4658 4863
4659 sregs->cr0 = kvm_read_cr0(vcpu); 4864 sregs->cr0 = kvm_read_cr0(vcpu);
4660 sregs->cr2 = vcpu->arch.cr2; 4865 sregs->cr2 = vcpu->arch.cr2;
@@ -4693,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4693 return 0; 4898 return 0;
4694} 4899}
4695 4900
4696static void kvm_set_segment(struct kvm_vcpu *vcpu, 4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4697 struct kvm_segment *var, int seg) 4902 bool has_error_code, u32 error_code)
4698{
4699 kvm_x86_ops->set_segment(vcpu, var, seg);
4700}
4701
4702static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4703 struct kvm_segment *kvm_desct)
4704{
4705 kvm_desct->base = get_desc_base(seg_desc);
4706 kvm_desct->limit = get_desc_limit(seg_desc);
4707 if (seg_desc->g) {
4708 kvm_desct->limit <<= 12;
4709 kvm_desct->limit |= 0xfff;
4710 }
4711 kvm_desct->selector = selector;
4712 kvm_desct->type = seg_desc->type;
4713 kvm_desct->present = seg_desc->p;
4714 kvm_desct->dpl = seg_desc->dpl;
4715 kvm_desct->db = seg_desc->d;
4716 kvm_desct->s = seg_desc->s;
4717 kvm_desct->l = seg_desc->l;
4718 kvm_desct->g = seg_desc->g;
4719 kvm_desct->avl = seg_desc->avl;
4720 if (!selector)
4721 kvm_desct->unusable = 1;
4722 else
4723 kvm_desct->unusable = 0;
4724 kvm_desct->padding = 0;
4725}
4726
4727static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4728 u16 selector,
4729 struct descriptor_table *dtable)
4730{
4731 if (selector & 1 << 2) {
4732 struct kvm_segment kvm_seg;
4733
4734 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4735
4736 if (kvm_seg.unusable)
4737 dtable->limit = 0;
4738 else
4739 dtable->limit = kvm_seg.limit;
4740 dtable->base = kvm_seg.base;
4741 }
4742 else
4743 kvm_x86_ops->get_gdt(vcpu, dtable);
4744}
4745
4746/* allowed just for 8 bytes segments */
4747static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4748 struct desc_struct *seg_desc)
4749{
4750 struct descriptor_table dtable;
4751 u16 index = selector >> 3;
4752 int ret;
4753 u32 err;
4754 gva_t addr;
4755
4756 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4757
4758 if (dtable.limit < index * 8 + 7) {
4759 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4760 return X86EMUL_PROPAGATE_FAULT;
4761 }
4762 addr = dtable.base + index * 8;
4763 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4764 vcpu, &err);
4765 if (ret == X86EMUL_PROPAGATE_FAULT)
4766 kvm_inject_page_fault(vcpu, addr, err);
4767
4768 return ret;
4769}
4770
4771/* allowed just for 8 bytes segments */
4772static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4773 struct desc_struct *seg_desc)
4774{
4775 struct descriptor_table dtable;
4776 u16 index = selector >> 3;
4777
4778 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4779
4780 if (dtable.limit < index * 8 + 7)
4781 return 1;
4782 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4783}
4784
4785static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4786 struct desc_struct *seg_desc)
4787{
4788 u32 base_addr = get_desc_base(seg_desc);
4789
4790 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4791}
4792
4793static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4794 struct desc_struct *seg_desc)
4795{
4796 u32 base_addr = get_desc_base(seg_desc);
4797
4798 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4799}
4800
4801static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4802{
4803 struct kvm_segment kvm_seg;
4804
4805 kvm_get_segment(vcpu, &kvm_seg, seg);
4806 return kvm_seg.selector;
4807}
4808
4809static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4810{
4811 struct kvm_segment segvar = {
4812 .base = selector << 4,
4813 .limit = 0xffff,
4814 .selector = selector,
4815 .type = 3,
4816 .present = 1,
4817 .dpl = 3,
4818 .db = 0,
4819 .s = 1,
4820 .l = 0,
4821 .g = 0,
4822 .avl = 0,
4823 .unusable = 0,
4824 };
4825 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4826 return X86EMUL_CONTINUE;
4827}
4828
4829static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4830{ 4903{
4831 return (seg != VCPU_SREG_LDTR) && 4904 int cs_db, cs_l, ret;
4832 (seg != VCPU_SREG_TR) && 4905 cache_all_regs(vcpu);
4833 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4834}
4835
4836int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4837{
4838 struct kvm_segment kvm_seg;
4839 struct desc_struct seg_desc;
4840 u8 dpl, rpl, cpl;
4841 unsigned err_vec = GP_VECTOR;
4842 u32 err_code = 0;
4843 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4844 int ret;
4845 4906
4846 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) 4907 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4847 return kvm_load_realmode_segment(vcpu, selector, seg);
4848 4908
4849 /* NULL selector is not valid for TR, CS and SS */ 4909 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4850 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 4910 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4851 && null_selector) 4911 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4852 goto exception; 4912 vcpu->arch.emulate_ctxt.mode =
4913 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4914 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4915 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4853 4918
4854 /* TR should be in GDT only */ 4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4855 if (seg == VCPU_SREG_TR && (selector & (1 << 2))) 4920 tss_selector, reason, has_error_code,
4856 goto exception; 4921 error_code);
4857 4922
4858 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4859 if (ret) 4923 if (ret)
4860 return ret; 4924 return EMULATE_FAIL;
4861
4862 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4863
4864 if (null_selector) { /* for NULL selector skip all following checks */
4865 kvm_seg.unusable = 1;
4866 goto load;
4867 }
4868
4869 err_code = selector & 0xfffc;
4870 err_vec = GP_VECTOR;
4871
4872 /* can't load a system descriptor into a code or data segment register */
4873 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4874 goto exception;
4875
4876 if (!kvm_seg.present) {
4877 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4878 goto exception;
4879 }
4880
4881 rpl = selector & 3;
4882 dpl = kvm_seg.dpl;
4883 cpl = kvm_x86_ops->get_cpl(vcpu);
4884
4885 switch (seg) {
4886 case VCPU_SREG_SS:
4887 /*
4888 * segment is not a writable data segment or segment
4889 * selector's RPL != CPL or segment selector's RPL != CPL
4890 */
4891 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4892 goto exception;
4893 break;
4894 case VCPU_SREG_CS:
4895 if (!(kvm_seg.type & 8))
4896 goto exception;
4897
4898 if (kvm_seg.type & 4) {
4899 /* conforming */
4900 if (dpl > cpl)
4901 goto exception;
4902 } else {
4903 /* nonconforming */
4904 if (rpl > cpl || dpl != cpl)
4905 goto exception;
4906 }
4907 /* CS(RPL) <- CPL */
4908 selector = (selector & 0xfffc) | cpl;
4909 break;
4910 case VCPU_SREG_TR:
4911 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4912 goto exception;
4913 break;
4914 case VCPU_SREG_LDTR:
4915 if (kvm_seg.s || kvm_seg.type != 2)
4916 goto exception;
4917 break;
4918 default: /* DS, ES, FS, or GS */
4919 /*
4920 * segment is not a data or readable code segment or
4921 * ((segment is a data or nonconforming code segment)
4922 * and (both RPL and CPL > DPL))
4923 */
4924 if ((kvm_seg.type & 0xa) == 0x8 ||
4925 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4926 goto exception;
4927 break;
4928 }
4929
4930 if (!kvm_seg.unusable && kvm_seg.s) {
4931 /* mark segment as accessed */
4932 kvm_seg.type |= 1;
4933 seg_desc.type |= 1;
4934 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4935 }
4936load:
4937 kvm_set_segment(vcpu, &kvm_seg, seg);
4938 return X86EMUL_CONTINUE;
4939exception:
4940 kvm_queue_exception_e(vcpu, err_vec, err_code);
4941 return X86EMUL_PROPAGATE_FAULT;
4942}
4943
4944static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4945 struct tss_segment_32 *tss)
4946{
4947 tss->cr3 = vcpu->arch.cr3;
4948 tss->eip = kvm_rip_read(vcpu);
4949 tss->eflags = kvm_get_rflags(vcpu);
4950 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4951 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4952 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4953 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4954 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4955 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4956 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4957 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4958 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4959 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4960 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4961 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4962 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4963 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4964 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4965}
4966
4967static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4968{
4969 struct kvm_segment kvm_seg;
4970 kvm_get_segment(vcpu, &kvm_seg, seg);
4971 kvm_seg.selector = sel;
4972 kvm_set_segment(vcpu, &kvm_seg, seg);
4973}
4974
4975static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4976 struct tss_segment_32 *tss)
4977{
4978 kvm_set_cr3(vcpu, tss->cr3);
4979
4980 kvm_rip_write(vcpu, tss->eip);
4981 kvm_set_rflags(vcpu, tss->eflags | 2);
4982
4983 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4984 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4985 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4986 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4987 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4988 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4989 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4990 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4991
4992 /*
4993 * SDM says that segment selectors are loaded before segment
4994 * descriptors
4995 */
4996 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4997 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4998 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4999 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5000 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5001 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
5002 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
5003
5004 /*
5005 * Now load segment descriptors. If a fault happens at this stage
5006 * it is handled in the context of the new task
5007 */
5008 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
5009 return 1;
5010
5011 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5012 return 1;
5013 4925
5014 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) 4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5015 return 1; 4927 return EMULATE_DONE;
5016
5017 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5018 return 1;
5019
5020 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5021 return 1;
5022
5023 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
5024 return 1;
5025
5026 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
5027 return 1;
5028 return 0;
5029}
5030
5031static void save_state_to_tss16(struct kvm_vcpu *vcpu,
5032 struct tss_segment_16 *tss)
5033{
5034 tss->ip = kvm_rip_read(vcpu);
5035 tss->flag = kvm_get_rflags(vcpu);
5036 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5037 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5038 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
5039 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5040 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
5041 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
5042 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
5043 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
5044
5045 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
5046 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
5047 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
5048 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
5049 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
5050}
5051
5052static int load_state_from_tss16(struct kvm_vcpu *vcpu,
5053 struct tss_segment_16 *tss)
5054{
5055 kvm_rip_write(vcpu, tss->ip);
5056 kvm_set_rflags(vcpu, tss->flag | 2);
5057 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
5058 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
5059 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
5060 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
5061 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
5062 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
5063 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
5064 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
5065
5066 /*
5067 * SDM says that segment selectors are loaded before segment
5068 * descriptors
5069 */
5070 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5071 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5072 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5073 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5074 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5075
5076 /*
5077 * Now load segment descriptors. If a fault happens at this stage
5078 * it is handled in the context of the new task
5079 */
5080 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
5081 return 1;
5082
5083 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5084 return 1;
5085
5086 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5087 return 1;
5088
5089 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5090 return 1;
5091
5092 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5093 return 1;
5094 return 0;
5095}
5096
5097static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
5098 u16 old_tss_sel, u32 old_tss_base,
5099 struct desc_struct *nseg_desc)
5100{
5101 struct tss_segment_16 tss_segment_16;
5102 int ret = 0;
5103
5104 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5105 sizeof tss_segment_16))
5106 goto out;
5107
5108 save_state_to_tss16(vcpu, &tss_segment_16);
5109
5110 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5111 sizeof tss_segment_16))
5112 goto out;
5113
5114 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5115 &tss_segment_16, sizeof tss_segment_16))
5116 goto out;
5117
5118 if (old_tss_sel != 0xffff) {
5119 tss_segment_16.prev_task_link = old_tss_sel;
5120
5121 if (kvm_write_guest(vcpu->kvm,
5122 get_tss_base_addr_write(vcpu, nseg_desc),
5123 &tss_segment_16.prev_task_link,
5124 sizeof tss_segment_16.prev_task_link))
5125 goto out;
5126 }
5127
5128 if (load_state_from_tss16(vcpu, &tss_segment_16))
5129 goto out;
5130
5131 ret = 1;
5132out:
5133 return ret;
5134}
5135
5136static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
5137 u16 old_tss_sel, u32 old_tss_base,
5138 struct desc_struct *nseg_desc)
5139{
5140 struct tss_segment_32 tss_segment_32;
5141 int ret = 0;
5142
5143 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5144 sizeof tss_segment_32))
5145 goto out;
5146
5147 save_state_to_tss32(vcpu, &tss_segment_32);
5148
5149 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5150 sizeof tss_segment_32))
5151 goto out;
5152
5153 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5154 &tss_segment_32, sizeof tss_segment_32))
5155 goto out;
5156
5157 if (old_tss_sel != 0xffff) {
5158 tss_segment_32.prev_task_link = old_tss_sel;
5159
5160 if (kvm_write_guest(vcpu->kvm,
5161 get_tss_base_addr_write(vcpu, nseg_desc),
5162 &tss_segment_32.prev_task_link,
5163 sizeof tss_segment_32.prev_task_link))
5164 goto out;
5165 }
5166
5167 if (load_state_from_tss32(vcpu, &tss_segment_32))
5168 goto out;
5169
5170 ret = 1;
5171out:
5172 return ret;
5173}
5174
5175int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
5176{
5177 struct kvm_segment tr_seg;
5178 struct desc_struct cseg_desc;
5179 struct desc_struct nseg_desc;
5180 int ret = 0;
5181 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
5182 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5183 u32 desc_limit;
5184
5185 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
5186
5187 /* FIXME: Handle errors. Failure to read either TSS or its
5188 * descriptor should generate a page fault.
5189 */
5190 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
5191 goto out;
5192
5193 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
5194 goto out;
5195
5196 if (reason != TASK_SWITCH_IRET) {
5197 int cpl;
5198
5199 cpl = kvm_x86_ops->get_cpl(vcpu);
5200 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
5201 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5202 return 1;
5203 }
5204 }
5205
5206 desc_limit = get_desc_limit(&nseg_desc);
5207 if (!nseg_desc.p ||
5208 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5209 desc_limit < 0x2b)) {
5210 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
5211 return 1;
5212 }
5213
5214 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
5215 cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
5216 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
5217 }
5218
5219 if (reason == TASK_SWITCH_IRET) {
5220 u32 eflags = kvm_get_rflags(vcpu);
5221 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
5222 }
5223
5224 /* set the back link to the previous task only if the NT bit is set in
5225 eflags; note that old_tss_sel is not used after this point */
5226 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
5227 old_tss_sel = 0xffff;
5228
5229 if (nseg_desc.type & 8)
5230 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
5231 old_tss_base, &nseg_desc);
5232 else
5233 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
5234 old_tss_base, &nseg_desc);
5235
5236 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
5237 u32 eflags = kvm_get_rflags(vcpu);
5238 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
5239 }
5240
5241 if (reason != TASK_SWITCH_IRET) {
5242 nseg_desc.type |= (1 << 1);
5243 save_guest_segment_descriptor(vcpu, tss_selector,
5244 &nseg_desc);
5245 }
5246
5247 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
5248 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
5249 tr_seg.type = 11;
5250 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
5251out:
5252 return ret;
5253} 4928}
5254EXPORT_SYMBOL_GPL(kvm_task_switch); 4929EXPORT_SYMBOL_GPL(kvm_task_switch);
5255 4930
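With the rewrite of kvm_task_switch() above, the TSS save/restore that used to live in this file (save_state_to_tss32(), load_state_from_tss16()/32(), kvm_task_switch_16()/32()) is delegated to emulator_task_switch() in emulate.c; x86.c now only builds the emulation context (mode derived from CS.L/CS.D and EFLAGS) and writes the resulting EFLAGS back. A hypothetical caller using the new signature, with tss_selector/reason/error_code assumed to come from the decoded exit:

	if (kvm_task_switch(vcpu, tss_selector, reason,
			    has_error_code, error_code) == EMULATE_FAIL) {
		/* emulation failed; fall back to reporting the error */
		return 0;
	}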
@@ -5258,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5258{ 4933{
5259 int mmu_reset_needed = 0; 4934 int mmu_reset_needed = 0;
5260 int pending_vec, max_bits; 4935 int pending_vec, max_bits;
5261 struct descriptor_table dt; 4936 struct desc_ptr dt;
5262 4937
5263 vcpu_load(vcpu); 4938 vcpu_load(vcpu);
5264 4939
5265 dt.limit = sregs->idt.limit; 4940 dt.size = sregs->idt.limit;
5266 dt.base = sregs->idt.base; 4941 dt.address = sregs->idt.base;
5267 kvm_x86_ops->set_idt(vcpu, &dt); 4942 kvm_x86_ops->set_idt(vcpu, &dt);
5268 dt.limit = sregs->gdt.limit; 4943 dt.size = sregs->gdt.limit;
5269 dt.base = sregs->gdt.base; 4944 dt.address = sregs->gdt.base;
5270 kvm_x86_ops->set_gdt(vcpu, &dt); 4945 kvm_x86_ops->set_gdt(vcpu, &dt);
5271 4946
5272 vcpu->arch.cr2 = sregs->cr2; 4947 vcpu->arch.cr2 = sregs->cr2;
@@ -5365,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5365 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5040 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5366 } 5041 }
5367 5042
5368 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 5043 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5369 vcpu->arch.singlestep_cs = 5044 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5370 get_segment_selector(vcpu, VCPU_SREG_CS); 5045 get_segment_base(vcpu, VCPU_SREG_CS);
5371 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5372 }
5373 5046
5374 /* 5047 /*
5375 * Trigger an rflags update that will inject or remove the trace 5048 * Trigger an rflags update that will inject or remove the trace
@@ -5860,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5860 return kvm_x86_ops->interrupt_allowed(vcpu); 5533 return kvm_x86_ops->interrupt_allowed(vcpu);
5861} 5534}
5862 5535
5536bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
5537{
5538 unsigned long current_rip = kvm_rip_read(vcpu) +
5539 get_segment_base(vcpu, VCPU_SREG_CS);
5540
5541 return current_rip == linear_rip;
5542}
5543EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
5544
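kvm_is_linear_rip() above pairs with the guest-debug change earlier in this diff: singlestep_rip now stores a linear address, kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS), so kvm_set_rflags() can restore TF by recomputing that sum instead of comparing the saved CS selector and raw RIP separately. Written out, the check is:

	/* equivalent comparison, for illustration */
	bool hit = (kvm_rip_read(vcpu) +
		    get_segment_base(vcpu, VCPU_SREG_CS)) ==
		   vcpu->arch.singlestep_rip;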
5863unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5545unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5864{ 5546{
5865 unsigned long rflags; 5547 unsigned long rflags;
5866 5548
5867 rflags = kvm_x86_ops->get_rflags(vcpu); 5549 rflags = kvm_x86_ops->get_rflags(vcpu);
5868 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5550 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5869 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5551 rflags &= ~X86_EFLAGS_TF;
5870 return rflags; 5552 return rflags;
5871} 5553}
5872EXPORT_SYMBOL_GPL(kvm_get_rflags); 5554EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5874,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
5874void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5556void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5875{ 5557{
5876 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5558 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5877 vcpu->arch.singlestep_cs == 5559 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5878 get_segment_selector(vcpu, VCPU_SREG_CS) && 5560 rflags |= X86_EFLAGS_TF;
5879 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5880 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5881 kvm_x86_ops->set_rflags(vcpu, rflags); 5561 kvm_x86_ops->set_rflags(vcpu, rflags);
5882} 5562}
5883EXPORT_SYMBOL_GPL(kvm_set_rflags); 5563EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5893,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5893EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5573EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5894EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5574EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5895EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5575EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
5576EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..f4b54458285b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
70 77
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2bdf628066bd..9257510b4836 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1390,7 +1390,6 @@ __init void lguest_init(void)
1390#endif 1390#endif
1391#ifdef CONFIG_ACPI 1391#ifdef CONFIG_ACPI
1392 acpi_disabled = 1; 1392 acpi_disabled = 1;
1393 acpi_ht = 0;
1394#endif 1393#endif
1395 1394
1396 /* 1395 /*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 550df481accd..787c52ca49c3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -3,12 +3,6 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5 5
6#ifdef CONFIG_DEBUG_PER_CPU_MAPS
7# define DBG(x...) printk(KERN_DEBUG x)
8#else
9# define DBG(x...)
10#endif
11
12/* 6/*
13 * Which logical CPUs are on which nodes 7 * Which logical CPUs are on which nodes
14 */ 8 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8948f47fde05..a7bcc23ef96c 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,9 +33,6 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36DEFINE_PER_CPU(int, node_number) = 0;
37EXPORT_PER_CPU_SYMBOL(node_number);
38
39/* 36/*
40 * Map cpu index to node index 37 * Map cpu index to node index
41 */ 38 */
@@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node)
809 per_cpu(x86_cpu_to_node_map, cpu) = node; 806 per_cpu(x86_cpu_to_node_map, cpu) = node;
810 807
811 if (node != NUMA_NO_NODE) 808 if (node != NUMA_NO_NODE)
812 per_cpu(node_number, cpu) = node; 809 set_cpu_numa_node(cpu, node);
813} 810}
814 811
815void __cpuinit numa_clear_node(int cpu) 812void __cpuinit numa_clear_node(int cpu)
@@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu)
867 numa_set_cpumask(cpu, 0); 864 numa_set_cpumask(cpu, 0);
868} 865}
869 866
870int cpu_to_node(int cpu) 867int __cpu_to_node(int cpu)
871{ 868{
872 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
873 printk(KERN_WARNING 870 printk(KERN_WARNING
@@ -877,7 +874,7 @@ int cpu_to_node(int cpu)
877 } 874 }
878 return per_cpu(x86_cpu_to_node_map, cpu); 875 return per_cpu(x86_cpu_to_node_map, cpu);
879} 876}
880EXPORT_SYMBOL(cpu_to_node); 877EXPORT_SYMBOL(__cpu_to_node);
881 878
882/* 879/*
883 * Same function as cpu_to_node() but used if called before the 880 * Same function as cpu_to_node() but used if called before the
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index bbe5502ee1cb..acc15b23b743 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -336,6 +336,7 @@ int free_memtype(u64 start, u64 end)
336{ 336{
337 int err = -EINVAL; 337 int err = -EINVAL;
338 int is_range_ram; 338 int is_range_ram;
339 struct memtype *entry;
339 340
340 if (!pat_enabled) 341 if (!pat_enabled)
341 return 0; 342 return 0;
@@ -355,17 +356,20 @@ int free_memtype(u64 start, u64 end)
355 } 356 }
356 357
357 spin_lock(&memtype_lock); 358 spin_lock(&memtype_lock);
358 err = rbt_memtype_erase(start, end); 359 entry = rbt_memtype_erase(start, end);
359 spin_unlock(&memtype_lock); 360 spin_unlock(&memtype_lock);
360 361
361 if (err) { 362 if (!entry) {
362 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", 363 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
363 current->comm, current->pid, start, end); 364 current->comm, current->pid, start, end);
365 return -EINVAL;
364 } 366 }
365 367
368 kfree(entry);
369
366 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 370 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
367 371
368 return err; 372 return 0;
369} 373}
370 374
371 375
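The pat.c change above relies on rbt_memtype_erase() now returning the erased struct memtype (or NULL) rather than an int, so free_memtype() can kfree() the entry after dropping memtype_lock. The resulting call pattern, in outline:

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);	/* detach under the lock */
	spin_unlock(&memtype_lock);
	if (!entry)
		return -EINVAL;		/* no exact match for [start, end) */
	kfree(entry);			/* freed outside the lock */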
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 4f39eefa3e61..77e5ba153fac 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -28,15 +28,15 @@ static inline char *cattr_name(unsigned long flags)
28#ifdef CONFIG_X86_PAT 28#ifdef CONFIG_X86_PAT
29extern int rbt_memtype_check_insert(struct memtype *new, 29extern int rbt_memtype_check_insert(struct memtype *new,
30 unsigned long *new_type); 30 unsigned long *new_type);
31extern int rbt_memtype_erase(u64 start, u64 end); 31extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
32extern struct memtype *rbt_memtype_lookup(u64 addr); 32extern struct memtype *rbt_memtype_lookup(u64 addr);
33extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); 33extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
34#else 34#else
35static inline int rbt_memtype_check_insert(struct memtype *new, 35static inline int rbt_memtype_check_insert(struct memtype *new,
36 unsigned long *new_type) 36 unsigned long *new_type)
37{ return 0; } 37{ return 0; }
38static inline int rbt_memtype_erase(u64 start, u64 end) 38static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
39{ return 0; } 39{ return NULL; }
40static inline struct memtype *rbt_memtype_lookup(u64 addr) 40static inline struct memtype *rbt_memtype_lookup(u64 addr)
41{ return NULL; } 41{ return NULL; }
42static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) 42static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 07de4cb8cc30..f537087bb740 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -231,16 +231,17 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
231 return err; 231 return err;
232} 232}
233 233
234int rbt_memtype_erase(u64 start, u64 end) 234struct memtype *rbt_memtype_erase(u64 start, u64 end)
235{ 235{
236 struct memtype *data; 236 struct memtype *data;
237 237
238 data = memtype_rb_exact_match(&memtype_rbroot, start, end); 238 data = memtype_rb_exact_match(&memtype_rbroot, start, end);
239 if (!data) 239 if (!data)
240 return -EINVAL; 240 goto out;
241 241
242 rb_erase(&data->rb, &memtype_rbroot); 242 rb_erase(&data->rb, &memtype_rbroot);
243 return 0; 243out:
244 return data;
244} 245}
245 246
246struct memtype *rbt_memtype_lookup(u64 addr) 247struct memtype *rbt_memtype_lookup(u64 addr)
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index df3d5c861cda..308e32570d84 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -34,7 +34,7 @@
34/* IA32 Manual 3, 2-1 */ 34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = { 35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, 36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67 37 0x65, 0x66, 0x67
38}; 38};
39/* IA32 Manual 3, 3-432*/ 39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = { 40static unsigned int reg_rop[] = {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 792854003ed3..cac718499256 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,7 +9,6 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/quicklist.h>
13 12
14#include <asm/system.h> 13#include <asm/system.h>
15#include <asm/pgtable.h> 14#include <asm/pgtable.h>
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index b110d97fb925..a0207a7fdf39 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST) += mrst.o
18obj-y += common.o early.o 18obj-y += common.o early.o
19obj-y += amd_bus.o bus_numa.o 19obj-y += amd_bus.o bus_numa.o
20 20
21obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
22
21ifeq ($(CONFIG_PCI_DEBUG),y) 23ifeq ($(CONFIG_PCI_DEBUG),y)
22EXTRA_CFLAGS += -DDEBUG 24EXTRA_CFLAGS += -DDEBUG
23endif 25endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 31930fd30ea9..2ec04c424a62 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum,
207 if (!info.res) 207 if (!info.res)
208 goto res_alloc_fail; 208 goto res_alloc_fail;
209 209
210 info.name = kmalloc(16, GFP_KERNEL); 210 info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
211 if (!info.name) 211 if (!info.name)
212 goto name_alloc_fail; 212 goto name_alloc_fail;
213 sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
214 213
215 info.res_num = 0; 214 info.res_num = 0;
216 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 215 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
@@ -224,8 +223,11 @@ res_alloc_fail:
224 return; 223 return;
225} 224}
226 225
227struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) 226struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
228{ 227{
228 struct acpi_device *device = root->device;
229 int domain = root->segment;
230 int busnum = root->secondary.start;
229 struct pci_bus *bus; 231 struct pci_bus *bus;
230 struct pci_sysdata *sd; 232 struct pci_sysdata *sd;
231 int node; 233 int node;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 000000000000..0846a5bbbfbd
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,101 @@
1/*
2 * Read address ranges from a Broadcom CNB20LE Host Bridge
3 *
4 * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2 of the License, or (at your
9 * option) any later version.
10 */
11
12#include <linux/delay.h>
13#include <linux/dmi.h>
14#include <linux/pci.h>
15#include <linux/init.h>
16#include <asm/pci_x86.h>
17
18#include "bus_numa.h"
19
20static void __devinit cnb20le_res(struct pci_dev *dev)
21{
22 struct pci_root_info *info;
23 struct resource res;
24 u16 word1, word2;
25 u8 fbus, lbus;
26 int i;
27
28 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use
30 * this information if ACPI _CRS was used. Therefore, we don't bother
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */
34
35 info = &pci_root_info[pci_root_num];
36 pci_root_num++;
37
38 /* read the PCI bus numbers */
39 pci_read_config_byte(dev, 0x44, &fbus);
40 pci_read_config_byte(dev, 0x45, &lbus);
41 info->bus_min = fbus;
42 info->bus_max = lbus;
43
44 /*
45 * Add the legacy IDE ports on bus 0
46 *
47 * These do not exist anywhere in the bridge registers, AFAICT. I do
48 * not have the datasheet, so this is the best I can do.
49 */
50 if (fbus == 0) {
51 update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
52 update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
53 update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
54 update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
55 update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
56 }
57
58 /* read the non-prefetchable memory window */
59 pci_read_config_word(dev, 0xc0, &word1);
60 pci_read_config_word(dev, 0xc2, &word2);
61 if (word1 != word2) {
62 res.start = (word1 << 16) | 0x0000;
63 res.end = (word2 << 16) | 0xffff;
64 res.flags = IORESOURCE_MEM;
65 update_res(info, res.start, res.end, res.flags, 0);
66 }
67
68 /* read the prefetchable memory window */
69 pci_read_config_word(dev, 0xc4, &word1);
70 pci_read_config_word(dev, 0xc6, &word2);
71 if (word1 != word2) {
72 res.start = (word1 << 16) | 0x0000;
73 res.end = (word2 << 16) | 0xffff;
74 res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
75 update_res(info, res.start, res.end, res.flags, 0);
76 }
77
78 /* read the IO port window */
79 pci_read_config_word(dev, 0xd0, &word1);
80 pci_read_config_word(dev, 0xd2, &word2);
81 if (word1 != word2) {
82 res.start = word1;
83 res.end = word2;
84 res.flags = IORESOURCE_IO;
85 update_res(info, res.start, res.end, res.flags, 0);
86 }
87
88 /* print information about this host bridge */
89 res.start = fbus;
90 res.end = lbus;
91 res.flags = IORESOURCE_BUS;
92 dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
93 pci_domain_nr(dev->bus), &res);
94
95 for (i = 0; i < info->res_num; i++)
96 dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
97}
98
99DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
100 cnb20le_res);
101
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cf2e93869c48..215a27ae050d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = {
76 * This interrupt-safe spinlock protects all accesses to PCI 76 * This interrupt-safe spinlock protects all accesses to PCI
77 * configuration space. 77 * configuration space.
78 */ 78 */
79DEFINE_SPINLOCK(pci_config_lock); 79DEFINE_RAW_SPINLOCK(pci_config_lock);
80 80
81static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) 81static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
82{ 82{
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 347d882b3bb3..bd33620b0071 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
27 return -EINVAL; 27 return -EINVAL;
28 } 28 }
29 29
30 spin_lock_irqsave(&pci_config_lock, flags); 30 raw_spin_lock_irqsave(&pci_config_lock, flags);
31 31
32 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); 32 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
33 33
@@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
43 break; 43 break;
44 } 44 }
45 45
46 spin_unlock_irqrestore(&pci_config_lock, flags); 46 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
47 47
48 return 0; 48 return 0;
49} 49}
@@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
56 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 56 if ((bus > 255) || (devfn > 255) || (reg > 4095))
57 return -EINVAL; 57 return -EINVAL;
58 58
59 spin_lock_irqsave(&pci_config_lock, flags); 59 raw_spin_lock_irqsave(&pci_config_lock, flags);
60 60
61 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); 61 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
62 62
@@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
72 break; 72 break;
73 } 73 }
74 74
75 spin_unlock_irqrestore(&pci_config_lock, flags); 75 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
76 76
77 return 0; 77 return 0;
78} 78}
@@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
108 if (dev & 0x10) 108 if (dev & 0x10)
109 return PCIBIOS_DEVICE_NOT_FOUND; 109 return PCIBIOS_DEVICE_NOT_FOUND;
110 110
111 spin_lock_irqsave(&pci_config_lock, flags); 111 raw_spin_lock_irqsave(&pci_config_lock, flags);
112 112
113 outb((u8)(0xF0 | (fn << 1)), 0xCF8); 113 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
114 outb((u8)bus, 0xCFA); 114 outb((u8)bus, 0xCFA);
@@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
127 127
128 outb(0, 0xCF8); 128 outb(0, 0xCF8);
129 129
130 spin_unlock_irqrestore(&pci_config_lock, flags); 130 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
131 131
132 return 0; 132 return 0;
133} 133}
@@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
147 if (dev & 0x10) 147 if (dev & 0x10)
148 return PCIBIOS_DEVICE_NOT_FOUND; 148 return PCIBIOS_DEVICE_NOT_FOUND;
149 149
150 spin_lock_irqsave(&pci_config_lock, flags); 150 raw_spin_lock_irqsave(&pci_config_lock, flags);
151 151
152 outb((u8)(0xF0 | (fn << 1)), 0xCF8); 152 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
153 outb((u8)bus, 0xCFA); 153 outb((u8)bus, 0xCFA);
@@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
166 166
167 outb(0, 0xCF8); 167 outb(0, 0xCF8);
168 168
169 spin_unlock_irqrestore(&pci_config_lock, flags); 169 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
170 170
171 return 0; 171 return 0;
172} 172}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 97da2ba9344b..6fdb3ec30c31 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -96,6 +96,7 @@ EXPORT_SYMBOL(pcibios_align_resource);
  * the fact the PCI specs explicitly allow address decoders to be
  * shared between expansion ROMs and other resource regions, it's
  * at least dangerous)
+ * - bad resource sizes or overlaps with other regions
  *
  * Our solution:
  * (1) Allocate resources for all buses behind PCI-to-PCI bridges.
@@ -136,6 +137,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
 				 * child resource allocations in this
 				 * range.
 				 */
+				r->start = r->end = 0;
 				r->flags = 0;
 			}
 		}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5d362b5ba06f..9810a0f76c91 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 	case PCI_DEVICE_ID_INTEL_ICH10_1:
 	case PCI_DEVICE_ID_INTEL_ICH10_2:
 	case PCI_DEVICE_ID_INTEL_ICH10_3:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC1:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC2:
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
@@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		return 1;
 	}
 
+	if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) &&
+	    (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
+		r->name = "PIIX/ICH";
+		r->get = pirq_piix_get;
+		r->set = pirq_piix_set;
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 0db5eaf54560..8d460eaf524f 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -11,28 +11,14 @@
  */
 static void __devinit pcibios_fixup_peer_bridges(void)
 {
-	int n, devfn;
-	long node;
+	int n;
 
 	if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
 		return;
 	DBG("PCI: Peer bridge fixup\n");
 
-	for (n=0; n <= pcibios_last_bus; n++) {
-		u32 l;
-		if (pci_find_bus(0, n))
-			continue;
-		node = get_mp_bus_to_node(n);
-		for (devfn = 0; devfn < 256; devfn += 8) {
-			if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
-			    l != 0x0000 && l != 0xffff) {
-				DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
-				printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
-				pci_scan_bus_on_node(n, &pci_root_ops, node);
-				break;
-			}
-		}
-	}
+	for (n=0; n <= pcibios_last_bus; n++)
+		pcibios_scan_specific_bus(n);
 }
 
 int __init pci_legacy_init(void)
@@ -50,6 +36,28 @@ int __init pci_legacy_init(void)
 	return 0;
 }
 
+void pcibios_scan_specific_bus(int busn)
+{
+	int devfn;
+	long node;
+	u32 l;
+
+	if (pci_find_bus(0, busn))
+		return;
+
+	node = get_mp_bus_to_node(busn);
+	for (devfn = 0; devfn < 256; devfn += 8) {
+		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
+		    l != 0x0000 && l != 0xffff) {
+			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
+			printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn);
+			pci_scan_bus_on_node(busn, &pci_root_ops, node);
+			return;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
+
 int __init pci_subsys_init(void)
 {
 	/*
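
With pcibios_scan_specific_bus() exported, other x86 PCI code can enumerate a single legacy bus on demand instead of open-coding the vendor-ID probe loop. A hedged usage sketch (the initcall name and bus number 0x40 are illustrative, assuming the function's declaration is reachable through <linux/pci.h>):

#include <linux/init.h>
#include <linux/pci.h>

/*
 * Illustrative caller: probe one legacy bus the firmware did not describe.
 * pcibios_scan_specific_bus() is a no-op if the bus is already known.
 */
static int __init example_peer_bus_init(void)
{
	pcibios_scan_specific_bus(0x40);
	return 0;
}
device_initcall(example_peer_bus_init);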
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 39b9ebe8f886..a918553ebc75 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early)
 	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
 		int valid = 0;
 
-		if (!early && !acpi_disabled)
+		if (!early && !acpi_disabled) {
 			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
 
 		if (valid)
 			continue;
-
-		if (!early)
-			printk(KERN_ERR FW_BUG PREFIX
-			       "MMCONFIG at %pR not reserved in "
-			       "ACPI motherboard resources\n", &cfg->res);
+		else
+			printk(KERN_ERR FW_BUG PREFIX
+			       "MMCONFIG at %pR not reserved in "
+			       "ACPI motherboard resources\n",
+			       &cfg->res);
+		}
 
 		/* Don't try to do this check unless configuration
 		   type 1 is available. how about type 2 ?*/
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 90d5fd476ed4..a3d9c54792ae 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -64,7 +64,7 @@ err: *value = -1;
 	if (!base)
 		goto err;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -79,7 +79,7 @@ err: *value = -1;
 		*value = mmio_config_readl(mmcfg_virt_addr + reg);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if (!base)
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		mmio_config_writel(mmcfg_virt_addr + reg, value);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8223738ad806..5c9e2458df4e 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 59a225c17b84..2492d165096a 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
@@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0a979f3e5b8a..1290ba54b350 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,6 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
 #endif
+	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
+					       &ctxt->misc_enable);
 }
 
 /* Needed by apm.c */
@@ -152,6 +154,8 @@ static void fix_processor_context(void)
  */
 static void __restore_processor_state(struct saved_context *ctxt)
 {
+	if (ctxt->misc_enable_saved)
+		wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
 	/*
 	 * control registers
 	 */
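
The save/restore pair above hinges on rdmsrl_safe(), which returns non-zero when the MSR read faults, so MSR_IA32_MISC_ENABLE is only written back on resume if it was actually captured on suspend. A compact sketch of the same idiom with an illustrative state holder (struct example_msr_state stands in for the real struct saved_context fields):

#include <linux/types.h>
#include <asm/msr.h>

struct example_msr_state {
	bool	misc_enable_saved;
	u64	misc_enable;
};

static void example_save_misc_enable(struct example_msr_state *s)
{
	/* rdmsrl_safe() returns 0 on success, non-zero if the MSR faults. */
	s->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
					    &s->misc_enable);
}

static void example_restore_misc_enable(const struct example_msr_state *s)
{
	if (s->misc_enable_saved)
		wrmsrl(MSR_IA32_MISC_ENABLE, s->misc_enable);
}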
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 987267f79bf5..a9c661108034 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -60,6 +60,6 @@ static void xen_vcpu_notify_restore(void *data)
 
 void xen_arch_resume(void)
 {
-	smp_call_function(xen_vcpu_notify_restore,
+	on_each_cpu(xen_vcpu_notify_restore,
 			  (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
 }
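
The xen change swaps smp_call_function(), which only runs the callback on other CPUs, for on_each_cpu(), so the resume notification also reaches the CPU performing the resume. A minimal sketch of the difference (example_notify and example_resume_all are illustrative names):

#include <linux/smp.h>

static void example_notify(void *info)
{
	/* per-CPU resume work would go here */
}

static void example_resume_all(void)
{
	/*
	 * Runs example_notify on every online CPU, including the calling
	 * one, and waits for completion (last argument = 1).
	 */
	on_each_cpu(example_notify, NULL, 1);
}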