Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 46
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 267
-rw-r--r--  arch/x86/crypto/fpu.c | 166
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 2
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 2
-rw-r--r--  arch/x86/include/asm/bitsperlong.h | 13
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 7
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 11
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 2
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 17
-rw-r--r--  arch/x86/include/asm/kmap_types.h | 23
-rw-r--r--  arch/x86/include/asm/kmemcheck.h | 42
-rw-r--r--  arch/x86/include/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 45
-rw-r--r--  arch/x86/include/asm/kvm_x86_emulate.h | 6
-rw-r--r--  arch/x86/include/asm/lguest.h | 7
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 15
-rw-r--r--  arch/x86/include/asm/mce.h | 88
-rw-r--r--  arch/x86/include/asm/mman.h | 2
-rw-r--r--  arch/x86/include/asm/msr-index.h | 7
-rw-r--r--  arch/x86/include/asm/msr.h | 23
-rw-r--r--  arch/x86/include/asm/page.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 9
-rw-r--r--  arch/x86/include/asm/signal.h | 2
-rw-r--r--  arch/x86/include/asm/string_32.h | 8
-rw-r--r--  arch/x86/include/asm/string_64.h | 8
-rw-r--r--  arch/x86/include/asm/svm.h | 1
-rw-r--r--  arch/x86/include/asm/termios.h | 1
-rw-r--r--  arch/x86/include/asm/thread_info.h | 4
-rw-r--r--  arch/x86/include/asm/timex.h | 4
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 2
-rw-r--r--  arch/x86/include/asm/types.h | 6
-rw-r--r--  arch/x86/include/asm/vmx.h | 1
-rw-r--r--  arch/x86/include/asm/xor.h | 5
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 4
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 6
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 2
-rw-r--r--  arch/x86/kernel/apm_32.c | 14
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 11
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 218
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 1964
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 76
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 1187
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 203
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 74
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 65
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 57
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 86
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 48
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 73
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 17
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 7
-rw-r--r--  arch/x86/kernel/cpuid.c | 6
-rw-r--r--  arch/x86/kernel/entry_64.S | 11
-rw-r--r--  arch/x86/kernel/i8253.c | 1
-rw-r--r--  arch/x86/kernel/init_task.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 19
-rw-r--r--  arch/x86/kernel/irqinit.c | 3
-rw-r--r--  arch/x86/kernel/kvm.c | 4
-rw-r--r--  arch/x86/kernel/microcode_core.c | 1
-rw-r--r--  arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c) | 82
-rw-r--r--  arch/x86/kernel/module_32.c | 152
-rw-r--r--  arch/x86/kernel/msr.c | 6
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 15
-rw-r--r--  arch/x86/kernel/signal.c | 6
-rw-r--r--  arch/x86/kernel/smp.c | 31
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/stacktrace.c | 7
-rw-r--r--  arch/x86/kernel/traps.c | 11
-rw-r--r--  arch/x86/kernel/tsc.c | 1
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 2
-rw-r--r--  arch/x86/kvm/Kconfig | 6
-rw-r--r--  arch/x86/kvm/Makefile | 2
-rw-r--r--  arch/x86/kvm/i8254.c | 109
-rw-r--r--  arch/x86/kvm/i8254.h | 12
-rw-r--r--  arch/x86/kvm/irq.c | 7
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 18
-rw-r--r--  arch/x86/kvm/lapic.c | 251
-rw-r--r--  arch/x86/kvm/lapic.h | 12
-rw-r--r--  arch/x86/kvm/mmu.c | 194
-rw-r--r--  arch/x86/kvm/mmu.h | 5
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 16
-rw-r--r--  arch/x86/kvm/svm.c | 415
-rw-r--r--  arch/x86/kvm/timer.c | 46
-rw-r--r--  arch/x86/kvm/vmx.c | 723
-rw-r--r--  arch/x86/kvm/x86.c | 409
-rw-r--r--  arch/x86/kvm/x86.h | 14
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 141
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 158
-rw-r--r--  arch/x86/lguest/i386_head.S | 60
-rw-r--r--  arch/x86/lib/Makefile | 2
-rw-r--r--  arch/x86/lib/msr-on-cpu.c | 97
-rw-r--r--  arch/x86/lib/msr.c | 183
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/fault.c | 18
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/init_32.c | 12
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  arch/x86/mm/kmemcheck/Makefile | 1
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 228
-rw-r--r--  arch/x86/mm/kmemcheck/error.h | 15
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 640
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c | 106
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.h | 9
-rw-r--r--  arch/x86/mm/kmemcheck/pte.c | 22
-rw-r--r--  arch/x86/mm/kmemcheck/pte.h | 10
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c | 69
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.h | 6
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 162
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h | 16
-rw-r--r--  arch/x86/mm/memtest.c | 9
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/mm/pgtable.c | 12
-rw-r--r--  arch/x86/power/Makefile | 2
-rw-r--r--  arch/x86/power/cpu.c (renamed from arch/x86/power/cpu_64.c) | 165
-rw-r--r--  arch/x86/power/cpu_32.c | 148
136 files changed, 6820 insertions, 3339 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68f5578fe38e..cf42fc305419 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
46 select HAVE_KERNEL_GZIP 46 select HAVE_KERNEL_GZIP
47 select HAVE_KERNEL_BZIP2 47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 48 select HAVE_KERNEL_LZMA
49 select HAVE_ARCH_KMEMCHECK
49 50
50config OUTPUT_FORMAT 51config OUTPUT_FORMAT
51 string 52 string
@@ -789,10 +790,26 @@ config X86_MCE
789 to disable it. MCE support simply ignores non-MCE processors like 790 to disable it. MCE support simply ignores non-MCE processors like
790 the 386 and 486, so nearly everyone can say Y here. 791 the 386 and 486, so nearly everyone can say Y here.
791 792
793config X86_OLD_MCE
794 depends on X86_32 && X86_MCE
795 bool "Use legacy machine check code (will go away)"
796 default n
797 select X86_ANCIENT_MCE
798 ---help---
799 Use the old i386 machine check code. This is merely intended for
800 testing in a transition period. Try this if you run into any machine
801 check related software problems, but report the problem to
802 linux-kernel. When in doubt say no.
803
804config X86_NEW_MCE
805 depends on X86_MCE
806 bool
807 default y if (!X86_OLD_MCE && X86_32) || X86_64
808
792config X86_MCE_INTEL 809config X86_MCE_INTEL
793 def_bool y 810 def_bool y
794 prompt "Intel MCE features" 811 prompt "Intel MCE features"
795 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 812 depends on X86_NEW_MCE && X86_LOCAL_APIC
796 ---help--- 813 ---help---
797 Additional support for intel specific MCE features such as 814 Additional support for intel specific MCE features such as
798 the thermal monitor. 815 the thermal monitor.
@@ -800,19 +817,36 @@ config X86_MCE_INTEL
800config X86_MCE_AMD 817config X86_MCE_AMD
801 def_bool y 818 def_bool y
802 prompt "AMD MCE features" 819 prompt "AMD MCE features"
803 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 820 depends on X86_NEW_MCE && X86_LOCAL_APIC
804 ---help--- 821 ---help---
805 Additional support for AMD specific MCE features such as 822 Additional support for AMD specific MCE features such as
806 the DRAM Error Threshold. 823 the DRAM Error Threshold.
807 824
825config X86_ANCIENT_MCE
826 def_bool n
827 depends on X86_32
828 prompt "Support for old Pentium 5 / WinChip machine checks"
829 ---help---
830 Include support for machine check handling on old Pentium 5 or WinChip
831 systems. These typically need to be enabled explicitly on the command
832 line.
833
808config X86_MCE_THRESHOLD 834config X86_MCE_THRESHOLD
809 depends on X86_MCE_AMD || X86_MCE_INTEL 835 depends on X86_MCE_AMD || X86_MCE_INTEL
810 bool 836 bool
811 default y 837 default y
812 838
839config X86_MCE_INJECT
840 depends on X86_NEW_MCE
841 tristate "Machine check injector support"
842 ---help---
843 Provide support for injecting machine checks for testing purposes.
844 If you don't know what a machine check is and you don't do kernel
845 QA it is safe to say n.
846
813config X86_MCE_NONFATAL 847config X86_MCE_NONFATAL
814 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 848 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
815 depends on X86_32 && X86_MCE 849 depends on X86_OLD_MCE
816 ---help--- 850 ---help---
817 Enabling this feature starts a timer that triggers every 5 seconds which 851 Enabling this feature starts a timer that triggers every 5 seconds which
818 will look at the machine check registers to see if anything happened. 852 will look at the machine check registers to see if anything happened.
@@ -825,11 +859,15 @@ config X86_MCE_NONFATAL
825 859
826config X86_MCE_P4THERMAL 860config X86_MCE_P4THERMAL
827 bool "check for P4 thermal throttling interrupt." 861 bool "check for P4 thermal throttling interrupt."
828 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) 862 depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
829 ---help--- 863 ---help---
830 Enabling this feature will cause a message to be printed when the P4 864 Enabling this feature will cause a message to be printed when the P4
831 enters thermal throttling. 865 enters thermal throttling.
832 866
867config X86_THERMAL_VECTOR
868 def_bool y
869 depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
870
833config VM86 871config VM86
834 bool "Enable VM86 support" if EMBEDDED 872 bool "Enable VM86 support" if EMBEDDED
835 default y 873 default y
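
For orientation only (not part of the patch): the MCE options added above become ordinary CONFIG_* defines, so code can tell the three machine check flavours apart at compile time. The function below is a hypothetical sketch; the real selection happens in arch/x86/kernel/cpu/mcheck/.

#include <linux/kernel.h>

/* Illustrative helper: report which MCE code the kernel was built with. */
static void mce_report_flavour(void)
{
#if defined(CONFIG_X86_NEW_MCE)
	printk(KERN_INFO "mce: unified 32/64-bit machine check code\n");
#elif defined(CONFIG_X86_OLD_MCE)
	printk(KERN_INFO "mce: legacy i386 machine check code\n");
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
	printk(KERN_INFO "mce: Pentium 5/WinChip handlers built in\n");
#endif
}
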
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
81 endif 81 endif
82endif 82endif
83 83
84# Don't unroll struct assignments with kmemcheck enabled
85ifeq ($(CONFIG_KMEMCHECK),y)
86 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
87endif
88
84# Stackpointer is addressed different for 32 bit and 64 bit x86 89# Stackpointer is addressed different for 32 bit and 64 bit x86
85sp-$(CONFIG_X86_32) := esp 90sp-$(CONFIG_X86_32) := esp
86sp-$(CONFIG_X86_64) := rsp 91sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b4..cfb0010fa940 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af65497..4e663398f77f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
21#include <asm/i387.h> 21#include <asm/i387.h>
22#include <asm/aes.h> 22#include <asm/aes.h>
23 23
24#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
25#define HAS_CTR
26#endif
27
28#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
29#define HAS_LRW
30#endif
31
32#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
33#define HAS_PCBC
34#endif
35
36#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
37#define HAS_XTS
38#endif
39
24struct async_aes_ctx { 40struct async_aes_ctx {
25 struct cryptd_ablkcipher *cryptd_tfm; 41 struct cryptd_ablkcipher *cryptd_tfm;
26}; 42};
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
137 } 153 }
138}; 154};
139 155
156static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
157{
158 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
159
160 aesni_enc(ctx, dst, src);
161}
162
163static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
164{
165 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
166
167 aesni_dec(ctx, dst, src);
168}
169
170static struct crypto_alg __aesni_alg = {
171 .cra_name = "__aes-aesni",
172 .cra_driver_name = "__driver-aes-aesni",
173 .cra_priority = 0,
174 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
175 .cra_blocksize = AES_BLOCK_SIZE,
176 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
177 .cra_alignmask = 0,
178 .cra_module = THIS_MODULE,
179 .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
180 .cra_u = {
181 .cipher = {
182 .cia_min_keysize = AES_MIN_KEY_SIZE,
183 .cia_max_keysize = AES_MAX_KEY_SIZE,
184 .cia_setkey = aes_set_key,
185 .cia_encrypt = __aes_encrypt,
186 .cia_decrypt = __aes_decrypt
187 }
188 }
189};
190
140static int ecb_encrypt(struct blkcipher_desc *desc, 191static int ecb_encrypt(struct blkcipher_desc *desc,
141 struct scatterlist *dst, struct scatterlist *src, 192 struct scatterlist *dst, struct scatterlist *src,
142 unsigned int nbytes) 193 unsigned int nbytes)
@@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
277 unsigned int key_len) 328 unsigned int key_len)
278{ 329{
279 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 330 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
331 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
332 int err;
280 333
281 return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); 334 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
335 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
336 & CRYPTO_TFM_REQ_MASK);
337 err = crypto_ablkcipher_setkey(child, key, key_len);
338 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
339 & CRYPTO_TFM_RES_MASK);
340 return err;
282} 341}
283 342
284static int ablk_encrypt(struct ablkcipher_request *req) 343static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = {
411 }, 470 },
412}; 471};
413 472
473#ifdef HAS_CTR
474static int ablk_ctr_init(struct crypto_tfm *tfm)
475{
476 struct cryptd_ablkcipher *cryptd_tfm;
477
478 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
479 0, 0);
480 if (IS_ERR(cryptd_tfm))
481 return PTR_ERR(cryptd_tfm);
482 ablk_init_common(tfm, cryptd_tfm);
483 return 0;
484}
485
486static struct crypto_alg ablk_ctr_alg = {
487 .cra_name = "ctr(aes)",
488 .cra_driver_name = "ctr-aes-aesni",
489 .cra_priority = 400,
490 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
491 .cra_blocksize = 1,
492 .cra_ctxsize = sizeof(struct async_aes_ctx),
493 .cra_alignmask = 0,
494 .cra_type = &crypto_ablkcipher_type,
495 .cra_module = THIS_MODULE,
496 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
497 .cra_init = ablk_ctr_init,
498 .cra_exit = ablk_exit,
499 .cra_u = {
500 .ablkcipher = {
501 .min_keysize = AES_MIN_KEY_SIZE,
502 .max_keysize = AES_MAX_KEY_SIZE,
503 .ivsize = AES_BLOCK_SIZE,
504 .setkey = ablk_set_key,
505 .encrypt = ablk_encrypt,
506 .decrypt = ablk_decrypt,
507 .geniv = "chainiv",
508 },
509 },
510};
511#endif
512
513#ifdef HAS_LRW
514static int ablk_lrw_init(struct crypto_tfm *tfm)
515{
516 struct cryptd_ablkcipher *cryptd_tfm;
517
518 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
519 0, 0);
520 if (IS_ERR(cryptd_tfm))
521 return PTR_ERR(cryptd_tfm);
522 ablk_init_common(tfm, cryptd_tfm);
523 return 0;
524}
525
526static struct crypto_alg ablk_lrw_alg = {
527 .cra_name = "lrw(aes)",
528 .cra_driver_name = "lrw-aes-aesni",
529 .cra_priority = 400,
530 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
531 .cra_blocksize = AES_BLOCK_SIZE,
532 .cra_ctxsize = sizeof(struct async_aes_ctx),
533 .cra_alignmask = 0,
534 .cra_type = &crypto_ablkcipher_type,
535 .cra_module = THIS_MODULE,
536 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
537 .cra_init = ablk_lrw_init,
538 .cra_exit = ablk_exit,
539 .cra_u = {
540 .ablkcipher = {
541 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
542 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
543 .ivsize = AES_BLOCK_SIZE,
544 .setkey = ablk_set_key,
545 .encrypt = ablk_encrypt,
546 .decrypt = ablk_decrypt,
547 },
548 },
549};
550#endif
551
552#ifdef HAS_PCBC
553static int ablk_pcbc_init(struct crypto_tfm *tfm)
554{
555 struct cryptd_ablkcipher *cryptd_tfm;
556
557 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
558 0, 0);
559 if (IS_ERR(cryptd_tfm))
560 return PTR_ERR(cryptd_tfm);
561 ablk_init_common(tfm, cryptd_tfm);
562 return 0;
563}
564
565static struct crypto_alg ablk_pcbc_alg = {
566 .cra_name = "pcbc(aes)",
567 .cra_driver_name = "pcbc-aes-aesni",
568 .cra_priority = 400,
569 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
570 .cra_blocksize = AES_BLOCK_SIZE,
571 .cra_ctxsize = sizeof(struct async_aes_ctx),
572 .cra_alignmask = 0,
573 .cra_type = &crypto_ablkcipher_type,
574 .cra_module = THIS_MODULE,
575 .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
576 .cra_init = ablk_pcbc_init,
577 .cra_exit = ablk_exit,
578 .cra_u = {
579 .ablkcipher = {
580 .min_keysize = AES_MIN_KEY_SIZE,
581 .max_keysize = AES_MAX_KEY_SIZE,
582 .ivsize = AES_BLOCK_SIZE,
583 .setkey = ablk_set_key,
584 .encrypt = ablk_encrypt,
585 .decrypt = ablk_decrypt,
586 },
587 },
588};
589#endif
590
591#ifdef HAS_XTS
592static int ablk_xts_init(struct crypto_tfm *tfm)
593{
594 struct cryptd_ablkcipher *cryptd_tfm;
595
596 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
597 0, 0);
598 if (IS_ERR(cryptd_tfm))
599 return PTR_ERR(cryptd_tfm);
600 ablk_init_common(tfm, cryptd_tfm);
601 return 0;
602}
603
604static struct crypto_alg ablk_xts_alg = {
605 .cra_name = "xts(aes)",
606 .cra_driver_name = "xts-aes-aesni",
607 .cra_priority = 400,
608 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
609 .cra_blocksize = AES_BLOCK_SIZE,
610 .cra_ctxsize = sizeof(struct async_aes_ctx),
611 .cra_alignmask = 0,
612 .cra_type = &crypto_ablkcipher_type,
613 .cra_module = THIS_MODULE,
614 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
615 .cra_init = ablk_xts_init,
616 .cra_exit = ablk_exit,
617 .cra_u = {
618 .ablkcipher = {
619 .min_keysize = 2 * AES_MIN_KEY_SIZE,
620 .max_keysize = 2 * AES_MAX_KEY_SIZE,
621 .ivsize = AES_BLOCK_SIZE,
622 .setkey = ablk_set_key,
623 .encrypt = ablk_encrypt,
624 .decrypt = ablk_decrypt,
625 },
626 },
627};
628#endif
629
414static int __init aesni_init(void) 630static int __init aesni_init(void)
415{ 631{
416 int err; 632 int err;
@@ -421,6 +637,8 @@ static int __init aesni_init(void)
421 } 637 }
422 if ((err = crypto_register_alg(&aesni_alg))) 638 if ((err = crypto_register_alg(&aesni_alg)))
423 goto aes_err; 639 goto aes_err;
640 if ((err = crypto_register_alg(&__aesni_alg)))
641 goto __aes_err;
424 if ((err = crypto_register_alg(&blk_ecb_alg))) 642 if ((err = crypto_register_alg(&blk_ecb_alg)))
425 goto blk_ecb_err; 643 goto blk_ecb_err;
426 if ((err = crypto_register_alg(&blk_cbc_alg))) 644 if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +647,41 @@ static int __init aesni_init(void)
429 goto ablk_ecb_err; 647 goto ablk_ecb_err;
430 if ((err = crypto_register_alg(&ablk_cbc_alg))) 648 if ((err = crypto_register_alg(&ablk_cbc_alg)))
431 goto ablk_cbc_err; 649 goto ablk_cbc_err;
650#ifdef HAS_CTR
651 if ((err = crypto_register_alg(&ablk_ctr_alg)))
652 goto ablk_ctr_err;
653#endif
654#ifdef HAS_LRW
655 if ((err = crypto_register_alg(&ablk_lrw_alg)))
656 goto ablk_lrw_err;
657#endif
658#ifdef HAS_PCBC
659 if ((err = crypto_register_alg(&ablk_pcbc_alg)))
660 goto ablk_pcbc_err;
661#endif
662#ifdef HAS_XTS
663 if ((err = crypto_register_alg(&ablk_xts_alg)))
664 goto ablk_xts_err;
665#endif
432 666
433 return err; 667 return err;
434 668
669#ifdef HAS_XTS
670ablk_xts_err:
671#endif
672#ifdef HAS_PCBC
673 crypto_unregister_alg(&ablk_pcbc_alg);
674ablk_pcbc_err:
675#endif
676#ifdef HAS_LRW
677 crypto_unregister_alg(&ablk_lrw_alg);
678ablk_lrw_err:
679#endif
680#ifdef HAS_CTR
681 crypto_unregister_alg(&ablk_ctr_alg);
682ablk_ctr_err:
683#endif
684 crypto_unregister_alg(&ablk_cbc_alg);
435ablk_cbc_err: 685ablk_cbc_err:
436 crypto_unregister_alg(&ablk_ecb_alg); 686 crypto_unregister_alg(&ablk_ecb_alg);
437ablk_ecb_err: 687ablk_ecb_err:
@@ -439,6 +689,8 @@ ablk_ecb_err:
439blk_cbc_err: 689blk_cbc_err:
440 crypto_unregister_alg(&blk_ecb_alg); 690 crypto_unregister_alg(&blk_ecb_alg);
441blk_ecb_err: 691blk_ecb_err:
692 crypto_unregister_alg(&__aesni_alg);
693__aes_err:
442 crypto_unregister_alg(&aesni_alg); 694 crypto_unregister_alg(&aesni_alg);
443aes_err: 695aes_err:
444 return err; 696 return err;
@@ -446,10 +698,23 @@ aes_err:
446 698
447static void __exit aesni_exit(void) 699static void __exit aesni_exit(void)
448{ 700{
701#ifdef HAS_XTS
702 crypto_unregister_alg(&ablk_xts_alg);
703#endif
704#ifdef HAS_PCBC
705 crypto_unregister_alg(&ablk_pcbc_alg);
706#endif
707#ifdef HAS_LRW
708 crypto_unregister_alg(&ablk_lrw_alg);
709#endif
710#ifdef HAS_CTR
711 crypto_unregister_alg(&ablk_ctr_alg);
712#endif
449 crypto_unregister_alg(&ablk_cbc_alg); 713 crypto_unregister_alg(&ablk_cbc_alg);
450 crypto_unregister_alg(&ablk_ecb_alg); 714 crypto_unregister_alg(&ablk_ecb_alg);
451 crypto_unregister_alg(&blk_cbc_alg); 715 crypto_unregister_alg(&blk_cbc_alg);
452 crypto_unregister_alg(&blk_ecb_alg); 716 crypto_unregister_alg(&blk_ecb_alg);
717 crypto_unregister_alg(&__aesni_alg);
453 crypto_unregister_alg(&aesni_alg); 718 crypto_unregister_alg(&aesni_alg);
454} 719}
455 720
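
A hedged usage sketch, not part of this patch: once "ctr(aes)" and the other mode wrappers above are registered, kernel users reach them through the regular asynchronous block cipher API. The function, key and buffer names below are placeholders, and a real caller would set a completion callback and wait for -EINPROGRESS requests instead of ignoring them.

#include <crypto/aes.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static int ctr_aes_demo(const u8 *key, u8 *iv, struct scatterlist *sg,
			unsigned int nbytes)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	int err;

	/* Resolves to ctr-aes-aesni (priority 400) when AES-NI is present. */
	tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, AES_KEYSIZE_128);
	if (err)
		goto out_free_tfm;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* Encrypt in place; a real user registers a callback and waits for
	 * asynchronous completion before freeing the request. */
	ablkcipher_request_set_crypt(req, sg, sg, nbytes, iv);
	err = crypto_ablkcipher_encrypt(req);

	ablkcipher_request_free(req);
out_free_tfm:
	crypto_free_ablkcipher(tfm);
	return err;
}
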
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000000..5f9781a3815f
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
1/*
2 * FPU: Wrapper for blkcipher touching fpu
3 *
4 * Copyright (c) Intel Corp.
5 * Author: Huang Ying <ying.huang@intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the License, or (at your option)
10 * any later version.
11 *
12 */
13
14#include <crypto/algapi.h>
15#include <linux/err.h>
16#include <linux/init.h>
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <asm/i387.h>
20
21struct crypto_fpu_ctx {
22 struct crypto_blkcipher *child;
23};
24
25static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
26 unsigned int keylen)
27{
28 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
29 struct crypto_blkcipher *child = ctx->child;
30 int err;
31
32 crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
33 crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
34 CRYPTO_TFM_REQ_MASK);
35 err = crypto_blkcipher_setkey(child, key, keylen);
36 crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
37 CRYPTO_TFM_RES_MASK);
38 return err;
39}
40
41static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
42 struct scatterlist *dst, struct scatterlist *src,
43 unsigned int nbytes)
44{
45 int err;
46 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
47 struct crypto_blkcipher *child = ctx->child;
48 struct blkcipher_desc desc = {
49 .tfm = child,
50 .info = desc_in->info,
51 .flags = desc_in->flags,
52 };
53
54 kernel_fpu_begin();
55 err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
56 kernel_fpu_end();
57 return err;
58}
59
60static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
61 struct scatterlist *dst, struct scatterlist *src,
62 unsigned int nbytes)
63{
64 int err;
65 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
66 struct crypto_blkcipher *child = ctx->child;
67 struct blkcipher_desc desc = {
68 .tfm = child,
69 .info = desc_in->info,
70 .flags = desc_in->flags,
71 };
72
73 kernel_fpu_begin();
74 err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
75 kernel_fpu_end();
76 return err;
77}
78
79static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
80{
81 struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
82 struct crypto_spawn *spawn = crypto_instance_ctx(inst);
83 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
84 struct crypto_blkcipher *cipher;
85
86 cipher = crypto_spawn_blkcipher(spawn);
87 if (IS_ERR(cipher))
88 return PTR_ERR(cipher);
89
90 ctx->child = cipher;
91 return 0;
92}
93
94static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
95{
96 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
97 crypto_free_blkcipher(ctx->child);
98}
99
100static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
101{
102 struct crypto_instance *inst;
103 struct crypto_alg *alg;
104 int err;
105
106 err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
107 if (err)
108 return ERR_PTR(err);
109
110 alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
111 CRYPTO_ALG_TYPE_MASK);
112 if (IS_ERR(alg))
113 return ERR_CAST(alg);
114
115 inst = crypto_alloc_instance("fpu", alg);
116 if (IS_ERR(inst))
117 goto out_put_alg;
118
119 inst->alg.cra_flags = alg->cra_flags;
120 inst->alg.cra_priority = alg->cra_priority;
121 inst->alg.cra_blocksize = alg->cra_blocksize;
122 inst->alg.cra_alignmask = alg->cra_alignmask;
123 inst->alg.cra_type = alg->cra_type;
124 inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
125 inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
126 inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
127 inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
128 inst->alg.cra_init = crypto_fpu_init_tfm;
129 inst->alg.cra_exit = crypto_fpu_exit_tfm;
130 inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
131 inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
132 inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
133
134out_put_alg:
135 crypto_mod_put(alg);
136 return inst;
137}
138
139static void crypto_fpu_free(struct crypto_instance *inst)
140{
141 crypto_drop_spawn(crypto_instance_ctx(inst));
142 kfree(inst);
143}
144
145static struct crypto_template crypto_fpu_tmpl = {
146 .name = "fpu",
147 .alloc = crypto_fpu_alloc,
148 .free = crypto_fpu_free,
149 .module = THIS_MODULE,
150};
151
152static int __init crypto_fpu_module_init(void)
153{
154 return crypto_register_template(&crypto_fpu_tmpl);
155}
156
157static void __exit crypto_fpu_module_exit(void)
158{
159 crypto_unregister_template(&crypto_fpu_tmpl);
160}
161
162module_init(crypto_fpu_module_init);
163module_exit(crypto_fpu_module_exit);
164
165MODULE_LICENSE("GPL");
166MODULE_DESCRIPTION("FPU block cipher wrapper");
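
How the new template is meant to be used (a sketch, not part of the patch): "fpu" wraps a synchronous blkcipher so that its encrypt/decrypt paths run between kernel_fpu_begin() and kernel_fpu_end(). The aesni glue above instantiates it indirectly through cryptd ("fpu(ctr(__driver-aes-aesni))"); a direct synchronous caller could look roughly like this, with placeholder key, IV and data:

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int fpu_wrapped_ctr_demo(const u8 *key, unsigned int keylen,
				u8 *iv, struct scatterlist *sg,
				unsigned int nbytes)
{
	struct crypto_blkcipher *tfm;
	struct blkcipher_desc desc;
	int err;

	/* The inner __driver-aes-aesni cipher needs the FPU; the fpu()
	 * wrapper supplies the kernel_fpu_begin()/end() bracketing. */
	tfm = crypto_alloc_blkcipher("fpu(ctr(__driver-aes-aesni))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_blkcipher_setkey(tfm, key, keylen);
	if (!err) {
		desc.tfm   = tfm;
		desc.info  = iv;
		desc.flags = 0;
		/* _iv variant takes the counter block from desc.info. */
		err = crypto_blkcipher_encrypt_iv(&desc, sg, sg, nbytes);
	}

	crypto_free_blkcipher(tfm);
	return err;
}
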
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index aff9f1fcdcd7..8cb9c814e120 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -483,5 +483,5 @@ atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
483 return old_val < 0; 483 return old_val < 0;
484} 484}
485 485
486#include <asm-generic/atomic.h> 486#include <asm-generic/atomic-long.h>
487#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 8c21731984da..0d6360220007 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -455,5 +455,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
455#define smp_mb__before_atomic_inc() barrier() 455#define smp_mb__before_atomic_inc() barrier()
456#define smp_mb__after_atomic_inc() barrier() 456#define smp_mb__after_atomic_inc() barrier()
457 457
458#include <asm-generic/atomic.h> 458#include <asm-generic/atomic-long.h>
459#endif /* _ASM_X86_ATOMIC_64_H */ 459#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..b0ae1c4dc791
--- /dev/null
+++ b/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_X86_BITSPERLONG_H
2#define __ASM_X86_BITSPERLONG_H
3
4#ifdef __x86_64__
5# define __BITS_PER_LONG 64
6#else
7# define __BITS_PER_LONG 32
8#endif
9
10#include <asm-generic/bitsperlong.h>
11
12#endif /* __ASM_X86_BITSPERLONG_H */
13
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 19af42138f78..4a28d22d4793 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -116,6 +116,8 @@
116#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ 116#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
117#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ 117#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
118#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 118#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
119#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
120#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
119#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 121#define X86_FEATURE_AES (4*32+25) /* AES instructions */
120#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 122#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
121#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 123#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
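
The two new feature bits are consumed through the usual capability accessors; a tiny illustrative check (not part of the patch, the helper name is made up):

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/cpufeature.h>

static void __init report_movbe_popcnt(void)
{
	/* boot_cpu_has() tests the CPUID-derived capability words above. */
	if (boot_cpu_has(X86_FEATURE_MOVBE))
		printk(KERN_INFO "CPU supports MOVBE\n");
	if (boot_cpu_has(X86_FEATURE_POPCNT))
		printk(KERN_INFO "CPU supports POPCNT\n");
}
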
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..b93405b228b4 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
6 * Documentation/DMA-API.txt for documentation. 6 * Documentation/DMA-API.txt for documentation.
7 */ 7 */
8 8
9#include <linux/kmemcheck.h>
9#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
10#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
@@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
60 dma_addr_t addr; 61 dma_addr_t addr;
61 62
62 BUG_ON(!valid_dma_direction(dir)); 63 BUG_ON(!valid_dma_direction(dir));
64 kmemcheck_mark_initialized(ptr, size);
63 addr = ops->map_page(hwdev, virt_to_page(ptr), 65 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size, 66 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL); 67 dir, NULL);
@@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
87{ 89{
88 struct dma_map_ops *ops = get_dma_ops(hwdev); 90 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents; 91 int ents;
92 struct scatterlist *s;
93 int i;
90 94
91 BUG_ON(!valid_dma_direction(dir)); 95 BUG_ON(!valid_dma_direction(dir));
96 for_each_sg(sg, s, nents, i)
97 kmemcheck_mark_initialized(sg_virt(s), s->length);
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL); 98 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir); 99 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
94 100
@@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
200 dma_addr_t addr; 206 dma_addr_t addr;
201 207
202 BUG_ON(!valid_dma_direction(dir)); 208 BUG_ON(!valid_dma_direction(dir));
209 kmemcheck_mark_initialized(page_address(page) + offset, size);
203 addr = ops->map_page(dev, page, offset, size, dir, NULL); 210 addr = ops->map_page(dev, page, offset, size, dir, NULL);
204 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 211 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205 212
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index d750a10ccad6..ff8cbfa07851 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
17 18
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, 19BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt) 20 smp_invalidate_interrupt)
@@ -52,8 +53,16 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
52BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_THERMAL_VECTOR
56BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 57BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
57#endif 58#endif
58 59
60#ifdef CONFIG_X86_MCE_THRESHOLD
61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
62#endif
63
64#ifdef CONFIG_X86_NEW_MCE
65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
66#endif
67
59#endif 68#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 9ebc5c255032..82e3e8f01043 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,7 +22,7 @@ typedef struct {
22#endif 22#endif
23#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_MCE
24 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
25# ifdef CONFIG_X86_64 25# ifdef CONFIG_X86_MCE_THRESHOLD
26 unsigned int irq_threshold_count; 26 unsigned int irq_threshold_count;
27# endif 27# endif
28#endif 28#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6df45f639666..ba180d93b08c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -34,6 +34,7 @@ extern void perf_pending_interrupt(void);
34extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
35extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
36extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
37extern void mce_self_interrupt(void);
37 38
38extern void invalidate_interrupt(void); 39extern void invalidate_interrupt(void);
39extern void invalidate_interrupt0(void); 40extern void invalidate_interrupt0(void);
@@ -46,6 +47,7 @@ extern void invalidate_interrupt6(void);
46extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
47 48
48extern void irq_move_cleanup_interrupt(void); 49extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void);
49extern void threshold_interrupt(void); 51extern void threshold_interrupt(void);
50 52
51extern void call_function_interrupt(void); 53extern void call_function_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e997be98c9b9..5b21f0ec3df2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#define NMI_VECTOR 0x02 27#define NMI_VECTOR 0x02
28#define MCE_VECTOR 0x12
28 29
29/* 30/*
30 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start
@@ -87,13 +88,8 @@
87#define CALL_FUNCTION_VECTOR 0xfc 88#define CALL_FUNCTION_VECTOR 0xfc
88#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 89#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
89#define THERMAL_APIC_VECTOR 0xfa 90#define THERMAL_APIC_VECTOR 0xfa
90 91#define THRESHOLD_APIC_VECTOR 0xf9
91#ifdef CONFIG_X86_32 92#define REBOOT_VECTOR 0xf8
92/* 0xf8 - 0xf9 : free */
93#else
94# define THRESHOLD_APIC_VECTOR 0xf9
95# define UV_BAU_MESSAGE 0xf8
96#endif
97 93
98/* f0-f7 used for spreading out TLB flushes: */ 94/* f0-f7 used for spreading out TLB flushes: */
99#define INVALIDATE_TLB_VECTOR_END 0xf7 95#define INVALIDATE_TLB_VECTOR_END 0xf7
@@ -117,6 +113,13 @@
117 */ 113 */
118#define LOCAL_PENDING_VECTOR 0xec 114#define LOCAL_PENDING_VECTOR 0xec
119 115
116#define UV_BAU_MESSAGE 0xec
117
118/*
119 * Self IPI vector for machine checks
120 */
121#define MCE_SELF_VECTOR 0xeb
122
120/* 123/*
121 * First APIC vector available to drivers: (vectors 0x30-0xee) we 124 * First APIC vector available to drivers: (vectors 0x30-0xee) we
122 * start at 0x31(0x41) to spread out vectors evenly between priority 125 * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5cf..9e00a731a7fb 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
2#define _ASM_X86_KMAP_TYPES_H 2#define _ASM_X86_KMAP_TYPES_H
3 3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) 4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n , 5#define __WITH_KM_FENCE
6#else
7# define D(n)
8#endif 6#endif
9 7
10enum km_type { 8#include <asm-generic/kmap_types.h>
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26 9
27#undef D 10#undef __WITH_KM_FENCE
28 11
29#endif /* _ASM_X86_KMAP_TYPES_H */ 12#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
1#ifndef ASM_X86_KMEMCHECK_H
2#define ASM_X86_KMEMCHECK_H
3
4#include <linux/types.h>
5#include <asm/ptrace.h>
6
7#ifdef CONFIG_KMEMCHECK
8bool kmemcheck_active(struct pt_regs *regs);
9
10void kmemcheck_show(struct pt_regs *regs);
11void kmemcheck_hide(struct pt_regs *regs);
12
13bool kmemcheck_fault(struct pt_regs *regs,
14 unsigned long address, unsigned long error_code);
15bool kmemcheck_trap(struct pt_regs *regs);
16#else
17static inline bool kmemcheck_active(struct pt_regs *regs)
18{
19 return false;
20}
21
22static inline void kmemcheck_show(struct pt_regs *regs)
23{
24}
25
26static inline void kmemcheck_hide(struct pt_regs *regs)
27{
28}
29
30static inline bool kmemcheck_fault(struct pt_regs *regs,
31 unsigned long address, unsigned long error_code)
32{
33 return false;
34}
35
36static inline bool kmemcheck_trap(struct pt_regs *regs)
37{
38 return false;
39}
40#endif /* CONFIG_KMEMCHECK */
41
42#endif
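
The header above is the whole arch-visible kmemcheck interface; the hooks collapse to inline stubs when CONFIG_KMEMCHECK is off. A hedged sketch of how a page fault path is expected to consult them (the wrapper function here is illustrative; the real call sites are wired into arch/x86/mm/fault.c and the trap handlers elsewhere in this series):

#include <asm/kmemcheck.h>
#include <asm/ptrace.h>

/* Returns nonzero when kmemcheck consumed the fault. */
static int fault_handled_by_kmemcheck(struct pt_regs *regs,
				      unsigned long address,
				      unsigned long error_code)
{
	/* A kmemcheck single-step still pending?  Re-hide its shadow pages
	 * before doing anything else with this fault. */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);

	/* Let kmemcheck simulate accesses to the pages it keeps non-present;
	 * true means the faulting instruction was handled and can resume. */
	if (kmemcheck_fault(regs, address, error_code))
		return 1;

	return 0;	/* fall through to the normal page fault handler */
}
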
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf11704..125be8b19568 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
16#define __KVM_HAVE_MSI 16#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX
19 20
20/* Architectural interrupt line count. */ 21/* Architectural interrupt line count. */
21#define KVM_NR_INTERRUPTS 256 22#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044ff..eabdc1cfab5c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
185 unsigned access:3; 185 unsigned access:3;
186 unsigned invalid:1; 186 unsigned invalid:1;
187 unsigned cr4_pge:1; 187 unsigned cr4_pge:1;
188 unsigned nxe:1;
188 }; 189 };
189}; 190};
190 191
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
212 int multimapped; /* More than one parent_pte? */ 213 int multimapped; /* More than one parent_pte? */
213 int root_count; /* Currently serving as active root */ 214 int root_count; /* Currently serving as active root */
214 bool unsync; 215 bool unsync;
215 bool global;
216 unsigned int unsync_children; 216 unsigned int unsync_children;
217 union { 217 union {
218 u64 *parent_pte; /* !multimapped */ 218 u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
261 union kvm_mmu_page_role base_role; 261 union kvm_mmu_page_role base_role;
262 262
263 u64 *pae_root; 263 u64 *pae_root;
264 u64 rsvd_bits_mask[2][4];
264}; 265};
265 266
266struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
267 u64 host_tsc; 268 u64 host_tsc;
268 int interrupt_window_open;
269 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
270 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
271 /* 269 /*
272 * rip and regs accesses must go through 270 * rip and regs accesses must go through
273 * kvm_{register,rip}_{read,write} functions. 271 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
286 u64 shadow_efer; 284 u64 shadow_efer;
287 u64 apic_base; 285 u64 apic_base;
288 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
287 int32_t apic_arb_prio;
289 int mp_state; 288 int mp_state;
290 int sipi_vector; 289 int sipi_vector;
291 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
320 struct kvm_pio_request pio; 319 struct kvm_pio_request pio;
321 void *pio_data; 320 void *pio_data;
322 321
322 u8 event_exit_inst_len;
323
323 struct kvm_queued_exception { 324 struct kvm_queued_exception {
324 bool pending; 325 bool pending;
325 bool has_error_code; 326 bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
329 330
330 struct kvm_queued_interrupt { 331 struct kvm_queued_interrupt {
331 bool pending; 332 bool pending;
333 bool soft;
332 u8 nr; 334 u8 nr;
333 } interrupt; 335 } interrupt;
334 336
335 struct { 337 struct {
336 int active; 338 int vm86_active;
337 u8 save_iopl; 339 u8 save_iopl;
338 struct kvm_save_segment { 340 struct kvm_save_segment {
339 u16 selector; 341 u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
356 unsigned int time_offset; 358 unsigned int time_offset;
357 struct page *time_page; 359 struct page *time_page;
358 360
361 bool singlestep; /* guest is single stepped by KVM */
359 bool nmi_pending; 362 bool nmi_pending;
360 bool nmi_injected; 363 bool nmi_injected;
361 bool nmi_window_open;
362 364
363 struct mtrr_state_type mtrr_state; 365 struct mtrr_state_type mtrr_state;
364 u32 pat; 366 u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
392 */ 394 */
393 struct list_head active_mmu_pages; 395 struct list_head active_mmu_pages;
394 struct list_head assigned_dev_head; 396 struct list_head assigned_dev_head;
395 struct list_head oos_global_pages;
396 struct iommu_domain *iommu_domain; 397 struct iommu_domain *iommu_domain;
398 int iommu_flags;
397 struct kvm_pic *vpic; 399 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 400 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 401 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list; 402 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 int round_robin_prev_vcpu;
404 unsigned int tss_addr; 405 unsigned int tss_addr;
405 struct page *apic_access_page; 406 struct page *apic_access_page;
406 407
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
423 u32 mmu_recycled; 424 u32 mmu_recycled;
424 u32 mmu_cache_miss; 425 u32 mmu_cache_miss;
425 u32 mmu_unsync; 426 u32 mmu_unsync;
426 u32 mmu_unsync_global;
427 u32 remote_tlb_flush; 427 u32 remote_tlb_flush;
428 u32 lpages; 428 u32 lpages;
429}; 429};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
443 u32 halt_exits; 443 u32 halt_exits;
444 u32 halt_wakeup; 444 u32 halt_wakeup;
445 u32 request_irq_exits; 445 u32 request_irq_exits;
446 u32 request_nmi_exits;
447 u32 irq_exits; 446 u32 irq_exits;
448 u32 host_state_reload; 447 u32 host_state_reload;
449 u32 efer_reload; 448 u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
511 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 510 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
512 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 511 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
513 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 512 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
513 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 void (*patch_hypercall)(struct kvm_vcpu *vcpu, 515 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
515 unsigned char *hypercall_addr); 516 unsigned char *hypercall_addr);
516 int (*get_irq)(struct kvm_vcpu *vcpu); 517 void (*set_irq)(struct kvm_vcpu *vcpu);
517 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 518 void (*set_nmi)(struct kvm_vcpu *vcpu);
518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 519 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
519 bool has_error_code, u32 error_code); 520 bool has_error_code, u32 error_code);
520 bool (*exception_injected)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
522 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 523 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 struct kvm_run *run); 524 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 525 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
526 int (*get_tdp_level)(void); 527 int (*get_tdp_level)(void);
527 int (*get_mt_mask_shift)(void); 528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528}; 529};
529 530
530extern struct kvm_x86_ops *kvm_x86_ops; 531extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
538void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 539void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
539void kvm_mmu_set_base_ptes(u64 base_pte); 540void kvm_mmu_set_base_ptes(u64 base_pte);
540void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 541void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
541 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); 542 u64 dirty_mask, u64 nx_mask, u64 x_mask);
542 543
543int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 544int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
544void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 545void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
552 const void *val, int bytes); 553 const void *val, int bytes);
553int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 554int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
554 gpa_t addr, unsigned long *ret); 555 gpa_t addr, unsigned long *ret);
556u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
555 557
556extern bool tdp_enabled; 558extern bool tdp_enabled;
557 559
@@ -563,6 +565,7 @@ enum emulation_result {
563 565
564#define EMULTYPE_NO_DECODE (1 << 0) 566#define EMULTYPE_NO_DECODE (1 << 0)
565#define EMULTYPE_TRAP_UD (1 << 1) 567#define EMULTYPE_TRAP_UD (1 << 1)
568#define EMULTYPE_SKIP (1 << 2)
566int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 569int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
567 unsigned long cr2, u16 error_code, int emulation_type); 570 unsigned long cr2, u16 error_code, int emulation_type);
568void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 571void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
638int kvm_mmu_load(struct kvm_vcpu *vcpu); 641int kvm_mmu_load(struct kvm_vcpu *vcpu);
639void kvm_mmu_unload(struct kvm_vcpu *vcpu); 642void kvm_mmu_unload(struct kvm_vcpu *vcpu);
640void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 643void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
641void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
642 644
643int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 645int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
644 646
@@ -769,6 +771,8 @@ enum {
769#define HF_GIF_MASK (1 << 0) 771#define HF_GIF_MASK (1 << 0)
770#define HF_HIF_MASK (1 << 1) 772#define HF_HIF_MASK (1 << 1)
771#define HF_VINTR_MASK (1 << 2) 773#define HF_VINTR_MASK (1 << 2)
774#define HF_NMI_MASK (1 << 3)
775#define HF_IRET_MASK (1 << 4)
772 776
773/* 777/*
774 * Hardware virtualization extension instructions may fault if a 778 * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
791#define KVM_ARCH_WANT_MMU_NOTIFIER 795#define KVM_ARCH_WANT_MMU_NOTIFIER
792int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
793int kvm_age_hva(struct kvm *kvm, unsigned long hva); 797int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
794 799
795#endif /* _ASM_X86_KVM_HOST_H */ 800#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881a..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
143 struct fetch_cache fetch; 143 struct fetch_cache fetch;
144}; 144};
145 145
146#define X86_SHADOW_INT_MOV_SS 1
147#define X86_SHADOW_INT_STI 2
148
146struct x86_emulate_ctxt { 149struct x86_emulate_ctxt {
147 /* Register state before/after emulation. */ 150 /* Register state before/after emulation. */
148 struct kvm_vcpu *vcpu; 151 struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
152 int mode; 155 int mode;
153 u32 cs_base; 156 u32 cs_base;
154 157
158 /* interruptibility state, as a result of execution of STI or MOV SS */
159 int interruptibility;
160
155 /* decode cache */ 161 /* decode cache */
156 struct decode_cache decode; 162 struct decode_cache decode;
157}; 163};
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9c..313389cd50d2 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */ 20/* We map at -4M (-2M when PAE is activated) for ease of mapping
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000
24#else
21#define SWITCHER_ADDR 0xFFC00000 25#define SWITCHER_ADDR 0xFFC00000
26#endif
22 27
23/* Found in switcher.S */ 28/* Found in switcher.S */
24extern unsigned long default_idt_entries[]; 29extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487b..d31c4a684078 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
12#define LHCALL_TS 8 12#define LHCALL_TS 8
13#define LHCALL_SET_CLOCKEVENT 9 13#define LHCALL_SET_CLOCKEVENT 9
14#define LHCALL_HALT 10 14#define LHCALL_HALT 10
15#define LHCALL_SET_PMD 13
15#define LHCALL_SET_PTE 14 16#define LHCALL_SET_PTE 14
16#define LHCALL_SET_PMD 15 17#define LHCALL_SET_PGD 15
17#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
18#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
19#define LHCALL_LOAD_GDT_ENTRY 18 20#define LHCALL_LOAD_GDT_ENTRY 18
21#define LHCALL_SEND_INTERRUPTS 19
20 22
21#define LGUEST_TRAP_ENTRY 0x1F 23#define LGUEST_TRAP_ENTRY 0x1F
22 24
@@ -32,10 +34,10 @@
32 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
33 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
34 * 36 *
35 * We use the KVM hypercall mechanism. Eighteen hypercalls are 37 * We use the KVM hypercall mechanism. Seventeen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the 38 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %ebx, %ecx and %edx. If a return 39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
38 * value makes sense, it's returned in %eax. 40 * If a return value makes sense, it's returned in %eax.
39 * 41 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's 43 * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
47 49
48#define LHCALL_RING_SIZE 64 50#define LHCALL_RING_SIZE 64
49struct hcall_args { 51struct hcall_args {
50 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 52 /* These map directly onto eax, ebx, ecx, edx and esi
51 unsigned long arg0, arg1, arg2, arg3; 53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4;
52}; 55};
53 56
54#endif /* !__ASSEMBLY__ */ 57#endif /* !__ASSEMBLY__ */
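
A sketch of the calling convention described in the comment above (illustrative only; the guest really issues these through the kvm_hypercall*() helpers, so the instruction below is just a stand-in): the hypercall number goes in %eax, up to four arguments in %ebx, %ecx, %edx and %esi, and any return value comes back in %eax.

/* "vmcall" stands in for the KVM hypercall instruction here; the real
 * call sites may be rewritten to whatever the host actually expects. */
static inline unsigned long hcall_sketch(unsigned long call,
					 unsigned long arg1,
					 unsigned long arg2,
					 unsigned long arg3,
					 unsigned long arg4)
{
	asm volatile("vmcall"
		     : "+a"(call)
		     : "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
		     : "memory");
	return call;
}
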
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e7..540a466e50f5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_MCE_H 1#ifndef _ASM_X86_MCE_H
2#define _ASM_X86_MCE_H 2#define _ASM_X86_MCE_H
3 3
4#ifdef __x86_64__
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <asm/ioctls.h> 5#include <asm/ioctls.h>
8 6
@@ -10,21 +8,35 @@
10 * Machine Check support for x86 8 * Machine Check support for x86
11 */ 9 */
12 10
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 12#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ 13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
16 14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 16#define MCG_EXT_CNT_SHIFT 16
19#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ 17#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
20 18#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
21#define MCI_STATUS_VAL (1UL<<63) /* valid error */ 19
22#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ 20#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
23#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ 21#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
24#define MCI_STATUS_EN (1UL<<60) /* error enabled */ 22#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
25#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ 23
26#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ 24#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
27#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ 25#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
26#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
27#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
28#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
29#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
30#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
31#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
32#define MCI_STATUS_AR (1ULL<<55) /* Action required */
33
34/* MISC register defines */
35#define MCM_ADDR_SEGOFF 0 /* segment offset */
36#define MCM_ADDR_LINEAR 1 /* linear address */
37#define MCM_ADDR_PHYS 2 /* physical address */
38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */
28 40
29/* Fields are zero when not available */ 41/* Fields are zero when not available */
30struct mce { 42struct mce {
@@ -34,13 +46,19 @@ struct mce {
34 __u64 mcgstatus; 46 __u64 mcgstatus;
35 __u64 ip; 47 __u64 ip;
36 __u64 tsc; /* cpu time stamp counter */ 48 __u64 tsc; /* cpu time stamp counter */
37 __u64 res1; /* for future extension */ 49 __u64 time; /* wall time_t when error was detected */
38 __u64 res2; /* dito. */ 50 __u8 cpuvendor; /* cpu vendor as encoded in system.h */
51 __u8 pad1;
52 __u16 pad2;
53 __u32 cpuid; /* CPUID 1 EAX */
39 __u8 cs; /* code segment */ 54 __u8 cs; /* code segment */
40 __u8 bank; /* machine check bank */ 55 __u8 bank; /* machine check bank */
41 __u8 cpu; /* cpu that raised the error */ 56 __u8 cpu; /* cpu number; obsolete; use extcpu now */
42 __u8 finished; /* entry is valid */ 57 __u8 finished; /* entry is valid */
43 __u32 pad; 58 __u32 extcpu; /* linux cpu number that detected the error */
59 __u32 socketid; /* CPU socket ID */
60 __u32 apicid; /* CPU initial apic ID */
61 __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
44}; 62};
45 63
46/* 64/*
@@ -57,7 +75,7 @@ struct mce_log {
57 unsigned len; /* = MCE_LOG_LEN */ 75 unsigned len; /* = MCE_LOG_LEN */
58 unsigned next; 76 unsigned next;
59 unsigned flags; 77 unsigned flags;
60 unsigned pad0; 78 unsigned recordlen; /* length of struct mce */
61 struct mce entry[MCE_LOG_LEN]; 79 struct mce entry[MCE_LOG_LEN];
62}; 80};
63 81
@@ -82,19 +100,16 @@ struct mce_log {
82#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 100#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
83#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 101#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
84 102
85#endif /* __x86_64__ */
86
87#ifdef __KERNEL__ 103#ifdef __KERNEL__
88 104
89#ifdef CONFIG_X86_32
90extern int mce_disabled; 105extern int mce_disabled;
91#else /* CONFIG_X86_32 */
92 106
93#include <asm/atomic.h> 107#include <asm/atomic.h>
108#include <linux/percpu.h>
94 109
95void mce_setup(struct mce *m); 110void mce_setup(struct mce *m);
96void mce_log(struct mce *m); 111void mce_log(struct mce *m);
97DECLARE_PER_CPU(struct sys_device, device_mce); 112DECLARE_PER_CPU(struct sys_device, mce_dev);
98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 113extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
99 114
100/* 115/*
@@ -104,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) 119#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105 120
106#ifdef CONFIG_X86_MCE_INTEL 121#ifdef CONFIG_X86_MCE_INTEL
122extern int mce_cmci_disabled;
123extern int mce_ignore_ce;
107void mce_intel_feature_init(struct cpuinfo_x86 *c); 124void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void); 125void cmci_clear(void);
109void cmci_reenable(void); 126void cmci_reenable(void);
@@ -123,13 +140,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 140static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
124#endif 141#endif
125 142
126extern int mce_available(struct cpuinfo_x86 *c); 143int mce_available(struct cpuinfo_x86 *c);
144
145DECLARE_PER_CPU(unsigned, mce_exception_count);
146DECLARE_PER_CPU(unsigned, mce_poll_count);
127 147
128void mce_log_therm_throt_event(__u64 status); 148void mce_log_therm_throt_event(__u64 status);
129 149
130extern atomic_t mce_entry; 150extern atomic_t mce_entry;
131 151
132extern void do_machine_check(struct pt_regs *, long); 152void do_machine_check(struct pt_regs *, long);
133 153
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); 154typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 155DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
@@ -139,14 +159,16 @@ enum mcp_flags {
139 MCP_UC = (1 << 1), /* log uncorrected errors */ 159 MCP_UC = (1 << 1), /* log uncorrected errors */
140 MCP_DONTLOG = (1 << 2), /* only clear, don't log */ 160 MCP_DONTLOG = (1 << 2), /* only clear, don't log */
141}; 161};
142extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 162void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
143 163
144extern int mce_notify_user(void); 164int mce_notify_irq(void);
165void mce_notify_process(void);
145 166
146#endif /* !CONFIG_X86_32 */ 167DECLARE_PER_CPU(struct mce, injectm);
168extern struct file_operations mce_chrdev_ops;
147 169
148#ifdef CONFIG_X86_MCE 170#ifdef CONFIG_X86_MCE
149extern void mcheck_init(struct cpuinfo_x86 *c); 171void mcheck_init(struct cpuinfo_x86 *c);
150#else 172#else
151#define mcheck_init(c) do { } while (0) 173#define mcheck_init(c) do { } while (0)
152#endif 174#endif
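
The mce.h rework above merges the former 32-bit and 64-bit variants, widens every status flag to 1ULL, adds MCG_CAP decoding helpers (bank count, extended-register count, software error recovery) and grows struct mce with time, vendor, topology and MCGCAP fields. As a minimal sketch of how the new capability macros compose, assuming only the definitions in this hunk plus the stock rdmsrl()/pr_info() helpers (the function name is made up for illustration):

	static void __init report_mcg_cap(void)
	{
		u64 cap;

		rdmsrl(MSR_IA32_MCG_CAP, cap);

		pr_info("MCE: %u banks, %u extended MSRs, ctl=%d cmci=%d ser=%d\n",
			(unsigned)(cap & MCG_BANKCNT_MASK),
			(cap & MCG_EXT_P) ? (unsigned)MCG_EXT_CNT(cap) : 0,
			!!(cap & MCG_CTL_P), !!(cap & MCG_CMCI_P),
			!!(cap & MCG_SER_P));
	}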
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 90bc4108a4fd..751af2550ed9 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_X86_MMAN_H 1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H 2#define _ASM_X86_MMAN_H
3 3
4#include <asm-generic/mman.h> 4#include <asm-generic/mman-common.h>
5 5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */ 6#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7 7
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4d58d04fca83..1692fb5050e3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -207,7 +207,14 @@
207 207
208#define MSR_IA32_THERM_CONTROL 0x0000019a 208#define MSR_IA32_THERM_CONTROL 0x0000019a
209#define MSR_IA32_THERM_INTERRUPT 0x0000019b 209#define MSR_IA32_THERM_INTERRUPT 0x0000019b
210
211#define THERM_INT_LOW_ENABLE (1 << 0)
212#define THERM_INT_HIGH_ENABLE (1 << 1)
213
210#define MSR_IA32_THERM_STATUS 0x0000019c 214#define MSR_IA32_THERM_STATUS 0x0000019c
215
216#define THERM_STATUS_PROCHOT (1 << 0)
217
211#define MSR_IA32_MISC_ENABLE 0x000001a0 218#define MSR_IA32_MISC_ENABLE 0x000001a0
212 219
213/* MISC_ENABLE bits: architectural */ 220/* MISC_ENABLE bits: architectural */
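
The new THERM_INT_*_ENABLE and THERM_STATUS_PROCHOT definitions give the thermal-throttle code symbolic names for what were previously magic masks. A rough sketch of the intended read-modify-write pattern, assuming it runs on a CPU with the thermal monitor present (illustrative only, not taken from this patch):

	static void enable_therm_thresholds(void)
	{
		u32 l, h;

		/* Ask for interrupts when crossing the low and high thresholds. */
		rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
		wrmsr(MSR_IA32_THERM_INTERRUPT,
		      l | THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE, h);

		/* PROCHOT in the status MSR means the core is being throttled. */
		rdmsr(MSR_IA32_THERM_STATUS, l, h);
		if (l & THERM_STATUS_PROCHOT)
			pr_info("thermal: PROCHOT asserted\n");
	}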
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf6241807..22603764e7db 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
12 12
13#include <asm/asm.h> 13#include <asm/asm.h>
14#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/cpumask.h>
16
17struct msr {
18 union {
19 struct {
20 u32 l;
21 u32 h;
22 };
23 u64 q;
24 };
25};
15 26
16static inline unsigned long long native_read_tscp(unsigned int *aux) 27static inline unsigned long long native_read_tscp(unsigned int *aux)
17{ 28{
@@ -216,6 +227,8 @@ do { \
216#ifdef CONFIG_SMP 227#ifdef CONFIG_SMP
217int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 228int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
218int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 229int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
230void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
231void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
219int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 232int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
220int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 233int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
221#else /* CONFIG_SMP */ 234#else /* CONFIG_SMP */
@@ -229,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229 wrmsr(msr_no, l, h); 242 wrmsr(msr_no, l, h);
230 return 0; 243 return 0;
231} 244}
245static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
246 struct msr *msrs)
247{
248 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
249}
250static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
251 struct msr *msrs)
252{
253 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
254}
232static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, 255static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
233 u32 *l, u32 *h) 256 u32 *l, u32 *h)
234{ 257{
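
The new struct msr union and the rdmsr_on_cpus()/wrmsr_on_cpus() helpers let callers batch MSR accesses over a cpumask and then view each value either as one 64-bit quantity (.q) or as the classic low/high halves (.l/.h); on !SMP the fallbacks above simply operate on entry 0. A minimal sketch, assuming one array slot per possible CPU (how the SMP implementation indexes the array is not shown in this hunk):

	static struct msr misc_enable[NR_CPUS];	/* assumption: indexed by CPU id */

	static void sample_misc_enable(void)
	{
		rdmsr_on_cpus(cpu_online_mask, MSR_IA32_MISC_ENABLE, misc_enable);

		/*
		 * Both views alias the same storage:
		 * misc_enable[0].q == ((u64)misc_enable[0].h << 32) | misc_enable[0].l
		 */
	}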
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 89ed9d70b0aa..625c3f0e741a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
56#endif /* __ASSEMBLY__ */ 56#endif /* __ASSEMBLY__ */
57 57
58#include <asm-generic/memory_model.h> 58#include <asm-generic/memory_model.h>
59#include <asm-generic/page.h> 59#include <asm-generic/getorder.h>
60 60
61#define __HAVE_ARCH_GATE_AREA 1 61#define __HAVE_ARCH_GATE_AREA 1
62 62
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18ef7ebf2631..3cc06e3fceb8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -317,6 +317,11 @@ static inline int pte_present(pte_t a)
317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
318} 318}
319 319
320static inline int pte_hidden(pte_t pte)
321{
322 return pte_flags(pte) & _PAGE_HIDDEN;
323}
324
320static inline int pmd_present(pmd_t pmd) 325static inline int pmd_present(pmd_t pmd)
321{ 326{
322 return pmd_flags(pmd) & _PAGE_PRESENT; 327 return pmd_flags(pmd) & _PAGE_PRESENT;
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 2733fad45f98..5e67c1532314 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) 46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
47#endif 47#endif
48 48
49#define MODULES_VADDR VMALLOC_START
50#define MODULES_END VMALLOC_END
51#define MODULES_LEN (MODULES_VADDR - MODULES_END)
52
49#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) 53#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
50 54
51#endif /* _ASM_X86_PGTABLE_32_DEFS_H */ 55#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11 21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 44#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL 48#define __HAVE_ARCH_PTE_SPECIAL
50 49
50#ifdef CONFIG_KMEMCHECK
51#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
52#else
53#define _PAGE_HIDDEN (_AT(pteval_t, 0))
54#endif
55
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 56#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 57#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else 58#else
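
Defining _PAGE_HIDDEN as 0 on non-kmemcheck builds is what lets the pte_hidden() helper added to pgtable.h above vanish entirely: the bitwise test against a constant zero is folded away and any branch guarded by it becomes dead code. A tiny illustration of the effect (not part of the patch; the wrapper name is hypothetical):

	static inline bool pte_needs_kmemcheck(pte_t pte)
	{
		/*
		 * CONFIG_KMEMCHECK=y: _PAGE_HIDDEN == (1 << 11), real runtime test.
		 * CONFIG_KMEMCHECK=n: _PAGE_HIDDEN == 0, constant-folded to false,
		 * so callers' branches are eliminated at compile time.
		 */
		return pte_hidden(pte);
	}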
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 7761a5d554bb..598457cbd0f8 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -117,7 +117,7 @@ typedef unsigned long sigset_t;
117#define MINSIGSTKSZ 2048 117#define MINSIGSTKSZ 2048
118#define SIGSTKSZ 8192 118#define SIGSTKSZ 8192
119 119
120#include <asm-generic/signal.h> 120#include <asm-generic/signal-defs.h>
121 121
122#ifndef __ASSEMBLY__ 122#ifndef __ASSEMBLY__
123 123
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 * No 3D Now! 177 * No 3D Now!
178 */ 178 */
179 179
180#ifndef CONFIG_KMEMCHECK
180#define memcpy(t, f, n) \ 181#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 182 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 183 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 184 : __memcpy((t), (f), (n)))
185#else
186/*
187 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
188 * because it means that we know both memory operands in advance.
189 */
190#define memcpy(t, f, n) __memcpy((t), (f), (n))
191#endif
184 192
185#endif 193#endif
186 194
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
27 function. */ 27 function. */
28 28
29#define __HAVE_ARCH_MEMCPY 1 29#define __HAVE_ARCH_MEMCPY 1
30#ifndef CONFIG_KMEMCHECK
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 31#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len); 32extern void *memcpy(void *to, const void *from, size_t len);
32#else 33#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
42 __ret; \ 43 __ret; \
43}) 44})
44#endif 45#endif
46#else
47/*
48 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
49 * because it means that we know both memory operands in advance.
50 */
51#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
52#endif
45 53
46#define __HAVE_ARCH_MEMSET 54#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n); 55void *memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3ebf..85574b7c1bc1 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
225#define SVM_EVTINJ_VALID_ERR (1 << 11) 225#define SVM_EVTINJ_VALID_ERR (1 << 11)
226 226
227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK 227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
228#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
228 229
229#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR 230#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
230#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI 231#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c49..c4ee8056baca 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); 67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); 68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); 69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 get_user(termios->c_line, &termio->c_line);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC); 71 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71} 72}
72 73
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 602c769fc98c..b0783520988b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -154,9 +154,9 @@ struct thread_info {
154 154
155/* thread information allocation */ 155/* thread information allocation */
156#ifdef CONFIG_DEBUG_STACK_USAGE 156#ifdef CONFIG_DEBUG_STACK_USAGE
157#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) 157#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
158#else 158#else
159#define THREAD_FLAGS GFP_KERNEL 159#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
160#endif 160#endif
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981f..1375cfc93960 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
4#include <asm/processor.h> 4#include <asm/processor.h>
5#include <asm/tsc.h> 5#include <asm/tsc.h>
6 6
7/* The PIT ticks at this frequency (in HZ): */ 7/* Assume we use the PIT time source for the clock tick */
8#define PIT_TICK_RATE 1193182
9
10#define CLOCK_TICK_RATE PIT_TICK_RATE 8#define CLOCK_TICK_RATE PIT_TICK_RATE
11 9
12#define ARCH_HAS_READ_CURRENT_TIMER 10#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index a5ecc9c33e92..7f3eba08e7de 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
172 flush_tlb_all(); 172 flush_tlb_all();
173} 173}
174 174
175extern void zap_low_mappings(void); 175extern void zap_low_mappings(bool early);
176 176
177#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index e6f736320077..09b97745772f 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -14,12 +14,6 @@ typedef unsigned short umode_t;
14 */ 14 */
15#ifdef __KERNEL__ 15#ifdef __KERNEL__
16 16
17#ifdef CONFIG_X86_32
18# define BITS_PER_LONG 32
19#else
20# define BITS_PER_LONG 64
21#endif
22
23#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
24 18
25typedef u64 dma64_addr_t; 19typedef u64 dma64_addr_t;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b9..11be5ad2e0e9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
247#define EXIT_REASON_MSR_READ 31 247#define EXIT_REASON_MSR_READ 31
248#define EXIT_REASON_MSR_WRITE 32 248#define EXIT_REASON_MSR_WRITE 32
249#define EXIT_REASON_MWAIT_INSTRUCTION 36 249#define EXIT_REASON_MWAIT_INSTRUCTION 36
250#define EXIT_REASON_MCE_DURING_VMENTRY 41
250#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 251#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
251#define EXIT_REASON_APIC_ACCESS 44 252#define EXIT_REASON_APIC_ACCESS 44
252#define EXIT_REASON_EPT_VIOLATION 48 253#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#else
1#ifdef CONFIG_X86_32 5#ifdef CONFIG_X86_32
2# include "xor_32.h" 6# include "xor_32.h"
3#else 7#else
4# include "xor_64.h" 8# include "xor_64.h"
5#endif 9#endif
10#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4f78bd682125..f3477bb84566 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
75obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
76obj-$(CONFIG_MODULES) += module_$(BITS).o 76obj-$(CONFIG_MODULES) += module.o
77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
79obj-$(CONFIG_KGDB) += kgdb.o 79obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c5115..ca93638ba430 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
104 initial_gs = per_cpu_offset(smp_processor_id()); 104 initial_gs = per_cpu_offset(smp_processor_id());
105#endif 105#endif
106 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
107 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0L;
108#endif /* CONFIG_64BIT */ 108#endif /* CONFIG_64BIT */
109 109
110 return 0; 110 return 0;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 076d3881f3da..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -899,7 +899,7 @@ void clear_local_APIC(void)
899 } 899 }
900 900
901 /* lets not touch this if we didn't frob it */ 901 /* lets not touch this if we didn't frob it */
902#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 902#ifdef CONFIG_X86_THERMAL_VECTOR
903 if (maxlvt >= 5) { 903 if (maxlvt >= 5) {
904 v = apic_read(APIC_LVTTHMR); 904 v = apic_read(APIC_LVTTHMR);
905 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 905 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -2017,7 +2017,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2017 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 2017 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
2018 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 2018 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
2019 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 2019 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
2020#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 2020#ifdef CONFIG_X86_THERMAL_VECTOR
2021 if (maxlvt >= 5) 2021 if (maxlvt >= 5)
2022 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 2022 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
2023#endif 2023#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac42ab3..ef8d9290c7ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
177 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
178 struct irq_desc *desc; 178 struct irq_desc *desc;
179 int count; 179 int count;
180 int node;
180 int i; 181 int i;
181 182
182 cfg = irq_cfgx; 183 cfg = irq_cfgx;
183 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185	node = cpu_to_node(boot_cpu_id);
184 186
185 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
186 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
187 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
188 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
189 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
190 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
191 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
192 } 194 }
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a691302dc3ff..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 69#if defined(CONFIG_X86_NEW_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ef0ae207a7c8..096d19aea2f7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
463 uv_set_scir_bits(bits); 463 uv_set_scir_bits(bits);
464 464
465 /* enable next timer period */ 465 /* enable next timer period */
466 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); 466 mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
467} 467}
468 468
469static void __cpuinit uv_heartbeat_enable(int cpu) 469static void __cpuinit uv_heartbeat_enable(int cpu)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac42..79302e9a33a4 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
1233 int err; 1233 int err;
1234 struct apm_user *as; 1234 struct apm_user *as;
1235 1235
1236 device_suspend(PMSG_SUSPEND); 1236 dpm_suspend_start(PMSG_SUSPEND);
1237 1237
1238 device_power_down(PMSG_SUSPEND); 1238 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1239
1240 local_irq_disable(); 1240 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
1259 sysdev_resume(); 1259 sysdev_resume();
1260 local_irq_enable(); 1260 local_irq_enable();
1261 1261
1262 device_power_up(PMSG_RESUME); 1262 dpm_resume_noirq(PMSG_RESUME);
1263 1263
1264 device_resume(PMSG_RESUME); 1264 dpm_resume_end(PMSG_RESUME);
1265 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1266 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
1267 for (as = user_list; as != NULL; as = as->next) { 1267 for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
1277{ 1277{
1278 int err; 1278 int err;
1279 1279
1280 device_power_down(PMSG_SUSPEND); 1280 dpm_suspend_noirq(PMSG_SUSPEND);
1281 1281
1282 local_irq_disable(); 1282 local_irq_disable();
1283 sysdev_suspend(PMSG_SUSPEND); 1283 sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
1291 sysdev_resume(); 1291 sysdev_resume();
1292 local_irq_enable(); 1292 local_irq_enable();
1293 1293
1294 device_power_up(PMSG_RESUME); 1294 dpm_resume_noirq(PMSG_RESUME);
1295} 1295}
1296 1296
1297static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
1376 ignore_bounce = 1; 1376 ignore_bounce = 1;
1377 if ((event != APM_NORMAL_RESUME) 1377 if ((event != APM_NORMAL_RESUME)
1378 || (ignore_normal_resume == 0)) { 1378 || (ignore_normal_resume == 0)) {
1379 device_resume(PMSG_RESUME); 1379 dpm_resume_end(PMSG_RESUME);
1380 queue_event(event, NULL); 1380 queue_event(event, NULL);
1381 } 1381 }
1382 ignore_normal_resume = 0; 1382 ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd7015..dfdbf6403895 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 127 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
129 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
129 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 130 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
130 131
131 BLANK(); 132 BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3ffdcfa9abdf..9fa33886c0d7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,7 +487,6 @@ out:
487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
488{ 488{
489 char *v = c->x86_vendor_id; 489 char *v = c->x86_vendor_id;
490 static int printed;
491 int i; 490 int i;
492 491
493 for (i = 0; i < X86_VENDOR_NUM; i++) { 492 for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
504 } 503 }
505 } 504 }
506 505
507 if (!printed) { 506 printk_once(KERN_ERR
508 printed++; 507 "CPU: vendor_id '%s' unknown, using generic init.\n" \
509 printk(KERN_ERR 508 "CPU: Your system may be unstable.\n", v);
510 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
511
512 printk(KERN_ERR "CPU: Your system may be unstable.\n");
513 }
514 509
515 c->x86_vendor = X86_VENDOR_UNKNOWN; 510 c->x86_vendor = X86_VENDOR_UNKNOWN;
516 this_cpu = &default_cpu; 511 this_cpu = &default_cpu;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index daed39ba2614..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..45004faf67ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
1obj-y = mce_$(BITS).o therm_throt.o 1obj-y = mce.o therm_throt.o
2 2
3obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 8obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 9obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 10obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
11obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..89e510424152 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
2 * Athlon specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,12 +14,12 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine Check Handler For AMD Athlon/Duron */ 17/* Machine Check Handler For AMD Athlon/Duron: */
19static void k7_machine_check(struct pt_regs *regs, long error_code) 18static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
32 31
33 for (i = 1; i < nr_mce_banks; i++) { 32 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 /* Clear it */ 58
59 /* Clear it: */
55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 60 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
56 /* Serialize */ 61 /* Serialize: */
57 wmb(); 62 wmb();
58 add_taint(TAINT_MACHINE_CHECK); 63 add_taint(TAINT_MACHINE_CHECK);
59 } 64 }
60 } 65 }
61 66
62 if (recover&2) 67 if (recover & 2)
63 panic("CPU context corrupt"); 68 panic("CPU context corrupt");
64 if (recover&1) 69 if (recover & 1)
65 panic("Unable to continue"); 70 panic("Unable to continue");
71
66 printk(KERN_EMERG "Attempting to continue.\n"); 72 printk(KERN_EMERG "Attempting to continue.\n");
73
67 mcgstl &= ~(1<<2); 74 mcgstl &= ~(1<<2);
68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 75 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 76}
70 77
71 78
72/* AMD K7 machine check is Intel like */ 79/* AMD K7 machine check is Intel like: */
73void amd_mcheck_init(struct cpuinfo_x86 *c) 80void amd_mcheck_init(struct cpuinfo_x86 *c)
74{ 81{
75 u32 l, h; 82 u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
79 return; 86 return;
80 87
81 machine_check_vector = k7_machine_check; 88 machine_check_vector = k7_machine_check;
89 /* Make sure the vector pointer is visible before we enable MCEs: */
82 wmb(); 90 wmb();
83 91
84 printk(KERN_INFO "Intel machine check architecture supported.\n"); 92 printk(KERN_INFO "Intel machine check architecture supported.\n");
93
85 rdmsr(MSR_IA32_MCG_CAP, l, h); 94 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 95 if (l & (1<<8)) /* Control register present ? */
87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 96 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 97 nr_mce_banks = l & 0xff;
89 98
90 /* Clear status for MC index 0 separately, we don't touch CTL, 99 /*
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 100 * Clear status for MC index 0 separately, we don't touch CTL,
101	 * as some K7 Athlons cause spurious MCEs when it's enabled:
102 */
92 if (boot_cpu_data.x86 == 6) { 103 if (boot_cpu_data.x86 == 6) {
93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); 104 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 105 i = 1;
95 } else 106 } else
96 i = 0; 107 i = 0;
108
97 for (; i < nr_mce_banks; i++) { 109 for (; i < nr_mce_banks; i++) {
98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 110 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 111 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28	/* Make sure no one reads a partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105	 * so do it a jiffy or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
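
Once this module is loaded it reroutes writes on the MCE character device into the injector, so an error can be staged from user space by writing a struct mce record and letting the timer raise it on the target CPU a couple of jiffies later. A rough user-space sketch, not part of the patch: the "/dev/mcelog" path and the assumption that the installed asm/mce.h matches the running kernel's struct mce layout are mine; CAP_SYS_ADMIN and an online extcpu are required by mce_write() above.

	#include <asm/mce.h>		/* struct mce, MCI_STATUS_* */
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		struct mce m;
		int fd = open("/dev/mcelog", O_WRONLY);

		if (fd < 0)
			return 1;

		memset(&m, 0, sizeof(m));
		m.status = MCI_STATUS_VAL | MCI_STATUS_EN;	/* corrected error */
		m.bank   = 1;
		m.extcpu = 0;		/* CPU that should observe it */

		/* UC is clear, so raise_mce() takes the machine_check_poll() path. */
		write(fd, &m, sizeof(m));
		close(fd);
		return 0;
	}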
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
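
mce-internal.h is the private contract between the core handler and the grading code: the severity enum is ordered so callers can compare numerically, and mce_severity() returns the first matching grade plus a human-readable reason. Roughly how a caller consumes it, inside the exception handler after a bank has been read into a local struct mce m (an illustrative fragment, not the actual call sites in mce.c):

	char *msg = NULL;
	int sev = mce_severity(&m, tolerant, &msg);

	if (sev == MCE_PANIC_SEVERITY)
		mce_panic("Fatal machine check", &m, msg);
	else if (sev >= MCE_UC_SEVERITY)
		mce_log(&m);	/* uncorrected: record it, then decide on recovery */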
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions; the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66	/* Neither return nor error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
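
Each row above is evaluated as (status & mask) == result plus the mcgstatus, SER and context filters, and the first row that matches supplies the grade, so the table order carries the policy. Stripped of those extra filters, the core of the walk behaves like this simplified sketch (illustrative only; it covers just a handful of the rows):

	static const char *toy_grade(u64 status)
	{
		/* Same first-match-wins order as the table above. */
		if (!(status & MCI_STATUS_VAL))
			return "Invalid";
		if (!(status & MCI_STATUS_EN))
			return "Not enabled";
		if (status & MCI_STATUS_PCC)
			return "Processor context corrupt";
		if ((status & (MCI_STATUS_UC | MCI_STATUS_OVER)) ==
			      (MCI_STATUS_UC | MCI_STATUS_OVER))
			return "Overflowed uncorrected";
		if (status & MCI_STATUS_UC)
			return "Uncorrected";
		return "No match";
	}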
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..fabba15e4558
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47#include "mce.h"
48
49/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code)
51{
52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
53 smp_processor_id());
54}
55
56/* Call the installed machine check handler for this CPU setup. */
57void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check;
59
60int mce_disabled;
61
62#ifdef CONFIG_X86_NEW_MCE
63
64#define MISC_MCELOG_MINOR 227
65
66#define SPINUNIT 100 /* 100ns */
67
68atomic_t mce_entry;
69
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72/*
73 * Tolerant levels:
74 * 0: always panic on uncorrected errors, log corrected errors
75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */
79static int tolerant = 1;
80static int banks;
81static u64 *bank;
82static unsigned long notify_user;
83static int rip_msr;
84static int mce_bootlog = -1;
85static int monarch_timeout = -1;
86static int mce_panic_timeout;
87static int mce_dont_log_ce;
88int mce_cmci_disabled;
89int mce_ignore_ce;
90int mce_ser;
91
92static char trigger[128];
93static char *trigger_argv[2] = { trigger, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &notify_user);
184}
185
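/*
 * Illustrative consumer sketch (not the kernel's actual /dev reader): the
 * finished flag and the wmb()s in mce_log() above are what let a reader copy
 * completed records without sharing a lock with the NMI-context producer.
 */
static unsigned copy_finished_records(struct mce *dst, unsigned max)
{
	unsigned i, n = 0;
	unsigned next = rcu_dereference(mcelog.next);

	for (i = 0; i < next && n < max; i++) {
		if (!mcelog.entry[i].finished)
			continue;		/* writer still in flight */
		smp_rmb();			/* pairs with wmb() in mce_log() */
		dst[n++] = mcelog.entry[i];
	}
	return n;
}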
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler to the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
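/*
 * Keeping one slot unused lets "start == end" mean empty and
 * "(end + 1) % MCE_RING_SIZE == start" mean full, so the single
 * reader and single writer never need to share a count.
 */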
331
332struct mce_ring {
333 unsigned short start;
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
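	/* Make the stored pfn visible before the reader can observe the new end index. */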
375 wmb();
376 r->end = next;
377 return 0;
378}
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
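	/* If the CPU reports the exact IP in a separate MSR, prefer that. */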
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when an MCE happened during an interrupts-off region
418 * in the kernel.
419 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger a self interrupt
456 * through the APIC instead, so that the notification happens
457 * after interrupts have been reenabled.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non-idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: the spec recommends panicking for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between the exception handler
482 * and the poll handler -- so we skip this for now.
483 * These cases should not happen anyway, or only when the CPU
484 * is already totally confused. In that case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
570 * Each CPU initially spins until mce_executing equals its number.
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() to tell the compiler that system_state
588 * might have been modified by someone else.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU that entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. If any
613 * error is fatal, it panics. Only then does it let the others continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in an unrecoverable
619 * case and also make sure that all CPUs' errors are examined.
620 *
621 * This also detects the case of a machine check event coming from outer
622 * space (not detected by any CPU). In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in an unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's ok to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For a UC error somewhere we let the CPU that detects it handle it.
666 * We also must let the others continue, otherwise the handling
667 * CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires a panic. Then it lets the CPUs
691 * run their scanning loops in the entry order.
692 * TBD double check parallel CPU hotunplug
693 */
694static int mce_start(int no_way_out, int *order)
695{
696 int nwo;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
700 if (!timeout) {
701 *order = -1;
702 return no_way_out;
703 }
704
705 atomic_add(no_way_out, &global_nwo);
706
707 /*
708 * Wait for everyone.
709 */
710 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0);
713 *order = -1;
714 return no_way_out;
715 }
716 ndelay(SPINUNIT);
717 }
718
719 /*
720 * Cache the global no_way_out state.
721 */
722 nwo = atomic_read(&global_nwo);
723
724 /*
725 * Monarch starts executing now, the others wait.
726 */
727 if (*order == 1) {
728 atomic_set(&mce_executing, 1);
729 return nwo;
730 }
731
732 /*
733 * Now start the scanning loop one by one
734 * in the original callin order.
735 * This way any shared bank is seen by only one CPU before
736 * it is cleared, avoiding duplicate reports.
737 */
738 while (atomic_read(&mce_executing) < *order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747}
748
749/*
750 * Synchronize between CPUs after main scanning loop.
751 * This invokes the bulk of the Monarch processing.
752 */
753static int mce_end(int order)
754{
755 int ret = -1;
756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758 if (!timeout)
759 goto reset;
760 if (order < 0)
761 goto reset;
762
763 /*
764 * Allow others to run.
765 */
766 atomic_inc(&mce_executing);
767
768 if (order == 1) {
769 /* CHECKME: Can this race with a parallel hotplug? */
770 int cpus = num_online_cpus();
771
772 /*
773 * Monarch: Wait for everyone to go through their scanning
774 * loops.
775 */
776 while (atomic_read(&mce_executing) <= cpus) {
777 if (mce_timed_out(&timeout))
778 goto reset;
779 ndelay(SPINUNIT);
780 }
781
782 mce_reign();
783 barrier();
784 ret = 0;
785 } else {
786 /*
787 * Subject: Wait for Monarch to finish.
788 */
789 while (atomic_read(&mce_executing) != 0) {
790 if (mce_timed_out(&timeout))
791 goto reset;
792 ndelay(SPINUNIT);
793 }
794
795 /*
796 * Don't reset anything. That's done by the Monarch.
797 */
798 return 0;
799 }
800
801 /*
802 * Reset all global state.
803 */
804reset:
805 atomic_set(&global_nwo, 0);
806 atomic_set(&mce_callin, 0);
807 barrier();
808
809 /*
810 * Let others run again.
811 */
812 atomic_set(&mce_executing, 0);
813 return ret;
814}
815
816/*
817 * Check if the address reported by the CPU is in a format we can parse.
818 * It would be possible to add code for most other cases, but all would
819 * be somewhat complicated (e.g. segment offset would require an instruction
820 * parser). So only support physical addresses up to page granularity for now.
821 */
822static int mce_usable_address(struct mce *m)
823{
824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
825 return 0;
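	/* MCi_MISC bits 5:0 give the least significant valid bit of the address; reject anything coarser than a page. */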
826 if ((m->misc & 0x3f) > PAGE_SHIFT)
827 return 0;
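	/* MCi_MISC bits 8:6 encode the address mode; only plain physical addresses are handled here. */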
828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
829 return 0;
830 return 1;
831}
832
833static void mce_clear_state(unsigned long *toclear)
834{
835 int i;
836
837 for (i = 0; i < banks; i++) {
838 if (test_bit(i, toclear))
839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
840 }
841}
842
843/*
844 * The actual machine check handler. This only handles real
845 * exceptions when something got corrupted coming in through int 18.
846 *
847 * This is executed in NMI context not subject to normal locking rules. This
848 * implies that most kernel services cannot be safely used. Don't even
849 * think about putting a printk in there!
850 *
851 * On Intel systems this is entered on all CPUs in parallel through
852 * MCE broadcast. However some CPUs might be broken beyond repair,
853 * so be always careful when synchronizing with others.
854 */
855void do_machine_check(struct pt_regs *regs, long error_code)
856{
857 struct mce m, *final;
858 int i;
859 int worst = 0;
860 int severity;
861 /*
862 * Establish sequential order between the CPUs entering the machine
863 * check handler.
864 */
865 int order;
866
867 /*
868 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway.
870 */
871 int no_way_out = 0;
872 /*
873 * If kill_it gets set, there might be a way to recover from this
874 * error.
875 */
876 int kill_it = 0;
877 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
878 char *msg = "Unknown";
879
880 atomic_inc(&mce_entry);
881
882 __get_cpu_var(mce_exception_count)++;
883
884 if (notify_die(DIE_NMI, "machine check", regs, error_code,
885 18, SIGKILL) == NOTIFY_STOP)
886 goto out;
887 if (!banks)
888 goto out;
889
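	/* The callin position decides the Monarch: the first CPU to arrive gets order == 1. */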
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m);
892
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
894 no_way_out = mce_no_way_out(&m, &msg);
895
896 final = &__get_cpu_var(mces_seen);
897 *final = m;
898
899 barrier();
900
901 /*
902 * When there is no valid restart IP, we must always kill or panic.
903 */
904 if (!(m.mcgstatus & MCG_STATUS_RIPV))
905 kill_it = 1;
906
907 /*
908 * Go through all the banks in mutual exclusion with the other CPUs.
909 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it.
911 */
912 no_way_out = mce_start(no_way_out, &order);
913 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear);
915 if (!bank[i])
916 continue;
917
918 m.misc = 0;
919 m.addr = 0;
920 m.bank = i;
921
922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
923 if ((m.status & MCI_STATUS_VAL) == 0)
924 continue;
925
926 /*
927 * Errors that are not uncorrected (with SER: not signaled) are handled
928 * by machine_check_poll(). Leave them alone, unless we are panicking.
929 */
930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
931 !no_way_out)
932 continue;
933
934 /*
935 * Set taint even when machine check was not enabled.
936 */
937 add_taint(TAINT_MACHINE_CHECK);
938
939 severity = mce_severity(&m, tolerant, NULL);
940
941 /*
942 * When the machine check was for a corrected error, don't touch it
943 * here unless we're panicking.
944 */
945 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
946 continue;
947 __set_bit(i, toclear);
948 if (severity == MCE_NO_SEVERITY) {
949 /*
950 * Machine check event was not enabled. Clear, but
951 * ignore.
952 */
953 continue;
954 }
955
956 /*
957 * Kill on action required.
958 */
959 if (severity == MCE_AR_SEVERITY)
960 kill_it = 1;
961
962 if (m.status & MCI_STATUS_MISCV)
963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
964 if (m.status & MCI_STATUS_ADDRV)
965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
966
967 /*
968 * Action optional error. Queue address for later processing.
969 * When the ring overflows we just ignore the AO error.
970 * RED-PEN add some logging mechanism when
971 * mce_usable_address() or mce_ring_add() fails.
972 * RED-PEN don't ignore overflow for tolerant == 0
973 */
974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
975 mce_ring_add(m.addr >> PAGE_SHIFT);
976
977 mce_get_rip(&m, regs);
978 mce_log(&m);
979
980 if (severity > worst) {
981 *final = m;
982 worst = severity;
983 }
984 }
985
986 if (!no_way_out)
987 mce_clear_state(toclear);
988
989 /*
990 * Do most of the synchronization with other CPUs.
991 * When there's any problem use only local no_way_out state.
992 */
993 if (mce_end(order) < 0)
994 no_way_out = worst >= MCE_PANIC_SEVERITY;
995
996 /*
997 * If we have decided that we just CAN'T continue, and the user
998 * has not set tolerant to an insane level, give up and die.
999 *
1000 * This is mainly used in the case when the system doesn't
1001 * support MCE broadcasting or it has been disabled.
1002 */
1003 if (no_way_out && tolerant < 3)
1004 mce_panic("Fatal machine check on current CPU", final, msg);
1005
1006 /*
1007 * If the error seems to be unrecoverable, something should be
1008 * done. Try to kill as little as possible. If we can kill just
1009 * one task, do that. If the user has set the tolerance very
1010 * high, don't try to do anything at all.
1011 */
1012
1013 if (kill_it && tolerant < 3)
1014 force_sig(SIGBUS, current);
1015
1016 /* notify userspace ASAP */
1017 set_thread_flag(TIF_MCE_NOTIFY);
1018
1019 if (worst > 0)
1020 mce_report_event(regs);
1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1022out:
1023 atomic_dec(&mce_entry);
1024 sync_core();
1025}
1026EXPORT_SYMBOL_GPL(do_machine_check);
1027
1028/* Dummy to break the dependency; the actual code is in mm/memory-failure.c. */
1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1030{
1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1032}
1033
1034/*
1035 * Called after mce notification in process context. This code
1036 * is allowed to sleep. Call the high level VM handler to process
1037 * any corrupted pages.
1038 * Assume that the work queue code only calls this one at a time
1039 * per CPU.
1040 * Note we don't disable preemption, so this code might run on the wrong
1041 * CPU. In this case the event is picked up by the scheduled work queue.
1042 * This is merely a fast path to expedite processing in some common
1043 * cases.
1044 */
1045void mce_notify_process(void)
1046{
1047 unsigned long pfn;
1048 mce_notify_irq();
1049 while (mce_ring_get(&pfn))
1050 memory_failure(pfn, MCE_VECTOR);
1051}
1052
1053static void mce_process_work(struct work_struct *dummy)
1054{
1055 mce_notify_process();
1056}
1057
1058#ifdef CONFIG_X86_MCE_INTEL
1059/***
1060 * mce_log_therm_throt_event - Log a thermal throttling event to mcelog
1061 * @status: Event status information
1063 *
1064 * This function should be called by the thermal interrupt after the
1065 * event has been processed and the decision was made to log the event
1066 * further.
1067 *
1068 * The status parameter will be saved to the 'status' field of 'struct mce'
1069 * and historically has been the register value of the
1070 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1071 */
1072void mce_log_therm_throt_event(__u64 status)
1073{
1074 struct mce m;
1075
1076 mce_setup(&m);
1077 m.bank = MCE_THERMAL_BANK;
1078 m.status = status;
1079 mce_log(&m);
1080}
1081#endif /* CONFIG_X86_MCE_INTEL */
1082
1083/*
1084 * Periodic polling timer for "silent" machine check errors. If the
1085 * poller finds an MCE, poll 2x faster. When the poller finds no more
1086 * errors, poll 2x slower (up to check_interval seconds).
1087 */
1088static int check_interval = 5 * 60; /* 5 minutes */
1089
1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1091static DEFINE_PER_CPU(struct timer_list, mce_timer);
1092
1093static void mcheck_timer(unsigned long data)
1094{
1095 struct timer_list *t = &per_cpu(mce_timer, data);
1096 int *n;
1097
1098 WARN_ON(smp_processor_id() != data);
1099
1100 if (mce_available(&current_cpu_data)) {
1101 machine_check_poll(MCP_TIMESTAMP,
1102 &__get_cpu_var(mce_poll_banks));
1103 }
1104
1105 /*
1106 * Alert userspace if needed. If we logged an MCE, reduce the
1107 * polling interval, otherwise increase the polling interval.
1108 */
1109 n = &__get_cpu_var(next_interval);
1110 if (mce_notify_irq())
1111 *n = max(*n/2, HZ/100);
1112 else
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114
1115 t->expires = jiffies + *n;
1116 add_timer(t);
1117}
1118
1119static void mce_do_trigger(struct work_struct *work)
1120{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
1122}
1123
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1125
1126/*
1127 * Notify the user(s) about new machine check events.
1128 * Can be called from interrupt context, but not from machine check/NMI
1129 * context.
1130 */
1131int mce_notify_irq(void)
1132{
1133 /* Not more than two messages every minute */
1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1135
1136 clear_thread_flag(TIF_MCE_NOTIFY);
1137
1138 if (test_and_clear_bit(0, &notify_user)) {
1139 wake_up_interruptible(&mce_wait);
1140
1141 /*
1142 * There is no risk of missing notifications because
1143 * work_pending is always cleared before the function is
1144 * executed.
1145 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work);
1148
1149 if (__ratelimit(&ratelimit))
1150 printk(KERN_INFO "Machine check events logged\n");
1151
1152 return 1;
1153 }
1154 return 0;
1155}
1156EXPORT_SYMBOL_GPL(mce_notify_irq);
1157
1158/*
1159 * Initialize Machine Checks for a CPU.
1160 */
1161static int mce_cap_init(void)
1162{
1163 unsigned b;
1164 u64 cap;
1165
1166 rdmsrl(MSR_IA32_MCG_CAP, cap);
1167
1168 b = cap & MCG_BANKCNT_MASK;
1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1170
1171 if (b > MAX_NR_BANKS) {
1172 printk(KERN_WARNING
1173 "MCE: Using only %u machine check banks out of %u\n",
1174 MAX_NR_BANKS, b);
1175 b = MAX_NR_BANKS;
1176 }
1177
1178 /* Don't support asymmetric configurations today */
1179 WARN_ON(banks != 0 && b != banks);
1180 banks = b;
1181 if (!bank) {
1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1183 if (!bank)
1184 return -ENOMEM;
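		/* Enable all error-reporting bits in every bank by default; quirks may clear some later. */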
1185 memset(bank, 0xff, banks * sizeof(u64));
1186 }
1187
1188 /* Use accurate RIP reporting if available. */
1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1190 rip_msr = MSR_IA32_MCG_EIP;
1191
1192 if (cap & MCG_SER_P)
1193 mce_ser = 1;
1194
1195 return 0;
1196}
1197
1198static void mce_init(void)
1199{
1200 mce_banks_t all_banks;
1201 u64 cap;
1202 int i;
1203
1204 /*
1205 * Log the machine checks left over from the previous reset.
1206 */
1207 bitmap_fill(all_banks, MAX_NR_BANKS);
1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1209
1210 set_in_cr4(X86_CR4_MCE);
1211
1212 rdmsrl(MSR_IA32_MCG_CAP, cap);
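	/* MCG_CTL, when present, globally enables the machine check features; turn them all on. */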
1213 if (cap & MCG_CTL_P)
1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1215
1216 for (i = 0; i < banks; i++) {
1217 if (skip_bank_init(i))
1218 continue;
1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1221 }
1222}
1223
1224/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{
1227 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) {
1230 /*
1231 * disable GART TBL walk error reporting, which
1232 * trips off incorrectly with the IOMMU & 3ware
1233 * & Cerberus:
1234 */
1235 clear_bit(10, (unsigned long *)&bank[4]);
1236 }
1237 if (c->x86 <= 17 && mce_bootlog < 0) {
1238 /*
1239 * Lots of broken BIOS around that don't clear them
1240 * by default and leave crap in there. Don't log:
1241 */
1242 mce_bootlog = 0;
1243 }
1244 /*
1245 * Various K7s with broken bank 0 around. Always disable
1246 * by default.
1247 */
1248 if (c->x86 == 6)
1249 bank[0] = 0;
1250 }
1251
1252 if (c->x86_vendor == X86_VENDOR_INTEL) {
1253 /*
1254 * SDM documents that on family 6 bank 0 should not be written
1255 * because it aliases to another special BIOS controlled
1256 * register.
1257 * But it's not aliased anymore on model 0x1a+.
1258 * Don't ignore bank 0 completely because there could be a
1259 * valid event later, merely don't write CTL0.
1260 */
1261
1262 if (c->x86 == 6 && c->x86_model < 0x1A)
1263 __set_bit(0, &dont_init_banks);
1264
1265 /*
1266 * All newer Intel systems support MCE broadcasting. Enable
1267 * synchronization with a one second timeout.
1268 */
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC;
1272 }
1273 if (monarch_timeout < 0)
1274 monarch_timeout = 0;
1275 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30;
1277}
1278
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1280{
1281 if (c->x86 != 5)
1282 return;
1283 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled())
1286 intel_p5_mcheck_init(c);
1287 break;
1288 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c);
1290 break;
1291 }
1292}
1293
1294static void mce_cpu_features(struct cpuinfo_x86 *c)
1295{
1296 switch (c->x86_vendor) {
1297 case X86_VENDOR_INTEL:
1298 mce_intel_feature_init(c);
1299 break;
1300 case X86_VENDOR_AMD:
1301 mce_amd_feature_init(c);
1302 break;
1303 default:
1304 break;
1305 }
1306}
1307
1308static void mce_init_timer(void)
1309{
1310 struct timer_list *t = &__get_cpu_var(mce_timer);
1311 int *n = &__get_cpu_var(next_interval);
1312
1313 if (mce_ignore_ce)
1314 return;
1315
1316 *n = check_interval * HZ;
1317 if (!*n)
1318 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t);
1322}
1323
1324/*
1325 * Called for each booted CPU to set up machine checks.
1326 * Must be called with preempt off:
1327 */
1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1329{
1330 if (mce_disabled)
1331 return;
1332
1333 mce_ancient_init(c);
1334
1335 if (!mce_available(c))
1336 return;
1337
1338 if (mce_cap_init() < 0) {
1339 mce_disabled = 1;
1340 return;
1341 }
1342 mce_cpu_quirks(c);
1343
1344 machine_check_vector = do_machine_check;
1345
1346 mce_init();
1347 mce_cpu_features(c);
1348 mce_init_timer();
1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1350}
1351
1352/*
1353 * Character device to read and clear the MCE log.
1354 */
1355
1356static DEFINE_SPINLOCK(mce_state_lock);
1357static int open_count; /* #times opened */
1358static int open_exclu; /* already open exclusive? */
1359
1360static int mce_open(struct inode *inode, struct file *file)
1361{
1362 spin_lock(&mce_state_lock);
1363
1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1365 spin_unlock(&mce_state_lock);
1366
1367 return -EBUSY;
1368 }
1369
1370 if (file->f_flags & O_EXCL)
1371 open_exclu = 1;
1372 open_count++;
1373
1374 spin_unlock(&mce_state_lock);
1375
1376 return nonseekable_open(inode, file);
1377}
1378
1379static int mce_release(struct inode *inode, struct file *file)
1380{
1381 spin_lock(&mce_state_lock);
1382
1383 open_count--;
1384 open_exclu = 0;
1385
1386 spin_unlock(&mce_state_lock);
1387
1388 return 0;
1389}
1390
1391static void collect_tscs(void *data)
1392{
1393 unsigned long *cpu_tsc = (unsigned long *)data;
1394
1395 rdtscll(cpu_tsc[smp_processor_id()]);
1396}
1397
1398static DEFINE_MUTEX(mce_read_mutex);
1399
1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1401 loff_t *off)
1402{
1403 char __user *buf = ubuf;
1404 unsigned long *cpu_tsc;
1405 unsigned prev, next;
1406 int i, err;
1407
1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1409 if (!cpu_tsc)
1410 return -ENOMEM;
1411
1412 mutex_lock(&mce_read_mutex);
1413 next = rcu_dereference(mcelog.next);
1414
1415 /* Only supports full reads right now */
1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1417 mutex_unlock(&mce_read_mutex);
1418 kfree(cpu_tsc);
1419
1420 return -EINVAL;
1421 }
1422
1423 err = 0;
1424 prev = 0;
1425 do {
1426 for (i = prev; i < next; i++) {
1427 unsigned long start = jiffies;
1428
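			/*
			 * A writer may still be filling in this entry; wait
			 * briefly for ->finished, then give up and discard it.
			 */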
1429 while (!mcelog.entry[i].finished) {
1430 if (time_after_eq(jiffies, start + 2)) {
1431 memset(mcelog.entry + i, 0,
1432 sizeof(struct mce));
1433 goto timeout;
1434 }
1435 cpu_relax();
1436 }
1437 smp_rmb();
1438 err |= copy_to_user(buf, mcelog.entry + i,
1439 sizeof(struct mce));
1440 buf += sizeof(struct mce);
1441timeout:
1442 ;
1443 }
1444
1445 memset(mcelog.entry + prev, 0,
1446 (next - prev) * sizeof(struct mce));
1447 prev = next;
1448 next = cmpxchg(&mcelog.next, prev, 0);
1449 } while (next != prev);
1450
1451 synchronize_sched();
1452
1453 /*
1454 * Collect entries that were still getting written before the
1455 * synchronize.
1456 */
1457 on_each_cpu(collect_tscs, cpu_tsc, 1);
1458
1459 for (i = next; i < MCE_LOG_LEN; i++) {
1460 if (mcelog.entry[i].finished &&
1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1462 err |= copy_to_user(buf, mcelog.entry+i,
1463 sizeof(struct mce));
1464 smp_rmb();
1465 buf += sizeof(struct mce);
1466 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1467 }
1468 }
1469 mutex_unlock(&mce_read_mutex);
1470 kfree(cpu_tsc);
1471
1472 return err ? -EFAULT : buf - ubuf;
1473}
1474
1475static unsigned int mce_poll(struct file *file, poll_table *wait)
1476{
1477 poll_wait(file, &mce_wait, wait);
1478 if (rcu_dereference(mcelog.next))
1479 return POLLIN | POLLRDNORM;
1480 return 0;
1481}
1482
1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1484{
1485 int __user *p = (int __user *)arg;
1486
1487 if (!capable(CAP_SYS_ADMIN))
1488 return -EPERM;
1489
1490 switch (cmd) {
1491 case MCE_GET_RECORD_LEN:
1492 return put_user(sizeof(struct mce), p);
1493 case MCE_GET_LOG_LEN:
1494 return put_user(MCE_LOG_LEN, p);
1495 case MCE_GETCLEAR_FLAGS: {
1496 unsigned flags;
1497
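		/* Atomically snapshot and clear the accumulated overflow flags. */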
1498 do {
1499 flags = mcelog.flags;
1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1501
1502 return put_user(flags, p);
1503 }
1504 default:
1505 return -ENOTTY;
1506 }
1507}
1508
1509/* Modified in mce-inject.c, so not static or const */
1510struct file_operations mce_chrdev_ops = {
1511 .open = mce_open,
1512 .release = mce_release,
1513 .read = mce_read,
1514 .poll = mce_poll,
1515 .unlocked_ioctl = mce_ioctl,
1516};
1517EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1518
1519static struct miscdevice mce_log_device = {
1520 MISC_MCELOG_MINOR,
1521 "mcelog",
1522 &mce_chrdev_ops,
1523};
1524
1525/*
1526 * mce=off Disables machine check
1527 * mce=no_cmci Disables CMCI
1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1531 * monarchtimeout is how long to wait for other CPUs on machine
1532 * check, or 0 to not wait
1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1534 * mce=nobootlog Don't log MCEs from before booting.
1535 */
1536static int __init mcheck_enable(char *str)
1537{
1538 if (*str == 0)
1539 enable_p5_mce();
1540 if (*str == '=')
1541 str++;
1542 if (!strcmp(str, "off"))
1543 mce_disabled = 1;
1544 else if (!strcmp(str, "no_cmci"))
1545 mce_cmci_disabled = 1;
1546 else if (!strcmp(str, "dont_log_ce"))
1547 mce_dont_log_ce = 1;
1548 else if (!strcmp(str, "ignore_ce"))
1549 mce_ignore_ce = 1;
1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1551 mce_bootlog = (str[0] == 'b');
1552 else if (isdigit(str[0])) {
1553 get_option(&str, &tolerant);
1554 if (*str == ',') {
1555 ++str;
1556 get_option(&str, &monarch_timeout);
1557 }
1558 } else {
1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1560 str);
1561 return 0;
1562 }
1563 return 1;
1564}
1565__setup("mce", mcheck_enable);
1566
1567/*
1568 * Sysfs support
1569 */
1570
1571/*
1572 * Disable machine checks on suspend and shutdown. We can't really handle
1573 * them later.
1574 */
1575static int mce_disable(void)
1576{
1577 int i;
1578
1579 for (i = 0; i < banks; i++) {
1580 if (!skip_bank_init(i))
1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1582 }
1583 return 0;
1584}
1585
1586static int mce_suspend(struct sys_device *dev, pm_message_t state)
1587{
1588 return mce_disable();
1589}
1590
1591static int mce_shutdown(struct sys_device *dev)
1592{
1593 return mce_disable();
1594}
1595
1596/*
1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1598 * Only one CPU is active at this time, the others get re-added later using
1599 * CPU hotplug:
1600 */
1601static int mce_resume(struct sys_device *dev)
1602{
1603 mce_init();
1604 mce_cpu_features(&current_cpu_data);
1605
1606 return 0;
1607}
1608
1609static void mce_cpu_restart(void *data)
1610{
1611 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data))
1613 mce_init();
1614 mce_init_timer();
1615}
1616
1617/* Reinit MCEs after user configuration changes */
1618static void mce_restart(void)
1619{
1620 on_each_cpu(mce_cpu_restart, NULL, 1);
1621}
1622
1623static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown,
1626 .resume = mce_resume,
1627 .name = "machinecheck",
1628};
1629
1630DEFINE_PER_CPU(struct sys_device, mce_dev);
1631
1632__cpuinitdata
1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1634
1635static struct sysdev_attribute *bank_attrs;
1636
1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1638 char *buf)
1639{
1640 u64 b = bank[attr - bank_attrs];
1641
1642 return sprintf(buf, "%llx\n", b);
1643}
1644
1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1646 const char *buf, size_t size)
1647{
1648 u64 new;
1649
1650 if (strict_strtoull(buf, 0, &new) < 0)
1651 return -EINVAL;
1652
1653 bank[attr - bank_attrs] = new;
1654 mce_restart();
1655
1656 return size;
1657}
1658
1659static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{
1662 strcpy(buf, trigger);
1663 strcat(buf, "\n");
1664 return strlen(trigger) + 1;
1665}
1666
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz)
1669{
1670 char *p;
1671 int len;
1672
1673 strncpy(trigger, buf, sizeof(trigger));
1674 trigger[sizeof(trigger)-1] = 0;
1675 len = strlen(trigger);
1676 p = strchr(trigger, '\n');
1677
1678 if (*p)
1679 *p = 0;
1680
1681 return len;
1682}
1683
1684static ssize_t store_int_with_restart(struct sys_device *s,
1685 struct sysdev_attribute *attr,
1686 const char *buf, size_t size)
1687{
1688 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1689 mce_restart();
1690 return ret;
1691}
1692
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1696
1697static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1699 store_int_with_restart),
1700 &check_interval
1701};
1702
1703static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1705 &attr_monarch_timeout.attr,
1706 NULL
1707};
1708
1709static cpumask_var_t mce_dev_initialized;
1710
1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1712static __cpuinit int mce_create_device(unsigned int cpu)
1713{
1714 int err;
1715 int i;
1716
1717 if (!mce_available(&boot_cpu_data))
1718 return -EIO;
1719
1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1721 per_cpu(mce_dev, cpu).id = cpu;
1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1723
1724 err = sysdev_register(&per_cpu(mce_dev, cpu));
1725 if (err)
1726 return err;
1727
1728 for (i = 0; mce_attrs[i]; i++) {
1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1730 if (err)
1731 goto error;
1732 }
1733 for (i = 0; i < banks; i++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]);
1736 if (err)
1737 goto error2;
1738 }
1739 cpumask_set_cpu(cpu, mce_dev_initialized);
1740
1741 return 0;
1742error2:
1743 while (--i >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1745error:
1746 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1748
1749 sysdev_unregister(&per_cpu(mce_dev, cpu));
1750
1751 return err;
1752}
1753
1754static __cpuinit void mce_remove_device(unsigned int cpu)
1755{
1756 int i;
1757
1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1759 return;
1760
1761 for (i = 0; mce_attrs[i]; i++)
1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1763
1764 for (i = 0; i < banks; i++)
1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1766
1767 sysdev_unregister(&per_cpu(mce_dev, cpu));
1768 cpumask_clear_cpu(cpu, mce_dev_initialized);
1769}
1770
1771/* Make sure there are no machine checks on offlined CPUs. */
1772static void mce_disable_cpu(void *h)
1773{
1774 unsigned long action = *(unsigned long *)h;
1775 int i;
1776
1777 if (!mce_available(&current_cpu_data))
1778 return;
1779 if (!(action & CPU_TASKS_FROZEN))
1780 cmci_clear();
1781 for (i = 0; i < banks; i++) {
1782 if (!skip_bank_init(i))
1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1784 }
1785}
1786
1787static void mce_reenable_cpu(void *h)
1788{
1789 unsigned long action = *(unsigned long *)h;
1790 int i;
1791
1792 if (!mce_available(&current_cpu_data))
1793 return;
1794
1795 if (!(action & CPU_TASKS_FROZEN))
1796 cmci_reenable();
1797 for (i = 0; i < banks; i++) {
1798 if (!skip_bank_init(i))
1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1800 }
1801}
1802
1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1804static int __cpuinit
1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1806{
1807 unsigned int cpu = (unsigned long)hcpu;
1808 struct timer_list *t = &per_cpu(mce_timer, cpu);
1809
1810 switch (action) {
1811 case CPU_ONLINE:
1812 case CPU_ONLINE_FROZEN:
1813 mce_create_device(cpu);
1814 if (threshold_cpu_callback)
1815 threshold_cpu_callback(action, cpu);
1816 break;
1817 case CPU_DEAD:
1818 case CPU_DEAD_FROZEN:
1819 if (threshold_cpu_callback)
1820 threshold_cpu_callback(action, cpu);
1821 mce_remove_device(cpu);
1822 break;
1823 case CPU_DOWN_PREPARE:
1824 case CPU_DOWN_PREPARE_FROZEN:
1825 del_timer_sync(t);
1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1827 break;
1828 case CPU_DOWN_FAILED:
1829 case CPU_DOWN_FAILED_FROZEN:
1830 t->expires = round_jiffies(jiffies +
1831 __get_cpu_var(next_interval));
1832 add_timer_on(t, cpu);
1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1834 break;
1835 case CPU_POST_DEAD:
1836 /* intentionally ignoring frozen here */
1837 cmci_rediscover(cpu);
1838 break;
1839 }
1840 return NOTIFY_OK;
1841}
1842
1843static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1844 .notifier_call = mce_cpu_callback,
1845};
1846
1847static __init int mce_init_banks(void)
1848{
1849 int i;
1850
1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1852 GFP_KERNEL);
1853 if (!bank_attrs)
1854 return -ENOMEM;
1855
1856 for (i = 0; i < banks; i++) {
1857 struct sysdev_attribute *a = &bank_attrs[i];
1858
1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1860 if (!a->attr.name)
1861 goto nomem;
1862
1863 a->attr.mode = 0644;
1864 a->show = show_bank;
1865 a->store = set_bank;
1866 }
1867 return 0;
1868
1869nomem:
1870 while (--i >= 0)
1871 kfree(bank_attrs[i].attr.name);
1872 kfree(bank_attrs);
1873 bank_attrs = NULL;
1874
1875 return -ENOMEM;
1876}
1877
1878static __init int mce_init_device(void)
1879{
1880 int err;
1881 int i = 0;
1882
1883 if (!mce_available(&boot_cpu_data))
1884 return -EIO;
1885
1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887
1888 err = mce_init_banks();
1889 if (err)
1890 return err;
1891
1892 err = sysdev_class_register(&mce_sysclass);
1893 if (err)
1894 return err;
1895
1896 for_each_online_cpu(i) {
1897 err = mce_create_device(i);
1898 if (err)
1899 return err;
1900 }
1901
1902 register_hotcpu_notifier(&mce_cpu_notifier);
1903 misc_register(&mce_log_device);
1904
1905 return err;
1906}
1907
1908device_initcall(mce_init_device);
1909
1910#else /* CONFIG_X86_OLD_MCE: */
1911
1912int nr_mce_banks;
1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1914
1915/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c)
1917{
1918 if (mce_disabled == 1)
1919 return;
1920
1921 switch (c->x86_vendor) {
1922 case X86_VENDOR_AMD:
1923 amd_mcheck_init(c);
1924 break;
1925
1926 case X86_VENDOR_INTEL:
1927 if (c->x86 == 5)
1928 intel_p5_mcheck_init(c);
1929 if (c->x86 == 6)
1930 intel_p6_mcheck_init(c);
1931 if (c->x86 == 15)
1932 intel_p4_mcheck_init(c);
1933 break;
1934
1935 case X86_VENDOR_CENTAUR:
1936 if (c->x86 == 5)
1937 winchip_mcheck_init(c);
1938 break;
1939
1940 default:
1941 break;
1942 }
1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1944}
1945
1946static int __init mcheck_enable(char *str)
1947{
1948 mce_disabled = -1;
1949 return 1;
1950}
1951
1952__setup("mce", mcheck_enable);
1953
1954#endif /* CONFIG_X86_OLD_MCE */
1955
1956/*
1957 * Old style boot options parsing. Only for compatibility.
1958 */
1959static int __init mcheck_disable(char *str)
1960{
1961 mce_disabled = 1;
1962 return 1;
1963}
1964__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f1..84a552b458c8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
+#ifdef CONFIG_X86_OLD_MCE
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
+extern int mce_p5_enable;
+static inline int mce_p5_enabled(void) { return mce_p5_enable; }
+static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mce_p5_enabled(void) { return 0; }
+static inline void enable_p5_mce(void) { }
+#endif
 
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
+#ifdef CONFIG_X86_OLD_MCE
+
 extern int nr_mce_banks;
 
+void intel_set_thermal_handler(void);
+
+#else
+
+static inline void intel_set_thermal_handler(void) { }
+
+#endif
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 09dd1d414fc3..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
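/*
 * Summary of how 'tolerant' is applied above (derived from the code, for
 * illustration): tolerant < 1 treats any enabled uncorrected error as
 * unrecoverable, tolerant < 2 additionally panics when the fault cannot be
 * pinned to user space (with panic_on_oops this also happens at level 2),
 * tolerant < 3 still panics whenever no_way_out is set, and tolerant >= 3
 * never panics here and at most signals the affected task.
 */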
423
424#ifdef CONFIG_X86_MCE_INTEL
425/**
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @status: Event status information
429 *
430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event
432 * further.
433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the raw value of the Intel
436 * MSR_IA32_THERM_STATUS MSR.
437 */
438void mce_log_therm_throt_event(__u64 status)
439{
440 struct mce m;
441
442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK;
444 m.status = status;
445 mce_log(&m);
446}
447#endif /* CONFIG_X86_MCE_INTEL */
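/*
 * Illustrative sketch only (not part of this file): a typical caller of the
 * helper above, mirroring the Intel thermal interrupt path elsewhere in this
 * series; it assumes rdmsrl()/therm_throt_process() and the
 * THERM_STATUS_PROCHOT bit are available in this configuration.
 */
#if 0
static void example_thermal_interrupt(void)
{
	__u64 msr_val;

	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
	/* Only log when the throttling code says the event is interesting: */
	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
		mce_log_therm_throt_event(msr_val);
}
#endif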
448
449/*
450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds).
453 */
454
455static int check_interval = 5 * 60; /* 5 minutes */
456static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457static void mcheck_timer(unsigned long);
458static DEFINE_PER_CPU(struct timer_list, mce_timer);
459
460static void mcheck_timer(unsigned long data)
461{
462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n;
464
465 WARN_ON(smp_processor_id() != data);
466
467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks));
470
471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval.
474 */
475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100);
478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 }
481
482 t->expires = jiffies + *n;
483 add_timer(t);
484}
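/*
 * Worked example (illustration, assuming HZ=1000 and the default
 * check_interval of 5 minutes): the per-CPU interval starts at 300*HZ
 * jiffies; each run that notified userspace halves it, down to the HZ/100
 * (10ms) floor, and each quiet run doubles it again, back up to ~300*HZ.
 */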
485
486static void mce_do_trigger(struct work_struct *work)
487{
488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
489}
490
491static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492
493/*
494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI
496 * context.
497 */
498int mce_notify_user(void)
499{
500 /* Not more than two messages every minute */
501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
502
503 clear_thread_flag(TIF_MCE_NOTIFY);
504 if (test_and_clear_bit(0, &notify_user)) {
505 wake_up_interruptible(&mce_wait);
506
507 /*
508 * There is no risk of missing notifications because
509 * work_pending is always cleared before the function is
510 * executed.
511 */
512 if (trigger[0] && !work_pending(&mce_trigger_work))
513 schedule_work(&mce_trigger_work);
514
515 if (__ratelimit(&ratelimit))
516 printk(KERN_INFO "Machine check events logged\n");
517
518 return 1;
519 }
520 return 0;
521}
522
523/* see if the idle task needs to notify userspace */
524static int
525mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526{
527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user();
530
531 return NOTIFY_OK;
532}
533
534static struct notifier_block mce_idle_notifier = {
535 .notifier_call = mce_idle_callback,
536};
537
538static __init int periodic_mcheck_init(void)
539{
540 idle_notifier_register(&mce_idle_notifier);
541 return 0;
542}
543__initcall(periodic_mcheck_init);
544
545/*
546 * Initialize Machine Checks for a CPU.
547 */
548static int mce_cap_init(void)
549{
550 u64 cap;
551 unsigned b;
552
553 rdmsrl(MSR_IA32_MCG_CAP, cap);
554 b = cap & 0xff;
555 if (b > MAX_NR_BANKS) {
556 printk(KERN_WARNING
557 "MCE: Using only %u machine check banks out of %u\n",
558 MAX_NR_BANKS, b);
559 b = MAX_NR_BANKS;
560 }
561
562 /* Don't support asymmetric configurations today */
563 WARN_ON(banks != 0 && b != banks);
564 banks = b;
565 if (!bank) {
566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
567 if (!bank)
568 return -ENOMEM;
569 memset(bank, 0xff, banks * sizeof(u64));
570 }
571
572 /* Use accurate RIP reporting if available. */
573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
574 rip_msr = MSR_IA32_MCG_EIP;
575
576 return 0;
577}
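/*
 * MCG_CAP layout relied on above (per the Intel SDM, for illustration):
 * bits 7:0 hold the bank count, bit 8 is MCG_CTL_P, bit 9 is MCG_EXT_P
 * (extended state registers present) and bits 23:16 the extended register
 * count; MCG_EIP is only trusted when at least 9 extended registers exist.
 */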
578
579static void mce_init(void *dummy)
580{
581 u64 cap;
582 int i;
583 mce_banks_t all_banks;
584
585 /*
586 * Log the machine checks left over from the previous reset.
587 */
588 bitmap_fill(all_banks, MAX_NR_BANKS);
589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
590
591 set_in_cr4(X86_CR4_MCE);
592
593 rdmsrl(MSR_IA32_MCG_CAP, cap);
594 if (cap & MCG_CTL_P)
595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
596
597 for (i = 0; i < banks; i++) {
598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600 }
601}
602
603/* Add per CPU specific workarounds here */
604static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605{
606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) {
608		if (c->x86 == 15 && banks > 4)
609			/* Disable GART table walk error reporting, which trips
610			 * off incorrectly with the IOMMU & 3ware & Cerberus. */
611			clear_bit(10, (unsigned long *)&bank[4]);
612		if (c->x86 <= 17 && mce_bootlog < 0)
613			/* Lots of broken BIOSes around that don't clear them
614			 * by default and leave crap in there. Don't log. */
615			mce_bootlog = 0;
616 }
617
618}
619
620static void mce_cpu_features(struct cpuinfo_x86 *c)
621{
622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c);
625 break;
626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c);
628 break;
629 default:
630 break;
631 }
632}
633
634static void mce_init_timer(void)
635{
636 struct timer_list *t = &__get_cpu_var(mce_timer);
637 int *n = &__get_cpu_var(next_interval);
638
639 *n = check_interval * HZ;
640 if (!*n)
641 return;
642 setup_timer(t, mcheck_timer, smp_processor_id());
643 t->expires = round_jiffies(jiffies + *n);
644 add_timer(t);
645}
646
647/*
648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off.
650 */
651void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
652{
653 if (!mce_available(c))
654 return;
655
656 if (mce_cap_init() < 0) {
657 mce_dont_init = 1;
658 return;
659 }
660 mce_cpu_quirks(c);
661
662 mce_init(NULL);
663 mce_cpu_features(c);
664 mce_init_timer();
665}
666
667/*
668 * Character device to read and clear the MCE log.
669 */
670
671static DEFINE_SPINLOCK(mce_state_lock);
672static int open_count; /* #times opened */
673static int open_exclu; /* already open exclusive? */
674
675static int mce_open(struct inode *inode, struct file *file)
676{
677 lock_kernel();
678 spin_lock(&mce_state_lock);
679
680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
681 spin_unlock(&mce_state_lock);
682 unlock_kernel();
683 return -EBUSY;
684 }
685
686 if (file->f_flags & O_EXCL)
687 open_exclu = 1;
688 open_count++;
689
690 spin_unlock(&mce_state_lock);
691 unlock_kernel();
692
693 return nonseekable_open(inode, file);
694}
695
696static int mce_release(struct inode *inode, struct file *file)
697{
698 spin_lock(&mce_state_lock);
699
700 open_count--;
701 open_exclu = 0;
702
703 spin_unlock(&mce_state_lock);
704
705 return 0;
706}
707
708static void collect_tscs(void *data)
709{
710 unsigned long *cpu_tsc = (unsigned long *)data;
711
712 rdtscll(cpu_tsc[smp_processor_id()]);
713}
714
715static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
716 loff_t *off)
717{
718 unsigned long *cpu_tsc;
719 static DEFINE_MUTEX(mce_read_mutex);
720 unsigned prev, next;
721 char __user *buf = ubuf;
722 int i, err;
723
724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
725 if (!cpu_tsc)
726 return -ENOMEM;
727
728 mutex_lock(&mce_read_mutex);
729 next = rcu_dereference(mcelog.next);
730
731 /* Only supports full reads right now */
732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
733 mutex_unlock(&mce_read_mutex);
734 kfree(cpu_tsc);
735 return -EINVAL;
736 }
737
738 err = 0;
739 prev = 0;
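	/*
	 * Drain protocol (descriptive note): mcelog.next is the producer
	 * index; slots below it may still have finished == 0 while an
	 * NMI-context writer copies them in, so spin briefly and give up
	 * on a slot after ~2 jiffies.  The cmpxchg below only resets next
	 * to 0 if nothing new was appended while we were copying.
	 */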
740 do {
741 for (i = prev; i < next; i++) {
742 unsigned long start = jiffies;
743
744 while (!mcelog.entry[i].finished) {
745 if (time_after_eq(jiffies, start + 2)) {
746 memset(mcelog.entry + i, 0,
747 sizeof(struct mce));
748 goto timeout;
749 }
750 cpu_relax();
751 }
752 smp_rmb();
753 err |= copy_to_user(buf, mcelog.entry + i,
754 sizeof(struct mce));
755 buf += sizeof(struct mce);
756timeout:
757 ;
758 }
759
760 memset(mcelog.entry + prev, 0,
761 (next - prev) * sizeof(struct mce));
762 prev = next;
763 next = cmpxchg(&mcelog.next, prev, 0);
764 } while (next != prev);
765
766 synchronize_sched();
767
768 /*
769 * Collect entries that were still getting written before the
770 * synchronize.
771 */
772 on_each_cpu(collect_tscs, cpu_tsc, 1);
773 for (i = next; i < MCE_LOG_LEN; i++) {
774 if (mcelog.entry[i].finished &&
775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
776 err |= copy_to_user(buf, mcelog.entry+i,
777 sizeof(struct mce));
778 smp_rmb();
779 buf += sizeof(struct mce);
780 memset(&mcelog.entry[i], 0, sizeof(struct mce));
781 }
782 }
783 mutex_unlock(&mce_read_mutex);
784 kfree(cpu_tsc);
785 return err ? -EFAULT : buf - ubuf;
786}
787
788static unsigned int mce_poll(struct file *file, poll_table *wait)
789{
790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM;
793 return 0;
794}
795
796static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
797{
798 int __user *p = (int __user *)arg;
799
800 if (!capable(CAP_SYS_ADMIN))
801 return -EPERM;
802 switch (cmd) {
803 case MCE_GET_RECORD_LEN:
804 return put_user(sizeof(struct mce), p);
805 case MCE_GET_LOG_LEN:
806 return put_user(MCE_LOG_LEN, p);
807 case MCE_GETCLEAR_FLAGS: {
808 unsigned flags;
809
810 do {
811 flags = mcelog.flags;
812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
813 return put_user(flags, p);
814 }
815 default:
816 return -ENOTTY;
817 }
818}
819
820static const struct file_operations mce_chrdev_ops = {
821 .open = mce_open,
822 .release = mce_release,
823 .read = mce_read,
824 .poll = mce_poll,
825 .unlocked_ioctl = mce_ioctl,
826};
827
828static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR,
830 "mcelog",
831 &mce_chrdev_ops,
832};
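/*
 * Illustrative userspace sketch (not part of this file): draining
 * /dev/mcelog roughly the way the mcelog(8) daemon does.  It assumes the
 * MCE_GET_* ioctls are visible to userspace through the kernel's
 * <asm/mce.h>, and sizes the buffer for a full log because mce_read()
 * above only accepts full-sized reads.
 */
#if 0
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>		/* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN */

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int rec_len, log_len;
	char *buf;
	ssize_t n;

	if (fd < 0)
		return 1;
	if (ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &log_len) < 0)
		return 1;

	buf = malloc((size_t)rec_len * log_len);
	if (!buf)
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		poll(&pfd, 1, -1);	/* block until events are pending */
		n = read(fd, buf, (size_t)rec_len * log_len);
		if (n > 0)
			printf("read %zd bytes (%zd records)\n",
			       n, n / rec_len);
	}
}
#endif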
833
834/*
835 * Old style boot options parsing. Only for compatibility.
836 */
837static int __init mcheck_disable(char *str)
838{
839 mce_dont_init = 1;
840 return 1;
841}
842
843/* mce=off disables machine check.
844 * mce=TOLERANCELEVEL (number, see above)
845 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 * mce=nobootlog Don't log MCEs from before booting. */
847static int __init mcheck_enable(char *str)
848{
849 if (!strcmp(str, "off"))
850 mce_dont_init = 1;
851	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
852		mce_bootlog = (str[0] == 'b');
853	else if (isdigit(str[0]))
854		get_option(&str, &tolerant);
855	else
856		printk(KERN_WARNING "mce= argument %s ignored. Please use /sys\n", str);
857 return 1;
858}
859
860__setup("nomce", mcheck_disable);
861__setup("mce=", mcheck_enable);
862
863/*
864 * Sysfs support
865 */
866
867/*
868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later.
870 */
871static int mce_disable(void)
872{
873 int i;
874
875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0;
878}
879
880static int mce_suspend(struct sys_device *dev, pm_message_t state)
881{
882 return mce_disable();
883}
884
885static int mce_shutdown(struct sys_device *dev)
886{
887 return mce_disable();
888}
889
890/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
891 * Only one CPU is active at this time, the others get re-added later
892 * using CPU hotplug. */
893static int mce_resume(struct sys_device *dev)
894{
895 mce_init(NULL);
896 mce_cpu_features(&current_cpu_data);
897 return 0;
898}
899
900static void mce_cpu_restart(void *data)
901{
902 del_timer_sync(&__get_cpu_var(mce_timer));
903 if (mce_available(&current_cpu_data))
904 mce_init(NULL);
905 mce_init_timer();
906}
907
908/* Reinit MCEs after user configuration changes */
909static void mce_restart(void)
910{
911 on_each_cpu(mce_cpu_restart, NULL, 1);
912}
913
914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
917 .resume = mce_resume,
918 .name = "machinecheck",
919};
920
921DEFINE_PER_CPU(struct sys_device, device_mce);
922void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923
924/* Why are there no generic functions for this? */
925#define ACCESSOR(name, var, start) \
926 static ssize_t show_ ## name(struct sys_device *s, \
927 struct sysdev_attribute *attr, \
928 char *buf) { \
929 return sprintf(buf, "%lx\n", (unsigned long)var); \
930 } \
931 static ssize_t set_ ## name(struct sys_device *s, \
932 struct sysdev_attribute *attr, \
933 const char *buf, size_t siz) { \
934 char *end; \
935 unsigned long new = simple_strtoul(buf, &end, 0); \
936 if (end == buf) return -EINVAL; \
937 var = new; \
938 start; \
939 return end-buf; \
940 } \
941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942
943static struct sysdev_attribute *bank_attrs;
944
945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf)
947{
948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b);
950}
951
952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
963
964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf)
966{
967 strcpy(buf, trigger);
968 strcat(buf, "\n");
969 return strlen(trigger) + 1;
970}
971
972static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973			   const char *buf, size_t siz)
974{
975	char *p;
976	int len;
977	strncpy(trigger, buf, sizeof(trigger));
978	trigger[sizeof(trigger)-1] = 0;
979	len = strlen(trigger);
980	p = strchr(trigger, '\n');
981	if (p) *p = 0;
982 return len;
983}
984
985static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
987ACCESSOR(check_interval, check_interval, mce_restart())
988static struct sysdev_attribute *mce_attributes[] = {
989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990 NULL
991};
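/*
 * Resulting sysfs layout (illustration, given the "machinecheck" sysdev
 * class above): /sys/devices/system/machinecheck/machinecheck<N>/ exposes
 * tolerant, check_interval and trigger, plus one bank<i> file per reported
 * bank; writing check_interval or a bank<i> value triggers mce_restart().
 */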
992
993static cpumask_var_t mce_device_initialized;
994
995/* Per-CPU sysdev init. All of the CPUs still share the same bank ctl values. */
996static __cpuinit int mce_create_device(unsigned int cpu)
997{
998 int err;
999 int i;
1000
1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO;
1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err)
1010 return err;
1011
1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]);
1015 if (err)
1016 goto error;
1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025
1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
1032error:
1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]);
1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038
1039 return err;
1040}
1041
1042static __cpuinit void mce_remove_device(unsigned int cpu)
1043{
1044 int i;
1045
1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047 return;
1048
1049 for (i = 0; mce_attributes[i]; i++)
1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
1055 sysdev_unregister(&per_cpu(device_mce,cpu));
1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1088 unsigned long action, void *hcpu)
1089{
1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
1092
1093 switch (action) {
1094 case CPU_ONLINE:
1095 case CPU_ONLINE_FROZEN:
1096 mce_create_device(cpu);
1097 if (threshold_cpu_callback)
1098 threshold_cpu_callback(action, cpu);
1099 break;
1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN:
1102 if (threshold_cpu_callback)
1103 threshold_cpu_callback(action, cpu);
1104 mce_remove_device(cpu);
1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies +
1114 __get_cpu_var(next_interval));
1115 add_timer_on(t, cpu);
1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1117 break;
1118 case CPU_POST_DEAD:
1119 /* intentionally ignoring frozen here */
1120 cmci_rediscover(cpu);
1121 break;
1122 }
1123 return NOTIFY_OK;
1124}
1125
1126static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback,
1128};
1129
1130static __init int mce_init_banks(void)
1131{
1132 int i;
1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL);
1136 if (!bank_attrs)
1137 return -ENOMEM;
1138
1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name)
1143 goto nomem;
1144 a->attr.mode = 0644;
1145 a->show = show_bank;
1146 a->store = set_bank;
1147 }
1148 return 0;
1149
1150nomem:
1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs);
1154 bank_attrs = NULL;
1155 return -ENOMEM;
1156}
1157
1158static __init int mce_init_device(void)
1159{
1160 int err;
1161 int i = 0;
1162
1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO;
1165
1166 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167
1168 err = mce_init_banks();
1169 if (err)
1170 return err;
1171
1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err)
1174 return err;
1175
1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i);
1178 if (err)
1179 return err;
1180 }
1181
1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device);
1184 return err;
1185}
1186
1187device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000000..2b011d2d8579
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
1/*
2 * Common code for Intel machine checks
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9
10#include <asm/therm_throt.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/apic.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18void intel_init_thermal(struct cpuinfo_x86 *c)
19{
20 unsigned int cpu = smp_processor_id();
21 int tm2 = 0;
22 u32 l, h;
23
24 /* Thermal monitoring depends on ACPI and clock modulation*/
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
26 return;
27
28 /*
29 * First check if its enabled already, in which case there might
30 * be some SMM goo which handles it, so we can't even put a handler
31 * since it might be delivered via SMI already:
32 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
34 h = apic_read(APIC_LVTTHMR);
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
36 printk(KERN_DEBUG
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
38 return;
39 }
40
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
42 tm2 = 1;
43
44 /* Check whether a vector already exists */
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return;
50 }
51
52 /* We'll mask the thermal vector in the lapic till we're ready: */
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
54 apic_write(APIC_LVTTHMR, h);
55
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
57 wrmsr(MSR_IA32_THERM_INTERRUPT,
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
59
60 intel_set_thermal_handler();
61
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
64
65 /* Unmask the thermal vector: */
66 l = apic_read(APIC_LVTTHMR);
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
70 cpu, tm2 ? "TM2" : "TM1");
71
72 /* enable thermal throttle processing */
73 atomic_set(&therm_throt_en, 1);
74}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 65a0fceedcd7..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -16,6 +16,8 @@
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18 18
19#include "mce.h"
20
19asmlinkage void smp_thermal_interrupt(void) 21asmlinkage void smp_thermal_interrupt(void)
20{ 22{
21 __u64 msr_val; 23 __u64 msr_val;
@@ -26,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
26 irq_enter(); 28 irq_enter();
27 29
28 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
29 if (therm_throt_process(msr_val & 1)) 31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
30 mce_log_therm_throt_event(msr_val); 32 mce_log_therm_throt_event(msr_val);
31 33
32 inc_irq_stat(irq_thermal_count); 34 inc_irq_stat(irq_thermal_count);
33 irq_exit(); 35 irq_exit();
34} 36}
35 37
36static void intel_init_thermal(struct cpuinfo_x86 *c)
37{
38 u32 l, h;
39 int tm2 = 0;
40 unsigned int cpu = smp_processor_id();
41
42 if (!cpu_has(c, X86_FEATURE_ACPI))
43 return;
44
45 if (!cpu_has(c, X86_FEATURE_ACC))
46 return;
47
48 /* first check if TM1 is already enabled by the BIOS, in which
49 * case there might be some SMM goo which handles it, so we can't even
50 * put a handler since it might be delivered via SMI already.
51 */
52 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
53 h = apic_read(APIC_LVTTHMR);
54 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
55 printk(KERN_DEBUG
56 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
57 return;
58 }
59
60 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
61 tm2 = 1;
62
63 if (h & APIC_VECTOR_MASK) {
64 printk(KERN_DEBUG
65 "CPU%d: Thermal LVT vector (%#x) already "
66 "installed\n", cpu, (h & APIC_VECTOR_MASK));
67 return;
68 }
69
70 h = THERMAL_APIC_VECTOR;
71 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
72 apic_write(APIC_LVTTHMR, h);
73
74 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
75 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
76
77 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
78 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
79
80 l = apic_read(APIC_LVTTHMR);
81 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
82 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
83 cpu, tm2 ? "TM2" : "TM1");
84
85 /* enable thermal throttle processing */
86 atomic_set(&therm_throt_en, 1);
87 return;
88}
89
90/* 38/*
91 * Support for Intel Correct Machine Check Interrupts. This allows 39 * Support for Intel Correct Machine Check Interrupts. This allows
92 * the CPU to raise an interrupt when a corrected machine check happened. 40 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -108,6 +56,9 @@ static int cmci_supported(int *banks)
108{ 56{
109 u64 cap; 57 u64 cap;
110 58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
111 /* 62 /*
112 * Vendor check is not strictly needed, but the initial 63 * Vendor check is not strictly needed, but the initial
113 * initialization is vendor keyed and this 64 * initialization is vendor keyed and this
@@ -131,7 +82,7 @@ static int cmci_supported(int *banks)
131static void intel_threshold_interrupt(void) 82static void intel_threshold_interrupt(void)
132{ 83{
133 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
134 mce_notify_user(); 85 mce_notify_irq();
135} 86}
136 87
137static void print_update(char *type, int *hdr, int num) 88static void print_update(char *type, int *hdr, int num)
@@ -247,7 +198,7 @@ void cmci_rediscover(int dying)
247 return; 198 return;
248 cpumask_copy(old, &current->cpus_allowed); 199 cpumask_copy(old, &current->cpus_allowed);
249 200
250 for_each_online_cpu (cpu) { 201 for_each_online_cpu(cpu) {
251 if (cpu == dying) 202 if (cpu == dying)
252 continue; 203 continue;
253 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..70b710420f74 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
@@ -22,9 +21,9 @@
22 21
23#include "mce.h" 22#include "mce.h"
24 23
25static int firstbank; 24static int firstbank;
26 25
27#define MCE_RATE 15*HZ /* timer rate is 15s */ 26#define MCE_RATE (15*HZ) /* timer rate is 15s */
28 27
29static void mce_checkregs(void *info) 28static void mce_checkregs(void *info)
30{ 29{
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 33 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 35
37 if (high & (1<<31)) { 36 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 37 continue;
39 "fatal, correctable incident occurred on " 38
40 "CPU %d.\n", 39 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
40 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 41 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 42
43 43 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 44
45 * Scrub the error so we don't pick it up in MCE_RATE 45 /*
46 * seconds time. 46 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 47 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 48 */
49 49 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 50
51 wmb(); 51 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 52 wmb();
53 } 53 add_taint(TAINT_MACHINE_CHECK);
54 } 54 }
55} 55}
56 56
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
77 77
78 /* Some Athlons misbehave when we frob bank 0 */ 78 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 80 boot_cpu_data.x86 == 6)
81 firstbank = 1; 81 firstbank = 1;
82 else 82 else
83 firstbank = 0; 83 firstbank = 0;
84 84
85 /* 85 /*
86 * Check for non-fatal errors every MCE_RATE s 86 * Check for non-fatal errors every MCE_RATE s
87 */ 87 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 89 printk(KERN_INFO "Machine check exception polling timer started.\n");
90
90 return 0; 91 return 0;
91} 92}
92module_init(init_nonfatal_mce_checker); 93module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..82cee108a2d3 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4 4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/therm_throt.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15 15#include <asm/msr.h>
16#include <asm/therm_throt.h>
17 16
18#include "mce.h" 17#include "mce.h"
19 18
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
36 35
37 36
38#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
44} 44}
45 45
46/* P4/Xeon Thermal transition interrupt handler */ 46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs) 47static void intel_thermal_interrupt(struct pt_regs *regs)
48{ 48{
49 __u64 msr_val; 49 __u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
51 ack_APIC_irq(); 51 ack_APIC_irq();
52 52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1); 54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55} 55}
56 56
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
59 60
60void smp_thermal_interrupt(struct pt_regs *regs) 61void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
65 irq_exit(); 66 irq_exit();
66} 67}
67 68
68/* P4/Xeon Thermal regulation detect and init */ 69void intel_set_thermal_handler(void)
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{ 70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 71 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123} 72}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125 73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
126 75
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 78{
130 u32 h; 79 u32 h;
131 80
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 92
144static void intel_machine_check(struct pt_regs *regs, long error_code) 93static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 94{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 95 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 96 u32 mcgstl, mcgsth;
97 int recover = 1;
149 int i; 98 int i;
150 99
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 100 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 106
158 if (mce_num_extended_msrs > 0) { 107 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 108 struct intel_mce_extended_msrs dbg;
109
160 intel_get_extended_msrs(&dbg); 110 intel_get_extended_msrs(&dbg);
111
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 112 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 113 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 114 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 122 if (high & (1<<31)) {
172 char misc[20]; 123 char misc[20];
173 char addr[24]; 124 char addr[24];
125
174 misc[0] = addr[0] = '\0'; 126 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 127 if (high & (1<<29))
176 recover |= 1; 128 recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 148 panic("Unable to continue");
197 149
198 printk(KERN_EMERG "Attempting to continue.\n"); 150 printk(KERN_EMERG "Attempting to continue.\n");
151
199 /* 152 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 153 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 154 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 170 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 171}
219 172
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 173void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 174{
223 u32 l, h; 175 u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..015f481ab1b0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,39 +14,58 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine check handler for Pentium class Intel */ 17/* By default disabled */
18int mce_p5_enable;
19
20/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 21static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 22{
21 u32 loaddr, hi, lotype; 23 u32 loaddr, hi, lotype;
24
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 25 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 26 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 27
25 if (lotype&(1<<5)) 28 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 29 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
30 smp_processor_id(), loaddr, lotype);
31
32 if (lotype & (1<<5)) {
33 printk(KERN_EMERG
34 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
35 smp_processor_id());
36 }
37
27 add_taint(TAINT_MACHINE_CHECK); 38 add_taint(TAINT_MACHINE_CHECK);
28} 39}
29 40
30/* Set up machine check reporting for processors with Intel style MCE */ 41/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 42void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 43{
33 u32 l, h; 44 u32 l, h;
34 45
35 /*Check for MCE support */ 46 /* Check for MCE support: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 47 if (!cpu_has(c, X86_FEATURE_MCE))
37 return; 48 return;
38 49
39 /* Default P5 to off as its often misconnected */ 50#ifdef CONFIG_X86_OLD_MCE
51 /* Default P5 to off as its often misconnected: */
40 if (mce_disabled != -1) 52 if (mce_disabled != -1)
41 return; 53 return;
54#endif
55
42 machine_check_vector = pentium_machine_check; 56 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 58 wmb();
44 59
45 /* Read registers before enabling */ 60 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 61 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 62 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 63 printk(KERN_INFO
64 "Intel old style machine check architecture supported.\n");
49 65
50 /* Enable MCE */ 66 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 67 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 68 printk(KERN_INFO
69 "Intel old style machine check reporting enabled on CPU#%d.\n",
70 smp_processor_id());
53} 71}
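The new comment in intel_p5_mcheck_init() makes the ordering explicit: machine_check_vector must be published before CR4.MCE is set, hence the wmb() between the assignment and the enable. A stand-alone sketch of the same publish-then-enable pattern, using C11 release/acquire atomics in place of the kernel's barrier (publish_handler(), deliver_event() and the handlers are illustrative names, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef void (*mce_handler_t)(int code);

static void default_handler(int code) { printf("unexpected event %d\n", code); }
static void pentium_handler(int code) { printf("P5-style handler, code %d\n", code); }

/* The "vector" that consumers call through, plus the enable flag. */
static _Atomic(mce_handler_t) handler_vector = default_handler;
static atomic_bool events_enabled = false;

static void publish_handler(mce_handler_t h)
{
    /* Release store: the handler is visible before ... */
    atomic_store_explicit(&handler_vector, h, memory_order_release);
    /* ... the flag that lets events be delivered (the CR4.MCE analogue). */
    atomic_store_explicit(&events_enabled, true, memory_order_release);
}

static void deliver_event(int code)
{
    if (!atomic_load_explicit(&events_enabled, memory_order_acquire))
        return;
    mce_handler_t h = atomic_load_explicit(&handler_vector, memory_order_acquire);
    h(code); /* once "enabled" is observed, the stale default cannot be seen */
}

int main(void)
{
    publish_handler(pentium_handler);
    deliver_event(42);
    return 0;
}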
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..43c24e667457 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -18,9 +17,9 @@
18/* Machine Check Handler For PII/PIII */ 17/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 18static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 } 58 }
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 67 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 68 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable.This will allow BIOS to look at the MSRs 69 * recoverable/continuable.This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 70 * for errors if the OS could not log the error:
67 */ 71 */
68 for (i = 0; i < nr_mce_banks; i++) { 72 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 73 unsigned int msr;
74
70 msr = MSR_IA32_MC0_STATUS+i*4; 75 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 76 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 77 if (high & (1<<31)) {
73 /* Clear it */ 78 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 79 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 80 /* Serialize: */
76 wmb(); 81 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 82 add_taint(TAINT_MACHINE_CHECK);
78 } 83 }
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 86 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 87}
83 88
84/* Set up machine check reporting for processors with Intel style MCE */ 89/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 90void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 91{
87 u32 l, h; 92 u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 102
98 /* Ok machine check is available */ 103 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 104 machine_check_vector = intel_machine_check;
105 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 106 wmb();
101 107
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 108 printk(KERN_INFO "Intel machine check architecture supported.\n");
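Both the P4 and P6 handlers above treat each MCi_STATUS bank as two 32-bit rdmsr() halves and only report and clear a bank when bit 31 of the high half is set. A small stand-alone model of that loop, with hard-coded sample values in place of real MSR reads (bank_status[], NR_BANKS and the sample numbers are invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define NR_BANKS 4

/* Stand-ins for rdmsr(MSR_IA32_MC0_STATUS + i*4, low, high). */
static const uint64_t bank_status[NR_BANKS] = {
    0x0000000000000000ULL,      /* bank 0: nothing logged */
    0xB200000000000185ULL,      /* bank 1: top bit set -> logged error */
    0x0000000000000000ULL,
    0x8000000000000042ULL,      /* bank 3: logged error */
};

int main(void)
{
    for (int i = 0; i < NR_BANKS; i++) {
        uint32_t low  = (uint32_t)(bank_status[i] & 0xffffffffULL);
        uint32_t high = (uint32_t)(bank_status[i] >> 32);

        /* Same test as the handlers: bit 31 of the high half. */
        if (high & (1u << 31)) {
            printf("CPU 0: Bank %d: %08x%08x\n", i,
                   (unsigned)high, (unsigned)low);
            /* The kernel would now wrmsr(msr, 0, 0) to clear the bank. */
        }
    }
    return 0;
}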
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..7b1ae2e20ba5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/notifier.h>
17#include <linux/jiffies.h>
17#include <linux/percpu.h> 18#include <linux/percpu.h>
18#include <linux/sysdev.h> 19#include <linux/sysdev.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <asm/cpu.h> 21
21#include <linux/notifier.h>
22#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 22#include <asm/therm_throt.h>
24 23
25/* How long to wait between reporting thermal events */ 24/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 25#define CHECK_INTERVAL (300 * HZ)
27 26
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 29
30atomic_t therm_throt_en = ATOMIC_INIT(0);
31 31
32#ifdef CONFIG_SYSFS 32#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 33#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 38 struct sysdev_attribute *attr, \
39 char *buf) \ 39 char *buf) \
40{ \ 40{ \
41 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 42 ssize_t ret; \
43 \ 43 \
44 preempt_disable(); /* CPU hotplug */ \ 44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 48 else \
49 ret = 0; \ 49 ret = 0; \
50 preempt_enable(); \ 50 preempt_enable(); \
51 \ 51 \
52 return ret; \ 52 return ret; \
53} 53}
54 54
55define_therm_throt_sysdev_show_func(count); 55define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 61};
62 62
63static struct attribute_group thermal_throttle_attr_group = { 63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 65 .name = "thermal_throttle"
66}; 66};
67#endif /* CONFIG_SYSFS */ 67#endif /* CONFIG_SYSFS */
68 68
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
110} 110}
111 111
112#ifdef CONFIG_SYSFS 112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 113/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj,
117 &thermal_throttle_attr_group);
117} 118}
118 119
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 120static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 122 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 123}
123 124
124/* Mutex protecting device creation against CPU hotplug */ 125/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 126static DEFINE_MUTEX(therm_cpu_lock);
126 127
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 128/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 129static __cpuinit int
129 unsigned long action, 130thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 131 unsigned long action,
132 void *hcpu)
131{ 133{
132 unsigned int cpu = (unsigned long)hcpu; 134 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 135 struct sys_device *sys_dev;
134 int err = 0; 136 int err = 0;
135 137
136 sys_dev = get_cpu_sysdev(cpu); 138 sys_dev = get_cpu_sysdev(cpu);
139
137 switch (action) { 140 switch (action) {
138 case CPU_UP_PREPARE: 141 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 142 case CPU_UP_PREPARE_FROZEN:
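For reference, the define_therm_throt_sysdev_show_func() macro reindented above stamps out one sysfs show routine per per-CPU counter; substituting count into the macro body yields roughly the function below. This is a reconstruction of the expansion for readability, not code that appears verbatim in the tree, and it is not compilable outside the kernel:

static ssize_t therm_throt_sysdev_show_count(struct sys_device *dev,
                                             struct sysdev_attribute *attr,
                                             char *buf)
{
        unsigned int cpu = dev->id;
        ssize_t ret;

        preempt_disable();      /* CPU hotplug */
        if (cpu_online(cpu))
                ret = sprintf(buf, "%lu\n",
                              per_cpu(thermal_throttle_count, cpu));
        else
                ret = 0;
        preempt_enable();

        return ret;
}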
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..81b02487090b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
@@ -14,7 +13,7 @@
14 13
15#include "mce.h" 14#include "mce.h"
16 15
17/* Machine check handler for WinChip C6 */ 16/* Machine check handler for WinChip C6: */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 17static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 19 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 24void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 25{
27 u32 lo, hi; 26 u32 lo, hi;
27
28 machine_check_vector = winchip_machine_check; 28 machine_check_vector = winchip_machine_check;
29 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 30 wmb();
31
30 rdmsr(MSR_IDT_FCR1, lo, hi); 32 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 33 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 34 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 35 wrmsr(MSR_IDT_FCR1, lo, hi);
36
34 set_in_cr4(X86_CR4_MCE); 37 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 38
39 printk(KERN_INFO
40 "Winchip machine check reporting enabled on CPU#0.\n");
36} 41}
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 895c82e78455..275bc142cd5d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -968,6 +968,13 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
968 if (!x86_pmu.num_counters_fixed) 968 if (!x86_pmu.num_counters_fixed)
969 return -1; 969 return -1;
970 970
971 /*
972 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
973 */
974 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
975 boot_cpu_data.x86_model == 28)
976 return -1;
977
971 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 978 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
972 979
973 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 980 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb3..b07af8861244 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev)
186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188}
189
185static int __init cpuid_init(void) 190static int __init cpuid_init(void)
186{ 191{
187 int i, err = 0; 192 int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
198 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
199 goto out_chrdev; 204 goto out_chrdev;
200 } 205 }
206 cpuid_class->nodename = cpuid_nodename;
201 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
202 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
203 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a4742a340d8d..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -963,6 +963,8 @@ END(\sym)
963#ifdef CONFIG_SMP 963#ifdef CONFIG_SMP
964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
966apicinterrupt REBOOT_VECTOR \
967 reboot_interrupt smp_reboot_interrupt
966#endif 968#endif
967 969
968#ifdef CONFIG_X86_UV 970#ifdef CONFIG_X86_UV
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
994#endif 996#endif
995 997
996apicinterrupt THRESHOLD_APIC_VECTOR \ 998apicinterrupt THRESHOLD_APIC_VECTOR \
997 threshold_interrupt mce_threshold_interrupt 999 threshold_interrupt smp_threshold_interrupt
998apicinterrupt THERMAL_APIC_VECTOR \ 1000apicinterrupt THERMAL_APIC_VECTOR \
999 thermal_interrupt smp_thermal_interrupt 1001 thermal_interrupt smp_thermal_interrupt
1000 1002
1003#ifdef CONFIG_X86_MCE
1004apicinterrupt MCE_SELF_VECTOR \
1005 mce_self_interrupt smp_mce_self_interrupt
1006#endif
1007
1001#ifdef CONFIG_SMP 1008#ifdef CONFIG_SMP
1002apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1009apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1003 call_function_single_interrupt smp_call_function_single_interrupt 1010 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1379,7 +1386,7 @@ errorentry xen_stack_segment do_stack_segment
1379errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1380errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1381#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
1382paranoidzeroentry machine_check do_machine_check 1389paranoidzeroentry machine_check *machine_check_vector(%rip)
1383#endif 1390#endif
1384 1391
1385 /* 1392 /*
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d4..5cf36c053ac4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/timex.h>
10#include <linux/delay.h> 11#include <linux/delay.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/io.h> 13#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269beab..270ff83efc11 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
12 12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15struct mm_struct init_mm = INIT_MM(init_mm);
16 15
17/* 16/*
18 * Initial thread structure. 17 * Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 38287b5f116e..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h>
15#include <asm/hw_irq.h> 16#include <asm/hw_irq.h>
16 17
17atomic_t irq_err_count; 18atomic_t irq_err_count;
@@ -96,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
96 for_each_online_cpu(j) 97 for_each_online_cpu(j)
97 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
98 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
99# ifdef CONFIG_X86_64 100# ifdef CONFIG_X86_MCE_THRESHOLD
100 seq_printf(p, "%*s: ", prec, "THR"); 101 seq_printf(p, "%*s: ", prec, "THR");
101 for_each_online_cpu(j) 102 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
103 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
104# endif 105# endif
105#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE
108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
111 seq_printf(p, " Machine check exceptions\n");
112 seq_printf(p, "%*s: ", prec, "MCP");
113 for_each_online_cpu(j)
114 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
115 seq_printf(p, " Machine check polls\n");
116#endif
106 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 117 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
107#if defined(CONFIG_X86_IO_APIC) 118#if defined(CONFIG_X86_IO_APIC)
108 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 119 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -185,10 +196,14 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185#endif 196#endif
186#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_MCE
187 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
188# ifdef CONFIG_X86_64 199# ifdef CONFIG_X86_MCE_THRESHOLD
189 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
190# endif 201# endif
191#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE
204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu);
206#endif
192 return sum; 207 return sum;
193} 208}
194 209
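The irq.c hunks export two new per-CPU counters, mce_exception_count and mce_poll_count, both as MCE/MCP rows in /proc/interrupts and as extra terms in the per-CPU interrupt total. A toy user-space model of that bookkeeping, with plain arrays standing in for the kernel's per-CPU variables (NR_CPUS and the sample counts are made up):

#include <stdio.h>

#define NR_CPUS 4

/* Stand-ins for per_cpu(mce_exception_count, cpu) / per_cpu(mce_poll_count, cpu). */
static unsigned int mce_exception_count[NR_CPUS] = { 1, 0, 2, 0 };
static unsigned int mce_poll_count[NR_CPUS]      = { 7, 9, 8, 6 };

/* Mirrors the /proc/interrupts rows added by the patch. */
static void show_mce_rows(void)
{
    printf("MCE: ");
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("%10u ", mce_exception_count[cpu]);
    printf(" Machine check exceptions\n");

    printf("MCP: ");
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("%10u ", mce_poll_count[cpu]);
    printf(" Machine check polls\n");
}

/* Mirrors the extra terms folded into arch_irq_stat_cpu(). */
static unsigned long long irq_stat_cpu(int cpu)
{
    unsigned long long sum = 0;

    sum += mce_exception_count[cpu];
    sum += mce_poll_count[cpu];
    return sum;
}

int main(void)
{
    show_mce_rows();
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d MCE total: %llu\n", cpu, irq_stat_cpu(cpu));
    return 0;
}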
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 267c6624c77f..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -173,6 +173,9 @@ static void __init smp_intr_init(void)
173 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
176
177 /* IPI used for rebooting/stopping */
178 alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
176#endif 179#endif
177#endif /* CONFIG_SMP */ 180#endif /* CONFIG_SMP */
178} 181}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6551dedee20c..a78ecad0c900 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <asm/timer.h>
30 31
31#define MMU_QUEUE_SIZE 1024 32#define MMU_QUEUE_SIZE 1024
32 33
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
230 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; 231 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
231 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; 232 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
232 } 233 }
234#ifdef CONFIG_X86_IO_APIC
235 no_timer_check = 1;
236#endif
233} 237}
234 238
235void __init kvm_guest_init(void) 239void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9c4461501fcb..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = {
236static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
237 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
238 .name = "microcode", 238 .name = "microcode",
239 .devnode = "cpu/microcode",
239 .fops = &microcode_fops, 240 .fops = &microcode_fops,
240}; 241};
241 242
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5c..89f386f044e4 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
1/* Kernel module help for x86-64 1/* Kernel module help for x86.
2 Copyright (C) 2001 Rusty Russell. 2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4 3
5 This program is free software; you can redistribute it and/or modify 4 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 5 it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/slab.h>
27#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h>
28 26
29#include <asm/system.h> 27#include <asm/system.h>
30#include <asm/page.h> 28#include <asm/page.h>
31#include <asm/pgtable.h> 29#include <asm/pgtable.h>
32 30
31#if 0
32#define DEBUGP printk
33#else
33#define DEBUGP(fmt...) 34#define DEBUGP(fmt...)
34 35#endif
35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region)
37{
38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */
41}
42 36
43void *module_alloc(unsigned long size) 37void *module_alloc(unsigned long size)
44{ 38{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
54 if (!area) 48 if (!area)
55 return NULL; 49 return NULL;
56 50
57 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); 51 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
52 PAGE_KERNEL_EXEC);
53}
54
55/* Free memory returned from module_alloc */
56void module_free(struct module *mod, void *module_region)
57{
58 vfree(module_region);
58} 59}
59#endif
60 60
61/* We don't need anything special. */ 61/* We don't need anything special. */
62int module_frob_arch_sections(Elf_Ehdr *hdr, 62int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
67 return 0; 67 return 0;
68} 68}
69 69
70#ifdef CONFIG_X86_32
71int apply_relocate(Elf32_Shdr *sechdrs,
72 const char *strtab,
73 unsigned int symindex,
74 unsigned int relsec,
75 struct module *me)
76{
77 unsigned int i;
78 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
79 Elf32_Sym *sym;
80 uint32_t *location;
81
82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info);
84 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
85 /* This is where to make the change */
86 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
87 + rel[i].r_offset;
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
91 + ELF32_R_SYM(rel[i].r_info);
92
93 switch (ELF32_R_TYPE(rel[i].r_info)) {
94 case R_386_32:
95 /* We add the value into the location given */
96 *location += sym->st_value;
97 break;
98 case R_386_PC32:
 99 /* Add the value, subtract its position */
100 *location += sym->st_value - (uint32_t)location;
101 break;
102 default:
103 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
104 me->name, ELF32_R_TYPE(rel[i].r_info));
105 return -ENOEXEC;
106 }
107 }
108 return 0;
109}
110
111int apply_relocate_add(Elf32_Shdr *sechdrs,
112 const char *strtab,
113 unsigned int symindex,
114 unsigned int relsec,
115 struct module *me)
116{
117 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
118 me->name);
119 return -ENOEXEC;
120}
121#else /*X86_64*/
70int apply_relocate_add(Elf64_Shdr *sechdrs, 122int apply_relocate_add(Elf64_Shdr *sechdrs,
71 const char *strtab, 123 const char *strtab,
72 unsigned int symindex, 124 unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
147 return -ENOSYS; 199 return -ENOSYS;
148} 200}
149 201
202#endif
203
150int module_finalize(const Elf_Ehdr *hdr, 204int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 205 const Elf_Shdr *sechdrs,
152 struct module *me) 206 struct module *me)
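The 32-bit relocation cases merged into module.c boil down to two formulas: R_386_32 stores S + A (the addend A is whatever already sits in *location), and R_386_PC32 stores S + A - P, where P is the address of the slot being patched. A stand-alone worked example of both computations, using arbitrary sample addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Sample values: symbol address S and the place being patched P. */
    uint32_t sym_value = 0xc1000040;    /* S */
    uint32_t place     = 0xc2000010;    /* P: address of the 32-bit slot */

    /* R_386_32: the slot initially holds the addend A; result is S + A. */
    uint32_t abs_slot = 0x00000008;     /* A */
    abs_slot += sym_value;
    printf("R_386_32   -> %08x (S + A)\n", (unsigned)abs_slot);

    /* R_386_PC32: result is S + A - P, i.e. a PC-relative displacement. */
    uint32_t rel_slot = 0xfffffffc;     /* A = -4, typical for call/jmp */
    rel_slot += sym_value - place;
    printf("R_386_PC32 -> %08x (S + A - P)\n", (unsigned)rel_slot);

    return 0;
}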
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e7..000000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85 /* Add the value, subtract its postition */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec626..98fd6cd4e3a4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 196 .notifier_call = msr_class_cpu_callback,
197}; 197};
198 198
199static char *msr_nodename(struct device *dev)
200{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202}
203
199static int __init msr_init(void) 204static int __init msr_init(void)
200{ 205{
201 int i, err = 0; 206 int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
212 err = PTR_ERR(msr_class); 217 err = PTR_ERR(msr_class);
213 goto out_chrdev; 218 goto out_chrdev;
214 } 219 }
220 msr_class->nodename = msr_nodename;
215 for_each_online_cpu(i) { 221 for_each_online_cpu(i) {
216 err = msr_device_create(i); 222 err = msr_device_create(i);
217 if (err != 0) 223 if (err != 0)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3bb2be1649bd..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,7 +63,7 @@ void arch_task_cache_init(void)
63 task_xstate_cachep = 63 task_xstate_cachep =
64 kmem_cache_create("task_xstate", xstate_size, 64 kmem_cache_create("task_xstate", xstate_size,
65 __alignof__(union thread_xstate), 65 __alignof__(union thread_xstate),
66 SLAB_PANIC, NULL); 66 SLAB_PANIC | SLAB_NOTRACK, NULL);
67} 67}
68 68
69/* 69/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d1c636bf31a7..be5ae80f897f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -301,15 +301,13 @@ static void __init reserve_brk(void)
301 301
302#ifdef CONFIG_BLK_DEV_INITRD 302#ifdef CONFIG_BLK_DEV_INITRD
303 303
304#ifdef CONFIG_X86_32
305
306#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 304#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
307static void __init relocate_initrd(void) 305static void __init relocate_initrd(void)
308{ 306{
309 307
310 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 308 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
311 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 309 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
312 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 310 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
313 u64 ramdisk_here; 311 u64 ramdisk_here;
314 unsigned long slop, clen, mapaddr; 312 unsigned long slop, clen, mapaddr;
315 char *p, *q; 313 char *p, *q;
@@ -365,14 +363,13 @@ static void __init relocate_initrd(void)
365 ramdisk_image, ramdisk_image + ramdisk_size - 1, 363 ramdisk_image, ramdisk_image + ramdisk_size - 1,
366 ramdisk_here, ramdisk_here + ramdisk_size - 1); 364 ramdisk_here, ramdisk_here + ramdisk_size - 1);
367} 365}
368#endif
369 366
370static void __init reserve_initrd(void) 367static void __init reserve_initrd(void)
371{ 368{
372 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 369 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
373 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 370 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
374 u64 ramdisk_end = ramdisk_image + ramdisk_size; 371 u64 ramdisk_end = ramdisk_image + ramdisk_size;
375 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 372 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
376 373
377 if (!boot_params.hdr.type_of_loader || 374 if (!boot_params.hdr.type_of_loader ||
378 !ramdisk_image || !ramdisk_size) 375 !ramdisk_image || !ramdisk_size)
@@ -402,14 +399,8 @@ static void __init reserve_initrd(void)
402 return; 399 return;
403 } 400 }
404 401
405#ifdef CONFIG_X86_32
406 relocate_initrd(); 402 relocate_initrd();
407#else 403
408 printk(KERN_ERR "initrd extends beyond end of memory "
409 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
410 ramdisk_end, end_of_lowmem);
411 initrd_start = 0;
412#endif
413 free_early(ramdisk_image, ramdisk_end); 404 free_early(ramdisk_image, ramdisk_end);
414} 405}
415#else 406#else
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0a813b17b172..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -24,11 +24,11 @@
24#include <asm/ucontext.h> 24#include <asm/ucontext.h>
25#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/vdso.h> 26#include <asm/vdso.h>
27#include <asm/mce.h>
27 28
28#ifdef CONFIG_X86_64 29#ifdef CONFIG_X86_64
29#include <asm/proto.h> 30#include <asm/proto.h>
30#include <asm/ia32_unistd.h> 31#include <asm/ia32_unistd.h>
31#include <asm/mce.h>
32#endif /* CONFIG_X86_64 */ 32#endif /* CONFIG_X86_64 */
33 33
34#include <asm/syscall.h> 34#include <asm/syscall.h>
@@ -856,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
856void 856void
857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
858{ 858{
859#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 859#ifdef CONFIG_X86_NEW_MCE
860 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
861 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
862 mce_notify_user(); 862 mce_notify_process();
863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
864 864
865 /* deal with pending signal delivery */ 865 /* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index f6db48c405b8..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
150 * this function calls the 'stop' function on all other CPUs in the system. 150 * this function calls the 'stop' function on all other CPUs in the system.
151 */ 151 */
152 152
153asmlinkage void smp_reboot_interrupt(void)
154{
155 ack_APIC_irq();
156 irq_enter();
157 stop_this_cpu(NULL);
158 irq_exit();
159}
160
153static void native_smp_send_stop(void) 161static void native_smp_send_stop(void)
154{ 162{
155 unsigned long flags; 163 unsigned long flags;
164 unsigned long wait;
156 165
157 if (reboot_force) 166 if (reboot_force)
158 return; 167 return;
159 168
160 smp_call_function(stop_this_cpu, NULL, 0); 169 /*
 170 * Use a dedicated vector here because smp_call_function
171 * does lots of things not suitable in a panic situation.
172 * On most systems we could also use an NMI here,
173 * but there are a few systems around where NMI
 174 * is problematic, so stay with a non-NMI IPI for now
175 * (this implies we cannot stop CPUs spinning with irq off
176 * currently)
177 */
178 if (num_online_cpus() > 1) {
179 apic->send_IPI_allbutself(REBOOT_VECTOR);
180
181 /* Don't wait longer than a second */
182 wait = USEC_PER_SEC;
183 while (num_online_cpus() > 1 && wait--)
184 udelay(1);
185 }
186
161 local_irq_save(flags); 187 local_irq_save(flags);
162 disable_local_APIC(); 188 disable_local_APIC();
163 local_irq_restore(flags); 189 local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
172{ 198{
173 ack_APIC_irq(); 199 ack_APIC_irq();
174 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
175} 204}
176 205
177void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c80007ea5f7..2fecda69ee64 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
873 873
874 err = do_boot_cpu(apicid, cpu); 874 err = do_boot_cpu(apicid, cpu);
875 875
876 zap_low_mappings(); 876 zap_low_mappings(false);
877 low_mappings = 0; 877 low_mappings = 0;
878#else 878#else
879 err = do_boot_cpu(apicid, cpu); 879 err = do_boot_cpu(apicid, cpu);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 07d60c870ce2..5f935f0d5861 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 535
535 get_debugreg(condition, 6); 536 get_debugreg(condition, 6);
536 537
538 /* Catch kmemcheck conditions first of all! */
539 if (condition & DR_STEP && kmemcheck_trap(regs))
540 return;
541
537 /* 542 /*
538 * The processor cleared BTF, so don't mark that we need it set. 543 * The processor cleared BTF, so don't mark that we need it set.
539 */ 544 */
@@ -798,15 +803,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
798 803
799 return new_kesp; 804 return new_kesp;
800} 805}
801#else 806#endif
807
802asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 808asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
803{ 809{
804} 810}
805 811
806asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 812asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
807{ 813{
808} 814}
809#endif
810 815
811/* 816/*
812 * 'math_state_restore()' saves the current math information in the 817 * 'math_state_restore()' saves the current math information in the
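The traps.c change keeps weak, empty definitions of smp_thermal_interrupt() and smp_threshold_interrupt() so the interrupt entry code always has a target even when the corresponding MCE support is compiled out. A minimal demonstration of the same GCC/Clang weak-symbol fallback outside the kernel (the _demo name is invented; a strong definition linked in from another object file would override the weak one):

#include <stdio.h>

/* Weak default: a strong definition elsewhere replaces it at link time,
 * exactly like the empty fallback handlers kept in traps.c. */
void __attribute__((weak)) smp_threshold_interrupt_demo(void)
{
    printf("default (weak) threshold handler: nothing to do\n");
}

int main(void)
{
    /* With no strong override linked in, the weak default runs. */
    smp_threshold_interrupt_demo();
    return 0;
}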
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3e1c057e98fe..ae3180c506a6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/clocksource.h> 10#include <linux/clocksource.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/timex.h>
12 13
13#include <asm/hpet.h> 14#include <asm/hpet.h>
14#include <asm/timer.h> 15#include <asm/timer.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4c85b2e2bb65..367e87882041 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -108,6 +108,8 @@ SECTIONS
108 /* Data */ 108 /* Data */
109 . = ALIGN(PAGE_SIZE); 109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) { 110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 /* Start of data section */
112 _sdata = .;
111 DATA_DATA 113 DATA_DATA
112 CONSTRUCTORS 114 CONSTRUCTORS
113 115
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78cc..8600a09e0c6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
50 Provides support for KVM on Intel processors equipped with the VT 50 Provides support for KVM on Intel processors equipped with the VT
51 extensions. 51 extensions.
52 52
53 To compile this as a module, choose M here: the module
54 will be called kvm-intel.
55
53config KVM_AMD 56config KVM_AMD
54 tristate "KVM for AMD processors support" 57 tristate "KVM for AMD processors support"
55 depends on KVM 58 depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
57 Provides support for KVM on AMD processors equipped with the AMD-V 60 Provides support for KVM on AMD processors equipped with the AMD-V
58 (SVM) extensions. 61 (SVM) extensions.
59 62
63 To compile this as a module, choose M here: the module
64 will be called kvm-amd.
65
60config KVM_TRACE 66config KVM_TRACE
61 bool "KVM trace support" 67 bool "KVM trace support"
62 depends on KVM && SYSFS 68 depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f2..b43c4efafe80 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
15 15
16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ 16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
17 i8254.o 17 i8254.o timer.o
18obj-$(CONFIG_KVM) += kvm.o 18obj-$(CONFIG_KVM) += kvm.o
19kvm-intel-objs = vmx.o 19kvm-intel-objs = vmx.o
20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d3157..4d6f0d293ee2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
98 return kvm->arch.vpit->pit_state.channels[channel].gate; 98 return kvm->arch.vpit->pit_state.channels[channel].gate;
99} 99}
100 100
101static s64 __kpit_elapsed(struct kvm *kvm)
102{
103 s64 elapsed;
104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106
107 /*
108 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
110 * the highest count, either FFFF hex for binary counting
111 * or 9999 for BCD counting, and continues counting.
112 * Modes 2 and 3 are periodic; the Counter reloads
113 * itself with the initial count and continues counting
114 * from there.
115 */
116 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
117 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
118 elapsed = mod_64(elapsed, ps->pit_timer.period);
119
120 return elapsed;
121}
122
123static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
124 int channel)
125{
126 if (channel == 0)
127 return __kpit_elapsed(kvm);
128
129 return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
130}
131
101static int pit_get_count(struct kvm *kvm, int channel) 132static int pit_get_count(struct kvm *kvm, int channel)
102{ 133{
103 struct kvm_kpit_channel_state *c = 134 struct kvm_kpit_channel_state *c =
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
107 138
108 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 139 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
109 140
110 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 141 t = kpit_elapsed(kvm, c, channel);
111 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 142 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
112 143
113 switch (c->mode) { 144 switch (c->mode) {
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
137 168
138 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 169 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
139 170
140 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 171 t = kpit_elapsed(kvm, c, channel);
141 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 172 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
142 173
143 switch (c->mode) { 174 switch (c->mode) {
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 224 }
194} 225}
195 226
196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200
201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203
204 if (!pt->reinject)
205 atomic_set(&pt->pending, 1);
206
207 if (vcpu0 && waitqueue_active(&vcpu0->wq))
208 wake_up_interruptible(&vcpu0->wq);
209
210 hrtimer_add_expires_ns(&pt->timer, pt->period);
211 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
212 if (pt->period)
213 ps->channels[0].count_load_time = ktime_get();
214
215 return (pt->period == 0 ? 0 : 1);
216}
217
218int pit_has_pending_timer(struct kvm_vcpu *vcpu) 227int pit_has_pending_timer(struct kvm_vcpu *vcpu)
219{ 228{
220 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 229 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
235 spin_unlock(&ps->inject_lock); 244 spin_unlock(&ps->inject_lock);
236} 245}
237 246
238static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
239{
240 struct kvm_kpit_state *ps;
241 int restart_timer = 0;
242
243 ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
244
245 restart_timer = __pit_timer_fn(ps);
246
247 if (restart_timer)
248 return HRTIMER_RESTART;
249 else
250 return HRTIMER_NORESTART;
251}
252
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 247void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
254{ 248{
255 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 249 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 257 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 258}
265 259
266static void destroy_pit_timer(struct kvm_kpit_timer *pt) 260static void destroy_pit_timer(struct kvm_timer *pt)
267{ 261{
268 pr_debug("pit: execute del timer!\n"); 262 pr_debug("pit: execute del timer!\n");
269 hrtimer_cancel(&pt->timer); 263 hrtimer_cancel(&pt->timer);
270} 264}
271 265
266static bool kpit_is_periodic(struct kvm_timer *ktimer)
267{
268 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
269 pit_timer);
270 return ps->is_periodic;
271}
272
273static struct kvm_timer_ops kpit_ops = {
274 .is_periodic = kpit_is_periodic,
275};
276
272static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 277static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
273{ 278{
274 struct kvm_kpit_timer *pt = &ps->pit_timer; 279 struct kvm_timer *pt = &ps->pit_timer;
275 s64 interval; 280 s64 interval;
276 281
277 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 282 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
280 285
281 /* TODO The new value only affected after the retriggered */ 286 /* TODO The new value only affected after the retriggered */
282 hrtimer_cancel(&pt->timer); 287 hrtimer_cancel(&pt->timer);
283 pt->period = (is_period == 0) ? 0 : interval; 288 pt->period = interval;
284 pt->timer.function = pit_timer_fn; 289 ps->is_periodic = is_period;
290
291 pt->timer.function = kvm_timer_fn;
292 pt->t_ops = &kpit_ops;
293 pt->kvm = ps->pit->kvm;
294 pt->vcpu_id = 0;
295
285 atomic_set(&pt->pending, 0); 296 atomic_set(&pt->pending, 0);
286 ps->irq_ack = 1; 297 ps->irq_ack = 1;
287 298
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
298 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 309 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
299 310
300 /* 311 /*
301 * Though spec said the state of 8254 is undefined after power-up, 312 * The largest possible initial count is 0; this is equivalent
 302 * seems some tricky OS like Windows XP depends on IRQ0 interrupt 313 * to 2^16 for binary counting and 10^4 for BCD counting.
303 * when booting up.
304 * So here setting initialize rate for it, and not a specific number
305 */ 314 */
306 if (val == 0) 315 if (val == 0)
307 val = 0x10000; 316 val = 0x10000;
308 317
309 ps->channels[channel].count_load_time = ktime_get();
310 ps->channels[channel].count = val; 318 ps->channels[channel].count = val;
311 319
312 if (channel != 0) 320 if (channel != 0) {
321 ps->channels[channel].count_load_time = ktime_get();
313 return; 322 return;
323 }
314 324
315 /* Two types of timer 325 /* Two types of timer
316 * mode 1 is one shot, mode 2 is period, otherwise del timer */ 326 * mode 1 is one shot, mode 2 is period, otherwise del timer */
317 switch (ps->channels[0].mode) { 327 switch (ps->channels[0].mode) {
328 case 0:
318 case 1: 329 case 1:
319 /* FIXME: enhance mode 4 precision */ 330 /* FIXME: enhance mode 4 precision */
320 case 4: 331 case 4:
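__kpit_elapsed() above derives how far the emulated channel 0 counter has advanced from the backing hrtimer: elapsed = (period - remaining) mod period, which stays well defined for the periodic modes where the counter reloads itself. A stand-alone numeric sketch of that arithmetic (the nanosecond figures are made up, and mod_64() here is a plain modulo standing in for the kernel helper):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's mod_64() helper. */
static int64_t mod_64(int64_t x, int64_t y)
{
    return y ? x % y : x;
}

int main(void)
{
    /* A 100 Hz periodic PIT channel: period of 10,000,000 ns. */
    int64_t period_ns    = 10 * 1000 * 1000;
    /* Pretend the hrtimer reports 3,500,000 ns until the next expiry. */
    int64_t remaining_ns = 3500000;

    /* Same arithmetic as __kpit_elapsed(). */
    int64_t elapsed = mod_64(period_ns - remaining_ns, period_ns);
    printf("elapsed in current period: %" PRId64 " ns\n", elapsed);

    /* If the expiry has passed but the callback has not run yet,
     * "remaining" is negative; the modulo folds the result back
     * into the range [0, period). */
    remaining_ns = -1500000;
    elapsed = mod_64(period_ns - remaining_ns, period_ns);
    printf("after a late expiry:       %" PRId64 " ns\n", elapsed);

    return 0;
}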
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d5..bbd863ff60b7 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
3 3
4#include "iodev.h" 4#include "iodev.h"
5 5
6struct kvm_kpit_timer {
7 struct hrtimer timer;
8 int irq;
9 s64 period; /* unit: ns */
10 s64 scheduled;
11 atomic_t pending;
12 bool reinject;
13};
14
15struct kvm_kpit_channel_state { 6struct kvm_kpit_channel_state {
16 u32 count; /* can be 65536 */ 7 u32 count; /* can be 65536 */
17 u16 latched_count; 8 u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
30 21
31struct kvm_kpit_state { 22struct kvm_kpit_state {
32 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
33 struct kvm_kpit_timer pit_timer; 24 struct kvm_timer pit_timer;
25 bool is_periodic;
34 u32 speaker_data_on; 26 u32 speaker_data_on;
35 struct mutex lock; 27 struct mutex lock;
36 struct kvm_pit *pit; 28 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6fb..96dfbb6ad2a9 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
24 24
25#include "irq.h" 25#include "irq.h"
26#include "i8254.h" 26#include "i8254.h"
27#include "x86.h"
27 28
28/* 29/*
29 * check if there are pending timer events 30 * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
48{ 49{
49 struct kvm_pic *s; 50 struct kvm_pic *s;
50 51
52 if (!irqchip_in_kernel(v->kvm))
53 return v->arch.interrupt.pending;
54
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 55 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
52 if (kvm_apic_accept_pic_intr(v)) { 56 if (kvm_apic_accept_pic_intr(v)) {
53 s = pic_irqchip(v->kvm); /* PIC */ 57 s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
67 struct kvm_pic *s; 71 struct kvm_pic *s;
68 int vector; 72 int vector;
69 73
74 if (!irqchip_in_kernel(v->kvm))
75 return v->arch.interrupt.nr;
76
70 vector = kvm_get_apic_interrupt(v); /* APIC */ 77 vector = kvm_get_apic_interrupt(v); /* APIC */
71 if (vector == -1) { 78 if (vector == -1) {
72 if (kvm_apic_accept_pic_intr(v)) { 79 if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 000000000000..26bd6ba74e1c
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 atomic_t pending; /* accumulated triggered timers */
6 bool reinject;
7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm;
9 int vcpu_id;
10};
11
12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *);
14};
15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd69..ae99d83f81a3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
196} 196}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 198
199int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) 199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode);
201
202int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
200{ 203{
201 struct kvm_lapic *apic = vcpu->arch.apic; 204 struct kvm_lapic *apic = vcpu->arch.apic;
202 205
203 if (!apic_test_and_set_irr(vec, apic)) { 206 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
204 /* a new pending irq is set in IRR */ 207 irq->level, irq->trig_mode);
205 if (trig)
206 apic_set_vector(vec, apic->regs + APIC_TMR);
207 else
208 apic_clear_vector(vec, apic->regs + APIC_TMR);
209 kvm_vcpu_kick(apic->vcpu);
210 return 1;
211 }
212 return 0;
213} 208}
214 209
215static inline int apic_find_highest_isr(struct kvm_lapic *apic) 210static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
250 245
251int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) 246int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
252{ 247{
253 return kvm_apic_id(apic) == dest; 248 return dest == 0xff || kvm_apic_id(apic) == dest;
254} 249}
255 250
256int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
279 return result; 274 return result;
280} 275}
281 276
282static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 277int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
283 int short_hand, int dest, int dest_mode) 278 int short_hand, int dest, int dest_mode)
284{ 279{
285 int result = 0; 280 int result = 0;
286 struct kvm_lapic *target = vcpu->arch.apic; 281 struct kvm_lapic *target = vcpu->arch.apic;
287 282
288 apic_debug("target %p, source %p, dest 0x%x, " 283 apic_debug("target %p, source %p, dest 0x%x, "
289 "dest_mode 0x%x, short_hand 0x%x", 284 "dest_mode 0x%x, short_hand 0x%x\n",
290 target, source, dest, dest_mode, short_hand); 285 target, source, dest, dest_mode, short_hand);
291 286
292 ASSERT(!target); 287 ASSERT(!target);
293 switch (short_hand) { 288 switch (short_hand) {
294 case APIC_DEST_NOSHORT: 289 case APIC_DEST_NOSHORT:
295 if (dest_mode == 0) { 290 if (dest_mode == 0)
296 /* Physical mode. */ 291 /* Physical mode. */
297 if ((dest == 0xFF) || (dest == kvm_apic_id(target))) 292 result = kvm_apic_match_physical_addr(target, dest);
298 result = 1; 293 else
299 } else
300 /* Logical mode. */ 294 /* Logical mode. */
301 result = kvm_apic_match_logical_addr(target, dest); 295 result = kvm_apic_match_logical_addr(target, dest);
302 break; 296 break;
303 case APIC_DEST_SELF: 297 case APIC_DEST_SELF:
304 if (target == source) 298 result = (target == source);
305 result = 1;
306 break; 299 break;
307 case APIC_DEST_ALLINC: 300 case APIC_DEST_ALLINC:
308 result = 1; 301 result = 1;
309 break; 302 break;
310 case APIC_DEST_ALLBUT: 303 case APIC_DEST_ALLBUT:
311 if (target != source) 304 result = (target != source);
312 result = 1;
313 break; 305 break;
314 default: 306 default:
315 printk(KERN_WARNING "Bad dest shorthand value %x\n", 307 printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
327static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 319static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
328 int vector, int level, int trig_mode) 320 int vector, int level, int trig_mode)
329{ 321{
330 int orig_irr, result = 0; 322 int result = 0;
331 struct kvm_vcpu *vcpu = apic->vcpu; 323 struct kvm_vcpu *vcpu = apic->vcpu;
332 324
333 switch (delivery_mode) { 325 switch (delivery_mode) {
334 case APIC_DM_FIXED:
335 case APIC_DM_LOWEST: 326 case APIC_DM_LOWEST:
327 vcpu->arch.apic_arb_prio++;
328 case APIC_DM_FIXED:
336 /* FIXME add logic for vcpu on reset */ 329 /* FIXME add logic for vcpu on reset */
337 if (unlikely(!apic_enabled(apic))) 330 if (unlikely(!apic_enabled(apic)))
338 break; 331 break;
339 332
340 orig_irr = apic_test_and_set_irr(vector, apic); 333 result = !apic_test_and_set_irr(vector, apic);
341 if (orig_irr && trig_mode) { 334 if (!result) {
342 apic_debug("level trig mode repeatedly for vector %d", 335 if (trig_mode)
343 vector); 336 apic_debug("level trig mode repeatedly for "
337 "vector %d", vector);
344 break; 338 break;
345 } 339 }
346 340
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
349 apic_set_vector(vector, apic->regs + APIC_TMR); 343 apic_set_vector(vector, apic->regs + APIC_TMR);
350 } else 344 } else
351 apic_clear_vector(vector, apic->regs + APIC_TMR); 345 apic_clear_vector(vector, apic->regs + APIC_TMR);
352
353 kvm_vcpu_kick(vcpu); 346 kvm_vcpu_kick(vcpu);
354
355 result = (orig_irr == 0);
356 break; 347 break;
357 348
358 case APIC_DM_REMRD: 349 case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
364 break; 355 break;
365 356
366 case APIC_DM_NMI: 357 case APIC_DM_NMI:
358 result = 1;
367 kvm_inject_nmi(vcpu); 359 kvm_inject_nmi(vcpu);
368 kvm_vcpu_kick(vcpu); 360 kvm_vcpu_kick(vcpu);
369 break; 361 break;
370 362
371 case APIC_DM_INIT: 363 case APIC_DM_INIT:
372 if (level) { 364 if (level) {
365 result = 1;
373 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 366 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
374 printk(KERN_DEBUG 367 printk(KERN_DEBUG
375 "INIT on a runnable vcpu %d\n", 368 "INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
386 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 379 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
387 vcpu->vcpu_id, vector); 380 vcpu->vcpu_id, vector);
388 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 381 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
382 result = 1;
389 vcpu->arch.sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
390 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
391 kvm_vcpu_kick(vcpu); 385 kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
408 return result; 402 return result;
409} 403}
410 404
411static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 405int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
412 unsigned long bitmap)
413{
414 int last;
415 int next;
416 struct kvm_lapic *apic = NULL;
417
418 last = kvm->arch.round_robin_prev_vcpu;
419 next = last;
420
421 do {
422 if (++next == KVM_MAX_VCPUS)
423 next = 0;
424 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
425 continue;
426 apic = kvm->vcpus[next]->arch.apic;
427 if (apic && apic_enabled(apic))
428 break;
429 apic = NULL;
430 } while (next != last);
431 kvm->arch.round_robin_prev_vcpu = next;
432
433 if (!apic)
434 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
435
436 return apic;
437}
438
439struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
440 unsigned long bitmap)
441{ 406{
442 struct kvm_lapic *apic; 407 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
443
444 apic = kvm_apic_round_robin(kvm, vector, bitmap);
445 if (apic)
446 return apic->vcpu;
447 return NULL;
448} 408}
449 409
450static void apic_set_eoi(struct kvm_lapic *apic) 410static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
472{ 432{
473 u32 icr_low = apic_get_reg(apic, APIC_ICR); 433 u32 icr_low = apic_get_reg(apic, APIC_ICR);
474 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 434 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
435 struct kvm_lapic_irq irq;
475 436
476 unsigned int dest = GET_APIC_DEST_FIELD(icr_high); 437 irq.vector = icr_low & APIC_VECTOR_MASK;
477 unsigned int short_hand = icr_low & APIC_SHORT_MASK; 438 irq.delivery_mode = icr_low & APIC_MODE_MASK;
478 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; 439 irq.dest_mode = icr_low & APIC_DEST_MASK;
479 unsigned int level = icr_low & APIC_INT_ASSERT; 440 irq.level = icr_low & APIC_INT_ASSERT;
480 unsigned int dest_mode = icr_low & APIC_DEST_MASK; 441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
481 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 442 irq.shorthand = icr_low & APIC_SHORT_MASK;
482 unsigned int vector = icr_low & APIC_VECTOR_MASK; 443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
483
484 struct kvm_vcpu *target;
485 struct kvm_vcpu *vcpu;
486 unsigned long lpr_map = 0;
487 int i;
488 444
489 apic_debug("icr_high 0x%x, icr_low 0x%x, " 445 apic_debug("icr_high 0x%x, icr_low 0x%x, "
490 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
491 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", 447 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
492 icr_high, icr_low, short_hand, dest, 448 icr_high, icr_low, irq.shorthand, irq.dest_id,
493 trig_mode, level, dest_mode, delivery_mode, vector); 449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
494 450 irq.vector);
495 for (i = 0; i < KVM_MAX_VCPUS; i++) {
496 vcpu = apic->vcpu->kvm->vcpus[i];
497 if (!vcpu)
498 continue;
499
500 if (vcpu->arch.apic &&
501 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
502 if (delivery_mode == APIC_DM_LOWEST)
503 set_bit(vcpu->vcpu_id, &lpr_map);
504 else
505 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
506 vector, level, trig_mode);
507 }
508 }
509 451
510 if (delivery_mode == APIC_DM_LOWEST) { 452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
511 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
512 if (target != NULL)
513 __apic_accept_irq(target->arch.apic, delivery_mode,
514 vector, level, trig_mode);
515 }
516} 453}
517 454
518static u32 apic_get_tmcct(struct kvm_lapic *apic) 455static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
527 if (apic_get_reg(apic, APIC_TMICT) == 0) 464 if (apic_get_reg(apic, APIC_TMICT) == 0)
528 return 0; 465 return 0;
529 466
530 remaining = hrtimer_expires_remaining(&apic->timer.dev); 467 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
531 if (ktime_to_ns(remaining) < 0) 468 if (ktime_to_ns(remaining) < 0)
532 remaining = ktime_set(0, 0); 469 remaining = ktime_set(0, 0);
533 470
534 ns = mod_64(ktime_to_ns(remaining), apic->timer.period); 471 ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
535 tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); 472 tmcct = div64_u64(ns,
473 (APIC_BUS_CYCLE_NS * apic->divide_count));
536 474
537 return tmcct; 475 return tmcct;
538} 476}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
619 tdcr = apic_get_reg(apic, APIC_TDCR); 557 tdcr = apic_get_reg(apic, APIC_TDCR);
620 tmp1 = tdcr & 0xf; 558 tmp1 = tdcr & 0xf;
621 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 559 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
622 apic->timer.divide_count = 0x1 << (tmp2 & 0x7); 560 apic->divide_count = 0x1 << (tmp2 & 0x7);
623 561
624 apic_debug("timer divide count is 0x%x\n", 562 apic_debug("timer divide count is 0x%x\n",
625 apic->timer.divide_count); 563 apic->divide_count);
626} 564}
627 565
628static void start_apic_timer(struct kvm_lapic *apic) 566static void start_apic_timer(struct kvm_lapic *apic)
629{ 567{
630 ktime_t now = apic->timer.dev.base->get_time(); 568 ktime_t now = apic->lapic_timer.timer.base->get_time();
631 569
632 apic->timer.period = apic_get_reg(apic, APIC_TMICT) * 570 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
633 APIC_BUS_CYCLE_NS * apic->timer.divide_count; 571 APIC_BUS_CYCLE_NS * apic->divide_count;
634 atomic_set(&apic->timer.pending, 0); 572 atomic_set(&apic->lapic_timer.pending, 0);
635 573
636 if (!apic->timer.period) 574 if (!apic->lapic_timer.period)
637 return; 575 return;
638 576
639 hrtimer_start(&apic->timer.dev, 577 hrtimer_start(&apic->lapic_timer.timer,
640 ktime_add_ns(now, apic->timer.period), 578 ktime_add_ns(now, apic->lapic_timer.period),
641 HRTIMER_MODE_ABS); 579 HRTIMER_MODE_ABS);
642 580
643 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 581 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
646 "expire @ 0x%016" PRIx64 ".\n", __func__, 584 "expire @ 0x%016" PRIx64 ".\n", __func__,
647 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 585 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
648 apic_get_reg(apic, APIC_TMICT), 586 apic_get_reg(apic, APIC_TMICT),
649 apic->timer.period, 587 apic->lapic_timer.period,
650 ktime_to_ns(ktime_add_ns(now, 588 ktime_to_ns(ktime_add_ns(now,
651 apic->timer.period))); 589 apic->lapic_timer.period)));
652} 590}
653 591
654static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 592static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
730 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 668 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
731 lvt_val | APIC_LVT_MASKED); 669 lvt_val | APIC_LVT_MASKED);
732 } 670 }
733 atomic_set(&apic->timer.pending, 0); 671 atomic_set(&apic->lapic_timer.pending, 0);
734 672
735 } 673 }
736 break; 674 break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
762 break; 700 break;
763 701
764 case APIC_TMICT: 702 case APIC_TMICT:
765 hrtimer_cancel(&apic->timer.dev); 703 hrtimer_cancel(&apic->lapic_timer.timer);
766 apic_set_reg(apic, APIC_TMICT, val); 704 apic_set_reg(apic, APIC_TMICT, val);
767 start_apic_timer(apic); 705 start_apic_timer(apic);
768 return; 706 return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
802 if (!vcpu->arch.apic) 740 if (!vcpu->arch.apic)
803 return; 741 return;
804 742
805 hrtimer_cancel(&vcpu->arch.apic->timer.dev); 743 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
806 744
807 if (vcpu->arch.apic->regs_page) 745 if (vcpu->arch.apic->regs_page)
808 __free_page(vcpu->arch.apic->regs_page); 746 __free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
880 ASSERT(apic != NULL); 818 ASSERT(apic != NULL);
881 819
882 /* Stop the timer in case it's a reset to an active apic */ 820 /* Stop the timer in case it's a reset to an active apic */
883 hrtimer_cancel(&apic->timer.dev); 821 hrtimer_cancel(&apic->lapic_timer.timer);
884 822
885 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
886 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 824 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
905 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
906 } 844 }
907 update_divide_count(apic); 845 update_divide_count(apic);
908 atomic_set(&apic->timer.pending, 0); 846 atomic_set(&apic->lapic_timer.pending, 0);
909 if (vcpu->vcpu_id == 0) 847 if (vcpu->vcpu_id == 0)
910 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
911 apic_update_ppr(apic); 849 apic_update_ppr(apic);
912 850
851 vcpu->arch.apic_arb_prio = 0;
852
913 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 853 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
914 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 854 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
915 vcpu, kvm_apic_id(apic), 855 vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
917} 857}
918EXPORT_SYMBOL_GPL(kvm_lapic_reset); 858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
919 859
920int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 860bool kvm_apic_present(struct kvm_vcpu *vcpu)
921{ 861{
922 struct kvm_lapic *apic = vcpu->arch.apic; 862 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
923 int ret = 0; 863}
924
925 if (!apic)
926 return 0;
927 ret = apic_enabled(apic);
928 864
929 return ret; 865int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
930} 868}
931EXPORT_SYMBOL_GPL(kvm_lapic_enabled); 869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
932 870
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
936 *---------------------------------------------------------------------- 874 *----------------------------------------------------------------------
937 */ 875 */
938 876
939/* TODO: make sure __apic_timer_fn runs in current pCPU */ 877static bool lapic_is_periodic(struct kvm_timer *ktimer)
940static int __apic_timer_fn(struct kvm_lapic *apic)
941{ 878{
942 int result = 0; 879 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
943 wait_queue_head_t *q = &apic->vcpu->wq; 880 lapic_timer);
944 881 return apic_lvtt_period(apic);
945 if(!atomic_inc_and_test(&apic->timer.pending))
946 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
947 if (waitqueue_active(q))
948 wake_up_interruptible(q);
949
950 if (apic_lvtt_period(apic)) {
951 result = 1;
952 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
953 }
954 return result;
955} 882}
956 883
957int apic_has_pending_timer(struct kvm_vcpu *vcpu) 884int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
959 struct kvm_lapic *lapic = vcpu->arch.apic; 886 struct kvm_lapic *lapic = vcpu->arch.apic;
960 887
961 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 888 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
962 return atomic_read(&lapic->timer.pending); 889 return atomic_read(&lapic->lapic_timer.pending);
963 890
964 return 0; 891 return 0;
965} 892}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
986 kvm_apic_local_deliver(apic, APIC_LVT0); 913 kvm_apic_local_deliver(apic, APIC_LVT0);
987} 914}
988 915
989static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 916static struct kvm_timer_ops lapic_timer_ops = {
990{ 917 .is_periodic = lapic_is_periodic,
991 struct kvm_lapic *apic; 918};
992 int restart_timer = 0;
993
994 apic = container_of(data, struct kvm_lapic, timer.dev);
995
996 restart_timer = __apic_timer_fn(apic);
997
998 if (restart_timer)
999 return HRTIMER_RESTART;
1000 else
1001 return HRTIMER_NORESTART;
1002}
1003 919
1004int kvm_create_lapic(struct kvm_vcpu *vcpu) 920int kvm_create_lapic(struct kvm_vcpu *vcpu)
1005{ 921{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1024 memset(apic->regs, 0, PAGE_SIZE); 940 memset(apic->regs, 0, PAGE_SIZE);
1025 apic->vcpu = vcpu; 941 apic->vcpu = vcpu;
1026 942
1027 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 943 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1028 apic->timer.dev.function = apic_timer_fn; 944 HRTIMER_MODE_ABS);
945 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
949
1029 apic->base_address = APIC_DEFAULT_PHYS_BASE; 950 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1030 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1031 952
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1078{ 999{
1079 struct kvm_lapic *apic = vcpu->arch.apic; 1000 struct kvm_lapic *apic = vcpu->arch.apic;
1080 1001
1081 if (apic && atomic_read(&apic->timer.pending) > 0) { 1002 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
1082 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1003 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1083 atomic_dec(&apic->timer.pending); 1004 atomic_dec(&apic->lapic_timer.pending);
1084 } 1005 }
1085} 1006}
1086 1007
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1106 MSR_IA32_APICBASE_BASE; 1027 MSR_IA32_APICBASE_BASE;
1107 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1108 apic_update_ppr(apic); 1029 apic_update_ppr(apic);
1109 hrtimer_cancel(&apic->timer.dev); 1030 hrtimer_cancel(&apic->lapic_timer.timer);
1110 update_divide_count(apic); 1031 update_divide_count(apic);
1111 start_apic_timer(apic); 1032 start_apic_timer(apic);
1112} 1033}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1119 if (!apic) 1040 if (!apic)
1120 return; 1041 return;
1121 1042
1122 timer = &apic->timer.dev; 1043 timer = &apic->lapic_timer.timer;
1123 if (hrtimer_cancel(timer)) 1044 if (hrtimer_cancel(timer))
1124 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1045 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1125} 1046}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee71209..a587f8349c46 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
5 6
6#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
7 8
8struct kvm_lapic { 9struct kvm_lapic {
9 unsigned long base_address; 10 unsigned long base_address;
10 struct kvm_io_device dev; 11 struct kvm_io_device dev;
11 struct { 12 struct kvm_timer lapic_timer;
12 atomic_t pending; 13 u32 divide_count;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 struct hrtimer dev;
16 } timer;
17 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
18 struct page *regs_page; 15 struct page *regs_page;
19 void *regs; 16 void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 31
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); 34int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
38 35
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 36u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 37void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 38void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 39int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
40bool kvm_apic_present(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 41int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44 42
45void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 43void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728a..5c3d6e81a7dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
126#define PFERR_PRESENT_MASK (1U << 0) 126#define PFERR_PRESENT_MASK (1U << 0)
127#define PFERR_WRITE_MASK (1U << 1) 127#define PFERR_WRITE_MASK (1U << 1)
128#define PFERR_USER_MASK (1U << 2) 128#define PFERR_USER_MASK (1U << 2)
129#define PFERR_RSVD_MASK (1U << 3)
129#define PFERR_FETCH_MASK (1U << 4) 130#define PFERR_FETCH_MASK (1U << 4)
130 131
131#define PT_DIRECTORY_LEVEL 2 132#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
177static u64 __read_mostly shadow_user_mask; 178static u64 __read_mostly shadow_user_mask;
178static u64 __read_mostly shadow_accessed_mask; 179static u64 __read_mostly shadow_accessed_mask;
179static u64 __read_mostly shadow_dirty_mask; 180static u64 __read_mostly shadow_dirty_mask;
180static u64 __read_mostly shadow_mt_mask; 181
182static inline u64 rsvd_bits(int s, int e)
183{
184 return ((1ULL << (e - s + 1)) - 1) << s;
185}
181 186
182void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 187void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
183{ 188{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
193EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 198EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
194 199
195void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 200void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
196 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) 201 u64 dirty_mask, u64 nx_mask, u64 x_mask)
197{ 202{
198 shadow_user_mask = user_mask; 203 shadow_user_mask = user_mask;
199 shadow_accessed_mask = accessed_mask; 204 shadow_accessed_mask = accessed_mask;
200 shadow_dirty_mask = dirty_mask; 205 shadow_dirty_mask = dirty_mask;
201 shadow_nx_mask = nx_mask; 206 shadow_nx_mask = nx_mask;
202 shadow_x_mask = x_mask; 207 shadow_x_mask = x_mask;
203 shadow_mt_mask = mt_mask;
204} 208}
205EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 209EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
206 210
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
219 return vcpu->arch.shadow_efer & EFER_NX; 223 return vcpu->arch.shadow_efer & EFER_NX;
220} 224}
221 225
222static int is_present_pte(unsigned long pte)
223{
224 return pte & PT_PRESENT_MASK;
225}
226
227static int is_shadow_present_pte(u64 pte) 226static int is_shadow_present_pte(u64 pte)
228{ 227{
229 return pte != shadow_trap_nonpresent_pte 228 return pte != shadow_trap_nonpresent_pte
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1074 return NULL; 1073 return NULL;
1075} 1074}
1076 1075
1077static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1078{
1079 list_del(&sp->oos_link);
1080 --kvm->stat.mmu_unsync_global;
1081}
1082
1083static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1076static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1084{ 1077{
1085 WARN_ON(!sp->unsync); 1078 WARN_ON(!sp->unsync);
1086 sp->unsync = 0; 1079 sp->unsync = 0;
1087 if (sp->global)
1088 kvm_unlink_unsync_global(kvm, sp);
1089 --kvm->stat.mmu_unsync; 1080 --kvm->stat.mmu_unsync;
1090} 1081}
1091 1082
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1248 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); 1239 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1249 sp->gfn = gfn; 1240 sp->gfn = gfn;
1250 sp->role = role; 1241 sp->role = role;
1251 sp->global = 0;
1252 hlist_add_head(&sp->hash_link, bucket); 1242 hlist_add_head(&sp->hash_link, bucket);
1253 if (!direct) { 1243 if (!direct) {
1254 if (rmap_write_protect(vcpu->kvm, gfn)) 1244 if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1616 return mtrr_state->def_type; 1606 return mtrr_state->def_type;
1617} 1607}
1618 1608
1619static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) 1609u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1620{ 1610{
1621 u8 mtrr; 1611 u8 mtrr;
1622 1612
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1626 mtrr = MTRR_TYPE_WRBACK; 1616 mtrr = MTRR_TYPE_WRBACK;
1627 return mtrr; 1617 return mtrr;
1628} 1618}
1619EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1629 1620
1630static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1621static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1631{ 1622{
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 ++vcpu->kvm->stat.mmu_unsync; 1637 ++vcpu->kvm->stat.mmu_unsync;
1647 sp->unsync = 1; 1638 sp->unsync = 1;
1648 1639
1649 if (sp->global) { 1640 kvm_mmu_mark_parents_unsync(vcpu, sp);
1650 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1651 ++vcpu->kvm->stat.mmu_unsync_global;
1652 } else
1653 kvm_mmu_mark_parents_unsync(vcpu, sp);
1654 1641
1655 mmu_convert_notrap(sp); 1642 mmu_convert_notrap(sp);
1656 return 0; 1643 return 0;
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1677static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1664static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1678 unsigned pte_access, int user_fault, 1665 unsigned pte_access, int user_fault,
1679 int write_fault, int dirty, int largepage, 1666 int write_fault, int dirty, int largepage,
1680 int global, gfn_t gfn, pfn_t pfn, bool speculative, 1667 gfn_t gfn, pfn_t pfn, bool speculative,
1681 bool can_unsync) 1668 bool can_unsync)
1682{ 1669{
1683 u64 spte; 1670 u64 spte;
1684 int ret = 0; 1671 int ret = 0;
1685 u64 mt_mask = shadow_mt_mask;
1686 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1687
1688 if (!global && sp->global) {
1689 sp->global = 0;
1690 if (sp->unsync) {
1691 kvm_unlink_unsync_global(vcpu->kvm, sp);
1692 kvm_mmu_mark_parents_unsync(vcpu, sp);
1693 }
1694 }
1695 1672
1696 /* 1673 /*
1697 * We don't set the accessed bit, since we sometimes want to see 1674 * We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1711 spte |= shadow_user_mask; 1688 spte |= shadow_user_mask;
1712 if (largepage) 1689 if (largepage)
1713 spte |= PT_PAGE_SIZE_MASK; 1690 spte |= PT_PAGE_SIZE_MASK;
1714 if (mt_mask) { 1691 if (tdp_enabled)
1715 if (!kvm_is_mmio_pfn(pfn)) { 1692 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1716 mt_mask = get_memory_type(vcpu, gfn) << 1693 kvm_is_mmio_pfn(pfn));
1717 kvm_x86_ops->get_mt_mask_shift();
1718 mt_mask |= VMX_EPT_IGMT_BIT;
1719 } else
1720 mt_mask = MTRR_TYPE_UNCACHABLE <<
1721 kvm_x86_ops->get_mt_mask_shift();
1722 spte |= mt_mask;
1723 }
1724 1694
1725 spte |= (u64)pfn << PAGE_SHIFT; 1695 spte |= (u64)pfn << PAGE_SHIFT;
1726 1696
@@ -1765,8 +1735,8 @@ set_pte:
1765static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1735static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1766 unsigned pt_access, unsigned pte_access, 1736 unsigned pt_access, unsigned pte_access,
1767 int user_fault, int write_fault, int dirty, 1737 int user_fault, int write_fault, int dirty,
1768 int *ptwrite, int largepage, int global, 1738 int *ptwrite, int largepage, gfn_t gfn,
1769 gfn_t gfn, pfn_t pfn, bool speculative) 1739 pfn_t pfn, bool speculative)
1770{ 1740{
1771 int was_rmapped = 0; 1741 int was_rmapped = 0;
1772 int was_writeble = is_writeble_pte(*shadow_pte); 1742 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1795 was_rmapped = 1; 1765 was_rmapped = 1;
1796 } 1766 }
1797 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1767 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1798 dirty, largepage, global, gfn, pfn, speculative, true)) { 1768 dirty, largepage, gfn, pfn, speculative, true)) {
1799 if (write_fault) 1769 if (write_fault)
1800 *ptwrite = 1; 1770 *ptwrite = 1;
1801 kvm_x86_ops->tlb_flush(vcpu); 1771 kvm_x86_ops->tlb_flush(vcpu);
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1843 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { 1813 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1844 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1814 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1845 0, write, 1, &pt_write, 1815 0, write, 1, &pt_write,
1846 largepage, 0, gfn, pfn, false); 1816 largepage, gfn, pfn, false);
1847 ++vcpu->stat.pf_fixed; 1817 ++vcpu->stat.pf_fixed;
1848 break; 1818 break;
1849 } 1819 }
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1942 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 1912 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1943} 1913}
1944 1914
1945static void mmu_alloc_roots(struct kvm_vcpu *vcpu) 1915static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1916{
1917 int ret = 0;
1918
1919 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1920 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1921 ret = 1;
1922 }
1923
1924 return ret;
1925}
1926
1927static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1946{ 1928{
1947 int i; 1929 int i;
1948 gfn_t root_gfn; 1930 gfn_t root_gfn;
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1957 ASSERT(!VALID_PAGE(root)); 1939 ASSERT(!VALID_PAGE(root));
1958 if (tdp_enabled) 1940 if (tdp_enabled)
1959 direct = 1; 1941 direct = 1;
1942 if (mmu_check_root(vcpu, root_gfn))
1943 return 1;
1960 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1944 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1961 PT64_ROOT_LEVEL, direct, 1945 PT64_ROOT_LEVEL, direct,
1962 ACC_ALL, NULL); 1946 ACC_ALL, NULL);
1963 root = __pa(sp->spt); 1947 root = __pa(sp->spt);
1964 ++sp->root_count; 1948 ++sp->root_count;
1965 vcpu->arch.mmu.root_hpa = root; 1949 vcpu->arch.mmu.root_hpa = root;
1966 return; 1950 return 0;
1967 } 1951 }
1968 direct = !is_paging(vcpu); 1952 direct = !is_paging(vcpu);
1969 if (tdp_enabled) 1953 if (tdp_enabled)
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1980 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 1964 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1981 } else if (vcpu->arch.mmu.root_level == 0) 1965 } else if (vcpu->arch.mmu.root_level == 0)
1982 root_gfn = 0; 1966 root_gfn = 0;
1967 if (mmu_check_root(vcpu, root_gfn))
1968 return 1;
1983 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1969 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1984 PT32_ROOT_LEVEL, direct, 1970 PT32_ROOT_LEVEL, direct,
1985 ACC_ALL, NULL); 1971 ACC_ALL, NULL);
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1988 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 1974 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1989 } 1975 }
1990 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1976 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1977 return 0;
1991} 1978}
1992 1979
1993static void mmu_sync_roots(struct kvm_vcpu *vcpu) 1980static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2006 for (i = 0; i < 4; ++i) { 1993 for (i = 0; i < 4; ++i) {
2007 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1994 hpa_t root = vcpu->arch.mmu.pae_root[i];
2008 1995
2009 if (root) { 1996 if (root && VALID_PAGE(root)) {
2010 root &= PT64_BASE_ADDR_MASK; 1997 root &= PT64_BASE_ADDR_MASK;
2011 sp = page_header(root); 1998 sp = page_header(root);
2012 mmu_sync_children(vcpu, sp); 1999 mmu_sync_children(vcpu, sp);
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2014 } 2001 }
2015} 2002}
2016 2003
2017static void mmu_sync_global(struct kvm_vcpu *vcpu)
2018{
2019 struct kvm *kvm = vcpu->kvm;
2020 struct kvm_mmu_page *sp, *n;
2021
2022 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2023 kvm_sync_page(vcpu, sp);
2024}
2025
2026void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2004void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2027{ 2005{
2028 spin_lock(&vcpu->kvm->mmu_lock); 2006 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2030 spin_unlock(&vcpu->kvm->mmu_lock); 2008 spin_unlock(&vcpu->kvm->mmu_lock);
2031} 2009}
2032 2010
2033void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2034{
2035 spin_lock(&vcpu->kvm->mmu_lock);
2036 mmu_sync_global(vcpu);
2037 spin_unlock(&vcpu->kvm->mmu_lock);
2038}
2039
2040static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2011static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
2041{ 2012{
2042 return vaddr; 2013 return vaddr;
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
2151 nonpaging_free(vcpu); 2122 nonpaging_free(vcpu);
2152} 2123}
2153 2124
2125static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2126{
2127 int bit7;
2128
2129 bit7 = (gpte >> 7) & 1;
2130 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2131}
2132
2154#define PTTYPE 64 2133#define PTTYPE 64
2155#include "paging_tmpl.h" 2134#include "paging_tmpl.h"
2156#undef PTTYPE 2135#undef PTTYPE
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
2159#include "paging_tmpl.h" 2138#include "paging_tmpl.h"
2160#undef PTTYPE 2139#undef PTTYPE
2161 2140
2141static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2142{
2143 struct kvm_mmu *context = &vcpu->arch.mmu;
2144 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2145 u64 exb_bit_rsvd = 0;
2146
2147 if (!is_nx(vcpu))
2148 exb_bit_rsvd = rsvd_bits(63, 63);
2149 switch (level) {
2150 case PT32_ROOT_LEVEL:
2151 /* no rsvd bits for 2 level 4K page table entries */
2152 context->rsvd_bits_mask[0][1] = 0;
2153 context->rsvd_bits_mask[0][0] = 0;
2154 if (is_cpuid_PSE36())
2155 /* 36bits PSE 4MB page */
2156 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2157 else
2158 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull;
2161 break;
2162 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] =
2164 rsvd_bits(maxphyaddr, 63) |
2165 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2166 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2167 rsvd_bits(maxphyaddr, 62); /* PDE */
2168 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2169 rsvd_bits(maxphyaddr, 62); /* PTE */
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull;
2174 break;
2175 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2177 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2178 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2179 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2180 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2181 rsvd_bits(maxphyaddr, 51);
2182 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2183 rsvd_bits(maxphyaddr, 51);
2184 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2185 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull;
2190 break;
2191 }
2192}
2193
2162static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2194static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2163{ 2195{
2164 struct kvm_mmu *context = &vcpu->arch.mmu; 2196 struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2179 2211
2180static int paging64_init_context(struct kvm_vcpu *vcpu) 2212static int paging64_init_context(struct kvm_vcpu *vcpu)
2181{ 2213{
2214 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2182 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2215 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2183} 2216}
2184 2217
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2186{ 2219{
2187 struct kvm_mmu *context = &vcpu->arch.mmu; 2220 struct kvm_mmu *context = &vcpu->arch.mmu;
2188 2221
2222 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2189 context->new_cr3 = paging_new_cr3; 2223 context->new_cr3 = paging_new_cr3;
2190 context->page_fault = paging32_page_fault; 2224 context->page_fault = paging32_page_fault;
2191 context->gva_to_gpa = paging32_gva_to_gpa; 2225 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2201 2235
2202static int paging32E_init_context(struct kvm_vcpu *vcpu) 2236static int paging32E_init_context(struct kvm_vcpu *vcpu)
2203{ 2237{
2238 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2204 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2239 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2205} 2240}
2206 2241
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2221 context->gva_to_gpa = nonpaging_gva_to_gpa; 2256 context->gva_to_gpa = nonpaging_gva_to_gpa;
2222 context->root_level = 0; 2257 context->root_level = 0;
2223 } else if (is_long_mode(vcpu)) { 2258 } else if (is_long_mode(vcpu)) {
2259 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2224 context->gva_to_gpa = paging64_gva_to_gpa; 2260 context->gva_to_gpa = paging64_gva_to_gpa;
2225 context->root_level = PT64_ROOT_LEVEL; 2261 context->root_level = PT64_ROOT_LEVEL;
2226 } else if (is_pae(vcpu)) { 2262 } else if (is_pae(vcpu)) {
2263 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2227 context->gva_to_gpa = paging64_gva_to_gpa; 2264 context->gva_to_gpa = paging64_gva_to_gpa;
2228 context->root_level = PT32E_ROOT_LEVEL; 2265 context->root_level = PT32E_ROOT_LEVEL;
2229 } else { 2266 } else {
2267 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2230 context->gva_to_gpa = paging32_gva_to_gpa; 2268 context->gva_to_gpa = paging32_gva_to_gpa;
2231 context->root_level = PT32_ROOT_LEVEL; 2269 context->root_level = PT32_ROOT_LEVEL;
2232 } 2270 }
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2290 goto out; 2328 goto out;
2291 spin_lock(&vcpu->kvm->mmu_lock); 2329 spin_lock(&vcpu->kvm->mmu_lock);
2292 kvm_mmu_free_some_pages(vcpu); 2330 kvm_mmu_free_some_pages(vcpu);
2293 mmu_alloc_roots(vcpu); 2331 r = mmu_alloc_roots(vcpu);
2294 mmu_sync_roots(vcpu); 2332 mmu_sync_roots(vcpu);
2295 spin_unlock(&vcpu->kvm->mmu_lock); 2333 spin_unlock(&vcpu->kvm->mmu_lock);
2334 if (r)
2335 goto out;
2296 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2336 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2297 kvm_mmu_flush_tlb(vcpu); 2337 kvm_mmu_flush_tlb(vcpu);
2298out: 2338out:
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2638 2678
2639static void free_mmu_pages(struct kvm_vcpu *vcpu) 2679static void free_mmu_pages(struct kvm_vcpu *vcpu)
2640{ 2680{
2641 struct kvm_mmu_page *sp;
2642
2643 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2644 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2645 struct kvm_mmu_page, link);
2646 kvm_mmu_zap_page(vcpu->kvm, sp);
2647 cond_resched();
2648 }
2649 free_page((unsigned long)vcpu->arch.mmu.pae_root); 2681 free_page((unsigned long)vcpu->arch.mmu.pae_root);
2650} 2682}
2651 2683
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2710{ 2742{
2711 struct kvm_mmu_page *sp; 2743 struct kvm_mmu_page *sp;
2712 2744
2713 spin_lock(&kvm->mmu_lock);
2714 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2745 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2715 int i; 2746 int i;
2716 u64 *pt; 2747 u64 *pt;
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2725 pt[i] &= ~PT_WRITABLE_MASK; 2756 pt[i] &= ~PT_WRITABLE_MASK;
2726 } 2757 }
2727 kvm_flush_remote_tlbs(kvm); 2758 kvm_flush_remote_tlbs(kvm);
2728 spin_unlock(&kvm->mmu_lock);
2729} 2759}
2730 2760
2731void kvm_mmu_zap_all(struct kvm *kvm) 2761void kvm_mmu_zap_all(struct kvm *kvm)
@@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3007 " in nonleaf level: levels %d gva %lx" 3037 " in nonleaf level: levels %d gva %lx"
3008 " level %d pte %llx\n", audit_msg, 3038 " level %d pte %llx\n", audit_msg,
3009 vcpu->arch.mmu.root_level, va, level, ent); 3039 vcpu->arch.mmu.root_level, va, level, ent);
3010 3040 else
3011 audit_mappings_page(vcpu, ent, va, level - 1); 3041 audit_mappings_page(vcpu, ent, va, level - 1);
3012 } else { 3042 } else {
3013 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3043 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3014 hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; 3044 gfn_t gfn = gpa >> PAGE_SHIFT;
3045 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3046 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3015 3047
3016 if (is_shadow_present_pte(ent) 3048 if (is_shadow_present_pte(ent)
3017 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3049 && (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62b..3494a2fb136e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 75 return vcpu->arch.cr0 & X86_CR0_PG;
76} 76}
77 77
78static inline int is_present_pte(unsigned long pte)
79{
80 return pte & PT_PRESENT_MASK;
81}
82
78#endif 83#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c561..258e4591e1ca 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
123 gfn_t table_gfn; 123 gfn_t table_gfn;
124 unsigned index, pt_access, pte_access; 124 unsigned index, pt_access, pte_access;
125 gpa_t pte_gpa; 125 gpa_t pte_gpa;
126 int rsvd_fault = 0;
126 127
127 pgprintk("%s: addr %lx\n", __func__, addr); 128 pgprintk("%s: addr %lx\n", __func__, addr);
128walk: 129walk:
@@ -157,6 +158,10 @@ walk:
157 if (!is_present_pte(pte)) 158 if (!is_present_pte(pte))
158 goto not_present; 159 goto not_present;
159 160
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
162 if (rsvd_fault)
163 goto access_error;
164
160 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writeble_pte(pte))
161 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
162 goto access_error; 167 goto access_error;
@@ -209,7 +214,6 @@ walk:
209 if (ret) 214 if (ret)
210 goto walk; 215 goto walk;
211 pte |= PT_DIRTY_MASK; 216 pte |= PT_DIRTY_MASK;
212 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
213 walker->ptes[walker->level - 1] = pte; 217 walker->ptes[walker->level - 1] = pte;
214 } 218 }
215 219
@@ -233,6 +237,8 @@ err:
233 walker->error_code |= PFERR_USER_MASK; 237 walker->error_code |= PFERR_USER_MASK;
234 if (fetch_fault) 238 if (fetch_fault)
235 walker->error_code |= PFERR_FETCH_MASK; 239 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK;
236 return 0; 242 return 0;
237} 243}
238 244
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
262 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
263 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
264 gpte & PT_DIRTY_MASK, NULL, largepage, 270 gpte & PT_DIRTY_MASK, NULL, largepage,
265 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), 271 gpte_to_gfn(gpte), pfn, true);
266 pfn, true);
267} 272}
268 273
269/* 274/*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
297 user_fault, write_fault, 302 user_fault, write_fault,
298 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
299 ptwrite, largepage, 304 ptwrite, largepage,
300 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
301 gw->gfn, pfn, false); 305 gw->gfn, pfn, false);
302 break; 306 break;
303 } 307 }
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 return r; 384 return r;
381 385
382 /* 386 /*
383 * Look up the shadow pte for the faulting address. 387 * Look up the guest pte for the faulting address.
384 */ 388 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 389 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault); 390 fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
586 nr_present++; 590 nr_present++;
587 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
588 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
589 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, 593 is_dirty_pte(gpte), 0, gfn,
590 spte_to_pfn(sp->spt[i]), true, false); 594 spte_to_pfn(sp->spt[i]), true, false);
591 } 595 }
592 596
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6e..71510e07e69e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h"
22 23
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
69static int nested = 0; 70static int nested = 0;
70module_param(nested, int, S_IRUGO); 71module_param(nested, int, S_IRUGO);
71 72
72static void kvm_reput_irq(struct vcpu_svm *svm);
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 73static void svm_flush_tlb(struct kvm_vcpu *vcpu);
74 74
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
132 return svm_features & feat; 132 return svm_features & feat;
133} 133}
134 134
135static inline u8 pop_irq(struct kvm_vcpu *vcpu)
136{
137 int word_index = __ffs(vcpu->arch.irq_summary);
138 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
139 int irq = word_index * BITS_PER_LONG + bit_index;
140
141 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
142 if (!vcpu->arch.irq_pending[word_index])
143 clear_bit(word_index, &vcpu->arch.irq_summary);
144 return irq;
145}
146
147static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
148{
149 set_bit(irq, vcpu->arch.irq_pending);
150 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
151}
152
153static inline void clgi(void) 135static inline void clgi(void)
154{ 136{
155 asm volatile (__ex(SVM_CLGI)); 137 asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
214 svm->vmcb->control.event_inj_err = error_code; 196 svm->vmcb->control.event_inj_err = error_code;
215} 197}
216 198
217static bool svm_exception_injected(struct kvm_vcpu *vcpu) 199static int is_external_interrupt(u32 info)
200{
201 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
202 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
203}
204
205static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
218{ 206{
219 struct vcpu_svm *svm = to_svm(vcpu); 207 struct vcpu_svm *svm = to_svm(vcpu);
208 u32 ret = 0;
220 209
221 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); 210 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
211 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
212 return ret & mask;
222} 213}
223 214
224static int is_external_interrupt(u32 info) 215static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
225{ 216{
226 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 217 struct vcpu_svm *svm = to_svm(vcpu);
227 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 218
219 if (mask == 0)
220 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
221 else
222 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
223
228} 224}
229 225
230static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 226static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
232 struct vcpu_svm *svm = to_svm(vcpu); 228 struct vcpu_svm *svm = to_svm(vcpu);
233 229
234 if (!svm->next_rip) { 230 if (!svm->next_rip) {
235 printk(KERN_DEBUG "%s: NOP\n", __func__); 231 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
232 EMULATE_DONE)
233 printk(KERN_DEBUG "%s: NOP\n", __func__);
236 return; 234 return;
237 } 235 }
238 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
240 __func__, kvm_rip_read(vcpu), svm->next_rip); 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
241 239
242 kvm_rip_write(vcpu, svm->next_rip); 240 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm_set_interrupt_shadow(vcpu, 0);
244
245 vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246} 242}
247 243
248static int has_svm(void) 244static int has_svm(void)
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
830 if (!var->unusable) 826 if (!var->unusable)
831 var->type |= 0x1; 827 var->type |= 0x1;
832 break; 828 break;
829 case VCPU_SREG_SS:
830 /* On AMD CPUs sometimes the DB bit in the segment
831 * descriptor is left as 1, although the whole segment has
832 * been made unusable. Clear it here to pass an Intel VMX
833 * entry check when cross vendor migrating.
834 */
835 if (var->unusable)
836 var->db = 0;
837 break;
833 } 838 }
834} 839}
835 840
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
960 965
961} 966}
962 967
963static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 968static void update_db_intercept(struct kvm_vcpu *vcpu)
964{ 969{
965 int old_debug = vcpu->guest_debug;
966 struct vcpu_svm *svm = to_svm(vcpu); 970 struct vcpu_svm *svm = to_svm(vcpu);
967 971
968 vcpu->guest_debug = dbg->control;
969
970 svm->vmcb->control.intercept_exceptions &= 972 svm->vmcb->control.intercept_exceptions &=
971 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 973 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
974
975 if (vcpu->arch.singlestep)
976 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
977
972 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 978 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
973 if (vcpu->guest_debug & 979 if (vcpu->guest_debug &
974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 980 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
979 1 << BP_VECTOR; 985 1 << BP_VECTOR;
980 } else 986 } else
981 vcpu->guest_debug = 0; 987 vcpu->guest_debug = 0;
988}
989
990static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
991{
992 int old_debug = vcpu->guest_debug;
993 struct vcpu_svm *svm = to_svm(vcpu);
994
995 vcpu->guest_debug = dbg->control;
996
997 update_db_intercept(vcpu);
982 998
983 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 999 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
984 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1000 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
993 return 0; 1009 return 0;
994} 1010}
995 1011
996static int svm_get_irq(struct kvm_vcpu *vcpu)
997{
998 struct vcpu_svm *svm = to_svm(vcpu);
999 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1000
1001 if (is_external_interrupt(exit_int_info))
1002 return exit_int_info & SVM_EVTINJ_VEC_MASK;
1003 return -1;
1004}
1005
1006static void load_host_msrs(struct kvm_vcpu *vcpu) 1012static void load_host_msrs(struct kvm_vcpu *vcpu)
1007{ 1013{
1008#ifdef CONFIG_X86_64 1014#ifdef CONFIG_X86_64
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1107 1113
1108static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1114static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1109{ 1115{
1110 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1111 struct kvm *kvm = svm->vcpu.kvm;
1112 u64 fault_address; 1116 u64 fault_address;
1113 u32 error_code; 1117 u32 error_code;
1114 bool event_injection = false;
1115
1116 if (!irqchip_in_kernel(kvm) &&
1117 is_external_interrupt(exit_int_info)) {
1118 event_injection = true;
1119 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1120 }
1121 1118
1122 fault_address = svm->vmcb->control.exit_info_2; 1119 fault_address = svm->vmcb->control.exit_info_2;
1123 error_code = svm->vmcb->control.exit_info_1; 1120 error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1137 */ 1134 */
1138 if (npt_enabled) 1135 if (npt_enabled)
1139 svm_flush_tlb(&svm->vcpu); 1136 svm_flush_tlb(&svm->vcpu);
1140 1137 else {
1141 if (!npt_enabled && event_injection) 1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1142 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1143 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1144} 1142}
1145 1143
1146static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1144static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1147{ 1145{
1148 if (!(svm->vcpu.guest_debug & 1146 if (!(svm->vcpu.guest_debug &
1149 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 1147 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1148 !svm->vcpu.arch.singlestep) {
1150 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1149 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1151 return 1; 1150 return 1;
1152 } 1151 }
1153 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1152
1154 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1153 if (svm->vcpu.arch.singlestep) {
1155 kvm_run->debug.arch.exception = DB_VECTOR; 1154 svm->vcpu.arch.singlestep = false;
1156 return 0; 1155 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1156 svm->vmcb->save.rflags &=
1157 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1158 update_db_intercept(&svm->vcpu);
1159 }
1160
1161 if (svm->vcpu.guest_debug &
1162 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
1163 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1164 kvm_run->debug.arch.pc =
1165 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1166 kvm_run->debug.arch.exception = DB_VECTOR;
1167 return 0;
1168 }
1169
1170 return 1;
1157} 1171}
1158 1172
1159static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1173static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
1842 struct kvm_run *kvm_run) 1856 struct kvm_run *kvm_run)
1843{ 1857{
1844 u16 tss_selector; 1858 u16 tss_selector;
1859 int reason;
1860 int int_type = svm->vmcb->control.exit_int_info &
1861 SVM_EXITINTINFO_TYPE_MASK;
1862 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
1863 uint32_t type =
1864 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
1865 uint32_t idt_v =
1866 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
1845 1867
1846 tss_selector = (u16)svm->vmcb->control.exit_info_1; 1868 tss_selector = (u16)svm->vmcb->control.exit_info_1;
1869
1847 if (svm->vmcb->control.exit_info_2 & 1870 if (svm->vmcb->control.exit_info_2 &
1848 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 1871 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1849 return kvm_task_switch(&svm->vcpu, tss_selector, 1872 reason = TASK_SWITCH_IRET;
1850 TASK_SWITCH_IRET); 1873 else if (svm->vmcb->control.exit_info_2 &
1851 if (svm->vmcb->control.exit_info_2 & 1874 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1852 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 1875 reason = TASK_SWITCH_JMP;
1853 return kvm_task_switch(&svm->vcpu, tss_selector, 1876 else if (idt_v)
1854 TASK_SWITCH_JMP); 1877 reason = TASK_SWITCH_GATE;
1855 return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); 1878 else
1879 reason = TASK_SWITCH_CALL;
1880
1881 if (reason == TASK_SWITCH_GATE) {
1882 switch (type) {
1883 case SVM_EXITINTINFO_TYPE_NMI:
1884 svm->vcpu.arch.nmi_injected = false;
1885 break;
1886 case SVM_EXITINTINFO_TYPE_EXEPT:
1887 kvm_clear_exception_queue(&svm->vcpu);
1888 break;
1889 case SVM_EXITINTINFO_TYPE_INTR:
1890 kvm_clear_interrupt_queue(&svm->vcpu);
1891 break;
1892 default:
1893 break;
1894 }
1895 }
1896
1897 if (reason != TASK_SWITCH_GATE ||
1898 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
1899 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
1900 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
1901 skip_emulated_instruction(&svm->vcpu);
1902
1903 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
1856} 1904}
1857 1905
1858static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1906static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
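
The reason selection added above reduces to three checks on exit_info_2 and exit_int_info, with the gate case also dropping whatever event was being delivered so it is not reinjected on top of the task switch. A standalone sketch of that decision (bit positions are assumptions in the spirit of the AMD task-switch exit layout, not quoted from asm/svm.h):

/* Sketch only: reason selection for a task-switch #VMEXIT. */
#include <stdint.h>
#include <stdio.h>

#define TS_REASON_IRET_SHIFT	36	/* assumed SVM_EXITINFOSHIFT_TS_REASON_IRET */
#define TS_REASON_JMP_SHIFT	38	/* assumed SVM_EXITINFOSHIFT_TS_REASON_JMP */
#define EXITINTINFO_VALID	(1u << 31)

enum ts_reason { TS_CALL, TS_IRET, TS_JMP, TS_GATE };

static enum ts_reason task_switch_reason(uint64_t exit_info_2,
					 uint32_t exit_int_info)
{
	if (exit_info_2 & (1ULL << TS_REASON_IRET_SHIFT))
		return TS_IRET;			/* caused by an iret */
	if (exit_info_2 & (1ULL << TS_REASON_JMP_SHIFT))
		return TS_JMP;			/* caused by a far jmp */
	if (exit_int_info & EXITINTINFO_VALID)
		return TS_GATE;			/* arrived through a task gate */
	return TS_CALL;				/* plain call or int */
}

int main(void)
{
	printf("%d\n", task_switch_reason(1ULL << TS_REASON_IRET_SHIFT, 0)); /* 1 = TS_IRET */
	printf("%d\n", task_switch_reason(0, EXITINTINFO_VALID));            /* 3 = TS_GATE */
	return 0;
}
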
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1862 return 1; 1910 return 1;
1863} 1911}
1864 1912
1913static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1914{
1915 ++svm->vcpu.stat.nmi_window_exits;
1916 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
1917 svm->vcpu.arch.hflags |= HF_IRET_MASK;
1918 return 1;
1919}
1920
1865static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1866{ 1922{
1867 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 1923 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
1879 1935
1880static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1881{ 1937{
1938 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
1939 /* instruction emulation calls kvm_set_cr8() */
1882 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 1940 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1883 if (irqchip_in_kernel(svm->vcpu.kvm)) 1941 if (irqchip_in_kernel(svm->vcpu.kvm)) {
1942 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1943 return 1;
1944 }
1945 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
1884 return 1; 1946 return 1;
1885 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1947 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1886 return 0; 1948 return 0;
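
The cr8_prev comparison above encodes a simple rule for the userspace-irqchip case: raising the TPR can never unmask a pending interrupt, so only a lowered TPR needs to be reported through KVM_EXIT_SET_TPR. A minimal model of that decision (names are illustrative, not KVM's; the in-kernel-irqchip path, which simply stops intercepting CR8 writes, is not shown):

#include <stdbool.h>
#include <stdio.h>

static bool need_tpr_exit(unsigned cr8_prev, unsigned cr8_new, bool kernel_irqchip)
{
	if (kernel_irqchip)
		return false;		/* in-kernel APIC tracks the TPR itself */
	return cr8_new < cr8_prev;	/* a lowered TPR may unmask a pending irq */
}

int main(void)
{
	printf("%d\n", need_tpr_exit(8, 2, false));	/* 1: report KVM_EXIT_SET_TPR */
	printf("%d\n", need_tpr_exit(2, 8, false));	/* 0: keep running the guest */
	return 0;
}
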
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2090 * If the user space waits to inject interrupts, exit as soon as 2152 * If the user space waits to inject interrupts, exit as soon as
2091 * possible 2153 * possible
2092 */ 2154 */
2093 if (kvm_run->request_interrupt_window && 2155 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
2094 !svm->vcpu.arch.irq_summary) { 2156 kvm_run->request_interrupt_window &&
2157 !kvm_cpu_has_interrupt(&svm->vcpu)) {
2095 ++svm->vcpu.stat.irq_window_exits; 2158 ++svm->vcpu.stat.irq_window_exits;
2096 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2159 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2097 return 0; 2160 return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2134 [SVM_EXIT_VINTR] = interrupt_window_interception, 2197 [SVM_EXIT_VINTR] = interrupt_window_interception,
2135 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ 2198 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2136 [SVM_EXIT_CPUID] = cpuid_interception, 2199 [SVM_EXIT_CPUID] = cpuid_interception,
2200 [SVM_EXIT_IRET] = iret_interception,
2137 [SVM_EXIT_INVD] = emulate_on_interception, 2201 [SVM_EXIT_INVD] = emulate_on_interception,
2138 [SVM_EXIT_HLT] = halt_interception, 2202 [SVM_EXIT_HLT] = halt_interception,
2139 [SVM_EXIT_INVLPG] = invlpg_interception, 2203 [SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2194 } 2258 }
2195 } 2259 }
2196 2260
2197 kvm_reput_irq(svm);
2198 2261
2199 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2262 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2200 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2263 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2205 2268
2206 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 2269 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2207 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 2270 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2208 exit_code != SVM_EXIT_NPF) 2271 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
2209 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 2272 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2210 "exit_code 0x%x\n", 2273 "exit_code 0x%x\n",
2211 __func__, svm->vmcb->control.exit_int_info, 2274 __func__, svm->vmcb->control.exit_int_info,
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm)
2242 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2243} 2306}
2244 2307
2308static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2309{
2310 struct vcpu_svm *svm = to_svm(vcpu);
2311
2312 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2313 vcpu->arch.hflags |= HF_NMI_MASK;
2314 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2315 ++vcpu->stat.nmi_injections;
2316}
2245 2317
2246static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 2318static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2247{ 2319{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2257 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2258} 2330}
2259 2331
2260static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) 2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2261{ 2333{
2262 struct vcpu_svm *svm = to_svm(vcpu); 2334 struct vcpu_svm *svm = to_svm(vcpu);
2263 2335
2264 nested_svm_intr(svm); 2336 svm->vmcb->control.event_inj = nr |
2265 2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2266 svm_inject_irq(svm, irq);
2267} 2338}
2268 2339
2269static void update_cr8_intercept(struct kvm_vcpu *vcpu) 2340static void svm_set_irq(struct kvm_vcpu *vcpu)
2270{ 2341{
2271 struct vcpu_svm *svm = to_svm(vcpu); 2342 struct vcpu_svm *svm = to_svm(vcpu);
2272 struct vmcb *vmcb = svm->vmcb;
2273 int max_irr, tpr;
2274 2343
2275 if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) 2344 nested_svm_intr(svm);
2276 return;
2277 2345
2278 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
2347}
2279 2348
2280 max_irr = kvm_lapic_find_highest_irr(vcpu); 2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2281 if (max_irr == -1) 2350{
2282 return; 2351 struct vcpu_svm *svm = to_svm(vcpu);
2283 2352
2284 tpr = kvm_lapic_get_cr8(vcpu) << 4; 2353 if (irr == -1)
2354 return;
2285 2355
2286 if (tpr >= (max_irr & 0xf0)) 2356 if (tpr >= irr)
2287 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 2357 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2288} 2358}
2289 2359
2290static void svm_intr_assist(struct kvm_vcpu *vcpu) 2360static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2291{ 2361{
2292 struct vcpu_svm *svm = to_svm(vcpu); 2362 struct vcpu_svm *svm = to_svm(vcpu);
2293 struct vmcb *vmcb = svm->vmcb; 2363 struct vmcb *vmcb = svm->vmcb;
2294 int intr_vector = -1; 2364 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2295 2365 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2296 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
2297 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
2298 intr_vector = vmcb->control.exit_int_info &
2299 SVM_EVTINJ_VEC_MASK;
2300 vmcb->control.exit_int_info = 0;
2301 svm_inject_irq(svm, intr_vector);
2302 goto out;
2303 }
2304
2305 if (vmcb->control.int_ctl & V_IRQ_MASK)
2306 goto out;
2307
2308 if (!kvm_cpu_has_interrupt(vcpu))
2309 goto out;
2310
2311 if (nested_svm_intr(svm))
2312 goto out;
2313
2314 if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2315 goto out;
2316
2317 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
2318 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
2319 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
2320 /* unable to deliver irq, set pending irq */
2321 svm_set_vintr(svm);
2322 svm_inject_irq(svm, 0x0);
2323 goto out;
2324 }
2325 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
2326 intr_vector = kvm_cpu_get_interrupt(vcpu);
2327 svm_inject_irq(svm, intr_vector);
2328out:
2329 update_cr8_intercept(vcpu);
2330} 2366}
2331 2367
2332static void kvm_reput_irq(struct vcpu_svm *svm) 2368static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2333{ 2369{
2334 struct vmcb_control_area *control = &svm->vmcb->control; 2370 struct vcpu_svm *svm = to_svm(vcpu);
2335 2371 struct vmcb *vmcb = svm->vmcb;
2336 if ((control->int_ctl & V_IRQ_MASK) 2372 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2337 && !irqchip_in_kernel(svm->vcpu.kvm)) { 2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2338 control->int_ctl &= ~V_IRQ_MASK; 2374 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2339 push_irq(&svm->vcpu, control->int_vector);
2340 }
2341
2342 svm->vcpu.arch.interrupt_window_open =
2343 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2344 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2345} 2375}
2346 2376
2347static void svm_do_inject_vector(struct vcpu_svm *svm) 2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2348{ 2378{
2349 struct kvm_vcpu *vcpu = &svm->vcpu; 2379 svm_set_vintr(to_svm(vcpu));
2350 int word_index = __ffs(vcpu->arch.irq_summary); 2380 svm_inject_irq(to_svm(vcpu), 0x0);
2351 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2352 int irq = word_index * BITS_PER_LONG + bit_index;
2353
2354 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2355 if (!vcpu->arch.irq_pending[word_index])
2356 clear_bit(word_index, &vcpu->arch.irq_summary);
2357 svm_inject_irq(svm, irq);
2358} 2381}
2359 2382
2360static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2383static void enable_nmi_window(struct kvm_vcpu *vcpu)
2361 struct kvm_run *kvm_run)
2362{ 2384{
2363 struct vcpu_svm *svm = to_svm(vcpu); 2385 struct vcpu_svm *svm = to_svm(vcpu);
2364 struct vmcb_control_area *control = &svm->vmcb->control;
2365
2366 if (nested_svm_intr(svm))
2367 return;
2368 2386
2369 svm->vcpu.arch.interrupt_window_open = 2387 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
2370 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 2388 == HF_NMI_MASK)
2371 (svm->vmcb->save.rflags & X86_EFLAGS_IF) && 2389 return; /* IRET will cause a vm exit */
2372 (svm->vcpu.arch.hflags & HF_GIF_MASK));
2373 2390
2374 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) 2391 /* Something prevents NMI from being injected. Single-step over the
2375 /* 2392 possible problem (IRET, exception injection or interrupt
2376 * If interrupts enabled, and not blocked by sti or mov ss. Good. 2393 shadow) */
2377 */ 2394 vcpu->arch.singlestep = true;
2378 svm_do_inject_vector(svm); 2395 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2379 2396 update_db_intercept(vcpu);
2380 /*
2381 * Interrupts blocked. Wait for unblock.
2382 */
2383 if (!svm->vcpu.arch.interrupt_window_open &&
2384 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
2385 svm_set_vintr(svm);
2386 else
2387 svm_clear_vintr(svm);
2388} 2397}
2389 2398
2390static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2399static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
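
enable_nmi_window() above leans on two mechanisms: if only the in-progress NMI handler is in the way, the IRET intercept armed by svm_inject_nmi() will signal when it finishes; anything else (interrupt shadow, pending event injection) is stepped over by forcing TF|RF and intercepting #DB, which db_interception() later unwinds. A compact restatement with placeholder flag bits (the real HF_* values come from the KVM headers):

#include <stdio.h>

#define HF_NMI_MASK	(1u << 0)	/* an NMI handler is currently running */
#define HF_IRET_MASK	(1u << 1)	/* that handler already executed IRET */

enum nmi_window_action { WAIT_FOR_IRET_EXIT, SINGLE_STEP_GUEST };

static enum nmi_window_action nmi_window_strategy(unsigned hflags)
{
	if ((hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
		return WAIT_FOR_IRET_EXIT;	/* the IRET intercept will fire */
	/* interrupt shadow, pending event injection, ...: force TF|RF and
	 * intercept #DB to step past whatever blocks the NMI */
	return SINGLE_STEP_GUEST;
}

int main(void)
{
	printf("%d\n", nmi_window_strategy(HF_NMI_MASK));		  /* 0: wait for IRET */
	printf("%d\n", nmi_window_strategy(HF_NMI_MASK | HF_IRET_MASK)); /* 1: single-step */
	return 0;
}
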
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2407 2416
2408 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2417 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2409 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2418 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2410 kvm_lapic_set_tpr(vcpu, cr8); 2419 kvm_set_cr8(vcpu, cr8);
2411 } 2420 }
2412} 2421}
2413 2422
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2416 struct vcpu_svm *svm = to_svm(vcpu); 2425 struct vcpu_svm *svm = to_svm(vcpu);
2417 u64 cr8; 2426 u64 cr8;
2418 2427
2419 if (!irqchip_in_kernel(vcpu->kvm))
2420 return;
2421
2422 cr8 = kvm_get_cr8(vcpu); 2428 cr8 = kvm_get_cr8(vcpu);
2423 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2429 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2424 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2430 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2425} 2431}
2426 2432
2433static void svm_complete_interrupts(struct vcpu_svm *svm)
2434{
2435 u8 vector;
2436 int type;
2437 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2438
2439 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2440 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
2441
2442 svm->vcpu.arch.nmi_injected = false;
2443 kvm_clear_exception_queue(&svm->vcpu);
2444 kvm_clear_interrupt_queue(&svm->vcpu);
2445
2446 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
2447 return;
2448
2449 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
2450 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
2451
2452 switch (type) {
2453 case SVM_EXITINTINFO_TYPE_NMI:
2454 svm->vcpu.arch.nmi_injected = true;
2455 break;
2456 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of a software exception do not reinject an exception
2458 vector, but re-execute the instruction instead */
2459 if (kvm_exception_is_soft(vector))
2460 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2462 u32 err = svm->vmcb->control.exit_int_info_err;
2463 kvm_queue_exception_e(&svm->vcpu, vector, err);
2464
2465 } else
2466 kvm_queue_exception(&svm->vcpu, vector);
2467 break;
2468 case SVM_EXITINTINFO_TYPE_INTR:
2469 kvm_queue_interrupt(&svm->vcpu, vector, false);
2470 break;
2471 default:
2472 break;
2473 }
2474}
2475
2427#ifdef CONFIG_X86_64 2476#ifdef CONFIG_X86_64
2428#define R "r" 2477#define R "r"
2429#else 2478#else
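
svm_complete_interrupts() above re-queues whatever event was in flight when the #VMEXIT happened so it can be injected again on the next entry. A sketch of just the decode, with the field layout (vector 7:0, type 10:8, error-code-valid bit 11, valid bit 31) stated as an assumption from the AMD event-injection format rather than copied from asm/svm.h; the HF_IRET/NMI bookkeeping and queue clearing are left out:

#include <stdint.h>
#include <stdio.h>

#define EVT_VALID	(1u << 31)
#define EVT_ERR_VALID	(1u << 11)
#define EVT_VEC(x)	((unsigned)((x) & 0xff))
#define EVT_TYPE(x)	((unsigned)(((x) >> 8) & 0x7))

enum { TYPE_INTR = 0, TYPE_NMI = 2, TYPE_EXCEPT = 3, TYPE_SOFT = 4 };

static void complete_interrupts(uint32_t exitintinfo)
{
	if (!(exitintinfo & EVT_VALID)) {
		printf("nothing was in flight at exit\n");
		return;
	}
	switch (EVT_TYPE(exitintinfo)) {
	case TYPE_NMI:
		printf("mark NMI as still pending injection\n");
		break;
	case TYPE_EXCEPT:
		printf("re-queue exception %u%s\n", EVT_VEC(exitintinfo),
		       (exitintinfo & EVT_ERR_VALID) ? " with error code" : "");
		break;
	case TYPE_INTR:
		printf("re-queue external interrupt %u\n", EVT_VEC(exitintinfo));
		break;
	default:
		break;
	}
}

int main(void)
{
	complete_interrupts(EVT_VALID | (TYPE_INTR << 8) | 0x30);
	complete_interrupts(EVT_VALID | (TYPE_EXCEPT << 8) | EVT_ERR_VALID | 14);
	return 0;
}
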
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2552 sync_cr8_to_lapic(vcpu); 2601 sync_cr8_to_lapic(vcpu);
2553 2602
2554 svm->next_rip = 0; 2603 svm->next_rip = 0;
2604
2605 svm_complete_interrupts(svm);
2555} 2606}
2556 2607
2557#undef R 2608#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
2617#endif 2668#endif
2618} 2669}
2619 2670
2620static int svm_get_mt_mask_shift(void) 2671static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2621{ 2672{
2622 return 0; 2673 return 0;
2623} 2674}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
2667 .run = svm_vcpu_run, 2718 .run = svm_vcpu_run,
2668 .handle_exit = handle_exit, 2719 .handle_exit = handle_exit,
2669 .skip_emulated_instruction = skip_emulated_instruction, 2720 .skip_emulated_instruction = skip_emulated_instruction,
2721 .set_interrupt_shadow = svm_set_interrupt_shadow,
2722 .get_interrupt_shadow = svm_get_interrupt_shadow,
2670 .patch_hypercall = svm_patch_hypercall, 2723 .patch_hypercall = svm_patch_hypercall,
2671 .get_irq = svm_get_irq,
2672 .set_irq = svm_set_irq, 2724 .set_irq = svm_set_irq,
2725 .set_nmi = svm_inject_nmi,
2673 .queue_exception = svm_queue_exception, 2726 .queue_exception = svm_queue_exception,
2674 .exception_injected = svm_exception_injected, 2727 .interrupt_allowed = svm_interrupt_allowed,
2675 .inject_pending_irq = svm_intr_assist, 2728 .nmi_allowed = svm_nmi_allowed,
2676 .inject_pending_vectors = do_interrupt_requests, 2729 .enable_nmi_window = enable_nmi_window,
2730 .enable_irq_window = enable_irq_window,
2731 .update_cr8_intercept = update_cr8_intercept,
2677 2732
2678 .set_tss_addr = svm_set_tss_addr, 2733 .set_tss_addr = svm_set_tss_addr,
2679 .get_tdp_level = get_npt_level, 2734 .get_tdp_level = get_npt_level,
2680 .get_mt_mask_shift = svm_get_mt_mask_shift, 2735 .get_mt_mask = svm_get_mt_mask,
2681}; 2736};
2682 2737
2683static int __init svm_init(void) 2738static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 000000000000..86dbac072d0c
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
1#include <linux/kvm_host.h>
2#include <linux/kvm.h>
3#include <linux/hrtimer.h>
4#include <asm/atomic.h>
5#include "kvm_timer.h"
6
7static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
8{
9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq;
11
12 /* FIXME: this code should not know anything about vcpus */
13 if (!atomic_inc_and_test(&ktimer->pending))
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18
19 if (waitqueue_active(q))
20 wake_up_interruptible(q);
21
22 if (ktimer->t_ops->is_periodic(ktimer)) {
23 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
24 restart_timer = 1;
25 }
26
27 return restart_timer;
28}
29
30enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
31{
32 int restart_timer;
33 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
37 if (!vcpu)
38 return HRTIMER_NORESTART;
39
40 restart_timer = __kvm_timer_fn(vcpu, ktimer);
41 if (restart_timer)
42 return HRTIMER_RESTART;
43 else
44 return HRTIMER_NORESTART;
45}
46
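
The new timer callback coalesces missed ticks when reinjection is disabled and only asks the hrtimer core to restart periodic timers. A small userspace model of that bookkeeping (illustrative only; the vcpu wakeup and the KVM_REQ_PENDING_TIMER request are deliberately left out):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct model_timer {
	atomic_int pending;	/* ticks not yet injected into the guest */
	bool reinject;		/* false: missed ticks are coalesced */
	bool periodic;
};

static bool timer_tick(struct model_timer *t)
{
	atomic_fetch_add(&t->pending, 1);
	if (!t->reinject)
		atomic_store(&t->pending, 1);	/* coalesce missed ticks */
	return t->periodic;			/* true: caller re-arms the timer */
}

int main(void)
{
	struct model_timer t = { .pending = 0, .reinject = false, .periodic = true };
	bool rearm;

	timer_tick(&t);
	rearm = timer_tick(&t);
	printf("pending=%d rearm=%d\n", atomic_load(&t.pending), rearm);
	return 0;
}
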
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716f..e770bf349ec4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
32#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h> 33#include <asm/vmx.h>
34#include <asm/virtext.h> 34#include <asm/virtext.h>
35#include <asm/mce.h>
35 36
36#define __ex(x) __kvm_handle_fault_on_reboot(x) 37#define __ex(x) __kvm_handle_fault_on_reboot(x)
37 38
38MODULE_AUTHOR("Qumranet"); 39MODULE_AUTHOR("Qumranet");
39MODULE_LICENSE("GPL"); 40MODULE_LICENSE("GPL");
40 41
41static int bypass_guest_pf = 1; 42static int __read_mostly bypass_guest_pf = 1;
42module_param(bypass_guest_pf, bool, 0); 43module_param(bypass_guest_pf, bool, S_IRUGO);
43 44
44static int enable_vpid = 1; 45static int __read_mostly enable_vpid = 1;
45module_param(enable_vpid, bool, 0); 46module_param_named(vpid, enable_vpid, bool, 0444);
46 47
47static int flexpriority_enabled = 1; 48static int __read_mostly flexpriority_enabled = 1;
48module_param(flexpriority_enabled, bool, 0); 49module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
49 50
50static int enable_ept = 1; 51static int __read_mostly enable_ept = 1;
51module_param(enable_ept, bool, 0); 52module_param_named(ept, enable_ept, bool, S_IRUGO);
52 53
53static int emulate_invalid_guest_state = 0; 54static int __read_mostly emulate_invalid_guest_state = 0;
54module_param(emulate_invalid_guest_state, bool, 0); 55module_param(emulate_invalid_guest_state, bool, S_IRUGO);
55 56
56struct vmcs { 57struct vmcs {
57 u32 revision_id; 58 u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
97 int soft_vnmi_blocked; 98 int soft_vnmi_blocked;
98 ktime_t entry_time; 99 ktime_t entry_time;
99 s64 vnmi_blocked_time; 100 s64 vnmi_blocked_time;
101 u32 exit_reason;
100}; 102};
101 103
102static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 104static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
111static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 113static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
112static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 114static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
113 115
114static struct page *vmx_io_bitmap_a; 116static unsigned long *vmx_io_bitmap_a;
115static struct page *vmx_io_bitmap_b; 117static unsigned long *vmx_io_bitmap_b;
116static struct page *vmx_msr_bitmap; 118static unsigned long *vmx_msr_bitmap_legacy;
119static unsigned long *vmx_msr_bitmap_longmode;
117 120
118static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 121static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
119static DEFINE_SPINLOCK(vmx_vpid_lock); 122static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
213 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 216 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
214} 217}
215 218
219static inline int is_machine_check(u32 intr_info)
220{
221 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
222 INTR_INFO_VALID_MASK)) ==
223 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
224}
225
216static inline int cpu_has_vmx_msr_bitmap(void) 226static inline int cpu_has_vmx_msr_bitmap(void)
217{ 227{
218 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); 228 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
219} 229}
220 230
221static inline int cpu_has_vmx_tpr_shadow(void) 231static inline int cpu_has_vmx_tpr_shadow(void)
222{ 232{
223 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); 233 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
224} 234}
225 235
226static inline int vm_need_tpr_shadow(struct kvm *kvm) 236static inline int vm_need_tpr_shadow(struct kvm *kvm)
227{ 237{
228 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 238 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
229} 239}
230 240
231static inline int cpu_has_secondary_exec_ctrls(void) 241static inline int cpu_has_secondary_exec_ctrls(void)
232{ 242{
233 return (vmcs_config.cpu_based_exec_ctrl & 243 return vmcs_config.cpu_based_exec_ctrl &
234 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 244 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
235} 245}
236 246
237static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 247static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
238{ 248{
239 return flexpriority_enabled 249 return vmcs_config.cpu_based_2nd_exec_ctrl &
240 && (vmcs_config.cpu_based_2nd_exec_ctrl & 250 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 251}
252
253static inline bool cpu_has_vmx_flexpriority(void)
254{
255 return cpu_has_vmx_tpr_shadow() &&
256 cpu_has_vmx_virtualize_apic_accesses();
242} 257}
243 258
244static inline int cpu_has_vmx_invept_individual_addr(void) 259static inline int cpu_has_vmx_invept_individual_addr(void)
245{ 260{
246 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); 261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
247} 262}
248 263
249static inline int cpu_has_vmx_invept_context(void) 264static inline int cpu_has_vmx_invept_context(void)
250{ 265{
251 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); 266 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
252} 267}
253 268
254static inline int cpu_has_vmx_invept_global(void) 269static inline int cpu_has_vmx_invept_global(void)
255{ 270{
256 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); 271 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
257} 272}
258 273
259static inline int cpu_has_vmx_ept(void) 274static inline int cpu_has_vmx_ept(void)
260{ 275{
261 return (vmcs_config.cpu_based_2nd_exec_ctrl & 276 return vmcs_config.cpu_based_2nd_exec_ctrl &
262 SECONDARY_EXEC_ENABLE_EPT); 277 SECONDARY_EXEC_ENABLE_EPT;
263}
264
265static inline int vm_need_ept(void)
266{
267 return (cpu_has_vmx_ept() && enable_ept);
268} 278}
269 279
270static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
271{ 281{
272 return ((cpu_has_vmx_virtualize_apic_accesses()) && 282 return flexpriority_enabled &&
273 (irqchip_in_kernel(kvm))); 283 (cpu_has_vmx_virtualize_apic_accesses()) &&
284 (irqchip_in_kernel(kvm));
274} 285}
275 286
276static inline int cpu_has_vmx_vpid(void) 287static inline int cpu_has_vmx_vpid(void)
277{ 288{
278 return (vmcs_config.cpu_based_2nd_exec_ctrl & 289 return vmcs_config.cpu_based_2nd_exec_ctrl &
279 SECONDARY_EXEC_ENABLE_VPID); 290 SECONDARY_EXEC_ENABLE_VPID;
280} 291}
281 292
282static inline int cpu_has_virtual_nmis(void) 293static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
284 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 295 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
285} 296}
286 297
298static inline bool report_flexpriority(void)
299{
300 return flexpriority_enabled;
301}
302
287static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 303static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
288{ 304{
289 int i; 305 int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
381 397
382static inline void ept_sync_context(u64 eptp) 398static inline void ept_sync_context(u64 eptp)
383{ 399{
384 if (vm_need_ept()) { 400 if (enable_ept) {
385 if (cpu_has_vmx_invept_context()) 401 if (cpu_has_vmx_invept_context())
386 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 402 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
387 else 403 else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
391 407
392static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) 408static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
393{ 409{
394 if (vm_need_ept()) { 410 if (enable_ept) {
395 if (cpu_has_vmx_invept_individual_addr()) 411 if (cpu_has_vmx_invept_individual_addr())
396 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, 412 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
397 eptp, gpa); 413 eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
478{ 494{
479 u32 eb; 495 u32 eb;
480 496
481 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); 497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
482 if (!vcpu->fpu_active) 498 if (!vcpu->fpu_active)
483 eb |= 1u << NM_VECTOR; 499 eb |= 1u << NM_VECTOR;
484 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
488 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
489 eb |= 1u << BP_VECTOR; 505 eb |= 1u << BP_VECTOR;
490 } 506 }
491 if (vcpu->arch.rmode.active) 507 if (vcpu->arch.rmode.vm86_active)
492 eb = ~0; 508 eb = ~0;
493 if (vm_need_ept()) 509 if (enable_ept)
494 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
495 vmcs_write32(EXCEPTION_BITMAP, eb); 511 vmcs_write32(EXCEPTION_BITMAP, eb);
496} 512}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
724 740
725static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
726{ 742{
727 if (vcpu->arch.rmode.active) 743 if (vcpu->arch.rmode.vm86_active)
728 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
729 vmcs_writel(GUEST_RFLAGS, rflags); 745 vmcs_writel(GUEST_RFLAGS, rflags);
730} 746}
731 747
748static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
749{
750 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
751 int ret = 0;
752
753 if (interruptibility & GUEST_INTR_STATE_STI)
754 ret |= X86_SHADOW_INT_STI;
755 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
756 ret |= X86_SHADOW_INT_MOV_SS;
757
758 return ret & mask;
759}
760
761static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
762{
763 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
764 u32 interruptibility = interruptibility_old;
765
766 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
767
768 if (mask & X86_SHADOW_INT_MOV_SS)
769 interruptibility |= GUEST_INTR_STATE_MOV_SS;
770 if (mask & X86_SHADOW_INT_STI)
771 interruptibility |= GUEST_INTR_STATE_STI;
772
773 if ((interruptibility != interruptibility_old))
774 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
775}
776
732static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 777static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733{ 778{
734 unsigned long rip; 779 unsigned long rip;
735 u32 interruptibility;
736 780
737 rip = kvm_rip_read(vcpu); 781 rip = kvm_rip_read(vcpu);
738 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 782 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
739 kvm_rip_write(vcpu, rip); 783 kvm_rip_write(vcpu, rip);
740 784
741 /* 785 /* skipping an emulated instruction also counts */
742 * We emulated an instruction, so temporary interrupt blocking 786 vmx_set_interrupt_shadow(vcpu, 0);
743 * should be removed, if set.
744 */
745 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
746 if (interruptibility & 3)
747 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
748 interruptibility & ~3);
749 vcpu->arch.interrupt_window_open = 1;
750} 787}
751 788
752static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 789static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
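
The two accessors above translate between the VMCS interruptibility-state bits and the generic interrupt-shadow mask that both vendors now expose through kvm_x86_ops, which is what lets the emulator's EMULTYPE_SKIP path behave the same way on SVM and VMX. A sketch of the two directions with stand-in constants (the real GUEST_INTR_STATE_* and shadow flag values live in the vmx/emulator headers):

#include <stdio.h>

#define GUEST_INTR_STATE_STI	(1u << 0)
#define GUEST_INTR_STATE_MOV_SS	(1u << 1)
#define SHADOW_INT_MOV_SS	(1u << 0)
#define SHADOW_INT_STI		(1u << 1)

/* VMCS interruptibility -> generic shadow mask used by the emulator */
static unsigned shadow_from_interruptibility(unsigned state)
{
	unsigned shadow = 0;

	if (state & GUEST_INTR_STATE_STI)
		shadow |= SHADOW_INT_STI;
	if (state & GUEST_INTR_STATE_MOV_SS)
		shadow |= SHADOW_INT_MOV_SS;
	return shadow;
}

/* generic shadow mask -> VMCS interruptibility bits */
static unsigned interruptibility_from_shadow(unsigned old_state, unsigned shadow)
{
	unsigned state = old_state &
		~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (shadow & SHADOW_INT_STI)
		state |= GUEST_INTR_STATE_STI;
	if (shadow & SHADOW_INT_MOV_SS)
		state |= GUEST_INTR_STATE_MOV_SS;
	return state;
}

int main(void)
{
	unsigned s = shadow_from_interruptibility(GUEST_INTR_STATE_STI);

	printf("shadow=%u state=%u\n", s, interruptibility_from_shadow(0, s));
	return 0;
}
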
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
760 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 797 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
761 } 798 }
762 799
763 if (vcpu->arch.rmode.active) { 800 if (vcpu->arch.rmode.vm86_active) {
764 vmx->rmode.irq.pending = true; 801 vmx->rmode.irq.pending = true;
765 vmx->rmode.irq.vector = nr; 802 vmx->rmode.irq.vector = nr;
766 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 803 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
773 return; 810 return;
774 } 811 }
775 812
776 if (nr == BP_VECTOR || nr == OF_VECTOR) { 813 if (kvm_exception_is_soft(nr)) {
777 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 814 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
815 vmx->vcpu.arch.event_exit_inst_len);
778 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 816 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
779 } else 817 } else
780 intr_info |= INTR_TYPE_HARD_EXCEPTION; 818 intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
782 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 820 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
783} 821}
784 822
785static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
786{
787 return false;
788}
789
790/* 823/*
791 * Swap MSR entry in host/guest MSR entry array. 824 * Swap MSR entry in host/guest MSR entry array.
792 */ 825 */
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
812static void setup_msrs(struct vcpu_vmx *vmx) 845static void setup_msrs(struct vcpu_vmx *vmx)
813{ 846{
814 int save_nmsrs; 847 int save_nmsrs;
848 unsigned long *msr_bitmap;
815 849
816 vmx_load_host_state(vmx); 850 vmx_load_host_state(vmx);
817 save_nmsrs = 0; 851 save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
847 __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 881 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
848#endif 882#endif
849 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); 883 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
884
885 if (cpu_has_vmx_msr_bitmap()) {
886 if (is_long_mode(&vmx->vcpu))
887 msr_bitmap = vmx_msr_bitmap_longmode;
888 else
889 msr_bitmap = vmx_msr_bitmap_legacy;
890
891 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
892 }
850} 893}
851 894
852/* 895/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1034 return 0; 1077 return 0;
1035} 1078}
1036 1079
1037static int vmx_get_irq(struct kvm_vcpu *vcpu)
1038{
1039 if (!vcpu->arch.interrupt.pending)
1040 return -1;
1041 return vcpu->arch.interrupt.nr;
1042}
1043
1044static __init int cpu_has_kvm_support(void) 1080static __init int cpu_has_kvm_support(void)
1045{ 1081{
1046 return cpu_has_vmx(); 1082 return cpu_has_vmx();
@@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
1241 struct page *pages; 1277 struct page *pages;
1242 struct vmcs *vmcs; 1278 struct vmcs *vmcs;
1243 1279
1244 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 1280 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1245 if (!pages) 1281 if (!pages)
1246 return NULL; 1282 return NULL;
1247 vmcs = page_address(pages); 1283 vmcs = page_address(pages);
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
1294 if (boot_cpu_has(X86_FEATURE_NX)) 1330 if (boot_cpu_has(X86_FEATURE_NX))
1295 kvm_enable_efer_bits(EFER_NX); 1331 kvm_enable_efer_bits(EFER_NX);
1296 1332
1333 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0;
1335
1336 if (!cpu_has_vmx_ept())
1337 enable_ept = 0;
1338
1339 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0;
1341
1342 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL;
1344
1297 return alloc_kvm_area(); 1345 return alloc_kvm_area();
1298} 1346}
1299 1347
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1324 struct vcpu_vmx *vmx = to_vmx(vcpu); 1372 struct vcpu_vmx *vmx = to_vmx(vcpu);
1325 1373
1326 vmx->emulation_required = 1; 1374 vmx->emulation_required = 1;
1327 vcpu->arch.rmode.active = 0; 1375 vcpu->arch.rmode.vm86_active = 0;
1328 1376
1329 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1330 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1386 struct vcpu_vmx *vmx = to_vmx(vcpu); 1434 struct vcpu_vmx *vmx = to_vmx(vcpu);
1387 1435
1388 vmx->emulation_required = 1; 1436 vmx->emulation_required = 1;
1389 vcpu->arch.rmode.active = 1; 1437 vcpu->arch.rmode.vm86_active = 1;
1390 1438
1391 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1392 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1485static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1533static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1486{ 1534{
1487 vpid_sync_vcpu_all(to_vmx(vcpu)); 1535 vpid_sync_vcpu_all(to_vmx(vcpu));
1488 if (vm_need_ept()) 1536 if (enable_ept)
1489 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1537 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1490} 1538}
1491 1539
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1555 1603
1556 vmx_fpu_deactivate(vcpu); 1604 vmx_fpu_deactivate(vcpu);
1557 1605
1558 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) 1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1559 enter_pmode(vcpu); 1607 enter_pmode(vcpu);
1560 1608
1561 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) 1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1562 enter_rmode(vcpu); 1610 enter_rmode(vcpu);
1563 1611
1564#ifdef CONFIG_X86_64 1612#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1570 } 1618 }
1571#endif 1619#endif
1572 1620
1573 if (vm_need_ept()) 1621 if (enable_ept)
1574 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1622 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1575 1623
1576 vmcs_writel(CR0_READ_SHADOW, cr0); 1624 vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1599 u64 eptp; 1647 u64 eptp;
1600 1648
1601 guest_cr3 = cr3; 1649 guest_cr3 = cr3;
1602 if (vm_need_ept()) { 1650 if (enable_ept) {
1603 eptp = construct_eptp(cr3); 1651 eptp = construct_eptp(cr3);
1604 vmcs_write64(EPT_POINTER, eptp); 1652 vmcs_write64(EPT_POINTER, eptp);
1605 ept_sync_context(eptp); 1653 ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1616 1664
1617static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1618{ 1666{
1619 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? 1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1620 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1621 1669
1622 vcpu->arch.cr4 = cr4; 1670 vcpu->arch.cr4 = cr4;
1623 if (vm_need_ept()) 1671 if (enable_ept)
1624 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1672 ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1625 1673
1626 vmcs_writel(CR4_READ_SHADOW, cr4); 1674 vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1699 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1700 u32 ar; 1748 u32 ar;
1701 1749
1702 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { 1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1703 vcpu->arch.rmode.tr.selector = var->selector; 1751 vcpu->arch.rmode.tr.selector = var->selector;
1704 vcpu->arch.rmode.tr.base = var->base; 1752 vcpu->arch.rmode.tr.base = var->base;
1705 vcpu->arch.rmode.tr.limit = var->limit; 1753 vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1709 vmcs_writel(sf->base, var->base); 1757 vmcs_writel(sf->base, var->base);
1710 vmcs_write32(sf->limit, var->limit); 1758 vmcs_write32(sf->limit, var->limit);
1711 vmcs_write16(sf->selector, var->selector); 1759 vmcs_write16(sf->selector, var->selector);
1712 if (vcpu->arch.rmode.active && var->s) { 1760 if (vcpu->arch.rmode.vm86_active && var->s) {
1713 /* 1761 /*
1714 * Hack real-mode segments into vm86 compatibility. 1762 * Hack real-mode segments into vm86 compatibility.
1715 */ 1763 */
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
1982 pfn_t identity_map_pfn; 2030 pfn_t identity_map_pfn;
1983 u32 tmp; 2031 u32 tmp;
1984 2032
1985 if (!vm_need_ept()) 2033 if (!enable_ept)
1986 return 1; 2034 return 1;
1987 if (unlikely(!kvm->arch.ept_identity_pagetable)) { 2035 if (unlikely(!kvm->arch.ept_identity_pagetable)) {
1988 printk(KERN_ERR "EPT: identity-mapping pagetable " 2036 printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2071 int vpid; 2119 int vpid;
2072 2120
2073 vmx->vpid = 0; 2121 vmx->vpid = 0;
2074 if (!enable_vpid || !cpu_has_vmx_vpid()) 2122 if (!enable_vpid)
2075 return; 2123 return;
2076 spin_lock(&vmx_vpid_lock); 2124 spin_lock(&vmx_vpid_lock);
2077 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 2125 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2082 spin_unlock(&vmx_vpid_lock); 2130 spin_unlock(&vmx_vpid_lock);
2083} 2131}
2084 2132
2085static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 2133static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2086{ 2134{
2087 void *va; 2135 int f = sizeof(unsigned long);
2088 2136
2089 if (!cpu_has_vmx_msr_bitmap()) 2137 if (!cpu_has_vmx_msr_bitmap())
2090 return; 2138 return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2094 * have the write-low and read-high bitmap offsets the wrong way round. 2142 * have the write-low and read-high bitmap offsets the wrong way round.
2095 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 2143 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2096 */ 2144 */
2097 va = kmap(msr_bitmap);
2098 if (msr <= 0x1fff) { 2145 if (msr <= 0x1fff) {
2099 __clear_bit(msr, va + 0x000); /* read-low */ 2146 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2100 __clear_bit(msr, va + 0x800); /* write-low */ 2147 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2101 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 2148 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2102 msr &= 0x1fff; 2149 msr &= 0x1fff;
2103 __clear_bit(msr, va + 0x400); /* read-high */ 2150 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2104 __clear_bit(msr, va + 0xc00); /* write-high */ 2151 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2105 } 2152 }
2106 kunmap(msr_bitmap); 2153}
2154
2155static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2156{
2157 if (!longmode_only)
2158 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2159 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2107} 2160}
2108 2161
2109/* 2162/*
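
The rewritten helper indexes the MSR bitmap directly as an unsigned long array instead of kmapping a page; the layout it relies on is the one the comment describes: four 1 KiB quarters — read-low at 0x000, read-high at 0x400, write-low at 0x800, write-high at 0xc00 — one bit per MSR, with the high range starting at MSR 0xc0000000. A worked example of where a given MSR lands (example MSR numbers only):

#include <stdio.h>

static void msr_bitmap_slots(unsigned msr)
{
	unsigned bit, read_off, write_off;

	if (msr <= 0x1fff) {					/* low MSR range */
		bit = msr;
		read_off  = 0x000;
		write_off = 0x800;
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {	/* high range */
		bit = msr & 0x1fff;
		read_off  = 0x400;
		write_off = 0xc00;
	} else {
		printf("MSR %#x is not covered by the bitmap\n", msr);
		return;
	}
	printf("MSR %#x: clear bit %u at offset %#x (read) and %#x (write)\n",
	       msr, bit, read_off, write_off);
}

int main(void)
{
	msr_bitmap_slots(0x174);	/* IA32_SYSENTER_CS, low range */
	msr_bitmap_slots(0xc0000080);	/* EFER, high range */
	return 0;
}
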
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2121 u32 exec_control; 2174 u32 exec_control;
2122 2175
2123 /* I/O */ 2176 /* I/O */
2124 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 2177 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2125 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 2178 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2126 2179
2127 if (cpu_has_vmx_msr_bitmap()) 2180 if (cpu_has_vmx_msr_bitmap())
2128 vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); 2181 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2129 2182
2130 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 2183 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2131 2184
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2141 CPU_BASED_CR8_LOAD_EXITING; 2194 CPU_BASED_CR8_LOAD_EXITING;
2142#endif 2195#endif
2143 } 2196 }
2144 if (!vm_need_ept()) 2197 if (!enable_ept)
2145 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2198 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2146 CPU_BASED_CR3_LOAD_EXITING | 2199 CPU_BASED_CR3_LOAD_EXITING |
2147 CPU_BASED_INVLPG_EXITING; 2200 CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2154 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2207 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2155 if (vmx->vpid == 0) 2208 if (vmx->vpid == 0)
2156 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2157 if (!vm_need_ept()) 2210 if (!enable_ept)
2158 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2159 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2160 } 2213 }
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2273 goto out; 2326 goto out;
2274 } 2327 }
2275 2328
2276 vmx->vcpu.arch.rmode.active = 0; 2329 vmx->vcpu.arch.rmode.vm86_active = 0;
2277 2330
2278 vmx->soft_vnmi_blocked = 0; 2331 vmx->soft_vnmi_blocked = 0;
2279 2332
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2402 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2455 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2403} 2456}
2404 2457
2405static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2458static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2406{ 2459{
2407 struct vcpu_vmx *vmx = to_vmx(vcpu); 2460 struct vcpu_vmx *vmx = to_vmx(vcpu);
2461 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr;
2408 2463
2409 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2410 2465
2411 ++vcpu->stat.irq_injections; 2466 ++vcpu->stat.irq_injections;
2412 if (vcpu->arch.rmode.active) { 2467 if (vcpu->arch.rmode.vm86_active) {
2413 vmx->rmode.irq.pending = true; 2468 vmx->rmode.irq.pending = true;
2414 vmx->rmode.irq.vector = irq; 2469 vmx->rmode.irq.vector = irq;
2415 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2419 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); 2474 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2420 return; 2475 return;
2421 } 2476 }
2422 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2477 intr = irq | INTR_INFO_VALID_MASK;
2423 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2478 if (vcpu->arch.interrupt.soft) {
2479 intr |= INTR_TYPE_SOFT_INTR;
2480 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2481 vmx->vcpu.arch.event_exit_inst_len);
2482 } else
2483 intr |= INTR_TYPE_EXT_INTR;
2484 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2424} 2485}
2425 2486
2426static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2487static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
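
vmx_inject_irq() now distinguishes soft interrupts (INTn) from external ones: both set the valid bit, but a soft interrupt carries the software-interrupt type and requires VM_ENTRY_INSTRUCTION_LEN so the guest resumes after the INTn. A sketch of how the field is assembled (encodings — valid bit 31, type in bits 10:8, external = 0, software interrupt = 4 — are assumed from the VMX event-injection format):

#include <stdio.h>

#define INTR_VALID	(1u << 31)
#define TYPE_EXT_INTR	(0u << 8)
#define TYPE_SOFT_INTR	(4u << 8)

static unsigned build_irq_intr_info(unsigned char vector, int soft)
{
	unsigned intr = vector | INTR_VALID;

	/* a soft interrupt would also need VM_ENTRY_INSTRUCTION_LEN to be
	 * programmed; that part is omitted here */
	intr |= soft ? TYPE_SOFT_INTR : TYPE_EXT_INTR;
	return intr;
}

int main(void)
{
	printf("%#x\n", build_irq_intr_info(0x20, 0));	/* external vector 0x20 */
	printf("%#x\n", build_irq_intr_info(0x80, 1));	/* int 0x80 issued by the guest */
	return 0;
}
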
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2441 } 2502 }
2442 2503
2443 ++vcpu->stat.nmi_injections; 2504 ++vcpu->stat.nmi_injections;
2444 if (vcpu->arch.rmode.active) { 2505 if (vcpu->arch.rmode.vm86_active) {
2445 vmx->rmode.irq.pending = true; 2506 vmx->rmode.irq.pending = true;
2446 vmx->rmode.irq.vector = NMI_VECTOR; 2507 vmx->rmode.irq.vector = NMI_VECTOR;
2447 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2456 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2517 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2457} 2518}
2458 2519
2459static void vmx_update_window_states(struct kvm_vcpu *vcpu) 2520static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2460{ 2521{
2461 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2462
2463 vcpu->arch.nmi_window_open =
2464 !(guest_intr & (GUEST_INTR_STATE_STI |
2465 GUEST_INTR_STATE_MOV_SS |
2466 GUEST_INTR_STATE_NMI));
2467 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 2522 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2468 vcpu->arch.nmi_window_open = 0; 2523 return 0;
2469
2470 vcpu->arch.interrupt_window_open =
2471 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2472 !(guest_intr & (GUEST_INTR_STATE_STI |
2473 GUEST_INTR_STATE_MOV_SS)));
2474}
2475
2476static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2477{
2478 int word_index = __ffs(vcpu->arch.irq_summary);
2479 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2480 int irq = word_index * BITS_PER_LONG + bit_index;
2481 2524
2482 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2525 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2483 if (!vcpu->arch.irq_pending[word_index]) 2526 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2484 clear_bit(word_index, &vcpu->arch.irq_summary); 2527 GUEST_INTR_STATE_NMI));
2485 kvm_queue_interrupt(vcpu, irq);
2486} 2528}
2487 2529
2488static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2530static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2489 struct kvm_run *kvm_run)
2490{ 2531{
2491 vmx_update_window_states(vcpu); 2532 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2492 2533 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2493 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 2534 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2494 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2495 GUEST_INTR_STATE_STI |
2496 GUEST_INTR_STATE_MOV_SS);
2497
2498 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2499 if (vcpu->arch.interrupt.pending) {
2500 enable_nmi_window(vcpu);
2501 } else if (vcpu->arch.nmi_window_open) {
2502 vcpu->arch.nmi_pending = false;
2503 vcpu->arch.nmi_injected = true;
2504 } else {
2505 enable_nmi_window(vcpu);
2506 return;
2507 }
2508 }
2509 if (vcpu->arch.nmi_injected) {
2510 vmx_inject_nmi(vcpu);
2511 if (vcpu->arch.nmi_pending)
2512 enable_nmi_window(vcpu);
2513 else if (vcpu->arch.irq_summary
2514 || kvm_run->request_interrupt_window)
2515 enable_irq_window(vcpu);
2516 return;
2517 }
2518
2519 if (vcpu->arch.interrupt_window_open) {
2520 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2521 kvm_do_inject_irq(vcpu);
2522
2523 if (vcpu->arch.interrupt.pending)
2524 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2525 }
2526 if (!vcpu->arch.interrupt_window_open &&
2527 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2528 enable_irq_window(vcpu);
2529} 2535}
2530 2536
2531static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2537static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
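
Both vendors now answer "may an event be injected right now?" with a small predicate: an NMI is blocked by an interrupt shadow or by a previous NMI whose IRET has not completed, and an external interrupt additionally needs EFLAGS.IF (plus, on SVM, GIF). An illustrative restatement with invented field names; the VMX variant effectively has gif always true:

#include <stdbool.h>
#include <stdio.h>

struct guest_state {
	bool intr_shadow;	/* STI / MOV-SS shadow */
	bool in_nmi;		/* previous NMI not yet completed by IRET */
	bool rflags_if;		/* EFLAGS.IF */
	bool gif;		/* global interrupt flag (SVM only) */
};

static bool nmi_allowed(const struct guest_state *g)
{
	return !g->intr_shadow && !g->in_nmi;
}

static bool interrupt_allowed(const struct guest_state *g)
{
	return g->rflags_if && !g->intr_shadow && g->gif;
}

int main(void)
{
	struct guest_state g = { .rflags_if = true, .gif = true };

	printf("nmi=%d irq=%d\n", nmi_allowed(&g), interrupt_allowed(&g));
	return 0;
}
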
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2585 return 0; 2591 return 0;
2586} 2592}
2587 2593
2594/*
2595 * Trigger machine check on the host. We assume all the MSRs are already set up
2596 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2597 * We pass a fake environment to the machine check handler because we want
2598 * the guest to always be treated like user space, no matter what context
2599 * it used internally.
2600 */
2601static void kvm_machine_check(void)
2602{
2603#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2604 struct pt_regs regs = {
2605 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2606 .flags = X86_EFLAGS_IF,
2607 };
2608
2609 do_machine_check(&regs, 0);
2610#endif
2611}
2612
2613static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2614{
2615 /* already handled by vcpu_run */
2616 return 1;
2617}
2618
2588static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2619static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2589{ 2620{
2590 struct vcpu_vmx *vmx = to_vmx(vcpu); 2621 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2596 vect_info = vmx->idt_vectoring_info; 2627 vect_info = vmx->idt_vectoring_info;
2597 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2628 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2598 2629
2630 if (is_machine_check(intr_info))
2631 return handle_machine_check(vcpu, kvm_run);
2632
2599 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2633 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2600 !is_page_fault(intr_info)) 2634 !is_page_fault(intr_info))
2601 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2635 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2602 "intr info 0x%x\n", __func__, vect_info, intr_info); 2636 "intr info 0x%x\n", __func__, vect_info, intr_info);
2603 2637
2604 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
2605 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2606 set_bit(irq, vcpu->arch.irq_pending);
2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2608 }
2609
2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2638 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2611 return 1; /* already handled by vmx_vcpu_run() */ 2639 return 1; /* already handled by vmx_vcpu_run() */
2612 2640
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2628 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2656 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2629 if (is_page_fault(intr_info)) { 2657 if (is_page_fault(intr_info)) {
2630 /* EPT won't cause page fault directly */ 2658 /* EPT won't cause page fault directly */
2631 if (vm_need_ept()) 2659 if (enable_ept)
2632 BUG(); 2660 BUG();
2633 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2661 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2634 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2635 (u32)((u64)cr2 >> 32), handler); 2663 (u32)((u64)cr2 >> 32), handler);
2636 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) 2664 if (kvm_event_needs_reinjection(vcpu))
2637 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2665 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2638 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2666 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2639 } 2667 }
2640 2668
2641 if (vcpu->arch.rmode.active && 2669 if (vcpu->arch.rmode.vm86_active &&
2642 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2643 error_code)) { 2671 error_code)) {
2644 if (vcpu->arch.halt_request) { 2672 if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2753 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2754 skip_emulated_instruction(vcpu); 2782 skip_emulated_instruction(vcpu);
2755 return 1; 2783 return 1;
2756 case 8: 2784 case 8: {
2757 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); 2785 u8 cr8_prev = kvm_get_cr8(vcpu);
2758 skip_emulated_instruction(vcpu); 2786 u8 cr8 = kvm_register_read(vcpu, reg);
2759 if (irqchip_in_kernel(vcpu->kvm)) 2787 kvm_set_cr8(vcpu, cr8);
2760 return 1; 2788 skip_emulated_instruction(vcpu);
2761 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2789 if (irqchip_in_kernel(vcpu->kvm))
2762 return 0; 2790 return 1;
2791 if (cr8_prev <= cr8)
2792 return 1;
2793 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2794 return 0;
2795 }
2763 }; 2796 };
2764 break; 2797 break;
2765 case 2: /* clts */ 2798 case 2: /* clts */
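
The reworked CR8 case in handle_cr() above returns to user space only when the guest lowers CR8 (its task-priority register) and there is no in-kernel irqchip: raising the TPR can never unmask a pending interrupt, so those writes can stay in the kernel. A minimal stand-alone sketch of that decision, with an illustrative function name rather than real KVM API:

#include <stdbool.h>

/* true == the VMM should exit to user space so it can re-evaluate
 * interrupt delivery after the TPR change. */
bool cr8_write_needs_userspace_exit(unsigned char cr8_prev, unsigned char cr8_new,
				    bool irqchip_in_kernel)
{
	if (irqchip_in_kernel)
		return false;		/* kernel LAPIC tracks the TPR itself */
	if (cr8_prev <= cr8_new)
		return false;		/* TPR raised or unchanged: nothing becomes deliverable */
	return true;			/* TPR lowered: a blocked interrupt may now fire */
}
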
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2957 * If the user space waits to inject interrupts, exit as soon as 2990 * If the user space waits to inject interrupts, exit as soon as
2958 * possible 2991 * possible
2959 */ 2992 */
2960 if (kvm_run->request_interrupt_window && 2993 if (!irqchip_in_kernel(vcpu->kvm) &&
2961 !vcpu->arch.irq_summary) { 2994 kvm_run->request_interrupt_window &&
2995 !kvm_cpu_has_interrupt(vcpu)) {
2962 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2996 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2963 return 0; 2997 return 0;
2964 } 2998 }
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2980 3014
2981static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2982{ 3016{
2983 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2984 3018
2985 kvm_mmu_invlpg(vcpu, exit_qualification); 3019 kvm_mmu_invlpg(vcpu, exit_qualification);
2986 skip_emulated_instruction(vcpu); 3020 skip_emulated_instruction(vcpu);
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2996 3030
2997static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3031static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2998{ 3032{
2999 u64 exit_qualification; 3033 unsigned long exit_qualification;
3000 enum emulation_result er; 3034 enum emulation_result er;
3001 unsigned long offset; 3035 unsigned long offset;
3002 3036
3003 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3004 offset = exit_qualification & 0xffful; 3038 offset = exit_qualification & 0xffful;
3005 3039
3006 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3040 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3019 struct vcpu_vmx *vmx = to_vmx(vcpu); 3053 struct vcpu_vmx *vmx = to_vmx(vcpu);
3020 unsigned long exit_qualification; 3054 unsigned long exit_qualification;
3021 u16 tss_selector; 3055 u16 tss_selector;
3022 int reason; 3056 int reason, type, idt_v;
3057
3058 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3059 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3023 3060
3024 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3061 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3025 3062
3026 reason = (u32)exit_qualification >> 30; 3063 reason = (u32)exit_qualification >> 30;
3027 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && 3064 if (reason == TASK_SWITCH_GATE && idt_v) {
3028 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 3065 switch (type) {
3029 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) 3066 case INTR_TYPE_NMI_INTR:
3030 == INTR_TYPE_NMI_INTR) { 3067 vcpu->arch.nmi_injected = false;
3031 vcpu->arch.nmi_injected = false; 3068 if (cpu_has_virtual_nmis())
3032 if (cpu_has_virtual_nmis()) 3069 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3033 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3070 GUEST_INTR_STATE_NMI);
3034 GUEST_INTR_STATE_NMI); 3071 break;
3072 case INTR_TYPE_EXT_INTR:
3073 case INTR_TYPE_SOFT_INTR:
3074 kvm_clear_interrupt_queue(vcpu);
3075 break;
3076 case INTR_TYPE_HARD_EXCEPTION:
3077 case INTR_TYPE_SOFT_EXCEPTION:
3078 kvm_clear_exception_queue(vcpu);
3079 break;
3080 default:
3081 break;
3082 }
3035 } 3083 }
3036 tss_selector = exit_qualification; 3084 tss_selector = exit_qualification;
3037 3085
3086 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3087 type != INTR_TYPE_EXT_INTR &&
3088 type != INTR_TYPE_NMI_INTR))
3089 skip_emulated_instruction(vcpu);
3090
3038 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3091 if (!kvm_task_switch(vcpu, tss_selector, reason))
3039 return 0; 3092 return 0;
3040 3093
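
The rule added at the end of handle_task_switch() above is that RIP is advanced only when the task switch was triggered by an instruction (JMP, CALL or IRET through a TSS), not when it was the side effect of a hardware-delivered event recorded in the IDT-vectoring field. A stand-alone sketch of that predicate, with illustrative constants in place of the INTR_TYPE_* values:

enum event_type { EV_EXT_INTR, EV_NMI, EV_HARD_EXCEPTION, EV_SOFT_INTR, EV_SOFT_EXCEPTION };

/* true == call skip_emulated_instruction(); mirrors the !idt_v || ... test above. */
int task_switch_should_advance_rip(int idt_vectoring_valid, enum event_type type)
{
	if (!idt_vectoring_valid)
		return 1;	/* switch came straight from an instruction */
	return type != EV_HARD_EXCEPTION &&
	       type != EV_EXT_INTR &&
	       type != EV_NMI;	/* software INT/INT3/INTO still advance RIP */
}
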
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3051 3104
3052static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3105static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053{ 3106{
3054 u64 exit_qualification; 3107 unsigned long exit_qualification;
3055 gpa_t gpa; 3108 gpa_t gpa;
3056 int gla_validity; 3109 int gla_validity;
3057 3110
3058 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3111 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3059 3112
3060 if (exit_qualification & (1 << 6)) { 3113 if (exit_qualification & (1 << 6)) {
3061 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3114 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3067 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 3120 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3068 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 3121 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3069 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 3122 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3070 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); 3123 vmcs_readl(GUEST_LINEAR_ADDRESS));
3071 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3124 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3072 (long unsigned int)exit_qualification); 3125 (long unsigned int)exit_qualification);
3073 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3126 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3150 [EXIT_REASON_WBINVD] = handle_wbinvd, 3203 [EXIT_REASON_WBINVD] = handle_wbinvd,
3151 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3204 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3152 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3205 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3206 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3153}; 3207};
3154 3208
3155static const int kvm_vmx_max_exit_handlers = 3209static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers =
3159 * The guest has exited. See if we can fix it or if we need userspace 3213 * The guest has exited. See if we can fix it or if we need userspace
3160 * assistance. 3214 * assistance.
3161 */ 3215 */
3162static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3216static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3163{ 3217{
3164 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
3165 struct vcpu_vmx *vmx = to_vmx(vcpu); 3218 struct vcpu_vmx *vmx = to_vmx(vcpu);
3219 u32 exit_reason = vmx->exit_reason;
3166 u32 vectoring_info = vmx->idt_vectoring_info; 3220 u32 vectoring_info = vmx->idt_vectoring_info;
3167 3221
3168 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3222 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3178 3232
3179 /* Access CR3 don't cause VMExit in paging mode, so we need 3233 /* Access CR3 don't cause VMExit in paging mode, so we need
3180 * to sync with guest real CR3. */ 3234 * to sync with guest real CR3. */
3181 if (vm_need_ept() && is_paging(vcpu)) { 3235 if (enable_ept && is_paging(vcpu)) {
3182 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3236 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3183 ept_load_pdptrs(vcpu); 3237 ept_load_pdptrs(vcpu);
3184 } 3238 }
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3199 __func__, vectoring_info, exit_reason); 3253 __func__, vectoring_info, exit_reason);
3200 3254
3201 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 3255 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3202 if (vcpu->arch.interrupt_window_open) { 3256 if (vmx_interrupt_allowed(vcpu)) {
3203 vmx->soft_vnmi_blocked = 0; 3257 vmx->soft_vnmi_blocked = 0;
3204 vcpu->arch.nmi_window_open = 1;
3205 } else if (vmx->vnmi_blocked_time > 1000000000LL && 3258 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3206 vcpu->arch.nmi_pending) { 3259 vcpu->arch.nmi_pending) {
3207 /* 3260 /*
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3214 "state on VCPU %d after 1 s timeout\n", 3267 "state on VCPU %d after 1 s timeout\n",
3215 __func__, vcpu->vcpu_id); 3268 __func__, vcpu->vcpu_id);
3216 vmx->soft_vnmi_blocked = 0; 3269 vmx->soft_vnmi_blocked = 0;
3217 vmx->vcpu.arch.nmi_window_open = 1;
3218 } 3270 }
3219 } 3271 }
3220 3272
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3228 return 0; 3280 return 0;
3229} 3281}
3230 3282
3231static void update_tpr_threshold(struct kvm_vcpu *vcpu) 3283static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3232{ 3284{
3233 int max_irr, tpr; 3285 if (irr == -1 || tpr < irr) {
3234
3235 if (!vm_need_tpr_shadow(vcpu->kvm))
3236 return;
3237
3238 if (!kvm_lapic_enabled(vcpu) ||
3239 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
3240 vmcs_write32(TPR_THRESHOLD, 0); 3286 vmcs_write32(TPR_THRESHOLD, 0);
3241 return; 3287 return;
3242 } 3288 }
3243 3289
3244 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; 3290 vmcs_write32(TPR_THRESHOLD, irr);
3245 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3246} 3291}
3247 3292
3248static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3293static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3249{ 3294{
3250 u32 exit_intr_info; 3295 u32 exit_intr_info;
3251 u32 idt_vectoring_info; 3296 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3252 bool unblock_nmi; 3297 bool unblock_nmi;
3253 u8 vector; 3298 u8 vector;
3254 int type; 3299 int type;
3255 bool idtv_info_valid; 3300 bool idtv_info_valid;
3256 u32 error;
3257 3301
3258 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3302 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3303
3304 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3305
3306 /* Handle machine checks before interrupts are enabled */
3307 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3308 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3309 && is_machine_check(exit_intr_info)))
3310 kvm_machine_check();
3311
3312 /* We need to handle NMIs before interrupts are enabled */
3313 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3314 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3315 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3316 asm("int $2");
3317 }
3318
3319 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3320
3259 if (cpu_has_virtual_nmis()) { 3321 if (cpu_has_virtual_nmis()) {
3260 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3322 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3261 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 3323 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3262 /* 3324 /*
3263 * SDM 3: 25.7.1.2 3325 * SDM 3: 27.7.1.2 (September 2008)
3264 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3326 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3265 * a guest IRET fault. 3327 * a guest IRET fault.
3328 * SDM 3: 23.2.2 (September 2008)
3329 * Bit 12 is undefined in any of the following cases:
3330 * If the VM exit sets the valid bit in the IDT-vectoring
3331 * information field.
3332 * If the VM exit is due to a double fault.
3266 */ 3333 */
3267 if (unblock_nmi && vector != DF_VECTOR) 3334 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3335 vector != DF_VECTOR && !idtv_info_valid)
3268 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3336 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3269 GUEST_INTR_STATE_NMI); 3337 GUEST_INTR_STATE_NMI);
3270 } else if (unlikely(vmx->soft_vnmi_blocked)) 3338 } else if (unlikely(vmx->soft_vnmi_blocked))
3271 vmx->vnmi_blocked_time += 3339 vmx->vnmi_blocked_time +=
3272 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3340 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3273 3341
3274 idt_vectoring_info = vmx->idt_vectoring_info; 3342 vmx->vcpu.arch.nmi_injected = false;
3275 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3343 kvm_clear_exception_queue(&vmx->vcpu);
3344 kvm_clear_interrupt_queue(&vmx->vcpu);
3345
3346 if (!idtv_info_valid)
3347 return;
3348
3276 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3349 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3277 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3350 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3278 if (vmx->vcpu.arch.nmi_injected) { 3351
3352 switch (type) {
3353 case INTR_TYPE_NMI_INTR:
3354 vmx->vcpu.arch.nmi_injected = true;
3279 /* 3355 /*
3280 * SDM 3: 25.7.1.2 3356 * SDM 3: 27.7.1.2 (September 2008)
3281 * Clear bit "block by NMI" before VM entry if a NMI delivery 3357 * Clear bit "block by NMI" before VM entry if a NMI
3282 * faulted. 3358 * delivery faulted.
3283 */ 3359 */
3284 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) 3360 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3285 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 3361 GUEST_INTR_STATE_NMI);
3286 GUEST_INTR_STATE_NMI); 3362 break;
3287 else 3363 case INTR_TYPE_SOFT_EXCEPTION:
3288 vmx->vcpu.arch.nmi_injected = false; 3364 vmx->vcpu.arch.event_exit_inst_len =
3289 } 3365 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3290 kvm_clear_exception_queue(&vmx->vcpu); 3366 /* fall through */
3291 if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || 3367 case INTR_TYPE_HARD_EXCEPTION:
3292 type == INTR_TYPE_SOFT_EXCEPTION)) {
3293 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3368 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3294 error = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3369 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3295 kvm_queue_exception_e(&vmx->vcpu, vector, error); 3370 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3296 } else 3371 } else
3297 kvm_queue_exception(&vmx->vcpu, vector); 3372 kvm_queue_exception(&vmx->vcpu, vector);
3298 vmx->idt_vectoring_info = 0; 3373 break;
3299 } 3374 case INTR_TYPE_SOFT_INTR:
3300 kvm_clear_interrupt_queue(&vmx->vcpu); 3375 vmx->vcpu.arch.event_exit_inst_len =
3301 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { 3376 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3302 kvm_queue_interrupt(&vmx->vcpu, vector); 3377 /* fall through */
3303 vmx->idt_vectoring_info = 0; 3378 case INTR_TYPE_EXT_INTR:
3304 } 3379 kvm_queue_interrupt(&vmx->vcpu, vector,
3305} 3380 type == INTR_TYPE_SOFT_INTR);
3306 3381 break;
3307static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3382 default:
3308{ 3383 break;
3309 update_tpr_threshold(vcpu);
3310
3311 vmx_update_window_states(vcpu);
3312
3313 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3314 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3315 GUEST_INTR_STATE_STI |
3316 GUEST_INTR_STATE_MOV_SS);
3317
3318 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3319 if (vcpu->arch.interrupt.pending) {
3320 enable_nmi_window(vcpu);
3321 } else if (vcpu->arch.nmi_window_open) {
3322 vcpu->arch.nmi_pending = false;
3323 vcpu->arch.nmi_injected = true;
3324 } else {
3325 enable_nmi_window(vcpu);
3326 return;
3327 }
3328 }
3329 if (vcpu->arch.nmi_injected) {
3330 vmx_inject_nmi(vcpu);
3331 if (vcpu->arch.nmi_pending)
3332 enable_nmi_window(vcpu);
3333 else if (kvm_cpu_has_interrupt(vcpu))
3334 enable_irq_window(vcpu);
3335 return;
3336 }
3337 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3338 if (vcpu->arch.interrupt_window_open)
3339 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3340 else
3341 enable_irq_window(vcpu);
3342 }
3343 if (vcpu->arch.interrupt.pending) {
3344 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3345 if (kvm_cpu_has_interrupt(vcpu))
3346 enable_irq_window(vcpu);
3347 } 3384 }
3348} 3385}
3349 3386
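
With the rewrite above, the VMX side of CR8 interception is reduced to programming TPR_THRESHOLD from two values handed in by common code: the guest's current TPR and the priority of the highest pending interrupt (irr, with -1 meaning none). If the pending vector is already deliverable, or nothing is pending, the threshold is cleared; otherwise it is set so the guest exits as soon as it lowers its TPR far enough. A stand-alone sketch of the value written (illustrative only, not kernel code):

/* Value for TPR_THRESHOLD; 0 disables the TPR-below-threshold exit. */
int tpr_threshold_for(int tpr, int irr)
{
	if (irr == -1 || tpr < irr)
		return 0;	/* no pending irq, or it is already deliverable */
	return irr;		/* exit once the guest drops TPR below irr's class */
}
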
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3381static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3418static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3382{ 3419{
3383 struct vcpu_vmx *vmx = to_vmx(vcpu); 3420 struct vcpu_vmx *vmx = to_vmx(vcpu);
3384 u32 intr_info;
3385 3421
3386 /* Record the guest's net vcpu time for enforced NMI injections. */ 3422 /* Record the guest's net vcpu time for enforced NMI injections. */
3387 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3423 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3505 if (vmx->rmode.irq.pending) 3541 if (vmx->rmode.irq.pending)
3506 fixup_rmode_irq(vmx); 3542 fixup_rmode_irq(vmx);
3507 3543
3508 vmx_update_window_states(vcpu);
3509
3510 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3544 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3511 vmx->launched = 1; 3545 vmx->launched = 1;
3512 3546
3513 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3514
3515 /* We need to handle NMIs before interrupts are enabled */
3516 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3517 (intr_info & INTR_INFO_VALID_MASK)) {
3518 KVMTRACE_0D(NMI, vcpu, handler);
3519 asm("int $2");
3520 }
3521
3522 vmx_complete_interrupts(vmx); 3547 vmx_complete_interrupts(vmx);
3523} 3548}
3524 3549
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3593 if (alloc_apic_access_page(kvm) != 0) 3618 if (alloc_apic_access_page(kvm) != 0)
3594 goto free_vmcs; 3619 goto free_vmcs;
3595 3620
3596 if (vm_need_ept()) 3621 if (enable_ept)
3597 if (alloc_identity_pagetable(kvm) != 0) 3622 if (alloc_identity_pagetable(kvm) != 0)
3598 goto free_vmcs; 3623 goto free_vmcs;
3599 3624
@@ -3631,9 +3656,32 @@ static int get_ept_level(void)
3631 return VMX_EPT_DEFAULT_GAW + 1; 3656 return VMX_EPT_DEFAULT_GAW + 1;
3632} 3657}
3633 3658
3634static int vmx_get_mt_mask_shift(void) 3659static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3635{ 3660{
3636 return VMX_EPT_MT_EPTE_SHIFT; 3661 u64 ret;
3662
3663 /* For VT-d and EPT combination
3664 * 1. MMIO: always map as UC
3665 * 2. EPT with VT-d:
3666 * a. VT-d without snooping control feature: can't guarantee the
3667 * result, try to trust guest.
3668 * b. VT-d with snooping control feature: snooping control feature of
3669 * VT-d engine can guarantee the cache correctness. Just set it
3670 * to WB to keep consistent with host. So the same as item 3.
3671 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
3672 * consistent with host MTRR
3673 */
3674 if (is_mmio)
3675 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3676 else if (vcpu->kvm->arch.iommu_domain &&
3677 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3678 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3679 VMX_EPT_MT_EPTE_SHIFT;
3680 else
3681 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3682 | VMX_EPT_IGMT_BIT;
3683
3684 return ret;
3637} 3685}
3638 3686
3639static struct kvm_x86_ops vmx_x86_ops = { 3687static struct kvm_x86_ops vmx_x86_ops = {
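
The comment block in vmx_get_mt_mask() above encodes three cases: MMIO is always uncacheable, a non-coherent VT-d assignment has to trust the guest's memory type, and everything else is forced to write-back with the guest's type ignored (the IGMT bit). A self-contained model of that selection; the constants below are placeholders standing in for the kernel's MTRR and VMX definitions:

#define MT_UC		0ULL		/* uncacheable */
#define MT_WB		6ULL		/* write-back */
#define EPT_MT_SHIFT	3		/* memory type field in the EPT PTE */
#define EPT_IGMT_BIT	(1ULL << 6)	/* "ignore guest memory type" */

unsigned long long ept_memtype_bits(int is_mmio, int noncoherent_iommu,
				    unsigned long long guest_mt)
{
	if (is_mmio)
		return MT_UC << EPT_MT_SHIFT;		/* case 1: MMIO maps UC */
	if (noncoherent_iommu)
		return guest_mt << EPT_MT_SHIFT;	/* case 2a: trust the guest */
	return (MT_WB << EPT_MT_SHIFT) | EPT_IGMT_BIT;	/* cases 2b and 3: force WB */
}
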
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3644 .check_processor_compatibility = vmx_check_processor_compat, 3692 .check_processor_compatibility = vmx_check_processor_compat,
3645 .hardware_enable = hardware_enable, 3693 .hardware_enable = hardware_enable,
3646 .hardware_disable = hardware_disable, 3694 .hardware_disable = hardware_disable,
3647 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, 3695 .cpu_has_accelerated_tpr = report_flexpriority,
3648 3696
3649 .vcpu_create = vmx_create_vcpu, 3697 .vcpu_create = vmx_create_vcpu,
3650 .vcpu_free = vmx_free_vcpu, 3698 .vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
3678 .tlb_flush = vmx_flush_tlb, 3726 .tlb_flush = vmx_flush_tlb,
3679 3727
3680 .run = vmx_vcpu_run, 3728 .run = vmx_vcpu_run,
3681 .handle_exit = kvm_handle_exit, 3729 .handle_exit = vmx_handle_exit,
3682 .skip_emulated_instruction = skip_emulated_instruction, 3730 .skip_emulated_instruction = skip_emulated_instruction,
3731 .set_interrupt_shadow = vmx_set_interrupt_shadow,
3732 .get_interrupt_shadow = vmx_get_interrupt_shadow,
3683 .patch_hypercall = vmx_patch_hypercall, 3733 .patch_hypercall = vmx_patch_hypercall,
3684 .get_irq = vmx_get_irq,
3685 .set_irq = vmx_inject_irq, 3734 .set_irq = vmx_inject_irq,
3735 .set_nmi = vmx_inject_nmi,
3686 .queue_exception = vmx_queue_exception, 3736 .queue_exception = vmx_queue_exception,
3687 .exception_injected = vmx_exception_injected, 3737 .interrupt_allowed = vmx_interrupt_allowed,
3688 .inject_pending_irq = vmx_intr_assist, 3738 .nmi_allowed = vmx_nmi_allowed,
3689 .inject_pending_vectors = do_interrupt_requests, 3739 .enable_nmi_window = enable_nmi_window,
3740 .enable_irq_window = enable_irq_window,
3741 .update_cr8_intercept = update_cr8_intercept,
3690 3742
3691 .set_tss_addr = vmx_set_tss_addr, 3743 .set_tss_addr = vmx_set_tss_addr,
3692 .get_tdp_level = get_ept_level, 3744 .get_tdp_level = get_ept_level,
3693 .get_mt_mask_shift = vmx_get_mt_mask_shift, 3745 .get_mt_mask = vmx_get_mt_mask,
3694}; 3746};
3695 3747
3696static int __init vmx_init(void) 3748static int __init vmx_init(void)
3697{ 3749{
3698 void *va;
3699 int r; 3750 int r;
3700 3751
3701 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3752 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3702 if (!vmx_io_bitmap_a) 3753 if (!vmx_io_bitmap_a)
3703 return -ENOMEM; 3754 return -ENOMEM;
3704 3755
3705 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3756 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3706 if (!vmx_io_bitmap_b) { 3757 if (!vmx_io_bitmap_b) {
3707 r = -ENOMEM; 3758 r = -ENOMEM;
3708 goto out; 3759 goto out;
3709 } 3760 }
3710 3761
3711 vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3762 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3712 if (!vmx_msr_bitmap) { 3763 if (!vmx_msr_bitmap_legacy) {
3713 r = -ENOMEM; 3764 r = -ENOMEM;
3714 goto out1; 3765 goto out1;
3715 } 3766 }
3716 3767
3768 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3769 if (!vmx_msr_bitmap_longmode) {
3770 r = -ENOMEM;
3771 goto out2;
3772 }
3773
3717 /* 3774 /*
3718 * Allow direct access to the PC debug port (it is often used for I/O 3775 * Allow direct access to the PC debug port (it is often used for I/O
3719 * delays, but the vmexits simply slow things down). 3776 * delays, but the vmexits simply slow things down).
3720 */ 3777 */
3721 va = kmap(vmx_io_bitmap_a); 3778 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3722 memset(va, 0xff, PAGE_SIZE); 3779 clear_bit(0x80, vmx_io_bitmap_a);
3723 clear_bit(0x80, va);
3724 kunmap(vmx_io_bitmap_a);
3725 3780
3726 va = kmap(vmx_io_bitmap_b); 3781 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3727 memset(va, 0xff, PAGE_SIZE);
3728 kunmap(vmx_io_bitmap_b);
3729 3782
3730 va = kmap(vmx_msr_bitmap); 3783 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3731 memset(va, 0xff, PAGE_SIZE); 3784 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3732 kunmap(vmx_msr_bitmap);
3733 3785
3734 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 3786 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3735 3787
3736 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 3788 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3737 if (r) 3789 if (r)
3738 goto out2; 3790 goto out3;
3739 3791
3740 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); 3792 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3741 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); 3793 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3742 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); 3794 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3743 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3795 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3744 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3796 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3797 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3745 3798
3746 if (vm_need_ept()) { 3799 if (enable_ept) {
3747 bypass_guest_pf = 0; 3800 bypass_guest_pf = 0;
3748 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3801 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3749 VMX_EPT_WRITABLE_MASK); 3802 VMX_EPT_WRITABLE_MASK);
3750 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3803 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3751 VMX_EPT_EXECUTABLE_MASK, 3804 VMX_EPT_EXECUTABLE_MASK);
3752 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3753 kvm_enable_tdp(); 3805 kvm_enable_tdp();
3754 } else 3806 } else
3755 kvm_disable_tdp(); 3807 kvm_disable_tdp();
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void)
3761 3813
3762 return 0; 3814 return 0;
3763 3815
3816out3:
3817 free_page((unsigned long)vmx_msr_bitmap_longmode);
3764out2: 3818out2:
3765 __free_page(vmx_msr_bitmap); 3819 free_page((unsigned long)vmx_msr_bitmap_legacy);
3766out1: 3820out1:
3767 __free_page(vmx_io_bitmap_b); 3821 free_page((unsigned long)vmx_io_bitmap_b);
3768out: 3822out:
3769 __free_page(vmx_io_bitmap_a); 3823 free_page((unsigned long)vmx_io_bitmap_a);
3770 return r; 3824 return r;
3771} 3825}
3772 3826
3773static void __exit vmx_exit(void) 3827static void __exit vmx_exit(void)
3774{ 3828{
3775 __free_page(vmx_msr_bitmap); 3829 free_page((unsigned long)vmx_msr_bitmap_legacy);
3776 __free_page(vmx_io_bitmap_b); 3830 free_page((unsigned long)vmx_msr_bitmap_longmode);
3777 __free_page(vmx_io_bitmap_a); 3831 free_page((unsigned long)vmx_io_bitmap_b);
3832 free_page((unsigned long)vmx_io_bitmap_a);
3778 3833
3779 kvm_exit(); 3834 kvm_exit();
3780} 3835}
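
The vmx_init() changes above swap the struct page plus kmap()/kunmap() bitmaps for plain __get_free_page() allocations that can be written directly and unwound with free_page() on error. The same allocate / initialise / goto-unwind shape, reduced to a small user-space sketch (malloc() standing in for page allocation; names are illustrative):

#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

int init_bitmaps(unsigned char **io_a, unsigned char **io_b, unsigned char **msr)
{
	*io_a = malloc(PAGE_SIZE);
	if (!*io_a)
		return -1;
	*io_b = malloc(PAGE_SIZE);
	if (!*io_b)
		goto free_a;
	*msr = malloc(PAGE_SIZE);
	if (!*msr)
		goto free_b;

	memset(*io_a, 0xff, PAGE_SIZE);			/* intercept every port... */
	(*io_a)[0x80 / 8] &= ~(1u << (0x80 % 8));	/* ...except the PC debug port */
	memset(*io_b, 0xff, PAGE_SIZE);
	memset(*msr, 0xff, PAGE_SIZE);			/* intercept every MSR by default */
	return 0;

free_b:
	free(*io_b);
free_a:
	free(*io_a);
	return -1;
}
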
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e794..249540f98513 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
92 { "hypercalls", VCPU_STAT(hypercalls) }, 92 { "hypercalls", VCPU_STAT(hypercalls) },
93 { "request_irq", VCPU_STAT(request_irq_exits) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) },
94 { "request_nmi", VCPU_STAT(request_nmi_exits) },
95 { "irq_exits", VCPU_STAT(irq_exits) }, 94 { "irq_exits", VCPU_STAT(irq_exits) },
96 { "host_state_reload", VCPU_STAT(host_state_reload) }, 95 { "host_state_reload", VCPU_STAT(host_state_reload) },
97 { "efer_reload", VCPU_STAT(efer_reload) }, 96 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
108 { "mmu_recycled", VM_STAT(mmu_recycled) }, 107 { "mmu_recycled", VM_STAT(mmu_recycled) },
109 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 108 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
110 { "mmu_unsync", VM_STAT(mmu_unsync) }, 109 { "mmu_unsync", VM_STAT(mmu_unsync) },
111 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
112 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 110 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
113 { "largepages", VM_STAT(lpages) }, 111 { "largepages", VM_STAT(lpages) },
114 { NULL } 112 { NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
234 goto out; 232 goto out;
235 } 233 }
236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 235 if (is_present_pte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
238 ret = 0; 237 ret = 0;
239 goto out; 238 goto out;
240 } 239 }
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
321 kvm_x86_ops->set_cr0(vcpu, cr0); 320 kvm_x86_ops->set_cr0(vcpu, cr0);
322 vcpu->arch.cr0 = cr0; 321 vcpu->arch.cr0 = cr0;
323 322
324 kvm_mmu_sync_global(vcpu);
325 kvm_mmu_reset_context(vcpu); 323 kvm_mmu_reset_context(vcpu);
326 return; 324 return;
327} 325}
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
370 kvm_x86_ops->set_cr4(vcpu, cr4); 368 kvm_x86_ops->set_cr4(vcpu, cr4);
371 vcpu->arch.cr4 = cr4; 369 vcpu->arch.cr4 = cr4;
372 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
373 kvm_mmu_sync_global(vcpu);
374 kvm_mmu_reset_context(vcpu); 371 kvm_mmu_reset_context(vcpu);
375} 372}
376EXPORT_SYMBOL_GPL(kvm_set_cr4); 373EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
523 efer |= vcpu->arch.shadow_efer & EFER_LMA; 520 efer |= vcpu->arch.shadow_efer & EFER_LMA;
524 521
525 vcpu->arch.shadow_efer = efer; 522 vcpu->arch.shadow_efer = efer;
523
524 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
525 kvm_mmu_reset_context(vcpu);
526} 526}
527 527
528void kvm_enable_efer_bits(u64 mask) 528void kvm_enable_efer_bits(u64 mask)
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
630 unsigned long flags; 630 unsigned long flags;
631 struct kvm_vcpu_arch *vcpu = &v->arch; 631 struct kvm_vcpu_arch *vcpu = &v->arch;
632 void *shared_kaddr; 632 void *shared_kaddr;
633 unsigned long this_tsc_khz;
633 634
634 if ((!vcpu->time_page)) 635 if ((!vcpu->time_page))
635 return; 636 return;
636 637
637 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 638 this_tsc_khz = get_cpu_var(cpu_tsc_khz);
638 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 639 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
639 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 640 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
641 vcpu->hv_clock_tsc_khz = this_tsc_khz;
640 } 642 }
643 put_cpu_var(cpu_tsc_khz);
641 644
642 /* Keep irq disabled to prevent changes to the clock */ 645 /* Keep irq disabled to prevent changes to the clock */
643 local_irq_save(flags); 646 local_irq_save(flags);
@@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
893 case MSR_IA32_LASTINTFROMIP: 896 case MSR_IA32_LASTINTFROMIP:
894 case MSR_IA32_LASTINTTOIP: 897 case MSR_IA32_LASTINTTOIP:
895 case MSR_VM_HSAVE_PA: 898 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1:
896 data = 0; 901 data = 0;
897 break; 902 break;
898 case MSR_MTRRcap: 903 case MSR_MTRRcap:
@@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1024 case KVM_CAP_SYNC_MMU: 1029 case KVM_CAP_SYNC_MMU:
1025 case KVM_CAP_REINJECT_CONTROL: 1030 case KVM_CAP_REINJECT_CONTROL:
1026 case KVM_CAP_IRQ_INJECT_STATUS: 1031 case KVM_CAP_IRQ_INJECT_STATUS:
1032 case KVM_CAP_ASSIGN_DEV_IRQ:
1027 r = 1; 1033 r = 1;
1028 break; 1034 break;
1029 case KVM_CAP_COALESCED_MMIO: 1035 case KVM_CAP_COALESCED_MMIO:
@@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1241 entry->flags = 0; 1247 entry->flags = 0;
1242} 1248}
1243 1249
1250#define F(x) bit(X86_FEATURE_##x)
1251
1244static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1252static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1245 u32 index, int *nent, int maxnent) 1253 u32 index, int *nent, int maxnent)
1246{ 1254{
1247 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1255 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1248 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1249 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1250 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1251 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1252 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1253 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1254 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1255 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1256 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1257 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1258 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1259 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1260 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1261 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1262 bit(X86_FEATURE_PGE) |
1263 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1264 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1265 bit(X86_FEATURE_SYSCALL) |
1266 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1267#ifdef CONFIG_X86_64 1256#ifdef CONFIG_X86_64
1268 bit(X86_FEATURE_LM) | 1257 unsigned f_lm = F(LM);
1258#else
1259 unsigned f_lm = 0;
1269#endif 1260#endif
1270 bit(X86_FEATURE_FXSR_OPT) | 1261
1271 bit(X86_FEATURE_MMXEXT) | 1262 /* cpuid 1.edx */
1272 bit(X86_FEATURE_3DNOWEXT) | 1263 const u32 kvm_supported_word0_x86_features =
1273 bit(X86_FEATURE_3DNOW); 1264 F(FPU) | F(VME) | F(DE) | F(PSE) |
1274 const u32 kvm_supported_word3_x86_features = 1265 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1275 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1266 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1267 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1268 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1269 0 /* Reserved, DS, ACPI */ | F(MMX) |
1270 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1271 0 /* HTT, TM, Reserved, PBE */;
1272 /* cpuid 0x80000001.edx */
1273 const u32 kvm_supported_word1_x86_features =
1274 F(FPU) | F(VME) | F(DE) | F(PSE) |
1275 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1276 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1277 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1278 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1279 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1280 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1281 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1282 /* cpuid 1.ecx */
1283 const u32 kvm_supported_word4_x86_features =
1284 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1285 0 /* DS-CPL, VMX, SMX, EST */ |
1286 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1287 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1288 0 /* Reserved, DCA */ | F(XMM4_1) |
1289 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1290 0 /* Reserved, XSAVE, OSXSAVE */;
1291 /* cpuid 0x80000001.ecx */
1276 const u32 kvm_supported_word6_x86_features = 1292 const u32 kvm_supported_word6_x86_features =
1277 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1293 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1278 bit(X86_FEATURE_SVM); 1294 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1295 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1296 0 /* SKINIT */ | 0 /* WDT */;
1279 1297
1280 /* all calls to cpuid_count() should be made on the same cpu */ 1298 /* all calls to cpuid_count() should be made on the same cpu */
1281 get_cpu(); 1299 get_cpu();
@@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1288 break; 1306 break;
1289 case 1: 1307 case 1:
1290 entry->edx &= kvm_supported_word0_x86_features; 1308 entry->edx &= kvm_supported_word0_x86_features;
1291 entry->ecx &= kvm_supported_word3_x86_features; 1309 entry->ecx &= kvm_supported_word4_x86_features;
1292 break; 1310 break;
1293 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1311 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1294 * may return different values. This forces us to get_cpu() before 1312 * may return different values. This forces us to get_cpu() before
@@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1350 put_cpu(); 1368 put_cpu();
1351} 1369}
1352 1370
1371#undef F
1372
1353static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1373static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1354 struct kvm_cpuid_entry2 __user *entries) 1374 struct kvm_cpuid_entry2 __user *entries)
1355{ 1375{
@@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1421 return -ENXIO; 1441 return -ENXIO;
1422 vcpu_load(vcpu); 1442 vcpu_load(vcpu);
1423 1443
1424 set_bit(irq->irq, vcpu->arch.irq_pending); 1444 kvm_queue_interrupt(vcpu, irq->irq, false);
1425 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1426 1445
1427 vcpu_put(vcpu); 1446 vcpu_put(vcpu);
1428 1447
@@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1584 r = -EINVAL; 1603 r = -EINVAL;
1585 } 1604 }
1586out: 1605out:
1587 if (lapic) 1606 kfree(lapic);
1588 kfree(lapic);
1589 return r; 1607 return r;
1590} 1608}
1591 1609
@@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1606 return -EINVAL; 1624 return -EINVAL;
1607 1625
1608 down_write(&kvm->slots_lock); 1626 down_write(&kvm->slots_lock);
1627 spin_lock(&kvm->mmu_lock);
1609 1628
1610 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1629 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1611 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1630 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1612 1631
1632 spin_unlock(&kvm->mmu_lock);
1613 up_write(&kvm->slots_lock); 1633 up_write(&kvm->slots_lock);
1614 return 0; 1634 return 0;
1615} 1635}
@@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1785 1805
1786 /* If nothing is dirty, don't bother messing with page tables. */ 1806 /* If nothing is dirty, don't bother messing with page tables. */
1787 if (is_dirty) { 1807 if (is_dirty) {
1808 spin_lock(&kvm->mmu_lock);
1788 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1809 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1810 spin_unlock(&kvm->mmu_lock);
1789 kvm_flush_remote_tlbs(kvm); 1811 kvm_flush_remote_tlbs(kvm);
1790 memslot = &kvm->memslots[log->slot]; 1812 memslot = &kvm->memslots[log->slot];
1791 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1813 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2360 u16 error_code, 2382 u16 error_code,
2361 int emulation_type) 2383 int emulation_type)
2362{ 2384{
2363 int r; 2385 int r, shadow_mask;
2364 struct decode_cache *c; 2386 struct decode_cache *c;
2365 2387
2366 kvm_clear_exception_queue(vcpu); 2388 kvm_clear_exception_queue(vcpu);
@@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2408 } 2430 }
2409 } 2431 }
2410 2432
2433 if (emulation_type & EMULTYPE_SKIP) {
2434 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2435 return EMULATE_DONE;
2436 }
2437
2411 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2438 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2439 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2440
2441 if (r == 0)
2442 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2412 2443
2413 if (vcpu->arch.pio.string) 2444 if (vcpu->arch.pio.string)
2414 return EMULATE_DO_MMIO; 2445 return EMULATE_DO_MMIO;
@@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque)
2761 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2792 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2762 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2793 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2763 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2794 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2764 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2795 PT_DIRTY_MASK, PT64_NX_MASK, 0);
2765 2796
2766 for_each_possible_cpu(cpu) 2797 for_each_possible_cpu(cpu)
2767 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2798 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3012 return best; 3043 return best;
3013} 3044}
3014 3045
3046int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3047{
3048 struct kvm_cpuid_entry2 *best;
3049
3050 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3051 if (best)
3052 return best->eax & 0xff;
3053 return 36;
3054}
3055
3015void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3056void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3016{ 3057{
3017 u32 function, index; 3058 u32 function, index;
@@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3048static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3089static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3049 struct kvm_run *kvm_run) 3090 struct kvm_run *kvm_run)
3050{ 3091{
3051 return (!vcpu->arch.irq_summary && 3092 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3052 kvm_run->request_interrupt_window && 3093 kvm_run->request_interrupt_window &&
3053 vcpu->arch.interrupt_window_open && 3094 kvm_arch_interrupt_allowed(vcpu));
3054 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3055} 3095}
3056 3096
3057static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3097static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3064 kvm_run->ready_for_interrupt_injection = 1; 3104 kvm_run->ready_for_interrupt_injection = 1;
3065 else 3105 else
3066 kvm_run->ready_for_interrupt_injection = 3106 kvm_run->ready_for_interrupt_injection =
3067 (vcpu->arch.interrupt_window_open && 3107 kvm_arch_interrupt_allowed(vcpu) &&
3068 vcpu->arch.irq_summary == 0); 3108 !kvm_cpu_has_interrupt(vcpu) &&
3109 !kvm_event_needs_reinjection(vcpu);
3069} 3110}
3070 3111
3071static void vapic_enter(struct kvm_vcpu *vcpu) 3112static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
3094 up_read(&vcpu->kvm->slots_lock); 3135 up_read(&vcpu->kvm->slots_lock);
3095} 3136}
3096 3137
3138static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3139{
3140 int max_irr, tpr;
3141
3142 if (!kvm_x86_ops->update_cr8_intercept)
3143 return;
3144
3145 if (!vcpu->arch.apic->vapic_addr)
3146 max_irr = kvm_lapic_find_highest_irr(vcpu);
3147 else
3148 max_irr = -1;
3149
3150 if (max_irr != -1)
3151 max_irr >>= 4;
3152
3153 tpr = kvm_lapic_get_cr8(vcpu);
3154
3155 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3156}
3157
3158static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3159{
3160 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3161 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3162
3163 /* try to reinject previous events if any */
3164 if (vcpu->arch.nmi_injected) {
3165 kvm_x86_ops->set_nmi(vcpu);
3166 return;
3167 }
3168
3169 if (vcpu->arch.interrupt.pending) {
3170 kvm_x86_ops->set_irq(vcpu);
3171 return;
3172 }
3173
3174 /* try to inject new event if pending */
3175 if (vcpu->arch.nmi_pending) {
3176 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3177 vcpu->arch.nmi_pending = false;
3178 vcpu->arch.nmi_injected = true;
3179 kvm_x86_ops->set_nmi(vcpu);
3180 }
3181 } else if (kvm_cpu_has_interrupt(vcpu)) {
3182 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3183 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3184 false);
3185 kvm_x86_ops->set_irq(vcpu);
3186 }
3187 }
3188}
3189
3097static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3098{ 3191{
3099 int r; 3192 int r;
3193 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3194 kvm_run->request_interrupt_window;
3100 3195
3101 if (vcpu->requests) 3196 if (vcpu->requests)
3102 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3197 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
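
inject_pending_irq() above fixes the event-injection priority: whatever was already queued is re-injected first (an NMI, then an interrupt), and only then is a fresh NMI or a fresh external interrupt considered, each gated by its *_allowed() callback. A small stand-alone model of that ordering, with illustrative names and flags in place of the real vcpu state:

enum injected { INJ_NOTHING, INJ_REINJECT_NMI, INJ_REINJECT_IRQ, INJ_NEW_NMI, INJ_NEW_IRQ };

struct pending_state {
	int nmi_injected;	/* an NMI was delivered but must be re-injected */
	int irq_queued;		/* an interrupt is queued for (re)delivery */
	int nmi_pending;	/* a fresh NMI is waiting */
	int irq_waiting;	/* the interrupt controller has a vector ready */
	int nmi_allowed;	/* guest can take an NMI right now */
	int irq_allowed;	/* guest RFLAGS.IF etc. permit interrupts */
};

enum injected pick_event(const struct pending_state *s)
{
	if (s->nmi_injected)
		return INJ_REINJECT_NMI;
	if (s->irq_queued)
		return INJ_REINJECT_IRQ;
	if (s->nmi_pending)
		return s->nmi_allowed ? INJ_NEW_NMI : INJ_NOTHING;
	if (s->irq_waiting)
		return s->irq_allowed ? INJ_NEW_IRQ : INJ_NOTHING;
	return INJ_NOTHING;
}
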
@@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3128 } 3223 }
3129 } 3224 }
3130 3225
3131 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3132 kvm_inject_pending_timer_irqs(vcpu);
3133
3134 preempt_disable(); 3226 preempt_disable();
3135 3227
3136 kvm_x86_ops->prepare_guest_switch(vcpu); 3228 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3138 3230
3139 local_irq_disable(); 3231 local_irq_disable();
3140 3232
3233 clear_bit(KVM_REQ_KICK, &vcpu->requests);
3234 smp_mb__after_clear_bit();
3235
3141 if (vcpu->requests || need_resched() || signal_pending(current)) { 3236 if (vcpu->requests || need_resched() || signal_pending(current)) {
3142 local_irq_enable(); 3237 local_irq_enable();
3143 preempt_enable(); 3238 preempt_enable();
@@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3145 goto out; 3240 goto out;
3146 } 3241 }
3147 3242
3148 vcpu->guest_mode = 1;
3149 /*
3150 * Make sure that guest_mode assignment won't happen after
3151 * testing the pending IRQ vector bitmap.
3152 */
3153 smp_wmb();
3154
3155 if (vcpu->arch.exception.pending) 3243 if (vcpu->arch.exception.pending)
3156 __queue_exception(vcpu); 3244 __queue_exception(vcpu);
3157 else if (irqchip_in_kernel(vcpu->kvm))
3158 kvm_x86_ops->inject_pending_irq(vcpu);
3159 else 3245 else
3160 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3246 inject_pending_irq(vcpu, kvm_run);
3161 3247
3162 kvm_lapic_sync_to_vapic(vcpu); 3248 /* enable NMI/IRQ window open exits if needed */
3249 if (vcpu->arch.nmi_pending)
3250 kvm_x86_ops->enable_nmi_window(vcpu);
3251 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3252 kvm_x86_ops->enable_irq_window(vcpu);
3253
3254 if (kvm_lapic_enabled(vcpu)) {
3255 update_cr8_intercept(vcpu);
3256 kvm_lapic_sync_to_vapic(vcpu);
3257 }
3163 3258
3164 up_read(&vcpu->kvm->slots_lock); 3259 up_read(&vcpu->kvm->slots_lock);
3165 3260
@@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3193 set_debugreg(vcpu->arch.host_dr6, 6); 3288 set_debugreg(vcpu->arch.host_dr6, 6);
3194 set_debugreg(vcpu->arch.host_dr7, 7); 3289 set_debugreg(vcpu->arch.host_dr7, 7);
3195 3290
3196 vcpu->guest_mode = 0; 3291 set_bit(KVM_REQ_KICK, &vcpu->requests);
3197 local_irq_enable(); 3292 local_irq_enable();
3198 3293
3199 ++vcpu->stat.exits; 3294 ++vcpu->stat.exits;
@@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3220 profile_hit(KVM_PROFILING, (void *)rip); 3315 profile_hit(KVM_PROFILING, (void *)rip);
3221 } 3316 }
3222 3317
3223 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3224 vcpu->arch.exception.pending = false;
3225 3318
3226 kvm_lapic_sync_from_vapic(vcpu); 3319 kvm_lapic_sync_from_vapic(vcpu);
3227 3320
@@ -3230,6 +3323,7 @@ out:
3230 return r; 3323 return r;
3231} 3324}
3232 3325
3326
3233static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3327static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3234{ 3328{
3235 int r; 3329 int r;
@@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3256 kvm_vcpu_block(vcpu); 3350 kvm_vcpu_block(vcpu);
3257 down_read(&vcpu->kvm->slots_lock); 3351 down_read(&vcpu->kvm->slots_lock);
3258 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3352 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3259 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3353 {
3354 switch(vcpu->arch.mp_state) {
3355 case KVM_MP_STATE_HALTED:
3260 vcpu->arch.mp_state = 3356 vcpu->arch.mp_state =
3261 KVM_MP_STATE_RUNNABLE; 3357 KVM_MP_STATE_RUNNABLE;
3262 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3358 case KVM_MP_STATE_RUNNABLE:
3263 r = -EINTR; 3359 break;
3360 case KVM_MP_STATE_SIPI_RECEIVED:
3361 default:
3362 r = -EINTR;
3363 break;
3364 }
3365 }
3264 } 3366 }
3265 3367
3266 if (r > 0) { 3368 if (r <= 0)
3267 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3369 break;
3268 r = -EINTR; 3370
3269 kvm_run->exit_reason = KVM_EXIT_INTR; 3371 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3270 ++vcpu->stat.request_irq_exits; 3372 if (kvm_cpu_has_pending_timer(vcpu))
3271 } 3373 kvm_inject_pending_timer_irqs(vcpu);
3272 if (signal_pending(current)) { 3374
3273 r = -EINTR; 3375 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3274 kvm_run->exit_reason = KVM_EXIT_INTR; 3376 r = -EINTR;
3275 ++vcpu->stat.signal_exits; 3377 kvm_run->exit_reason = KVM_EXIT_INTR;
3276 } 3378 ++vcpu->stat.request_irq_exits;
3277 if (need_resched()) { 3379 }
3278 up_read(&vcpu->kvm->slots_lock); 3380 if (signal_pending(current)) {
3279 kvm_resched(vcpu); 3381 r = -EINTR;
3280 down_read(&vcpu->kvm->slots_lock); 3382 kvm_run->exit_reason = KVM_EXIT_INTR;
3281 } 3383 ++vcpu->stat.signal_exits;
3384 }
3385 if (need_resched()) {
3386 up_read(&vcpu->kvm->slots_lock);
3387 kvm_resched(vcpu);
3388 down_read(&vcpu->kvm->slots_lock);
3282 } 3389 }
3283 } 3390 }
3284 3391
@@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3442 struct kvm_sregs *sregs) 3549 struct kvm_sregs *sregs)
3443{ 3550{
3444 struct descriptor_table dt; 3551 struct descriptor_table dt;
3445 int pending_vec;
3446 3552
3447 vcpu_load(vcpu); 3553 vcpu_load(vcpu);
3448 3554
@@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3472 sregs->efer = vcpu->arch.shadow_efer; 3578 sregs->efer = vcpu->arch.shadow_efer;
3473 sregs->apic_base = kvm_get_apic_base(vcpu); 3579 sregs->apic_base = kvm_get_apic_base(vcpu);
3474 3580
3475 if (irqchip_in_kernel(vcpu->kvm)) { 3581 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3476 memset(sregs->interrupt_bitmap, 0, 3582
3477 sizeof sregs->interrupt_bitmap); 3583 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3478 pending_vec = kvm_x86_ops->get_irq(vcpu); 3584 set_bit(vcpu->arch.interrupt.nr,
3479 if (pending_vec >= 0) 3585 (unsigned long *)sregs->interrupt_bitmap);
3480 set_bit(pending_vec,
3481 (unsigned long *)sregs->interrupt_bitmap);
3482 } else
3483 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3484 sizeof sregs->interrupt_bitmap);
3485 3586
3486 vcpu_put(vcpu); 3587 vcpu_put(vcpu);
3487 3588
@@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3688 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3789 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3689 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3790 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3690 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3791 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3691 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3692} 3792}
3693 3793
3694static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3794static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3785} 3885}
3786 3886
3787static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3887static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3788 u32 old_tss_base, 3888 u16 old_tss_sel, u32 old_tss_base,
3789 struct desc_struct *nseg_desc) 3889 struct desc_struct *nseg_desc)
3790{ 3890{
3791 struct tss_segment_16 tss_segment_16; 3891 struct tss_segment_16 tss_segment_16;
3792 int ret = 0; 3892 int ret = 0;
@@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3805 &tss_segment_16, sizeof tss_segment_16)) 3905 &tss_segment_16, sizeof tss_segment_16))
3806 goto out; 3906 goto out;
3807 3907
3908 if (old_tss_sel != 0xffff) {
3909 tss_segment_16.prev_task_link = old_tss_sel;
3910
3911 if (kvm_write_guest(vcpu->kvm,
3912 get_tss_base_addr(vcpu, nseg_desc),
3913 &tss_segment_16.prev_task_link,
3914 sizeof tss_segment_16.prev_task_link))
3915 goto out;
3916 }
3917
3808 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3918 if (load_state_from_tss16(vcpu, &tss_segment_16))
3809 goto out; 3919 goto out;
3810 3920
@@ -3814,7 +3924,7 @@ out:
3814} 3924}
3815 3925
3816static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3926static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3817 u32 old_tss_base, 3927 u16 old_tss_sel, u32 old_tss_base,
3818 struct desc_struct *nseg_desc) 3928 struct desc_struct *nseg_desc)
3819{ 3929{
3820 struct tss_segment_32 tss_segment_32; 3930 struct tss_segment_32 tss_segment_32;
@@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3834 &tss_segment_32, sizeof tss_segment_32)) 3944 &tss_segment_32, sizeof tss_segment_32))
3835 goto out; 3945 goto out;
3836 3946
3947 if (old_tss_sel != 0xffff) {
3948 tss_segment_32.prev_task_link = old_tss_sel;
3949
3950 if (kvm_write_guest(vcpu->kvm,
3951 get_tss_base_addr(vcpu, nseg_desc),
3952 &tss_segment_32.prev_task_link,
3953 sizeof tss_segment_32.prev_task_link))
3954 goto out;
3955 }
3956
3837 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3957 if (load_state_from_tss32(vcpu, &tss_segment_32))
3838 goto out; 3958 goto out;
3839 3959
@@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3887 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4007 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3888 } 4008 }
3889 4009
3890 kvm_x86_ops->skip_emulated_instruction(vcpu); 4010 /* set back link to prev task only if NT bit is set in eflags
 4011 note that old_tss_sel is not used after this point */
4012 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4013 old_tss_sel = 0xffff;
4014
4015 /* set back link to prev task only if NT bit is set in eflags
 4016 note that old_tss_sel is not used after this point */
4017 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4018 old_tss_sel = 0xffff;
3891 4019
3892 if (nseg_desc.type & 8) 4020 if (nseg_desc.type & 8)
3893 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 4021 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
3894 &nseg_desc); 4022 old_tss_base, &nseg_desc);
3895 else 4023 else
3896 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 4024 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
3897 &nseg_desc); 4025 old_tss_base, &nseg_desc);
3898 4026
3899 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4027 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3900 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4028 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3920 struct kvm_sregs *sregs) 4048 struct kvm_sregs *sregs)
3921{ 4049{
3922 int mmu_reset_needed = 0; 4050 int mmu_reset_needed = 0;
3923 int i, pending_vec, max_bits; 4051 int pending_vec, max_bits;
3924 struct descriptor_table dt; 4052 struct descriptor_table dt;
3925 4053
3926 vcpu_load(vcpu); 4054 vcpu_load(vcpu);
@@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3934 4062
3935 vcpu->arch.cr2 = sregs->cr2; 4063 vcpu->arch.cr2 = sregs->cr2;
3936 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4064 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3937 vcpu->arch.cr3 = sregs->cr3; 4065
4066 down_read(&vcpu->kvm->slots_lock);
4067 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4068 vcpu->arch.cr3 = sregs->cr3;
4069 else
4070 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4071 up_read(&vcpu->kvm->slots_lock);
3938 4072
3939 kvm_set_cr8(vcpu, sregs->cr8); 4073 kvm_set_cr8(vcpu, sregs->cr8);
3940 4074
@@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3956 if (mmu_reset_needed) 4090 if (mmu_reset_needed)
3957 kvm_mmu_reset_context(vcpu); 4091 kvm_mmu_reset_context(vcpu);
3958 4092
3959 if (!irqchip_in_kernel(vcpu->kvm)) { 4093 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3960 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 4094 pending_vec = find_first_bit(
3961 sizeof vcpu->arch.irq_pending); 4095 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
3962 vcpu->arch.irq_summary = 0; 4096 if (pending_vec < max_bits) {
3963 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 4097 kvm_queue_interrupt(vcpu, pending_vec, false);
3964 if (vcpu->arch.irq_pending[i]) 4098 pr_debug("Set back pending irq %d\n", pending_vec);
3965 __set_bit(i, &vcpu->arch.irq_summary); 4099 if (irqchip_in_kernel(vcpu->kvm))
3966 } else { 4100 kvm_pic_clear_isr_ack(vcpu->kvm);
3967 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3968 pending_vec = find_first_bit(
3969 (const unsigned long *)sregs->interrupt_bitmap,
3970 max_bits);
3971 /* Only pending external irq is handled here */
3972 if (pending_vec < max_bits) {
3973 kvm_x86_ops->set_irq(vcpu, pending_vec);
3974 pr_debug("Set back pending irq %d\n",
3975 pending_vec);
3976 }
3977 kvm_pic_clear_isr_ack(vcpu->kvm);
3978 } 4101 }
3979 4102
3980 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4103 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void)
4308 return ERR_PTR(-ENOMEM); 4431 return ERR_PTR(-ENOMEM);
4309 4432
4310 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4433 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4311 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4312 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4434 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4313 4435
4314 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4436 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4411 } 4533 }
4412 } 4534 }
4413 4535
4536 spin_lock(&kvm->mmu_lock);
4414 if (!kvm->arch.n_requested_mmu_pages) { 4537 if (!kvm->arch.n_requested_mmu_pages) {
4415 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4538 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4416 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4539 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4417 } 4540 }
4418 4541
4419 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4542 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4543 spin_unlock(&kvm->mmu_lock);
4420 kvm_flush_remote_tlbs(kvm); 4544 kvm_flush_remote_tlbs(kvm);
4421 4545
4422 return 0; 4546 return 0;
@@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4425void kvm_arch_flush_shadow(struct kvm *kvm) 4549void kvm_arch_flush_shadow(struct kvm *kvm)
4426{ 4550{
4427 kvm_mmu_zap_all(kvm); 4551 kvm_mmu_zap_all(kvm);
4552 kvm_reload_remote_mmus(kvm);
4428} 4553}
4429 4554
4430int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4555int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4434 || vcpu->arch.nmi_pending; 4559 || vcpu->arch.nmi_pending;
4435} 4560}
4436 4561
4437static void vcpu_kick_intr(void *info)
4438{
4439#ifdef DEBUG
4440 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4441 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
4442#endif
4443}
4444
4445void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4562void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4446{ 4563{
4447 int ipi_pcpu = vcpu->cpu; 4564 int me;
4448 int cpu = get_cpu(); 4565 int cpu = vcpu->cpu;
4449 4566
4450 if (waitqueue_active(&vcpu->wq)) { 4567 if (waitqueue_active(&vcpu->wq)) {
4451 wake_up_interruptible(&vcpu->wq); 4568 wake_up_interruptible(&vcpu->wq);
4452 ++vcpu->stat.halt_wakeup; 4569 ++vcpu->stat.halt_wakeup;
4453 } 4570 }
4454 /* 4571
4455 * We may be called synchronously with irqs disabled in guest mode, 4572 me = get_cpu();
4456 * So need not to call smp_call_function_single() in that case. 4573 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4457 */ 4574 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4458 if (vcpu->guest_mode && vcpu->cpu != cpu) 4575 smp_send_reschedule(cpu);
4459 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4460 put_cpu(); 4576 put_cpu();
4461} 4577}
4578
4579int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4580{
4581 return kvm_x86_ops->interrupt_allowed(vcpu);
4582}
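
The kvm_vcpu_kick() rework above drops the dummy-IPI helper in favour of a request bit plus an ordinary reschedule IPI. A minimal sketch of that pattern follows, with made-up names (toy_vcpu, TOY_REQ_KICK) standing in for KVM's real structures:

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>

#define TOY_REQ_KICK	0

struct toy_vcpu {
	unsigned long	requests;	/* plays the role of vcpu->requests */
	int		cpu;		/* physical CPU the vCPU last ran on */
};

/* Producer side: ask a vCPU that may be running in guest mode to exit. */
static void toy_vcpu_kick(struct toy_vcpu *v)
{
	int me = get_cpu();

	if (v->cpu != me && (unsigned int)v->cpu < nr_cpu_ids &&
	    cpu_online(v->cpu) &&
	    !test_and_set_bit(TOY_REQ_KICK, &v->requests))
		smp_send_reschedule(v->cpu);	/* any IPI forces a guest exit */
	put_cpu();
}

/* Consumer side: the vCPU loop clears the bit before re-entering the guest,
 * so at most one IPI is sent per guest exit. */
static void toy_vcpu_prepare_entry(struct toy_vcpu *v)
{
	clear_bit(TOY_REQ_KICK, &v->requests);
}
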
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a7384..4c8e10af78e8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
8 vcpu->arch.exception.pending = false; 8 vcpu->arch.exception.pending = false;
9} 9}
10 10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) 11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
12 bool soft)
12{ 13{
13 vcpu->arch.interrupt.pending = true; 14 vcpu->arch.interrupt.pending = true;
15 vcpu->arch.interrupt.soft = soft;
14 vcpu->arch.interrupt.nr = vector; 16 vcpu->arch.interrupt.nr = vector;
15} 17}
16 18
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
19 vcpu->arch.interrupt.pending = false; 21 vcpu->arch.interrupt.pending = false;
20} 22}
21 23
24static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
25{
26 return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
27 vcpu->arch.nmi_injected;
28}
29
30static inline bool kvm_exception_is_soft(unsigned int nr)
31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33}
22#endif 34#endif
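
A hedged usage sketch for the new x86.h helpers above; check_and_inject() and its flow are illustrative only, not KVM's actual injection logic:

/* Sketch: decide whether a fresh external interrupt may be queued now. */
static void check_and_inject(struct kvm_vcpu *vcpu, u8 vector)
{
	/* An event that was interrupted mid-delivery must be re-injected first. */
	if (kvm_event_needs_reinjection(vcpu))
		return;

	/* A fresh external interrupt is never a "soft" interrupt. */
	kvm_queue_interrupt(vcpu, vector, false);
}

kvm_exception_is_soft() exists because #BP (int3) and #OF (into) are raised by instructions the guest itself executed, so re-injecting them typically needs the instruction length, unlike hardware-generated exceptions.
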
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d2083..c1b6c232e02b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 61#define SrcOne (7<<4) /* Implied '1' */
62#define SrcMask (7<<4) 62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
63#define SrcMask (0xf<<4)
63/* Generic ModRM decode. */ 64/* Generic ModRM decode. */
64#define ModRM (1<<7) 65#define ModRM (1<<8)
65/* Destination is only written; never read. */ 66/* Destination is only written; never read. */
66#define Mov (1<<8) 67#define Mov (1<<9)
67#define BitOp (1<<9) 68#define BitOp (1<<10)
68#define MemAbs (1<<10) /* Memory operand is absolute displacement */ 69#define MemAbs (1<<11) /* Memory operand is absolute displacement */
69#define String (1<<12) /* String instruction (rep capable) */ 70#define String (1<<12) /* String instruction (rep capable) */
70#define Stack (1<<13) /* Stack instruction (push/pop) */ 71#define Stack (1<<13) /* Stack instruction (push/pop) */
71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 72#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
76#define Src2CL (1<<29) 77#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29) 78#define Src2ImmByte (2<<29)
78#define Src2One (3<<29) 79#define Src2One (3<<29)
80#define Src2Imm16 (4<<29)
79#define Src2Mask (7<<29) 81#define Src2Mask (7<<29)
80 82
81enum { 83enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
135 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 137 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
136 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 138 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
137 /* 0x70 - 0x77 */ 139 /* 0x70 - 0x77 */
138 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 140 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
139 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 141 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
140 /* 0x78 - 0x7F */ 142 /* 0x78 - 0x7F */
141 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 143 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
142 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 144 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
143 /* 0x80 - 0x87 */ 145 /* 0x80 - 0x87 */
144 Group | Group1_80, Group | Group1_81, 146 Group | Group1_80, Group | Group1_81,
145 Group | Group1_82, Group | Group1_83, 147 Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
153 /* 0x90 - 0x97 */ 155 /* 0x90 - 0x97 */
154 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 156 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
155 /* 0x98 - 0x9F */ 157 /* 0x98 - 0x9F */
156 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 158 0, 0, SrcImm | Src2Imm16, 0,
159 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
157 /* 0xA0 - 0xA7 */ 160 /* 0xA0 - 0xA7 */
158 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 161 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
159 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 162 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
178 0, ImplicitOps | Stack, 0, 0, 181 0, ImplicitOps | Stack, 0, 0,
179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 182 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
180 /* 0xC8 - 0xCF */ 183 /* 0xC8 - 0xCF */
181 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, 184 0, 0, 0, ImplicitOps | Stack,
185 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
182 /* 0xD0 - 0xD7 */ 186 /* 0xD0 - 0xD7 */
183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 187 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 188 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
187 0, 0, 0, 0, 0, 0, 0, 0, 191 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 0xE0 - 0xE7 */ 192 /* 0xE0 - 0xE7 */
189 0, 0, 0, 0, 193 0, 0, 0, 0,
190 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 194 ByteOp | SrcImmUByte, SrcImmUByte,
191 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 195 ByteOp | SrcImmUByte, SrcImmUByte,
192 /* 0xE8 - 0xEF */ 196 /* 0xE8 - 0xEF */
193 ImplicitOps | Stack, SrcImm | ImplicitOps, 197 SrcImm | Stack, SrcImm | ImplicitOps,
194 ImplicitOps, SrcImmByte | ImplicitOps, 198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
195 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
196 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
197 /* 0xF0 - 0xF7 */ 201 /* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
230 /* 0x70 - 0x7F */ 234 /* 0x70 - 0x7F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x80 - 0x8F */ 236 /* 0x80 - 0x8F */
233 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 237 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
234 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 238 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
235 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
236 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
237 /* 0x90 - 0x9F */ 239 /* 0x90 - 0x9F */
238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 /* 0xA0 - 0xA7 */ 241 /* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
1044 } 1046 }
1045 break; 1047 break;
1046 case SrcImmByte: 1048 case SrcImmByte:
1049 case SrcImmUByte:
1047 c->src.type = OP_IMM; 1050 c->src.type = OP_IMM;
1048 c->src.ptr = (unsigned long *)c->eip; 1051 c->src.ptr = (unsigned long *)c->eip;
1049 c->src.bytes = 1; 1052 c->src.bytes = 1;
1050 c->src.val = insn_fetch(s8, 1, c->eip); 1053 if ((c->d & SrcMask) == SrcImmByte)
1054 c->src.val = insn_fetch(s8, 1, c->eip);
1055 else
1056 c->src.val = insn_fetch(u8, 1, c->eip);
1051 break; 1057 break;
1052 case SrcOne: 1058 case SrcOne:
1053 c->src.bytes = 1; 1059 c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
1072 c->src2.bytes = 1; 1078 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip); 1079 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break; 1080 break;
1081 case Src2Imm16:
1082 c->src2.type = OP_IMM;
1083 c->src2.ptr = (unsigned long *)c->eip;
1084 c->src2.bytes = 2;
1085 c->src2.val = insn_fetch(u16, 2, c->eip);
1086 break;
1075 case Src2One: 1087 case Src2One:
1076 c->src2.bytes = 1; 1088 c->src2.bytes = 1;
1077 c->src2.val = 1; 1089 c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1349 return 0; 1361 return 0;
1350} 1362}
1351 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /*
                                                1368     * an sti; sti; sequence only disables interrupts for the first
1369 * instruction. So, if the last instruction, be it emulated or
1370 * not, left the system with the INT_STI flag enabled, it
1371 * means that the last instruction is an sti. We should not
                                                1372     * leave the flag on in this case. The same goes for mov ss.
1373 */
1374 if (!(int_shadow & mask))
1375 ctxt->interruptibility = mask;
1376}
1377
1352int 1378int
1353x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1354{ 1380{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1360 int io_dir_in; 1386 int io_dir_in;
1361 int rc = 0; 1387 int rc = 0;
1362 1388
1389 ctxt->interruptibility = 0;
1390
1363 /* Shadow copy of register state. Committed on successful emulation. 1391 /* Shadow copy of register state. Committed on successful emulation.
1364 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't 1392 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1365 * modify them. 1393 * modify them.
@@ -1531,13 +1559,10 @@ special_insn:
1531 return -1; 1559 return -1;
1532 } 1560 }
1533 return 0; 1561 return 0;
1534 case 0x70 ... 0x7f: /* jcc (short) */ { 1562 case 0x70 ... 0x7f: /* jcc (short) */
1535 int rel = insn_fetch(s8, 1, c->eip);
1536
1537 if (test_cc(c->b, ctxt->eflags)) 1563 if (test_cc(c->b, ctxt->eflags))
1538 jmp_rel(c, rel); 1564 jmp_rel(c, c->src.val);
1539 break; 1565 break;
1540 }
1541 case 0x80 ... 0x83: /* Grp1 */ 1566 case 0x80 ... 0x83: /* Grp1 */
1542 switch (c->modrm_reg) { 1567 switch (c->modrm_reg) {
1543 case 0: 1568 case 0:
@@ -1609,6 +1634,9 @@ special_insn:
1609 int err; 1634 int err;
1610 1635
1611 sel = c->src.val; 1636 sel = c->src.val;
1637 if (c->modrm_reg == VCPU_SREG_SS)
1638 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1639
1612 if (c->modrm_reg <= 5) { 1640 if (c->modrm_reg <= 5) {
1613 type_bits = (c->modrm_reg == 1) ? 9 : 1; 1641 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1614 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 1642 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
1769 break; 1797 break;
1770 case 0xe4: /* inb */ 1798 case 0xe4: /* inb */
1771 case 0xe5: /* in */ 1799 case 0xe5: /* in */
1772 port = insn_fetch(u8, 1, c->eip); 1800 port = c->src.val;
1773 io_dir_in = 1; 1801 io_dir_in = 1;
1774 goto do_io; 1802 goto do_io;
1775 case 0xe6: /* outb */ 1803 case 0xe6: /* outb */
1776 case 0xe7: /* out */ 1804 case 0xe7: /* out */
1777 port = insn_fetch(u8, 1, c->eip); 1805 port = c->src.val;
1778 io_dir_in = 0; 1806 io_dir_in = 0;
1779 goto do_io; 1807 goto do_io;
1780 case 0xe8: /* call (near) */ { 1808 case 0xe8: /* call (near) */ {
1781 long int rel; 1809 long int rel = c->src.val;
1782 switch (c->op_bytes) {
1783 case 2:
1784 rel = insn_fetch(s16, 2, c->eip);
1785 break;
1786 case 4:
1787 rel = insn_fetch(s32, 4, c->eip);
1788 break;
1789 default:
1790 DPRINTF("Call: Invalid op_bytes\n");
1791 goto cannot_emulate;
1792 }
1793 c->src.val = (unsigned long) c->eip; 1810 c->src.val = (unsigned long) c->eip;
1794 jmp_rel(c, rel); 1811 jmp_rel(c, rel);
1795 c->op_bytes = c->ad_bytes;
1796 emulate_push(ctxt); 1812 emulate_push(ctxt);
1797 break; 1813 break;
1798 } 1814 }
1799 case 0xe9: /* jmp rel */ 1815 case 0xe9: /* jmp rel */
1800 goto jmp; 1816 goto jmp;
1801 case 0xea: /* jmp far */ { 1817 case 0xea: /* jmp far */
1802 uint32_t eip; 1818 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
1803 uint16_t sel; 1819 VCPU_SREG_CS) < 0) {
1804
1805 switch (c->op_bytes) {
1806 case 2:
1807 eip = insn_fetch(u16, 2, c->eip);
1808 break;
1809 case 4:
1810 eip = insn_fetch(u32, 4, c->eip);
1811 break;
1812 default:
1813 DPRINTF("jmp far: Invalid op_bytes\n");
1814 goto cannot_emulate;
1815 }
1816 sel = insn_fetch(u16, 2, c->eip);
1817 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1818 DPRINTF("jmp far: Failed to load CS descriptor\n"); 1820 DPRINTF("jmp far: Failed to load CS descriptor\n");
1819 goto cannot_emulate; 1821 goto cannot_emulate;
1820 } 1822 }
1821 1823
1822 c->eip = eip; 1824 c->eip = c->src.val;
1823 break; 1825 break;
1824 }
1825 case 0xeb: 1826 case 0xeb:
1826 jmp: /* jmp rel short */ 1827 jmp: /* jmp rel short */
1827 jmp_rel(c, c->src.val); 1828 jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
1865 c->dst.type = OP_NONE; /* Disable writeback. */ 1866 c->dst.type = OP_NONE; /* Disable writeback. */
1866 break; 1867 break;
1867 case 0xfb: /* sti */ 1868 case 0xfb: /* sti */
1869 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
1868 ctxt->eflags |= X86_EFLAGS_IF; 1870 ctxt->eflags |= X86_EFLAGS_IF;
1869 c->dst.type = OP_NONE; /* Disable writeback. */ 1871 c->dst.type = OP_NONE; /* Disable writeback. */
1870 break; 1872 break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
2039 if (!test_cc(c->b, ctxt->eflags)) 2041 if (!test_cc(c->b, ctxt->eflags))
2040 c->dst.type = OP_NONE; /* no writeback */ 2042 c->dst.type = OP_NONE; /* no writeback */
2041 break; 2043 break;
2042 case 0x80 ... 0x8f: /* jnz rel, etc*/ { 2044 case 0x80 ... 0x8f: /* jnz rel, etc*/
2043 long int rel;
2044
2045 switch (c->op_bytes) {
2046 case 2:
2047 rel = insn_fetch(s16, 2, c->eip);
2048 break;
2049 case 4:
2050 rel = insn_fetch(s32, 4, c->eip);
2051 break;
2052 case 8:
2053 rel = insn_fetch(s64, 8, c->eip);
2054 break;
2055 default:
2056 DPRINTF("jnz: Invalid op_bytes\n");
2057 goto cannot_emulate;
2058 }
2059 if (test_cc(c->b, ctxt->eflags)) 2045 if (test_cc(c->b, ctxt->eflags))
2060 jmp_rel(c, rel); 2046 jmp_rel(c, c->src.val);
2061 c->dst.type = OP_NONE; 2047 c->dst.type = OP_NONE;
2062 break; 2048 break;
2063 }
2064 case 0xa3: 2049 case 0xa3:
2065 bt: /* bt */ 2050 bt: /* bt */
2066 c->dst.type = OP_NONE; 2051 c->dst.type = OP_NONE;
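
The emulator hunks above move immediate-operand fetching out of the individual opcode handlers and into the decode pass, so cases such as jcc, call near and in/out simply read c->src.val. A stripped-down illustration of that decode/execute split, using invented types (toy_ctxt, decode_imm_byte) rather than the emulator's real ones:

#include <stdint.h>

struct toy_ctxt {
	const uint8_t *ip;	/* raw instruction bytes being decoded */
	long src_val;		/* operand value filled in by the decode pass */
	unsigned long eip;	/* emulated instruction pointer */
};

/* Decode pass: fetch the immediate once; sign-extend for SrcImmByte,
 * zero-extend for SrcImmUByte. */
static void decode_imm_byte(struct toy_ctxt *c, int is_signed)
{
	uint8_t b = *c->ip++;

	c->src_val = is_signed ? (int8_t)b : b;
}

/* Execute pass: the handler no longer touches the instruction stream. */
static void exec_jcc_short(struct toy_ctxt *c, int cond_true)
{
	if (cond_true)
		c->eip += c->src_val;	/* what jmp_rel() does in the emulator */
}
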
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE
6 select VIRTIO 5 select VIRTIO
7 select VIRTIO_RING 6 select VIRTIO_RING
8 select VIRTIO_CONSOLE 7 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c26559395..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
87 87
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
 89 * ring buffer of stored hypercalls which the Host will run through next time we 89 * ring buffer of stored hypercalls which the Host will run through next time we
90 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall 90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
92 * and 255 once the Host has finished with it. 92 * and 255 once the Host has finished with it.
93 * 93 *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
96 * effect of causing the Host to run all the stored calls in the ring buffer 96 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 97 * which empties it for next time! */
98static void async_hcall(unsigned long call, unsigned long arg1, 98static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3) 99 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4)
100{ 101{
101 /* Note: This code assumes we're uniprocessor. */ 102 /* Note: This code assumes we're uniprocessor. */
102 static unsigned int next_call; 103 static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
108 local_irq_save(flags); 109 local_irq_save(flags);
109 if (lguest_data.hcall_status[next_call] != 0xFF) { 110 if (lguest_data.hcall_status[next_call] != 0xFF) {
110 /* Table full, so do normal hcall which will flush table. */ 111 /* Table full, so do normal hcall which will flush table. */
111 kvm_hypercall3(call, arg1, arg2, arg3); 112 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
112 } else { 113 } else {
113 lguest_data.hcalls[next_call].arg0 = call; 114 lguest_data.hcalls[next_call].arg0 = call;
114 lguest_data.hcalls[next_call].arg1 = arg1; 115 lguest_data.hcalls[next_call].arg1 = arg1;
115 lguest_data.hcalls[next_call].arg2 = arg2; 116 lguest_data.hcalls[next_call].arg2 = arg2;
116 lguest_data.hcalls[next_call].arg3 = arg3; 117 lguest_data.hcalls[next_call].arg3 = arg3;
118 lguest_data.hcalls[next_call].arg4 = arg4;
117 /* Arguments must all be written before we mark it to go */ 119 /* Arguments must all be written before we mark it to go */
118 wmb(); 120 wmb();
119 lguest_data.hcall_status[next_call] = 0; 121 lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
141 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 143 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
142 kvm_hypercall1(call, arg1); 144 kvm_hypercall1(call, arg1);
143 else 145 else
144 async_hcall(call, arg1, 0, 0); 146 async_hcall(call, arg1, 0, 0, 0);
145} 147}
146 148
147static void lazy_hcall2(unsigned long call, 149static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 153 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall2(call, arg1, arg2); 154 kvm_hypercall2(call, arg1, arg2);
153 else 155 else
154 async_hcall(call, arg1, arg2, 0); 156 async_hcall(call, arg1, arg2, 0, 0);
155} 157}
156 158
157static void lazy_hcall3(unsigned long call, 159static void lazy_hcall3(unsigned long call,
@@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call,
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall3(call, arg1, arg2, arg3); 165 kvm_hypercall3(call, arg1, arg2, arg3);
164 else 166 else
165 async_hcall(call, arg1, arg2, arg3); 167 async_hcall(call, arg1, arg2, arg3, 0);
166} 168}
167 169
170#ifdef CONFIG_X86_PAE
171static void lazy_hcall4(unsigned long call,
172 unsigned long arg1,
173 unsigned long arg2,
174 unsigned long arg3,
175 unsigned long arg4)
176{
177 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
178 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
179 else
180 async_hcall(call, arg1, arg2, arg3, arg4);
181}
182#endif
183
168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
169 * issue the do-nothing hypercall to flush any stored calls. */ 185 * issue the do-nothing hypercall to flush any stored calls. */
170static void lguest_leave_lazy_mmu_mode(void) 186static void lguest_leave_lazy_mmu_mode(void)
@@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next)
179 paravirt_end_context_switch(next); 195 paravirt_end_context_switch(next);
180} 196}
181 197
182/*G:033 198/*G:032
183 * After that diversion we return to our first native-instruction 199 * After that diversion we return to our first native-instruction
184 * replacements: four functions for interrupt control. 200 * replacements: four functions for interrupt control.
185 * 201 *
@@ -199,30 +215,28 @@ static unsigned long save_fl(void)
199{ 215{
200 return lguest_data.irq_enabled; 216 return lguest_data.irq_enabled;
201} 217}
202PV_CALLEE_SAVE_REGS_THUNK(save_fl);
203
204/* restore_flags() just sets the flags back to the value given. */
205static void restore_fl(unsigned long flags)
206{
207 lguest_data.irq_enabled = flags;
208}
209PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
210 218
211/* Interrupts go off... */ 219/* Interrupts go off... */
212static void irq_disable(void) 220static void irq_disable(void)
213{ 221{
214 lguest_data.irq_enabled = 0; 222 lguest_data.irq_enabled = 0;
215} 223}
224
225/* Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl);
216PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 233PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/
217 235
218/* Interrupts go on... */ 236/* These are in i386_head.S */
219static void irq_enable(void) 237extern void lg_irq_enable(void);
220{ 238extern void lg_restore_fl(unsigned long flags);
221 lguest_data.irq_enabled = X86_EFLAGS_IF;
222}
223PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
224 239
225/*:*/
226/*M:003 Note that we don't check for outstanding interrupts when we re-enable 240/*M:003 Note that we don't check for outstanding interrupts when we re-enable
227 * them (or when we unmask an interrupt). This seems to work for the moment, 241 * them (or when we unmask an interrupt). This seems to work for the moment,
228 * since interrupts are rare and we'll just get the interrupt on the next timer 242 * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
368 case 1: /* Basic feature request. */ 382 case 1: /* Basic feature request. */
369 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
370 *cx &= 0x00002201; 384 *cx &= 0x00002201;
371 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ 385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
372 *dx &= 0x07808111; 386 *dx &= 0x07808151;
373 /* The Host can do a nice optimization if it knows that the 387 /* The Host can do a nice optimization if it knows that the
374 * kernel mappings (addresses above 0xC0000000 or whatever 388 * kernel mappings (addresses above 0xC0000000 or whatever
375 * PAGE_OFFSET is set to) haven't changed. But Linux calls 389 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
388 if (*ax > 0x80000008) 402 if (*ax > 0x80000008)
389 *ax = 0x80000008; 403 *ax = 0x80000008;
390 break; 404 break;
405 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20);
409 break;
391 } 410 }
392} 411}
393 412
@@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
521static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
522 pte_t *ptep) 541 pte_t *ptep)
523{ 542{
543#ifdef CONFIG_X86_PAE
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high);
546#else
524 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 547 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
548#endif
525} 549}
526 550
527static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep, pte_t pteval) 552 pte_t *ptep, pte_t pteval)
529{ 553{
530 *ptep = pteval; 554 native_set_pte(ptep, pteval);
531 lguest_pte_update(mm, addr, ptep); 555 lguest_pte_update(mm, addr, ptep);
532} 556}
533 557
534/* The Guest calls this to set a top-level entry. Again, we set the entry then 558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
535 * tell the Host which top-level page we changed, and the index of the entry we 559 * to set a middle-level entry when PAE is activated.
536 * changed. */ 560 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */
562#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{
565 native_set_pud(pudp, pudval);
566
567 /* 32 bytes aligned pdpt address and the index. */
568 lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
569 (__pa(pudp) & 0x1F) / sizeof(pud_t));
570}
571
537static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 572static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
538{ 573{
539 *pmdp = pmdval; 574 native_set_pmd(pmdp, pmdval);
540 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 575 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
541 (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); 576 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
542} 577}
578#else
579
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{
584 native_set_pmd(pmdp, pmdval);
585 lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
586 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
587}
588#endif
543 589
544/* There are a couple of legacy places where the kernel sets a PTE, but we 590/* There are a couple of legacy places where the kernel sets a PTE, but we
545 * don't know the top level any more. This is useless for us, since we don't 591 * don't know the top level any more. This is useless for us, since we don't
@@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
552 * which brings boot back to 0.25 seconds. */ 598 * which brings boot back to 0.25 seconds. */
553static void lguest_set_pte(pte_t *ptep, pte_t pteval) 599static void lguest_set_pte(pte_t *ptep, pte_t pteval)
554{ 600{
555 *ptep = pteval; 601 native_set_pte(ptep, pteval);
602 if (cr3_changed)
603 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
604}
605
606#ifdef CONFIG_X86_PAE
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{
609 native_set_pte_atomic(ptep, pte);
556 if (cr3_changed) 610 if (cr3_changed)
557 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 611 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
558} 612}
559 613
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
615{
616 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep);
618}
619
620void lguest_pmd_clear(pmd_t *pmdp)
621{
622 lguest_set_pmd(pmdp, __pmd(0));
623}
624#endif
625
560/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
561 * native page table operations. On native hardware you can set a new page 627 * native page table operations. On native hardware you can set a new page
562 * table entry whenever you want, but if you want to remove one you have to do 628 * table entry whenever you want, but if you want to remove one you have to do
@@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void)
628{ 694{
629 unsigned int i; 695 unsigned int i;
630 696
631 for (i = 0; i < LGUEST_IRQS; i++) { 697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
632 int vector = FIRST_EXTERNAL_VECTOR + i;
633 /* Some systems map "vectors" to interrupts weirdly. Lguest has 698 /* Some systems map "vectors" to interrupts weirdly. Lguest has
634 * a straightforward 1 to 1 mapping, so force that here. */ 699 * a straightforward 1 to 1 mapping, so force that here. */
635 __get_cpu_var(vector_irq)[vector] = i; 700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
636 if (vector != SYSCALL_VECTOR) 701 if (i != SYSCALL_VECTOR)
637 set_intr_gate(vector, interrupt[i]); 702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
638 } 703 }
639 /* This call is required to set up for 4k stacks, where we have 704 /* This call is required to set up for 4k stacks, where we have
640 * separate stacks for hard and soft interrupts. */ 705 * separate stacks for hard and soft interrupts. */
@@ -973,10 +1038,10 @@ static void lguest_restart(char *reason)
973 * 1038 *
974 * Our current solution is to allow the paravirt back end to optionally patch 1039 * Our current solution is to allow the paravirt back end to optionally patch
975 * over the indirect calls to replace them with something more efficient. We 1040 * over the indirect calls to replace them with something more efficient. We
976 * patch the four most commonly called functions: disable interrupts, enable 1041 * patch two of the simplest of the most commonly called functions: disable
977 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 1042 * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
978 * bytes to patch into: the Guest versions of these operations are small enough 1043 * into: the Guest versions of these operations are small enough that we can
979 * that we can fit comfortably. 1044 * fit comfortably.
980 * 1045 *
981 * First we need assembly templates of each of the patchable Guest operations, 1046 * First we need assembly templates of each of the patchable Guest operations,
982 * and these are in i386_head.S. */ 1047 * and these are in i386_head.S. */
@@ -987,8 +1052,6 @@ static const struct lguest_insns
987 const char *start, *end; 1052 const char *start, *end;
988} lguest_insns[] = { 1053} lguest_insns[] = {
989 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1054 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
990 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
991 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
992 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
993}; 1056};
994 1057
@@ -1026,6 +1089,7 @@ __init void lguest_init(void)
1026 pv_info.name = "lguest"; 1089 pv_info.name = "lguest";
1027 pv_info.paravirt_enabled = 1; 1090 pv_info.paravirt_enabled = 1;
1028 pv_info.kernel_rpl = 1; 1091 pv_info.kernel_rpl = 1;
1092 pv_info.shared_kernel_pmd = 1;
1029 1093
1030 /* We set up all the lguest overrides for sensitive operations. These 1094 /* We set up all the lguest overrides for sensitive operations. These
1031 * are detailed with the operations themselves. */ 1095 * are detailed with the operations themselves. */
@@ -1033,9 +1097,9 @@ __init void lguest_init(void)
1033 /* interrupt-related operations */ 1097 /* interrupt-related operations */
1034 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1098 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1035 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1036 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); 1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1037 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1101 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
1038 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); 1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1039 pv_irq_ops.safe_halt = lguest_safe_halt; 1103 pv_irq_ops.safe_halt = lguest_safe_halt;
1040 1104
1041 /* init-time operations */ 1105 /* init-time operations */
@@ -1071,6 +1135,12 @@ __init void lguest_init(void)
1071 pv_mmu_ops.set_pte = lguest_set_pte; 1135 pv_mmu_ops.set_pte = lguest_set_pte;
1072 pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1136 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
1073 pv_mmu_ops.set_pmd = lguest_set_pmd; 1137 pv_mmu_ops.set_pmd = lguest_set_pmd;
1138#ifdef CONFIG_X86_PAE
1139 pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
1140 pv_mmu_ops.pte_clear = lguest_pte_clear;
1141 pv_mmu_ops.pmd_clear = lguest_pmd_clear;
1142 pv_mmu_ops.set_pud = lguest_set_pud;
1143#endif
1074 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1144 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1075 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1145 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1076 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1146 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
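
For orientation, the lazy/async hypercall scheme that the boot.c changes above widen to five arguments can be modelled in isolation as below. This is a toy, single-CPU model with invented names (toy_ring, slot_busy); the real code tracks readiness in lguest_data.hcall_status bytes and orders the stores with wmb():

#include <stdbool.h>

#define RING_SLOTS 64

struct toy_hcall { unsigned long call, a1, a2, a3, a4; };

static struct toy_hcall toy_ring[RING_SLOTS];
static bool slot_busy[RING_SLOTS];
static unsigned int next_slot;

/* Stand-in for a synchronous hypercall: the host runs every queued call
 * plus this one, which is what empties the ring. */
static void sync_hcall(struct toy_hcall req)
{
	unsigned int i;

	for (i = 0; i < RING_SLOTS; i++)
		slot_busy[i] = false;
	(void)req;
}

/* Queue a call while in lazy MMU mode; if the ring is full, fall back to
 * a synchronous call, which flushes everything anyway. */
static void lazy_hcall(unsigned long call, unsigned long a1, unsigned long a2,
		       unsigned long a3, unsigned long a4)
{
	struct toy_hcall req = { call, a1, a2, a3, a4 };

	if (slot_busy[next_slot]) {
		sync_hcall(req);
		return;
	}
	toy_ring[next_slot] = req;
	slot_busy[next_slot] = true;	/* marked ready for the host */
	next_slot = (next_slot + 1) % RING_SLOTS;
}
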
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
46 .globl lgstart_##name; .globl lgend_##name 46 .globl lgstart_##name; .globl lgend_##name
47 47
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
50LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
51LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
52/*:*/ 50
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
52 * matter for save_fl and irq_disable later). If we write our routines
53 * carefully in assembler, we can avoid clobbering any registers and avoid
54 * jumping through the wrapper functions.
55 *
56 * I skipped over our first piece of assembler, but this one is worth studying
 57 * in a bit more detail so I'll describe it in easy stages. First, the routine
58 * to enable interrupts: */
59ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */
 67	testl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */
72 ret
73send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS.
76 *
77 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */
82 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
88 popl %eax
89 ret
90
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */
94ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret
53 107
54/* These demark the EIP range where host should never deliver interrupts. */ 108/* These demark the EIP range where host should never deliver interrupts. */
55.global lguest_noirq_start 109.global lguest_noirq_start
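
Read as C, the two assembly entry points above do roughly the following. This is an illustrative equivalent only; the real routines stay in assembler so the common path clobbers no registers, and lguest reuses the kvm_hypercall*() wrappers for its hypercalls (as async_hcall() above already does):

/* Rough C equivalent of lg_irq_enable (illustration only). */
static void lg_irq_enable_equiv(void)
{
	lguest_data.irq_enabled = X86_EFLAGS_IF;
	/* If the Host queued interrupts while they were off, ask for them now. */
	if (lguest_data.irq_pending)
		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
}

/* Rough C equivalent of lg_restore_fl (illustration only). */
static void lg_restore_fl_equiv(unsigned long flags)
{
	lguest_data.irq_enabled = flags;
	/* Only bother the Host if interrupts are now on AND something is pending. */
	if (flags & lguest_data.irq_pending)
		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
}
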
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66c..f9d35632666b 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr.o
6 6
7lib-y := delay.o 7lib-y := delay.o
8lib-y += thunk_$(BITS).o 8lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb6..000000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __wrmsr_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 wrmsr(rv->msr_no, rv->l, rv->h);
24}
25
26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{
28 int err;
29 struct msr_info rv;
30
31 rv.msr_no = msr_no;
32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 *l = rv.l;
34 *h = rv.h;
35
36 return err;
37}
38
39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
55{
56 struct msr_info *rv = info;
57
58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
59}
60
61static void __wrmsr_safe_on_cpu(void *info)
62{
63 struct msr_info *rv = info;
64
65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
66}
67
68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
69{
70 int err;
71 struct msr_info rv;
72
73 rv.msr_no = msr_no;
74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
75 *l = rv.l;
76 *h = rv.h;
77
78 return err ? err : rv.err;
79}
80
81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
82{
83 int err;
84 struct msr_info rv;
85
86 rv.msr_no = msr_no;
87 rv.l = l;
88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
92}
93
94EXPORT_SYMBOL(rdmsr_on_cpu);
95EXPORT_SYMBOL(wrmsr_on_cpu);
96EXPORT_SYMBOL(rdmsr_safe_on_cpu);
97EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 000000000000..1440b9c0547e
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,183 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 preempt_disable();
93 /*
94 * FIXME: handle the CPU we're executing on separately for now until
95 * smp_call_function_many has been fixed to not skip it.
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable();
102}
103EXPORT_SYMBOL(rdmsr_on_cpus);
104
105/*
106 * wrmsr on a bunch of CPUs
107 *
108 * @mask: which CPUs
109 * @msr_no: which MSR
110 * @msrs: array of MSR values
111 *
112 */
113void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
114{
115 struct msr_info rv;
116 int this_cpu;
117
118 memset(&rv, 0, sizeof(rv));
119
120 rv.off = cpumask_first(mask);
121 rv.msrs = msrs;
122 rv.msr_no = msr_no;
123
124 preempt_disable();
125 /*
126 * FIXME: handle the CPU we're executing on separately for now until
127 * smp_call_function_many has been fixed to not skip it.
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable();
134}
135EXPORT_SYMBOL(wrmsr_on_cpus);
136
137/* These "safe" variants are slower and should be used when the target MSR
138 may not actually exist. */
139static void __rdmsr_safe_on_cpu(void *info)
140{
141 struct msr_info *rv = info;
142
143 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
144}
145
146static void __wrmsr_safe_on_cpu(void *info)
147{
148 struct msr_info *rv = info;
149
150 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
151}
152
153int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
154{
155 int err;
156 struct msr_info rv;
157
158 memset(&rv, 0, sizeof(rv));
159
160 rv.msr_no = msr_no;
161 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
162 *l = rv.reg.l;
163 *h = rv.reg.h;
164
165 return err ? err : rv.err;
166}
167EXPORT_SYMBOL(rdmsr_safe_on_cpu);
168
169int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
170{
171 int err;
172 struct msr_info rv;
173
174 memset(&rv, 0, sizeof(rv));
175
176 rv.msr_no = msr_no;
177 rv.reg.l = l;
178 rv.reg.h = h;
179 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
180
181 return err ? err : rv.err;
182}
183EXPORT_SYMBOL(wrmsr_safe_on_cpu);
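
A minimal usage sketch for the new bulk interface: read one MSR on every online CPU and print it. MSR_IA32_UCODE_REV is only an example MSR; the array is sized by nr_cpu_ids so sparse CPU numbering cannot overflow it, and it is indexed relative to the first CPU in the mask, mirroring rv.off above:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <asm/msr.h>

static void dump_msr_on_online_cpus(u32 msr_no)
{
	unsigned int cpu, first = cpumask_first(&cpu_online_map);
	struct msr *msrs;

	msrs = kcalloc(nr_cpu_ids, sizeof(*msrs), GFP_KERNEL);
	if (!msrs)
		return;

	rdmsr_on_cpus(&cpu_online_map, msr_no, msrs);

	for_each_online_cpu(cpu)
		pr_info("cpu %u: msr %#x = %08x:%08x\n", cpu, msr_no,
			msrs[cpu - first].h, msrs[cpu - first].l);

	kfree(msrs);
}

/* e.g. dump_msr_on_online_cpus(MSR_IA32_UCODE_REV); */
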
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
10 10
11obj-$(CONFIG_HIGHMEM) += highmem_32.o 11obj-$(CONFIG_HIGHMEM) += highmem_32.o
12 12
13obj-$(CONFIG_KMEMCHECK) += kmemcheck/
14
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 15obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 16mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 17obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..baa0e86adfbc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -14,6 +14,7 @@
14 14
15#include <asm/traps.h> /* dotraplinkage, ... */ 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
17 18
18/* 19/*
19 * Page fault error code bits: 20 * Page fault error code bits:
@@ -956,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
956 /* Get the faulting address: */ 957 /* Get the faulting address: */
957 address = read_cr2(); 958 address = read_cr2();
958 959
960 /*
961 * Detect and handle instructions that would cause a page fault for
962 * both a tracked kernel page and a userspace page.
963 */
964 if (kmemcheck_active(regs))
965 kmemcheck_hide(regs);
966
959 if (unlikely(kmmio_fault(regs, address))) 967 if (unlikely(kmmio_fault(regs, address)))
960 return; 968 return;
961 969
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
973 * protection error (error_code & 9) == 0. 981 * protection error (error_code & 9) == 0.
974 */ 982 */
975 if (unlikely(fault_in_kernel_space(address))) { 983 if (unlikely(fault_in_kernel_space(address))) {
976 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 984 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
977 vmalloc_fault(address) >= 0) 985 if (vmalloc_fault(address) >= 0)
978 return; 986 return;
987
988 if (kmemcheck_fault(regs, address, error_code))
989 return;
990 }
979 991
980 /* Can handle a stale RO->RW TLB: */ 992 /* Can handle a stale RO->RW TLB: */
981 if (spurious_fault(error_code, address)) 993 if (spurious_fault(error_code, address))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..f53b57e4086f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -213,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
213 if (!after_bootmem) 213 if (!after_bootmem)
214 init_gbpages(); 214 init_gbpages();
215 215
216#ifdef CONFIG_DEBUG_PAGEALLOC 216#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
217 /* 217 /*
218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
219 * This will simplify cpa(), which otherwise needs to support splitting 219 * This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 949708d7a481..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
111 pte_t *page_table = NULL; 111 pte_t *page_table = NULL;
112 112
113 if (after_bootmem) { 113 if (after_bootmem) {
114#ifdef CONFIG_DEBUG_PAGEALLOC 114#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
116#endif 116#endif
117 if (!page_table) 117 if (!page_table)
@@ -564,7 +564,7 @@ static inline void save_pg_dir(void)
564} 564}
565#endif /* !CONFIG_ACPI_SLEEP */ 565#endif /* !CONFIG_ACPI_SLEEP */
566 566
567void zap_low_mappings(void) 567void zap_low_mappings(bool early)
568{ 568{
569 int i; 569 int i;
570 570
@@ -581,7 +581,11 @@ void zap_low_mappings(void)
581 set_pgd(swapper_pg_dir+i, __pgd(0)); 581 set_pgd(swapper_pg_dir+i, __pgd(0));
582#endif 582#endif
583 } 583 }
584 flush_tlb_all(); 584
585 if (early)
586 __flush_tlb();
587 else
588 flush_tlb_all();
585} 589}
586 590
587pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 591pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
@@ -956,7 +960,7 @@ void __init mem_init(void)
956 test_wp_bit(); 960 test_wp_bit();
957 961
958 save_pg_dir(); 962 save_pg_dir();
959 zap_low_mappings(); 963 zap_low_mappings(true);
960} 964}
961 965
962#ifdef CONFIG_MEMORY_HOTPLUG 966#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..9c543290a813 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void)
104 void *ptr; 104 void *ptr;
105 105
106 if (after_bootmem) 106 if (after_bootmem)
107 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 107 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
108 else 108 else
109 ptr = alloc_bootmem_pages(PAGE_SIZE); 109 ptr = alloc_bootmem_pages(PAGE_SIZE);
110 110
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
281 void *adr; 281 void *adr;
282 282
283 if (after_bootmem) { 283 if (after_bootmem) {
284 adr = (void *)get_zeroed_page(GFP_ATOMIC); 284 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
285 *phys = __pa(adr); 285 *phys = __pa(adr);
286 286
287 return adr; 287 return adr;
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
1obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
1#include <linux/interrupt.h>
2#include <linux/kdebug.h>
3#include <linux/kmemcheck.h>
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/ptrace.h>
7#include <linux/stacktrace.h>
8#include <linux/string.h>
9
10#include "error.h"
11#include "shadow.h"
12
13enum kmemcheck_error_type {
14 KMEMCHECK_ERROR_INVALID_ACCESS,
15 KMEMCHECK_ERROR_BUG,
16};
17
18#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
19
20struct kmemcheck_error {
21 enum kmemcheck_error_type type;
22
23 union {
24 /* KMEMCHECK_ERROR_INVALID_ACCESS */
25 struct {
26 /* Kind of access that caused the error */
27 enum kmemcheck_shadow state;
28 /* Address and size of the erroneous read */
29 unsigned long address;
30 unsigned int size;
31 };
32 };
33
34 struct pt_regs regs;
35 struct stack_trace trace;
36 unsigned long trace_entries[32];
37
38 /* We compress it to a char. */
39 unsigned char shadow_copy[SHADOW_COPY_SIZE];
40 unsigned char memory_copy[SHADOW_COPY_SIZE];
41};
42
43/*
44 * Create a ring queue of errors to output. We can't call printk() directly
45 * from the kmemcheck traps, since this may call the console drivers and
46 * result in a recursive fault.
47 */
48static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
49static unsigned int error_count;
50static unsigned int error_rd;
51static unsigned int error_wr;
52static unsigned int error_missed_count;
53
54static struct kmemcheck_error *error_next_wr(void)
55{
56 struct kmemcheck_error *e;
57
58 if (error_count == ARRAY_SIZE(error_fifo)) {
59 ++error_missed_count;
60 return NULL;
61 }
62
63 e = &error_fifo[error_wr];
64 if (++error_wr == ARRAY_SIZE(error_fifo))
65 error_wr = 0;
66 ++error_count;
67 return e;
68}
69
70static struct kmemcheck_error *error_next_rd(void)
71{
72 struct kmemcheck_error *e;
73
74 if (error_count == 0)
75 return NULL;
76
77 e = &error_fifo[error_rd];
78 if (++error_rd == ARRAY_SIZE(error_fifo))
79 error_rd = 0;
80 --error_count;
81 return e;
82}
83
84void kmemcheck_error_recall(void)
85{
86 static const char *desc[] = {
87 [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
88 [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
89 [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
90 [KMEMCHECK_SHADOW_FREED] = "freed",
91 };
92
93 static const char short_desc[] = {
94 [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
95 [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
96 [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
97 [KMEMCHECK_SHADOW_FREED] = 'f',
98 };
99
100 struct kmemcheck_error *e;
101 unsigned int i;
102
103 e = error_next_rd();
104 if (!e)
105 return;
106
107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address);
114
115 printk(KERN_INFO);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]);
118 printk("\n");
119
120 printk(KERN_INFO);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]);
124 else
125 printk(" ?");
126 }
127 printk("\n");
128 printk(KERN_INFO "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break;
131 case KMEMCHECK_ERROR_BUG:
132 printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
133 break;
134 }
135
136 __show_regs(&e->regs, 1);
137 print_stack_trace(&e->trace, 0);
138}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
154/*
155 * Save the context of an error report.
156 */
157void kmemcheck_error_save(enum kmemcheck_shadow state,
158 unsigned long address, unsigned int size, struct pt_regs *regs)
159{
160 static unsigned long prev_ip;
161
162 struct kmemcheck_error *e;
163 void *shadow_copy;
164 void *memory_copy;
165
166 /* Don't report several adjacent errors from the same EIP. */
167 if (regs->ip == prev_ip)
168 return;
169 prev_ip = regs->ip;
170
171 e = error_next_wr();
172 if (!e)
173 return;
174
175 e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
176
177 e->state = state;
178 e->address = address;
179 e->size = size;
180
181 /* Save regs */
182 memcpy(&e->regs, regs, sizeof(*regs));
183
184 /* Save stack trace */
185 e->trace.nr_entries = 0;
186 e->trace.entries = e->trace_entries;
187 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
188 e->trace.skip = 0;
189 save_stack_trace_bp(&e->trace, regs->bp);
190
 191	/* Round address down to the nearest SHADOW_COPY_SIZE boundary */
192 shadow_copy = kmemcheck_shadow_lookup(address
193 & ~(SHADOW_COPY_SIZE - 1));
194 BUG_ON(!shadow_copy);
195
196 memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
197
198 kmemcheck_show_addr(address);
199 memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
200 memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
201 kmemcheck_hide_addr(address);
202
203 tasklet_hi_schedule_first(&kmemcheck_tasklet);
204}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
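Note that the error path above never calls printk() from the trap itself: reports are queued in error_fifo and drained later from the kmemcheck_tasklet (do_wakeup), which works because kmemcheck limits the machine to a single CPU and the producer runs with interrupts disabled. The following is a minimal user-space sketch of the same single-producer ring indexing used by error_next_wr()/error_next_rd(); the names and queue size here are illustrative only, not part of the patch.

#include <stdio.h>

#define QUEUE_SIZE 4

static int fifo[QUEUE_SIZE];
static unsigned int count, rd, wr, missed;

static int *next_wr(void)
{
	int *e;

	if (count == QUEUE_SIZE) {
		++missed;			/* queue full: drop, but remember it */
		return NULL;
	}
	e = &fifo[wr];
	if (++wr == QUEUE_SIZE)
		wr = 0;
	++count;
	return e;
}

static int *next_rd(void)
{
	int *e;

	if (count == 0)
		return NULL;
	e = &fifo[rd];
	if (++rd == QUEUE_SIZE)
		rd = 0;
	--count;
	return e;
}

int main(void)
{
	int i, *slot;

	for (i = 0; i < 6; ++i)			/* the last two are dropped */
		if ((slot = next_wr()) != NULL)
			*slot = i;

	while ((slot = next_rd()) != NULL)	/* drained "later", like do_wakeup() */
		printf("report %d\n", *slot);

	printf("missed: %u\n", missed);
	return 0;
}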
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
2#define ARCH__X86__MM__KMEMCHECK__ERROR_H
3
4#include <linux/ptrace.h>
5
6#include "shadow.h"
7
8void kmemcheck_error_save(enum kmemcheck_shadow state,
9 unsigned long address, unsigned int size, struct pt_regs *regs);
10
11void kmemcheck_error_save_bug(struct pt_regs *regs);
12
13void kmemcheck_error_recall(void);
14
15#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/page-flags.h>
19#include <linux/percpu.h>
20#include <linux/ptrace.h>
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <asm/cacheflush.h>
25#include <asm/kmemcheck.h>
26#include <asm/pgtable.h>
27#include <asm/tlbflush.h>
28
29#include "error.h"
30#include "opcode.h"
31#include "pte.h"
32#include "selftest.h"
33#include "shadow.h"
34
35
36#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
37# define KMEMCHECK_ENABLED 0
38#endif
39
40#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
41# define KMEMCHECK_ENABLED 1
42#endif
43
44#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
45# define KMEMCHECK_ENABLED 2
46#endif
47
48int kmemcheck_enabled = KMEMCHECK_ENABLED;
49
50int __init kmemcheck_init(void)
51{
52#ifdef CONFIG_SMP
53 /*
54 * Limit SMP to use a single CPU. We rely on the fact that this code
55 * runs before SMP is set up.
56 */
57 if (setup_max_cpus > 1) {
58 printk(KERN_INFO
59 "kmemcheck: Limiting number of CPUs to 1.\n");
60 setup_max_cpus = 1;
61 }
62#endif
63
64 if (!kmemcheck_selftest()) {
65 printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
66 kmemcheck_enabled = 0;
67 return -EINVAL;
68 }
69
70 printk(KERN_INFO "kmemcheck: Initialized\n");
71 return 0;
72}
73
74early_initcall(kmemcheck_init);
75
76/*
77 * We need to parse the kmemcheck= option before any memory is allocated.
78 */
79static int __init param_kmemcheck(char *str)
80{
81 if (!str)
82 return -EINVAL;
83
84 sscanf(str, "%d", &kmemcheck_enabled);
85 return 0;
86}
87
88early_param("kmemcheck", param_kmemcheck);
89
90int kmemcheck_show_addr(unsigned long address)
91{
92 pte_t *pte;
93
94 pte = kmemcheck_pte_lookup(address);
95 if (!pte)
96 return 0;
97
98 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
99 __flush_tlb_one(address);
100 return 1;
101}
102
103int kmemcheck_hide_addr(unsigned long address)
104{
105 pte_t *pte;
106
107 pte = kmemcheck_pte_lookup(address);
108 if (!pte)
109 return 0;
110
111 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
112 __flush_tlb_one(address);
113 return 1;
114}
115
116struct kmemcheck_context {
117 bool busy;
118 int balance;
119
120 /*
121 * There can be at most two memory operands to an instruction, but
122 * each address can cross a page boundary -- so we may need up to
123 * four addresses that must be hidden/revealed for each fault.
124 */
125 unsigned long addr[4];
126 unsigned long n_addrs;
127 unsigned long flags;
128
129 /* Data size of the instruction that caused a fault. */
130 unsigned int size;
131};
132
133static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
134
135bool kmemcheck_active(struct pt_regs *regs)
136{
137 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
138
139 return data->balance > 0;
140}
141
142/* Save an address that needs to be shown/hidden */
143static void kmemcheck_save_addr(unsigned long addr)
144{
145 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
146
147 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
148 data->addr[data->n_addrs++] = addr;
149}
150
151static unsigned int kmemcheck_show_all(void)
152{
153 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
154 unsigned int i;
155 unsigned int n;
156
157 n = 0;
158 for (i = 0; i < data->n_addrs; ++i)
159 n += kmemcheck_show_addr(data->addr[i]);
160
161 return n;
162}
163
164static unsigned int kmemcheck_hide_all(void)
165{
166 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
167 unsigned int i;
168 unsigned int n;
169
170 n = 0;
171 for (i = 0; i < data->n_addrs; ++i)
172 n += kmemcheck_hide_addr(data->addr[i]);
173
174 return n;
175}
176
177/*
178 * Called from the #PF handler.
179 */
180void kmemcheck_show(struct pt_regs *regs)
181{
182 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
183
184 BUG_ON(!irqs_disabled());
185
186 if (unlikely(data->balance != 0)) {
187 kmemcheck_show_all();
188 kmemcheck_error_save_bug(regs);
189 data->balance = 0;
190 return;
191 }
192
193 /*
194 * None of the addresses actually belonged to kmemcheck. Note that
195 * this is not an error.
196 */
197 if (kmemcheck_show_all() == 0)
198 return;
199
200 ++data->balance;
201
202 /*
203 * The IF needs to be cleared as well, so that the faulting
204 * instruction can run "uninterrupted". Otherwise, we might take
205 * an interrupt and start executing that before we've had a chance
206 * to hide the page again.
207 *
208 * NOTE: In the rare case of multiple faults, we must not override
209 * the original flags:
210 */
211 if (!(regs->flags & X86_EFLAGS_TF))
212 data->flags = regs->flags;
213
214 regs->flags |= X86_EFLAGS_TF;
215 regs->flags &= ~X86_EFLAGS_IF;
216}
217
218/*
219 * Called from the #DB handler.
220 */
221void kmemcheck_hide(struct pt_regs *regs)
222{
223 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
224 int n;
225
226 BUG_ON(!irqs_disabled());
227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs);
234 data->n_addrs = 0;
235 data->balance = 0;
236
237 if (!(data->flags & X86_EFLAGS_TF))
238 regs->flags &= ~X86_EFLAGS_TF;
239 if (data->flags & X86_EFLAGS_IF)
240 regs->flags |= X86_EFLAGS_IF;
241 return;
242 }
243
244 if (kmemcheck_enabled)
245 n = kmemcheck_hide_all();
246 else
247 n = kmemcheck_show_all();
248
249 if (n == 0)
250 return;
251
252 --data->balance;
253
254 data->n_addrs = 0;
255
256 if (!(data->flags & X86_EFLAGS_TF))
257 regs->flags &= ~X86_EFLAGS_TF;
258 if (data->flags & X86_EFLAGS_IF)
259 regs->flags |= X86_EFLAGS_IF;
260}
261
262void kmemcheck_show_pages(struct page *p, unsigned int n)
263{
264 unsigned int i;
265
266 for (i = 0; i < n; ++i) {
267 unsigned long address;
268 pte_t *pte;
269 unsigned int level;
270
271 address = (unsigned long) page_address(&p[i]);
272 pte = lookup_address(address, &level);
273 BUG_ON(!pte);
274 BUG_ON(level != PG_LEVEL_4K);
275
276 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
277 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
278 __flush_tlb_one(address);
279 }
280}
281
282bool kmemcheck_page_is_tracked(struct page *p)
283{
284 /* This will also check the "hidden" flag of the PTE. */
285 return kmemcheck_pte_lookup((unsigned long) page_address(p));
286}
287
288void kmemcheck_hide_pages(struct page *p, unsigned int n)
289{
290 unsigned int i;
291
292 for (i = 0; i < n; ++i) {
293 unsigned long address;
294 pte_t *pte;
295 unsigned int level;
296
297 address = (unsigned long) page_address(&p[i]);
298 pte = lookup_address(address, &level);
299 BUG_ON(!pte);
300 BUG_ON(level != PG_LEVEL_4K);
301
302 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
303 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
304 __flush_tlb_one(address);
305 }
306}
307
308/* Access may NOT cross page boundary */
309static void kmemcheck_read_strict(struct pt_regs *regs,
310 unsigned long addr, unsigned int size)
311{
312 void *shadow;
313 enum kmemcheck_shadow status;
314
315 shadow = kmemcheck_shadow_lookup(addr);
316 if (!shadow)
317 return;
318
319 kmemcheck_save_addr(addr);
320 status = kmemcheck_shadow_test(shadow, size);
321 if (status == KMEMCHECK_SHADOW_INITIALIZED)
322 return;
323
324 if (kmemcheck_enabled)
325 kmemcheck_error_save(status, addr, size, regs);
326
327 if (kmemcheck_enabled == 2)
328 kmemcheck_enabled = 0;
329
330 /* Don't warn about it again. */
331 kmemcheck_shadow_set(shadow, size);
332}
333
334/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size)
337{
338 unsigned long page = addr & PAGE_MASK;
339 unsigned long next_addr = addr + size - 1;
340 unsigned long next_page = next_addr & PAGE_MASK;
341
342 if (likely(page == next_page)) {
343 kmemcheck_read_strict(regs, addr, size);
344 return;
345 }
346
347 /*
348 * What we do is basically to split the access across the
349 * two pages and handle each part separately. Yes, this means
350 * that we may now see reads that are 3 + 5 bytes, for
351 * example (and if both are uninitialized, there will be two
352 * reports), but it makes the code a lot simpler.
353 */
354 kmemcheck_read_strict(regs, addr, next_page - addr);
355 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
356}
357
358static void kmemcheck_write_strict(struct pt_regs *regs,
359 unsigned long addr, unsigned int size)
360{
361 void *shadow;
362
363 shadow = kmemcheck_shadow_lookup(addr);
364 if (!shadow)
365 return;
366
367 kmemcheck_save_addr(addr);
368 kmemcheck_shadow_set(shadow, size);
369}
370
371static void kmemcheck_write(struct pt_regs *regs,
372 unsigned long addr, unsigned int size)
373{
374 unsigned long page = addr & PAGE_MASK;
375 unsigned long next_addr = addr + size - 1;
376 unsigned long next_page = next_addr & PAGE_MASK;
377
378 if (likely(page == next_page)) {
379 kmemcheck_write_strict(regs, addr, size);
380 return;
381 }
382
383 /* See comment in kmemcheck_read(). */
384 kmemcheck_write_strict(regs, addr, next_page - addr);
385 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
386}
387
388/*
389 * Copying is hard. We have two addresses, each of which may be split across
390 * a page (and each page will have different shadow addresses).
391 */
392static void kmemcheck_copy(struct pt_regs *regs,
393 unsigned long src_addr, unsigned long dst_addr, unsigned int size)
394{
395 uint8_t shadow[8];
396 enum kmemcheck_shadow status;
397
398 unsigned long page;
399 unsigned long next_addr;
400 unsigned long next_page;
401
402 uint8_t *x;
403 unsigned int i;
404 unsigned int n;
405
406 BUG_ON(size > sizeof(shadow));
407
408 page = src_addr & PAGE_MASK;
409 next_addr = src_addr + size - 1;
410 next_page = next_addr & PAGE_MASK;
411
412 if (likely(page == next_page)) {
413 /* Same page */
414 x = kmemcheck_shadow_lookup(src_addr);
415 if (x) {
416 kmemcheck_save_addr(src_addr);
417 for (i = 0; i < size; ++i)
418 shadow[i] = x[i];
419 } else {
420 for (i = 0; i < size; ++i)
421 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
422 }
423 } else {
424 n = next_page - src_addr;
425 BUG_ON(n > sizeof(shadow));
426
427 /* First page */
428 x = kmemcheck_shadow_lookup(src_addr);
429 if (x) {
430 kmemcheck_save_addr(src_addr);
431 for (i = 0; i < n; ++i)
432 shadow[i] = x[i];
433 } else {
434 /* Not tracked */
435 for (i = 0; i < n; ++i)
436 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
437 }
438
439 /* Second page */
440 x = kmemcheck_shadow_lookup(next_page);
441 if (x) {
442 kmemcheck_save_addr(next_page);
443 for (i = n; i < size; ++i)
444 shadow[i] = x[i - n];
445 } else {
446 /* Not tracked */
447 for (i = n; i < size; ++i)
448 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
449 }
450 }
451
452 page = dst_addr & PAGE_MASK;
453 next_addr = dst_addr + size - 1;
454 next_page = next_addr & PAGE_MASK;
455
456 if (likely(page == next_page)) {
457 /* Same page */
458 x = kmemcheck_shadow_lookup(dst_addr);
459 if (x) {
460 kmemcheck_save_addr(dst_addr);
461 for (i = 0; i < size; ++i) {
462 x[i] = shadow[i];
463 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
464 }
465 }
466 } else {
467 n = next_page - dst_addr;
468 BUG_ON(n > sizeof(shadow));
469
470 /* First page */
471 x = kmemcheck_shadow_lookup(dst_addr);
472 if (x) {
473 kmemcheck_save_addr(dst_addr);
474 for (i = 0; i < n; ++i) {
475 x[i] = shadow[i];
476 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
477 }
478 }
479
480 /* Second page */
481 x = kmemcheck_shadow_lookup(next_page);
482 if (x) {
483 kmemcheck_save_addr(next_page);
484 for (i = n; i < size; ++i) {
485 x[i - n] = shadow[i];
486 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
487 }
488 }
489 }
490
491 status = kmemcheck_shadow_test(shadow, size);
492 if (status == KMEMCHECK_SHADOW_INITIALIZED)
493 return;
494
495 if (kmemcheck_enabled)
496 kmemcheck_error_save(status, src_addr, size, regs);
497
498 if (kmemcheck_enabled == 2)
499 kmemcheck_enabled = 0;
500}
501
502enum kmemcheck_method {
503 KMEMCHECK_READ,
504 KMEMCHECK_WRITE,
505};
506
507static void kmemcheck_access(struct pt_regs *regs,
508 unsigned long fallback_address, enum kmemcheck_method fallback_method)
509{
510 const uint8_t *insn;
511 const uint8_t *insn_primary;
512 unsigned int size;
513
514 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
515
516 /* Recursive fault -- ouch. */
517 if (data->busy) {
518 kmemcheck_show_addr(fallback_address);
519 kmemcheck_error_save_bug(regs);
520 return;
521 }
522
523 data->busy = true;
524
525 insn = (const uint8_t *) regs->ip;
526 insn_primary = kmemcheck_opcode_get_primary(insn);
527
528 kmemcheck_opcode_decode(insn, &size);
529
530 switch (insn_primary[0]) {
531#ifdef CONFIG_KMEMCHECK_BITOPS_OK
532 /* AND, OR, XOR */
533 /*
534 * Unfortunately, these instructions have to be excluded from
535 * our regular checking since they access only some (and not
536 * all) bits. This clears out "bogus" bitfield-access warnings.
537 */
538 case 0x80:
539 case 0x81:
540 case 0x82:
541 case 0x83:
542 switch ((insn_primary[1] >> 3) & 7) {
543 /* OR */
544 case 1:
545 /* AND */
546 case 4:
547 /* XOR */
548 case 6:
549 kmemcheck_write(regs, fallback_address, size);
550 goto out;
551
552 /* ADD */
553 case 0:
554 /* ADC */
555 case 2:
556 /* SBB */
557 case 3:
558 /* SUB */
559 case 5:
560 /* CMP */
561 case 7:
562 break;
563 }
564 break;
565#endif
566
567 /* MOVS, MOVSB, MOVSW, MOVSD */
568 case 0xa4:
569 case 0xa5:
570 /*
571 * These instructions are special because they take two
572 * addresses, but we only get one page fault.
573 */
574 kmemcheck_copy(regs, regs->si, regs->di, size);
575 goto out;
576
577 /* CMPS, CMPSB, CMPSW, CMPSD */
578 case 0xa6:
579 case 0xa7:
580 kmemcheck_read(regs, regs->si, size);
581 kmemcheck_read(regs, regs->di, size);
582 goto out;
583 }
584
585 /*
586 * If the opcode isn't special in any way, we use the data from the
587 * page fault handler to determine the address and type of memory
588 * access.
589 */
590 switch (fallback_method) {
591 case KMEMCHECK_READ:
592 kmemcheck_read(regs, fallback_address, size);
593 goto out;
594 case KMEMCHECK_WRITE:
595 kmemcheck_write(regs, fallback_address, size);
596 goto out;
597 }
598
599out:
600 data->busy = false;
601}
602
603bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
604 unsigned long error_code)
605{
606 pte_t *pte;
607
608 /*
609 * XXX: Is it safe to assume that memory accesses from virtual 86
610 * mode or non-kernel code segments will _never_ access kernel
611 * memory (e.g. tracked pages)? For now, we need this to avoid
612 * invoking kmemcheck for PnP BIOS calls.
613 */
614 if (regs->flags & X86_VM_MASK)
615 return false;
616 if (regs->cs != __KERNEL_CS)
617 return false;
618
619 pte = kmemcheck_pte_lookup(address);
620 if (!pte)
621 return false;
622
623 if (error_code & 2)
624 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
625 else
626 kmemcheck_access(regs, address, KMEMCHECK_READ);
627
628 kmemcheck_show(regs);
629 return true;
630}
631
632bool kmemcheck_trap(struct pt_regs *regs)
633{
634 if (!kmemcheck_active(regs))
635 return false;
636
637 /* We're done. */
638 kmemcheck_hide(regs);
639 return true;
640}
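kmemcheck_read()/kmemcheck_write() split an access that straddles a page boundary into two page-local checks, as the comment above notes ("3 + 5 bytes, for example"). Below is a stand-alone sketch of that idea; for clarity it computes the split from the bytes remaining in the first page rather than from masked addresses, and check_one_page() is a placeholder standing in for kmemcheck_read_strict()/kmemcheck_write_strict().

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* placeholder for the page-local shadow check */
static void check_one_page(unsigned long addr, unsigned long size)
{
	printf("check %#lx..%#lx (%lu bytes)\n", addr, addr + size - 1, size);
}

static void check_split(unsigned long addr, unsigned long size)
{
	unsigned long in_first = PAGE_SIZE - (addr & ~PAGE_MASK);

	if (size <= in_first) {				/* common case: one page */
		check_one_page(addr, size);
		return;
	}
	check_one_page(addr, in_first);			/* e.g. 3 bytes ... */
	check_one_page(addr + in_first, size - in_first); /* ... + 5 bytes */
}

int main(void)
{
	check_split(0x1ffd, 8);		/* crosses the 0x2000 boundary: 3 + 5 */
	check_split(0x3000, 4);		/* stays within one page */
	return 0;
}

Separately, the kmemcheck= early parameter parsed above takes the same values as kmemcheck_enabled: 0 (checking off), 1 (on), and 2 (one-shot mode, where checking disables itself after the first report, as done in kmemcheck_read_strict() and kmemcheck_copy()).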
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
1#include <linux/types.h>
2
3#include "opcode.h"
4
5static bool opcode_is_prefix(uint8_t b)
6{
7 return
8 /* Group 1 */
9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
13 /* Group 3 */
14 || b == 0x66
15 /* Group 4 */
16 || b == 0x67;
17}
18
19#ifdef CONFIG_X86_64
20static bool opcode_is_rex_prefix(uint8_t b)
21{
22 return (b & 0xf0) == 0x40;
23}
24#else
25static bool opcode_is_rex_prefix(uint8_t b)
26{
27 return false;
28}
29#endif
30
31#define REX_W (1 << 3)
32
33/*
34 * This is a VERY crude opcode decoder. We only need to find the size of the
35 * load/store that caused our #PF and this should work for all the opcodes
36 * that we care about. Moreover, the ones who invented this instruction set
37 * should be shot.
38 */
39void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
40{
41 /* Default operand size */
42 int operand_size_override = 4;
43
44 /* prefixes */
45 for (; opcode_is_prefix(*op); ++op) {
46 if (*op == 0x66)
47 operand_size_override = 2;
48 }
49
50 /* REX prefix */
51 if (opcode_is_rex_prefix(*op)) {
52 uint8_t rex = *op;
53
54 ++op;
55 if (rex & REX_W) {
56 switch (*op) {
57 case 0x63:
58 *size = 4;
59 return;
60 case 0x0f:
61 ++op;
62
63 switch (*op) {
64 case 0xb6:
65 case 0xbe:
66 *size = 1;
67 return;
68 case 0xb7:
69 case 0xbf:
70 *size = 2;
71 return;
72 }
73
74 break;
75 }
76
77 *size = 8;
78 return;
79 }
80 }
81
82 /* escape opcode */
83 if (*op == 0x0f) {
84 ++op;
85
86 /*
87 * This is move with zero-extend and sign-extend, respectively;
88 * we don't have to think about 0xb6/0xbe, because this is
89 * already handled in the conditional below.
90 */
91 if (*op == 0xb7 || *op == 0xbf)
92 operand_size_override = 2;
93 }
94
95 *size = (*op & 1) ? operand_size_override : 1;
96}
97
98const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
99{
100 /* skip prefixes */
101 while (opcode_is_prefix(*op))
102 ++op;
103 if (opcode_is_rex_prefix(*op))
104 ++op;
105 return op;
106}
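This decoder is exercised at boot by selftest.c (added below). The same checks can be run in a hypothetical user-space harness, assuming opcode.c/opcode.h are compiled with <stdint.h> standing in for <linux/types.h> and with CONFIG_X86_64 defined so the REX path is built; the instruction encodings in the comments are ordinary x86-64 forms chosen for illustration.

#include <stdio.h>
#include <stdint.h>

#include "opcode.h"	/* assumes a user-space build of opcode.c, -DCONFIG_X86_64 */

int main(void)
{
	/* movzwl -8(%rcx),%edx: 0f b7 51 f8 -> 2-byte memory operand */
	static const uint8_t movzwl[] = { 0x0f, 0xb7, 0x51, 0xf8 };
	/* mov -8(%rcx),%rdx: 48 8b 51 f8 -> REX.W, 8-byte memory operand */
	static const uint8_t movq[]   = { 0x48, 0x8b, 0x51, 0xf8 };
	unsigned int size;

	kmemcheck_opcode_decode(movzwl, &size);
	printf("movzwl operand size: %u\n", size);	/* expected: 2 */

	kmemcheck_opcode_decode(movq, &size);
	printf("movq operand size:   %u\n", size);	/* expected: 8 */
	return 0;
}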
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
2#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
3
4#include <linux/types.h>
5
6void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
7const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
8
9#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
1#include <linux/mm.h>
2
3#include <asm/pgtable.h>
4
5#include "pte.h"
6
7pte_t *kmemcheck_pte_lookup(unsigned long address)
8{
9 pte_t *pte;
10 unsigned int level;
11
12 pte = lookup_address(address, &level);
13 if (!pte)
14 return NULL;
15 if (level != PG_LEVEL_4K)
16 return NULL;
17 if (!pte_hidden(*pte))
18 return NULL;
19
20 return pte;
21}
22
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
2#define ARCH__X86__MM__KMEMCHECK__PTE_H
3
4#include <linux/mm.h>
5
6#include <asm/pgtable.h>
7
8pte_t *kmemcheck_pte_lookup(unsigned long address);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
1#include <linux/kernel.h>
2
3#include "opcode.h"
4#include "selftest.h"
5
6struct selftest_opcode {
7 unsigned int expected_size;
8 const uint8_t *insn;
9 const char *desc;
10};
11
12static const struct selftest_opcode selftest_opcodes[] = {
13 /* REP MOVS */
14 {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
15 {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
16
17 /* MOVZX / MOVZXD */
18 {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
19 {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
20
21 /* MOVSX / MOVSXD */
22 {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
23 {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
24
25#ifdef CONFIG_X86_64
26 /* MOVZX / MOVZXD */
27 {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
28 {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
29
30 /* MOVSX / MOVSXD */
31 {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
32 {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
33 {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
34#endif
35};
36
37static bool selftest_opcode_one(const struct selftest_opcode *op)
38{
39 unsigned size;
40
41 kmemcheck_opcode_decode(op->insn, &size);
42
43 if (size == op->expected_size)
44 return true;
45
46 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
47 op->desc, op->expected_size, size);
48 return false;
49}
50
51static bool selftest_opcodes_all(void)
52{
53 bool pass = true;
54 unsigned int i;
55
56 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
57 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
58
59 return pass;
60}
61
62bool kmemcheck_selftest(void)
63{
64 bool pass = true;
65
66 pass = pass && selftest_opcodes_all();
67
68 return pass;
69}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
1#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
2#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3
4bool kmemcheck_selftest(void);
5
6#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
1#include <linux/kmemcheck.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4#include <linux/module.h>
5
6#include <asm/page.h>
7#include <asm/pgtable.h>
8
9#include "pte.h"
10#include "shadow.h"
11
12/*
13 * Return the shadow address for the given address. Returns NULL if the
14 * address is not tracked.
15 *
16 * We need to be extremely careful not to follow any invalid pointers,
17 * because this function can be called for *any* possible address.
18 */
19void *kmemcheck_shadow_lookup(unsigned long address)
20{
21 pte_t *pte;
22 struct page *page;
23
24 if (!virt_addr_valid(address))
25 return NULL;
26
27 pte = kmemcheck_pte_lookup(address);
28 if (!pte)
29 return NULL;
30
31 page = virt_to_page(address);
32 if (!page->shadow)
33 return NULL;
34 return page->shadow + (address & (PAGE_SIZE - 1));
35}
36
37static void mark_shadow(void *address, unsigned int n,
38 enum kmemcheck_shadow status)
39{
40 unsigned long addr = (unsigned long) address;
41 unsigned long last_addr = addr + n - 1;
42 unsigned long page = addr & PAGE_MASK;
43 unsigned long last_page = last_addr & PAGE_MASK;
44 unsigned int first_n;
45 void *shadow;
46
 47	/* If the range crosses a page boundary, only handle the part in the first page here. */
48 if (page == last_page)
49 first_n = n;
50 else
51 first_n = page + PAGE_SIZE - addr;
52
53 shadow = kmemcheck_shadow_lookup(addr);
54 if (shadow)
55 memset(shadow, status, first_n);
56
57 addr += first_n;
58 n -= first_n;
59
60 /* Do full-page memset()s. */
61 while (n >= PAGE_SIZE) {
62 shadow = kmemcheck_shadow_lookup(addr);
63 if (shadow)
64 memset(shadow, status, PAGE_SIZE);
65
66 addr += PAGE_SIZE;
67 n -= PAGE_SIZE;
68 }
69
70 /* Do the remaining page, if any. */
71 if (n > 0) {
72 shadow = kmemcheck_shadow_lookup(addr);
73 if (shadow)
74 memset(shadow, status, n);
75 }
76}
77
78void kmemcheck_mark_unallocated(void *address, unsigned int n)
79{
80 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
81}
82
83void kmemcheck_mark_uninitialized(void *address, unsigned int n)
84{
85 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
86}
87
88/*
89 * Fill the shadow memory of the given address such that the memory at that
90 * address is marked as being initialized.
91 */
92void kmemcheck_mark_initialized(void *address, unsigned int n)
93{
94 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
95}
96EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
97
98void kmemcheck_mark_freed(void *address, unsigned int n)
99{
100 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
101}
102
103void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
104{
105 unsigned int i;
106
107 for (i = 0; i < n; ++i)
108 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
109}
110
111void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
112{
113 unsigned int i;
114
115 for (i = 0; i < n; ++i)
116 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
117}
118
119void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
120{
121 unsigned int i;
122
123 for (i = 0; i < n; ++i)
124 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
125}
126
127enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
128{
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
135 /*
136 * Make sure _some_ bytes are initialized. Gcc frequently generates
137 * code to access neighboring bytes.
138 */
139 for (i = 0; i < size; ++i) {
140 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
141 return x[i];
142 }
143#else
144 /* All bytes must be initialized. */
145 for (i = 0; i < size; ++i) {
146 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
147 return x[i];
148 }
149#endif
150
151 return x[0];
152}
153
154void kmemcheck_shadow_set(void *shadow, unsigned int size)
155{
156 uint8_t *x;
157 unsigned int i;
158
159 x = shadow;
160 for (i = 0; i < size; ++i)
161 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
162}
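The shadow is one byte of state per byte of tracked data, using the states from shadow.h: writes mark the written bytes initialized (kmemcheck_shadow_set()), and a read requires every accessed byte to be initialized unless CONFIG_KMEMCHECK_PARTIAL_OK relaxes that to "at least one byte". A minimal user-space model of that bookkeeping follows; the enum mirrors shadow.h, everything else (buffer size, function names) is illustrative.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

enum shadow_state {
	SHADOW_UNALLOCATED,
	SHADOW_UNINITIALIZED,
	SHADOW_INITIALIZED,
	SHADOW_FREED,
};

static uint8_t shadow[16];

/* like kmemcheck_shadow_set(): a write initializes exactly the bytes written */
static void mark_write(unsigned int off, unsigned int size)
{
	memset(shadow + off, SHADOW_INITIALIZED, size);
}

/* like kmemcheck_shadow_test() without PARTIAL_OK: every byte must be initialized */
static enum shadow_state check_read(unsigned int off, unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; ++i)
		if (shadow[off + i] != SHADOW_INITIALIZED)
			return shadow[off + i];
	return SHADOW_INITIALIZED;
}

int main(void)
{
	memset(shadow, SHADOW_UNINITIALIZED, sizeof(shadow));
	mark_write(0, 4);			/* e.g. a 4-byte store */
	printf("%d\n", check_read(0, 4));	/* 2: fully initialized */
	printf("%d\n", check_read(2, 4));	/* 1: trips on uninitialized byte 4 */
	return 0;
}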
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
2#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
3
4enum kmemcheck_shadow {
5 KMEMCHECK_SHADOW_UNALLOCATED,
6 KMEMCHECK_SHADOW_UNINITIALIZED,
7 KMEMCHECK_SHADOW_INITIALIZED,
8 KMEMCHECK_SHADOW_FREED,
9};
10
11void *kmemcheck_shadow_lookup(unsigned long address);
12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size);
15
16#endif
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c0bedcd10f97..18d244f70205 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 *p; 43 u64 *p, *start, *end;
44 void *start, *end;
45 u64 start_bad, last_bad; 44 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 45 u64 start_phys_aligned;
47 size_t incr; 46 const size_t incr = sizeof(pattern);
48 47
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
51 start = __va(start_phys_aligned); 49 start = __va(start_phys_aligned);
52 end = start + size - (start_phys_aligned - start_phys); 50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
53 start_bad = 0; 51 start_bad = 0;
54 last_bad = 0; 52 last_bad = 0;
55 53
56 for (p = start; p < end; p++) 54 for (p = start; p < end; p++)
57 *p = pattern; 55 *p = pattern;
56
58 for (p = start; p < end; p++, start_phys_aligned += incr) { 57 for (p = start; p < end; p++, start_phys_aligned += incr) {
59 if (*p == pattern) 58 if (*p == pattern)
60 continue; 59 continue;
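The rewritten memtest() computes end in u64 units: pointer arithmetic on a u64 * advances in sizeof(u64) steps, so the remaining byte count has to be divided by incr before being added (the old code sidestepped this by declaring start/end as void *). A stand-alone illustration of the unit difference, not kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t region[4];				/* stand-in for the area under test */
	size_t bytes = sizeof(region);			/* 32 */

	uint64_t *start = region;
	/* u64 * arithmetic counts elements, so divide the byte count first */
	uint64_t *end = start + bytes / sizeof(*start);	/* one past the last u64 */

	printf("elements covered: %zu\n", (size_t)(end - start));		    /* 4 */
	printf("bytes covered:    %zu\n", (size_t)(end - start) * sizeof(*start)); /* 32 */
	/* adding 'bytes' directly would mean 32 *elements*, i.e. 256 bytes */
	return 0;
}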
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..3cfe9ced8a4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
470 470
471 if (!debug_pagealloc) 471 if (!debug_pagealloc)
472 spin_unlock(&cpa_lock); 472 spin_unlock(&cpa_lock);
473 base = alloc_pages(GFP_KERNEL, 0); 473 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
474 if (!debug_pagealloc) 474 if (!debug_pagealloc)
475 spin_lock(&cpa_lock); 475 spin_lock(&cpa_lock);
476 if (!base) 476 if (!base)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..8e43bdd45456 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h> 5#include <asm/fixmap.h>
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8
7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
8{ 10{
9 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 11 return (pte_t *)__get_free_page(PGALLOC_GFP);
10} 12}
11 13
12pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 14pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14 struct page *pte; 16 struct page *pte;
15 17
16#ifdef CONFIG_HIGHPTE 18#ifdef CONFIG_HIGHPTE
17 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
18#else 20#else
19 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 21 pte = alloc_pages(PGALLOC_GFP, 0);
20#endif 22#endif
21 if (pte) 23 if (pte)
22 pgtable_page_ctor(pte); 24 pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
161 bool failed = false; 163 bool failed = false;
162 164
163 for(i = 0; i < PREALLOCATED_PMDS; i++) { 165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); 166 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
165 if (pmd == NULL) 167 if (pmd == NULL)
166 failed = true; 168 failed = true;
167 pmds[i] = pmd; 169 pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
228 pmd_t *pmds[PREALLOCATED_PMDS]; 230 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags; 231 unsigned long flags;
230 232
231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 233 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
232 234
233 if (pgd == NULL) 235 if (pgd == NULL)
234 goto out; 236 goto out;
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 58b32db33125..de2abbd07544 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -3,5 +3,5 @@
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu_$(BITS).o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c
index 5343540f2607..d277ef1eea51 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Suspend and hibernation support for x86-64 2 * Suspend support specific for i386/x86-64.
3 * 3 *
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
@@ -8,18 +8,28 @@
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
11#include <linux/smp.h>
12#include <linux/suspend.h> 11#include <linux/suspend.h>
13#include <asm/proto.h> 12#include <linux/smp.h>
14#include <asm/page.h> 13
15#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/proto.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/page.h>
18#include <asm/mce.h>
17#include <asm/xcr.h> 19#include <asm/xcr.h>
18#include <asm/suspend.h> 20#include <asm/suspend.h>
19 21
20static void fix_processor_context(void); 22#ifdef CONFIG_X86_32
23static struct saved_context saved_context;
21 24
25unsigned long saved_context_ebx;
26unsigned long saved_context_esp, saved_context_ebp;
27unsigned long saved_context_esi, saved_context_edi;
28unsigned long saved_context_eflags;
29#else
30/* CONFIG_X86_64 */
22struct saved_context saved_context; 31struct saved_context saved_context;
32#endif
23 33
24/** 34/**
25 * __save_processor_state - save CPU registers before creating a 35 * __save_processor_state - save CPU registers before creating a
@@ -38,19 +48,35 @@ struct saved_context saved_context;
38 */ 48 */
39static void __save_processor_state(struct saved_context *ctxt) 49static void __save_processor_state(struct saved_context *ctxt)
40{ 50{
51#ifdef CONFIG_X86_32
52 mtrr_save_fixed_ranges(NULL);
53#endif
41 kernel_fpu_begin(); 54 kernel_fpu_begin();
42 55
43 /* 56 /*
44 * descriptor tables 57 * descriptor tables
45 */ 58 */
59#ifdef CONFIG_X86_32
60 store_gdt(&ctxt->gdt);
61 store_idt(&ctxt->idt);
62#else
63/* CONFIG_X86_64 */
46 store_gdt((struct desc_ptr *)&ctxt->gdt_limit); 64 store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
47 store_idt((struct desc_ptr *)&ctxt->idt_limit); 65 store_idt((struct desc_ptr *)&ctxt->idt_limit);
66#endif
48 store_tr(ctxt->tr); 67 store_tr(ctxt->tr);
49 68
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ 69 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
51 /* 70 /*
52 * segment registers 71 * segment registers
53 */ 72 */
73#ifdef CONFIG_X86_32
74 savesegment(es, ctxt->es);
75 savesegment(fs, ctxt->fs);
76 savesegment(gs, ctxt->gs);
77 savesegment(ss, ctxt->ss);
78#else
79/* CONFIG_X86_64 */
54 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); 80 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
55 asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); 81 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
56 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); 82 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
@@ -62,30 +88,87 @@ static void __save_processor_state(struct saved_context *ctxt)
62 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 88 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
63 mtrr_save_fixed_ranges(NULL); 89 mtrr_save_fixed_ranges(NULL);
64 90
91 rdmsrl(MSR_EFER, ctxt->efer);
92#endif
93
65 /* 94 /*
66 * control registers 95 * control registers
67 */ 96 */
68 rdmsrl(MSR_EFER, ctxt->efer);
69 ctxt->cr0 = read_cr0(); 97 ctxt->cr0 = read_cr0();
70 ctxt->cr2 = read_cr2(); 98 ctxt->cr2 = read_cr2();
71 ctxt->cr3 = read_cr3(); 99 ctxt->cr3 = read_cr3();
100#ifdef CONFIG_X86_32
101 ctxt->cr4 = read_cr4_safe();
102#else
103/* CONFIG_X86_64 */
72 ctxt->cr4 = read_cr4(); 104 ctxt->cr4 = read_cr4();
73 ctxt->cr8 = read_cr8(); 105 ctxt->cr8 = read_cr8();
106#endif
74} 107}
75 108
109/* Needed by apm.c */
76void save_processor_state(void) 110void save_processor_state(void)
77{ 111{
78 __save_processor_state(&saved_context); 112 __save_processor_state(&saved_context);
79} 113}
114#ifdef CONFIG_X86_32
115EXPORT_SYMBOL(save_processor_state);
116#endif
80 117
81static void do_fpu_end(void) 118static void do_fpu_end(void)
82{ 119{
83 /* 120 /*
84 * Restore FPU regs if necessary 121 * Restore FPU regs if necessary.
85 */ 122 */
86 kernel_fpu_end(); 123 kernel_fpu_end();
87} 124}
88 125
126static void fix_processor_context(void)
127{
128 int cpu = smp_processor_id();
129 struct tss_struct *t = &per_cpu(init_tss, cpu);
130
131 set_tss_desc(cpu, t); /*
132 * This just modifies memory; should not be
133 * necessary. But... This is necessary, because
134 * 386 hardware has concept of busy TSS or some
135 * similar stupidity.
136 */
137
138#ifdef CONFIG_X86_64
139 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
140
141 syscall_init(); /* This sets MSR_*STAR and related */
142#endif
143 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170}
171
89/** 172/**
90 * __restore_processor_state - restore the contents of CPU registers saved 173 * __restore_processor_state - restore the contents of CPU registers saved
91 * by __save_processor_state() 174 * by __save_processor_state()
@@ -96,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt)
96 /* 179 /*
97 * control registers 180 * control registers
98 */ 181 */
182 /* cr4 was introduced in the Pentium CPU */
183#ifdef CONFIG_X86_32
184 if (ctxt->cr4)
185 write_cr4(ctxt->cr4);
186#else
187/* CONFIG X86_64 */
99 wrmsrl(MSR_EFER, ctxt->efer); 188 wrmsrl(MSR_EFER, ctxt->efer);
100 write_cr8(ctxt->cr8); 189 write_cr8(ctxt->cr8);
101 write_cr4(ctxt->cr4); 190 write_cr4(ctxt->cr4);
191#endif
102 write_cr3(ctxt->cr3); 192 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 193 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 194 write_cr0(ctxt->cr0);
@@ -107,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt)
107 * now restore the descriptor tables to their proper values 197 * now restore the descriptor tables to their proper values
 108 * ltr is done in fix_processor_context(). 198 * ltr is done in fix_processor_context().
109 */ 199 */
200#ifdef CONFIG_X86_32
201 load_gdt(&ctxt->gdt);
202 load_idt(&ctxt->idt);
203#else
204/* CONFIG_X86_64 */
110 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); 205 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
111 load_idt((const struct desc_ptr *)&ctxt->idt_limit); 206 load_idt((const struct desc_ptr *)&ctxt->idt_limit);
112 207#endif
113 208
114 /* 209 /*
115 * segment registers 210 * segment registers
116 */ 211 */
212#ifdef CONFIG_X86_32
213 loadsegment(es, ctxt->es);
214 loadsegment(fs, ctxt->fs);
215 loadsegment(gs, ctxt->gs);
216 loadsegment(ss, ctxt->ss);
217
218 /*
219 * sysenter MSRs
220 */
221 if (boot_cpu_has(X86_FEATURE_SEP))
222 enable_sep_cpu();
223#else
224/* CONFIG_X86_64 */
117 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); 225 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
118 asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); 226 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
119 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); 227 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
@@ -123,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
123 wrmsrl(MSR_FS_BASE, ctxt->fs_base); 231 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
124 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 232 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
125 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 233 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
234#endif
126 235
127 /* 236 /*
128 * restore XCR0 for xsave capable cpu's. 237 * restore XCR0 for xsave capable cpu's.
@@ -134,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt)
134 243
135 do_fpu_end(); 244 do_fpu_end();
136 mtrr_ap_init(); 245 mtrr_ap_init();
246
247#ifdef CONFIG_X86_32
248 mcheck_init(&boot_cpu_data);
249#endif
137} 250}
138 251
252/* Needed by apm.c */
139void restore_processor_state(void) 253void restore_processor_state(void)
140{ 254{
141 __restore_processor_state(&saved_context); 255 __restore_processor_state(&saved_context);
142} 256}
143 257#ifdef CONFIG_X86_32
144static void fix_processor_context(void) 258EXPORT_SYMBOL(restore_processor_state);
145{ 259#endif
146 int cpu = smp_processor_id();
147 struct tss_struct *t = &per_cpu(init_tss, cpu);
148
149 /*
150 * This just modifies memory; should not be necessary. But... This
151 * is necessary, because 386 hardware has concept of busy TSS or some
152 * similar stupidity.
153 */
154 set_tss_desc(cpu, t);
155
156 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
157
158 syscall_init(); /* This sets MSR_*STAR and related */
159 load_TR_desc(); /* This does ltr */
160 load_LDT(&current->active_mm->context); /* This does lldt */
161
162 /*
163 * Now maybe reload the debug registers
164 */
165 if (current->thread.debugreg7){
166 loaddebug(&current->thread, 0);
167 loaddebug(&current->thread, 1);
168 loaddebug(&current->thread, 2);
169 loaddebug(&current->thread, 3);
170 /* no 4 and 5 */
171 loaddebug(&current->thread, 6);
172 loaddebug(&current->thread, 7);
173 }
174}
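With cpu_64.c and cpu_32.c merged into one cpu.c, the externally visible contract is unchanged: callers (the hibernation code, and apm.c on 32-bit, hence the EXPORT_SYMBOLs) call save_processor_state() before the platform sleeps and restore_processor_state() afterwards, with restore finishing in fix_processor_context() to reload the TSS, LDT and debug registers. The following is a user-space mock of that call order only; the two functions here are print-only placeholders, not the kernel implementations above.

#include <stdio.h>

/* placeholders for the real kernel entry points unified above */
static void save_processor_state(void)
{
	puts("save: descriptor tables, segments, control registers, MSRs");
}

static void restore_processor_state(void)
{
	puts("restore: control registers, descriptor tables, segments, MSRs");
	puts("restore: fix_processor_context() reloads TSS, LDT, debug registers");
}

int main(void)
{
	save_processor_state();
	puts("-- platform suspend/resume happens here --");
	restore_processor_state();
	return 0;
}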
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
deleted file mode 100644
index ce702c5b3a2c..000000000000
--- a/arch/x86/power/cpu_32.c
+++ /dev/null
@@ -1,148 +0,0 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/module.h>
11#include <linux/suspend.h>
12#include <asm/mtrr.h>
13#include <asm/mce.h>
14#include <asm/xcr.h>
15#include <asm/suspend.h>
16
17static struct saved_context saved_context;
18
19unsigned long saved_context_ebx;
20unsigned long saved_context_esp, saved_context_ebp;
21unsigned long saved_context_esi, saved_context_edi;
22unsigned long saved_context_eflags;
23
24static void __save_processor_state(struct saved_context *ctxt)
25{
26 mtrr_save_fixed_ranges(NULL);
27 kernel_fpu_begin();
28
29 /*
30 * descriptor tables
31 */
32 store_gdt(&ctxt->gdt);
33 store_idt(&ctxt->idt);
34 store_tr(ctxt->tr);
35
36 /*
37 * segment registers
38 */
39 savesegment(es, ctxt->es);
40 savesegment(fs, ctxt->fs);
41 savesegment(gs, ctxt->gs);
42 savesegment(ss, ctxt->ss);
43
44 /*
45 * control registers
46 */
47 ctxt->cr0 = read_cr0();
48 ctxt->cr2 = read_cr2();
49 ctxt->cr3 = read_cr3();
50 ctxt->cr4 = read_cr4_safe();
51}
52
53/* Needed by apm.c */
54void save_processor_state(void)
55{
56 __save_processor_state(&saved_context);
57}
58EXPORT_SYMBOL(save_processor_state);
59
60static void do_fpu_end(void)
61{
62 /*
63 * Restore FPU regs if necessary.
64 */
65 kernel_fpu_end();
66}
67
68static void fix_processor_context(void)
69{
70 int cpu = smp_processor_id();
71 struct tss_struct *t = &per_cpu(init_tss, cpu);
72
73 set_tss_desc(cpu, t); /*
74 * This just modifies memory; should not be
75 * necessary. But... This is necessary, because
76 * 386 hardware has concept of busy TSS or some
77 * similar stupidity.
78 */
79
80 load_TR_desc(); /* This does ltr */
81 load_LDT(&current->active_mm->context); /* This does lldt */
82
83 /*
84 * Now maybe reload the debug registers
85 */
86 if (current->thread.debugreg7) {
87 set_debugreg(current->thread.debugreg0, 0);
88 set_debugreg(current->thread.debugreg1, 1);
89 set_debugreg(current->thread.debugreg2, 2);
90 set_debugreg(current->thread.debugreg3, 3);
91 /* no 4 and 5 */
92 set_debugreg(current->thread.debugreg6, 6);
93 set_debugreg(current->thread.debugreg7, 7);
94 }
95
96}
97
98static void __restore_processor_state(struct saved_context *ctxt)
99{
100 /*
101 * control registers
102 */
103 /* cr4 was introduced in the Pentium CPU */
104 if (ctxt->cr4)
105 write_cr4(ctxt->cr4);
106 write_cr3(ctxt->cr3);
107 write_cr2(ctxt->cr2);
108 write_cr0(ctxt->cr0);
109
110 /*
111 * now restore the descriptor tables to their proper values
112 * ltr is done i fix_processor_context().
113 */
114 load_gdt(&ctxt->gdt);
115 load_idt(&ctxt->idt);
116
117 /*
118 * segment registers
119 */
120 loadsegment(es, ctxt->es);
121 loadsegment(fs, ctxt->fs);
122 loadsegment(gs, ctxt->gs);
123 loadsegment(ss, ctxt->ss);
124
125 /*
126 * sysenter MSRs
127 */
128 if (boot_cpu_has(X86_FEATURE_SEP))
129 enable_sep_cpu();
130
131 /*
132 * restore XCR0 for xsave capable cpu's.
133 */
134 if (cpu_has_xsave)
135 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
136
137 fix_processor_context();
138 do_fpu_end();
139 mtrr_ap_init();
140 mcheck_init(&boot_cpu_data);
141}
142
143/* Needed by apm.c */
144void restore_processor_state(void)
145{
146 __restore_processor_state(&saved_context);
147}
148EXPORT_SYMBOL(restore_processor_state);