author    Ingo Molnar <mingo@elte.hu>  2009-06-17 07:06:17 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-06-17 07:06:17 -0400
commit    a3d06cc6aa3e765dc2bf98626f87272dcf641dca (patch)
tree      aa3e49b58f08d6c0ea55cdca4fb5e6c8ba6ae333 /arch/x86
parent    0990b1c65729012a63e0eeca93aaaafea4e9a064 (diff)
parent    65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)

Merge branch 'linus' into perfcounters/core

Conflicts:
        arch/x86/include/asm/kmap_types.h
        include/linux/mm.h
        include/asm-generic/kmap_types.h

Merge reason: We crossed changes with kmap_types.h cleanups in mainline.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')

-rw-r--r--  arch/x86/Kconfig                           |   46
-rw-r--r--  arch/x86/Makefile                          |    5
-rw-r--r--  arch/x86/crypto/Makefile                   |    2
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c         |  267
-rw-r--r--  arch/x86/crypto/fpu.c                      |  166
-rw-r--r--  arch/x86/include/asm/dma-mapping.h         |    7
-rw-r--r--  arch/x86/include/asm/entry_arch.h          |   11
-rw-r--r--  arch/x86/include/asm/hardirq.h             |    2
-rw-r--r--  arch/x86/include/asm/hw_irq.h              |    2
-rw-r--r--  arch/x86/include/asm/irq_vectors.h         |   17
-rw-r--r--  arch/x86/include/asm/kmap_types.h          |   26
-rw-r--r--  arch/x86/include/asm/kmemcheck.h           |   42
-rw-r--r--  arch/x86/include/asm/mce.h                 |   88
-rw-r--r--  arch/x86/include/asm/msr-index.h           |    7
-rw-r--r--  arch/x86/include/asm/pgtable.h             |    5
-rw-r--r--  arch/x86/include/asm/pgtable_types.h       |    9
-rw-r--r--  arch/x86/include/asm/string_32.h           |    8
-rw-r--r--  arch/x86/include/asm/string_64.h           |    8
-rw-r--r--  arch/x86/include/asm/thread_info.h         |    4
-rw-r--r--  arch/x86/include/asm/timex.h               |    4
-rw-r--r--  arch/x86/include/asm/xor.h                 |    5
-rw-r--r--  arch/x86/kernel/apic/apic.c                |    4
-rw-r--r--  arch/x86/kernel/apic/nmi.c                 |    2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c         |    2
-rw-r--r--  arch/x86/kernel/cpu/common.c               |   11
-rw-r--r--  arch/x86/kernel/cpu/intel.c                |   23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile        |   10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c            |   42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c    |  127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h  |   15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  |  218
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 1964
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h           |   26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c        |   76
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c        | 1188
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c    |  203
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c     |   74
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c  |   65
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c     |   57
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c            |   86
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c            |   48
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c            |   26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c   |   73
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c     |    2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c       |   17
-rw-r--r--  arch/x86/kernel/cpuid.c                    |    6
-rw-r--r--  arch/x86/kernel/entry_64.S                 |   11
-rw-r--r--  arch/x86/kernel/i8253.c                    |    1
-rw-r--r--  arch/x86/kernel/init_task.c                |    1
-rw-r--r--  arch/x86/kernel/irq.c                      |   19
-rw-r--r--  arch/x86/kernel/irqinit.c                  |    3
-rw-r--r--  arch/x86/kernel/microcode_core.c           |    1
-rw-r--r--  arch/x86/kernel/msr.c                      |    6
-rw-r--r--  arch/x86/kernel/process.c                  |    2
-rw-r--r--  arch/x86/kernel/signal.c                   |    6
-rw-r--r--  arch/x86/kernel/smp.c                      |   28
-rw-r--r--  arch/x86/kernel/stacktrace.c               |    7
-rw-r--r--  arch/x86/kernel/traps.c                    |   11
-rw-r--r--  arch/x86/kernel/tsc.c                      |    1
-rw-r--r--  arch/x86/kvm/vmx.c                         |    2
-rw-r--r--  arch/x86/mm/Makefile                       |    2
-rw-r--r--  arch/x86/mm/fault.c                        |   18
-rw-r--r--  arch/x86/mm/init.c                         |    2
-rw-r--r--  arch/x86/mm/init_32.c                      |    2
-rw-r--r--  arch/x86/mm/init_64.c                      |    4
-rw-r--r--  arch/x86/mm/kmemcheck/Makefile             |    1
-rw-r--r--  arch/x86/mm/kmemcheck/error.c              |  228
-rw-r--r--  arch/x86/mm/kmemcheck/error.h              |   15
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c          |  640
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c             |  106
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.h             |    9
-rw-r--r--  arch/x86/mm/kmemcheck/pte.c                |   22
-rw-r--r--  arch/x86/mm/kmemcheck/pte.h                |   10
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c           |   69
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.h           |    6
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c             |  162
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h             |   16
-rw-r--r--  arch/x86/mm/pageattr.c                     |    2
-rw-r--r--  arch/x86/mm/pgtable.c                      |   12

79 files changed, 4805 insertions, 1716 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68f5578fe38e..cf42fc305419 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
 	string
@@ -789,10 +790,26 @@ config X86_MCE
 	  to disable it. MCE support simply ignores non-MCE processors like
 	  the 386 and 486, so nearly everyone can say Y here.
 
+config X86_OLD_MCE
+	depends on X86_32 && X86_MCE
+	bool "Use legacy machine check code (will go away)"
+	default n
+	select X86_ANCIENT_MCE
+	---help---
+	  Use the old i386 machine check code. This is merely intended for
+	  testing in a transition period. Try this if you run into any machine
+	  check related software problems, but report the problem to
+	  linux-kernel. When in doubt say no.
+
+config X86_NEW_MCE
+	depends on X86_MCE
+	bool
+	default y if (!X86_OLD_MCE && X86_32) || X86_64
+
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	  Additional support for intel specific MCE features such as
 	  the thermal monitor.
@@ -800,19 +817,36 @@ config X86_MCE_INTEL
 config X86_MCE_AMD
 	def_bool y
 	prompt "AMD MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	  Additional support for AMD specific MCE features such as
 	  the DRAM Error Threshold.
 
+config X86_ANCIENT_MCE
+	def_bool n
+	depends on X86_32
+	prompt "Support for old Pentium 5 / WinChip machine checks"
+	---help---
+	  Include support for machine check handling on old Pentium 5 or
+	  WinChip systems. These typically need to be enabled explicitly on
+	  the command line.
+
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL
 	bool
 	default y
 
+config X86_MCE_INJECT
+	depends on X86_NEW_MCE
+	tristate "Machine check injector support"
+	---help---
+	  Provide support for injecting machine checks for testing purposes.
+	  If you don't know what a machine check is and you don't do kernel
+	  QA it is safe to say n.
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
-	depends on X86_32 && X86_MCE
+	depends on X86_OLD_MCE
 	---help---
 	  Enabling this feature starts a timer that triggers every 5 seconds which
 	  will look at the machine check registers to see if anything happened.
@@ -825,11 +859,15 @@ config X86_MCE_NONFATAL
 
 config X86_MCE_P4THERMAL
 	bool "check for P4 thermal throttling interrupt."
-	depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
+	depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
 	---help---
 	  Enabling this feature will cause a message to be printed when the P4
 	  enters thermal throttling.
 
+config X86_THERMAL_VECTOR
+	def_bool y
+	depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+
 config VM86
 	bool "Enable VM86 support" if EMBEDDED
 	default y
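The Kconfig split above is the pivot for the rest of the series: X86_NEW_MCE selects the unified 32/64-bit machine-check core, X86_OLD_MCE keeps the legacy 32-bit handlers alive during the transition. A minimal sketch (hypothetical helper, not part of the patch) of how C code keys off the resulting CONFIG_ symbols, the same way the nmi.c hunk further down does:

        #include <linux/kernel.h>

        /* Illustrative only: Kconfig defines CONFIG_X86_NEW_MCE whenever
         * the unified machine-check core is built in. */
        static inline int have_unified_mce(void)
        {
        #ifdef CONFIG_X86_NEW_MCE
                return 1;
        #else
                return 0;
        #endif
        }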
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
         endif
 endif
 
+# Don't unroll struct assignments with kmemcheck enabled
+ifeq ($(CONFIG_KMEMCHECK),y)
+	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
+endif
+
 # Stackpointer is addressed differently for 32 bit and 64 bit x86
 sp-$(CONFIG_X86_32) := esp
 sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b4..cfb0010fa940 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+obj-$(CONFIG_CRYPTO_FPU) += fpu.o
+
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af65497..4e663398f77f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
 #include <asm/i387.h>
 #include <asm/aes.h>
 
+#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
+#define HAS_CTR
+#endif
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
+#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
+#define HAS_PCBC
+#endif
+
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
 	}
 };
 
+static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_enc(ctx, dst, src);
+}
+
+static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_dec(ctx, dst, src);
+}
+
+static struct crypto_alg __aesni_alg = {
+	.cra_name		= "__aes-aesni",
+	.cra_driver_name	= "__driver-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__aesni_alg.cra_list),
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= __aes_encrypt,
+			.cia_decrypt		= __aes_decrypt
+		}
+	}
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
 	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
 
-	return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
 }
 
 static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
+#ifdef HAS_CTR
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+			.geniv		= "chainiv",
+		},
+	},
+};
+#endif
+
+#ifdef HAS_LRW
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+	.cra_name		= "lrw(aes)",
+	.cra_driver_name	= "lrw-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_init		= ablk_lrw_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_PCBC
+static int ablk_pcbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_pcbc_alg = {
+	.cra_name		= "pcbc(aes)",
+	.cra_driver_name	= "pcbc-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
+	.cra_init		= ablk_pcbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_XTS
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_init		= ablk_xts_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+			.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -421,6 +637,8 @@ static int __init aesni_init(void)
 	}
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
+	if ((err = crypto_register_alg(&__aesni_alg)))
+		goto __aes_err;
 	if ((err = crypto_register_alg(&blk_ecb_alg)))
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +647,41 @@ static int __init aesni_init(void)
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_ctr_alg)))
+		goto ablk_ctr_err;
+#endif
+#ifdef HAS_LRW
+	if ((err = crypto_register_alg(&ablk_lrw_alg)))
+		goto ablk_lrw_err;
+#endif
+#ifdef HAS_PCBC
+	if ((err = crypto_register_alg(&ablk_pcbc_alg)))
+		goto ablk_pcbc_err;
+#endif
+#ifdef HAS_XTS
+	if ((err = crypto_register_alg(&ablk_xts_alg)))
+		goto ablk_xts_err;
+#endif
 
 	return err;
 
+#ifdef HAS_XTS
+ablk_xts_err:
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+ablk_pcbc_err:
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+ablk_lrw_err:
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+ablk_ctr_err:
+#endif
+	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
@@ -439,6 +689,8 @@ ablk_ecb_err:
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
 blk_ecb_err:
+	crypto_unregister_alg(&__aesni_alg);
+__aes_err:
 	crypto_unregister_alg(&aesni_alg);
 aes_err:
 	return err;
@@ -446,10 +698,23 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_alg(&__aesni_alg);
 	crypto_unregister_alg(&aesni_alg);
 }
 
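Reading the template strings inside out clarifies the layering here: "__driver-aes-aesni" is the raw synchronous AES-NI cipher registered above at priority 0 (so it is never selected directly), the mode template wraps it, "fpu" brackets the whole scatterlist walk in a single FPU save/restore, and cryptd runs the result in process context where the FPU is usable. A sketch of the instantiation, assuming CONFIG_CRYPTO_CTR and CONFIG_CRYPTO_FPU are enabled:

        /* fpu(ctr(__driver-aes-aesni)), parsed inside-out:
         *   __driver-aes-aesni - synchronous AES-NI block cipher
         *   ctr(...)           - generic counter-mode template
         *   fpu(...)           - one kernel_fpu_begin()/end() per request
         * cryptd then queues requests to a worker with FPU access. */
        struct cryptd_ablkcipher *tfm =
                cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);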
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000000..5f9781a3815f
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
+/*
+ * FPU: Wrapper for blkcipher touching fpu
+ *
+ * Copyright (c) Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/i387.h>
+
+struct crypto_fpu_ctx {
+	struct crypto_blkcipher *child;
+};
+
+static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
+			     unsigned int keylen)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
+	struct crypto_blkcipher *child = ctx->child;
+	int err;
+
+	crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
+				   CRYPTO_TFM_REQ_MASK);
+	err = crypto_blkcipher_setkey(child, key, keylen);
+	crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
+			     CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_blkcipher *cipher;
+
+	cipher = crypto_spawn_blkcipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+	return 0;
+}
+
+static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_blkcipher(ctx->child);
+}
+
+static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
+{
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
+	if (err)
+		return ERR_PTR(err);
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return ERR_CAST(alg);
+
+	inst = crypto_alloc_instance("fpu", alg);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	inst->alg.cra_flags = alg->cra_flags;
+	inst->alg.cra_priority = alg->cra_priority;
+	inst->alg.cra_blocksize = alg->cra_blocksize;
+	inst->alg.cra_alignmask = alg->cra_alignmask;
+	inst->alg.cra_type = alg->cra_type;
+	inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
+	inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
+	inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
+	inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+	inst->alg.cra_init = crypto_fpu_init_tfm;
+	inst->alg.cra_exit = crypto_fpu_exit_tfm;
+	inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
+	inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
+	inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return inst;
+}
+
+static void crypto_fpu_free(struct crypto_instance *inst)
+{
+	crypto_drop_spawn(crypto_instance_ctx(inst));
+	kfree(inst);
+}
+
+static struct crypto_template crypto_fpu_tmpl = {
+	.name = "fpu",
+	.alloc = crypto_fpu_alloc,
+	.free = crypto_fpu_free,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_fpu_module_init(void)
+{
+	return crypto_register_template(&crypto_fpu_tmpl);
+}
+
+static void __exit crypto_fpu_module_exit(void)
+{
+	crypto_unregister_template(&crypto_fpu_tmpl);
+}
+
+module_init(crypto_fpu_module_init);
+module_exit(crypto_fpu_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FPU block cipher wrapper");
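What the template buys is clearest from the raw pattern it automates: SSE state is not saved on kernel entry, so every stretch of AES-NI/SSE code must bracket itself, and doing that once per request instead of once per block is the whole win. A minimal sketch of the underlying idiom:

        #include <asm/i387.h>

        static void simd_work(void)
        {
                kernel_fpu_begin();     /* save user FPU/SSE state, disable preemption */
                /* ... AES-NI / SSE instructions are safe to use here ... */
                kernel_fpu_end();       /* restore state, re-enable preemption */
        }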
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..b93405b228b4 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
  * Documentation/DMA-API.txt for documentation.
  */
 
+#include <linux/kmemcheck.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
@@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
+	kmemcheck_mark_initialized(ptr, size);
 	addr = ops->map_page(hwdev, virt_to_page(ptr),
 			     (unsigned long)ptr & ~PAGE_MASK, size,
 			     dir, NULL);
@@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
 {
 	struct dma_map_ops *ops = get_dma_ops(hwdev);
 	int ents;
+	struct scatterlist *s;
+	int i;
 
 	BUG_ON(!valid_dma_direction(dir));
+	for_each_sg(sg, s, nents, i)
+		kmemcheck_mark_initialized(sg_virt(s), s->length);
 	ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
 	debug_dma_map_sg(hwdev, sg, nents, ents, dir);
 
@@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
+	kmemcheck_mark_initialized(page_address(page) + offset, size);
 	addr = ops->map_page(dev, page, offset, size, dir, NULL);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
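The placement of kmemcheck_mark_initialized() is deliberate: once a buffer is mapped for DMA the device may write it behind the CPU's back, so its shadow state is meaningless and the map helpers declare the whole range initialized up front. A hypothetical driver fragment showing the case this protects ('dev' and error handling omitted; illustrative only):

        void *buf = kmalloc(512, GFP_KERNEL);   /* shadow: uninitialized */
        dma_addr_t handle;

        /* Without the mark inside dma_map_single(), a later CPU read of
         * device-written bytes would trip kmemcheck, because no CPU store
         * ever initialized them. */
        handle = dma_map_single(dev, buf, 512, DMA_FROM_DEVICE);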
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index d750a10ccad6..ff8cbfa07851 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
 		 smp_invalidate_interrupt)
@@ -52,8 +53,16 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_THERMAL_VECTOR
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
+BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_NEW_MCE
+BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
+#endif
+
 #endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 9ebc5c255032..82e3e8f01043 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,7 +22,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_X86_MCE
 	unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	unsigned int irq_threshold_count;
 # endif
 #endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6df45f639666..ba180d93b08c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -34,6 +34,7 @@ extern void perf_pending_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
+extern void mce_self_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
@@ -46,6 +47,7 @@ extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
 
 extern void irq_move_cleanup_interrupt(void);
+extern void reboot_interrupt(void);
 extern void threshold_interrupt(void);
 
 extern void call_function_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e997be98c9b9..5b21f0ec3df2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
  */
 
 #define NMI_VECTOR			0x02
+#define MCE_VECTOR			0x12
 
 /*
  * IDT vectors usable for external interrupt sources start
@@ -87,13 +88,8 @@
 #define CALL_FUNCTION_VECTOR		0xfc
 #define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR		0xfa
-
-#ifdef CONFIG_X86_32
-/* 0xf8 - 0xf9 : free */
-#else
-# define THRESHOLD_APIC_VECTOR		0xf9
-# define UV_BAU_MESSAGE			0xf8
-#endif
+#define THRESHOLD_APIC_VECTOR		0xf9
+#define REBOOT_VECTOR			0xf8
 
 /* f0-f7 used for spreading out TLB flushes: */
 #define INVALIDATE_TLB_VECTOR_END	0xf7
@@ -117,6 +113,13 @@
  */
 #define LOCAL_PENDING_VECTOR		0xec
 
+#define UV_BAU_MESSAGE			0xec
+
+/*
+ * Self IPI vector for machine checks
+ */
+#define MCE_SELF_VECTOR			0xeb
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index f86613846198..9e00a731a7fb 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,31 +2,11 @@
 #define _ASM_X86_KMAP_TYPES_H
 
 #if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-# define D(n) __KM_FENCE_##n ,
-#else
-# define D(n)
+#define __WITH_KM_FENCE
 #endif
 
-enum km_type {
-D(0)	KM_BOUNCE_READ,
-D(1)	KM_SKB_SUNRPC_DATA,
-D(2)	KM_SKB_DATA_SOFTIRQ,
-D(3)	KM_USER0,
-D(4)	KM_USER1,
-D(5)	KM_BIO_SRC_IRQ,
-D(6)	KM_BIO_DST_IRQ,
-D(7)	KM_PTE0,
-D(8)	KM_PTE1,
-D(9)	KM_IRQ0,
-D(10)	KM_IRQ1,
-D(11)	KM_IRQ_PTE,
-D(12)	KM_SOFTIRQ0,
-D(13)	KM_SOFTIRQ1,
-D(14)	KM_NMI,
-D(15)	KM_NMI_PTE,
-D(16)	KM_TYPE_NR
-};
+#include <asm-generic/kmap_types.h>
 
-#undef D
+#undef __WITH_KM_FENCE
 
 #endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
+#ifndef ASM_X86_KMEMCHECK_H
+#define ASM_X86_KMEMCHECK_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_KMEMCHECK
+bool kmemcheck_active(struct pt_regs *regs);
+
+void kmemcheck_show(struct pt_regs *regs);
+void kmemcheck_hide(struct pt_regs *regs);
+
+bool kmemcheck_fault(struct pt_regs *regs,
+	unsigned long address, unsigned long error_code);
+bool kmemcheck_trap(struct pt_regs *regs);
+#else
+static inline bool kmemcheck_active(struct pt_regs *regs)
+{
+	return false;
+}
+
+static inline void kmemcheck_show(struct pt_regs *regs)
+{
+}
+
+static inline void kmemcheck_hide(struct pt_regs *regs)
+{
+}
+
+static inline bool kmemcheck_fault(struct pt_regs *regs,
+		unsigned long address, unsigned long error_code)
+{
+	return false;
+}
+
+static inline bool kmemcheck_trap(struct pt_regs *regs)
+{
+	return false;
+}
+#endif /* CONFIG_KMEMCHECK */
+
+#endif
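The !CONFIG_KMEMCHECK stubs return false precisely so that callers need no ifdefs of their own; the compiler folds the dead branch away. The actual hook sites (arch/x86/mm/fault.c and friends appear in the diffstat but not in this section), so treat this caller shape as illustrative only:

        /* Hypothetical fault-path fragment: give kmemcheck first claim on
         * the fault; with the feature compiled out this costs nothing. */
        if (kmemcheck_fault(regs, address, error_code))
                return;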
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e7..540a466e50f5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_MCE_H
 #define _ASM_X86_MCE_H
 
-#ifdef __x86_64__
-
 #include <linux/types.h>
 #include <asm/ioctls.h>
 
@@ -10,21 +8,35 @@
  * Machine Check support for x86
  */
 
-#define MCG_CTL_P        (1UL<<8)    /* MCG_CAP register available */
-#define MCG_EXT_P        (1ULL<<9)   /* Extended registers available */
-#define MCG_CMCI_P       (1ULL<<10)  /* CMCI supported */
-
-#define MCG_STATUS_RIPV  (1UL<<0)    /* restart ip valid */
-#define MCG_STATUS_EIPV  (1UL<<1)    /* ip points to correct instruction */
-#define MCG_STATUS_MCIP  (1UL<<2)    /* machine check in progress */
-
-#define MCI_STATUS_VAL   (1UL<<63)   /* valid error */
-#define MCI_STATUS_OVER  (1UL<<62)   /* previous errors lost */
-#define MCI_STATUS_UC    (1UL<<61)   /* uncorrected error */
-#define MCI_STATUS_EN    (1UL<<60)   /* error enabled */
-#define MCI_STATUS_MISCV (1UL<<59)   /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1UL<<58)   /* addr reg. valid */
-#define MCI_STATUS_PCC   (1UL<<57)   /* processor context corrupt */
+#define MCG_BANKCNT_MASK  0xff       /* Number of Banks */
+#define MCG_CTL_P        (1ULL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P        (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P       (1ULL<<10)  /* CMCI supported */
+#define MCG_EXT_CNT_MASK  0xff0000   /* Number of Extended registers */
+#define MCG_EXT_CNT_SHIFT 16
+#define MCG_EXT_CNT(c)   (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
+#define MCG_SER_P        (1ULL<<24)  /* MCA recovery/new status bits */
+
+#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
+#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
+
+#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
+#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
+#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
+#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S     (1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR    (1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF  0	/* segment offset */
+#define MCM_ADDR_LINEAR  1	/* linear address */
+#define MCM_ADDR_PHYS    2	/* physical address */
+#define MCM_ADDR_MEM     3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 /* Fields are zero when not available */
 struct mce {
@@ -34,13 +46,19 @@ struct mce {
 	__u64 mcgstatus;
 	__u64 ip;
 	__u64 tsc;	/* cpu time stamp counter */
-	__u64 res1;	/* for future extension */
-	__u64 res2;	/* ditto. */
+	__u64 time;	/* wall time_t when error was detected */
+	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
+	__u8  pad1;
+	__u16 pad2;
+	__u32 cpuid;	/* CPUID 1 EAX */
 	__u8  cs;	/* code segment */
 	__u8  bank;	/* machine check bank */
-	__u8  cpu;	/* cpu that raised the error */
+	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
 	__u8  finished;	/* entry is valid */
-	__u32 pad;
+	__u32 extcpu;	/* linux cpu number that detected the error */
+	__u32 socketid;	/* CPU socket ID */
+	__u32 apicid;	/* CPU initial apic ID */
+	__u64 mcgcap;	/* MCGCAP MSR: machine check capabilities of CPU */
 };
 
 /*
@@ -57,7 +75,7 @@ struct mce_log {
 	unsigned len;	/* = MCE_LOG_LEN */
 	unsigned next;
 	unsigned flags;
-	unsigned pad0;
+	unsigned recordlen;	/* length of struct mce */
 	struct mce entry[MCE_LOG_LEN];
 };
 
@@ -82,19 +100,16 @@ struct mce_log {
 #define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
 #define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
-#endif /* __x86_64__ */
-
 #ifdef __KERNEL__
 
-#ifdef CONFIG_X86_32
 extern int mce_disabled;
-#else /* CONFIG_X86_32 */
 
 #include <asm/atomic.h>
+#include <linux/percpu.h>
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, device_mce);
+DECLARE_PER_CPU(struct sys_device, mce_dev);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 /*
@@ -104,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 #ifdef CONFIG_X86_MCE_INTEL
+extern int mce_cmci_disabled;
+extern int mce_ignore_ce;
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
 void cmci_clear(void);
 void cmci_reenable(void);
@@ -123,13 +140,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-extern int mce_available(struct cpuinfo_x86 *c);
+int mce_available(struct cpuinfo_x86 *c);
+
+DECLARE_PER_CPU(unsigned, mce_exception_count);
+DECLARE_PER_CPU(unsigned, mce_poll_count);
 
 void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
-extern void do_machine_check(struct pt_regs *, long);
+void do_machine_check(struct pt_regs *, long);
 
 typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
@@ -139,14 +159,16 @@ enum mcp_flags {
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
 	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
 };
-extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
-extern int mce_notify_user(void);
+int mce_notify_irq(void);
+void mce_notify_process(void);
 
-#endif /* !CONFIG_X86_32 */
+DECLARE_PER_CPU(struct mce, injectm);
+extern struct file_operations mce_chrdev_ops;
 
 #ifdef CONFIG_X86_MCE
-extern void mcheck_init(struct cpuinfo_x86 *c);
+void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
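Since the MCG_CAP layout macros are new in this header, a short worked example of decoding the capability MSR with them (hypothetical fragment; rdmsrl() is the standard 64-bit MSR accessor):

        u64 cap;
        unsigned banks, ext;

        rdmsrl(MSR_IA32_MCG_CAP, cap);

        banks = cap & MCG_BANKCNT_MASK; /* bits 7:0  - number of MC banks */
        ext   = MCG_EXT_CNT(cap);       /* bits 23:16 - extended registers */

        if (cap & MCG_SER_P)            /* bit 24 - recovery/new status bits */
                pr_info("mce: software error recovery supported\n");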
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4d58d04fca83..1692fb5050e3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -207,7 +207,14 @@
 
 #define MSR_IA32_THERM_CONTROL		0x0000019a
 #define MSR_IA32_THERM_INTERRUPT	0x0000019b
+
+#define THERM_INT_LOW_ENABLE		(1 << 0)
+#define THERM_INT_HIGH_ENABLE		(1 << 1)
+
 #define MSR_IA32_THERM_STATUS		0x0000019c
+
+#define THERM_STATUS_PROCHOT		(1 << 0)
+
 #define MSR_IA32_MISC_ENABLE		0x000001a0
 
 /* MISC_ENABLE bits: architectural */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18ef7ebf2631..3cc06e3fceb8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -317,6 +317,11 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
+static inline int pte_hidden(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_HIDDEN;
+}
+
 static inline int pmd_present(pmd_t pmd)
 {
 	return pmd_flags(pmd) & _PAGE_PRESENT;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
 #define _PAGE_BIT_UNUSED1	9	/* available for programmer */
 #define _PAGE_BIT_IOMAP	10	/* flag used to indicate IO mapping */
-#define _PAGE_BIT_UNUSED3	11
+#define _PAGE_BIT_HIDDEN	11	/* hidden by kmemcheck */
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_UNUSED1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #define _PAGE_UNUSED1	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
 #define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
-#define _PAGE_UNUSED3	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE	(_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
 #define __HAVE_ARCH_PTE_SPECIAL
 
+#ifdef CONFIG_KMEMCHECK
+#define _PAGE_HIDDEN	(_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_HIDDEN	(_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #else
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
  * No 3D Now!
  */
 
+#ifndef CONFIG_KMEMCHECK
 #define memcpy(t, f, n)				\
 	(__builtin_constant_p((n))		\
 	 ? __constant_memcpy((t), (f), (n))	\
 	 : __memcpy((t), (f), (n)))
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(t, f, n) __memcpy((t), (f), (n))
+#endif
 
 #endif
 
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+#ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
 extern void *memcpy(void *to, const void *from, size_t len);
 #else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 	__ret;					\
 })
 #endif
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
+#endif
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
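Both string.h hunks make the same trade: kmemcheck decodes faulting instructions to learn which bytes are touched, and gcc's unrolled constant-size copies shatter one logical copy into partial-word moves (including struct padding) that look like reads of uninitialized data. The REP-string helpers keep source, destination and length visible in one decodable operation. A hypothetical caller, identical source either way:

        struct sample {
                u32 a;
                u16 b;  /* followed by 2 bytes of tail padding */
        };

        static void copy_sample(struct sample *dst, const struct sample *src)
        {
                /* !CONFIG_KMEMCHECK: gcc may open-code unrolled word moves;
                 *  CONFIG_KMEMCHECK: always the REP-based helper. */
                memcpy(dst, src, sizeof(*dst));
        }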
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 602c769fc98c..b0783520988b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -154,9 +154,9 @@ struct thread_info {
 
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
 #else
-#define THREAD_FLAGS GFP_KERNEL
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
 #endif
 
 #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981f..1375cfc93960 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
 #include <asm/processor.h>
 #include <asm/tsc.h>
 
-/* The PIT ticks at this frequency (in HZ): */
-#define PIT_TICK_RATE 1193182
-
+/* Assume we use the PIT time source for the clock tick */
 #define CLOCK_TICK_RATE	PIT_TICK_RATE
 
 #define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
+#ifdef CONFIG_KMEMCHECK
+/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
+# include <asm-generic/xor.h>
+#else
 #ifdef CONFIG_X86_32
 # include "xor_32.h"
 #else
 # include "xor_64.h"
 #endif
+#endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 076d3881f3da..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -899,7 +899,7 @@ void clear_local_APIC(void)
 	}
 
 	/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -2017,7 +2017,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a691302dc3ff..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static inline int mce_in_progress(void)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_NEW_MCE)
 	return atomic_read(&mce_entry) > 0;
 #endif
 	return 0;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ef0ae207a7c8..096d19aea2f7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
 	uv_set_scir_bits(bits);
 
 	/* enable next timer period */
-	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+	mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
 }
 
 static void __cpuinit uv_heartbeat_enable(int cpu)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3ffdcfa9abdf..9fa33886c0d7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,7 +487,6 @@ out:
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
-	static int printed;
 	int i;
 
 	for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 		}
 	}
 
-	if (!printed) {
-		printed++;
-		printk(KERN_ERR
-		       "CPU: vendor_id '%s' unknown, using generic init.\n", v);
-
-		printk(KERN_ERR "CPU: Your system may be unstable.\n");
-	}
+	printk_once(KERN_ERR
+			"CPU: vendor_id '%s' unknown, using generic init.\n" \
+			"CPU: Your system may be unstable.\n", v);
 
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
 	this_cpu = &default_cpu;
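printk_once() encapsulates exactly the static-flag idiom deleted above. The real macro lives in include/linux/kernel.h; this expansion is a sketch of the pattern, not the verbatim definition:

        #define my_printk_once(fmt, ...)                \
        ({                                              \
                static bool __printed;                  \
                                                        \
                if (!__printed) {                       \
                        __printed = true;               \
                        printk(fmt, ##__VA_ARGS__);     \
                }                                       \
        })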
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index daed39ba2614..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	 */
 	if (c->x86 == 6 && c->x86_model < 15)
 		clear_cpu_cap(c, X86_FEATURE_PAT);
+
+#ifdef CONFIG_KMEMCHECK
+	/*
+	 * P4s have a "fast strings" feature which causes single-
+	 * stepping REP instructions to only generate a #DB on
+	 * cache-line boundaries.
+	 *
+	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
+	 * (model 2) with the same problem.
+	 */
+	if (c->x86 == 15) {
+		u64 misc_enable;
+
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+			printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
+
+			misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+		}
+	}
+#endif
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..45004faf67ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
-obj-y				= mce_$(BITS).o therm_throt.o
+obj-y				= mce.o therm_throt.o
 
-obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
+obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o
+obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
+obj-$(CONFIG_X86_MCE_P4THERMAL)	+= mce_intel.o
+obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD)	+= threshold.o
+obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..89e510424152 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
  * Athlon specific Machine Check Exception Reporting
  * (C) Copyright 2002 Dave Jones <davej@redhat.com>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -15,12 +14,12 @@
 
 #include "mce.h"
 
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
 static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 
 	for (i = 1; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-		if (high&(1<<31)) {
+		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
-			misc[0] = addr[0] = '\0';
+
+			misc[0] = '\0';
+			addr[0] = '\0';
+
 			if (high & (1<<29))
 				recover |= 1;
 			if (high & (1<<25))
 				recover |= 2;
 			high &= ~(1<<31);
+
 			if (high & (1<<27)) {
 				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 			}
+
 			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 				smp_processor_id(), i, high, low, misc, addr);
-			/* Clear it */
+
+			/* Clear it: */
 			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
 	}
 
-	if (recover&2)
+	if (recover & 2)
 		panic("CPU context corrupt");
-	if (recover&1)
+	if (recover & 1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	mcgstl &= ~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
 void amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 		return;
 
 	machine_check_vector = k7_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");
+
 	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 96 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 97 nr_mce_banks = l & 0xff;
89 98
90 /* Clear status for MC index 0 separately, we don't touch CTL, 99 /*
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 100 * Clear status for MC index 0 separately, we don't touch CTL,
101 * as some K7 Athlons cause spurious MCEs when it's enabled:
102 */
92 if (boot_cpu_data.x86 == 6) { 103 if (boot_cpu_data.x86 == 6) {
93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); 104 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 105 i = 1;
95 } else 106 } else
96 i = 0; 107 i = 0;
108
97 for (; i < nr_mce_banks; i++) { 109 for (; i < nr_mce_banks; i++) {
98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 110 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 111 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
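
k7_machine_check() tests individual MCi_STATUS bits in the high dword of each bank register. A stand-alone sketch decoding the architectural 64-bit layout those tests correspond to (bit 63 here is bit 31 of the high dword read by the 32-bit code above; the bit names follow the MCA spec, not text from the patch):

#include <stdio.h>
#include <stdint.h>

static void decode_status(uint64_t s)
{
	printf("VAL=%d OVER=%d UC=%d EN=%d MISCV=%d ADDRV=%d PCC=%d\n",
	       (int)(s >> 63 & 1),	/* valid error logged */
	       (int)(s >> 62 & 1),	/* an earlier error was overwritten */
	       (int)(s >> 61 & 1),	/* uncorrected */
	       (int)(s >> 60 & 1),	/* signalling enabled in MCi_CTL */
	       (int)(s >> 59 & 1),	/* MCi_MISC holds extra information */
	       (int)(s >> 58 & 1),	/* MCi_ADDR holds an address */
	       (int)(s >> 57 & 1));	/* processor context corrupt */
}

int main(void)
{
	decode_status(0xb200000000000000ULL);	/* VAL|UC|EN|PCC sample */
	return 0;
}
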
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28 /* Make sure no one reads a partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
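
inject_mce() publishes the fake record with strict store ordering: clear finished, write the payload, and set finished last, with barriers in between, so a racing reader never sees a half-written entry. A hedged C11 sketch of the same protocol using atomics in place of the kernel's mb():

#include <stdatomic.h>
#include <string.h>

struct record {
	atomic_int finished;
	int payload[8];
};

static void publish(struct record *dst, const int *src)
{
	atomic_store_explicit(&dst->finished, 0, memory_order_release);
	atomic_thread_fence(memory_order_seq_cst);	/* like mb() */
	memcpy(dst->payload, src, sizeof(dst->payload));
	atomic_store_explicit(&dst->finished, 1, memory_order_release);
}

static int try_read(struct record *src, int *out)
{
	if (!atomic_load_explicit(&src->finished, memory_order_acquire))
		return 0;				/* not published yet */
	memcpy(out, src->payload, sizeof(src->payload));
	return 1;
}

int main(void)
{
	struct record r = { 0 };
	int in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, out[8];

	publish(&r, in);
	return try_read(&r, out) ? 0 : 1;
}
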
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations, test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations).
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66 /* Neither restart nor error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
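
mce_severity() scans the table until the first rule whose masked status matches; the unconditional catch-all entry guarantees termination. A self-contained sketch of this first-match-wins scheme with a hypothetical three-rule table (bit positions follow the architectural MCi_STATUS layout):

#include <stdio.h>
#include <stdint.h>

struct rule {
	uint64_t mask, result;
	int sev;
	const char *msg;
};

static const struct rule rules[] = {
	{ 1ULL << 63, 0,          0, "Invalid" },	  /* VAL clear */
	{ 1ULL << 57, 1ULL << 57, 3, "Context corrupt" }, /* PCC set */
	{ 0,          0,          1, "No match" },	  /* catch-all */
};

static const struct rule *grade(uint64_t status)
{
	const struct rule *r = rules;

	while ((status & r->mask) != r->result)
		r++;		/* catch-all guarantees termination */
	return r;
}

int main(void)
{
	printf("%s\n", grade(0x8200000000000000ULL)->msg);	/* VAL|PCC */
	return 0;
}
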
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..fabba15e4558
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47#include "mce.h"
48
49/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code)
51{
52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
53 smp_processor_id());
54}
55
56/* Call the installed machine check handler for this CPU setup. */
57void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check;
59
60int mce_disabled;
61
62#ifdef CONFIG_X86_NEW_MCE
63
64#define MISC_MCELOG_MINOR 227
65
66#define SPINUNIT 100 /* 100ns */
67
68atomic_t mce_entry;
69
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72/*
73 * Tolerant levels:
74 * 0: always panic on uncorrected errors, log corrected errors
75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */
79static int tolerant = 1;
80static int banks;
81static u64 *bank;
82static unsigned long notify_user;
83static int rip_msr;
84static int mce_bootlog = -1;
85static int monarch_timeout = -1;
86static int mce_panic_timeout;
87static int mce_dont_log_ce;
88int mce_cmci_disabled;
89int mce_ignore_ce;
90int mce_ser;
91
92static char trigger[128];
93static char *trigger_argv[2] = { trigger, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &notify_user);
184}
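
mce_log() reserves a slot without taking any lock: read the next index, validate it, then claim it with a compare-and-swap; on a race the loser simply retries with the updated index. A hedged C11 sketch of the reservation step alone:

#include <stdatomic.h>

#define LOG_LEN 32

static atomic_uint log_next;

static int reserve_slot(void)
{
	unsigned entry, next;

	do {
		entry = atomic_load(&log_next);
		if (entry >= LOG_LEN)
			return -1;		/* buffer full, drop */
		next = entry + 1;
	} while (!atomic_compare_exchange_weak(&log_next, &entry, next));

	return (int)entry;			/* slot now owned exclusively */
}

Once the slot is owned, the record can be copied in and its finished flag set last, exactly as in the function above.
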
185
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
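
msr_to_offset() turns an MSR number into a byte offset inside struct mce, letting mce_rdmsrl()/mce_wrmsrl() transparently serve the injected record instead of hardware. A minimal sketch of the same offsetof() redirection, with hypothetical register numbers and a cut-down record:

#include <stddef.h>
#include <stdint.h>

struct fake_mce {
	uint64_t status, addr, misc;
};

static int reg_to_offset(int reg)
{
	switch (reg) {
	case 0: return offsetof(struct fake_mce, status);
	case 1: return offsetof(struct fake_mce, addr);
	case 2: return offsetof(struct fake_mce, misc);
	default: return -1;			/* not redirected */
	}
}

static uint64_t read_reg(struct fake_mce *m, int reg)
{
	int off = reg_to_offset(reg);

	return off < 0 ? 0 : *(uint64_t *)((char *)m + off);
}

int main(void)
{
	struct fake_mce m = { .status = 0xb2, .addr = 0x1000, .misc = 0 };

	return read_reg(&m, 1) == 0x1000 ? 0 : 1;
}
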
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler with the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
331
332struct mce_ring {
333 unsigned short start;
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
375 wmb();
376 r->end = next;
377 return 0;
378}
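
The single-producer/single-consumer ring above stays lockless because only the writer moves end and only the reader moves start, and one slot is always left unused so full and empty are distinguishable. A self-contained demo of the same discipline (barriers omitted since this version runs single-threaded):

#include <stdio.h>

#define RING_SIZE 16				/* one entry sacrificed */

static unsigned long ring[RING_SIZE];
static unsigned short start, end;

static int ring_add(unsigned long v)
{
	unsigned next = (end + 1) % RING_SIZE;

	if (next == start)
		return -1;			/* full */
	ring[end] = v;
	end = next;				/* publish after the store */
	return 0;
}

static int ring_get(unsigned long *v)
{
	if (start == end)
		return 0;			/* empty */
	*v = ring[start];
	start = (start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	unsigned long v;
	int i;

	for (i = 0; ring_add(i) == 0; i++)
		;				/* fills RING_SIZE - 1 slots */
	printf("stored %d entries\n", i);
	while (ring_get(&v))
		;				/* drains in FIFO order */
	return 0;
}
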
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region
418 * in the kernel.
419 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger a self interrupt
456 * through the APIC to instead do the notification
457 * after interrupts are reenabled again.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: the spec recommends panicking for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between the exception handler
482 * and the poll handler -- so we skip this for now.
483 * These cases should not happen anyway, or only when the CPU
484 * is already totally confused. In this case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
570 * Each CPU spins initially until mce_executing equals its number.
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() to tell the compiler that system_state
588 * might have been modified by someone else.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU that entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. When any
613 * error is fatal it panics. Only then does it let the others continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in an unrecoverable case
619 * and also make sure that all CPUs' errors are always examined.
620 *
621 * This also detects the case of a machine check event coming from outer
622 * space (not detected by any CPU). In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in an unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's OK to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For a UC error somewhere we let the CPU that detects it handle it.
666 * We must also let the others continue, otherwise the handling
667 * CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires panic. Then it executes them
691 * in the entry order.
692 * TBD double check parallel CPU hotunplug
693 */
694static int mce_start(int no_way_out, int *order)
695{
696 int nwo;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
700 if (!timeout) {
701 *order = -1;
702 return no_way_out;
703 }
704
705 atomic_add(no_way_out, &global_nwo);
706
707 /*
708 * Wait for everyone.
709 */
710 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0);
713 *order = -1;
714 return no_way_out;
715 }
716 ndelay(SPINUNIT);
717 }
718
719 /*
720 * Cache the global no_way_out state.
721 */
722 nwo = atomic_read(&global_nwo);
723
724 /*
725 * Monarch starts executing now, the others wait.
726 */
727 if (*order == 1) {
728 atomic_set(&mce_executing, 1);
729 return nwo;
730 }
731
732 /*
733 * Now start the scanning loop one by one
734 * in the original callin order.
735 * This way when there are any shared banks it will
736 * be only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < *order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747}
748
749/*
750 * Synchronize between CPUs after main scanning loop.
751 * This invokes the bulk of the Monarch processing.
752 */
753static int mce_end(int order)
754{
755 int ret = -1;
756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758 if (!timeout)
759 goto reset;
760 if (order < 0)
761 goto reset;
762
763 /*
764 * Allow others to run.
765 */
766 atomic_inc(&mce_executing);
767
768 if (order == 1) {
769 /* CHECKME: Can this race with a parallel hotplug? */
770 int cpus = num_online_cpus();
771
772 /*
773 * Monarch: Wait for everyone to go through their scanning
774 * loops.
775 */
776 while (atomic_read(&mce_executing) <= cpus) {
777 if (mce_timed_out(&timeout))
778 goto reset;
779 ndelay(SPINUNIT);
780 }
781
782 mce_reign();
783 barrier();
784 ret = 0;
785 } else {
786 /*
787 * Subject: Wait for Monarch to finish.
788 */
789 while (atomic_read(&mce_executing) != 0) {
790 if (mce_timed_out(&timeout))
791 goto reset;
792 ndelay(SPINUNIT);
793 }
794
795 /*
796 * Don't reset anything. That's done by the Monarch.
797 */
798 return 0;
799 }
800
801 /*
802 * Reset all global state.
803 */
804reset:
805 atomic_set(&global_nwo, 0);
806 atomic_set(&mce_callin, 0);
807 barrier();
808
809 /*
810 * Let others run again.
811 */
812 atomic_set(&mce_executing, 0);
813 return ret;
814}
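
mce_start()/mce_end() implement a rendezvous: each CPU takes a ticket from mce_callin, the first ticket holder becomes the Monarch, and mce_executing hands out turns in ticket order. A toy pthread model of the election and baton passing (timeouts and the no_way_out plumbing are omitted; this is a sketch of the synchronization shape, not the handler itself):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int callin, executing;

static void *mce_entry(void *arg)
{
	int order = atomic_fetch_add(&callin, 1) + 1;	/* my ticket */

	while (atomic_load(&callin) != NCPUS)
		;					/* wait for everyone */
	if (order == 1) {
		printf("ticket 1 is Monarch\n");
		atomic_store(&executing, 1);		/* start the chain */
	}
	while (atomic_load(&executing) < order)
		;					/* wait for my turn */
	atomic_fetch_add(&executing, 1);		/* pass the baton */
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, mce_entry, NULL);
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
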
815
816/*
817 * Check if the address reported by the CPU is in a format we can parse.
818 * It would be possible to add code for most other cases, but all would
819 * be somewhat complicated (e.g. segment offset would require an instruction
820 * parser). So only support physical addresses up to page granularity for now.
821 */
822static int mce_usable_address(struct mce *m)
823{
824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
825 return 0;
826 if ((m->misc & 0x3f) > PAGE_SHIFT)
827 return 0;
828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
829 return 0;
830 return 1;
831}
832
833static void mce_clear_state(unsigned long *toclear)
834{
835 int i;
836
837 for (i = 0; i < banks; i++) {
838 if (test_bit(i, toclear))
839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
840 }
841}
842
843/*
844 * The actual machine check handler. This only handles real
845 * exceptions when something got corrupted coming in through int 18.
846 *
847 * This is executed in NMI context not subject to normal locking rules. This
848 * implies that most kernel services cannot be safely used. Don't even
849 * think about putting a printk in there!
850 *
851 * On Intel systems this is entered on all CPUs in parallel through
852 * MCE broadcast. However some CPUs might be broken beyond repair,
853 * so always be careful when synchronizing with the others.
854 */
855void do_machine_check(struct pt_regs *regs, long error_code)
856{
857 struct mce m, *final;
858 int i;
859 int worst = 0;
860 int severity;
861 /*
862 * Establish sequential order between the CPUs entering the machine
863 * check handler.
864 */
865 int order;
866
867 /*
868 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway.
870 */
871 int no_way_out = 0;
872 /*
873 * If kill_it gets set, there might be a way to recover from this
874 * error.
875 */
876 int kill_it = 0;
877 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
878 char *msg = "Unknown";
879
880 atomic_inc(&mce_entry);
881
882 __get_cpu_var(mce_exception_count)++;
883
884 if (notify_die(DIE_NMI, "machine check", regs, error_code,
885 18, SIGKILL) == NOTIFY_STOP)
886 goto out;
887 if (!banks)
888 goto out;
889
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m);
892
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
894 no_way_out = mce_no_way_out(&m, &msg);
895
896 final = &__get_cpu_var(mces_seen);
897 *final = m;
898
899 barrier();
900
901 /*
902 * When there is no restart IP we must always kill or panic.
903 */
904 if (!(m.mcgstatus & MCG_STATUS_RIPV))
905 kill_it = 1;
906
907 /*
908 * Go through all the banks in exclusion of the other CPUs.
909 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it.
911 */
912 no_way_out = mce_start(no_way_out, &order);
913 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear);
915 if (!bank[i])
916 continue;
917
918 m.misc = 0;
919 m.addr = 0;
920 m.bank = i;
921
922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
923 if ((m.status & MCI_STATUS_VAL) == 0)
924 continue;
925
926 /*
927 * Errors that are neither uncorrected nor signalled are handled
928 * by machine_check_poll(). Leave them alone, unless this panics.
929 */
930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
931 !no_way_out)
932 continue;
933
934 /*
935 * Set taint even when machine check was not enabled.
936 */
937 add_taint(TAINT_MACHINE_CHECK);
938
939 severity = mce_severity(&m, tolerant, NULL);
940
941 /*
942 * When the machine check was for the corrected handler, don't
943 * touch it unless we're panicking.
944 */
945 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
946 continue;
947 __set_bit(i, toclear);
948 if (severity == MCE_NO_SEVERITY) {
949 /*
950 * Machine check event was not enabled. Clear, but
951 * ignore.
952 */
953 continue;
954 }
955
956 /*
957 * Kill on action required.
958 */
959 if (severity == MCE_AR_SEVERITY)
960 kill_it = 1;
961
962 if (m.status & MCI_STATUS_MISCV)
963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
964 if (m.status & MCI_STATUS_ADDRV)
965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
966
967 /*
968 * Action optional error. Queue address for later processing.
969 * When the ring overflows we just ignore the AO error.
970 * RED-PEN add some logging mechanism when
971 * usable_address or mce_add_ring fails.
972 * RED-PEN don't ignore overflow for tolerant == 0
973 */
974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
975 mce_ring_add(m.addr >> PAGE_SHIFT);
976
977 mce_get_rip(&m, regs);
978 mce_log(&m);
979
980 if (severity > worst) {
981 *final = m;
982 worst = severity;
983 }
984 }
985
986 if (!no_way_out)
987 mce_clear_state(toclear);
988
989 /*
990 * Do most of the synchronization with other CPUs.
991 * When there's any problem use only local no_way_out state.
992 */
993 if (mce_end(order) < 0)
994 no_way_out = worst >= MCE_PANIC_SEVERITY;
995
996 /*
997 * If we have decided that we just CAN'T continue, and the user
998 * has not set tolerant to an insane level, give up and die.
999 *
1000 * This is mainly used in the case when the system doesn't
1001 * support MCE broadcasting or it has been disabled.
1002 */
1003 if (no_way_out && tolerant < 3)
1004 mce_panic("Fatal machine check on current CPU", final, msg);
1005
1006 /*
1007 * If the error seems to be unrecoverable, something should be
1008 * done. Try to kill as little as possible. If we can kill just
1009 * one task, do that. If the user has set the tolerance very
1010 * high, don't try to do anything at all.
1011 */
1012
1013 if (kill_it && tolerant < 3)
1014 force_sig(SIGBUS, current);
1015
1016 /* notify userspace ASAP */
1017 set_thread_flag(TIF_MCE_NOTIFY);
1018
1019 if (worst > 0)
1020 mce_report_event(regs);
1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1022out:
1023 atomic_dec(&mce_entry);
1024 sync_core();
1025}
1026EXPORT_SYMBOL_GPL(do_machine_check);
1027
1028/* dummy to break dependency. actual code is in mm/memory-failure.c */
1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1030{
1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1032}
1033
1034/*
1035 * Called after mce notification in process context. This code
1036 * is allowed to sleep. Call the high level VM handler to process
1037 * any corrupted pages.
1038 * Assume that the work queue code only calls this one at a time
1039 * per CPU.
1040 * Note we don't disable preemption, so this code might run on the wrong
1041 * CPU. In this case the event is picked up by the scheduled work queue.
1042 * This is merely a fast path to expedite processing in some common
1043 * cases.
1044 */
1045void mce_notify_process(void)
1046{
1047 unsigned long pfn;
1048 mce_notify_irq();
1049 while (mce_ring_get(&pfn))
1050 memory_failure(pfn, MCE_VECTOR);
1051}
1052
1053static void mce_process_work(struct work_struct *dummy)
1054{
1055 mce_notify_process();
1056}
1057
1058#ifdef CONFIG_X86_MCE_INTEL
1059/**
1060 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1061 * @cpu: The CPU on which the event occurred.
1062 * @status: Event status information
1063 *
1064 * This function should be called by the thermal interrupt after the
1065 * event has been processed and the decision was made to log the event
1066 * further.
1067 *
1068 * The status parameter will be saved to the 'status' field of 'struct mce'
1069 * and historically has been the register value of the
1070 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1071 */
1072void mce_log_therm_throt_event(__u64 status)
1073{
1074 struct mce m;
1075
1076 mce_setup(&m);
1077 m.bank = MCE_THERMAL_BANK;
1078 m.status = status;
1079 mce_log(&m);
1080}
1081#endif /* CONFIG_X86_MCE_INTEL */
1082
1083/*
1084 * Periodic polling timer for "silent" machine check errors. If the
1085 * poller finds an MCE, poll 2x faster. When the poller finds no more
1086 * errors, poll 2x slower (up to check_interval seconds).
1087 */
1088static int check_interval = 5 * 60; /* 5 minutes */
1089
1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1091static DEFINE_PER_CPU(struct timer_list, mce_timer);
1092
1093static void mcheck_timer(unsigned long data)
1094{
1095 struct timer_list *t = &per_cpu(mce_timer, data);
1096 int *n;
1097
1098 WARN_ON(smp_processor_id() != data);
1099
1100 if (mce_available(&current_cpu_data)) {
1101 machine_check_poll(MCP_TIMESTAMP,
1102 &__get_cpu_var(mce_poll_banks));
1103 }
1104
1105 /*
1106 * Alert userspace if needed. If we logged an MCE, reduce the
1107 * polling interval, otherwise increase the polling interval.
1108 */
1109 n = &__get_cpu_var(next_interval);
1110 if (mce_notify_irq())
1111 *n = max(*n/2, HZ/100);
1112 else
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114
1115 t->expires = jiffies + *n;
1116 add_timer(t);
1117}
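
The polling interval adapts exponentially: halve it when an event was logged (floored at HZ/100), double it when idle (capped at check_interval*HZ). A one-function sketch of that clamped halving/doubling; the numbers in main() are illustrative only:

#include <stdio.h>

static int next_poll_interval(int cur, int found, int floor, int ceil)
{
	if (found)
		return cur / 2 > floor ? cur / 2 : floor; /* max(cur/2, floor) */
	return cur * 2 < ceil ? cur * 2 : ceil;		  /* min(cur*2, ceil) */
}

int main(void)
{
	int n = 300;					/* sample interval in jiffies */

	n = next_poll_interval(n, 1, 10, 30000);	/* event: interval halves */
	n = next_poll_interval(n, 0, 10, 30000);	/* idle: interval doubles */
	printf("%d\n", n);
	return 0;
}

With floor = HZ/100 and ceil = check_interval*HZ this reproduces the max()/min() expressions in mcheck_timer() above.
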
1118
1119static void mce_do_trigger(struct work_struct *work)
1120{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
1122}
1123
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1125
1126/*
1127 * Notify the user(s) about new machine check events.
1128 * Can be called from interrupt context, but not from machine check/NMI
1129 * context.
1130 */
1131int mce_notify_irq(void)
1132{
1133 /* Not more than two messages every minute */
1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1135
1136 clear_thread_flag(TIF_MCE_NOTIFY);
1137
1138 if (test_and_clear_bit(0, &notify_user)) {
1139 wake_up_interruptible(&mce_wait);
1140
1141 /*
1142 * There is no risk of missing notifications because
1143 * work_pending is always cleared before the function is
1144 * executed.
1145 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work);
1148
1149 if (__ratelimit(&ratelimit))
1150 printk(KERN_INFO "Machine check events logged\n");
1151
1152 return 1;
1153 }
1154 return 0;
1155}
1156EXPORT_SYMBOL_GPL(mce_notify_irq);
1157
1158/*
1159 * Initialize Machine Checks for a CPU.
1160 */
1161static int mce_cap_init(void)
1162{
1163 unsigned b;
1164 u64 cap;
1165
1166 rdmsrl(MSR_IA32_MCG_CAP, cap);
1167
1168 b = cap & MCG_BANKCNT_MASK;
1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1170
1171 if (b > MAX_NR_BANKS) {
1172 printk(KERN_WARNING
1173 "MCE: Using only %u machine check banks out of %u\n",
1174 MAX_NR_BANKS, b);
1175 b = MAX_NR_BANKS;
1176 }
1177
1178 /* Don't support asymmetric configurations today */
1179 WARN_ON(banks != 0 && b != banks);
1180 banks = b;
1181 if (!bank) {
1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1183 if (!bank)
1184 return -ENOMEM;
1185 memset(bank, 0xff, banks * sizeof(u64));
1186 }
1187
1188 /* Use accurate RIP reporting if available. */
1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1190 rip_msr = MSR_IA32_MCG_EIP;
1191
1192 if (cap & MCG_SER_P)
1193 mce_ser = 1;
1194
1195 return 0;
1196}
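
mce_cap_init() pulls several fields out of MCG_CAP. A stand-alone sketch decoding the same fields from a hypothetical sample value (bank count in bits 0-7, MCG_CTL_P bit 8, MCG_EXT_P bit 9, extended MSR count in bits 16-23, MCG_SER_P bit 24, matching the defines used above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cap = 0x1090309;		/* hypothetical sample value */

	printf("banks=%u ctl=%d ext=%d ser=%d ext_cnt=%u\n",
	       (unsigned)(cap & 0xff),		/* MCG_BANKCNT_MASK */
	       (int)(cap >> 8 & 1),		/* MCG_CTL_P */
	       (int)(cap >> 9 & 1),		/* MCG_EXT_P */
	       (int)(cap >> 24 & 1),		/* MCG_SER_P */
	       (unsigned)(cap >> 16 & 0xff));	/* MCG_EXT_CNT */
	return 0;
}
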
1197
1198static void mce_init(void)
1199{
1200 mce_banks_t all_banks;
1201 u64 cap;
1202 int i;
1203
1204 /*
1205 * Log the machine checks left over from the previous reset.
1206 */
1207 bitmap_fill(all_banks, MAX_NR_BANKS);
1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1209
1210 set_in_cr4(X86_CR4_MCE);
1211
1212 rdmsrl(MSR_IA32_MCG_CAP, cap);
1213 if (cap & MCG_CTL_P)
1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1215
1216 for (i = 0; i < banks; i++) {
1217 if (skip_bank_init(i))
1218 continue;
1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1221 }
1222}
1223
1224/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{
1227 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) {
1230 /*
1231 * disable GART TBL walk error reporting, which
1232 * trips off incorrectly with the IOMMU & 3ware
1233 * & Cerberus:
1234 */
1235 clear_bit(10, (unsigned long *)&bank[4]);
1236 }
1237 if (c->x86 <= 17 && mce_bootlog < 0) {
1238 /*
1239 * Lots of broken BIOSes around that don't clear them
1240 * by default and leave crap in there. Don't log:
1241 */
1242 mce_bootlog = 0;
1243 }
1244 /*
1245 * Various K7s with broken bank 0 around. Always disable
1246 * by default.
1247 */
1248 if (c->x86 == 6)
1249 bank[0] = 0;
1250 }
1251
1252 if (c->x86_vendor == X86_VENDOR_INTEL) {
1253 /*
1254 * SDM documents that on family 6 bank 0 should not be written
1255 * because it aliases to another special BIOS controlled
1256 * register.
1257 * But it's not aliased anymore on model 0x1a+
1258 * Don't ignore bank 0 completely because there could be a
1259 * valid event later, merely don't write CTL0.
1260 */
1261
1262 if (c->x86 == 6 && c->x86_model < 0x1A)
1263 __set_bit(0, &dont_init_banks);
1264
1265 /*
1266 * All newer Intel systems support MCE broadcasting. Enable
1267 * synchronization with a one second timeout.
1268 */
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC;
1272 }
1273 if (monarch_timeout < 0)
1274 monarch_timeout = 0;
1275 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30;
1277}
1278
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1280{
1281 if (c->x86 != 5)
1282 return;
1283 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled())
1286 intel_p5_mcheck_init(c);
1287 break;
1288 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c);
1290 break;
1291 }
1292}
1293
1294static void mce_cpu_features(struct cpuinfo_x86 *c)
1295{
1296 switch (c->x86_vendor) {
1297 case X86_VENDOR_INTEL:
1298 mce_intel_feature_init(c);
1299 break;
1300 case X86_VENDOR_AMD:
1301 mce_amd_feature_init(c);
1302 break;
1303 default:
1304 break;
1305 }
1306}
1307
1308static void mce_init_timer(void)
1309{
1310 struct timer_list *t = &__get_cpu_var(mce_timer);
1311 int *n = &__get_cpu_var(next_interval);
1312
1313 if (mce_ignore_ce)
1314 return;
1315
1316 *n = check_interval * HZ;
1317 if (!*n)
1318 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t);
1322}
1323
1324/*
1325 * Called for each booted CPU to set up machine checks.
1326 * Must be called with preempt off:
1327 */
1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1329{
1330 if (mce_disabled)
1331 return;
1332
1333 mce_ancient_init(c);
1334
1335 if (!mce_available(c))
1336 return;
1337
1338 if (mce_cap_init() < 0) {
1339 mce_disabled = 1;
1340 return;
1341 }
1342 mce_cpu_quirks(c);
1343
1344 machine_check_vector = do_machine_check;
1345
1346 mce_init();
1347 mce_cpu_features(c);
1348 mce_init_timer();
1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1350}
1351
1352/*
1353 * Character device to read and clear the MCE log.
1354 */
1355
1356static DEFINE_SPINLOCK(mce_state_lock);
1357static int open_count; /* #times opened */
1358static int open_exclu; /* already open exclusive? */
1359
1360static int mce_open(struct inode *inode, struct file *file)
1361{
1362 spin_lock(&mce_state_lock);
1363
1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1365 spin_unlock(&mce_state_lock);
1366
1367 return -EBUSY;
1368 }
1369
1370 if (file->f_flags & O_EXCL)
1371 open_exclu = 1;
1372 open_count++;
1373
1374 spin_unlock(&mce_state_lock);
1375
1376 return nonseekable_open(inode, file);
1377}
1378
1379static int mce_release(struct inode *inode, struct file *file)
1380{
1381 spin_lock(&mce_state_lock);
1382
1383 open_count--;
1384 open_exclu = 0;
1385
1386 spin_unlock(&mce_state_lock);
1387
1388 return 0;
1389}
1390
1391static void collect_tscs(void *data)
1392{
1393 unsigned long *cpu_tsc = (unsigned long *)data;
1394
1395 rdtscll(cpu_tsc[smp_processor_id()]);
1396}
1397
1398static DEFINE_MUTEX(mce_read_mutex);
1399
1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1401 loff_t *off)
1402{
1403 char __user *buf = ubuf;
1404 unsigned long *cpu_tsc;
1405 unsigned prev, next;
1406 int i, err;
1407
1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1409 if (!cpu_tsc)
1410 return -ENOMEM;
1411
1412 mutex_lock(&mce_read_mutex);
1413 next = rcu_dereference(mcelog.next);
1414
1415 /* Only supports full reads right now */
1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1417 mutex_unlock(&mce_read_mutex);
1418 kfree(cpu_tsc);
1419
1420 return -EINVAL;
1421 }
1422
1423 err = 0;
1424 prev = 0;
1425 do {
1426 for (i = prev; i < next; i++) {
1427 unsigned long start = jiffies;
1428
1429 while (!mcelog.entry[i].finished) {
1430 if (time_after_eq(jiffies, start + 2)) {
1431 memset(mcelog.entry + i, 0,
1432 sizeof(struct mce));
1433 goto timeout;
1434 }
1435 cpu_relax();
1436 }
1437 smp_rmb();
1438 err |= copy_to_user(buf, mcelog.entry + i,
1439 sizeof(struct mce));
1440 buf += sizeof(struct mce);
1441timeout:
1442 ;
1443 }
1444
1445 memset(mcelog.entry + prev, 0,
1446 (next - prev) * sizeof(struct mce));
1447 prev = next;
1448 next = cmpxchg(&mcelog.next, prev, 0);
1449 } while (next != prev);
1450
1451 synchronize_sched();
1452
1453 /*
1454 * Collect entries that were still getting written before the
1455 * synchronize.
1456 */
1457 on_each_cpu(collect_tscs, cpu_tsc, 1);
1458
1459 for (i = next; i < MCE_LOG_LEN; i++) {
1460 if (mcelog.entry[i].finished &&
1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1462 err |= copy_to_user(buf, mcelog.entry+i,
1463 sizeof(struct mce));
1464 smp_rmb();
1465 buf += sizeof(struct mce);
1466 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1467 }
1468 }
1469 mutex_unlock(&mce_read_mutex);
1470 kfree(cpu_tsc);
1471
1472 return err ? -EFAULT : buf - ubuf;
1473}
1474
1475static unsigned int mce_poll(struct file *file, poll_table *wait)
1476{
1477 poll_wait(file, &mce_wait, wait);
1478 if (rcu_dereference(mcelog.next))
1479 return POLLIN | POLLRDNORM;
1480 return 0;
1481}
1482
1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1484{
1485 int __user *p = (int __user *)arg;
1486
1487 if (!capable(CAP_SYS_ADMIN))
1488 return -EPERM;
1489
1490 switch (cmd) {
1491 case MCE_GET_RECORD_LEN:
1492 return put_user(sizeof(struct mce), p);
1493 case MCE_GET_LOG_LEN:
1494 return put_user(MCE_LOG_LEN, p);
1495 case MCE_GETCLEAR_FLAGS: {
1496 unsigned flags;
1497
1498 do {
1499 flags = mcelog.flags;
1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1501
1502 return put_user(flags, p);
1503 }
1504 default:
1505 return -ENOTTY;
1506 }
1507}
1508
1509/* Modified in mce-inject.c, so not static or const */
1510struct file_operations mce_chrdev_ops = {
1511 .open = mce_open,
1512 .release = mce_release,
1513 .read = mce_read,
1514 .poll = mce_poll,
1515 .unlocked_ioctl = mce_ioctl,
1516};
1517EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1518
1519static struct miscdevice mce_log_device = {
1520 MISC_MCELOG_MINOR,
1521 "mcelog",
1522 &mce_chrdev_ops,
1523};
1524
1525/*
1526 * mce=off Disables machine check
1527 * mce=no_cmci Disables CMCI
1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1531 * monarchtimeout is how long to wait for other CPUs on machine
1532 * check, or 0 to not wait
1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1534 * mce=nobootlog Don't log MCEs from before booting.
1535 */
1536static int __init mcheck_enable(char *str)
1537{
1538 if (*str == 0)
1539 enable_p5_mce();
1540 if (*str == '=')
1541 str++;
1542 if (!strcmp(str, "off"))
1543 mce_disabled = 1;
1544 else if (!strcmp(str, "no_cmci"))
1545 mce_cmci_disabled = 1;
1546 else if (!strcmp(str, "dont_log_ce"))
1547 mce_dont_log_ce = 1;
1548 else if (!strcmp(str, "ignore_ce"))
1549 mce_ignore_ce = 1;
1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1551 mce_bootlog = (str[0] == 'b');
1552 else if (isdigit(str[0])) {
1553 get_option(&str, &tolerant);
1554 if (*str == ',') {
1555 ++str;
1556 get_option(&str, &monarch_timeout);
1557 }
1558 } else {
1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1560 str);
1561 return 0;
1562 }
1563 return 1;
1564}
1565__setup("mce", mcheck_enable);
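/*
 * Illustrative boot lines accepted by the parser above (values are
 * examples only, not recommendations):
 *
 *	mce=off			turn machine checks off completely
 *	mce=no_cmci		keep MCE but disable CMCI
 *	mce=dont_log_ce		clear corrected events without logging them
 *	mce=2,100		tolerant=2 plus a monarch timeout of 100
 *	mce=nobootlog		ignore MCEs left over from before boot
 */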
1566
1567/*
1568 * Sysfs support
1569 */
1570
1571/*
1572 * Disable machine checks on suspend and shutdown. We can't really handle
1573 * them later.
1574 */
1575static int mce_disable(void)
1576{
1577 int i;
1578
1579 for (i = 0; i < banks; i++) {
1580 if (!skip_bank_init(i))
1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1582 }
1583 return 0;
1584}
1585
1586static int mce_suspend(struct sys_device *dev, pm_message_t state)
1587{
1588 return mce_disable();
1589}
1590
1591static int mce_shutdown(struct sys_device *dev)
1592{
1593 return mce_disable();
1594}
1595
1596/*
1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1598 * Only one CPU is active at this time, the others get re-added later using
1599 * CPU hotplug:
1600 */
1601static int mce_resume(struct sys_device *dev)
1602{
1603 mce_init();
1604 mce_cpu_features(&current_cpu_data);
1605
1606 return 0;
1607}
1608
1609static void mce_cpu_restart(void *data)
1610{
1611 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data))
1613 mce_init();
1614 mce_init_timer();
1615}
1616
1617/* Reinit MCEs after user configuration changes */
1618static void mce_restart(void)
1619{
1620 on_each_cpu(mce_cpu_restart, NULL, 1);
1621}
1622
1623static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown,
1626 .resume = mce_resume,
1627 .name = "machinecheck",
1628};
1629
1630DEFINE_PER_CPU(struct sys_device, mce_dev);
1631
1632__cpuinitdata
1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1634
1635static struct sysdev_attribute *bank_attrs;
1636
1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1638 char *buf)
1639{
1640 u64 b = bank[attr - bank_attrs];
1641
1642 return sprintf(buf, "%llx\n", b);
1643}
1644
1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1646 const char *buf, size_t size)
1647{
1648 u64 new;
1649
1650 if (strict_strtoull(buf, 0, &new) < 0)
1651 return -EINVAL;
1652
1653 bank[attr - bank_attrs] = new;
1654 mce_restart();
1655
1656 return size;
1657}
1658
1659static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{
1662 strcpy(buf, trigger);
1663 strcat(buf, "\n");
1664 return strlen(trigger) + 1;
1665}
1666
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz)
1669{
1670 char *p;
1671 int len;
1672
1673 strncpy(trigger, buf, sizeof(trigger));
1674 trigger[sizeof(trigger)-1] = 0;
1675 len = strlen(trigger);
1676 p = strchr(trigger, '\n');
1677
1678	if (p)
1679		*p = 0;
1680
1681 return len;
1682}
1683
1684static ssize_t store_int_with_restart(struct sys_device *s,
1685 struct sysdev_attribute *attr,
1686 const char *buf, size_t size)
1687{
1688 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1689 mce_restart();
1690 return ret;
1691}
1692
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1696
1697static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1699 store_int_with_restart),
1700 &check_interval
1701};
1702
1703static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1705 &attr_monarch_timeout.attr,
1706 NULL
1707};
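/*
 * Everything in mce_attrs[] (plus the per-bank files created below) lands
 * under the "machinecheck" sysdev class, so runtime tuning is an ordinary
 * sysfs write; store_int_with_restart() is why writing check_interval also
 * reinitializes MCE on every CPU. A hedged userspace sketch -- the
 * /sys/devices/system/machinecheck/machinecheckN path follows the usual
 * sysdev class/instance naming, but verify it on the running kernel:
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_check_interval(const char *val)
{
	int fd = open("/sys/devices/system/machinecheck/machinecheck0"
		      "/check_interval", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));	/* ends up in store_int_with_restart() */
	close(fd);
	return n < 0 ? -1 : 0;
}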
1708
1709static cpumask_var_t mce_dev_initialized;
1710
1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1712static __cpuinit int mce_create_device(unsigned int cpu)
1713{
1714 int err;
1715 int i;
1716
1717 if (!mce_available(&boot_cpu_data))
1718 return -EIO;
1719
1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1721 per_cpu(mce_dev, cpu).id = cpu;
1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1723
1724 err = sysdev_register(&per_cpu(mce_dev, cpu));
1725 if (err)
1726 return err;
1727
1728 for (i = 0; mce_attrs[i]; i++) {
1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1730 if (err)
1731 goto error;
1732 }
1733 for (i = 0; i < banks; i++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]);
1736 if (err)
1737 goto error2;
1738 }
1739 cpumask_set_cpu(cpu, mce_dev_initialized);
1740
1741 return 0;
1742error2:
1743 while (--i >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1745error:
1746 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1748
1749 sysdev_unregister(&per_cpu(mce_dev, cpu));
1750
1751 return err;
1752}
1753
1754static __cpuinit void mce_remove_device(unsigned int cpu)
1755{
1756 int i;
1757
1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1759 return;
1760
1761 for (i = 0; mce_attrs[i]; i++)
1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1763
1764 for (i = 0; i < banks; i++)
1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1766
1767 sysdev_unregister(&per_cpu(mce_dev, cpu));
1768 cpumask_clear_cpu(cpu, mce_dev_initialized);
1769}
1770
1771/* Make sure there are no machine checks on offlined CPUs. */
1772static void mce_disable_cpu(void *h)
1773{
1774 unsigned long action = *(unsigned long *)h;
1775 int i;
1776
1777 if (!mce_available(&current_cpu_data))
1778 return;
1779 if (!(action & CPU_TASKS_FROZEN))
1780 cmci_clear();
1781 for (i = 0; i < banks; i++) {
1782 if (!skip_bank_init(i))
1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1784 }
1785}
1786
1787static void mce_reenable_cpu(void *h)
1788{
1789 unsigned long action = *(unsigned long *)h;
1790 int i;
1791
1792 if (!mce_available(&current_cpu_data))
1793 return;
1794
1795 if (!(action & CPU_TASKS_FROZEN))
1796 cmci_reenable();
1797 for (i = 0; i < banks; i++) {
1798 if (!skip_bank_init(i))
1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1800 }
1801}
1802
1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1804static int __cpuinit
1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1806{
1807 unsigned int cpu = (unsigned long)hcpu;
1808 struct timer_list *t = &per_cpu(mce_timer, cpu);
1809
1810 switch (action) {
1811 case CPU_ONLINE:
1812 case CPU_ONLINE_FROZEN:
1813 mce_create_device(cpu);
1814 if (threshold_cpu_callback)
1815 threshold_cpu_callback(action, cpu);
1816 break;
1817 case CPU_DEAD:
1818 case CPU_DEAD_FROZEN:
1819 if (threshold_cpu_callback)
1820 threshold_cpu_callback(action, cpu);
1821 mce_remove_device(cpu);
1822 break;
1823 case CPU_DOWN_PREPARE:
1824 case CPU_DOWN_PREPARE_FROZEN:
1825 del_timer_sync(t);
1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1827 break;
1828 case CPU_DOWN_FAILED:
1829 case CPU_DOWN_FAILED_FROZEN:
1830		t->expires = round_jiffies(jiffies +
1831				per_cpu(next_interval, cpu));
1832 add_timer_on(t, cpu);
1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1834 break;
1835 case CPU_POST_DEAD:
1836 /* intentionally ignoring frozen here */
1837 cmci_rediscover(cpu);
1838 break;
1839 }
1840 return NOTIFY_OK;
1841}
1842
1843static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1844 .notifier_call = mce_cpu_callback,
1845};
1846
1847static __init int mce_init_banks(void)
1848{
1849 int i;
1850
1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1852 GFP_KERNEL);
1853 if (!bank_attrs)
1854 return -ENOMEM;
1855
1856 for (i = 0; i < banks; i++) {
1857 struct sysdev_attribute *a = &bank_attrs[i];
1858
1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1860 if (!a->attr.name)
1861 goto nomem;
1862
1863 a->attr.mode = 0644;
1864 a->show = show_bank;
1865 a->store = set_bank;
1866 }
1867 return 0;
1868
1869nomem:
1870 while (--i >= 0)
1871 kfree(bank_attrs[i].attr.name);
1872 kfree(bank_attrs);
1873 bank_attrs = NULL;
1874
1875 return -ENOMEM;
1876}
1877
1878static __init int mce_init_device(void)
1879{
1880 int err;
1881 int i = 0;
1882
1883 if (!mce_available(&boot_cpu_data))
1884 return -EIO;
1885
1886	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887
1888 err = mce_init_banks();
1889 if (err)
1890 return err;
1891
1892 err = sysdev_class_register(&mce_sysclass);
1893 if (err)
1894 return err;
1895
1896 for_each_online_cpu(i) {
1897 err = mce_create_device(i);
1898 if (err)
1899 return err;
1900 }
1901
1902 register_hotcpu_notifier(&mce_cpu_notifier);
1903 misc_register(&mce_log_device);
1904
1905 return err;
1906}
1907
1908device_initcall(mce_init_device);
1909
1910#else /* CONFIG_X86_OLD_MCE: */
1911
1912int nr_mce_banks;
1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1914
1915/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c)
1917{
1918 if (mce_disabled == 1)
1919 return;
1920
1921 switch (c->x86_vendor) {
1922 case X86_VENDOR_AMD:
1923 amd_mcheck_init(c);
1924 break;
1925
1926 case X86_VENDOR_INTEL:
1927 if (c->x86 == 5)
1928 intel_p5_mcheck_init(c);
1929 if (c->x86 == 6)
1930 intel_p6_mcheck_init(c);
1931 if (c->x86 == 15)
1932 intel_p4_mcheck_init(c);
1933 break;
1934
1935 case X86_VENDOR_CENTAUR:
1936 if (c->x86 == 5)
1937 winchip_mcheck_init(c);
1938 break;
1939
1940 default:
1941 break;
1942 }
1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1944}
1945
1946static int __init mcheck_enable(char *str)
1947{
1948 mce_disabled = -1;
1949 return 1;
1950}
1951
1952__setup("mce", mcheck_enable);
1953
1954#endif /* CONFIG_X86_OLD_MCE */
1955
1956/*
1957 * Old style boot options parsing. Only for compatibility.
1958 */
1959static int __init mcheck_disable(char *str)
1960{
1961 mce_disabled = 1;
1962 return 1;
1963}
1964__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f1..84a552b458c8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
+#ifdef CONFIG_X86_OLD_MCE
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
+extern int mce_p5_enable;
+static inline int mce_p5_enabled(void) { return mce_p5_enable; }
+static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mce_p5_enabled(void) { return 0; }
+static inline void enable_p5_mce(void) { }
+#endif
 
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
+#ifdef CONFIG_X86_OLD_MCE
+
 extern int nr_mce_banks;
 
+void intel_set_thermal_handler(void);
+
+#else
+
+static inline void intel_set_thermal_handler(void) { }
+
+#endif
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 289cc4815028..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1188 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
423EXPORT_SYMBOL_GPL(do_machine_check);
424
425#ifdef CONFIG_X86_MCE_INTEL
426/***
427 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
428 * @cpu: The CPU on which the event occurred.
429 * @status: Event status information
430 *
431 * This function should be called by the thermal interrupt after the
432 * event has been processed and the decision was made to log the event
433 * further.
434 *
435 * The status parameter will be saved to the 'status' field of 'struct mce'
436 * and historically has been the register value of the
437 * MSR_IA32_THERMAL_STATUS (Intel) msr.
438 */
439void mce_log_therm_throt_event(__u64 status)
440{
441 struct mce m;
442
443 mce_setup(&m);
444 m.bank = MCE_THERMAL_BANK;
445 m.status = status;
446 mce_log(&m);
447}
448#endif /* CONFIG_X86_MCE_INTEL */
449
450/*
451 * Periodic polling timer for "silent" machine check errors. If the
452 * poller finds an MCE, poll 2x faster. When the poller finds no more
453 * errors, poll 2x slower (up to check_interval seconds).
454 */
455
456static int check_interval = 5 * 60; /* 5 minutes */
457static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
458static void mcheck_timer(unsigned long);
459static DEFINE_PER_CPU(struct timer_list, mce_timer);
460
461static void mcheck_timer(unsigned long data)
462{
463 struct timer_list *t = &per_cpu(mce_timer, data);
464 int *n;
465
466 WARN_ON(smp_processor_id() != data);
467
468 if (mce_available(&current_cpu_data))
469 machine_check_poll(MCP_TIMESTAMP,
470 &__get_cpu_var(mce_poll_banks));
471
472 /*
473 * Alert userspace if needed. If we logged an MCE, reduce the
474 * polling interval, otherwise increase the polling interval.
475 */
476 n = &__get_cpu_var(next_interval);
477 if (mce_notify_user()) {
478 *n = max(*n/2, HZ/100);
479 } else {
480 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
481 }
482
483 t->expires = jiffies + *n;
484 add_timer(t);
485}
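/*
 * Worked example of the adaptation above, assuming HZ=1000 and the default
 * check_interval of 300 seconds: the timer starts at *n = 300000 jiffies;
 * every interval in which mce_notify_user() saw an event halves *n (down
 * to the HZ/100 = 10 jiffy floor), and every quiet interval doubles it
 * back up toward round_jiffies_relative(300 * HZ).
 */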
486
487static void mce_do_trigger(struct work_struct *work)
488{
489 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
490}
491
492static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
493
494/*
495 * Notify the user(s) about new machine check events.
496 * Can be called from interrupt context, but not from machine check/NMI
497 * context.
498 */
499int mce_notify_user(void)
500{
501 /* Not more than two messages every minute */
502 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
503
504 clear_thread_flag(TIF_MCE_NOTIFY);
505 if (test_and_clear_bit(0, &notify_user)) {
506 wake_up_interruptible(&mce_wait);
507
508 /*
509 * There is no risk of missing notifications because
510 * work_pending is always cleared before the function is
511 * executed.
512 */
513 if (trigger[0] && !work_pending(&mce_trigger_work))
514 schedule_work(&mce_trigger_work);
515
516 if (__ratelimit(&ratelimit))
517 printk(KERN_INFO "Machine check events logged\n");
518
519 return 1;
520 }
521 return 0;
522}
523
524/* see if the idle task needs to notify userspace */
525static int
526mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
527{
528 /* IDLE_END should be safe - interrupts are back on */
529 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
530 mce_notify_user();
531
532 return NOTIFY_OK;
533}
534
535static struct notifier_block mce_idle_notifier = {
536 .notifier_call = mce_idle_callback,
537};
538
539static __init int periodic_mcheck_init(void)
540{
541 idle_notifier_register(&mce_idle_notifier);
542 return 0;
543}
544__initcall(periodic_mcheck_init);
545
546/*
547 * Initialize Machine Checks for a CPU.
548 */
549static int mce_cap_init(void)
550{
551 u64 cap;
552 unsigned b;
553
554 rdmsrl(MSR_IA32_MCG_CAP, cap);
555 b = cap & 0xff;
556 if (b > MAX_NR_BANKS) {
557 printk(KERN_WARNING
558 "MCE: Using only %u machine check banks out of %u\n",
559 MAX_NR_BANKS, b);
560 b = MAX_NR_BANKS;
561 }
562
563 /* Don't support asymmetric configurations today */
564 WARN_ON(banks != 0 && b != banks);
565 banks = b;
566 if (!bank) {
567 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
568 if (!bank)
569 return -ENOMEM;
570 memset(bank, 0xff, banks * sizeof(u64));
571 }
572
573 /* Use accurate RIP reporting if available. */
574 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
575 rip_msr = MSR_IA32_MCG_EIP;
576
577 return 0;
578}
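/*
 * The fields decoded above follow the architectural IA32_MCG_CAP layout:
 * bits 7:0 are the bank count, bit 9 (MCG_EXT_P) advertises the extended
 * state registers, and bits 23:16 give their count, which must reach 9
 * before the separate RIP MSR is trusted. Illustrative values: cap =
 * 0x90206 means 6 banks, MCG_EXT_P set and 9 extended registers, so
 * rip_msr becomes MSR_IA32_MCG_EIP; cap = 0x806 still means 6 banks but
 * fails the bit-9 test, so the RIP stays whatever the exception frame
 * supplies.
 */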
579
580static void mce_init(void *dummy)
581{
582 u64 cap;
583 int i;
584 mce_banks_t all_banks;
585
586 /*
587 * Log the machine checks left over from the previous reset.
588 */
589 bitmap_fill(all_banks, MAX_NR_BANKS);
590 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
591
592 set_in_cr4(X86_CR4_MCE);
593
594 rdmsrl(MSR_IA32_MCG_CAP, cap);
595 if (cap & MCG_CTL_P)
596 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
597
598 for (i = 0; i < banks; i++) {
599 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
600 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
601 }
602}
603
604/* Add per CPU specific workarounds here */
605static void mce_cpu_quirks(struct cpuinfo_x86 *c)
606{
607 /* This should be disabled by the BIOS, but isn't always */
608 if (c->x86_vendor == X86_VENDOR_AMD) {
609 if (c->x86 == 15 && banks > 4)
610 /* disable GART TBL walk error reporting, which trips off
611 incorrectly with the IOMMU & 3ware & Cerberus. */
612 clear_bit(10, (unsigned long *)&bank[4]);
613 if(c->x86 <= 17 && mce_bootlog < 0)
614 /* Lots of broken BIOS around that don't clear them
615 by default and leave crap in there. Don't log. */
616 mce_bootlog = 0;
617 }
618
619}
620
621static void mce_cpu_features(struct cpuinfo_x86 *c)
622{
623 switch (c->x86_vendor) {
624 case X86_VENDOR_INTEL:
625 mce_intel_feature_init(c);
626 break;
627 case X86_VENDOR_AMD:
628 mce_amd_feature_init(c);
629 break;
630 default:
631 break;
632 }
633}
634
635static void mce_init_timer(void)
636{
637 struct timer_list *t = &__get_cpu_var(mce_timer);
638 int *n = &__get_cpu_var(next_interval);
639
640 *n = check_interval * HZ;
641 if (!*n)
642 return;
643 setup_timer(t, mcheck_timer, smp_processor_id());
644 t->expires = round_jiffies(jiffies + *n);
645 add_timer(t);
646}
647
648/*
649 * Called for each booted CPU to set up machine checks.
650 * Must be called with preempt off.
651 */
652void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
653{
654 if (!mce_available(c))
655 return;
656
657 if (mce_cap_init() < 0) {
658 mce_dont_init = 1;
659 return;
660 }
661 mce_cpu_quirks(c);
662
663 mce_init(NULL);
664 mce_cpu_features(c);
665 mce_init_timer();
666}
667
668/*
669 * Character device to read and clear the MCE log.
670 */
671
672static DEFINE_SPINLOCK(mce_state_lock);
673static int open_count; /* #times opened */
674static int open_exclu; /* already open exclusive? */
675
676static int mce_open(struct inode *inode, struct file *file)
677{
678 lock_kernel();
679 spin_lock(&mce_state_lock);
680
681 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
682 spin_unlock(&mce_state_lock);
683 unlock_kernel();
684 return -EBUSY;
685 }
686
687 if (file->f_flags & O_EXCL)
688 open_exclu = 1;
689 open_count++;
690
691 spin_unlock(&mce_state_lock);
692 unlock_kernel();
693
694 return nonseekable_open(inode, file);
695}
696
697static int mce_release(struct inode *inode, struct file *file)
698{
699 spin_lock(&mce_state_lock);
700
701 open_count--;
702 open_exclu = 0;
703
704 spin_unlock(&mce_state_lock);
705
706 return 0;
707}
708
709static void collect_tscs(void *data)
710{
711 unsigned long *cpu_tsc = (unsigned long *)data;
712
713 rdtscll(cpu_tsc[smp_processor_id()]);
714}
715
716static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
717 loff_t *off)
718{
719 unsigned long *cpu_tsc;
720 static DEFINE_MUTEX(mce_read_mutex);
721 unsigned prev, next;
722 char __user *buf = ubuf;
723 int i, err;
724
725 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
726 if (!cpu_tsc)
727 return -ENOMEM;
728
729 mutex_lock(&mce_read_mutex);
730 next = rcu_dereference(mcelog.next);
731
732 /* Only supports full reads right now */
733 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
734 mutex_unlock(&mce_read_mutex);
735 kfree(cpu_tsc);
736 return -EINVAL;
737 }
738
739 err = 0;
740 prev = 0;
741 do {
742 for (i = prev; i < next; i++) {
743 unsigned long start = jiffies;
744
745 while (!mcelog.entry[i].finished) {
746 if (time_after_eq(jiffies, start + 2)) {
747 memset(mcelog.entry + i, 0,
748 sizeof(struct mce));
749 goto timeout;
750 }
751 cpu_relax();
752 }
753 smp_rmb();
754 err |= copy_to_user(buf, mcelog.entry + i,
755 sizeof(struct mce));
756 buf += sizeof(struct mce);
757timeout:
758 ;
759 }
760
761 memset(mcelog.entry + prev, 0,
762 (next - prev) * sizeof(struct mce));
763 prev = next;
764 next = cmpxchg(&mcelog.next, prev, 0);
765 } while (next != prev);
766
767 synchronize_sched();
768
769 /*
770 * Collect entries that were still getting written before the
771 * synchronize.
772 */
773 on_each_cpu(collect_tscs, cpu_tsc, 1);
774 for (i = next; i < MCE_LOG_LEN; i++) {
775 if (mcelog.entry[i].finished &&
776 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
777 err |= copy_to_user(buf, mcelog.entry+i,
778 sizeof(struct mce));
779 smp_rmb();
780 buf += sizeof(struct mce);
781 memset(&mcelog.entry[i], 0, sizeof(struct mce));
782 }
783 }
784 mutex_unlock(&mce_read_mutex);
785 kfree(cpu_tsc);
786 return err ? -EFAULT : buf - ubuf;
787}
788
789static unsigned int mce_poll(struct file *file, poll_table *wait)
790{
791 poll_wait(file, &mce_wait, wait);
792 if (rcu_dereference(mcelog.next))
793 return POLLIN | POLLRDNORM;
794 return 0;
795}
796
797static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
798{
799 int __user *p = (int __user *)arg;
800
801 if (!capable(CAP_SYS_ADMIN))
802 return -EPERM;
803 switch (cmd) {
804 case MCE_GET_RECORD_LEN:
805 return put_user(sizeof(struct mce), p);
806 case MCE_GET_LOG_LEN:
807 return put_user(MCE_LOG_LEN, p);
808 case MCE_GETCLEAR_FLAGS: {
809 unsigned flags;
810
811 do {
812 flags = mcelog.flags;
813 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
814 return put_user(flags, p);
815 }
816 default:
817 return -ENOTTY;
818 }
819}
820
821static const struct file_operations mce_chrdev_ops = {
822 .open = mce_open,
823 .release = mce_release,
824 .read = mce_read,
825 .poll = mce_poll,
826 .unlocked_ioctl = mce_ioctl,
827};
828
829static struct miscdevice mce_log_device = {
830 MISC_MCELOG_MINOR,
831 "mcelog",
832 &mce_chrdev_ops,
833};
834
835/*
836 * Old style boot options parsing. Only for compatibility.
837 */
838static int __init mcheck_disable(char *str)
839{
840 mce_dont_init = 1;
841 return 1;
842}
843
844/* mce=off disables machine check.
845 mce=TOLERANCELEVEL (number, see above)
846 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
847 mce=nobootlog Don't log MCEs from before booting. */
848static int __init mcheck_enable(char *str)
849{
850 if (!strcmp(str, "off"))
851 mce_dont_init = 1;
852 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
853 mce_bootlog = str[0] == 'b';
854 else if (isdigit(str[0]))
855 get_option(&str, &tolerant);
856 else
857 printk("mce= argument %s ignored. Please use /sys", str);
858 return 1;
859}
860
861__setup("nomce", mcheck_disable);
862__setup("mce=", mcheck_enable);
863
864/*
865 * Sysfs support
866 */
867
868/*
869 * Disable machine checks on suspend and shutdown. We can't really handle
870 * them later.
871 */
872static int mce_disable(void)
873{
874 int i;
875
876 for (i = 0; i < banks; i++)
877 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
878 return 0;
879}
880
881static int mce_suspend(struct sys_device *dev, pm_message_t state)
882{
883 return mce_disable();
884}
885
886static int mce_shutdown(struct sys_device *dev)
887{
888 return mce_disable();
889}
890
891/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
892 Only one CPU is active at this time, the others get readded later using
893 CPU hotplug. */
894static int mce_resume(struct sys_device *dev)
895{
896 mce_init(NULL);
897 mce_cpu_features(&current_cpu_data);
898 return 0;
899}
900
901static void mce_cpu_restart(void *data)
902{
903 del_timer_sync(&__get_cpu_var(mce_timer));
904 if (mce_available(&current_cpu_data))
905 mce_init(NULL);
906 mce_init_timer();
907}
908
909/* Reinit MCEs after user configuration changes */
910static void mce_restart(void)
911{
912 on_each_cpu(mce_cpu_restart, NULL, 1);
913}
914
915static struct sysdev_class mce_sysclass = {
916 .suspend = mce_suspend,
917 .shutdown = mce_shutdown,
918 .resume = mce_resume,
919 .name = "machinecheck",
920};
921
922DEFINE_PER_CPU(struct sys_device, device_mce);
923void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
924
925/* Why are there no generic functions for this? */
926#define ACCESSOR(name, var, start) \
927 static ssize_t show_ ## name(struct sys_device *s, \
928 struct sysdev_attribute *attr, \
929 char *buf) { \
930 return sprintf(buf, "%lx\n", (unsigned long)var); \
931 } \
932 static ssize_t set_ ## name(struct sys_device *s, \
933 struct sysdev_attribute *attr, \
934 const char *buf, size_t siz) { \
935 char *end; \
936 unsigned long new = simple_strtoul(buf, &end, 0); \
937 if (end == buf) return -EINVAL; \
938 var = new; \
939 start; \
940 return end-buf; \
941 } \
942 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
943
944static struct sysdev_attribute *bank_attrs;
945
946static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
947 char *buf)
948{
949 u64 b = bank[attr - bank_attrs];
950 return sprintf(buf, "%llx\n", b);
951}
952
953static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
954 const char *buf, size_t siz)
955{
956 char *end;
957 u64 new = simple_strtoull(buf, &end, 0);
958 if (end == buf)
959 return -EINVAL;
960 bank[attr - bank_attrs] = new;
961 mce_restart();
962 return end-buf;
963}
964
965static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
966 char *buf)
967{
968 strcpy(buf, trigger);
969 strcat(buf, "\n");
970 return strlen(trigger) + 1;
971}
972
973static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
974 const char *buf,size_t siz)
975{
976 char *p;
977 int len;
978 strncpy(trigger, buf, sizeof(trigger));
979 trigger[sizeof(trigger)-1] = 0;
980 len = strlen(trigger);
981 p = strchr(trigger, '\n');
982 if (*p) *p = 0;
983 return len;
984}
985
986static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
987static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
988ACCESSOR(check_interval,check_interval,mce_restart())
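/*
 * For reference, that single ACCESSOR() line expands to roughly the
 * following (whitespace added for readability):
 */
static ssize_t show_check_interval(struct sys_device *s,
				   struct sysdev_attribute *attr, char *buf)
{
	return sprintf(buf, "%lx\n", (unsigned long)check_interval);
}

static ssize_t set_check_interval(struct sys_device *s,
				  struct sysdev_attribute *attr,
				  const char *buf, size_t siz)
{
	char *end;
	unsigned long new = simple_strtoul(buf, &end, 0);

	if (end == buf)
		return -EINVAL;
	check_interval = new;
	mce_restart();
	return end - buf;
}

static SYSDEV_ATTR(check_interval, 0644, show_check_interval, set_check_interval);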
989static struct sysdev_attribute *mce_attributes[] = {
990 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
991 NULL
992};
993
994static cpumask_var_t mce_device_initialized;
995
996/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
997static __cpuinit int mce_create_device(unsigned int cpu)
998{
999 int err;
1000 int i;
1001
1002 if (!mce_available(&boot_cpu_data))
1003 return -EIO;
1004
1005 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1006 per_cpu(device_mce,cpu).id = cpu;
1007 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1008
1009 err = sysdev_register(&per_cpu(device_mce,cpu));
1010 if (err)
1011 return err;
1012
1013 for (i = 0; mce_attributes[i]; i++) {
1014 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1015 mce_attributes[i]);
1016 if (err)
1017 goto error;
1018 }
1019 for (i = 0; i < banks; i++) {
1020 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1021 &bank_attrs[i]);
1022 if (err)
1023 goto error2;
1024 }
1025 cpumask_set_cpu(cpu, mce_device_initialized);
1026
1027 return 0;
1028error2:
1029 while (--i >= 0) {
1030 sysdev_remove_file(&per_cpu(device_mce, cpu),
1031 &bank_attrs[i]);
1032 }
1033error:
1034 while (--i >= 0) {
1035 sysdev_remove_file(&per_cpu(device_mce,cpu),
1036 mce_attributes[i]);
1037 }
1038 sysdev_unregister(&per_cpu(device_mce,cpu));
1039
1040 return err;
1041}
1042
1043static __cpuinit void mce_remove_device(unsigned int cpu)
1044{
1045 int i;
1046
1047 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1048 return;
1049
1050 for (i = 0; mce_attributes[i]; i++)
1051 sysdev_remove_file(&per_cpu(device_mce,cpu),
1052 mce_attributes[i]);
1053 for (i = 0; i < banks; i++)
1054 sysdev_remove_file(&per_cpu(device_mce, cpu),
1055 &bank_attrs[i]);
1056 sysdev_unregister(&per_cpu(device_mce,cpu));
1057 cpumask_clear_cpu(cpu, mce_device_initialized);
1058}
1059
1060/* Make sure there are no machine checks on offlined CPUs. */
1061static void mce_disable_cpu(void *h)
1062{
1063 int i;
1064 unsigned long action = *(unsigned long *)h;
1065
1066 if (!mce_available(&current_cpu_data))
1067 return;
1068 if (!(action & CPU_TASKS_FROZEN))
1069 cmci_clear();
1070 for (i = 0; i < banks; i++)
1071 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1072}
1073
1074static void mce_reenable_cpu(void *h)
1075{
1076 int i;
1077 unsigned long action = *(unsigned long *)h;
1078
1079 if (!mce_available(&current_cpu_data))
1080 return;
1081 if (!(action & CPU_TASKS_FROZEN))
1082 cmci_reenable();
1083 for (i = 0; i < banks; i++)
1084 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1085}
1086
1087/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1088static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1089 unsigned long action, void *hcpu)
1090{
1091 unsigned int cpu = (unsigned long)hcpu;
1092 struct timer_list *t = &per_cpu(mce_timer, cpu);
1093
1094 switch (action) {
1095 case CPU_ONLINE:
1096 case CPU_ONLINE_FROZEN:
1097 mce_create_device(cpu);
1098 if (threshold_cpu_callback)
1099 threshold_cpu_callback(action, cpu);
1100 break;
1101 case CPU_DEAD:
1102 case CPU_DEAD_FROZEN:
1103 if (threshold_cpu_callback)
1104 threshold_cpu_callback(action, cpu);
1105 mce_remove_device(cpu);
1106 break;
1107 case CPU_DOWN_PREPARE:
1108 case CPU_DOWN_PREPARE_FROZEN:
1109 del_timer_sync(t);
1110 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1111 break;
1112 case CPU_DOWN_FAILED:
1113 case CPU_DOWN_FAILED_FROZEN:
1114 t->expires = round_jiffies(jiffies +
1115 __get_cpu_var(next_interval));
1116 add_timer_on(t, cpu);
1117 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1118 break;
1119 case CPU_POST_DEAD:
1120 /* intentionally ignoring frozen here */
1121 cmci_rediscover(cpu);
1122 break;
1123 }
1124 return NOTIFY_OK;
1125}
1126
1127static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1128 .notifier_call = mce_cpu_callback,
1129};
1130
1131static __init int mce_init_banks(void)
1132{
1133 int i;
1134
1135 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1136 GFP_KERNEL);
1137 if (!bank_attrs)
1138 return -ENOMEM;
1139
1140 for (i = 0; i < banks; i++) {
1141 struct sysdev_attribute *a = &bank_attrs[i];
1142 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1143 if (!a->attr.name)
1144 goto nomem;
1145 a->attr.mode = 0644;
1146 a->show = show_bank;
1147 a->store = set_bank;
1148 }
1149 return 0;
1150
1151nomem:
1152 while (--i >= 0)
1153 kfree(bank_attrs[i].attr.name);
1154 kfree(bank_attrs);
1155 bank_attrs = NULL;
1156 return -ENOMEM;
1157}
1158
1159static __init int mce_init_device(void)
1160{
1161 int err;
1162 int i = 0;
1163
1164 if (!mce_available(&boot_cpu_data))
1165 return -EIO;
1166
1167 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1168
1169 err = mce_init_banks();
1170 if (err)
1171 return err;
1172
1173 err = sysdev_class_register(&mce_sysclass);
1174 if (err)
1175 return err;
1176
1177 for_each_online_cpu(i) {
1178 err = mce_create_device(i);
1179 if (err)
1180 return err;
1181 }
1182
1183 register_hotcpu_notifier(&mce_cpu_notifier);
1184 misc_register(&mce_log_device);
1185 return err;
1186}
1187
1188device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
 	NULL
 };
 
 #define to_block(k)	container_of(k, struct threshold_block, kobj)
 #define to_attr(a)	container_of(a, struct threshold_attr, attr)
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
+
 	ret = a->show ? a->show(b, buf) : -EIO;
+
 	return ret;
 }
 
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
 	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
+
 	ret = a->store ? a->store(b, buf, count) : -EIO;
+
 	return ret;
 }
 
 static struct sysfs_ops threshold_ops = {
 	.show			= show,
 	.store			= store,
 };
 
 static struct kobj_type threshold_ktype = {
 	.sysfs_ops		= &threshold_ops,
 	.default_attrs		= default_attrs,
 };
 
 static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
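The show()/store() wrappers above are the standard sysfs glue: the kobject core hands them only a bare struct kobject / struct attribute pair, and container_of() walks back to the enclosing objects. The same pattern in isolation, with hypothetical names to keep it self-contained:

        struct my_obj {
                int             value;
                struct kobject  kobj;   /* embedded object, not a pointer */
        };

        static ssize_t my_show(struct kobject *kobj, struct attribute *attr,
                               char *buf)
        {
                /* Recover the containing object from its embedded member: */
                struct my_obj *o = container_of(kobj, struct my_obj, kobj);

                return sprintf(buf, "%d\n", o->value);
        }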
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 					       unsigned int block,
 					       u32 address)
 {
-	int err;
-	u32 low, high;
 	struct threshold_block *b = NULL;
+	u32 low, high;
+	int err;
 
 	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
 		return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	if (!b)
 		return -ENOMEM;
 
 	b->block		= block;
 	b->bank			= bank;
 	b->cpu			= cpu;
 	b->address		= address;
 	b->interrupt_enable	= 0;
 	b->threshold_limit	= THRESHOLD_MAX;
 
 	INIT_LIST_HEAD(&b->miscj);
 
-	if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
 		list_add(&b->miscj,
 			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-	else
+	} else {
 		per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+	}
 
 	err = kobject_init_and_add(&b->kobj, &threshold_ktype,
 				   per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
 		if (!address)
 			return 0;
 		address += MCG_XBLK_ADDR;
-	} else
+	} else {
 		++address;
+	}
 
 	err = allocate_threshold_blocks(cpu, bank, ++block, address);
 	if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
 
 		cpumask_copy(b->cpus, cpu_core_mask(cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
+
 		goto out;
 	}
 #endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
 
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
-	int i = 0;
 	struct threshold_bank *b;
 	char name[32];
+	int i = 0;
 
 	b = per_cpu(threshold_banks, cpu)[bank];
-
 	if (!b)
 		return;
-
 	if (!b->blocks)
 		goto free_out;
 
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
+
 		return;
 	}
 #endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
 }
 
 /* get notified when a cpu comes on/off */
-static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
-						    unsigned int cpu)
+static void __cpuinit
+amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
 {
-	if (cpu >= NR_CPUS)
-		return;
-
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
 	/* to hit CPUs online before the notifier is up */
 	for_each_online_cpu(lcpu) {
 		int err = threshold_create_device(lcpu);
+
 		if (err)
 			return err;
 	}
 	threshold_cpu_callback = amd_64_threshold_cpu_callback;
+
 	return 0;
 }
-
 device_initcall(threshold_init_device);
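amd_64_threshold_cpu_callback() above keeps only the action dispatch; the explicit NR_CPUS bounds check could go because the notifier core never passes an invalid CPU number. The shape of the handler, as a sketch against the 2.6.30-era hotplug constants and the create/remove pair defined in this file:

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                threshold_create_device(cpu);   /* add the sysfs banks */
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                threshold_remove_device(cpu);   /* and tear them down */
                break;
        default:
                break;
        }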
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000000..2b011d2d8579
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
+/*
+ * Common code for Intel machine checks
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/therm_throt.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+	unsigned int cpu = smp_processor_id();
+	int tm2 = 0;
+	u32 l, h;
+
+	/* Thermal monitoring depends on ACPI and clock modulation */
+	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+		return;
+
+	/*
+	 * First check if it's enabled already, in which case there might
+	 * be some SMM goo which handles it, so we can't even put a handler
+	 * since it might be delivered via SMI already:
+	 */
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	h = apic_read(APIC_LVTTHMR);
+	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+		return;
+	}
+
+	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
+		tm2 = 1;
+
+	/* Check whether a vector already exists */
+	if (h & APIC_VECTOR_MASK) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
+		       cpu, (h & APIC_VECTOR_MASK));
+		return;
+	}
+
+	/* We'll mask the thermal vector in the lapic till we're ready: */
+	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+	apic_write(APIC_LVTTHMR, h);
+
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT,
+	      l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+	intel_set_thermal_handler();
+
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+	/* Unmask the thermal vector: */
+	l = apic_read(APIC_LVTTHMR);
+	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+	       cpu, tm2 ? "TM2" : "TM1");
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
+}
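Most of intel_init_thermal() is read-modify-write cycles on MSRs: rdmsr() splits the 64-bit register into 32-bit low/high halves, the wanted bits are OR-ed into the low half, and wrmsr() writes both halves back otherwise unchanged. The core idiom on its own:

        u32 l, h;

        rdmsr(MSR_IA32_MISC_ENABLE, l, h);      /* read current value   */
        l |= MSR_IA32_MISC_ENABLE_TM1;          /* set the enable bit   */
        wrmsr(MSR_IA32_MISC_ENABLE, l, h);      /* write it back intact */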
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 65a0fceedcd7..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -16,6 +16,8 @@
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
 
+#include "mce.h"
+
 asmlinkage void smp_thermal_interrupt(void)
 {
 	__u64 msr_val;
@@ -26,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
 	irq_enter();
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-	if (therm_throt_process(msr_val & 1))
+	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
 		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();
 }
 
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
-	u32 l, h;
-	int tm2 = 0;
-	unsigned int cpu = smp_processor_id();
-
-	if (!cpu_has(c, X86_FEATURE_ACPI))
-		return;
-
-	if (!cpu_has(c, X86_FEATURE_ACC))
-		return;
-
-	/* first check if TM1 is already enabled by the BIOS, in which
-	 * case there might be some SMM goo which handles it, so we can't even
-	 * put a handler since it might be delivered via SMI already.
-	 */
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	h = apic_read(APIC_LVTTHMR);
-	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
-		return;
-	}
-
-	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
-		tm2 = 1;
-
-	if (h & APIC_VECTOR_MASK) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal LVT vector (%#x) already "
-		       "installed\n", cpu, (h & APIC_VECTOR_MASK));
-		return;
-	}
-
-	h = THERMAL_APIC_VECTOR;
-	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
-	apic_write(APIC_LVTTHMR, h);
-
-	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
-	l = apic_read(APIC_LVTTHMR);
-	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
-	       cpu, tm2 ? "TM2" : "TM1");
-
-	/* enable thermal throttle processing */
-	atomic_set(&therm_throt_en, 1);
-	return;
-}
-
 /*
  * Support for Intel Corrected Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
@@ -108,6 +56,9 @@ static int cmci_supported(int *banks)
 {
 	u64 cap;
 
+	if (mce_cmci_disabled || mce_ignore_ce)
+		return 0;
+
 	/*
 	 * Vendor check is not strictly needed, but the initial
 	 * initialization is vendor keyed and this
@@ -131,7 +82,7 @@
 static void intel_threshold_interrupt(void)
 {
 	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
-	mce_notify_user();
+	mce_notify_irq();
 }
 
 static void print_update(char *type, int *hdr, int num)
@@ -247,7 +198,7 @@ void cmci_rediscover(int dying)
 		return;
 	cpumask_copy(old, &current->cpus_allowed);
 
-	for_each_online_cpu (cpu) {
+	for_each_online_cpu(cpu) {
 		if (cpu == dying)
 			continue;
 		if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
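The new early returns in cmci_supported() let the boot parameters behind mce_cmci_disabled / mce_ignore_ce switch CMCI off before any capability probing. A sketch of the full probe shape (abridged; it assumes MCG_CMCI_P is the CMCI capability bit in IA32_MCG_CAP, as used elsewhere in this series):

        static int cmci_supported(int *banks)
        {
                u64 cap;

                if (mce_cmci_disabled || mce_ignore_ce)
                        return 0;               /* administratively off */

                rdmsrl(MSR_IA32_MCG_CAP, cap);
                *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);

                return !!(cap & MCG_CMCI_P);    /* hardware capability bit */
        }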
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..70b710420f74 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
  * This file contains routines to check for non-fatal MCEs every 15s
  *
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/workqueue.h>
 #include <linux/interrupt.h>
-#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -22,9 +21,9 @@
 
 #include "mce.h"
 
 static int firstbank;
 
-#define MCE_RATE 15*HZ /* timer rate is 15s */
+#define MCE_RATE	(15*HZ)	/* timer rate is 15s */
 
 static void mce_checkregs(void *info)
 {
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
 	for (i = firstbank; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
 
-		if (high & (1<<31)) {
-			printk(KERN_INFO "MCE: The hardware reports a non "
-				"fatal, correctable incident occurred on "
-				"CPU %d.\n",
-				smp_processor_id());
-			printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
-			/*
-			 * Scrub the error so we don't pick it up in MCE_RATE
-			 * seconds time.
-			 */
-			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
-			/* Serialize */
-			wmb();
-			add_taint(TAINT_MACHINE_CHECK);
-		}
+		if (!(high & (1<<31)))
+			continue;
+
+		printk(KERN_INFO "MCE: The hardware reports a non fatal, "
+			"correctable incident occurred on CPU %d.\n",
+			smp_processor_id());
+
+		printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+
+		/*
+		 * Scrub the error so we don't pick it up in MCE_RATE
+		 * seconds time:
+		 */
+		wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+
+		/* Serialize: */
+		wmb();
+		add_taint(TAINT_MACHINE_CHECK);
 	}
 }
 
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
 
 	/* Some Athlons misbehave when we frob bank 0 */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
 	    boot_cpu_data.x86 == 6)
 		firstbank = 1;
 	else
 		firstbank = 0;
 
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
 	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
 	printk(KERN_INFO "Machine check exception polling timer started.\n");
+
 	return 0;
 }
 module_init(init_nonfatal_mce_checker);
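init_nonfatal_mce_checker() only arms the first poll; the work handler (not visible in these hunks) re-queues itself so the scan repeats every MCE_RATE. A sketch of that self-rearming pattern, using the names from this file:

        static void mce_work_fn(struct work_struct *work);
        static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);

        static void mce_work_fn(struct work_struct *work)
        {
                /* Scan the banks on every CPU, then re-arm the timer: */
                on_each_cpu(mce_checkregs, NULL, 1);
                schedule_delayed_work(&mce_work,
                                      round_jiffies_relative(MCE_RATE));
        }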
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..82cee108a2d3 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
  * P4 specific Machine Check Exception Reporting
  */
 
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
+#include <asm/therm_throt.h>
 #include <asm/processor.h>
 #include <asm/system.h>
-#include <asm/msr.h>
 #include <asm/apic.h>
-
-#include <asm/therm_throt.h>
+#include <asm/msr.h>
 
 #include "mce.h"
 
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
+
 static void unexpected_thermal_interrupt(struct pt_regs *regs)
 {
 	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
 	add_taint(TAINT_MACHINE_CHECK);
 }
 
-/* P4/Xeon Thermal transition interrupt handler */
+/* P4/Xeon Thermal transition interrupt handler: */
 static void intel_thermal_interrupt(struct pt_regs *regs)
 {
 	__u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
 	ack_APIC_irq();
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-	therm_throt_process(msr_val & 0x1);
+	therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
 }
 
-/* Thermal interrupt handler for this CPU setup */
-static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
+/* Thermal interrupt handler for this CPU setup: */
+static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
+						unexpected_thermal_interrupt;
 
 void smp_thermal_interrupt(struct pt_regs *regs)
 {
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/* P4/Xeon Thermal regulation detect and init */
-static void intel_init_thermal(struct cpuinfo_x86 *c)
+void intel_set_thermal_handler(void)
 {
-	u32 l, h;
-	unsigned int cpu = smp_processor_id();
-
-	/* Thermal monitoring */
-	if (!cpu_has(c, X86_FEATURE_ACPI))
-		return;	/* -ENODEV */
-
-	/* Clock modulation */
-	if (!cpu_has(c, X86_FEATURE_ACC))
-		return;	/* -ENODEV */
-
-	/* first check if its enabled already, in which case there might
-	 * be some SMM goo which handles it, so we can't even put a handler
-	 * since it might be delivered via SMI already -zwanem.
-	 */
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	h = apic_read(APIC_LVTTHMR);
-	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
-				cpu);
-		return;	/* -EBUSY */
-	}
-
-	/* check whether a vector already exists, temporarily masked? */
-	if (h & APIC_VECTOR_MASK) {
-		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
-				"installed\n",
-			cpu, (h & APIC_VECTOR_MASK));
-		return;	/* -EBUSY */
-	}
-
-	/* The temperature transition interrupt handler setup */
-	h = THERMAL_APIC_VECTOR;	/* our delivery vector */
-	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
-	apic_write(APIC_LVTTHMR, h);
-
-	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
-
-	/* ok we're good to go... */
 	vendor_thermal_interrupt = intel_thermal_interrupt;
-
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
-	l = apic_read(APIC_LVTTHMR);
-	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
-
-	/* enable thermal throttle processing */
-	atomic_set(&therm_throt_en, 1);
-	return;
 }
-#endif /* CONFIG_X86_MCE_P4THERMAL */
 
+#endif /* CONFIG_X86_MCE_P4THERMAL */
 
 /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 {
 	u32 h;
 
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 
 static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 
 	if (mce_num_extended_msrs > 0) {
 		struct intel_mce_extended_msrs dbg;
+
 		intel_get_extended_msrs(&dbg);
+
 		printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
 			"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
 			"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
+
 			misc[0] = addr[0] = '\0';
 			if (high & (1<<29))
 				recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 		panic("Unable to continue");
 
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	/*
 	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable. This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
-
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
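The thermal block above now exports only intel_set_thermal_handler(), which retargets the vendor_thermal_interrupt pointer; the common smp_thermal_interrupt() entry point dispatches through it, so CPUs without the Intel setup fall back to the "unexpected" handler. The indirection in miniature, as it stands in this file:

        static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
                                                unexpected_thermal_interrupt;

        void smp_thermal_interrupt(struct pt_regs *regs)
        {
                irq_enter();
                vendor_thermal_interrupt(regs); /* default or Intel handler */
                irq_exit();
        }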
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..015f481ab1b0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
  * P5 specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -15,39 +14,58 @@
 
 #include "mce.h"
 
-/* Machine check handler for Pentium class Intel */
+/* By default disabled */
+int mce_p5_enable;
+
+/* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
 	u32 loaddr, hi, lotype;
+
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
-	printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
-	if (lotype&(1<<5))
-		printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
+
+	printk(KERN_EMERG
+		"CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+		smp_processor_id(), loaddr, lotype);
+
+	if (lotype & (1<<5)) {
+		printk(KERN_EMERG
+			"CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+			smp_processor_id());
+	}
+
 	add_taint(TAINT_MACHINE_CHECK);
 }
 
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 
-	/*Check for MCE support */
+	/* Check for MCE support: */
 	if (!cpu_has(c, X86_FEATURE_MCE))
 		return;
 
-	/* Default P5 to off as its often misconnected */
+#ifdef CONFIG_X86_OLD_MCE
+	/* Default P5 to off as it's often misconnected: */
 	if (mce_disabled != -1)
 		return;
+#endif
+
 	machine_check_vector = pentium_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
-	/* Read registers before enabling */
+	/* Read registers before enabling: */
 	rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
 	rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-	printk(KERN_INFO "Intel old style machine check architecture supported.\n");
+	printk(KERN_INFO
+	       "Intel old style machine check architecture supported.\n");
 
-	/* Enable MCE */
+	/* Enable MCE: */
 	set_in_cr4(X86_CR4_MCE);
-	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+	printk(KERN_INFO
+	       "Intel old style machine check reporting enabled on CPU#%d.\n",
+	       smp_processor_id());
 }
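The comment added before wmb() here (and in p6.c and winchip.c below) documents an ordering requirement: machine_check_vector must be globally visible before CR4.MCE is set, because a #MC exception can be delivered the moment that bit goes on. The publish-then-enable pattern:

        machine_check_vector = pentium_machine_check;
        /* Make sure the vector pointer is visible before we enable MCEs: */
        wmb();

        set_in_cr4(X86_CR4_MCE);        /* from here on, #MC can fire */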
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..43c24e667457 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
  * P6 specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -18,9 +17,9 @@
 /* Machine Check Handler For PII/PIII */
 static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	if (high & (1<<31)) {
 		char misc[20];
 		char addr[24];
-		misc[0] = addr[0] = '\0';
+
+		misc[0] = '\0';
+		addr[0] = '\0';
+
 		if (high & (1<<29))
 			recover |= 1;
 		if (high & (1<<25))
 			recover |= 2;
 		high &= ~(1<<31);
+
 		if (high & (1<<27)) {
 			rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 			snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 			rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 			snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 		}
+
 		printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 			smp_processor_id(), i, high, low, misc, addr);
 	}
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	/*
 	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable. This will allow BIOS to look at the MSRs
-	 * for errors if the OS could not log the error.
+	 * for errors if the OS could not log the error:
 	 */
 	for (i = 0; i < nr_mce_banks; i++) {
 		unsigned int msr;
+
 		msr = MSR_IA32_MC0_STATUS+i*4;
 		rdmsr(msr, low, high);
 		if (high & (1<<31)) {
-			/* Clear it */
+			/* Clear it: */
 			wrmsr(msr, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 
 	/* Ok machine check is available */
 	machine_check_vector = intel_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..7b1ae2e20ba5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
 /*
- *
  * Thermal throttle event support code (such as syslog messaging and rate
  * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
  * This allows consistent reporting of CPU thermal throttle events.
  *
  * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
  * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
  * Inspired by Ross Biro's and Al Borchers' counter code.
  */
-
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
-#include <asm/cpu.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
+
 #include <asm/therm_throt.h>
 
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
 
 static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+atomic_t therm_throt_en		= ATOMIC_INIT(0);
 
 #ifdef CONFIG_SYSFS
 #define define_therm_throt_sysdev_one_ro(_name)				\
 	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
 
 #define define_therm_throt_sysdev_show_func(name)			\
 static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,	\
 					struct sysdev_attribute *attr,	\
 					char *buf)			\
 {									\
 	unsigned int cpu = dev->id;					\
 	ssize_t ret;							\
 									\
 	preempt_disable();	/* CPU hotplug */			\
 	if (cpu_online(cpu))						\
 		ret = sprintf(buf, "%lu\n",				\
 			      per_cpu(thermal_throttle_##name, cpu));	\
 	else								\
 		ret = 0;						\
 	preempt_enable();						\
 									\
 	return ret;							\
 }
 
 define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
 };
 
 static struct attribute_group thermal_throttle_attr_group = {
-	.attrs = thermal_throttle_attrs,
-	.name = "thermal_throttle"
+	.attrs	= thermal_throttle_attrs,
+	.name	= "thermal_throttle"
 };
 #endif /* CONFIG_SYSFS */
 
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
 }
 
 #ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device */
+/* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 {
-	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return sysfs_create_group(&sys_dev->kobj,
+				  &thermal_throttle_attr_group);
 }
 
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
 	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
 }
 
-/* Mutex protecting device creation against CPU hotplug */
+/* Mutex protecting device creation against CPU hotplug: */
 static DEFINE_MUTEX(therm_cpu_lock);
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
-						   unsigned long action,
-						   void *hcpu)
+static __cpuinit int
+thermal_throttle_cpu_callback(struct notifier_block *nfb,
+			      unsigned long action,
+			      void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct sys_device *sys_dev;
 	int err = 0;
 
 	sys_dev = get_cpu_sysdev(cpu);
+
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
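therm_throt_process() itself is outside these hunks; it rate-limits the log spam using the per-CPU next_check stamp and CHECK_INTERVAL defined above. A sketch of that jiffies window check, hedged to the variable names in this file:

        if (time_before64(get_jiffies_64(), __get_cpu_var(next_check)))
                return 0;               /* still inside the quiet window */

        __get_cpu_var(next_check) = get_jiffies_64() + CHECK_INTERVAL;
        printk(KERN_CRIT "CPU%d: Temperature above threshold\n",
               smp_processor_id());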
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
-asmlinkage void mce_threshold_interrupt(void)
+asmlinkage void smp_threshold_interrupt(void)
 {
 	exit_idle();
 	irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..81b02487090b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
  * IDT Winchip specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -14,7 +13,7 @@
 
 #include "mce.h"
 
-/* Machine check handler for WinChip C6 */
+/* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
 void winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 lo, hi;
+
 	machine_check_vector = winchip_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
+
 	rdmsr(MSR_IDT_FCR1, lo, hi);
 	lo |= (1<<2);	/* Enable EIERRINT (int 18 MCE) */
 	lo &= ~(1<<4);	/* Enable MCE */
 	wrmsr(MSR_IDT_FCR1, lo, hi);
+
 	set_in_cr4(X86_CR4_MCE);
-	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+
+	printk(KERN_INFO
+	      "Winchip machine check reporting enabled on CPU#0.\n");
 }
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb3..b07af8861244 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
 	.notifier_call = cpuid_class_cpu_callback,
 };
 
+static char *cpuid_nodename(struct device *dev)
+{
+	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+}
+
 static int __init cpuid_init(void)
 {
 	int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
 		err = PTR_ERR(cpuid_class);
 		goto out_chrdev;
 	}
+	cpuid_class->nodename = cpuid_nodename;
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
 		if (err != 0)
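The nodename hook tells the driver core (and udev) where the device node belongs in the /dev hierarchy; kasprintf() allocates the path string, which the caller frees per the nodename contract. A sketch of the effect, under the 2.6.30-era struct class API assumed by this hunk:

        static char *cpuid_nodename(struct device *dev)
        {
                /* e.g. minor 0 -> "cpu/0/cpuid", minor 1 -> "cpu/1/cpuid" */
                return kasprintf(GFP_KERNEL, "cpu/%u/cpuid",
                                 MINOR(dev->devt));
        }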
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a4742a340d8d..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -963,6 +963,8 @@ END(\sym)
 #ifdef CONFIG_SMP
 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
 	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt REBOOT_VECTOR \
+	reboot_interrupt smp_reboot_interrupt
 #endif
 
 #ifdef CONFIG_X86_UV
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
 #endif
 
 apicinterrupt THRESHOLD_APIC_VECTOR \
-	threshold_interrupt mce_threshold_interrupt
+	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
+#ifdef CONFIG_X86_MCE
+apicinterrupt MCE_SELF_VECTOR \
+	mce_self_interrupt smp_mce_self_interrupt
+#endif
+
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
@@ -1379,7 +1386,7 @@ errorentry xen_stack_segment do_stack_segment
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
 #ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check do_machine_check
+paranoidzeroentry machine_check *machine_check_vector(%rip)
 #endif
 
 	/*
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d4..5cf36c053ac4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
 #include <linux/spinlock.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
+#include <linux/timex.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269beab..270ff83efc11 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
 
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-struct mm_struct init_mm = INIT_MM(init_mm);
 
 /*
  * Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 38287b5f116e..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/mce.h>
 #include <asm/hw_irq.h>
 
 atomic_t irq_err_count;
@@ -96,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
 	seq_printf(p, "  Thermal event interrupts\n");
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
+#ifdef CONFIG_X86_NEW_MCE
+	seq_printf(p, "%*s: ", prec, "MCE");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+	seq_printf(p, "  Machine check exceptions\n");
+	seq_printf(p, "%*s: ", prec, "MCP");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+	seq_printf(p, "  Machine check polls\n");
+#endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
 	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -185,10 +196,14 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #endif
 #ifdef CONFIG_X86_MCE
 	sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	sum += irq_stats(cpu)->irq_threshold_count;
 # endif
 #endif
+#ifdef CONFIG_X86_NEW_MCE
+	sum += per_cpu(mce_exception_count, cpu);
+	sum += per_cpu(mce_poll_count, cpu);
+#endif
 	return sum;
 }
 
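mce_exception_count and mce_poll_count are plain per-CPU counters exported by the new MCE core; the /proc/interrupts code above simply reads them with per_cpu(). The two halves of that pattern, as a sketch (the DEFINE_PER_CPU side lives in mce.c, not in these hunks):

        DEFINE_PER_CPU(unsigned, mce_exception_count);

        /* writer: the CPU taking the machine-check exception */
        __get_cpu_var(mce_exception_count)++;

        /* reader: any CPU, as in arch_irq_stat_cpu() above */
        sum += per_cpu(mce_exception_count, cpu);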
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 267c6624c77f..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -173,6 +173,9 @@ static void __init smp_intr_init(void)
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+
+	/* IPI used for rebooting/stopping */
+	alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
 #endif
 #endif /* CONFIG_SMP */
 }
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9c4461501fcb..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = {
 static struct miscdevice microcode_dev = {
 	.minor		= MICROCODE_MINOR,
 	.name		= "microcode",
+	.devnode	= "cpu/microcode",
 	.fops		= &microcode_fops,
 };
 
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec626..98fd6cd4e3a4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
 	.notifier_call = msr_class_cpu_callback,
 };
 
+static char *msr_nodename(struct device *dev)
+{
+	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+}
+
 static int __init msr_init(void)
 {
 	int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
 		err = PTR_ERR(msr_class);
 		goto out_chrdev;
 	}
+	msr_class->nodename = msr_nodename;
 	for_each_online_cpu(i) {
 		err = msr_device_create(i);
 		if (err != 0)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3bb2be1649bd..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,7 +63,7 @@ void arch_task_cache_init(void)
 	task_xstate_cachep =
 		kmem_cache_create("task_xstate", xstate_size,
 				  __alignof__(union thread_xstate),
-				  SLAB_PANIC, NULL);
+				  SLAB_PANIC | SLAB_NOTRACK, NULL);
 }
 
 /*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0a813b17b172..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -24,11 +24,11 @@
 #include <asm/ucontext.h>
 #include <asm/i387.h>
 #include <asm/vdso.h>
+#include <asm/mce.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/mce.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
@@ -856,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#ifdef CONFIG_X86_NEW_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_user();
+		mce_notify_process();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 28f5fb495a66..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
  * this function calls the 'stop' function on all other CPUs in the system.
  */
 
+asmlinkage void smp_reboot_interrupt(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+	stop_this_cpu(NULL);
+	irq_exit();
+}
+
 static void native_smp_send_stop(void)
 {
 	unsigned long flags;
+	unsigned long wait;
 
 	if (reboot_force)
 		return;
 
-	smp_call_function(stop_this_cpu, NULL, 0);
+	/*
+	 * Use our own vector here because smp_call_function
+	 * does lots of things not suitable in a panic situation.
+	 * On most systems we could also use an NMI here,
+	 * but there are a few systems around where NMI
+	 * is problematic so stay with a non-NMI for now
+	 * (this implies we cannot stop CPUs spinning with irqs off
+	 * currently)
+	 */
+	if (num_online_cpus() > 1) {
+		apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+		/* Don't wait longer than a second */
+		wait = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && wait--)
+			udelay(1);
+	}
+
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
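For orientation: the REBOOT_VECTOR plumbing spans three hunks in this merge — the gate allocation in irqinit.c, the entry stub in entry_64.S, and the handler here. Collected in one place (the two comment lines quote the respective files):

        /* irqinit.c:  alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);  */
        /* entry_64.S: apicinterrupt REBOOT_VECTOR
                           reboot_interrupt smp_reboot_interrupt          */
        asmlinkage void smp_reboot_interrupt(void)
        {
                ack_APIC_irq();
                irq_enter();
                stop_this_cpu(NULL);    /* halts this CPU; does not return */
                irq_exit();
        }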
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
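save_stack_trace_bp() lets a caller start the unwind from a known frame pointer rather than from the current frame (the perf counter callchain code is the intended user). A usage sketch with a hypothetical caller-provided buffer:

        unsigned long entries[32];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
                .nr_entries     = 0,
                .skip           = 0,
        };

        /* Unwind starting at 'bp' instead of the current frame: */
        save_stack_trace_bp(&trace, bp);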
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 07d60c870ce2..5f935f0d5861 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
 #include <linux/edac.h>
 #endif
 
+#include <asm/kmemcheck.h>
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 
 	get_debugreg(condition, 6);
 
+	/* Catch kmemcheck conditions first of all! */
+	if (condition & DR_STEP && kmemcheck_trap(regs))
+		return;
+
 	/*
 	 * The processor cleared BTF, so don't mark that we need it set.
 	 */
@@ -798,15 +803,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
 
 	return new_kesp;
 }
-#else
+#endif
+
 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
 {
 }
 
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
-#endif
 
 /*
  * 'math_state_restore()' saves the current math information in the
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3e1c057e98fe..ae3180c506a6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
 #include <linux/delay.h>
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
+#include <linux/timex.h>
 
 #include <asm/hpet.h>
 #include <asm/timer.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32d6ae8fb60e..e770bf349ec4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	struct page *pages;
 	struct vmcs *vmcs;
 
-	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
+obj-$(CONFIG_KMEMCHECK)	+= kmemcheck/
+
 obj-$(CONFIG_MMIOTRACE)	+= mmiotrace.o
 mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..baa0e86adfbc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -14,6 +14,7 @@
 
 #include <asm/traps.h>		/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>	/* pgd_*(), ...			*/
+#include <asm/kmemcheck.h>	/* kmemcheck_*(), ...		*/
 
 /*
  * Page fault error code bits:
@@ -956,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	/* Get the faulting address: */
 	address = read_cr2();
 
+	/*
+	 * Detect and handle instructions that would cause a page fault for
+	 * both a tracked kernel page and a userspace page.
+	 */
+	if (kmemcheck_active(regs))
+		kmemcheck_hide(regs);
+
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
 
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-		    vmalloc_fault(address) >= 0)
-			return;
+		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+			if (vmalloc_fault(address) >= 0)
+				return;
+
+			if (kmemcheck_fault(regs, address, error_code))
+				return;
+		}
 
 		/* Can handle a stale RO->RW TLB: */
 		if (spurious_fault(error_code, address))
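The restructured test admits only faults with PF_RSVD, PF_USER and PF_PROT all clear, i.e. a kernel-mode access to a not-present page with sane page tables; those are the only faults that can be a lazily-synced vmalloc mapping or, with this series, a page kmemcheck has hidden. A one-line predicate expressing the same check (the function name is hypothetical, the PF_* bits are the ones defined earlier in fault.c):

/* Sketch: kernel-space faults worth offering to vmalloc_fault() and
 * kmemcheck_fault(), per the condition above. */
static bool fault_may_be_lazy_or_tracked(unsigned long error_code)
{
	/* kernel mode, page not present, no reserved-bit violation */
	return !(error_code & (PF_RSVD | PF_USER | PF_PROT));
}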
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..f53b57e4086f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -213,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		init_gbpages();
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
 	/*
 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ff3c0816d15..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	pte_t *page_table = NULL;
 
 	if (after_bootmem) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
 		page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
 		if (!page_table)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..9c543290a813 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void)
 	void *ptr;
 
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
 
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	void *adr;
 
 	if (after_bootmem) {
-		adr = (void *)get_zeroed_page(GFP_ATOMIC);
+		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 		*phys  = __pa(adr);
 
 		return adr;
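__GFP_NOTRACK tells kmemcheck to leave an allocation untracked. That matters here because these pages back page tables: if kmemcheck hid them, the fault handler would need the very pages it is trying to resolve. The same pattern works for any allocation that must stay out of kmemcheck's shadow bookkeeping (the helper name below is hypothetical):

#include <linux/gfp.h>

/* Allocate a zeroed page that kmemcheck will never hide or shadow. */
static unsigned long alloc_untracked_zeroed_page(void)
{
	return get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
}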
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+	KMEMCHECK_ERROR_INVALID_ACCESS,
+	KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+	enum kmemcheck_error_type type;
+
+	union {
+		/* KMEMCHECK_ERROR_INVALID_ACCESS */
+		struct {
+			/* Kind of access that caused the error */
+			enum kmemcheck_shadow state;
+			/* Address and size of the erroneous read */
+			unsigned long address;
+			unsigned int size;
+		};
+	};
+
+	struct pt_regs regs;
+	struct stack_trace trace;
+	unsigned long trace_entries[32];
+
+	/* We compress it to a char. */
+	unsigned char shadow_copy[SHADOW_COPY_SIZE];
+	unsigned char memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+	struct kmemcheck_error *e;
+
+	if (error_count == ARRAY_SIZE(error_fifo)) {
+		++error_missed_count;
+		return NULL;
+	}
+
+	e = &error_fifo[error_wr];
+	if (++error_wr == ARRAY_SIZE(error_fifo))
+		error_wr = 0;
+	++error_count;
+	return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+	struct kmemcheck_error *e;
+
+	if (error_count == 0)
+		return NULL;
+
+	e = &error_fifo[error_rd];
+	if (++error_rd == ARRAY_SIZE(error_fifo))
+		error_rd = 0;
+	--error_count;
+	return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+	static const char *desc[] = {
+		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated",
+		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized",
+		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized",
+		[KMEMCHECK_SHADOW_FREED]		= "freed",
+	};
+
+	static const char short_desc[] = {
+		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a',
+		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u',
+		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i',
+		[KMEMCHECK_SHADOW_FREED]		= 'f',
+	};
+
+	struct kmemcheck_error *e;
+	unsigned int i;
+
+	e = error_next_rd();
+	if (!e)
+		return;
+
+	switch (e->type) {
+	case KMEMCHECK_ERROR_INVALID_ACCESS:
+		printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
+			"from %s memory (%p)\n",
+			8 * e->size, e->state < ARRAY_SIZE(desc) ?
+				desc[e->state] : "(invalid shadow state)",
+			(void *) e->address);
+
+		printk(KERN_INFO);
+		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+			printk("%02x", e->memory_copy[i]);
+		printk("\n");
+
+		printk(KERN_INFO);
+		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+				printk(" %c", short_desc[e->shadow_copy[i]]);
+			else
+				printk(" ?");
+		}
+		printk("\n");
+		printk(KERN_INFO "%*c\n", 2 + 2
+			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+		break;
+	case KMEMCHECK_ERROR_BUG:
+		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+		break;
+	}
+
+	__show_regs(&e->regs, 1);
+	print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+	while (error_count > 0)
+		kmemcheck_error_recall();
+
+	if (error_missed_count > 0) {
+		printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+			"the queue was too small\n", error_missed_count);
+		error_missed_count = 0;
+	}
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+	unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+	static unsigned long prev_ip;
+
+	struct kmemcheck_error *e;
+	void *shadow_copy;
+	void *memory_copy;
+
+	/* Don't report several adjacent errors from the same EIP. */
+	if (regs->ip == prev_ip)
+		return;
+	prev_ip = regs->ip;
+
+	e = error_next_wr();
+	if (!e)
+		return;
+
+	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+	e->state = state;
+	e->address = address;
+	e->size = size;
+
+	/* Save regs */
+	memcpy(&e->regs, regs, sizeof(*regs));
+
+	/* Save stack trace */
+	e->trace.nr_entries = 0;
+	e->trace.entries = e->trace_entries;
+	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+	e->trace.skip = 0;
+	save_stack_trace_bp(&e->trace, regs->bp);
+
+	/* Round address down to nearest 16 bytes */
+	shadow_copy = kmemcheck_shadow_lookup(address
+		& ~(SHADOW_COPY_SIZE - 1));
+	BUG_ON(!shadow_copy);
+
+	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+	kmemcheck_show_addr(address);
+	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+	kmemcheck_hide_addr(address);
+
+	tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+	struct kmemcheck_error *e;
+
+	e = error_next_wr();
+	if (!e)
+		return;
+
+	e->type = KMEMCHECK_ERROR_BUG;
+
+	memcpy(&e->regs, regs, sizeof(*regs));
+
+	e->trace.nr_entries = 0;
+	e->trace.entries = e->trace_entries;
+	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+	e->trace.skip = 1;
+	save_stack_trace(&e->trace);
+
+	tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
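Since printk() from the trap context could recurse through the console drivers, the code above only records errors and lets a tasklet drain the queue later. A compact userspace model of the same fixed-size ring queue, assuming one producer and one consumer as in the file above (all names hypothetical):

#include <stdio.h>

#define FIFO_SIZE 4

static int fifo[FIFO_SIZE];
static unsigned int count, rd, wr, missed;

static int *next_wr(void)
{
	if (count == FIFO_SIZE) {	/* full: drop and remember the loss */
		++missed;
		return NULL;
	}
	int *slot = &fifo[wr];
	if (++wr == FIFO_SIZE)
		wr = 0;
	++count;
	return slot;
}

static int *next_rd(void)
{
	if (count == 0)
		return NULL;
	int *slot = &fifo[rd];
	if (++rd == FIFO_SIZE)
		rd = 0;
	--count;
	return slot;
}

int main(void)
{
	for (int i = 0; i < 6; ++i) {	/* the last two writes are lost */
		int *slot = next_wr();
		if (slot)
			*slot = i;
	}

	int *slot;
	while ((slot = next_rd()))
		printf("recalled %d\n", *slot);
	printf("missed %u\n", missed);
	return 0;
}

The read and write cursors plus a count are enough because only one CPU runs kmemcheck; no locking is needed between the trap path and the tasklet beyond the existing interrupt discipline.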
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+	unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008  Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+# define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+int __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Limit SMP to use a single CPU. We rely on the fact that this code
+	 * runs before SMP is set up.
+	 */
+	if (setup_max_cpus > 1) {
+		printk(KERN_INFO
+			"kmemcheck: Limiting number of CPUs to 1.\n");
+		setup_max_cpus = 1;
+	}
+#endif
+
+	if (!kmemcheck_selftest()) {
+		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+		kmemcheck_enabled = 0;
+		return -EINVAL;
+	}
+
+	printk(KERN_INFO "kmemcheck: Initialized\n");
+	return 0;
+}
+
+early_initcall(kmemcheck_init);
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	sscanf(str, "%d", &kmemcheck_enabled);
+	return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+	pte_t *pte;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return 0;
+
+	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+	__flush_tlb_one(address);
+	return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+	pte_t *pte;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return 0;
+
+	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+	__flush_tlb_one(address);
+	return 1;
+}
+
+struct kmemcheck_context {
+	bool busy;
+	int balance;
+
+	/*
+	 * There can be at most two memory operands to an instruction, but
+	 * each address can cross a page boundary -- so we may need up to
+	 * four addresses that must be hidden/revealed for each fault.
+	 */
+	unsigned long addr[4];
+	unsigned long n_addrs;
+	unsigned long flags;
+
+	/* Data size of the instruction that caused a fault. */
+	unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+	return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+	data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+	unsigned int i;
+	unsigned int n;
+
+	n = 0;
+	for (i = 0; i < data->n_addrs; ++i)
+		n += kmemcheck_show_addr(data->addr[i]);
+
+	return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+	unsigned int i;
+	unsigned int n;
+
+	n = 0;
+	for (i = 0; i < data->n_addrs; ++i)
+		n += kmemcheck_hide_addr(data->addr[i]);
+
+	return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+	BUG_ON(!irqs_disabled());
+
+	if (unlikely(data->balance != 0)) {
+		kmemcheck_show_all();
+		kmemcheck_error_save_bug(regs);
+		data->balance = 0;
+		return;
+	}
+
+	/*
+	 * None of the addresses actually belonged to kmemcheck. Note that
+	 * this is not an error.
+	 */
+	if (kmemcheck_show_all() == 0)
+		return;
+
+	++data->balance;
+
+	/*
+	 * The IF needs to be cleared as well, so that the faulting
+	 * instruction can run "uninterrupted". Otherwise, we might take
+	 * an interrupt and start executing that before we've had a chance
+	 * to hide the page again.
+	 *
+	 * NOTE: In the rare case of multiple faults, we must not override
+	 * the original flags:
+	 */
+	if (!(regs->flags & X86_EFLAGS_TF))
+		data->flags = regs->flags;
+
+	regs->flags |= X86_EFLAGS_TF;
+	regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+	int n;
+
+	BUG_ON(!irqs_disabled());
+
+	if (data->balance == 0)
+		return;
+
+	if (unlikely(data->balance != 1)) {
+		kmemcheck_show_all();
+		kmemcheck_error_save_bug(regs);
+		data->n_addrs = 0;
+		data->balance = 0;
+
+		if (!(data->flags & X86_EFLAGS_TF))
+			regs->flags &= ~X86_EFLAGS_TF;
+		if (data->flags & X86_EFLAGS_IF)
+			regs->flags |= X86_EFLAGS_IF;
+		return;
+	}
+
+	if (kmemcheck_enabled)
+		n = kmemcheck_hide_all();
+	else
+		n = kmemcheck_show_all();
+
+	if (n == 0)
+		return;
+
+	--data->balance;
+
+	data->n_addrs = 0;
+
+	if (!(data->flags & X86_EFLAGS_TF))
+		regs->flags &= ~X86_EFLAGS_TF;
+	if (data->flags & X86_EFLAGS_IF)
+		regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i) {
+		unsigned long address;
+		pte_t *pte;
+		unsigned int level;
+
+		address = (unsigned long) page_address(&p[i]);
+		pte = lookup_address(address, &level);
+		BUG_ON(!pte);
+		BUG_ON(level != PG_LEVEL_4K);
+
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+		__flush_tlb_one(address);
+	}
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+	/* This will also check the "hidden" flag of the PTE. */
+	return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i) {
+		unsigned long address;
+		pte_t *pte;
+		unsigned int level;
+
+		address = (unsigned long) page_address(&p[i]);
+		pte = lookup_address(address, &level);
+		BUG_ON(!pte);
+		BUG_ON(level != PG_LEVEL_4K);
+
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+		__flush_tlb_one(address);
+	}
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	void *shadow;
+	enum kmemcheck_shadow status;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (!shadow)
+		return;
+
+	kmemcheck_save_addr(addr);
+	status = kmemcheck_shadow_test(shadow, size);
+	if (status == KMEMCHECK_SHADOW_INITIALIZED)
+		return;
+
+	if (kmemcheck_enabled)
+		kmemcheck_error_save(status, addr, size, regs);
+
+	if (kmemcheck_enabled == 2)
+		kmemcheck_enabled = 0;
+
+	/* Don't warn about it again. */
+	kmemcheck_shadow_set(shadow, size);
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long next_addr = addr + size - 1;
+	unsigned long next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		kmemcheck_read_strict(regs, addr, size);
+		return;
+	}
+
+	/*
+	 * What we do is basically to split the access across the
+	 * two pages and handle each part separately. Yes, this means
+	 * that we may now see reads that are 3 + 5 bytes, for
+	 * example (and if both are uninitialized, there will be two
+	 * reports), but it makes the code a lot simpler.
+	 */
+	kmemcheck_read_strict(regs, addr, next_page - addr);
+	kmemcheck_read_strict(regs, next_page, next_addr - next_page + 1);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	void *shadow;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (!shadow)
+		return;
+
+	kmemcheck_save_addr(addr);
+	kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long next_addr = addr + size - 1;
+	unsigned long next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		kmemcheck_write_strict(regs, addr, size);
+		return;
+	}
+
+	/* See comment in kmemcheck_read(). */
+	kmemcheck_write_strict(regs, addr, next_page - addr);
+	kmemcheck_write_strict(regs, next_page, next_addr - next_page + 1);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+	uint8_t shadow[8];
+	enum kmemcheck_shadow status;
+
+	unsigned long page;
+	unsigned long next_addr;
+	unsigned long next_page;
+
+	uint8_t *x;
+	unsigned int i;
+	unsigned int n;
+
+	BUG_ON(size > sizeof(shadow));
+
+	page = src_addr & PAGE_MASK;
+	next_addr = src_addr + size - 1;
+	next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		/* Same page */
+		x = kmemcheck_shadow_lookup(src_addr);
+		if (x) {
+			kmemcheck_save_addr(src_addr);
+			for (i = 0; i < size; ++i)
+				shadow[i] = x[i];
+		} else {
+			for (i = 0; i < size; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+	} else {
+		n = next_page - src_addr;
+		BUG_ON(n > sizeof(shadow));
+
+		/* First page */
+		x = kmemcheck_shadow_lookup(src_addr);
+		if (x) {
+			kmemcheck_save_addr(src_addr);
+			for (i = 0; i < n; ++i)
+				shadow[i] = x[i];
+		} else {
+			/* Not tracked */
+			for (i = 0; i < n; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+
+		/* Second page */
+		x = kmemcheck_shadow_lookup(next_page);
+		if (x) {
+			kmemcheck_save_addr(next_page);
+			for (i = n; i < size; ++i)
+				shadow[i] = x[i - n];
+		} else {
+			/* Not tracked */
+			for (i = n; i < size; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+	}
+
+	page = dst_addr & PAGE_MASK;
+	next_addr = dst_addr + size - 1;
+	next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		/* Same page */
+		x = kmemcheck_shadow_lookup(dst_addr);
+		if (x) {
+			kmemcheck_save_addr(dst_addr);
+			for (i = 0; i < size; ++i) {
+				x[i] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+	} else {
+		n = next_page - dst_addr;
+		BUG_ON(n > sizeof(shadow));
+
+		/* First page */
+		x = kmemcheck_shadow_lookup(dst_addr);
+		if (x) {
+			kmemcheck_save_addr(dst_addr);
+			for (i = 0; i < n; ++i) {
+				x[i] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+
+		/* Second page */
+		x = kmemcheck_shadow_lookup(next_page);
+		if (x) {
+			kmemcheck_save_addr(next_page);
+			for (i = n; i < size; ++i) {
+				x[i - n] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+	}
+
+	status = kmemcheck_shadow_test(shadow, size);
+	if (status == KMEMCHECK_SHADOW_INITIALIZED)
+		return;
+
+	if (kmemcheck_enabled)
+		kmemcheck_error_save(status, src_addr, size, regs);
+
+	if (kmemcheck_enabled == 2)
+		kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+	KMEMCHECK_READ,
+	KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+	unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+	const uint8_t *insn;
+	const uint8_t *insn_primary;
+	unsigned int size;
+
+	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+	/* Recursive fault -- ouch. */
+	if (data->busy) {
+		kmemcheck_show_addr(fallback_address);
+		kmemcheck_error_save_bug(regs);
+		return;
+	}
+
+	data->busy = true;
+
+	insn = (const uint8_t *) regs->ip;
+	insn_primary = kmemcheck_opcode_get_primary(insn);
+
+	kmemcheck_opcode_decode(insn, &size);
+
+	switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+		/* AND, OR, XOR */
+		/*
+		 * Unfortunately, these instructions have to be excluded from
+		 * our regular checking since they access only some (and not
+		 * all) bits. This clears out "bogus" bitfield-access warnings.
+		 */
+	case 0x80:
+	case 0x81:
+	case 0x82:
+	case 0x83:
+		switch ((insn_primary[1] >> 3) & 7) {
+			/* OR */
+		case 1:
+			/* AND */
+		case 4:
+			/* XOR */
+		case 6:
+			kmemcheck_write(regs, fallback_address, size);
+			goto out;
+
+			/* ADD */
+		case 0:
+			/* ADC */
+		case 2:
+			/* SBB */
+		case 3:
+			/* SUB */
+		case 5:
+			/* CMP */
+		case 7:
+			break;
+		}
+		break;
+#endif
+
+		/* MOVS, MOVSB, MOVSW, MOVSD */
+	case 0xa4:
+	case 0xa5:
+		/*
+		 * These instructions are special because they take two
+		 * addresses, but we only get one page fault.
+		 */
+		kmemcheck_copy(regs, regs->si, regs->di, size);
+		goto out;
+
+		/* CMPS, CMPSB, CMPSW, CMPSD */
+	case 0xa6:
+	case 0xa7:
+		kmemcheck_read(regs, regs->si, size);
+		kmemcheck_read(regs, regs->di, size);
+		goto out;
+	}
+
+	/*
+	 * If the opcode isn't special in any way, we use the data from the
+	 * page fault handler to determine the address and type of memory
+	 * access.
+	 */
+	switch (fallback_method) {
+	case KMEMCHECK_READ:
+		kmemcheck_read(regs, fallback_address, size);
+		goto out;
+	case KMEMCHECK_WRITE:
+		kmemcheck_write(regs, fallback_address, size);
+		goto out;
+	}
+
+out:
+	data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+	unsigned long error_code)
+{
+	pte_t *pte;
+
+	/*
+	 * XXX: Is it safe to assume that memory accesses from virtual 86
+	 * mode or non-kernel code segments will _never_ access kernel
+	 * memory (e.g. tracked pages)? For now, we need this to avoid
+	 * invoking kmemcheck for PnP BIOS calls.
+	 */
+	if (regs->flags & X86_VM_MASK)
+		return false;
+	if (regs->cs != __KERNEL_CS)
+		return false;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return false;
+
+	if (error_code & 2)
+		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+	else
+		kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+	kmemcheck_show(regs);
+	return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+	if (!kmemcheck_active(regs))
+		return false;
+
+	/* We're done. */
+	kmemcheck_hide(regs);
+	return true;
+}
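The trickiest arithmetic above is the page split in kmemcheck_read()/kmemcheck_write(): an access of at most 8 bytes can straddle one page boundary, so it is cut into a piece ending at the boundary and a piece starting there. A userspace sketch of just that computation, assuming a 4096-byte page:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Report how [addr, addr + size) is cut at a page boundary. */
static void split(unsigned long addr, unsigned long size)
{
	unsigned long page = addr & PAGE_MASK;
	unsigned long next_addr = addr + size - 1;	/* last byte */
	unsigned long next_page = next_addr & PAGE_MASK;

	if (page == next_page) {
		printf("%#lx: one piece of %lu bytes\n", addr, size);
		return;
	}

	printf("%#lx: %lu bytes, then %lu bytes at %#lx\n",
	       addr, next_page - addr,
	       next_addr - next_page + 1, next_page);
}

int main(void)
{
	split(0x1ffd, 8);	/* 3 bytes + 5 bytes, as the comment says */
	split(0x2000, 8);	/* entirely inside one page */
	return 0;
}

Using the address of the last byte (addr + size - 1) rather than one-past-the-end keeps an access that ends exactly at a page boundary from being split needlessly.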
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+	return
+		/* Group 1 */
+		b == 0xf0 || b == 0xf2 || b == 0xf3
+		/* Group 2 */
+		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+		|| b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+		/* Group 3 */
+		|| b == 0x66
+		/* Group 4 */
+		|| b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+	return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+	return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+	/* Default operand size */
+	int operand_size_override = 4;
+
+	/* prefixes */
+	for (; opcode_is_prefix(*op); ++op) {
+		if (*op == 0x66)
+			operand_size_override = 2;
+	}
+
+	/* REX prefix */
+	if (opcode_is_rex_prefix(*op)) {
+		uint8_t rex = *op;
+
+		++op;
+		if (rex & REX_W) {
+			switch (*op) {
+			case 0x63:
+				*size = 4;
+				return;
+			case 0x0f:
+				++op;
+
+				switch (*op) {
+				case 0xb6:
+				case 0xbe:
+					*size = 1;
+					return;
+				case 0xb7:
+				case 0xbf:
+					*size = 2;
+					return;
+				}
+
+				break;
+			}
+
+			*size = 8;
+			return;
+		}
+	}
+
+	/* escape opcode */
+	if (*op == 0x0f) {
+		++op;
+
+		/*
+		 * This is move with zero-extend and sign-extend, respectively;
+		 * we don't have to think about 0xb6/0xbe, because this is
+		 * already handled in the conditional below.
+		 */
+		if (*op == 0xb7 || *op == 0xbf)
+			operand_size_override = 2;
+	}
+
+	*size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+	/* skip prefixes */
+	while (opcode_is_prefix(*op))
+		++op;
+	if (opcode_is_rex_prefix(*op))
+		++op;
+	return op;
+}
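A quick check of the decoder on two byte patterns helps make the size rule concrete: the 0x66 prefix shrinks the default 32-bit operand to 16 bits, and bit 0 of the primary opcode selects byte versus full-width access. A hypothetical userspace harness, assuming the decoder has been lifted out of the kernel with its linux/types.h include swapped for stdint.h:

#include <stdint.h>
#include <stdio.h>

/* Prototype from opcode.h: */
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);

int main(void)
{
	unsigned int size;

	/* 66 89 08 = mov %cx,(%eax): the operand-size prefix gives 2 */
	kmemcheck_opcode_decode((const uint8_t *)"\x66\x89\x08", &size);
	printf("mov %%cx,(%%eax) accesses %u bytes\n", size);

	/* 88 08 = mov %cl,(%eax): opcode bit 0 clear means 1 byte */
	kmemcheck_opcode_decode((const uint8_t *)"\x88\x08", &size);
	printf("mov %%cl,(%%eax) accesses %u bytes\n", size);

	return 0;
}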
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+	pte_t *pte;
+	unsigned int level;
+
+	pte = lookup_address(address, &level);
+	if (!pte)
+		return NULL;
+	if (level != PG_LEVEL_4K)
+		return NULL;
+	if (!pte_hidden(*pte))
+		return NULL;
+
+	return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+	unsigned int expected_size;
+	const uint8_t *insn;
+	const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+	/* REP MOVS */
+	{1, "\xf3\xa4",			"rep movsb <mem8>, <mem8>"},
+	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"},
+
+	/* MOVZX / MOVZXD */
+	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"},
+	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"},
+
+	/* MOVSX / MOVSXD */
+	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"},
+	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+	/* MOVZX / MOVZXD */
+	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"},
+	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"},
+
+	/* MOVSX / MOVSXD */
+	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"},
+	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"},
+	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+	unsigned size;
+
+	kmemcheck_opcode_decode(op->insn, &size);
+
+	if (size == op->expected_size)
+		return true;
+
+	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+		op->desc, op->expected_size, size);
+	return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+	bool pass = true;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+		pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+	return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+	bool pass = true;
+
+	pass = pass && selftest_opcodes_all();
+
+	return pass;
+}
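The selftest is table-driven: each row pairs an encoded instruction with the access size the decoder must report, so extending coverage is a one-line edit. The same shape works for any pure function; a minimal userspace version of the pattern, with a hypothetical function under test:

#include <stdbool.h>
#include <stdio.h>

static int f(int x) { return x * x; }	/* function under test */

static const struct { int in, expected; } cases[] = {
	{0, 0}, {3, 9}, {-2, 4},
};

static bool run_cases(void)
{
	bool pass = true;

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i) {
		if (f(cases[i].in) != cases[i].expected) {
			printf("case %u failed\n", i);
			pass = false;
		}
	}
	return pass;
}

int main(void)
{
	return run_cases() ? 0 : 1;
}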
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+	pte_t *pte;
+	struct page *page;
+
+	if (!virt_addr_valid(address))
+		return NULL;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return NULL;
+
+	page = virt_to_page(address);
+	if (!page->shadow)
+		return NULL;
+	return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+	enum kmemcheck_shadow status)
+{
+	unsigned long addr = (unsigned long) address;
+	unsigned long last_addr = addr + n - 1;
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long last_page = last_addr & PAGE_MASK;
+	unsigned int first_n;
+	void *shadow;
+
+	/* If the memory range crosses a page boundary, stop there. */
+	if (page == last_page)
+		first_n = n;
+	else
+		first_n = page + PAGE_SIZE - addr;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (shadow)
+		memset(shadow, status, first_n);
+
+	addr += first_n;
+	n -= first_n;
+
+	/* Do full-page memset()s. */
+	while (n >= PAGE_SIZE) {
+		shadow = kmemcheck_shadow_lookup(addr);
+		if (shadow)
+			memset(shadow, status, PAGE_SIZE);
+
+		addr += PAGE_SIZE;
+		n -= PAGE_SIZE;
+	}
+
+	/* Do the remaining page, if any. */
+	if (n > 0) {
+		shadow = kmemcheck_shadow_lookup(addr);
+		if (shadow)
+			memset(shadow, status, n);
+	}
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+	/*
+	 * Make sure _some_ bytes are initialized. Gcc frequently generates
+	 * code to access neighboring bytes.
+	 */
+	for (i = 0; i < size; ++i) {
+		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+			return x[i];
+	}
+#else
+	/* All bytes must be initialized. */
+	for (i = 0; i < size; ++i) {
+		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+			return x[i];
+	}
+#endif
+
+	return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+	for (i = 0; i < size; ++i)
+		x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
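mark_shadow() handles an arbitrary range by peeling off the partial first page, memset()ing whole pages, and finishing with the partial tail. The chunking loop on its own, in userspace form, with the shadow lookup replaced by a flat array and a 4096-byte page assumed (all names hypothetical):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned char shadow[4 * PAGE_SIZE];	/* stand-in shadow area */

static void mark(unsigned long addr, unsigned long n, unsigned char status)
{
	/* First (possibly partial) page. */
	unsigned long first_n = n;
	unsigned long last = addr + n - 1;

	if ((addr & PAGE_MASK) != (last & PAGE_MASK))
		first_n = (addr & PAGE_MASK) + PAGE_SIZE - addr;

	memset(&shadow[addr], status, first_n);
	addr += first_n;
	n -= first_n;

	/* Whole pages. */
	while (n >= PAGE_SIZE) {
		memset(&shadow[addr], status, PAGE_SIZE);
		addr += PAGE_SIZE;
		n -= PAGE_SIZE;
	}

	/* Partial tail. */
	if (n > 0)
		memset(&shadow[addr], status, n);
}

int main(void)
{
	unsigned long end = 100 + 2 * PAGE_SIZE + 49;

	mark(100, 2 * PAGE_SIZE + 50, 2);	/* 2 == "initialized" */
	printf("shadow[100] = %d, shadow[%lu] = %d\n",
	       shadow[100], end, shadow[end]);
	return 0;
}

The per-page chunking matters in the kernel because each page has its own shadow page, so a single memset() across the whole range would walk off the end of one shadow page into unrelated memory.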
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+	KMEMCHECK_SHADOW_UNALLOCATED,
+	KMEMCHECK_SHADOW_UNINITIALIZED,
+	KMEMCHECK_SHADOW_INITIALIZED,
+	KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..3cfe9ced8a4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 
 	if (!debug_pagealloc)
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL, 0);
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
 	if (!debug_pagealloc)
 		spin_lock(&cpa_lock);
 	if (!base)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..8e43bdd45456 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
 
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+	return (pte_t *)__get_free_page(PGALLOC_GFP);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	struct page *pte;
 
 #ifdef CONFIG_HIGHPTE
-	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+	pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
 #else
-	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+	pte = alloc_pages(PGALLOC_GFP, 0);
 #endif
 	if (pte)
 		pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
 	bool failed = false;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
-		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
 		if (pmd == NULL)
 			failed = true;
 		pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	pmd_t *pmds[PREALLOCATED_PMDS];
 	unsigned long flags;
 
-	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
 	if (pgd == NULL)
 		goto out;