diff options
author | Andi Kleen <andi@firstfloor.org> | 2009-02-17 17:07:13 -0500 |
---|---|---|
committer | H. Peter Anvin <hpa@linux.intel.com> | 2009-02-19 17:50:58 -0500 |
commit | 0d7482e3d76522157c9d741d79fce22c401fa0c5 (patch) | |
tree | be87d0d3dcf9c44c7e706ce40f23515471a0b16e /arch/x86/kernel/cpu | |
parent | e35849e910a6543d37c0d13648ef166678d03565 (diff) |
x86, mce: implement dynamic machine check banks support
Impact: cleanup; making code future proof; memory saving on small systems
This patch replaces the hardcoded max number of machine check banks with
dynamic allocation depending on what the CPU reports. The sysfs
data structures and the banks array are dynamically allocated.
There is still a hard bank limit (128) because the mcelog protocol uses
banks >= 128 as pseudo banks to escape other events. But we expect
that 128 banks is beyond any reasonable CPU for now.
This supersedes an earlier patch by Venki, but it solves the problem
more completely by making the limit fully dynamic (up to the 128
boundary).
This saves some memory on machines with fewer than 6 banks because
they won't need sysdevs for unused ones, and it also allows sysfs
to be used to control these banks on possible future CPUs with
more than 6 banks.
This is an updated patch addressing Venki's comments. I also added in
another patch from Thomas which fixed the error allocation path (that
patch was previously separate).
Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_64.c | 147 |
1 files changed, 115 insertions, 32 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 870d08deccf..2297730bb51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c | |||
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/ctype.h> | 24 | #include <linux/ctype.h> |
25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
26 | #include <linux/kdebug.h> | 26 | #include <linux/kdebug.h> |
27 | #include <linux/kobject.h> | ||
28 | #include <linux/sysfs.h> | ||
27 | #include <asm/processor.h> | 29 | #include <asm/processor.h> |
28 | #include <asm/msr.h> | 30 | #include <asm/msr.h> |
29 | #include <asm/mce.h> | 31 | #include <asm/mce.h> |
@@ -32,7 +34,12 @@ | |||
32 | #include <asm/idle.h> | 34 | #include <asm/idle.h> |
33 | 35 | ||
34 | #define MISC_MCELOG_MINOR 227 | 36 | #define MISC_MCELOG_MINOR 227 |
35 | #define NR_SYSFS_BANKS 6 | 37 | |
38 | /* | ||
39 | * To support more than 128 would need to escape the predefined | ||
40 | * Linux defined extended banks first. | ||
41 | */ | ||
42 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) | ||
36 | 43 | ||
37 | atomic_t mce_entry; | 44 | atomic_t mce_entry; |
38 | 45 | ||
@@ -47,7 +54,7 @@ static int mce_dont_init; | |||
47 | */ | 54 | */ |
48 | static int tolerant = 1; | 55 | static int tolerant = 1; |
49 | static int banks; | 56 | static int banks; |
50 | static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; | 57 | static u64 *bank; |
51 | static unsigned long notify_user; | 58 | static unsigned long notify_user; |
52 | static int rip_msr; | 59 | static int rip_msr; |
53 | static int mce_bootlog = -1; | 60 | static int mce_bootlog = -1; |
@@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
212 | barrier(); | 219 | barrier(); |
213 | 220 | ||
214 | for (i = 0; i < banks; i++) { | 221 | for (i = 0; i < banks; i++) { |
215 | if (i < NR_SYSFS_BANKS && !bank[i]) | 222 | if (!bank[i]) |
216 | continue; | 223 | continue; |
217 | 224 | ||
218 | m.misc = 0; | 225 | m.misc = 0; |
@@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init); | |||
446 | /* | 453 | /* |
447 | * Initialize Machine Checks for a CPU. | 454 | * Initialize Machine Checks for a CPU. |
448 | */ | 455 | */ |
449 | static void mce_init(void *dummy) | 456 | static int mce_cap_init(void) |
450 | { | 457 | { |
451 | u64 cap; | 458 | u64 cap; |
452 | int i; | 459 | unsigned b; |
453 | 460 | ||
454 | rdmsrl(MSR_IA32_MCG_CAP, cap); | 461 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
455 | banks = cap & 0xff; | 462 | b = cap & 0xff; |
456 | if (banks > MCE_EXTENDED_BANK) { | 463 | if (b > MAX_NR_BANKS) { |
457 | banks = MCE_EXTENDED_BANK; | 464 | printk(KERN_WARNING |
458 | printk(KERN_INFO "MCE: warning: using only %d banks\n", | 465 | "MCE: Using only %u machine check banks out of %u\n", |
459 | MCE_EXTENDED_BANK); | 466 | MAX_NR_BANKS, b); |
467 | b = MAX_NR_BANKS; | ||
468 | } | ||
469 | |||
470 | /* Don't support asymmetric configurations today */ | ||
471 | WARN_ON(banks != 0 && b != banks); | ||
472 | banks = b; | ||
473 | if (!bank) { | ||
474 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
475 | if (!bank) | ||
476 | return -ENOMEM; | ||
477 | memset(bank, 0xff, banks * sizeof(u64)); | ||
460 | } | 478 | } |
479 | |||
461 | /* Use accurate RIP reporting if available. */ | 480 | /* Use accurate RIP reporting if available. */ |
462 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | 481 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) |
463 | rip_msr = MSR_IA32_MCG_EIP; | 482 | rip_msr = MSR_IA32_MCG_EIP; |
464 | 483 | ||
484 | return 0; | ||
485 | } | ||
486 | |||
487 | static void mce_init(void *dummy) | ||
488 | { | ||
489 | u64 cap; | ||
490 | int i; | ||
491 | |||
465 | /* Log the machine checks left over from the previous reset. | 492 | /* Log the machine checks left over from the previous reset. |
466 | This also clears all registers */ | 493 | This also clears all registers */ |
467 | do_machine_check(NULL, mce_bootlog ? -1 : -2); | 494 | do_machine_check(NULL, mce_bootlog ? -1 : -2); |
468 | 495 | ||
469 | set_in_cr4(X86_CR4_MCE); | 496 | set_in_cr4(X86_CR4_MCE); |
470 | 497 | ||
498 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
471 | if (cap & MCG_CTL_P) | 499 | if (cap & MCG_CTL_P) |
472 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 500 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
473 | 501 | ||
474 | for (i = 0; i < banks; i++) { | 502 | for (i = 0; i < banks; i++) { |
475 | if (i < NR_SYSFS_BANKS) | 503 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); |
476 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
477 | else | ||
478 | wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); | ||
479 | |||
480 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 504 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); |
481 | } | 505 | } |
482 | } | 506 | } |
@@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
486 | { | 510 | { |
487 | /* This should be disabled by the BIOS, but isn't always */ | 511 | /* This should be disabled by the BIOS, but isn't always */ |
488 | if (c->x86_vendor == X86_VENDOR_AMD) { | 512 | if (c->x86_vendor == X86_VENDOR_AMD) { |
489 | if(c->x86 == 15) | 513 | if (c->x86 == 15 && banks > 4) |
490 | /* disable GART TBL walk error reporting, which trips off | 514 | /* disable GART TBL walk error reporting, which trips off |
491 | incorrectly with the IOMMU & 3ware & Cerberus. */ | 515 | incorrectly with the IOMMU & 3ware & Cerberus. */ |
492 | clear_bit(10, &bank[4]); | 516 | clear_bit(10, (unsigned long *)&bank[4]); |
493 | if(c->x86 <= 17 && mce_bootlog < 0) | 517 | if(c->x86 <= 17 && mce_bootlog < 0) |
494 | /* Lots of broken BIOS around that don't clear them | 518 | /* Lots of broken BIOS around that don't clear them |
495 | by default and leave crap in there. Don't log. */ | 519 | by default and leave crap in there. Don't log. */ |
@@ -532,11 +556,15 @@ static void mce_init_timer(void) | |||
532 | */ | 556 | */ |
533 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | 557 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) |
534 | { | 558 | { |
535 | mce_cpu_quirks(c); | ||
536 | |||
537 | if (!mce_available(c)) | 559 | if (!mce_available(c)) |
538 | return; | 560 | return; |
539 | 561 | ||
562 | if (mce_cap_init() < 0) { | ||
563 | mce_dont_init = 1; | ||
564 | return; | ||
565 | } | ||
566 | mce_cpu_quirks(c); | ||
567 | |||
540 | mce_init(NULL); | 568 | mce_init(NULL); |
541 | mce_cpu_features(c); | 569 | mce_cpu_features(c); |
542 | mce_init_timer(); | 570 | mce_init_timer(); |
@@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit | |||
819 | } \ | 847 | } \ |
820 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | 848 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); |
821 | 849 | ||
822 | /* | 850 | static struct sysdev_attribute *bank_attrs; |
823 | * TBD should generate these dynamically based on number of available banks. | 851 | |
824 | * Have only 6 contol banks in /sysfs until then. | 852 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, |
825 | */ | 853 | char *buf) |
826 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | 854 | { |
827 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | 855 | u64 b = bank[attr - bank_attrs]; |
828 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | 856 | return sprintf(buf, "%Lx\n", b); |
829 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | 857 | } |
830 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | 858 | |
831 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | 859 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, |
860 | const char *buf, size_t siz) | ||
861 | { | ||
862 | char *end; | ||
863 | u64 new = simple_strtoull(buf, &end, 0); | ||
864 | if (end == buf) | ||
865 | return -EINVAL; | ||
866 | bank[attr - bank_attrs] = new; | ||
867 | mce_restart(); | ||
868 | return end-buf; | ||
869 | } | ||
832 | 870 | ||
833 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, | 871 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
834 | char *buf) | 872 | char *buf) |
@@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | |||
855 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | 893 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); |
856 | ACCESSOR(check_interval,check_interval,mce_restart()) | 894 | ACCESSOR(check_interval,check_interval,mce_restart()) |
857 | static struct sysdev_attribute *mce_attributes[] = { | 895 | static struct sysdev_attribute *mce_attributes[] = { |
858 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
859 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
860 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, | 896 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, |
861 | NULL | 897 | NULL |
862 | }; | 898 | }; |
@@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
886 | if (err) | 922 | if (err) |
887 | goto error; | 923 | goto error; |
888 | } | 924 | } |
925 | for (i = 0; i < banks; i++) { | ||
926 | err = sysdev_create_file(&per_cpu(device_mce, cpu), | ||
927 | &bank_attrs[i]); | ||
928 | if (err) | ||
929 | goto error2; | ||
930 | } | ||
889 | cpu_set(cpu, mce_device_initialized); | 931 | cpu_set(cpu, mce_device_initialized); |
890 | 932 | ||
891 | return 0; | 933 | return 0; |
934 | error2: | ||
935 | while (--i >= 0) { | ||
936 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
937 | &bank_attrs[i]); | ||
938 | } | ||
892 | error: | 939 | error: |
893 | while (i--) { | 940 | while (--i >= 0) { |
894 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 941 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
895 | mce_attributes[i]); | 942 | mce_attributes[i]); |
896 | } | 943 | } |
@@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) | |||
909 | for (i = 0; mce_attributes[i]; i++) | 956 | for (i = 0; mce_attributes[i]; i++) |
910 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 957 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
911 | mce_attributes[i]); | 958 | mce_attributes[i]); |
959 | for (i = 0; i < banks; i++) | ||
960 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
961 | &bank_attrs[i]); | ||
912 | sysdev_unregister(&per_cpu(device_mce,cpu)); | 962 | sysdev_unregister(&per_cpu(device_mce,cpu)); |
913 | cpu_clear(cpu, mce_device_initialized); | 963 | cpu_clear(cpu, mce_device_initialized); |
914 | } | 964 | } |
@@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { | |||
973 | .notifier_call = mce_cpu_callback, | 1023 | .notifier_call = mce_cpu_callback, |
974 | }; | 1024 | }; |
975 | 1025 | ||
1026 | static __init int mce_init_banks(void) | ||
1027 | { | ||
1028 | int i; | ||
1029 | |||
1030 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1031 | GFP_KERNEL); | ||
1032 | if (!bank_attrs) | ||
1033 | return -ENOMEM; | ||
1034 | |||
1035 | for (i = 0; i < banks; i++) { | ||
1036 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
1037 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
1038 | if (!a->attr.name) | ||
1039 | goto nomem; | ||
1040 | a->attr.mode = 0644; | ||
1041 | a->show = show_bank; | ||
1042 | a->store = set_bank; | ||
1043 | } | ||
1044 | return 0; | ||
1045 | |||
1046 | nomem: | ||
1047 | while (--i >= 0) | ||
1048 | kfree(bank_attrs[i].attr.name); | ||
1049 | kfree(bank_attrs); | ||
1050 | bank_attrs = NULL; | ||
1051 | return -ENOMEM; | ||
1052 | } | ||
1053 | |||
976 | static __init int mce_init_device(void) | 1054 | static __init int mce_init_device(void) |
977 | { | 1055 | { |
978 | int err; | 1056 | int err; |
@@ -980,6 +1058,11 @@ static __init int mce_init_device(void) | |||
980 | 1058 | ||
981 | if (!mce_available(&boot_cpu_data)) | 1059 | if (!mce_available(&boot_cpu_data)) |
982 | return -EIO; | 1060 | return -EIO; |
1061 | |||
1062 | err = mce_init_banks(); | ||
1063 | if (err) | ||
1064 | return err; | ||
1065 | |||
983 | err = sysdev_class_register(&mce_sysclass); | 1066 | err = sysdev_class_register(&mce_sysclass); |
984 | if (err) | 1067 | if (err) |
985 | return err; | 1068 | return err; |