aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2009-10-06 11:36:55 -0400
committerHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2009-10-06 11:36:55 -0400
commitd94e5fcbf1420366dcb4102bafe04dbcfc0d0d4b (patch)
treea9b7de7df6da5c3132cc68169b9c47ba288ccd42 /arch/x86/kernel
parentd55651168a20078a94597a297d5cdfd807bf07b6 (diff)
parent374576a8b6f865022c0fd1ca62396889b23d66dd (diff)
Merge commit 'v2.6.32-rc3'
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c105
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/alternative.c58
-rw-r--r--arch/x86/kernel/amd_iommu.c489
-rw-r--r--arch/x86/kernel/amd_iommu_init.c42
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic/apic.c150
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c387
-rw-r--r--arch/x86/kernel/apic/ipi.c2
-rw-r--r--arch/x86/kernel/apic/nmi.c26
-rw-r--r--arch/x86/kernel/apic/numaq_32.c57
-rw-r--r--arch/x86/kernel/apic/probe_64.c31
-rw-r--r--arch/x86/kernel/apic/summit_32.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c11
-rw-r--r--arch/x86/kernel/apm_32.c31
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c129
-rw-r--r--arch/x86/kernel/cpu/bugs.c10
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c2
-rw-r--r--arch/x86/kernel/cpu/common.c73
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c116
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c44
-rw-r--r--arch/x86/kernel/cpu/cyrix.c19
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c19
-rw-r--r--arch/x86/kernel/cpu/intel.c17
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c148
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile5
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c116
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c159
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c342
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c10
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c163
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c80
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c97
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c168
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c390
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c94
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c304
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c135
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c499
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h19
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c68
-rw-r--r--arch/x86/kernel/cpu/perf_event.c (renamed from arch/x86/kernel/cpu/perf_counter.c)863
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c47
-rw-r--r--arch/x86/kernel/cpu/proc.c4
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--arch/x86/kernel/cpu/vmware.c41
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/doublefault_32.c4
-rw-r--r--arch/x86/kernel/ds.c6
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack_32.c1
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/e820.c23
-rw-r--r--arch/x86/kernel/early_printk.c785
-rw-r--r--arch/x86/kernel/efi.c4
-rw-r--r--arch/x86/kernel/entry_64.S30
-rw-r--r--arch/x86/kernel/ftrace.c51
-rw-r--r--arch/x86/kernel/head32.c26
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c8
-rw-r--r--arch/x86/kernel/i8253.c19
-rw-r--r--arch/x86/kernel/init_task.c5
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irq_32.c5
-rw-r--r--arch/x86/kernel/irqinit.c40
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/microcode_core.c6
-rw-r--r--arch/x86/kernel/mpparse.c85
-rw-r--r--arch/x86/kernel/mrst.c24
-rw-r--r--arch/x86/kernel/msr.c65
-rw-r--r--arch/x86/kernel/paravirt.c39
-rw-r--r--arch/x86/kernel/pci-dma.c21
-rw-r--r--arch/x86/kernel/pci-gart_64.c5
-rw-r--r--arch/x86/kernel/pci-nommu.c29
-rw-r--r--arch/x86/kernel/pci-swiotlb.c30
-rw-r--r--arch/x86/kernel/process.c31
-rw-r--r--arch/x86/kernel/process_32.c30
-rw-r--r--arch/x86/kernel/process_64.c36
-rw-r--r--arch/x86/kernel/ptrace.c34
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c7
-rw-r--r--arch/x86/kernel/rtc.c17
-rw-r--r--arch/x86/kernel/setup.c131
-rw-r--r--arch/x86/kernel/setup_percpu.c364
-rw-r--r--arch/x86/kernel/sfi.c122
-rw-r--r--arch/x86/kernel/signal.c4
-rw-r--r--arch/x86/kernel/smpboot.c32
-rw-r--r--arch/x86/kernel/step.c9
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tboot.c447
-rw-r--r--arch/x86/kernel/time.c120
-rw-r--r--arch/x86/kernel/time_32.c137
-rw-r--r--arch/x86/kernel/time_64.c135
-rw-r--r--arch/x86/kernel/tlb_uv.c4
-rw-r--r--arch/x86/kernel/trampoline.c4
-rw-r--r--arch/x86/kernel/trampoline_32.S8
-rw-r--r--arch/x86/kernel/trampoline_64.S5
-rw-r--r--arch/x86/kernel/traps.c65
-rw-r--r--arch/x86/kernel/tsc.c88
-rw-r--r--arch/x86/kernel/tsc_sync.c2
-rw-r--r--arch/x86/kernel/visws_quirks.c54
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S94
-rw-r--r--arch/x86/kernel/vsyscall_64.c11
-rw-r--r--arch/x86/kernel/x86_init.c75
122 files changed, 4581 insertions, 4782 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7b..d8e5d0cdd678 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n
31 31
32obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
34obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 34obj-y += time.o ioport.o ldt.o dumpstack.o
35obj-y += setup.o i8259.o irqinit.o 35obj-y += setup.o x86_init.o i8259.o irqinit.o
36obj-$(CONFIG_X86_VISWS) += visws_quirks.o 36obj-$(CONFIG_X86_VISWS) += visws_quirks.o
37obj-$(CONFIG_X86_32) += probe_roms_32.o 37obj-$(CONFIG_X86_32) += probe_roms_32.o
38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 38obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 52obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 53obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 54obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o
55obj-$(CONFIG_STACKTRACE) += stacktrace.o 56obj-$(CONFIG_STACKTRACE) += stacktrace.o
56obj-y += cpu/ 57obj-y += cpu/
57obj-y += acpi/ 58obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
58obj-y += reboot.o 60obj-y += reboot.o
59obj-$(CONFIG_MCA) += mca_32.o 61obj-$(CONFIG_MCA) += mca_32.o
60obj-$(CONFIG_X86_MSR) += msr.o 62obj-$(CONFIG_X86_MSR) += msr.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
104scx200-y += scx200_32.o 106scx200-y += scx200_32.o
105 107
106obj-$(CONFIG_OLPC) += olpc.o 108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_X86_MRST) += mrst.o
107 110
108microcode-y := microcode_core.o 111microcode-y := microcode_core.o
109microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 112microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6b8ca3a0285d..67e929b89875 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
833extern int es7000_plat; 833extern int es7000_plat;
834#endif 834#endif
835 835
836static struct {
837 int gsi_base;
838 int gsi_end;
839} mp_ioapic_routing[MAX_IO_APICS];
840
841int mp_find_ioapic(int gsi)
842{
843 int i = 0;
844
845 /* Find the IOAPIC that manages this GSI. */
846 for (i = 0; i < nr_ioapics; i++) {
847 if ((gsi >= mp_ioapic_routing[i].gsi_base)
848 && (gsi <= mp_ioapic_routing[i].gsi_end))
849 return i;
850 }
851
852 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
853 return -1;
854}
855
856int mp_find_ioapic_pin(int ioapic, int gsi)
857{
858 if (WARN_ON(ioapic == -1))
859 return -1;
860 if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
861 return -1;
862
863 return gsi - mp_ioapic_routing[ioapic].gsi_base;
864}
865
866static u8 __init uniq_ioapic_id(u8 id)
867{
868#ifdef CONFIG_X86_32
869 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
870 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
871 return io_apic_get_unique_id(nr_ioapics, id);
872 else
873 return id;
874#else
875 int i;
876 DECLARE_BITMAP(used, 256);
877 bitmap_zero(used, 256);
878 for (i = 0; i < nr_ioapics; i++) {
879 struct mpc_ioapic *ia = &mp_ioapics[i];
880 __set_bit(ia->apicid, used);
881 }
882 if (!test_bit(id, used))
883 return id;
884 return find_first_zero_bit(used, 256);
885#endif
886}
887
888static int bad_ioapic(unsigned long address)
889{
890 if (nr_ioapics >= MAX_IO_APICS) {
891 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
892 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
893 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
894 }
895 if (!address) {
896 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
897 " found in table, skipping!\n");
898 return 1;
899 }
900 return 0;
901}
902
903void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
904{
905 int idx = 0;
906
907 if (bad_ioapic(address))
908 return;
909
910 idx = nr_ioapics;
911
912 mp_ioapics[idx].type = MP_IOAPIC;
913 mp_ioapics[idx].flags = MPC_APIC_USABLE;
914 mp_ioapics[idx].apicaddr = address;
915
916 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
917 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
918 mp_ioapics[idx].apicver = io_apic_get_version(idx);
919
920 /*
921 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
922 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
923 */
924 mp_ioapic_routing[idx].gsi_base = gsi_base;
925 mp_ioapic_routing[idx].gsi_end = gsi_base +
926 io_apic_get_redir_entries(idx);
927
928 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
929 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
930 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
931 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
932
933 nr_ioapics++;
934}
935
936int __init acpi_probe_gsi(void) 836int __init acpi_probe_gsi(void)
937{ 837{
938 int idx; 838 int idx;
@@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void)
947 847
948 max_gsi = 0; 848 max_gsi = 0;
949 for (idx = 0; idx < nr_ioapics; idx++) { 849 for (idx = 0; idx < nr_ioapics; idx++) {
950 gsi = mp_ioapic_routing[idx].gsi_end; 850 gsi = mp_gsi_routing[idx].gsi_end;
951 851
952 if (gsi > max_gsi) 852 if (gsi > max_gsi)
953 max_gsi = gsi; 853 max_gsi = gsi;
@@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1179 * If MPS is present, it will handle them, 1079 * If MPS is present, it will handle them,
1180 * otherwise the system will stay in PIC mode 1080 * otherwise the system will stay in PIC mode
1181 */ 1081 */
1182 if (acpi_disabled || acpi_noirq) { 1082 if (acpi_disabled || acpi_noirq)
1183 return -ENODEV; 1083 return -ENODEV;
1184 }
1185 1084
1186 if (!cpu_has_apic) 1085 if (!cpu_has_apic)
1187 return -ENODEV; 1086 return -ENODEV;
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8c44c232efcb..59cdfa4686b2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f57658702571..de7353c0ce9c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/mutex.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/stringify.h>
5#include <linux/kprobes.h> 6#include <linux/kprobes.h>
6#include <linux/mm.h> 7#include <linux/mm.h>
7#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
@@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly);
32#define smp_alt_once 1 33#define smp_alt_once 1
33#endif 34#endif
34 35
35static int debug_alternative; 36static int __initdata_or_module debug_alternative;
36 37
37static int __init debug_alt(char *str) 38static int __init debug_alt(char *str)
38{ 39{
@@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str)
51__setup("noreplace-smp", setup_noreplace_smp); 52__setup("noreplace-smp", setup_noreplace_smp);
52 53
53#ifdef CONFIG_PARAVIRT 54#ifdef CONFIG_PARAVIRT
54static int noreplace_paravirt = 0; 55static int __initdata_or_module noreplace_paravirt = 0;
55 56
56static int __init setup_noreplace_paravirt(char *str) 57static int __init setup_noreplace_paravirt(char *str)
57{ 58{
@@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
64#define DPRINTK(fmt, args...) if (debug_alternative) \ 65#define DPRINTK(fmt, args...) if (debug_alternative) \
65 printk(KERN_DEBUG fmt, args) 66 printk(KERN_DEBUG fmt, args)
66 67
67#ifdef GENERIC_NOP1 68#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
68/* Use inline assembly to define this because the nops are defined 69/* Use inline assembly to define this because the nops are defined
69 as inline assembly strings in the include files and we cannot 70 as inline assembly strings in the include files and we cannot
70 get them easily into strings. */ 71 get them easily into strings. */
71asm("\t.section .rodata, \"a\"\nintelnops: " 72asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
72 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 73 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
73 GENERIC_NOP7 GENERIC_NOP8 74 GENERIC_NOP7 GENERIC_NOP8
74 "\t.previous"); 75 "\t.previous");
75extern const unsigned char intelnops[]; 76extern const unsigned char intelnops[];
76static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { 77static const unsigned char *const __initconst_or_module
78intel_nops[ASM_NOP_MAX+1] = {
77 NULL, 79 NULL,
78 intelnops, 80 intelnops,
79 intelnops + 1, 81 intelnops + 1,
@@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
87#endif 89#endif
88 90
89#ifdef K8_NOP1 91#ifdef K8_NOP1
90asm("\t.section .rodata, \"a\"\nk8nops: " 92asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
91 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 93 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
92 K8_NOP7 K8_NOP8 94 K8_NOP7 K8_NOP8
93 "\t.previous"); 95 "\t.previous");
94extern const unsigned char k8nops[]; 96extern const unsigned char k8nops[];
95static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { 97static const unsigned char *const __initconst_or_module
98k8_nops[ASM_NOP_MAX+1] = {
96 NULL, 99 NULL,
97 k8nops, 100 k8nops,
98 k8nops + 1, 101 k8nops + 1,
@@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
105}; 108};
106#endif 109#endif
107 110
108#ifdef K7_NOP1 111#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
109asm("\t.section .rodata, \"a\"\nk7nops: " 112asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
110 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 113 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
111 K7_NOP7 K7_NOP8 114 K7_NOP7 K7_NOP8
112 "\t.previous"); 115 "\t.previous");
113extern const unsigned char k7nops[]; 116extern const unsigned char k7nops[];
114static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { 117static const unsigned char *const __initconst_or_module
118k7_nops[ASM_NOP_MAX+1] = {
115 NULL, 119 NULL,
116 k7nops, 120 k7nops,
117 k7nops + 1, 121 k7nops + 1,
@@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
125#endif 129#endif
126 130
127#ifdef P6_NOP1 131#ifdef P6_NOP1
128asm("\t.section .rodata, \"a\"\np6nops: " 132asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
129 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 133 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
130 P6_NOP7 P6_NOP8 134 P6_NOP7 P6_NOP8
131 "\t.previous"); 135 "\t.previous");
132extern const unsigned char p6nops[]; 136extern const unsigned char p6nops[];
133static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { 137static const unsigned char *const __initconst_or_module
138p6_nops[ASM_NOP_MAX+1] = {
134 NULL, 139 NULL,
135 p6nops, 140 p6nops,
136 p6nops + 1, 141 p6nops + 1,
@@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
146#ifdef CONFIG_X86_64 151#ifdef CONFIG_X86_64
147 152
148extern char __vsyscall_0; 153extern char __vsyscall_0;
149const unsigned char *const *find_nop_table(void) 154static const unsigned char *const *__init_or_module find_nop_table(void)
150{ 155{
151 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 156 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
152 boot_cpu_has(X86_FEATURE_NOPL)) 157 boot_cpu_has(X86_FEATURE_NOPL))
@@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void)
157 162
158#else /* CONFIG_X86_64 */ 163#else /* CONFIG_X86_64 */
159 164
160const unsigned char *const *find_nop_table(void) 165static const unsigned char *const *__init_or_module find_nop_table(void)
161{ 166{
162 if (boot_cpu_has(X86_FEATURE_K8)) 167 if (boot_cpu_has(X86_FEATURE_K8))
163 return k8_nops; 168 return k8_nops;
@@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void)
172#endif /* CONFIG_X86_64 */ 177#endif /* CONFIG_X86_64 */
173 178
174/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 179/* Use this to add nops to a buffer, then text_poke the whole buffer. */
175void add_nops(void *insns, unsigned int len) 180static void __init_or_module add_nops(void *insns, unsigned int len)
176{ 181{
177 const unsigned char *const *noptable = find_nop_table(); 182 const unsigned char *const *noptable = find_nop_table();
178 183
@@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len)
185 len -= noplen; 190 len -= noplen;
186 } 191 }
187} 192}
188EXPORT_SYMBOL_GPL(add_nops);
189 193
190extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 194extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
191extern u8 *__smp_locks[], *__smp_locks_end[]; 195extern u8 *__smp_locks[], *__smp_locks_end[];
196static void *text_poke_early(void *addr, const void *opcode, size_t len);
192 197
193/* Replace instructions with better alternatives for this CPU type. 198/* Replace instructions with better alternatives for this CPU type.
194 This runs before SMP is initialized to avoid SMP problems with 199 This runs before SMP is initialized to avoid SMP problems with
@@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[];
196 APs have less capabilities than the boot processor are not handled. 201 APs have less capabilities than the boot processor are not handled.
197 Tough. Make sure you disable such features by hand. */ 202 Tough. Make sure you disable such features by hand. */
198 203
199void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 204void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end)
200{ 206{
201 struct alt_instr *a; 207 struct alt_instr *a;
202 char insnbuf[MAX_PATCH_LEN]; 208 char insnbuf[MAX_PATCH_LEN];
@@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules);
279static DEFINE_MUTEX(smp_alt); 285static DEFINE_MUTEX(smp_alt);
280static int smp_mode = 1; /* protected by smp_alt */ 286static int smp_mode = 1; /* protected by smp_alt */
281 287
282void alternatives_smp_module_add(struct module *mod, char *name, 288void __init_or_module alternatives_smp_module_add(struct module *mod,
283 void *locks, void *locks_end, 289 char *name,
284 void *text, void *text_end) 290 void *locks, void *locks_end,
291 void *text, void *text_end)
285{ 292{
286 struct smp_alt_module *smp; 293 struct smp_alt_module *smp;
287 294
@@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name,
317 mutex_unlock(&smp_alt); 324 mutex_unlock(&smp_alt);
318} 325}
319 326
320void alternatives_smp_module_del(struct module *mod) 327void __init_or_module alternatives_smp_module_del(struct module *mod)
321{ 328{
322 struct smp_alt_module *item; 329 struct smp_alt_module *item;
323 330
@@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp)
386#endif 393#endif
387 394
388#ifdef CONFIG_PARAVIRT 395#ifdef CONFIG_PARAVIRT
389void apply_paravirt(struct paravirt_patch_site *start, 396void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
390 struct paravirt_patch_site *end) 397 struct paravirt_patch_site *end)
391{ 398{
392 struct paravirt_patch_site *p; 399 struct paravirt_patch_site *p;
393 char insnbuf[MAX_PATCH_LEN]; 400 char insnbuf[MAX_PATCH_LEN];
@@ -485,13 +492,14 @@ void __init alternative_instructions(void)
485 * instructions. And on the local CPU you need to be protected again NMI or MCE 492 * instructions. And on the local CPU you need to be protected again NMI or MCE
486 * handlers seeing an inconsistent instruction while you patch. 493 * handlers seeing an inconsistent instruction while you patch.
487 */ 494 */
488void *text_poke_early(void *addr, const void *opcode, size_t len) 495static void *__init_or_module text_poke_early(void *addr, const void *opcode,
496 size_t len)
489{ 497{
490 unsigned long flags; 498 unsigned long flags;
491 local_irq_save(flags); 499 local_irq_save(flags);
492 memcpy(addr, opcode, len); 500 memcpy(addr, opcode, len);
493 local_irq_restore(flags);
494 sync_core(); 501 sync_core();
502 local_irq_restore(flags);
495 /* Could also do a CLFLUSH here to speed up CPU recovery; but 503 /* Could also do a CLFLUSH here to speed up CPU recovery; but
496 that causes hangs on some VIA CPUs. */ 504 that causes hangs on some VIA CPUs. */
497 return addr; 505 return addr;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f5037801..98f230f6a28d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
41static LIST_HEAD(iommu_pd_list); 41static LIST_HEAD(iommu_pd_list);
42static DEFINE_SPINLOCK(iommu_pd_list_lock); 42static DEFINE_SPINLOCK(iommu_pd_list_lock);
43 43
44#ifdef CONFIG_IOMMU_API 44/*
45 * Domain for untranslated devices - only allocated
46 * if iommu=pt passed on kernel cmd line.
47 */
48static struct protection_domain *pt_domain;
49
45static struct iommu_ops amd_iommu_ops; 50static struct iommu_ops amd_iommu_ops;
46#endif
47 51
48/* 52/*
49 * general struct to manage commands send to an IOMMU 53 * general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 60 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 61static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom, 62static u64 *alloc_pte(struct protection_domain *domain,
59 unsigned long address, u64 63 unsigned long address, int end_lvl,
60 **pte_page, gfp_t gfp); 64 u64 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page, 66 unsigned long start_page,
63 unsigned int pages); 67 unsigned int pages);
64 68static void reset_iommu_command_buffer(struct amd_iommu *iommu);
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER 69static u64 *fetch_pte(struct protection_domain *domain,
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 70 unsigned long address, int map_size);
67#endif 71static void update_domain(struct protection_domain *domain);
68 72
69#ifdef CONFIG_AMD_IOMMU_STATS 73#ifdef CONFIG_AMD_IOMMU_STATS
70 74
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
138 * 142 *
139 ****************************************************************************/ 143 ****************************************************************************/
140 144
141static void iommu_print_event(void *__evt) 145static void dump_dte_entry(u16 devid)
146{
147 int i;
148
149 for (i = 0; i < 8; ++i)
150 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
151 amd_iommu_dev_table[devid].data[i]);
152}
153
154static void dump_command(unsigned long phys_addr)
155{
156 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
157 int i;
158
159 for (i = 0; i < 4; ++i)
160 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
161}
162
163static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
142{ 164{
143 u32 *event = __evt; 165 u32 *event = __evt;
144 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 166 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
147 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 169 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
148 u64 address = (u64)(((u64)event[3]) << 32) | event[2]; 170 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
149 171
150 printk(KERN_ERR "AMD IOMMU: Event logged ["); 172 printk(KERN_ERR "AMD-Vi: Event logged [");
151 173
152 switch (type) { 174 switch (type) {
153 case EVENT_TYPE_ILL_DEV: 175 case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
155 "address=0x%016llx flags=0x%04x]\n", 177 "address=0x%016llx flags=0x%04x]\n",
156 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), 178 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
157 address, flags); 179 address, flags);
180 dump_dte_entry(devid);
158 break; 181 break;
159 case EVENT_TYPE_IO_FAULT: 182 case EVENT_TYPE_IO_FAULT:
160 printk("IO_PAGE_FAULT device=%02x:%02x.%x " 183 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
176 break; 199 break;
177 case EVENT_TYPE_ILL_CMD: 200 case EVENT_TYPE_ILL_CMD:
178 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
202 reset_iommu_command_buffer(iommu);
203 dump_command(address);
179 break; 204 break;
180 case EVENT_TYPE_CMD_HARD_ERR: 205 case EVENT_TYPE_CMD_HARD_ERR:
181 printk("COMMAND_HARDWARE_ERROR address=0x%016llx " 206 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
209 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 234 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
210 235
211 while (head != tail) { 236 while (head != tail) {
212 iommu_print_event(iommu->evt_buf + head); 237 iommu_print_event(iommu, iommu->evt_buf + head);
213 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; 238 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
214 } 239 }
215 240
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
296 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
297 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
298 323
299 if (unlikely(i == EXIT_LOOP_COUNT)) 324 if (unlikely(i == EXIT_LOOP_COUNT)) {
300 panic("AMD IOMMU: Completion wait loop failed\n"); 325 spin_unlock(&iommu->lock);
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
301} 329}
302 330
303/* 331/*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
445} 473}
446 474
447/* 475/*
476 * This function flushes one domain on one IOMMU
477 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
479{
480 struct iommu_cmd cmd;
481 unsigned long flags;
482
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
484 domid, 1, 1);
485
486 spin_lock_irqsave(&iommu->lock, flags);
487 __iommu_queue_command(iommu, &cmd);
488 __iommu_completion_wait(iommu);
489 __iommu_wait_for_completion(iommu);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491}
492
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
494{
495 int i;
496
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
499 continue;
500 flush_domain_on_iommu(iommu, i);
501 }
502
503}
504
505/*
448 * This function is used to flush the IO/TLB for a given protection domain 506 * This function is used to flush the IO/TLB for a given protection domain
449 * on every IOMMU in the system 507 * on every IOMMU in the system
450 */ 508 */
451static void iommu_flush_domain(u16 domid) 509static void iommu_flush_domain(u16 domid)
452{ 510{
453 unsigned long flags;
454 struct amd_iommu *iommu; 511 struct amd_iommu *iommu;
455 struct iommu_cmd cmd;
456 512
457 INC_STATS_COUNTER(domain_flush_all); 513 INC_STATS_COUNTER(domain_flush_all);
458 514
459 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 515 for_each_iommu(iommu)
460 domid, 1, 1); 516 flush_domain_on_iommu(iommu, domid);
461
462 for_each_iommu(iommu) {
463 spin_lock_irqsave(&iommu->lock, flags);
464 __iommu_queue_command(iommu, &cmd);
465 __iommu_completion_wait(iommu);
466 __iommu_wait_for_completion(iommu);
467 spin_unlock_irqrestore(&iommu->lock, flags);
468 }
469} 517}
470 518
471void amd_iommu_flush_all_domains(void) 519void amd_iommu_flush_all_domains(void)
472{ 520{
521 struct amd_iommu *iommu;
522
523 for_each_iommu(iommu)
524 flush_all_domains_on_iommu(iommu);
525}
526
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
528{
473 int i; 529 int i;
474 530
475 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 531 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
476 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 532 if (iommu != amd_iommu_rlookup_table[i])
477 continue; 533 continue;
478 iommu_flush_domain(i); 534
535 iommu_queue_inv_dev_entry(iommu, i);
536 iommu_completion_wait(iommu);
479 } 537 }
480} 538}
481 539
482void amd_iommu_flush_all_devices(void) 540static void flush_devices_by_domain(struct protection_domain *domain)
483{ 541{
484 struct amd_iommu *iommu; 542 struct amd_iommu *iommu;
485 int i; 543 int i;
486 544
487 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 545 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
488 if (amd_iommu_pd_table[i] == NULL) 546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
547 (amd_iommu_pd_table[i] != domain))
489 continue; 548 continue;
490 549
491 iommu = amd_iommu_rlookup_table[i]; 550 iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
497 } 556 }
498} 557}
499 558
559static void reset_iommu_command_buffer(struct amd_iommu *iommu)
560{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
562
563 if (iommu->reset_in_progress)
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
565
566 iommu->reset_in_progress = true;
567
568 amd_iommu_reset_cmd_buffer(iommu);
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571
572 iommu->reset_in_progress = false;
573}
574
575void amd_iommu_flush_all_devices(void)
576{
577 flush_devices_by_domain(NULL);
578}
579
500/**************************************************************************** 580/****************************************************************************
501 * 581 *
502 * The functions below are used the create the page table mappings for 582 * The functions below are used the create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
514static int iommu_map_page(struct protection_domain *dom, 594static int iommu_map_page(struct protection_domain *dom,
515 unsigned long bus_addr, 595 unsigned long bus_addr,
516 unsigned long phys_addr, 596 unsigned long phys_addr,
517 int prot) 597 int prot,
598 int map_size)
518{ 599{
519 u64 __pte, *pte; 600 u64 __pte, *pte;
520 601
521 bus_addr = PAGE_ALIGN(bus_addr); 602 bus_addr = PAGE_ALIGN(bus_addr);
522 phys_addr = PAGE_ALIGN(phys_addr); 603 phys_addr = PAGE_ALIGN(phys_addr);
523 604
524 /* only support 512GB address spaces for now */ 605 BUG_ON(!PM_ALIGNED(map_size, bus_addr));
525 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 606 BUG_ON(!PM_ALIGNED(map_size, phys_addr));
607
608 if (!(prot & IOMMU_PROT_MASK))
526 return -EINVAL; 609 return -EINVAL;
527 610
528 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); 611 pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
529 612
530 if (IOMMU_PTE_PRESENT(*pte)) 613 if (IOMMU_PTE_PRESENT(*pte))
531 return -EBUSY; 614 return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
538 621
539 *pte = __pte; 622 *pte = __pte;
540 623
624 update_domain(dom);
625
541 return 0; 626 return 0;
542} 627}
543 628
544static void iommu_unmap_page(struct protection_domain *dom, 629static void iommu_unmap_page(struct protection_domain *dom,
545 unsigned long bus_addr) 630 unsigned long bus_addr, int map_size)
546{ 631{
547 u64 *pte; 632 u64 *pte = fetch_pte(dom, bus_addr, map_size);
548
549 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
550
551 if (!IOMMU_PTE_PRESENT(*pte))
552 return;
553
554 pte = IOMMU_PTE_PAGE(*pte);
555 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
556 633
557 if (!IOMMU_PTE_PRESENT(*pte)) 634 if (pte)
558 return; 635 *pte = 0;
559
560 pte = IOMMU_PTE_PAGE(*pte);
561 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
562
563 *pte = 0;
564} 636}
565 637
566/* 638/*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
615 687
616 for (addr = e->address_start; addr < e->address_end; 688 for (addr = e->address_start; addr < e->address_end;
617 addr += PAGE_SIZE) { 689 addr += PAGE_SIZE) {
618 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); 690 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
691 PM_MAP_4k);
619 if (ret) 692 if (ret)
620 return ret; 693 return ret;
621 /* 694 /*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
670 * This function checks if there is a PTE for a given dma address. If 743 * This function checks if there is a PTE for a given dma address. If
671 * there is one, it returns the pointer to it. 744 * there is one, it returns the pointer to it.
672 */ 745 */
673static u64* fetch_pte(struct protection_domain *domain, 746static u64 *fetch_pte(struct protection_domain *domain,
674 unsigned long address) 747 unsigned long address, int map_size)
675{ 748{
749 int level;
676 u64 *pte; 750 u64 *pte;
677 751
678 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; 752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
679 754
680 if (!IOMMU_PTE_PRESENT(*pte)) 755 while (level > map_size) {
681 return NULL; 756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
682 758
683 pte = IOMMU_PTE_PAGE(*pte); 759 level -= 1;
684 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
685 760
686 if (!IOMMU_PTE_PRESENT(*pte)) 761 pte = IOMMU_PTE_PAGE(*pte);
687 return NULL; 762 pte = &pte[PM_LEVEL_INDEX(level, address)];
688 763
689 pte = IOMMU_PTE_PAGE(*pte); 764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
690 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 765 pte = NULL;
766 break;
767 }
768 }
691 769
692 return pte; 770 return pte;
693} 771}
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
727 u64 *pte, *pte_page; 805 u64 *pte, *pte_page;
728 806
729 for (i = 0; i < num_ptes; ++i) { 807 for (i = 0; i < num_ptes; ++i) {
730 pte = alloc_pte(&dma_dom->domain, address, 808 pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
731 &pte_page, gfp); 809 &pte_page, gfp);
732 if (!pte) 810 if (!pte)
733 goto out_free; 811 goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
760 for (i = dma_dom->aperture[index]->offset; 838 for (i = dma_dom->aperture[index]->offset;
761 i < dma_dom->aperture_size; 839 i < dma_dom->aperture_size;
762 i += PAGE_SIZE) { 840 i += PAGE_SIZE) {
763 u64 *pte = fetch_pte(&dma_dom->domain, i); 841 u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
764 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 842 if (!pte || !IOMMU_PTE_PRESENT(*pte))
765 continue; 843 continue;
766 844
767 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); 845 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
768 } 846 }
769 847
848 update_domain(&dma_dom->domain);
849
770 return 0; 850 return 0;
771 851
772out_free: 852out_free:
853 update_domain(&dma_dom->domain);
854
773 free_page((unsigned long)dma_dom->aperture[index]->bitmap); 855 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
774 856
775 kfree(dma_dom->aperture[index]); 857 kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1009 dma_dom->domain.id = domain_id_alloc(); 1091 dma_dom->domain.id = domain_id_alloc();
1010 if (dma_dom->domain.id == 0) 1092 if (dma_dom->domain.id == 0)
1011 goto free_dma_dom; 1093 goto free_dma_dom;
1012 dma_dom->domain.mode = PAGE_MODE_3_LEVEL; 1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1013 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1014 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1096 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1015 dma_dom->domain.priv = dma_dom; 1097 dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
1063 return dom; 1145 return dom;
1064} 1146}
1065 1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{
1150 u64 pte_root = virt_to_phys(domain->pt_root);
1151
1152 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1153 << DEV_ENTRY_MODE_SHIFT;
1154 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1155
1156 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1159
1160 amd_iommu_pd_table[devid] = domain;
1161}
1162
1163/*
1164 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware
1166 */
1167static void __attach_device(struct amd_iommu *iommu,
1168 struct protection_domain *domain,
1169 u16 devid)
1170{
1171 /* lock domain */
1172 spin_lock(&domain->lock);
1173
1174 /* update DTE entry */
1175 set_dte_entry(devid, domain);
1176
1177 domain->dev_cnt += 1;
1178
1179 /* ready */
1180 spin_unlock(&domain->lock);
1181}
1182
1066/* 1183/*
1067 * If a device is not yet associated with a domain, this function does 1184 * If a device is not yet associated with a domain, this function does
1068 * assigns it visible for the hardware 1185 * assigns it visible for the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
1072 u16 devid) 1189 u16 devid)
1073{ 1190{
1074 unsigned long flags; 1191 unsigned long flags;
1075 u64 pte_root = virt_to_phys(domain->pt_root);
1076
1077 domain->dev_cnt += 1;
1078
1079 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1080 << DEV_ENTRY_MODE_SHIFT;
1081 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1082 1192
1083 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1084 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1194 __attach_device(iommu, domain, devid);
1085 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1086 amd_iommu_dev_table[devid].data[2] = domain->id;
1087
1088 amd_iommu_pd_table[devid] = domain;
1089 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1090 1196
1091 /* 1197 /*
1092 * We might boot into a crash-kernel here. The crashed kernel 1198 * We might boot into a crash-kernel here. The crashed kernel
1093 * left the caches in the IOMMU dirty. So we have to flush 1199 * left the caches in the IOMMU dirty. So we have to flush
1094 * here to evict all dirty stuff. 1200 * here to evict all dirty stuff.
1095 */ 1201 */
1096 iommu_queue_inv_dev_entry(iommu, devid); 1202 iommu_queue_inv_dev_entry(iommu, devid);
1097 iommu_flush_tlb_pde(iommu, domain->id); 1203 iommu_flush_tlb_pde(iommu, domain->id);
1098} 1204}
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
1119 1225
1120 /* ready */ 1226 /* ready */
1121 spin_unlock(&domain->lock); 1227 spin_unlock(&domain->lock);
1228
1229 /*
1230 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain
1232 */
1233 if (iommu_pass_through) {
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1122} 1237}
1123 1238
1124/* 1239/*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
1164 case BUS_NOTIFY_UNBOUND_DRIVER: 1279 case BUS_NOTIFY_UNBOUND_DRIVER:
1165 if (!domain) 1280 if (!domain)
1166 goto out; 1281 goto out;
1282 if (iommu_pass_through)
1283 break;
1167 detach_device(domain, devid); 1284 detach_device(domain, devid);
1168 break; 1285 break;
1169 case BUS_NOTIFY_ADD_DEVICE: 1286 case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
1292 return 1; 1409 return 1;
1293} 1410}
1294 1411
1412static void update_device_table(struct protection_domain *domain)
1413{
1414 unsigned long flags;
1415 int i;
1416
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
1418 if (amd_iommu_pd_table[i] != domain)
1419 continue;
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 }
1424}
1425
1426static void update_domain(struct protection_domain *domain)
1427{
1428 if (!domain->updated)
1429 return;
1430
1431 update_device_table(domain);
1432 flush_devices_by_domain(domain);
1433 iommu_flush_domain(domain->id);
1434
1435 domain->updated = false;
1436}
1437
1295/* 1438/*
1296 * If the pte_page is not yet allocated this function is called 1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1297 */ 1442 */
1298static u64* alloc_pte(struct protection_domain *dom, 1443static bool increase_address_space(struct protection_domain *domain,
1299 unsigned long address, u64 **pte_page, gfp_t gfp) 1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1300{ 1470{
1301 u64 *pte, *page; 1471 u64 *pte, *page;
1472 int level;
1302 1473
1303 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; 1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1304 1476
1305 if (!IOMMU_PTE_PRESENT(*pte)) { 1477 level = domain->mode - 1;
1306 page = (u64 *)get_zeroed_page(gfp); 1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1307 if (!page)
1308 return NULL;
1309 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1310 }
1311 1479
1312 pte = IOMMU_PTE_PAGE(*pte); 1480 while (level > end_lvl) {
1313 pte = &pte[IOMMU_PTE_L1_INDEX(address)]; 1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1314 1487
1315 if (!IOMMU_PTE_PRESENT(*pte)) { 1488 level -= 1;
1316 page = (u64 *)get_zeroed_page(gfp);
1317 if (!page)
1318 return NULL;
1319 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1320 }
1321 1489
1322 pte = IOMMU_PTE_PAGE(*pte); 1490 pte = IOMMU_PTE_PAGE(*pte);
1323 1491
1324 if (pte_page) 1492 if (pte_page && level == end_lvl)
1325 *pte_page = pte; 1493 *pte_page = pte;
1326 1494
1327 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1328 1497
1329 return pte; 1498 return pte;
1330} 1499}
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1344 1513
1345 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 1514 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1346 if (!pte) { 1515 if (!pte) {
1347 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); 1516 pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
1517 GFP_ATOMIC);
1348 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 1518 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1349 } else 1519 } else
1350 pte += IOMMU_PTE_L0_INDEX(address); 1520 pte += PM_LEVEL_INDEX(0, address);
1521
1522 update_domain(&dom->domain);
1351 1523
1352 return pte; 1524 return pte;
1353} 1525}
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1409 if (!pte) 1581 if (!pte)
1410 return; 1582 return;
1411 1583
1412 pte += IOMMU_PTE_L0_INDEX(address); 1584 pte += PM_LEVEL_INDEX(0, address);
1413 1585
1414 WARN_ON(!*pte); 1586 WARN_ON(!*pte);
1415 1587
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
1988 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1989} 2161}
1990 2162
1991static int amd_iommu_domain_init(struct iommu_domain *dom) 2163static void protection_domain_free(struct protection_domain *domain)
2164{
2165 if (!domain)
2166 return;
2167
2168 if (domain->id)
2169 domain_id_free(domain->id);
2170
2171 kfree(domain);
2172}
2173
2174static struct protection_domain *protection_domain_alloc(void)
1992{ 2175{
1993 struct protection_domain *domain; 2176 struct protection_domain *domain;
1994 2177
1995 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2178 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1996 if (!domain) 2179 if (!domain)
1997 return -ENOMEM; 2180 return NULL;
1998 2181
1999 spin_lock_init(&domain->lock); 2182 spin_lock_init(&domain->lock);
2000 domain->mode = PAGE_MODE_3_LEVEL;
2001 domain->id = domain_id_alloc(); 2183 domain->id = domain_id_alloc();
2002 if (!domain->id) 2184 if (!domain->id)
2185 goto out_err;
2186
2187 return domain;
2188
2189out_err:
2190 kfree(domain);
2191
2192 return NULL;
2193}
2194
2195static int amd_iommu_domain_init(struct iommu_domain *dom)
2196{
2197 struct protection_domain *domain;
2198
2199 domain = protection_domain_alloc();
2200 if (!domain)
2003 goto out_free; 2201 goto out_free;
2202
2203 domain->mode = PAGE_MODE_3_LEVEL;
2004 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2204 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2005 if (!domain->pt_root) 2205 if (!domain->pt_root)
2006 goto out_free; 2206 goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
2010 return 0; 2210 return 0;
2011 2211
2012out_free: 2212out_free:
2013 kfree(domain); 2213 protection_domain_free(domain);
2014 2214
2015 return -ENOMEM; 2215 return -ENOMEM;
2016} 2216}
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2115 paddr &= PAGE_MASK; 2315 paddr &= PAGE_MASK;
2116 2316
2117 for (i = 0; i < npages; ++i) { 2317 for (i = 0; i < npages; ++i) {
2118 ret = iommu_map_page(domain, iova, paddr, prot); 2318 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2119 if (ret) 2319 if (ret)
2120 return ret; 2320 return ret;
2121 2321
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2136 iova &= PAGE_MASK; 2336 iova &= PAGE_MASK;
2137 2337
2138 for (i = 0; i < npages; ++i) { 2338 for (i = 0; i < npages; ++i) {
2139 iommu_unmap_page(domain, iova); 2339 iommu_unmap_page(domain, iova, PM_MAP_4k);
2140 iova += PAGE_SIZE; 2340 iova += PAGE_SIZE;
2141 } 2341 }
2142 2342
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2151 phys_addr_t paddr; 2351 phys_addr_t paddr;
2152 u64 *pte; 2352 u64 *pte;
2153 2353
2154 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; 2354 pte = fetch_pte(domain, iova, PM_MAP_4k);
2155
2156 if (!IOMMU_PTE_PRESENT(*pte))
2157 return 0;
2158
2159 pte = IOMMU_PTE_PAGE(*pte);
2160 pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
2161
2162 if (!IOMMU_PTE_PRESENT(*pte))
2163 return 0;
2164
2165 pte = IOMMU_PTE_PAGE(*pte);
2166 pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
2167 2355
2168 if (!IOMMU_PTE_PRESENT(*pte)) 2356 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2169 return 0; 2357 return 0;
2170 2358
2171 paddr = *pte & IOMMU_PAGE_MASK; 2359 paddr = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
2191 .domain_has_cap = amd_iommu_domain_has_cap, 2379 .domain_has_cap = amd_iommu_domain_has_cap,
2192}; 2380};
2193 2381
2382/*****************************************************************************
2383 *
2384 * The next functions do a basic initialization of IOMMU for pass through
2385 * mode
2386 *
2387 * In passthrough mode the IOMMU is initialized and enabled but not used for
2388 * DMA-API translation.
2389 *
2390 *****************************************************************************/
2391
2392int __init amd_iommu_init_passthrough(void)
2393{
2394 struct pci_dev *dev = NULL;
2395 u16 devid, devid2;
2396
2397 /* allocate passthroug domain */
2398 pt_domain = protection_domain_alloc();
2399 if (!pt_domain)
2400 return -ENOMEM;
2401
2402 pt_domain->mode |= PAGE_MODE_NONE;
2403
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406
2407 devid = calc_devid(dev->bus->number, dev->devfn);
2408 if (devid > amd_iommu_last_bdf)
2409 continue;
2410
2411 devid2 = amd_iommu_alias_table[devid];
2412
2413 iommu = amd_iommu_rlookup_table[devid2];
2414 if (!iommu)
2415 continue;
2416
2417 __attach_device(iommu, pt_domain, devid);
2418 __attach_device(iommu, pt_domain, devid2);
2419 }
2420
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2422
2423 return 0;
2424}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252e..b4b61d462dcc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
252/* Function to enable the hardware */ 252/* Function to enable the hardware */
253static void iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
254{ 254{
255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
256 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
257 257
258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
435} 435}
436 436
437/* 437/*
438 * This function resets the command buffer if the IOMMU stopped fetching
439 * commands from it.
440 */
441void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
442{
443 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
444
445 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
446 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
447
448 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
449}
450
451/*
438 * This function writes the command buffer address to the hardware and 452 * This function writes the command buffer address to the hardware and
439 * enables it. 453 * enables it.
440 */ 454 */
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
450 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 464 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
451 &entry, sizeof(entry)); 465 &entry, sizeof(entry));
452 466
453 /* set head and tail to zero manually */ 467 amd_iommu_reset_cmd_buffer(iommu);
454 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
455 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
456
457 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
458} 468}
459 469
460static void __init free_command_buffer(struct amd_iommu *iommu) 470static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
858 switch (*p) { 868 switch (*p) {
859 case ACPI_IVHD_TYPE: 869 case ACPI_IVHD_TYPE:
860 870
861 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " 871 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
862 "seg: %d flags: %01x info %04x\n", 872 "seg: %d flags: %01x info %04x\n",
863 PCI_BUS(h->devid), PCI_SLOT(h->devid), 873 PCI_BUS(h->devid), PCI_SLOT(h->devid),
864 PCI_FUNC(h->devid), h->cap_ptr, 874 PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
902 912
903 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 913 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
904 IRQF_SAMPLE_RANDOM, 914 IRQF_SAMPLE_RANDOM,
905 "AMD IOMMU", 915 "AMD-Vi",
906 NULL); 916 NULL);
907 917
908 if (r) { 918 if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
1150 1160
1151 1161
1152 if (no_iommu) { 1162 if (no_iommu) {
1153 printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); 1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1154 return 0; 1164 return 0;
1155 } 1165 }
1156 1166
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
1242 if (ret) 1252 if (ret)
1243 goto free; 1253 goto free;
1244 1254
1245 ret = amd_iommu_init_dma_ops(); 1255 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough();
1257 else
1258 ret = amd_iommu_init_dma_ops();
1246 if (ret) 1259 if (ret)
1247 goto free; 1260 goto free;
1248 1261
1249 enable_iommus(); 1262 enable_iommus();
1250 1263
1251 printk(KERN_INFO "AMD IOMMU: device isolation "); 1264 if (iommu_pass_through)
1265 goto out;
1266
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1252 if (amd_iommu_isolate) 1268 if (amd_iommu_isolate)
1253 printk("enabled\n"); 1269 printk("enabled\n");
1254 else 1270 else
1255 printk("disabled\n"); 1271 printk("disabled\n");
1256 1272
1257 if (amd_iommu_unmap_flush) 1273 if (amd_iommu_unmap_flush)
1258 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); 1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1259 else 1275 else
1260 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); 1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1261 1277
1262out: 1278out:
1263 return ret; 1279 return ret;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 676debfc1702..128111d8ffe0 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,6 +20,7 @@
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/suspend.h> 22#include <linux/suspend.h>
23#include <linux/kmemleak.h>
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include <asm/iommu.h> 26#include <asm/iommu.h>
@@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void)
94 * code for safe 95 * code for safe
95 */ 96 */
96 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); 97 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(p);
97 if (!p || __pa(p)+aper_size > 0xffffffff) { 103 if (!p || __pa(p)+aper_size > 0xffffffff) {
98 printk(KERN_ERR 104 printk(KERN_ERR
99 "Cannot allocate aperture memory hole (%p,%uK)\n", 105 "Cannot allocate aperture memory hole (%p,%uK)\n",
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c2830ec66..894aa97f0717 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
20#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -35,7 +35,8 @@
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37 37
38#include <asm/perf_counter.h> 38#include <asm/perf_event.h>
39#include <asm/x86_init.h>
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
@@ -49,6 +50,7 @@
49#include <asm/mtrr.h> 50#include <asm/mtrr.h>
50#include <asm/smp.h> 51#include <asm/smp.h>
51#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h>
52 54
53unsigned int num_processors; 55unsigned int num_processors;
54 56
@@ -60,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;
60/* 62/*
61 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
62 * 64 *
63 * This determines the messaging protocol we can use: if all APIC IDs 65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
64 * are in the 0 ... 7 range, then we can use logical addressing which 66 * are in the 0 ... 7 range, then we can use logical addressing which
65 * has some performance advantages (better broadcasting). 67 * has some performance advantages (better broadcasting).
66 * 68 *
@@ -977,7 +979,7 @@ void lapic_shutdown(void)
977{ 979{
978 unsigned long flags; 980 unsigned long flags;
979 981
980 if (!cpu_has_apic) 982 if (!cpu_has_apic && !apic_from_smp_config())
981 return; 983 return;
982 984
983 local_irq_save(flags); 985 local_irq_save(flags);
@@ -1187,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1187 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1188 } 1190 }
1189#endif 1191#endif
1190 perf_counters_lapic_init(); 1192 perf_events_lapic_init();
1191 1193
1192 preempt_disable(); 1194 preempt_disable();
1193 1195
@@ -1195,8 +1197,7 @@ void __cpuinit setup_local_APIC(void)
1195 * Double-check whether this APIC is really registered. 1197 * Double-check whether this APIC is really registered.
1196 * This is meaningless in clustered apic mode, so we skip it. 1198 * This is meaningless in clustered apic mode, so we skip it.
1197 */ 1199 */
1198 if (!apic->apic_id_registered()) 1200 BUG_ON(!apic->apic_id_registered());
1199 BUG();
1200 1201
1201 /* 1202 /*
1202 * Intel recommends to set DFR, LDR and TPR before enabling 1203 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1361,52 +1362,80 @@ void enable_x2apic(void)
1361} 1362}
1362#endif /* CONFIG_X86_X2APIC */ 1363#endif /* CONFIG_X86_X2APIC */
1363 1364
1364void __init enable_IR_x2apic(void) 1365int __init enable_IR(void)
1365{ 1366{
1366#ifdef CONFIG_INTR_REMAP 1367#ifdef CONFIG_INTR_REMAP
1367 int ret;
1368 unsigned long flags;
1369 struct IO_APIC_route_entry **ioapic_entries = NULL;
1370
1371 ret = dmar_table_init();
1372 if (ret) {
1373 pr_debug("dmar_table_init() failed with %d:\n", ret);
1374 goto ir_failed;
1375 }
1376
1377 if (!intr_remapping_supported()) { 1368 if (!intr_remapping_supported()) {
1378 pr_debug("intr-remapping not supported\n"); 1369 pr_debug("intr-remapping not supported\n");
1379 goto ir_failed; 1370 return 0;
1380 } 1371 }
1381 1372
1382
1383 if (!x2apic_preenabled && skip_ioapic_setup) { 1373 if (!x2apic_preenabled && skip_ioapic_setup) {
1384 pr_info("Skipped enabling intr-remap because of skipping " 1374 pr_info("Skipped enabling intr-remap because of skipping "
1385 "io-apic setup\n"); 1375 "io-apic setup\n");
1386 return; 1376 return 0;
1387 } 1377 }
1388 1378
1379 if (enable_intr_remapping(x2apic_supported()))
1380 return 0;
1381
1382 pr_info("Enabled Interrupt-remapping\n");
1383
1384 return 1;
1385
1386#endif
1387 return 0;
1388}
1389
1390void __init enable_IR_x2apic(void)
1391{
1392 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0;
1396
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret)
1400 pr_debug("dmar_table_init() failed with %d:\n",
1401 dmar_table_init_ret);
1402#endif
1403
1389 ioapic_entries = alloc_ioapic_entries(); 1404 ioapic_entries = alloc_ioapic_entries();
1390 if (!ioapic_entries) { 1405 if (!ioapic_entries) {
1391 pr_info("Allocate ioapic_entries failed: %d\n", ret); 1406 pr_err("Allocate ioapic_entries failed\n");
1392 goto end; 1407 goto out;
1393 } 1408 }
1394 1409
1395 ret = save_IO_APIC_setup(ioapic_entries); 1410 ret = save_IO_APIC_setup(ioapic_entries);
1396 if (ret) { 1411 if (ret) {
1397 pr_info("Saving IO-APIC state failed: %d\n", ret); 1412 pr_info("Saving IO-APIC state failed: %d\n", ret);
1398 goto end; 1413 goto out;
1399 } 1414 }
1400 1415
1401 local_irq_save(flags); 1416 local_irq_save(flags);
1402 mask_IO_APIC_setup(ioapic_entries);
1403 mask_8259A(); 1417 mask_8259A();
1418 mask_IO_APIC_setup(ioapic_entries);
1404 1419
1405 ret = enable_intr_remapping(x2apic_supported()); 1420 if (dmar_table_init_ret)
1406 if (ret) 1421 ret = 0;
1407 goto end_restore; 1422 else
1423 ret = enable_IR();
1408 1424
1409 pr_info("Enabled Interrupt-remapping\n"); 1425 if (!ret) {
1426 /* IR is required if there is APIC ID > 255 even when running
1427 * under KVM
1428 */
1429 if (max_physical_apicid > 255 || !kvm_para_available())
1430 goto nox2apic;
1431 /*
1432 * without IR all CPUs can be addressed by IOAPIC/MSI
1433 * only in physical mode
1434 */
1435 x2apic_force_phys();
1436 }
1437
1438 x2apic_enabled = 1;
1410 1439
1411 if (x2apic_supported() && !x2apic_mode) { 1440 if (x2apic_supported() && !x2apic_mode) {
1412 x2apic_mode = 1; 1441 x2apic_mode = 1;
@@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void)
1414 pr_info("Enabled x2apic\n"); 1443 pr_info("Enabled x2apic\n");
1415 } 1444 }
1416 1445
1417end_restore: 1446nox2apic:
1418 if (ret) 1447 if (!ret) /* IR enabling failed */
1419 /*
1420 * IR enabling failed
1421 */
1422 restore_IO_APIC_setup(ioapic_entries); 1448 restore_IO_APIC_setup(ioapic_entries);
1423
1424 unmask_8259A(); 1449 unmask_8259A();
1425 local_irq_restore(flags); 1450 local_irq_restore(flags);
1426 1451
1427end: 1452out:
1428 if (ioapic_entries) 1453 if (ioapic_entries)
1429 free_ioapic_entries(ioapic_entries); 1454 free_ioapic_entries(ioapic_entries);
1430 1455
1431 if (!ret) 1456 if (x2apic_enabled)
1432 return; 1457 return;
1433 1458
1434ir_failed:
1435 if (x2apic_preenabled) 1459 if (x2apic_preenabled)
1436 panic("x2apic enabled by bios. But IR enabling failed"); 1460 panic("x2apic: enabled by BIOS but kernel init failed.");
1437 else if (cpu_has_x2apic) 1461 else if (cpu_has_x2apic)
1438 pr_info("Not enabling x2apic,Intr-remapping\n"); 1462 pr_info("Not enabling x2apic, Intr-remapping init failed.\n");
1439#else
1440 if (!cpu_has_x2apic)
1441 return;
1442
1443 if (x2apic_preenabled)
1444 panic("x2apic enabled prior OS handover,"
1445 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1446#endif
1447
1448 return;
1449} 1463}
1450 1464
1451
1452#ifdef CONFIG_X86_64 1465#ifdef CONFIG_X86_64
1453/* 1466/*
1454 * Detect and enable local APICs on non-SMP boards. 1467 * Detect and enable local APICs on non-SMP boards.
@@ -1549,8 +1562,6 @@ no_apic:
1549#ifdef CONFIG_X86_64 1562#ifdef CONFIG_X86_64
1550void __init early_init_lapic_mapping(void) 1563void __init early_init_lapic_mapping(void)
1551{ 1564{
1552 unsigned long phys_addr;
1553
1554 /* 1565 /*
1555 * If no local APIC can be found then go out 1566 * If no local APIC can be found then go out
1556 * : it means there is no mpatable and MADT 1567 * : it means there is no mpatable and MADT
@@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void)
1558 if (!smp_found_config) 1569 if (!smp_found_config)
1559 return; 1570 return;
1560 1571
1561 phys_addr = mp_lapic_addr; 1572 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1562
1563 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1564 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", 1573 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1565 APIC_BASE, phys_addr); 1574 APIC_BASE, mp_lapic_addr);
1566 1575
1567 /* 1576 /*
1568 * Fetch the APIC ID of the BSP in case we have a 1577 * Fetch the APIC ID of the BSP in case we have a
@@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void)
1651 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1660 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1652 pr_err("BIOS bug, local APIC 0x%x not detected!...\n", 1661 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
1653 boot_cpu_physical_apicid); 1662 boot_cpu_physical_apicid);
1654 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1655 return -1; 1663 return -1;
1656 } 1664 }
1657#endif 1665#endif
@@ -1701,7 +1709,7 @@ int __init APIC_init_uniprocessor(void)
1701 localise_nmi_watchdog(); 1709 localise_nmi_watchdog();
1702#endif 1710#endif
1703 1711
1704 setup_boot_clock(); 1712 x86_init.timers.setup_percpu_clockev();
1705#ifdef CONFIG_X86_64 1713#ifdef CONFIG_X86_64
1706 check_nmi_watchdog(); 1714 check_nmi_watchdog();
1707#endif 1715#endif
@@ -1908,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version)
1908 max_physical_apicid = apicid; 1916 max_physical_apicid = apicid;
1909 1917
1910#ifdef CONFIG_X86_32 1918#ifdef CONFIG_X86_32
1911 /* 1919 switch (boot_cpu_data.x86_vendor) {
1912 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1920 case X86_VENDOR_INTEL:
1913 * but we need to work other dependencies like SMP_SUSPEND etc 1921 if (num_processors > 8)
1914 * before this can be done without some confusion. 1922 def_to_bigsmp = 1;
1915 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) 1923 break;
1916 * - Ashok Raj <ashok.raj@intel.com> 1924 case X86_VENDOR_AMD:
1917 */ 1925 if (max_physical_apicid >= 8)
1918 if (max_physical_apicid >= 8) {
1919 switch (boot_cpu_data.x86_vendor) {
1920 case X86_VENDOR_INTEL:
1921 if (!APIC_XAPIC(version)) {
1922 def_to_bigsmp = 0;
1923 break;
1924 }
1925 /* If P4 and above fall through */
1926 case X86_VENDOR_AMD:
1927 def_to_bigsmp = 1; 1926 def_to_bigsmp = 1;
1928 }
1929 } 1927 }
1930#endif 1928#endif
1931 1929
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 676cdac385c0..77a06413b6b2 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
112 return physids_promote(0xFFL); 112 return physids_promote(0xFFL);
113} 113}
114 114
115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) 115static int bigsmp_check_phys_apicid_present(int phys_apicid)
116{ 116{
117 return 1; 117 return 1;
118} 118}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8952a5890281..89174f847b49 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void)
167{ 167{
168 /* MPENTIUMIII */ 168 /* MPENTIUMIII */
169 if (boot_cpu_data.x86 == 6 && 169 if (boot_cpu_data.x86 == 6 &&
170 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) 170 (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d2ed6c5ddc80..dc69f28489f5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -66,6 +66,8 @@
66#include <asm/apic.h> 66#include <asm/apic.h>
67 67
68#define __apicdebuginit(type) static type __init 68#define __apicdebuginit(type) static type __init
69#define for_each_irq_pin(entry, head) \
70 for (entry = head; entry; entry = entry->next)
69 71
70/* 72/*
71 * Is the SiS APIC rmw bug present ? 73 * Is the SiS APIC rmw bug present ?
@@ -85,12 +87,20 @@ int nr_ioapic_registers[MAX_IO_APICS];
85struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; 87struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 88int nr_ioapics;
87 89
90/* IO APIC gsi routing info */
91struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
92
88/* MP IRQ source entries */ 93/* MP IRQ source entries */
89struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 94struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 95
91/* # of MP IRQ source entries */ 96/* # of MP IRQ source entries */
92int mp_irq_entries; 97int mp_irq_entries;
93 98
99/* Number of legacy interrupts */
100static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
101/* GSI interrupts */
102static int nr_irqs_gsi = NR_IRQS_LEGACY;
103
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 104#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES]; 105int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif 106#endif
@@ -116,15 +126,6 @@ static int __init parse_noapic(char *str)
116} 126}
117early_param("noapic", parse_noapic); 127early_param("noapic", parse_noapic);
118 128
119struct irq_pin_list;
120
121/*
122 * This is performance-critical, we want to do it O(1)
123 *
124 * the indexing order of this array favors 1:1 mappings
125 * between pins and IRQs.
126 */
127
128struct irq_pin_list { 129struct irq_pin_list {
129 int apic, pin; 130 int apic, pin;
130 struct irq_pin_list *next; 131 struct irq_pin_list *next;
@@ -139,6 +140,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
139 return pin; 140 return pin;
140} 141}
141 142
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
142struct irq_cfg { 148struct irq_cfg {
143 struct irq_pin_list *irq_2_pin; 149 struct irq_pin_list *irq_2_pin;
144 cpumask_var_t domain; 150 cpumask_var_t domain;
@@ -172,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
172 [15] = { .vector = IRQ15_VECTOR, }, 178 [15] = { .vector = IRQ15_VECTOR, },
173}; 179};
174 180
181void __init io_apic_disable_legacy(void)
182{
183 nr_legacy_irqs = 0;
184 nr_irqs_gsi = 0;
185}
186
175int __init arch_early_irq_init(void) 187int __init arch_early_irq_init(void)
176{ 188{
177 struct irq_cfg *cfg; 189 struct irq_cfg *cfg;
@@ -189,7 +201,7 @@ int __init arch_early_irq_init(void)
189 desc->chip_data = &cfg[i]; 201 desc->chip_data = &cfg[i];
190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 202 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 203 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
192 if (i < NR_IRQS_LEGACY) 204 if (i < nr_legacy_irqs)
193 cpumask_setall(cfg[i].domain); 205 cpumask_setall(cfg[i].domain);
194 } 206 }
195 207
@@ -215,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
215 227
216 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 228 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
217 if (cfg) { 229 if (cfg) {
218 if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 230 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
219 kfree(cfg); 231 kfree(cfg);
220 cfg = NULL; 232 cfg = NULL;
221 } else if (!alloc_cpumask_var_node(&cfg->old_domain, 233 } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
222 GFP_ATOMIC, node)) { 234 GFP_ATOMIC, node)) {
223 free_cpumask_var(cfg->domain); 235 free_cpumask_var(cfg->domain);
224 kfree(cfg); 236 kfree(cfg);
225 cfg = NULL; 237 cfg = NULL;
226 } else {
227 cpumask_clear(cfg->domain);
228 cpumask_clear(cfg->old_domain);
229 } 238 }
230 } 239 }
231 240
@@ -414,13 +423,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
414 unsigned long flags; 423 unsigned long flags;
415 424
416 spin_lock_irqsave(&ioapic_lock, flags); 425 spin_lock_irqsave(&ioapic_lock, flags);
417 entry = cfg->irq_2_pin; 426 for_each_irq_pin(entry, cfg->irq_2_pin) {
418 for (;;) {
419 unsigned int reg; 427 unsigned int reg;
420 int pin; 428 int pin;
421 429
422 if (!entry)
423 break;
424 pin = entry->pin; 430 pin = entry->pin;
425 reg = io_apic_read(entry->apic, 0x10 + pin*2); 431 reg = io_apic_read(entry->apic, 0x10 + pin*2);
426 /* Is the remote IRR bit set? */ 432 /* Is the remote IRR bit set? */
@@ -428,9 +434,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
428 spin_unlock_irqrestore(&ioapic_lock, flags); 434 spin_unlock_irqrestore(&ioapic_lock, flags);
429 return true; 435 return true;
430 } 436 }
431 if (!entry->next)
432 break;
433 entry = entry->next;
434 } 437 }
435 spin_unlock_irqrestore(&ioapic_lock, flags); 438 spin_unlock_irqrestore(&ioapic_lock, flags);
436 439
@@ -498,72 +501,68 @@ static void ioapic_mask_entry(int apic, int pin)
498 * shared ISA-space IRQs, so we have to support them. We are super 501 * shared ISA-space IRQs, so we have to support them. We are super
499 * fast in the common case, and fast for shared ISA-space IRQs. 502 * fast in the common case, and fast for shared ISA-space IRQs.
500 */ 503 */
501static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 504static int
505add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
502{ 506{
503 struct irq_pin_list *entry; 507 struct irq_pin_list **last, *entry;
504 508
505 entry = cfg->irq_2_pin; 509 /* don't allow duplicates */
506 if (!entry) { 510 last = &cfg->irq_2_pin;
507 entry = get_one_free_irq_2_pin(node); 511 for_each_irq_pin(entry, cfg->irq_2_pin) {
508 if (!entry) {
509 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
510 apic, pin);
511 return;
512 }
513 cfg->irq_2_pin = entry;
514 entry->apic = apic;
515 entry->pin = pin;
516 return;
517 }
518
519 while (entry->next) {
520 /* not again, please */
521 if (entry->apic == apic && entry->pin == pin) 512 if (entry->apic == apic && entry->pin == pin)
522 return; 513 return 0;
523 514 last = &entry->next;
524 entry = entry->next;
525 } 515 }
526 516
527 entry->next = get_one_free_irq_2_pin(node); 517 entry = get_one_free_irq_2_pin(node);
528 entry = entry->next; 518 if (!entry) {
519 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
520 node, apic, pin);
521 return -ENOMEM;
522 }
529 entry->apic = apic; 523 entry->apic = apic;
530 entry->pin = pin; 524 entry->pin = pin;
525
526 *last = entry;
527 return 0;
528}
529
530static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
531{
532 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
533 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
531} 534}
532 535
533/* 536/*
534 * Reroute an IRQ to a different pin. 537 * Reroute an IRQ to a different pin.
535 */ 538 */
536static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, 539static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
537 int oldapic, int oldpin, 540 int oldapic, int oldpin,
538 int newapic, int newpin) 541 int newapic, int newpin)
539{ 542{
540 struct irq_pin_list *entry = cfg->irq_2_pin; 543 struct irq_pin_list *entry;
541 int replaced = 0;
542 544
543 while (entry) { 545 for_each_irq_pin(entry, cfg->irq_2_pin) {
544 if (entry->apic == oldapic && entry->pin == oldpin) { 546 if (entry->apic == oldapic && entry->pin == oldpin) {
545 entry->apic = newapic; 547 entry->apic = newapic;
546 entry->pin = newpin; 548 entry->pin = newpin;
547 replaced = 1;
548 /* every one is different, right? */ 549 /* every one is different, right? */
549 break; 550 return;
550 } 551 }
551 entry = entry->next;
552 } 552 }
553 553
554 /* why? call replace before add? */ 554 /* old apic/pin didn't exist, so just add new ones */
555 if (!replaced) 555 add_pin_to_irq_node(cfg, node, newapic, newpin);
556 add_pin_to_irq_node(cfg, node, newapic, newpin);
557} 556}
558 557
559static inline void io_apic_modify_irq(struct irq_cfg *cfg, 558static void io_apic_modify_irq(struct irq_cfg *cfg,
560 int mask_and, int mask_or, 559 int mask_and, int mask_or,
561 void (*final)(struct irq_pin_list *entry)) 560 void (*final)(struct irq_pin_list *entry))
562{ 561{
563 int pin; 562 int pin;
564 struct irq_pin_list *entry; 563 struct irq_pin_list *entry;
565 564
566 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 565 for_each_irq_pin(entry, cfg->irq_2_pin) {
567 unsigned int reg; 566 unsigned int reg;
568 pin = entry->pin; 567 pin = entry->pin;
569 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 568 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
@@ -580,7 +579,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
580 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); 579 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
581} 580}
582 581
583#ifdef CONFIG_X86_64
584static void io_apic_sync(struct irq_pin_list *entry) 582static void io_apic_sync(struct irq_pin_list *entry)
585{ 583{
586 /* 584 /*
@@ -596,11 +594,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
596{ 594{
597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
598} 596}
599#else /* CONFIG_X86_32 */
600static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
601{
602 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
603}
604 597
605static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) 598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
606{ 599{
@@ -613,7 +606,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
613 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
614 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
615} 608}
616#endif /* CONFIG_X86_32 */
617 609
618static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 610static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
619{ 611{
@@ -883,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type)
883 */ 875 */
884static int EISA_ELCR(unsigned int irq) 876static int EISA_ELCR(unsigned int irq)
885{ 877{
886 if (irq < NR_IRQS_LEGACY) { 878 if (irq < nr_legacy_irqs) {
887 unsigned int port = 0x4d0 + (irq >> 3); 879 unsigned int port = 0x4d0 + (irq >> 3);
888 return (inb(port) >> (irq & 7)) & 1; 880 return (inb(port) >> (irq & 7)) & 1;
889 } 881 }
@@ -1480,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1480 } 1472 }
1481 1473
1482 ioapic_register_intr(irq, desc, trigger); 1474 ioapic_register_intr(irq, desc, trigger);
1483 if (irq < NR_IRQS_LEGACY) 1475 if (irq < nr_legacy_irqs)
1484 disable_8259A_irq(irq); 1476 disable_8259A_irq(irq);
1485 1477
1486 ioapic_write_entry(apic_id, pin, entry); 1478 ioapic_write_entry(apic_id, pin, entry);
@@ -1702,12 +1694,8 @@ __apicdebuginit(void) print_IO_APIC(void)
1702 if (!entry) 1694 if (!entry)
1703 continue; 1695 continue;
1704 printk(KERN_DEBUG "IRQ%d ", irq); 1696 printk(KERN_DEBUG "IRQ%d ", irq);
1705 for (;;) { 1697 for_each_irq_pin(entry, cfg->irq_2_pin)
1706 printk("-> %d:%d", entry->apic, entry->pin); 1698 printk("-> %d:%d", entry->apic, entry->pin);
1707 if (!entry->next)
1708 break;
1709 entry = entry->next;
1710 }
1711 printk("\n"); 1699 printk("\n");
1712 } 1700 }
1713 1701
@@ -1851,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void)
1851 unsigned int v; 1839 unsigned int v;
1852 unsigned long flags; 1840 unsigned long flags;
1853 1841
1854 if (apic_verbosity == APIC_QUIET) 1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
1855 return; 1843 return;
1856 1844
1857 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1845 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1883,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void)
1883 print_PIC(); 1871 print_PIC();
1884 1872
1885 /* don't print out if apic is not there */ 1873 /* don't print out if apic is not there */
1886 if (!cpu_has_apic || disable_apic) 1874 if (!cpu_has_apic && !apic_from_smp_config())
1887 return 0; 1875 return 0;
1888 1876
1889 print_all_local_APICs(); 1877 print_all_local_APICs();
@@ -1914,6 +1902,10 @@ void __init enable_IO_APIC(void)
1914 spin_unlock_irqrestore(&ioapic_lock, flags); 1902 spin_unlock_irqrestore(&ioapic_lock, flags);
1915 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1903 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1916 } 1904 }
1905
1906 if (!nr_legacy_irqs)
1907 return;
1908
1917 for(apic = 0; apic < nr_ioapics; apic++) { 1909 for(apic = 0; apic < nr_ioapics; apic++) {
1918 int pin; 1910 int pin;
1919 /* See if any of the pins is in ExtINT mode */ 1911 /* See if any of the pins is in ExtINT mode */
@@ -1968,6 +1960,9 @@ void disable_IO_APIC(void)
1968 */ 1960 */
1969 clear_IO_APIC(); 1961 clear_IO_APIC();
1970 1962
1963 if (!nr_legacy_irqs)
1964 return;
1965
1971 /* 1966 /*
1972 * If the i8259 is routed through an IOAPIC 1967 * If the i8259 is routed through an IOAPIC
1973 * Put that IOAPIC in virtual wire mode 1968 * Put that IOAPIC in virtual wire mode
@@ -2001,7 +1996,7 @@ void disable_IO_APIC(void)
2001 /* 1996 /*
2002 * Use virtual wire A mode when interrupt remapping is enabled. 1997 * Use virtual wire A mode when interrupt remapping is enabled.
2003 */ 1998 */
2004 if (cpu_has_apic) 1999 if (cpu_has_apic || apic_from_smp_config())
2005 disconnect_bsp_APIC(!intr_remapping_enabled && 2000 disconnect_bsp_APIC(!intr_remapping_enabled &&
2006 ioapic_i8259.pin != -1); 2001 ioapic_i8259.pin != -1);
2007} 2002}
@@ -2014,7 +2009,7 @@ void disable_IO_APIC(void)
2014 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 2009 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
2015 */ 2010 */
2016 2011
2017static void __init setup_ioapic_ids_from_mpc(void) 2012void __init setup_ioapic_ids_from_mpc(void)
2018{ 2013{
2019 union IO_APIC_reg_00 reg_00; 2014 union IO_APIC_reg_00 reg_00;
2020 physid_mask_t phys_id_present_map; 2015 physid_mask_t phys_id_present_map;
@@ -2023,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
2023 unsigned char old_id; 2018 unsigned char old_id;
2024 unsigned long flags; 2019 unsigned long flags;
2025 2020
2026 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) 2021 if (acpi_ioapic)
2027 return; 2022 return;
2028
2029 /* 2023 /*
2030 * Don't check I/O APIC IDs for xAPIC systems. They have 2024 * Don't check I/O APIC IDs for xAPIC systems. They have
2031 * no meaning without the serial APIC bus. 2025 * no meaning without the serial APIC bus.
@@ -2199,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2199 struct irq_cfg *cfg; 2193 struct irq_cfg *cfg;
2200 2194
2201 spin_lock_irqsave(&ioapic_lock, flags); 2195 spin_lock_irqsave(&ioapic_lock, flags);
2202 if (irq < NR_IRQS_LEGACY) { 2196 if (irq < nr_legacy_irqs) {
2203 disable_8259A_irq(irq); 2197 disable_8259A_irq(irq);
2204 if (i8259A_irq_pending(irq)) 2198 if (i8259A_irq_pending(irq))
2205 was_pending = 1; 2199 was_pending = 1;
@@ -2211,7 +2205,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2211 return was_pending; 2205 return was_pending;
2212} 2206}
2213 2207
2214#ifdef CONFIG_X86_64
2215static int ioapic_retrigger_irq(unsigned int irq) 2208static int ioapic_retrigger_irq(unsigned int irq)
2216{ 2209{
2217 2210
@@ -2224,14 +2217,6 @@ static int ioapic_retrigger_irq(unsigned int irq)
2224 2217
2225 return 1; 2218 return 1;
2226} 2219}
2227#else
2228static int ioapic_retrigger_irq(unsigned int irq)
2229{
2230 apic->send_IPI_self(irq_cfg(irq)->vector);
2231
2232 return 1;
2233}
2234#endif
2235 2220
2236/* 2221/*
2237 * Level and edge triggered IO-APIC interrupts need different handling, 2222 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2269,13 +2254,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2269 struct irq_pin_list *entry; 2254 struct irq_pin_list *entry;
2270 u8 vector = cfg->vector; 2255 u8 vector = cfg->vector;
2271 2256
2272 entry = cfg->irq_2_pin; 2257 for_each_irq_pin(entry, cfg->irq_2_pin) {
2273 for (;;) {
2274 unsigned int reg; 2258 unsigned int reg;
2275 2259
2276 if (!entry)
2277 break;
2278
2279 apic = entry->apic; 2260 apic = entry->apic;
2280 pin = entry->pin; 2261 pin = entry->pin;
2281 /* 2262 /*
@@ -2288,9 +2269,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2288 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2269 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2289 reg |= vector; 2270 reg |= vector;
2290 io_apic_modify(apic, 0x10 + pin*2, reg); 2271 io_apic_modify(apic, 0x10 + pin*2, reg);
2291 if (!entry->next)
2292 break;
2293 entry = entry->next;
2294 } 2272 }
2295} 2273}
2296 2274
@@ -2515,11 +2493,8 @@ atomic_t irq_mis_count;
2515static void ack_apic_level(unsigned int irq) 2493static void ack_apic_level(unsigned int irq)
2516{ 2494{
2517 struct irq_desc *desc = irq_to_desc(irq); 2495 struct irq_desc *desc = irq_to_desc(irq);
2518
2519#ifdef CONFIG_X86_32
2520 unsigned long v; 2496 unsigned long v;
2521 int i; 2497 int i;
2522#endif
2523 struct irq_cfg *cfg; 2498 struct irq_cfg *cfg;
2524 int do_unmask_irq = 0; 2499 int do_unmask_irq = 0;
2525 2500
@@ -2532,31 +2507,28 @@ static void ack_apic_level(unsigned int irq)
2532 } 2507 }
2533#endif 2508#endif
2534 2509
2535#ifdef CONFIG_X86_32
2536 /* 2510 /*
2537 * It appears there is an erratum which affects at least version 0x11 2511 * It appears there is an erratum which affects at least version 0x11
2538 * of I/O APIC (that's the 82093AA and cores integrated into various 2512 * of I/O APIC (that's the 82093AA and cores integrated into various
2539 * chipsets). Under certain conditions a level-triggered interrupt is 2513 * chipsets). Under certain conditions a level-triggered interrupt is
2540 * erroneously delivered as edge-triggered one but the respective IRR 2514 * erroneously delivered as edge-triggered one but the respective IRR
2541 * bit gets set nevertheless. As a result the I/O unit expects an EOI 2515 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2542 * message but it will never arrive and further interrupts are blocked 2516 * message but it will never arrive and further interrupts are blocked
2543 * from the source. The exact reason is so far unknown, but the 2517 * from the source. The exact reason is so far unknown, but the
2544 * phenomenon was observed when two consecutive interrupt requests 2518 * phenomenon was observed when two consecutive interrupt requests
2545 * from a given source get delivered to the same CPU and the source is 2519 * from a given source get delivered to the same CPU and the source is
2546 * temporarily disabled in between. 2520 * temporarily disabled in between.
2547 * 2521 *
2548 * A workaround is to simulate an EOI message manually. We achieve it 2522 * A workaround is to simulate an EOI message manually. We achieve it
2549 * by setting the trigger mode to edge and then to level when the edge 2523 * by setting the trigger mode to edge and then to level when the edge
2550 * trigger mode gets detected in the TMR of a local APIC for a 2524 * trigger mode gets detected in the TMR of a local APIC for a
2551 * level-triggered interrupt. We mask the source for the time of the 2525 * level-triggered interrupt. We mask the source for the time of the
2552 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2526 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2553 * The idea is from Manfred Spraul. --macro 2527 * The idea is from Manfred Spraul. --macro
2554 */ 2528 */
2555 cfg = desc->chip_data; 2529 cfg = desc->chip_data;
2556 i = cfg->vector; 2530 i = cfg->vector;
2557
2558 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2531 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2559#endif
2560 2532
2561 /* 2533 /*
2562 * We must acknowledge the irq before we move it or the acknowledge will 2534 * We must acknowledge the irq before we move it or the acknowledge will
@@ -2598,7 +2570,7 @@ static void ack_apic_level(unsigned int irq)
2598 unmask_IO_APIC_irq_desc(desc); 2570 unmask_IO_APIC_irq_desc(desc);
2599 } 2571 }
2600 2572
2601#ifdef CONFIG_X86_32 2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2602 if (!(v & (1 << (i & 0x1f)))) { 2574 if (!(v & (1 << (i & 0x1f)))) {
2603 atomic_inc(&irq_mis_count); 2575 atomic_inc(&irq_mis_count);
2604 spin_lock(&ioapic_lock); 2576 spin_lock(&ioapic_lock);
@@ -2606,26 +2578,15 @@ static void ack_apic_level(unsigned int irq)
2606 __unmask_and_level_IO_APIC_irq(cfg); 2578 __unmask_and_level_IO_APIC_irq(cfg);
2607 spin_unlock(&ioapic_lock); 2579 spin_unlock(&ioapic_lock);
2608 } 2580 }
2609#endif
2610} 2581}
2611 2582
2612#ifdef CONFIG_INTR_REMAP 2583#ifdef CONFIG_INTR_REMAP
2613static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2614{ 2585{
2615 int apic, pin;
2616 struct irq_pin_list *entry; 2586 struct irq_pin_list *entry;
2617 2587
2618 entry = cfg->irq_2_pin; 2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2619 for (;;) { 2589 io_apic_eoi(entry->apic, entry->pin);
2620
2621 if (!entry)
2622 break;
2623
2624 apic = entry->apic;
2625 pin = entry->pin;
2626 io_apic_eoi(apic, pin);
2627 entry = entry->next;
2628 }
2629} 2590}
2630 2591
2631static void 2592static void
@@ -2710,7 +2671,7 @@ static inline void init_IO_APIC_traps(void)
2710 * so default to an old-fashioned 8259 2671 * so default to an old-fashioned 8259
2711 * interrupt if we can.. 2672 * interrupt if we can..
2712 */ 2673 */
2713 if (irq < NR_IRQS_LEGACY) 2674 if (irq < nr_legacy_irqs)
2714 make_8259A_irq(irq); 2675 make_8259A_irq(irq);
2715 else 2676 else
2716 /* Strange. Oh, well.. */ 2677 /* Strange. Oh, well.. */
@@ -3046,7 +3007,7 @@ out:
3046 * the I/O APIC in all cases now. No actual device should request 3007 * the I/O APIC in all cases now. No actual device should request
3047 * it anyway. --macro 3008 * it anyway. --macro
3048 */ 3009 */
3049#define PIC_IRQS (1 << PIC_CASCADE_IR) 3010#define PIC_IRQS (1UL << PIC_CASCADE_IR)
3050 3011
3051void __init setup_IO_APIC(void) 3012void __init setup_IO_APIC(void)
3052{ 3013{
@@ -3054,21 +3015,19 @@ void __init setup_IO_APIC(void)
3054 /* 3015 /*
3055 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3016 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3056 */ 3017 */
3057 3018 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3058 io_apic_irqs = ~PIC_IRQS;
3059 3019
3060 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3020 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3061 /* 3021 /*
3062 * Set up IO-APIC IRQ routing. 3022 * Set up IO-APIC IRQ routing.
3063 */ 3023 */
3064#ifdef CONFIG_X86_32 3024 x86_init.mpparse.setup_ioapic_ids();
3065 if (!acpi_ioapic) 3025
3066 setup_ioapic_ids_from_mpc();
3067#endif
3068 sync_Arb_IDs(); 3026 sync_Arb_IDs();
3069 setup_IO_APIC_irqs(); 3027 setup_IO_APIC_irqs();
3070 init_IO_APIC_traps(); 3028 init_IO_APIC_traps();
3071 check_timer(); 3029 if (nr_legacy_irqs)
3030 check_timer();
3072} 3031}
3073 3032
3074/* 3033/*
@@ -3169,7 +3128,6 @@ static int __init ioapic_init_sysfs(void)
3169 3128
3170device_initcall(ioapic_init_sysfs); 3129device_initcall(ioapic_init_sysfs);
3171 3130
3172static int nr_irqs_gsi = NR_IRQS_LEGACY;
3173/* 3131/*
3174 * Dynamic irq allocate and deallocation 3132 * Dynamic irq allocate and deallocation
3175 */ 3133 */
@@ -3241,8 +3199,7 @@ void destroy_irq(unsigned int irq)
3241 cfg = desc->chip_data; 3199 cfg = desc->chip_data;
3242 dynamic_irq_cleanup(irq); 3200 dynamic_irq_cleanup(irq);
3243 /* connect back irq_cfg */ 3201 /* connect back irq_cfg */
3244 if (desc) 3202 desc->chip_data = cfg;
3245 desc->chip_data = cfg;
3246 3203
3247 free_irte(irq); 3204 free_irte(irq);
3248 spin_lock_irqsave(&vector_lock, flags); 3205 spin_lock_irqsave(&vector_lock, flags);
@@ -3910,9 +3867,13 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3910 /* 3867 /*
3911 * IRQs < 16 are already in the irq_2_pin[] map 3868 * IRQs < 16 are already in the irq_2_pin[] map
3912 */ 3869 */
3913 if (irq >= NR_IRQS_LEGACY) { 3870 if (irq >= nr_legacy_irqs) {
3914 cfg = desc->chip_data; 3871 cfg = desc->chip_data;
3915 add_pin_to_irq_node(cfg, node, ioapic, pin); 3872 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3873 printk(KERN_INFO "can not add pin %d for irq %d\n",
3874 pin, irq);
3875 return 0;
3876 }
3916 } 3877 }
3917 3878
3918 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); 3879 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
@@ -3941,11 +3902,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq,
3941 return __io_apic_set_pci_routing(dev, irq, irq_attr); 3902 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3942} 3903}
3943 3904
3944/* -------------------------------------------------------------------------- 3905u8 __init io_apic_unique_id(u8 id)
3945 ACPI-based IOAPIC Configuration 3906{
3946 -------------------------------------------------------------------------- */ 3907#ifdef CONFIG_X86_32
3908 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3909 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3910 return io_apic_get_unique_id(nr_ioapics, id);
3911 else
3912 return id;
3913#else
3914 int i;
3915 DECLARE_BITMAP(used, 256);
3947 3916
3948#ifdef CONFIG_ACPI 3917 bitmap_zero(used, 256);
3918 for (i = 0; i < nr_ioapics; i++) {
3919 struct mpc_ioapic *ia = &mp_ioapics[i];
3920 __set_bit(ia->apicid, used);
3921 }
3922 if (!test_bit(id, used))
3923 return id;
3924 return find_first_zero_bit(used, 256);
3925#endif
3926}
3949 3927
3950#ifdef CONFIG_X86_32 3928#ifdef CONFIG_X86_32
3951int __init io_apic_get_unique_id(int ioapic, int apic_id) 3929int __init io_apic_get_unique_id(int ioapic, int apic_id)
@@ -4054,8 +4032,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4054 return 0; 4032 return 0;
4055} 4033}
4056 4034
4057#endif /* CONFIG_ACPI */
4058
4059/* 4035/*
4060 * This function currently is only a helper for the i386 smp boot process where 4036 * This function currently is only a helper for the i386 smp boot process where
4061 * we need to reprogram the ioredtbls to cater for the cpus which have come online 4037 * we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -4109,7 +4085,7 @@ void __init setup_ioapic_dest(void)
4109 4085
4110static struct resource *ioapic_resources; 4086static struct resource *ioapic_resources;
4111 4087
4112static struct resource * __init ioapic_setup_resources(void) 4088static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4113{ 4089{
4114 unsigned long n; 4090 unsigned long n;
4115 struct resource *res; 4091 struct resource *res;
@@ -4125,15 +4101,13 @@ static struct resource * __init ioapic_setup_resources(void)
4125 mem = alloc_bootmem(n); 4101 mem = alloc_bootmem(n);
4126 res = (void *)mem; 4102 res = (void *)mem;
4127 4103
4128 if (mem != NULL) { 4104 mem += sizeof(struct resource) * nr_ioapics;
4129 mem += sizeof(struct resource) * nr_ioapics;
4130 4105
4131 for (i = 0; i < nr_ioapics; i++) { 4106 for (i = 0; i < nr_ioapics; i++) {
4132 res[i].name = mem; 4107 res[i].name = mem;
4133 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4134 sprintf(mem, "IOAPIC %u", i); 4109 sprintf(mem, "IOAPIC %u", i);
4135 mem += IOAPIC_RESOURCE_NAME_SIZE; 4110 mem += IOAPIC_RESOURCE_NAME_SIZE;
4136 }
4137 } 4111 }
4138 4112
4139 ioapic_resources = res; 4113 ioapic_resources = res;
@@ -4147,7 +4121,7 @@ void __init ioapic_init_mappings(void)
4147 struct resource *ioapic_res; 4121 struct resource *ioapic_res;
4148 int i; 4122 int i;
4149 4123
4150 ioapic_res = ioapic_setup_resources(); 4124 ioapic_res = ioapic_setup_resources(nr_ioapics);
4151 for (i = 0; i < nr_ioapics; i++) { 4125 for (i = 0; i < nr_ioapics; i++) {
4152 if (smp_found_config) { 4126 if (smp_found_config) {
4153 ioapic_phys = mp_ioapics[i].apicaddr; 4127 ioapic_phys = mp_ioapics[i].apicaddr;
@@ -4176,11 +4150,9 @@ fake_ioapic_page:
4176 __fix_to_virt(idx), ioapic_phys); 4150 __fix_to_virt(idx), ioapic_phys);
4177 idx++; 4151 idx++;
4178 4152
4179 if (ioapic_res != NULL) { 4153 ioapic_res->start = ioapic_phys;
4180 ioapic_res->start = ioapic_phys; 4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
4181 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4155 ioapic_res++;
4182 ioapic_res++;
4183 }
4184 } 4156 }
4185} 4157}
4186 4158
@@ -4201,3 +4173,76 @@ void __init ioapic_insert_resources(void)
4201 r++; 4173 r++;
4202 } 4174 }
4203} 4175}
4176
4177int mp_find_ioapic(int gsi)
4178{
4179 int i = 0;
4180
4181 /* Find the IOAPIC that manages this GSI. */
4182 for (i = 0; i < nr_ioapics; i++) {
4183 if ((gsi >= mp_gsi_routing[i].gsi_base)
4184 && (gsi <= mp_gsi_routing[i].gsi_end))
4185 return i;
4186 }
4187
4188 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
4189 return -1;
4190}
4191
4192int mp_find_ioapic_pin(int ioapic, int gsi)
4193{
4194 if (WARN_ON(ioapic == -1))
4195 return -1;
4196 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
4197 return -1;
4198
4199 return gsi - mp_gsi_routing[ioapic].gsi_base;
4200}
4201
4202static int bad_ioapic(unsigned long address)
4203{
4204 if (nr_ioapics >= MAX_IO_APICS) {
4205 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
4206 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4207 return 1;
4208 }
4209 if (!address) {
4210 printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
4211 " found in table, skipping!\n");
4212 return 1;
4213 }
4214 return 0;
4215}
4216
4217void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4218{
4219 int idx = 0;
4220
4221 if (bad_ioapic(address))
4222 return;
4223
4224 idx = nr_ioapics;
4225
4226 mp_ioapics[idx].type = MP_IOAPIC;
4227 mp_ioapics[idx].flags = MPC_APIC_USABLE;
4228 mp_ioapics[idx].apicaddr = address;
4229
4230 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4231 mp_ioapics[idx].apicid = io_apic_unique_id(id);
4232 mp_ioapics[idx].apicver = io_apic_get_version(idx);
4233
4234 /*
4235 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4236 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4237 */
4238 mp_gsi_routing[idx].gsi_base = gsi_base;
4239 mp_gsi_routing[idx].gsi_end = gsi_base +
4240 io_apic_get_redir_entries(idx);
4241
4242 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4243 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
4244 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
4245 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
4246
4247 nr_ioapics++;
4248}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 6ef00ba4c886..08385e090a6f 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -153,7 +153,7 @@ int safe_smp_processor_id(void)
153{ 153{
154 int apicid, cpuid; 154 int apicid, cpuid;
155 155
156 if (!boot_cpu_has(X86_FEATURE_APIC)) 156 if (!cpu_has_apic)
157 return 0; 157 return 0;
158 158
159 apicid = hard_smp_processor_id(); 159 apicid = hard_smp_processor_id();
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index b3025b43b63a..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_var_t backtrace_mask; 42static cpumask_t backtrace_mask __read_mostly;
43 43
44/* nmi_active: 44/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 45 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_NEW_MCE) 69#if defined(CONFIG_X86_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
138 if (!prev_nmi_count) 138 if (!prev_nmi_count)
139 goto error; 139 goto error;
140 140
141 alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
142 printk(KERN_INFO "Testing NMI watchdog ... "); 141 printk(KERN_INFO "Testing NMI watchdog ... ");
143 142
144#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
415 } 414 }
416 415
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 416 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { 417 if (cpumask_test_cpu(cpu, &backtrace_mask)) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
420 419
421 spin_lock(&lock); 420 spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
422 show_regs(regs);
423 dump_stack(); 423 dump_stack();
424 spin_unlock(&lock); 424 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, backtrace_mask); 425 cpumask_clear_cpu(cpu, &backtrace_mask);
426
427 rc = 1;
426 } 428 }
427 429
428 /* Could check oops_in_progress here too, but it's safer not to */ 430 /* Could check oops_in_progress here too, but it's safer not to */
@@ -506,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
506/* 508/*
507 * proc handler for /proc/sys/kernel/nmi 509 * proc handler for /proc/sys/kernel/nmi
508 */ 510 */
509int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, 511int proc_nmi_enabled(struct ctl_table *table, int write,
510 void __user *buffer, size_t *length, loff_t *ppos) 512 void __user *buffer, size_t *length, loff_t *ppos)
511{ 513{
512 int old_state; 514 int old_state;
513 515
514 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; 516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
515 old_state = nmi_watchdog_enabled; 517 old_state = nmi_watchdog_enabled;
516 proc_dointvec(table, write, file, buffer, length, ppos); 518 proc_dointvec(table, write, buffer, length, ppos);
517 if (!!old_state == !!nmi_watchdog_enabled) 519 if (!!old_state == !!nmi_watchdog_enabled)
518 return 0; 520 return 0;
519 521
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
552 return 0; 554 return 0;
553} 555}
554 556
555void __trigger_all_cpu_backtrace(void) 557void arch_trigger_all_cpu_backtrace(void)
556{ 558{
557 int i; 559 int i;
558 560
559 cpumask_copy(backtrace_mask, cpu_online_mask); 561 cpumask_copy(&backtrace_mask, cpu_online_mask);
562
563 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR);
565
560 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
561 for (i = 0; i < 10 * 1000; i++) { 567 for (i = 0; i < 10 * 1000; i++) {
562 if (cpumask_empty(backtrace_mask)) 568 if (cpumask_empty(&backtrace_mask))
563 break; 569 break;
564 mdelay(1); 570 mdelay(1);
565 } 571 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index ca96e68f0d23..efa00e2b8505 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -66,7 +66,6 @@ struct mpc_trans {
66 unsigned short trans_reserved; 66 unsigned short trans_reserved;
67}; 67};
68 68
69/* x86_quirks member */
70static int mpc_record; 69static int mpc_record;
71 70
72static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; 71static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)
130 } 129 }
131} 130}
132 131
133static int __init numaq_pre_time_init(void) 132static void __init numaq_tsc_init(void)
134{ 133{
135 numaq_tsc_disable(); 134 numaq_tsc_disable();
136 return 0;
137} 135}
138 136
139static inline int generate_logical_apicid(int quad, int phys_apicid) 137static inline int generate_logical_apicid(int quad, int phys_apicid)
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)
177 quad_local_to_mp_bus_id[quad][local] = m->busid; 175 quad_local_to_mp_bus_id[quad][local] = m->busid;
178} 176}
179 177
178/*
179 * Called from mpparse code.
180 * mode = 0: prescan
181 * mode = 1: one mpc entry scanned
182 */
183static void numaq_mpc_record(unsigned int mode)
184{
185 if (!mode)
186 mpc_record = 0;
187 else
188 mpc_record++;
189}
190
180static void __init MP_translation_info(struct mpc_trans *m) 191static void __init MP_translation_info(struct mpc_trans *m)
181{ 192{
182 printk(KERN_INFO 193 printk(KERN_INFO
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
206/* 217/*
207 * Read/parse the MPC oem tables 218 * Read/parse the MPC oem tables
208 */ 219 */
209static void __init 220static void __init smp_read_mpc_oem(struct mpc_table *mpc)
210 smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
211{ 221{
222 struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
212 int count = sizeof(*oemtable); /* the header size */ 223 int count = sizeof(*oemtable); /* the header size */
213 unsigned char *oemptr = ((unsigned char *)oemtable) + count; 224 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
214 225
@@ -250,29 +261,6 @@ static void __init
250 } 261 }
251} 262}
252 263
253static int __init numaq_setup_ioapic_ids(void)
254{
255 /* so can skip it */
256 return 1;
257}
258
259static struct x86_quirks numaq_x86_quirks __initdata = {
260 .arch_pre_time_init = numaq_pre_time_init,
261 .arch_time_init = NULL,
262 .arch_pre_intr_init = NULL,
263 .arch_memory_setup = NULL,
264 .arch_intr_init = NULL,
265 .arch_trap_init = NULL,
266 .mach_get_smp_config = NULL,
267 .mach_find_smp_config = NULL,
268 .mpc_record = &mpc_record,
269 .mpc_apic_id = mpc_apic_id,
270 .mpc_oem_bus_info = mpc_oem_bus_info,
271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
272 .smp_read_mpc_oem = smp_read_mpc_oem,
273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
274};
275
276static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
277{ 265{
278 /* 266 /*
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void)
286 if (smp_found_config) 274 if (smp_found_config)
287 early_get_smp_config(); 275 early_get_smp_config();
288 276
289 if (found_numaq) 277 if (found_numaq) {
290 x86_quirks = &numaq_x86_quirks; 278 x86_init.mpparse.mpc_record = numaq_mpc_record;
279 x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
280 x86_init.mpparse.mpc_apic_id = mpc_apic_id;
281 x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
282 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
283 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
284 x86_init.timers.tsc_pre_init = numaq_tsc_init;
285 }
291} 286}
292 287
293int __init get_memcfg_numaq(void) 288int __init get_memcfg_numaq(void)
@@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
418/* Where the IO area was mapped on multiquad, always 0 otherwise */ 413/* Where the IO area was mapped on multiquad, always 0 otherwise */
419void *xquad_portio; 414void *xquad_portio;
420 415
421static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) 416static inline int numaq_check_phys_apicid_present(int phys_apicid)
422{ 417{
423 return 1; 418 return 1;
424} 419}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880f9b82..c4cbd3080c1c 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -44,29 +44,46 @@ static struct apic *apic_probe[] __initdata = {
44 NULL, 44 NULL,
45}; 45};
46 46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{
49 return hard_smp_processor_id() >> index_msb;
50}
51
47/* 52/*
48 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 53 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
49 */ 54 */
50void __init default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
51{ 56{
52#ifdef CONFIG_X86_X2APIC 57#ifdef CONFIG_X86_X2APIC
53 if (x2apic_mode && (apic != &apic_x2apic_phys && 58 if (x2apic_mode
54#ifdef CONFIG_X86_UV 59#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x && 60 && apic != &apic_x2apic_uv_x
56#endif 61#endif
57 apic != &apic_x2apic_cluster)) { 62 ) {
58 if (x2apic_phys) 63 if (x2apic_phys)
59 apic = &apic_x2apic_phys; 64 apic = &apic_x2apic_phys;
60 else 65 else
61 apic = &apic_x2apic_cluster; 66 apic = &apic_x2apic_cluster;
62 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
63 } 67 }
64#endif 68#endif
65 69
66 if (apic == &apic_flat) { 70 if (apic == &apic_flat) {
67 if (max_physical_apicid >= 8) 71 switch (boot_cpu_data.x86_vendor) {
68 apic = &apic_physflat; 72 case X86_VENDOR_INTEL:
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83
84 if (is_vsmp_box()) {
85 /* need to update phys_pkg_id */
86 apic->phys_pkg_id = apicid_phys_pkg_id;
70 } 87 }
71 88
72 /* 89 /*
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index eafdfbd1ea95..645ecc4ff0be 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid)
272 return physid_mask_of_physid(0); 272 return physid_mask_of_physid(0);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
276{ 276{
277 return 1; 277 return 1;
278} 278}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 601159374e87..f5f5886a6b53 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
390} 390}
391 391
392static __init void map_mmr_high(int max_pnode)
393{
394 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
395 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
396
397 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
398 if (mmr.s.enable)
399 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
400}
401
392static __init void map_mmioh_high(int max_pnode) 402static __init void map_mmioh_high(int max_pnode)
393{ 403{
394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 404 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -643,6 +653,7 @@ void __init uv_system_init(void)
643 } 653 }
644 654
645 map_gru_high(max_pnode); 655 map_gru_high(max_pnode);
656 map_mmr_high(max_pnode);
646 map_mmioh_high(max_pnode); 657 map_mmioh_high(max_pnode);
647 658
648 uv_cpu_init(); 659 uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 442b5508893f..151ace69a5aa 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 404static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 405static DEFINE_SPINLOCK(user_list_lock);
406static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; 406
407/*
408 * Set up a segment that references the real mode segment 0x40
409 * that extends up to the end of page zero (that we have reserved).
410 * This is for buggy BIOS's that refer to (real mode) segment 0x40
411 * even though they are called in protected mode.
412 */
413static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
414 (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
407 415
408static const char driver_version[] = "1.16ac"; /* no spaces */ 416static const char driver_version[] = "1.16ac"; /* no spaces */
409 417
@@ -2332,15 +2340,6 @@ static int __init apm_init(void)
2332 pm_flags |= PM_APM; 2340 pm_flags |= PM_APM;
2333 2341
2334 /* 2342 /*
2335 * Set up a segment that references the real mode segment 0x40
2336 * that extends up to the end of page zero (that we have reserved).
2337 * This is for buggy BIOS's that refer to (real mode) segment 0x40
2338 * even though they are called in protected mode.
2339 */
2340 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2341 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
2342
2343 /*
2344 * Set up the long jump entry point to the APM BIOS, which is called 2343 * Set up the long jump entry point to the APM BIOS, which is called
2345 * from inline assembly. 2344 * from inline assembly.
2346 */ 2345 */
@@ -2358,12 +2357,12 @@ static int __init apm_init(void)
2358 * code to that CPU. 2357 * code to that CPU.
2359 */ 2358 */
2360 gdt = get_cpu_gdt_table(0); 2359 gdt = get_cpu_gdt_table(0);
2361 set_base(gdt[APM_CS >> 3], 2360 set_desc_base(&gdt[APM_CS >> 3],
2362 __va((unsigned long)apm_info.bios.cseg << 4)); 2361 (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
2363 set_base(gdt[APM_CS_16 >> 3], 2362 set_desc_base(&gdt[APM_CS_16 >> 3],
2364 __va((unsigned long)apm_info.bios.cseg_16 << 4)); 2363 (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
2365 set_base(gdt[APM_DS >> 3], 2364 set_desc_base(&gdt[APM_DS >> 3],
2366 __va((unsigned long)apm_info.bios.dseg << 4)); 2365 (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
2367 2366
2368 proc_create("apm", 0, NULL, &apm_file_ops); 2367 proc_create("apm", 0, NULL, &apm_file_ops);
2369 2368
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
3 * This code generates raw asm output which is post-processed to extract 3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data. 4 * and format the required data.
5 */ 5 */
6#define COMPILE_OFFSETS
6 7
7#include <linux/crypto.h> 8#include <linux/crypto.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..68537e957a9b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
13 13
14obj-y := intel_cacheinfo.o addon_cpuid_features.o 14obj-y := intel_cacheinfo.o addon_cpuid_features.o
15obj-y += proc.o capflags.o powerflags.o common.o 15obj-y += proc.o capflags.o powerflags.o common.o
16obj-y += vmware.o hypervisor.o 16obj-y += vmware.o hypervisor.o sched.o
17 17
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 19obj-$(CONFIG_X86_64) += bugs_64.o
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 27obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 28obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
29 29
30obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 30obj-$(CONFIG_PERF_EVENTS) += perf_event.o
31 31
32obj-$(CONFIG_X86_MCE) += mcheck/ 32obj-$(CONFIG_X86_MCE) += mcheck/
33obj-$(CONFIG_MTRR) += mtrr/ 33obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 63fddcd082cd..c910a716a71c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -2,7 +2,7 @@
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4 4
5#include <asm/io.h> 5#include <linux/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
@@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
45#define CBAR_ENB (0x80000000) 45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB) 46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) { 47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB) 48 if (inl(CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR); 49 outl(0 | CBAR_KEY, CBAR);
50 } 50 }
51} 51}
52 52
@@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
87 d = d2-d; 87 d = d2-d;
88 88
89 if (d > 20*K6_BUG_LOOP) 89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n"); 90 printk(KERN_CONT
91 "system stability may be impaired when more than 32 MB are used.\n");
91 else 92 else
92 printk("probably OK (after B9730xxxx).\n"); 93 printk(KERN_CONT "probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); 94 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 } 95 }
95 96
@@ -183,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
183 * approved Athlon 184 * approved Athlon
184 */ 185 */
185 WARN_ONCE(1, "WARNING: This combination of AMD" 186 WARN_ONCE(1, "WARNING: This combination of AMD"
186 "processors is not suitable for SMP.\n"); 187 " processors is not suitable for SMP.\n");
187 if (!test_taint(TAINT_UNSAFE_SMP)) 188 if (!test_taint(TAINT_UNSAFE_SMP))
188 add_taint(TAINT_UNSAFE_SMP); 189 add_taint(TAINT_UNSAFE_SMP);
189 190
@@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
219 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { 220 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
220 rdmsr(MSR_K7_CLK_CTL, l, h); 221 rdmsr(MSR_K7_CLK_CTL, l, h);
221 if ((l & 0xfff00000) != 0x20000000) { 222 if ((l & 0xfff00000) != 0x20000000) {
222 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, 223 printk(KERN_INFO
223 ((l & 0x000fffff)|0x20000000)); 224 "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
225 l, ((l & 0x000fffff)|0x20000000));
224 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); 226 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
225 } 227 }
226 } 228 }
@@ -251,6 +253,64 @@ static int __cpuinit nearby_node(int apicid)
251#endif 253#endif
252 254
253/* 255/*
256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */
261#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{
264#ifdef CONFIG_PCI
265 u32 t, cpn;
266 u8 n, n_id;
267 int cpu = smp_processor_id();
268
269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return;
272
273 /* check for multi-node processor on boot cpu */
274 t = read_pci_config(0, 24, 3, 0xe8);
275 if (!(t & (1 << 29)))
276 return;
277
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
279
280 /* cores per node: each internal node has half the number of cores */
281 cpn = c->x86_max_cores >> 1;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306
307 /* fixup core id to be in range from 0 to cpn */
308 c->cpu_core_id = c->cpu_core_id % cpn;
309#endif
310}
311#endif
312
313/*
254 * On a AMD dual core setup the lower bits of the APIC id distingush the cores. 314 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
255 * Assumes number of cores is a power of two. 315 * Assumes number of cores is a power of two.
256 */ 316 */
@@ -267,17 +327,31 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
267 c->phys_proc_id = c->initial_apicid >> bits; 327 c->phys_proc_id = c->initial_apicid >> bits;
268 /* use socket ID also for last level cache */ 328 /* use socket ID also for last level cache */
269 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 329 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
330 /* fixup topology information on multi-node processors */
331 if ((c->x86 == 0x10) && (c->x86_model == 9))
332 amd_fixup_dcm(c);
270#endif 333#endif
271} 334}
272 335
336int amd_get_nb_id(int cpu)
337{
338 int id = 0;
339#ifdef CONFIG_SMP
340 id = per_cpu(cpu_llc_id, cpu);
341#endif
342 return id;
343}
344EXPORT_SYMBOL_GPL(amd_get_nb_id);
345
273static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 346static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
274{ 347{
275#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 348#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
276 int cpu = smp_processor_id(); 349 int cpu = smp_processor_id();
277 int node; 350 int node;
278 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; 351 unsigned apicid = c->apicid;
352
353 node = per_cpu(cpu_llc_id, cpu);
279 354
280 node = c->phys_proc_id;
281 if (apicid_to_node[apicid] != NUMA_NO_NODE) 355 if (apicid_to_node[apicid] != NUMA_NO_NODE)
282 node = apicid_to_node[apicid]; 356 node = apicid_to_node[apicid];
283 if (!node_online(node)) { 357 if (!node_online(node)) {
@@ -398,18 +472,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
398 u32 level; 472 u32 level;
399 473
400 level = cpuid_eax(1); 474 level = cpuid_eax(1);
401 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) 475 if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
402 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 476 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
403 477
404 /* 478 /*
405 * Some BIOSes incorrectly force this feature, but only K8 479 * Some BIOSes incorrectly force this feature, but only K8
406 * revision D (model = 0x14) and later actually support it. 480 * revision D (model = 0x14) and later actually support it.
481 * (AMD Erratum #110, docId: 25759).
407 */ 482 */
408 if (c->x86_model < 0x14) 483 if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
484 u64 val;
485
409 clear_cpu_cap(c, X86_FEATURE_LAHF_LM); 486 clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
487 if (!rdmsrl_amd_safe(0xc001100d, &val)) {
488 val &= ~(1ULL << 32);
489 wrmsrl_amd_safe(0xc001100d, val);
490 }
491 }
492
410 } 493 }
411 if (c->x86 == 0x10 || c->x86 == 0x11) 494 if (c->x86 == 0x10 || c->x86 == 0x11)
412 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 495 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
496
497 /* get apicid instead of initial apic id from cpuid */
498 c->apicid = hard_smp_processor_id();
413#else 499#else
414 500
415 /* 501 /*
@@ -494,27 +580,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
494 * benefit in doing so. 580 * benefit in doing so.
495 */ 581 */
496 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { 582 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
497 printk(KERN_DEBUG "tseg: %010llx\n", tseg); 583 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
498 if ((tseg>>PMD_SHIFT) < 584 if ((tseg>>PMD_SHIFT) <
499 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || 585 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
500 ((tseg>>PMD_SHIFT) < 586 ((tseg>>PMD_SHIFT) <
501 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && 587 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
502 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) 588 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
503 set_memory_4k((unsigned long)__va(tseg), 1); 589 set_memory_4k((unsigned long)__va(tseg), 1);
504 } 590 }
505 } 591 }
506#endif 592#endif
507} 593}
508 594
509#ifdef CONFIG_X86_32 595#ifdef CONFIG_X86_32
510static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 596static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
597 unsigned int size)
511{ 598{
512 /* AMD errata T13 (order #21922) */ 599 /* AMD errata T13 (order #21922) */
513 if ((c->x86 == 6)) { 600 if ((c->x86 == 6)) {
514 if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ 601 /* Duron Rev A0 */
602 if (c->x86_model == 3 && c->x86_mask == 0)
515 size = 64; 603 size = 64;
604 /* Tbird rev A1/A2 */
516 if (c->x86_model == 4 && 605 if (c->x86_model == 4 &&
517 (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ 606 (c->x86_mask == 0 || c->x86_mask == 1))
518 size = 256; 607 size = 256;
519 } 608 }
520 return size; 609 return size;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c8e315f1aa83..01a265212395 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -81,7 +81,7 @@ static void __init check_fpu(void)
81 81
82 boot_cpu_data.fdiv_bug = fdiv_bug; 82 boot_cpu_data.fdiv_bug = fdiv_bug;
83 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
84 printk("Hmm, FPU with FDIV bug.\n"); 84 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
85} 85}
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
@@ -98,7 +98,7 @@ static void __init check_hlt(void)
98 halt(); 98 halt();
99 halt(); 99 halt();
100 halt(); 100 halt();
101 printk("OK.\n"); 101 printk(KERN_CONT "OK.\n");
102} 102}
103 103
104/* 104/*
@@ -122,9 +122,9 @@ static void __init check_popad(void)
122 * CPU hard. Too bad. 122 * CPU hard. Too bad.
123 */ 123 */
124 if (res != 12345678) 124 if (res != 12345678)
125 printk("Buggy.\n"); 125 printk(KERN_CONT "Buggy.\n");
126 else 126 else
127 printk("OK.\n"); 127 printk(KERN_CONT "OK.\n");
128#endif 128#endif
129} 129}
130 130
@@ -156,7 +156,7 @@ void __init check_bugs(void)
156{ 156{
157 identify_boot_cpu(); 157 identify_boot_cpu();
158#ifndef CONFIG_SMP 158#ifndef CONFIG_SMP
159 printk("CPU: "); 159 printk(KERN_INFO "CPU: ");
160 print_cpu_info(&boot_cpu_data); 160 print_cpu_info(&boot_cpu_data);
161#endif 161#endif
162 check_config(); 162 check_config();
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..04f0fe5af83e 100644
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -15,7 +15,7 @@ void __init check_bugs(void)
15{ 15{
16 identify_boot_cpu(); 16 identify_boot_cpu();
17#if !defined(CONFIG_SMP) 17#if !defined(CONFIG_SMP)
18 printk("CPU: "); 18 printk(KERN_INFO "CPU: ");
19 print_cpu_info(&boot_cpu_data); 19 print_cpu_info(&boot_cpu_data);
20#endif 20#endif
21 alternative_instructions(); 21 alternative_instructions();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ce60a88027b..cc25c2b4a567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,13 +13,13 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h> 16#include <asm/perf_event.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
19#include <asm/processor.h> 19#include <asm/processor.h>
20#include <asm/sections.h> 20#include <asm/sections.h>
21#include <asm/topology.h> 21#include <linux/topology.h>
22#include <asm/cpumask.h> 22#include <linux/cpumask.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/atomic.h> 24#include <asm/atomic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
@@ -28,13 +28,12 @@
28#include <asm/desc.h> 28#include <asm/desc.h>
29#include <asm/i387.h> 29#include <asm/i387.h>
30#include <asm/mtrr.h> 30#include <asm/mtrr.h>
31#include <asm/numa.h> 31#include <linux/numa.h>
32#include <asm/asm.h> 32#include <asm/asm.h>
33#include <asm/cpu.h> 33#include <asm/cpu.h>
34#include <asm/mce.h> 34#include <asm/mce.h>
35#include <asm/msr.h> 35#include <asm/msr.h>
36#include <asm/pat.h> 36#include <asm/pat.h>
37#include <asm/smp.h>
38 37
39#ifdef CONFIG_X86_LOCAL_APIC 38#ifdef CONFIG_X86_LOCAL_APIC
40#include <asm/uv/uv.h> 39#include <asm/uv/uv.h>
@@ -94,45 +93,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
94 * TLS descriptors are currently at a different place compared to i386. 93 * TLS descriptors are currently at a different place compared to i386.
95 * Hopefully nobody expects them at a fixed place (Wine?) 94 * Hopefully nobody expects them at a fixed place (Wine?)
96 */ 95 */
97 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 96 [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
98 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 97 [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
99 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 98 [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
100 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 99 [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
101 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 100 [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
102 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 101 [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
103#else 102#else
104 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 103 [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
105 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 104 [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
106 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 105 [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
107 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, 106 [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
108 /* 107 /*
109 * Segments used for calling PnP BIOS have byte granularity. 108 * Segments used for calling PnP BIOS have byte granularity.
110 * They code segments and data segments have fixed 64k limits, 109 * They code segments and data segments have fixed 64k limits,
111 * the transfer segment sizes are set at run time. 110 * the transfer segment sizes are set at run time.
112 */ 111 */
113 /* 32-bit code */ 112 /* 32-bit code */
114 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, 113 [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
115 /* 16-bit code */ 114 /* 16-bit code */
116 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, 115 [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
117 /* 16-bit data */ 116 /* 16-bit data */
118 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, 117 [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
119 /* 16-bit data */ 118 /* 16-bit data */
120 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, 119 [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
121 /* 16-bit data */ 120 /* 16-bit data */
122 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, 121 [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
123 /* 122 /*
124 * The APM segments have byte granularity and their bases 123 * The APM segments have byte granularity and their bases
125 * are set at run time. All have 64k limits. 124 * are set at run time. All have 64k limits.
126 */ 125 */
127 /* 32-bit code */ 126 /* 32-bit code */
128 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, 127 [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
129 /* 16-bit code */ 128 /* 16-bit code */
130 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, 129 [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
131 /* data */ 130 /* data */
132 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 131 [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
133 132
134 [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, 133 [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
135 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 134 [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
136 GDT_STACK_CANARY_INIT 135 GDT_STACK_CANARY_INIT
137#endif 136#endif
138} }; 137} };
@@ -870,7 +869,7 @@ void __init identify_boot_cpu(void)
870#else 869#else
871 vgetcpu_set_mode(); 870 vgetcpu_set_mode();
872#endif 871#endif
873 init_hw_perf_counters(); 872 init_hw_perf_events();
874} 873}
875 874
876void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 875void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -982,18 +981,26 @@ static __init int setup_disablecpuid(char *arg)
982__setup("clearcpuid=", setup_disablecpuid); 981__setup("clearcpuid=", setup_disablecpuid);
983 982
984#ifdef CONFIG_X86_64 983#ifdef CONFIG_X86_64
985struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 984struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
986 985
987DEFINE_PER_CPU_FIRST(union irq_stack_union, 986DEFINE_PER_CPU_FIRST(union irq_stack_union,
988 irq_stack_union) __aligned(PAGE_SIZE); 987 irq_stack_union) __aligned(PAGE_SIZE);
989 988
990DEFINE_PER_CPU(char *, irq_stack_ptr) = 989/*
991 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; 990 * The following four percpu variables are hot. Align current_task to
991 * cacheline size such that all four fall in the same cacheline.
992 */
993DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
994 &init_task;
995EXPORT_PER_CPU_SYMBOL(current_task);
992 996
993DEFINE_PER_CPU(unsigned long, kernel_stack) = 997DEFINE_PER_CPU(unsigned long, kernel_stack) =
994 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; 998 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
995EXPORT_PER_CPU_SYMBOL(kernel_stack); 999EXPORT_PER_CPU_SYMBOL(kernel_stack);
996 1000
1001DEFINE_PER_CPU(char *, irq_stack_ptr) =
1002 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
1003
997DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1004DEFINE_PER_CPU(unsigned int, irq_count) = -1;
998 1005
999/* 1006/*
@@ -1008,8 +1015,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1008}; 1015};
1009 1016
1010static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 1017static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1011 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) 1018 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1012 __aligned(PAGE_SIZE);
1013 1019
1014/* May not be marked __init: used by software suspend */ 1020/* May not be marked __init: used by software suspend */
1015void syscall_init(void) 1021void syscall_init(void)
@@ -1042,8 +1048,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
1042 1048
1043#else /* CONFIG_X86_64 */ 1049#else /* CONFIG_X86_64 */
1044 1050
1051DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1052EXPORT_PER_CPU_SYMBOL(current_task);
1053
1045#ifdef CONFIG_CC_STACKPROTECTOR 1054#ifdef CONFIG_CC_STACKPROTECTOR
1046DEFINE_PER_CPU(unsigned long, stack_canary); 1055DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1047#endif 1056#endif
1048 1057
1049/* Make sure %fs and %gs are initialized properly in idle threads */ 1058/* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..7d5c3b0ea8da 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <trace/power.h> 36#include <trace/events/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h> 39#include <linux/io.h>
@@ -60,7 +60,6 @@ enum {
60}; 60};
61 61
62#define INTEL_MSR_RANGE (0xffff) 62#define INTEL_MSR_RANGE (0xffff)
63#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
64 63
65struct acpi_cpufreq_data { 64struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data; 65 struct acpi_processor_performance *acpi_data;
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data {
71 70
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
73 72
74struct acpi_msr_data { 73static DEFINE_PER_CPU(struct aperfmperf, old_perf);
75 u64 saved_aperf, saved_mperf;
76};
77
78static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
79
80DEFINE_TRACE(power_mark);
81 74
82/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
83static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)
244 return cmd.val; 237 return cmd.val;
245} 238}
246 239
247struct perf_pair {
248 union {
249 struct {
250 u32 lo;
251 u32 hi;
252 } split;
253 u64 whole;
254 } aperf, mperf;
255};
256
257/* Called via smp_call_function_single(), on the target CPU */ 240/* Called via smp_call_function_single(), on the target CPU */
258static void read_measured_perf_ctrs(void *_cur) 241static void read_measured_perf_ctrs(void *_cur)
259{ 242{
260 struct perf_pair *cur = _cur; 243 struct aperfmperf *am = _cur;
261 244
262 rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); 245 get_aperfmperf(am);
263 rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
264} 246}
265 247
266/* 248/*
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)
279static unsigned int get_measured_perf(struct cpufreq_policy *policy, 261static unsigned int get_measured_perf(struct cpufreq_policy *policy,
280 unsigned int cpu) 262 unsigned int cpu)
281{ 263{
282 struct perf_pair readin, cur; 264 struct aperfmperf perf;
283 unsigned int perf_percent; 265 unsigned long ratio;
284 unsigned int retval; 266 unsigned int retval;
285 267
286 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) 268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
287 return 0; 269 return 0;
288 270
289 cur.aperf.whole = readin.aperf.whole - 271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
290 per_cpu(msr_data, cpu).saved_aperf; 272 per_cpu(old_perf, cpu) = perf;
291 cur.mperf.whole = readin.mperf.whole -
292 per_cpu(msr_data, cpu).saved_mperf;
293 per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
294 per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
295
296#ifdef __i386__
297 /*
298 * We dont want to do 64 bit divide with 32 bit kernel
299 * Get an approximate value. Return failure in case we cannot get
300 * an approximate value.
301 */
302 if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
303 int shift_count;
304 u32 h;
305
306 h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
307 shift_count = fls(h);
308
309 cur.aperf.whole >>= shift_count;
310 cur.mperf.whole >>= shift_count;
311 }
312
313 if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
314 int shift_count = 7;
315 cur.aperf.split.lo >>= shift_count;
316 cur.mperf.split.lo >>= shift_count;
317 }
318
319 if (cur.aperf.split.lo && cur.mperf.split.lo)
320 perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
321 else
322 perf_percent = 0;
323
324#else
325 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
326 int shift_count = 7;
327 cur.aperf.whole >>= shift_count;
328 cur.mperf.whole >>= shift_count;
329 }
330
331 if (cur.aperf.whole && cur.mperf.whole)
332 perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
333 else
334 perf_percent = 0;
335
336#endif
337 273
338 retval = (policy->cpuinfo.max_freq * perf_percent) / 100; 274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
339 275
340 return retval; 276 return retval;
341} 277}
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
394 unsigned int next_perf_state = 0; /* Index into perf table */ 330 unsigned int next_perf_state = 0; /* Index into perf table */
395 unsigned int i; 331 unsigned int i;
396 int result = 0; 332 int result = 0;
397 struct power_trace it;
398 333
399 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 334 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
400 335
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
426 } 361 }
427 } 362 }
428 363
429 trace_power_mark(&it, POWER_PSTATE, next_perf_state); 364 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
430 365
431 switch (data->cpu_feature) { 366 switch (data->cpu_feature) {
432 case SYSTEM_INTEL_MSR_CAPABLE: 367 case SYSTEM_INTEL_MSR_CAPABLE:
@@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
588 }, 523 },
589 { } 524 { }
590}; 525};
526
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/
533 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) &&
535 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable())
537 return -ENODEV;
538 }
539 return 0;
540}
591#endif 541#endif
592 542
593static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) 543static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
@@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
602 552
603 dprintk("acpi_cpufreq_cpu_init\n"); 553 dprintk("acpi_cpufreq_cpu_init\n");
604 554
555#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c);
557 if (result)
558 return result;
559#endif
560
605 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
606 if (!data) 562 if (!data)
607 return -ENOMEM; 563 return -ENOMEM;
@@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
731 acpi_processor_notify_smm(THIS_MODULE); 687 acpi_processor_notify_smm(THIS_MODULE);
732 688
733 /* Check for APERF/MPERF support in hardware */ 689 /* Check for APERF/MPERF support in hardware */
734 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { 690 if (cpu_has(c, X86_FEATURE_APERFMPERF))
735 unsigned int ecx; 691 acpi_cpufreq_driver.getavg = get_measured_perf;
736 ecx = cpuid_ecx(6);
737 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
738 acpi_cpufreq_driver.getavg = get_measured_perf;
739 }
740 692
741 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 693 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
742 for (i = 0; i < perf->state_count; i++) 694 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 2a50ef891000..6394aa5c7985 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
605 return 0; 605 return 0;
606} 606}
607 607
608static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) 608static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
609 unsigned int entry)
609{ 610{
610 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 611 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
611} 612}
612 613
613static void print_basics(struct powernow_k8_data *data) 614static void print_basics(struct powernow_k8_data *data)
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
854 goto err_out; 855 goto err_out;
855 } 856 }
856 857
858 /* fill in data */
859 data->numps = data->acpi_data.state_count;
860 powernow_k8_acpi_pst_values(data, 0);
861
857 if (cpu_family == CPU_HW_PSTATE) 862 if (cpu_family == CPU_HW_PSTATE)
858 ret_val = fill_powernow_table_pstate(data, powernow_table); 863 ret_val = fill_powernow_table_pstate(data, powernow_table);
859 else 864 else
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
866 powernow_table[data->acpi_data.state_count].index = 0; 871 powernow_table[data->acpi_data.state_count].index = 0;
867 data->powernow_table = powernow_table; 872 data->powernow_table = powernow_table;
868 873
869 /* fill in data */
870 data->numps = data->acpi_data.state_count;
871 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) 874 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
872 print_basics(data); 875 print_basics(data);
873 powernow_k8_acpi_pst_values(data, 0);
874 876
875 /* notify BIOS that we exist */ 877 /* notify BIOS that we exist */
876 acpi_processor_notify_smm(THIS_MODULE); 878 acpi_processor_notify_smm(THIS_MODULE);
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
914 "bad value %d.\n", i, index); 916 "bad value %d.\n", i, index);
915 printk(KERN_ERR PFX "Please report to BIOS " 917 printk(KERN_ERR PFX "Please report to BIOS "
916 "manufacturer\n"); 918 "manufacturer\n");
917 invalidate_entry(data, i); 919 invalidate_entry(powernow_table, i);
918 continue; 920 continue;
919 } 921 }
920 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 922 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
921 if (!(hi & HW_PSTATE_VALID_MASK)) { 923 if (!(hi & HW_PSTATE_VALID_MASK)) {
922 dprintk("invalid pstate %d, ignoring\n", index); 924 dprintk("invalid pstate %d, ignoring\n", index);
923 invalidate_entry(data, i); 925 invalidate_entry(powernow_table, i);
924 continue; 926 continue;
925 } 927 }
926 928
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
941 struct cpufreq_frequency_table *powernow_table) 943 struct cpufreq_frequency_table *powernow_table)
942{ 944{
943 int i; 945 int i;
944 int cntlofreq = 0;
945 946
946 for (i = 0; i < data->acpi_data.state_count; i++) { 947 for (i = 0; i < data->acpi_data.state_count; i++) {
947 u32 fid; 948 u32 fid;
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
970 /* verify frequency is OK */ 971 /* verify frequency is OK */
971 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { 972 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
972 dprintk("invalid freq %u kHz, ignoring\n", freq); 973 dprintk("invalid freq %u kHz, ignoring\n", freq);
973 invalidate_entry(data, i); 974 invalidate_entry(powernow_table, i);
974 continue; 975 continue;
975 } 976 }
976 977
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
978 * BIOSs are using "off" to indicate invalid */ 979 * BIOSs are using "off" to indicate invalid */
979 if (vid == VID_OFF) { 980 if (vid == VID_OFF) {
980 dprintk("invalid vid %u, ignoring\n", vid); 981 dprintk("invalid vid %u, ignoring\n", vid);
981 invalidate_entry(data, i); 982 invalidate_entry(powernow_table, i);
982 continue; 983 continue;
983 } 984 }
984 985
985 /* verify only 1 entry from the lo frequency table */
986 if (fid < HI_FID_TABLE_BOTTOM) {
987 if (cntlofreq) {
988 /* if both entries are the same,
989 * ignore this one ... */
990 if ((freq != powernow_table[cntlofreq].frequency) ||
991 (index != powernow_table[cntlofreq].index)) {
992 printk(KERN_ERR PFX
993 "Too many lo freq table "
994 "entries\n");
995 return 1;
996 }
997
998 dprintk("double low frequency table entry, "
999 "ignoring it.\n");
1000 invalidate_entry(data, i);
1001 continue;
1002 } else
1003 cntlofreq = i;
1004 }
1005
1006 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { 986 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
1007 printk(KERN_INFO PFX "invalid freq entries " 987 printk(KERN_INFO PFX "invalid freq entries "
1008 "%u kHz vs. %u kHz\n", freq, 988 "%u kHz vs. %u kHz\n", freq,
1009 (unsigned int) 989 (unsigned int)
1010 (data->acpi_data.states[i].core_frequency 990 (data->acpi_data.states[i].core_frequency
1011 * 1000)); 991 * 1000));
1012 invalidate_entry(data, i); 992 invalidate_entry(powernow_table, i);
1013 continue; 993 continue;
1014 } 994 }
1015 } 995 }
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 593171e967ef..19807b89f058 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -3,10 +3,10 @@
3#include <linux/delay.h> 3#include <linux/delay.h>
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <asm/dma.h> 5#include <asm/dma.h>
6#include <asm/io.h> 6#include <linux/io.h>
7#include <asm/processor-cyrix.h> 7#include <asm/processor-cyrix.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include <asm/timer.h> 9#include <linux/timer.h>
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/tsc.h> 11#include <asm/tsc.h>
12 12
@@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
282 * The 5510/5520 companion chips have a funky PIT. 282 * The 5510/5520 companion chips have a funky PIT.
283 */ 283 */
284 if (vendor == PCI_VENDOR_ID_CYRIX && 284 if (vendor == PCI_VENDOR_ID_CYRIX &&
285 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) 285 (device == PCI_DEVICE_ID_CYRIX_5510 ||
286 device == PCI_DEVICE_ID_CYRIX_5520))
286 mark_tsc_unstable("cyrix 5510/5520 detected"); 287 mark_tsc_unstable("cyrix 5510/5520 detected");
287 } 288 }
288#endif 289#endif
@@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
299 * ? : 0x7x 300 * ? : 0x7x
300 * GX1 : 0x8x GX1 datasheet 56 301 * GX1 : 0x8x GX1 datasheet 56
301 */ 302 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 303 if ((0x30 <= dir1 && dir1 <= 0x6f) ||
304 (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 305 geode_configure();
304 return; 306 return;
305 } else { /* MediaGX */ 307 } else { /* MediaGX */
@@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
427 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); 429 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
428 local_irq_save(flags); 430 local_irq_save(flags);
429 ccr3 = getCx86(CX86_CCR3); 431 ccr3 = getCx86(CX86_CCR3);
430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 432 /* enable MAPEN */
431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ 433 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 434 /* enable cpuid */
435 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
436 /* disable MAPEN */
437 setCx86(CX86_CCR3, ccr3);
433 local_irq_restore(flags); 438 local_irq_restore(flags);
434 } 439 }
435 } 440 }
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index fb5b86af0b01..08be922de33a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -28,18 +28,10 @@
28static inline void __cpuinit 28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{ 30{
31 if (vmware_platform()) { 31 if (vmware_platform())
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else { 33 else
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43} 35}
44 36
45static inline void __cpuinit 37static inline void __cpuinit
@@ -56,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
56 detect_hypervisor_vendor(c); 48 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c); 49 hypervisor_set_feature_bits(c);
58} 50}
51
52void __init init_hypervisor_platform(void)
53{
54 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
56 vmware_platform_setup();
57}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3260ab044996..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,17 +7,17 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/uaccess.h>
10 11
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14#include <asm/uaccess.h>
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h> 17#include <asm/cpu.h>
18 18
19#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
20#include <asm/topology.h> 20#include <linux/topology.h>
21#include <asm/numa_64.h> 21#include <asm/numa_64.h>
22#endif 22#endif
23 23
@@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
174#ifdef CONFIG_X86_F00F_BUG 174#ifdef CONFIG_X86_F00F_BUG
175 /* 175 /*
176 * All current models of Pentium and Pentium with MMX technology CPUs 176 * All current models of Pentium and Pentium with MMX technology CPUs
177 * have the F0 0F bug, which lets nonprivileged users lock up the system. 177 * have the F0 0F bug, which lets nonprivileged users lock up the
178 * system.
178 * Note that the workaround only should be initialized once... 179 * Note that the workaround only should be initialized once...
179 */ 180 */
180 c->f00f_bug = 0; 181 c->f00f_bug = 0;
@@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
207 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); 208 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
208 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); 209 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
209 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; 210 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
210 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 211 wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
211 } 212 }
212 } 213 }
213 214
@@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
283 /* Intel has a non-standard dependency on %ecx for this CPUID level. */ 284 /* Intel has a non-standard dependency on %ecx for this CPUID level. */
284 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); 285 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
285 if (eax & 0x1f) 286 if (eax & 0x1f)
286 return ((eax >> 26) + 1); 287 return (eax >> 26) + 1;
287 else 288 else
288 return 1; 289 return 1;
289} 290}
@@ -349,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
349 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
350 } 351 }
351 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
352 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
353 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
354 if (cpu_has_ds) { 361 if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 789efe217e1a..804c40e2bc3e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
9 9
@@ -16,7 +16,7 @@
16#include <linux/pci.h> 16#include <linux/pci.h>
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21 21
22#define LVL_1_INST 1 22#define LVL_1_INST 1
@@ -25,14 +25,15 @@
25#define LVL_3 4 25#define LVL_3 4
26#define LVL_TRACE 5 26#define LVL_TRACE 5
27 27
28struct _cache_table 28struct _cache_table {
29{
30 unsigned char descriptor; 29 unsigned char descriptor;
31 char cache_type; 30 char cache_type;
32 short size; 31 short size;
33}; 32};
34 33
35/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */
36
36static const struct _cache_table __cpuinitconst cache_table[] = 37static const struct _cache_table __cpuinitconst cache_table[] =
37{ 38{
38 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 39 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
@@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
105}; 106};
106 107
107 108
108enum _cache_type 109enum _cache_type {
109{
110 CACHE_TYPE_NULL = 0, 110 CACHE_TYPE_NULL = 0,
111 CACHE_TYPE_DATA = 1, 111 CACHE_TYPE_DATA = 1,
112 CACHE_TYPE_INST = 2, 112 CACHE_TYPE_INST = 2,
@@ -170,31 +170,31 @@ unsigned short num_cache_leaves;
170 Maybe later */ 170 Maybe later */
171union l1_cache { 171union l1_cache {
172 struct { 172 struct {
173 unsigned line_size : 8; 173 unsigned line_size:8;
174 unsigned lines_per_tag : 8; 174 unsigned lines_per_tag:8;
175 unsigned assoc : 8; 175 unsigned assoc:8;
176 unsigned size_in_kb : 8; 176 unsigned size_in_kb:8;
177 }; 177 };
178 unsigned val; 178 unsigned val;
179}; 179};
180 180
181union l2_cache { 181union l2_cache {
182 struct { 182 struct {
183 unsigned line_size : 8; 183 unsigned line_size:8;
184 unsigned lines_per_tag : 4; 184 unsigned lines_per_tag:4;
185 unsigned assoc : 4; 185 unsigned assoc:4;
186 unsigned size_in_kb : 16; 186 unsigned size_in_kb:16;
187 }; 187 };
188 unsigned val; 188 unsigned val;
189}; 189};
190 190
191union l3_cache { 191union l3_cache {
192 struct { 192 struct {
193 unsigned line_size : 8; 193 unsigned line_size:8;
194 unsigned lines_per_tag : 4; 194 unsigned lines_per_tag:4;
195 unsigned assoc : 4; 195 unsigned assoc:4;
196 unsigned res : 2; 196 unsigned res:2;
197 unsigned size_encoded : 14; 197 unsigned size_encoded:14;
198 }; 198 };
199 unsigned val; 199 unsigned val;
200}; 200};
@@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
241 case 0: 241 case 0:
242 if (!l1->val) 242 if (!l1->val)
243 return; 243 return;
244 assoc = l1->assoc; 244 assoc = assocs[l1->assoc];
245 line_size = l1->line_size; 245 line_size = l1->line_size;
246 lines_per_tag = l1->lines_per_tag; 246 lines_per_tag = l1->lines_per_tag;
247 size_in_kb = l1->size_in_kb; 247 size_in_kb = l1->size_in_kb;
@@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
249 case 2: 249 case 2:
250 if (!l2.val) 250 if (!l2.val)
251 return; 251 return;
252 assoc = l2.assoc; 252 assoc = assocs[l2.assoc];
253 line_size = l2.line_size; 253 line_size = l2.line_size;
254 lines_per_tag = l2.lines_per_tag; 254 lines_per_tag = l2.lines_per_tag;
255 /* cpu_data has errata corrections for K7 applied */ 255 /* cpu_data has errata corrections for K7 applied */
@@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
258 case 3: 258 case 3:
259 if (!l3.val) 259 if (!l3.val)
260 return; 260 return;
261 assoc = l3.assoc; 261 assoc = assocs[l3.assoc];
262 line_size = l3.line_size; 262 line_size = l3.line_size;
263 lines_per_tag = l3.lines_per_tag; 263 lines_per_tag = l3.lines_per_tag;
264 size_in_kb = l3.size_encoded * 512; 264 size_in_kb = l3.size_encoded * 512;
265 if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
266 size_in_kb = size_in_kb >> 1;
267 assoc = assoc >> 1;
268 }
265 break; 269 break;
266 default: 270 default:
267 return; 271 return;
@@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
270 eax->split.is_self_initializing = 1; 274 eax->split.is_self_initializing = 1;
271 eax->split.type = types[leaf]; 275 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 276 eax->split.level = levels[leaf];
273 if (leaf == 3) 277 eax->split.num_threads_sharing = 0;
274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
276 else
277 eax->split.num_threads_sharing = 0;
278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
279 279
280 280
281 if (assoc == 0xf) 281 if (assoc == 0xffff)
282 eax->split.is_fully_associative = 1; 282 eax->split.is_fully_associative = 1;
283 ebx->split.coherency_line_size = line_size - 1; 283 ebx->split.coherency_line_size = line_size - 1;
284 ebx->split.ways_of_associativity = assocs[assoc] - 1; 284 ebx->split.ways_of_associativity = assoc - 1;
285 ebx->split.physical_line_partition = lines_per_tag - 1; 285 ebx->split.physical_line_partition = lines_per_tag - 1;
286 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / 286 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
287 (ebx->split.ways_of_associativity + 1) - 1; 287 (ebx->split.ways_of_associativity + 1) - 1;
@@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void)
350 350
351unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) 351unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
352{ 352{
353 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ 353 /* Cache sizes */
354 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
354 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ 355 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
355 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ 356 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
356 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; 357 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
@@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
377 378
378 retval = cpuid4_cache_lookup_regs(i, &this_leaf); 379 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
379 if (retval >= 0) { 380 if (retval >= 0) {
380 switch(this_leaf.eax.split.level) { 381 switch (this_leaf.eax.split.level) {
381 case 1: 382 case 1:
382 if (this_leaf.eax.split.type == 383 if (this_leaf.eax.split.type ==
383 CACHE_TYPE_DATA) 384 CACHE_TYPE_DATA)
384 new_l1d = this_leaf.size/1024; 385 new_l1d = this_leaf.size/1024;
@@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
386 CACHE_TYPE_INST) 387 CACHE_TYPE_INST)
387 new_l1i = this_leaf.size/1024; 388 new_l1i = this_leaf.size/1024;
388 break; 389 break;
389 case 2: 390 case 2:
390 new_l2 = this_leaf.size/1024; 391 new_l2 = this_leaf.size/1024;
391 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 392 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
392 index_msb = get_count_order(num_threads_sharing); 393 index_msb = get_count_order(num_threads_sharing);
393 l2_id = c->apicid >> index_msb; 394 l2_id = c->apicid >> index_msb;
394 break; 395 break;
395 case 3: 396 case 3:
396 new_l3 = this_leaf.size/1024; 397 new_l3 = this_leaf.size/1024;
397 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 398 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
398 index_msb = get_count_order(num_threads_sharing); 399 index_msb = get_count_order(
400 num_threads_sharing);
399 l3_id = c->apicid >> index_msb; 401 l3_id = c->apicid >> index_msb;
400 break; 402 break;
401 default: 403 default:
402 break; 404 break;
403 } 405 }
404 } 406 }
@@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
421 /* Number of times to iterate */ 423 /* Number of times to iterate */
422 n = cpuid_eax(2) & 0xFF; 424 n = cpuid_eax(2) & 0xFF;
423 425
424 for ( i = 0 ; i < n ; i++ ) { 426 for (i = 0 ; i < n ; i++) {
425 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); 427 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
426 428
427 /* If bit 31 is set, this is an unknown format */ 429 /* If bit 31 is set, this is an unknown format */
428 for ( j = 0 ; j < 3 ; j++ ) { 430 for (j = 0 ; j < 3 ; j++)
429 if (regs[j] & (1 << 31)) regs[j] = 0; 431 if (regs[j] & (1 << 31))
430 } 432 regs[j] = 0;
431 433
432 /* Byte 0 is level count, not a descriptor */ 434 /* Byte 0 is level count, not a descriptor */
433 for ( j = 1 ; j < 16 ; j++ ) { 435 for (j = 1 ; j < 16 ; j++) {
434 unsigned char des = dp[j]; 436 unsigned char des = dp[j];
435 unsigned char k = 0; 437 unsigned char k = 0;
436 438
437 /* look up this descriptor in the table */ 439 /* look up this descriptor in the table */
438 while (cache_table[k].descriptor != 0) 440 while (cache_table[k].descriptor != 0) {
439 {
440 if (cache_table[k].descriptor == des) { 441 if (cache_table[k].descriptor == des) {
441 if (only_trace && cache_table[k].cache_type != LVL_TRACE) 442 if (only_trace && cache_table[k].cache_type != LVL_TRACE)
442 break; 443 break;
@@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488 } 489 }
489 490
490 if (trace) 491 if (trace)
491 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); 492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
492 else if ( l1i ) 493 else if (l1i)
493 printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); 494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
494 495
495 if (l1d) 496 if (l1d)
496 printk(", L1 D cache: %dK\n", l1d); 497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
497 else 498 else
498 printk("\n"); 499 printk(KERN_CONT "\n");
499 500
500 if (l2) 501 if (l2)
501 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); 502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
@@ -522,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
522 int index_msb, i; 523 int index_msb, i;
523 struct cpuinfo_x86 *c = &cpu_data(cpu); 524 struct cpuinfo_x86 *c = &cpu_data(cpu);
524 525
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d;
528 for_each_online_cpu(i) {
529 if (!per_cpu(cpuid4_info, i))
530 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map),
534 d->llc_shared_map);
535 }
536 return;
537 }
525 this_leaf = CPUID4_INFO_IDX(cpu, index); 538 this_leaf = CPUID4_INFO_IDX(cpu, index);
526 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 539 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
527 540
@@ -558,8 +571,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
558 } 571 }
559} 572}
560#else 573#else
561static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} 574static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
562static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} 575{
576}
577
578static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
579{
580}
563#endif 581#endif
564 582
565static void __cpuinit free_cache_attributes(unsigned int cpu) 583static void __cpuinit free_cache_attributes(unsigned int cpu)
@@ -645,7 +663,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
645static ssize_t show_##file_name \ 663static ssize_t show_##file_name \
646 (struct _cpuid4_info *this_leaf, char *buf) \ 664 (struct _cpuid4_info *this_leaf, char *buf) \
647{ \ 665{ \
648 return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 666 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
649} 667}
650 668
651show_one_plus(level, eax.split.level, 0); 669show_one_plus(level, eax.split.level, 0);
@@ -656,7 +674,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
656 674
657static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 675static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
658{ 676{
659 return sprintf (buf, "%luK\n", this_leaf->size / 1024); 677 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
660} 678}
661 679
662static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, 680static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
@@ -669,7 +687,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
669 const struct cpumask *mask; 687 const struct cpumask *mask;
670 688
671 mask = to_cpumask(this_leaf->shared_cpu_map); 689 mask = to_cpumask(this_leaf->shared_cpu_map);
672 n = type? 690 n = type ?
673 cpulist_scnprintf(buf, len-2, mask) : 691 cpulist_scnprintf(buf, len-2, mask) :
674 cpumask_scnprintf(buf, len-2, mask); 692 cpumask_scnprintf(buf, len-2, mask);
675 buf[n++] = '\n'; 693 buf[n++] = '\n';
@@ -800,7 +818,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 818static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1); 819 show_cache_disable_1, store_cache_disable_1);
802 820
803static struct attribute * default_attrs[] = { 821static struct attribute *default_attrs[] = {
804 &type.attr, 822 &type.attr,
805 &level.attr, 823 &level.attr,
806 &coherency_line_size.attr, 824 &coherency_line_size.attr,
@@ -815,7 +833,7 @@ static struct attribute * default_attrs[] = {
815 NULL 833 NULL
816}; 834};
817 835
818static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 836static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
819{ 837{
820 struct _cache_attr *fattr = to_attr(attr); 838 struct _cache_attr *fattr = to_attr(attr);
821 struct _index_kobject *this_leaf = to_object(kobj); 839 struct _index_kobject *this_leaf = to_object(kobj);
@@ -828,8 +846,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
828 return ret; 846 return ret;
829} 847}
830 848
831static ssize_t store(struct kobject * kobj, struct attribute * attr, 849static ssize_t store(struct kobject *kobj, struct attribute *attr,
832 const char * buf, size_t count) 850 const char *buf, size_t count)
833{ 851{
834 struct _cache_attr *fattr = to_attr(attr); 852 struct _cache_attr *fattr = to_attr(attr);
835 struct _index_kobject *this_leaf = to_object(kobj); 853 struct _index_kobject *this_leaf = to_object(kobj);
@@ -883,7 +901,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
883 goto err_out; 901 goto err_out;
884 902
885 per_cpu(index_kobject, cpu) = kzalloc( 903 per_cpu(index_kobject, cpu) = kzalloc(
886 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 904 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
887 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 905 if (unlikely(per_cpu(index_kobject, cpu) == NULL))
888 goto err_out; 906 goto err_out;
889 907
@@ -917,7 +935,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
917 } 935 }
918 936
919 for (i = 0; i < num_cache_leaves; i++) { 937 for (i = 0; i < num_cache_leaves; i++) {
920 this_object = INDEX_KOBJECT_PTR(cpu,i); 938 this_object = INDEX_KOBJECT_PTR(cpu, i);
921 this_object->cpu = cpu; 939 this_object->cpu = cpu;
922 this_object->index = i; 940 this_object->index = i;
923 retval = kobject_init_and_add(&(this_object->kobj), 941 retval = kobject_init_and_add(&(this_object->kobj),
@@ -925,9 +943,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
925 per_cpu(cache_kobject, cpu), 943 per_cpu(cache_kobject, cpu),
926 "index%1lu", i); 944 "index%1lu", i);
927 if (unlikely(retval)) { 945 if (unlikely(retval)) {
928 for (j = 0; j < i; j++) { 946 for (j = 0; j < i; j++)
929 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); 947 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
930 }
931 kobject_put(per_cpu(cache_kobject, cpu)); 948 kobject_put(per_cpu(cache_kobject, cpu));
932 cpuid4_cache_sysfs_exit(cpu); 949 cpuid4_cache_sysfs_exit(cpu);
933 return retval; 950 return retval;
@@ -952,7 +969,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
952 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); 969 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
953 970
954 for (i = 0; i < num_cache_leaves; i++) 971 for (i = 0; i < num_cache_leaves; i++)
955 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 972 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
956 kobject_put(per_cpu(cache_kobject, cpu)); 973 kobject_put(per_cpu(cache_kobject, cpu));
957 cpuid4_cache_sysfs_exit(cpu); 974 cpuid4_cache_sysfs_exit(cpu);
958} 975}
@@ -977,8 +994,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
977 return NOTIFY_OK; 994 return NOTIFY_OK;
978} 995}
979 996
980static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = 997static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
981{
982 .notifier_call = cacheinfo_cpu_callback, 998 .notifier_call = cacheinfo_cpu_callback,
983}; 999};
984 1000
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2b..4ac6d48fe11b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
1obj-y = mce.o 1obj-y = mce.o mce-severity.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 3obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 6obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11 8
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For AMD Athlon/Duron: */
17static void k7_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 1; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57
58 /* Clear it: */
59 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
60 /* Serialize: */
61 wmb();
62 add_taint(TAINT_MACHINE_CHECK);
63 }
64 }
65
66 if (recover & 2)
67 panic("CPU context corrupt");
68 if (recover & 1)
69 panic("Unable to continue");
70
71 printk(KERN_EMERG "Attempting to continue.\n");
72
73 mcgstl &= ~(1<<2);
74 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
75}
76
77
78/* AMD K7 machine check is Intel like: */
79void amd_mcheck_init(struct cpuinfo_x86 *c)
80{
81 u32 l, h;
82 int i;
83
84 if (!cpu_has(c, X86_FEATURE_MCE))
85 return;
86
87 machine_check_vector = k7_machine_check;
88 /* Make sure the vector pointer is visible before we enable MCEs: */
89 wmb();
90
91 printk(KERN_INFO "Intel machine check architecture supported.\n");
92
93 rdmsr(MSR_IA32_MCG_CAP, l, h);
94 if (l & (1<<8)) /* Control register present ? */
95 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
96 nr_mce_banks = l & 0xff;
97
98 /*
99 * Clear status for MC index 0 separately, we don't touch CTL,
100 * as some K7 Athlons cause spurious MCEs when its enabled:
101 */
102 if (boot_cpu_data.x86 == 6) {
103 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
104 i = 1;
105 } else
106 i = 0;
107
108 for (; i < nr_mce_banks; i++) {
109 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
110 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
111 }
112
113 set_in_cr4(X86_CR4_MCE);
114 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
115 smp_processor_id());
116}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f09..472763d92098 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/notifier.h>
22#include <linux/kdebug.h>
23#include <linux/cpu.h>
24#include <linux/sched.h>
21#include <asm/mce.h> 25#include <asm/mce.h>
26#include <asm/apic.h>
22 27
23/* Update fake mce registers on current CPU. */ 28/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m) 29static void inject_mce(struct mce *m)
@@ -39,44 +44,142 @@ static void inject_mce(struct mce *m)
39 i->finished = 1; 44 i->finished = 1;
40} 45}
41 46
42struct delayed_mce { 47static void raise_poll(struct mce *m)
43 struct timer_list timer; 48{
44 struct mce m; 49 unsigned long flags;
45}; 50 mce_banks_t b;
46 51
47/* Inject mce on current CPU */ 52 memset(&b, 0xff, sizeof(mce_banks_t));
48static void raise_mce(unsigned long data) 53 local_irq_save(flags);
54 machine_check_poll(0, &b);
55 local_irq_restore(flags);
56 m->finished = 0;
57}
58
59static void raise_exception(struct mce *m, struct pt_regs *pregs)
49{ 60{
50 struct delayed_mce *dm = (struct delayed_mce *)data; 61 struct pt_regs regs;
51 struct mce *m = &dm->m; 62 unsigned long flags;
52 int cpu = m->extcpu;
53 63
54 inject_mce(m); 64 if (!pregs) {
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs)); 65 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip; 66 regs.ip = m->ip;
59 regs.cs = m->cs; 67 regs.cs = m->cs;
68 pregs = &regs;
69 }
70 /* in mcheck exeception handler, irq will be disabled */
71 local_irq_save(flags);
72 do_machine_check(pregs, 0);
73 local_irq_restore(flags);
74 m->finished = 0;
75}
76
77static cpumask_t mce_inject_cpumask;
78
79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data)
81{
82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs);
90 else if (m->status)
91 raise_poll(m);
92 return NOTIFY_STOP;
93}
94
95static struct notifier_block mce_raise_nb = {
96 .notifier_call = mce_raise_notify,
97 .priority = 1000,
98};
99
100/* Inject mce on current CPU */
101static int raise_local(void)
102{
103 struct mce *m = &__get_cpu_var(injectm);
104 int context = MCJ_CTX(m->inject_flags);
105 int ret = 0;
106 int cpu = m->extcpu;
107
108 if (m->inject_flags & MCJ_EXCEPTION) {
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); 109 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0); 110 switch (context) {
111 case MCJ_CTX_IRQ:
112 /*
113 * Could do more to fake interrupts like
114 * calling irq_enter, but the necessary
115 * machinery isn't exported currently.
116 */
117 /*FALL THROUGH*/
118 case MCJ_CTX_PROCESS:
119 raise_exception(m, NULL);
120 break;
121 default:
122 printk(KERN_INFO "Invalid MCE context\n");
123 ret = -EINVAL;
124 }
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); 125 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else { 126 } else if (m->status) {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); 127 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b); 128 raise_poll(m);
68 mce_notify_irq(); 129 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n", 130 printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
70 cpu); 131 } else
71 } 132 m->finished = 0;
72 kfree(dm); 133
134 return ret;
135}
136
137static void raise_mce(struct mce *m)
138{
139 int context = MCJ_CTX(m->inject_flags);
140
141 inject_mce(m);
142
143 if (context == MCJ_CTX_RANDOM)
144 return;
145
146#ifdef CONFIG_X86_LOCAL_APIC
147 if (m->inject_flags & MCJ_NMI_BROADCAST) {
148 unsigned long start;
149 int cpu;
150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map;
152 cpu_clear(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask);
158 }
159 if (!cpus_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask));
167 break;
168 }
169 cpu_relax();
170 }
171 raise_local();
172 put_cpu();
173 put_online_cpus();
174 } else
175#endif
176 raise_local();
73} 177}
74 178
75/* Error injection interface */ 179/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf, 180static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off) 181 size_t usize, loff_t *off)
78{ 182{
79 struct delayed_mce *dm;
80 struct mce m; 183 struct mce m;
81 184
82 if (!capable(CAP_SYS_ADMIN)) 185 if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) 199 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL; 200 return -EINVAL;
98 201
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /* 202 /*
104 * Need to give user space some time to set everything up, 203 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere. 204 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */ 205 */
108 memcpy(&dm->m, &m, sizeof(struct mce)); 206 schedule_timeout(2);
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm); 207 raise_mce(&m);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize; 208 return usize;
113} 209}
114 210
@@ -116,6 +212,7 @@ static int inject_init(void)
116{ 212{
117 printk(KERN_INFO "Machine check injector initialized\n"); 213 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write; 214 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb);
119 return 0; 216 return 0;
120} 217}
121 218
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e5..32996f9fab67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
1#include <linux/sysdev.h>
1#include <asm/mce.h> 2#include <asm/mce.h>
2 3
3enum severity_level { 4enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
10 MCE_PANIC_SEVERITY, 11 MCE_PANIC_SEVERITY,
11}; 12};
12 13
14#define ATTR_LEN 16
15
16/* One object for each MCE bank, shared by all CPUs */
17struct mce_bank {
18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */
22};
23
13int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void);
14 26
15extern int mce_ser; 27extern int mce_ser;
28
29extern struct mce_bank *mce_banks;
30
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f97056..8a85dd1b1aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
139 } 139 }
140} 140}
141 141
142#ifdef CONFIG_DEBUG_FS
142static void *s_start(struct seq_file *f, loff_t *pos) 143static void *s_start(struct seq_file *f, loff_t *pos)
143{ 144{
144 if (*pos >= ARRAY_SIZE(severities)) 145 if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
197{ 198{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 199 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199 200
200 dmce = debugfs_create_dir("mce", NULL); 201 dmce = mce_get_debugfs_dir();
201 if (dmce == NULL) 202 if (dmce == NULL)
202 goto err_out; 203 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage", 204 fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
209 return 0; 210 return 0;
210 211
211err_out: 212err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM; 213 return -ENOMEM;
217} 214}
218late_initcall(severities_debugfs_init); 215late_initcall(severities_debugfs_init);
216#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..183c3457d2f4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/debugfs.h>
37 38
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
@@ -45,21 +46,8 @@
45 46
46#include "mce-internal.h" 47#include "mce-internal.h"
47 48
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly; 49int mce_disabled __read_mostly;
60 50
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227 51#define MISC_MCELOG_MINOR 227
64 52
65#define SPINUNIT 100 /* 100ns */ 53#define SPINUNIT 100 /* 100ns */
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
77 */ 65 */
78static int tolerant __read_mostly = 1; 66static int tolerant __read_mostly = 1;
79static int banks __read_mostly; 67static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly; 68static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1; 69static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1; 70static int monarch_timeout __read_mostly = -1;
@@ -87,13 +74,13 @@ int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly; 74int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly; 75int mce_ser __read_mostly;
89 76
77struct mce_bank *mce_banks __read_mostly;
78
90/* User mode helper program triggered by machine check event */ 79/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 80static unsigned long mce_need_notify;
92static char mce_helper[128]; 81static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL }; 82static char *mce_helper_argv[2] = { mce_helper, NULL };
94 83
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 84static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen); 85static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 86static int cpu_missing;
@@ -104,11 +91,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 91 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105}; 92};
106 93
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work); 94static DEFINE_PER_CPU(struct work_struct, mce_work);
113 95
114/* Do initial initialization of a struct mce */ 96/* Do initial initialization of a struct mce */
@@ -183,6 +165,11 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 165 set_bit(0, &mce_need_notify);
184} 166}
185 167
168void __weak decode_mce(struct mce *m)
169{
170 return;
171}
172
186static void print_mce(struct mce *m) 173static void print_mce(struct mce *m)
187{ 174{
188 printk(KERN_EMERG 175 printk(KERN_EMERG
@@ -205,6 +192,8 @@ static void print_mce(struct mce *m)
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 192 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 193 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 194 m->apicid);
195
196 decode_mce(m);
208} 197}
209 198
210static void print_mce_head(void) 199static void print_mce_head(void)
@@ -222,6 +211,9 @@ static void print_mce_tail(void)
222 211
223static atomic_t mce_paniced; 212static atomic_t mce_paniced;
224 213
214static int fake_panic;
215static atomic_t mce_fake_paniced;
216
225/* Panic in progress. Enable interrupts and wait for final IPI */ 217/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void) 218static void wait_for_panic(void)
227{ 219{
@@ -239,15 +231,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
239{ 231{
240 int i; 232 int i;
241 233
242 /* 234 if (!fake_panic) {
243 * Make sure only one CPU runs in machine check panic 235 /*
244 */ 236 * Make sure only one CPU runs in machine check panic
245 if (atomic_add_return(1, &mce_paniced) > 1) 237 */
246 wait_for_panic(); 238 if (atomic_inc_return(&mce_paniced) > 1)
247 barrier(); 239 wait_for_panic();
240 barrier();
248 241
249 bust_spinlocks(1); 242 bust_spinlocks(1);
250 console_verbose(); 243 console_verbose();
244 } else {
245 /* Don't log too much for fake panic */
246 if (atomic_inc_return(&mce_fake_paniced) > 1)
247 return;
248 }
251 print_mce_head(); 249 print_mce_head();
252 /* First print corrected ones that are still unlogged */ 250 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) { 251 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -274,9 +272,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
274 print_mce_tail(); 272 print_mce_tail();
275 if (exp) 273 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp); 274 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0) 275 if (!fake_panic) {
278 panic_timeout = mce_panic_timeout; 276 if (panic_timeout == 0)
279 panic(msg); 277 panic_timeout = mce_panic_timeout;
278 panic(msg);
279 } else
280 printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
280} 281}
281 282
282/* Support code for software error injection */ 283/* Support code for software error injection */
@@ -286,11 +287,11 @@ static int msr_to_offset(u32 msr)
286 unsigned bank = __get_cpu_var(injectm.bank); 287 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr) 288 if (msr == rip_msr)
288 return offsetof(struct mce, ip); 289 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 290 if (msr == MSR_IA32_MCx_STATUS(bank))
290 return offsetof(struct mce, status); 291 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 292 if (msr == MSR_IA32_MCx_ADDR(bank))
292 return offsetof(struct mce, addr); 293 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4) 294 if (msr == MSR_IA32_MCx_MISC(bank))
294 return offsetof(struct mce, misc); 295 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS) 296 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus); 297 return offsetof(struct mce, mcgstatus);
@@ -301,13 +302,25 @@ static int msr_to_offset(u32 msr)
301static u64 mce_rdmsrl(u32 msr) 302static u64 mce_rdmsrl(u32 msr)
302{ 303{
303 u64 v; 304 u64 v;
305
304 if (__get_cpu_var(injectm).finished) { 306 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr); 307 int offset = msr_to_offset(msr);
308
306 if (offset < 0) 309 if (offset < 0)
307 return 0; 310 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 311 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 } 312 }
310 rdmsrl(msr, v); 313
314 if (rdmsrl_safe(msr, &v)) {
315 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
316 /*
317 * Return zero in case the access faulted. This should
318 * not happen normally but can happen if the CPU does
319 * something weird, or if the code is buggy.
320 */
321 v = 0;
322 }
323
311 return v; 324 return v;
312} 325}
313 326
@@ -315,6 +328,7 @@ static void mce_wrmsrl(u32 msr, u64 v)
315{ 328{
316 if (__get_cpu_var(injectm).finished) { 329 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr); 330 int offset = msr_to_offset(msr);
331
318 if (offset >= 0) 332 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 333 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return; 334 return;
@@ -411,7 +425,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
411 m->ip = mce_rdmsrl(rip_msr); 425 m->ip = mce_rdmsrl(rip_msr);
412} 426}
413 427
414#ifdef CONFIG_X86_LOCAL_APIC 428#ifdef CONFIG_X86_LOCAL_APIC
415/* 429/*
416 * Called after interrupts have been reenabled again 430 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region 431 * when a MCE happened during an interrupts off region
@@ -495,7 +509,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
495 509
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 510 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) { 511 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b)) 512 if (!mce_banks[i].ctl || !test_bit(i, *b))
499 continue; 513 continue;
500 514
501 m.misc = 0; 515 m.misc = 0;
@@ -504,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
504 m.tsc = 0; 518 m.tsc = 0;
505 519
506 barrier(); 520 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 521 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
508 if (!(m.status & MCI_STATUS_VAL)) 522 if (!(m.status & MCI_STATUS_VAL))
509 continue; 523 continue;
510 524
@@ -519,9 +533,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
519 continue; 533 continue;
520 534
521 if (m.status & MCI_STATUS_MISCV) 535 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 536 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
523 if (m.status & MCI_STATUS_ADDRV) 537 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 538 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
525 539
526 if (!(flags & MCP_TIMESTAMP)) 540 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0; 541 m.tsc = 0;
@@ -537,7 +551,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
537 /* 551 /*
538 * Clear state for this bank. 552 * Clear state for this bank.
539 */ 553 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 554 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
541 } 555 }
542 556
543 /* 557 /*
@@ -558,7 +572,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
558 int i; 572 int i;
559 573
560 for (i = 0; i < banks; i++) { 574 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 575 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 576 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1; 577 return 1;
564 } 578 }
@@ -618,7 +632,7 @@ out:
618 * This way we prevent any potential data corruption in a unrecoverable case 632 * This way we prevent any potential data corruption in a unrecoverable case
619 * and also makes sure always all CPU's errors are examined. 633 * and also makes sure always all CPU's errors are examined.
620 * 634 *
621 * Also this detects the case of an machine check event coming from outer 635 * Also this detects the case of a machine check event coming from outer
622 * space (not detected by any CPUs) In this case some external agent wants 636 * space (not detected by any CPUs) In this case some external agent wants
623 * us to shut down, so panic too. 637 * us to shut down, so panic too.
624 * 638 *
@@ -671,7 +685,7 @@ static void mce_reign(void)
671 * No machine check event found. Must be some external 685 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic. 686 * source or one CPU is hung. Panic.
673 */ 687 */
674 if (!m && tolerant < 3) 688 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL); 689 mce_panic("Machine check from unknown source", NULL, NULL);
676 690
677 /* 691 /*
@@ -705,7 +719,7 @@ static int mce_start(int *no_way_out)
705 * global_nwo should be updated before mce_callin 719 * global_nwo should be updated before mce_callin
706 */ 720 */
707 smp_wmb(); 721 smp_wmb();
708 order = atomic_add_return(1, &mce_callin); 722 order = atomic_inc_return(&mce_callin);
709 723
710 /* 724 /*
711 * Wait for everyone. 725 * Wait for everyone.
@@ -842,7 +856,7 @@ static void mce_clear_state(unsigned long *toclear)
842 856
843 for (i = 0; i < banks; i++) { 857 for (i = 0; i < banks; i++) {
844 if (test_bit(i, toclear)) 858 if (test_bit(i, toclear))
845 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 859 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
846 } 860 }
847} 861}
848 862
@@ -895,11 +909,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
895 mce_setup(&m); 909 mce_setup(&m);
896 910
897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 911 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
898 no_way_out = mce_no_way_out(&m, &msg);
899
900 final = &__get_cpu_var(mces_seen); 912 final = &__get_cpu_var(mces_seen);
901 *final = m; 913 *final = m;
902 914
915 no_way_out = mce_no_way_out(&m, &msg);
916
903 barrier(); 917 barrier();
904 918
905 /* 919 /*
@@ -916,14 +930,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
916 order = mce_start(&no_way_out); 930 order = mce_start(&no_way_out);
917 for (i = 0; i < banks; i++) { 931 for (i = 0; i < banks; i++) {
918 __clear_bit(i, toclear); 932 __clear_bit(i, toclear);
919 if (!bank[i]) 933 if (!mce_banks[i].ctl)
920 continue; 934 continue;
921 935
922 m.misc = 0; 936 m.misc = 0;
923 m.addr = 0; 937 m.addr = 0;
924 m.bank = i; 938 m.bank = i;
925 939
926 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 940 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
927 if ((m.status & MCI_STATUS_VAL) == 0) 941 if ((m.status & MCI_STATUS_VAL) == 0)
928 continue; 942 continue;
929 943
@@ -964,9 +978,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
964 kill_it = 1; 978 kill_it = 1;
965 979
966 if (m.status & MCI_STATUS_MISCV) 980 if (m.status & MCI_STATUS_MISCV)
967 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 981 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
968 if (m.status & MCI_STATUS_ADDRV) 982 if (m.status & MCI_STATUS_ADDRV)
969 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 983 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
970 984
971 /* 985 /*
972 * Action optional error. Queue address for later processing. 986 * Action optional error. Queue address for later processing.
@@ -1091,7 +1105,7 @@ void mce_log_therm_throt_event(__u64 status)
1091 */ 1105 */
1092static int check_interval = 5 * 60; /* 5 minutes */ 1106static int check_interval = 5 * 60; /* 5 minutes */
1093 1107
1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1108static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1109static DEFINE_PER_CPU(struct timer_list, mce_timer);
1096 1110
1097static void mcheck_timer(unsigned long data) 1111static void mcheck_timer(unsigned long data)
@@ -1110,7 +1124,7 @@ static void mcheck_timer(unsigned long data)
1110 * Alert userspace if needed. If we logged an MCE, reduce the 1124 * Alert userspace if needed. If we logged an MCE, reduce the
1111 * polling interval, otherwise increase the polling interval. 1125 * polling interval, otherwise increase the polling interval.
1112 */ 1126 */
1113 n = &__get_cpu_var(next_interval); 1127 n = &__get_cpu_var(mce_next_interval);
1114 if (mce_notify_irq()) 1128 if (mce_notify_irq())
1115 *n = max(*n/2, HZ/100); 1129 *n = max(*n/2, HZ/100);
1116 else 1130 else
@@ -1159,10 +1173,26 @@ int mce_notify_irq(void)
1159} 1173}
1160EXPORT_SYMBOL_GPL(mce_notify_irq); 1174EXPORT_SYMBOL_GPL(mce_notify_irq);
1161 1175
1176static int mce_banks_init(void)
1177{
1178 int i;
1179
1180 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1181 if (!mce_banks)
1182 return -ENOMEM;
1183 for (i = 0; i < banks; i++) {
1184 struct mce_bank *b = &mce_banks[i];
1185
1186 b->ctl = -1ULL;
1187 b->init = 1;
1188 }
1189 return 0;
1190}
1191
1162/* 1192/*
1163 * Initialize Machine Checks for a CPU. 1193 * Initialize Machine Checks for a CPU.
1164 */ 1194 */
1165static int mce_cap_init(void) 1195static int __cpuinit mce_cap_init(void)
1166{ 1196{
1167 unsigned b; 1197 unsigned b;
1168 u64 cap; 1198 u64 cap;
@@ -1182,11 +1212,11 @@ static int mce_cap_init(void)
1182 /* Don't support asymmetric configurations today */ 1212 /* Don't support asymmetric configurations today */
1183 WARN_ON(banks != 0 && b != banks); 1213 WARN_ON(banks != 0 && b != banks);
1184 banks = b; 1214 banks = b;
1185 if (!bank) { 1215 if (!mce_banks) {
1186 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1216 int err = mce_banks_init();
1187 if (!bank) 1217
1188 return -ENOMEM; 1218 if (err)
1189 memset(bank, 0xff, banks * sizeof(u64)); 1219 return err;
1190 } 1220 }
1191 1221
1192 /* Use accurate RIP reporting if available. */ 1222 /* Use accurate RIP reporting if available. */
@@ -1218,15 +1248,17 @@ static void mce_init(void)
1218 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1248 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1219 1249
1220 for (i = 0; i < banks; i++) { 1250 for (i = 0; i < banks; i++) {
1221 if (skip_bank_init(i)) 1251 struct mce_bank *b = &mce_banks[i];
1252
1253 if (!b->init)
1222 continue; 1254 continue;
1223 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1255 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1224 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1256 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1225 } 1257 }
1226} 1258}
1227 1259
1228/* Add per CPU specific workarounds here */ 1260/* Add per CPU specific workarounds here */
1229static int mce_cpu_quirks(struct cpuinfo_x86 *c) 1261static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1230{ 1262{
1231 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1263 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1232 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1264 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1241,7 +1273,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1241 * trips off incorrectly with the IOMMU & 3ware 1273 * trips off incorrectly with the IOMMU & 3ware
1242 * & Cerberus: 1274 * & Cerberus:
1243 */ 1275 */
1244 clear_bit(10, (unsigned long *)&bank[4]); 1276 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1245 } 1277 }
1246 if (c->x86 <= 17 && mce_bootlog < 0) { 1278 if (c->x86 <= 17 && mce_bootlog < 0) {
1247 /* 1279 /*
@@ -1255,7 +1287,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1255 * by default. 1287 * by default.
1256 */ 1288 */
1257 if (c->x86 == 6 && banks > 0) 1289 if (c->x86 == 6 && banks > 0)
1258 bank[0] = 0; 1290 mce_banks[0].ctl = 0;
1259 } 1291 }
1260 1292
1261 if (c->x86_vendor == X86_VENDOR_INTEL) { 1293 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1268,8 +1300,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1268 * valid event later, merely don't write CTL0. 1300 * valid event later, merely don't write CTL0.
1269 */ 1301 */
1270 1302
1271 if (c->x86 == 6 && c->x86_model < 0x1A) 1303 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1272 __set_bit(0, &dont_init_banks); 1304 mce_banks[0].init = 0;
1273 1305
1274 /* 1306 /*
1275 * All newer Intel systems support MCE broadcasting. Enable 1307 * All newer Intel systems support MCE broadcasting. Enable
@@ -1325,7 +1357,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1325static void mce_init_timer(void) 1357static void mce_init_timer(void)
1326{ 1358{
1327 struct timer_list *t = &__get_cpu_var(mce_timer); 1359 struct timer_list *t = &__get_cpu_var(mce_timer);
1328 int *n = &__get_cpu_var(next_interval); 1360 int *n = &__get_cpu_var(mce_next_interval);
1329 1361
1330 if (mce_ignore_ce) 1362 if (mce_ignore_ce)
1331 return; 1363 return;
@@ -1338,6 +1370,17 @@ static void mce_init_timer(void)
1338 add_timer_on(t, smp_processor_id()); 1370 add_timer_on(t, smp_processor_id());
1339} 1371}
1340 1372
1373/* Handle unconfigured int18 (should never happen) */
1374static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1375{
1376 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1377 smp_processor_id());
1378}
1379
1380/* Call the installed machine check handler for this CPU setup. */
1381void (*machine_check_vector)(struct pt_regs *, long error_code) =
1382 unexpected_machine_check;
1383
1341/* 1384/*
1342 * Called for each booted CPU to set up machine checks. 1385 * Called for each booted CPU to set up machine checks.
1343 * Must be called with preempt off: 1386 * Must be called with preempt off:
@@ -1551,8 +1594,10 @@ static struct miscdevice mce_log_device = {
1551 */ 1594 */
1552static int __init mcheck_enable(char *str) 1595static int __init mcheck_enable(char *str)
1553{ 1596{
1554 if (*str == 0) 1597 if (*str == 0) {
1555 enable_p5_mce(); 1598 enable_p5_mce();
1599 return 1;
1600 }
1556 if (*str == '=') 1601 if (*str == '=')
1557 str++; 1602 str++;
1558 if (!strcmp(str, "off")) 1603 if (!strcmp(str, "off"))
@@ -1593,8 +1638,10 @@ static int mce_disable(void)
1593 int i; 1638 int i;
1594 1639
1595 for (i = 0; i < banks; i++) { 1640 for (i = 0; i < banks; i++) {
1596 if (!skip_bank_init(i)) 1641 struct mce_bank *b = &mce_banks[i];
1597 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1642
1643 if (b->init)
1644 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1598 } 1645 }
1599 return 0; 1646 return 0;
1600} 1647}
@@ -1669,14 +1716,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
1669__cpuinitdata 1716__cpuinitdata
1670void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1717void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1671 1718
1672static struct sysdev_attribute *bank_attrs; 1719static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1720{
1721 return container_of(attr, struct mce_bank, attr);
1722}
1673 1723
1674static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1724static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1675 char *buf) 1725 char *buf)
1676{ 1726{
1677 u64 b = bank[attr - bank_attrs]; 1727 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1678
1679 return sprintf(buf, "%llx\n", b);
1680} 1728}
1681 1729
1682static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1730static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1687,7 +1735,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1687 if (strict_strtoull(buf, 0, &new) < 0) 1735 if (strict_strtoull(buf, 0, &new) < 0)
1688 return -EINVAL; 1736 return -EINVAL;
1689 1737
1690 bank[attr - bank_attrs] = new; 1738 attr_to_bank(attr)->ctl = new;
1691 mce_restart(); 1739 mce_restart();
1692 1740
1693 return size; 1741 return size;
@@ -1829,7 +1877,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1829 } 1877 }
1830 for (j = 0; j < banks; j++) { 1878 for (j = 0; j < banks; j++) {
1831 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1879 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1832 &bank_attrs[j]); 1880 &mce_banks[j].attr);
1833 if (err) 1881 if (err)
1834 goto error2; 1882 goto error2;
1835 } 1883 }
@@ -1838,10 +1886,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1838 return 0; 1886 return 0;
1839error2: 1887error2:
1840 while (--j >= 0) 1888 while (--j >= 0)
1841 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); 1889 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1842error: 1890error:
1843 while (--i >= 0) 1891 while (--i >= 0)
1844 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1892 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1845 1893
1846 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1894 sysdev_unregister(&per_cpu(mce_dev, cpu));
1847 1895
@@ -1859,7 +1907,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1859 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1907 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1860 1908
1861 for (i = 0; i < banks; i++) 1909 for (i = 0; i < banks; i++)
1862 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1910 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1863 1911
1864 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1912 sysdev_unregister(&per_cpu(mce_dev, cpu));
1865 cpumask_clear_cpu(cpu, mce_dev_initialized); 1913 cpumask_clear_cpu(cpu, mce_dev_initialized);
@@ -1876,8 +1924,10 @@ static void mce_disable_cpu(void *h)
1876 if (!(action & CPU_TASKS_FROZEN)) 1924 if (!(action & CPU_TASKS_FROZEN))
1877 cmci_clear(); 1925 cmci_clear();
1878 for (i = 0; i < banks; i++) { 1926 for (i = 0; i < banks; i++) {
1879 if (!skip_bank_init(i)) 1927 struct mce_bank *b = &mce_banks[i];
1880 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1928
1929 if (b->init)
1930 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1881 } 1931 }
1882} 1932}
1883 1933
@@ -1892,8 +1942,10 @@ static void mce_reenable_cpu(void *h)
1892 if (!(action & CPU_TASKS_FROZEN)) 1942 if (!(action & CPU_TASKS_FROZEN))
1893 cmci_reenable(); 1943 cmci_reenable();
1894 for (i = 0; i < banks; i++) { 1944 for (i = 0; i < banks; i++) {
1895 if (!skip_bank_init(i)) 1945 struct mce_bank *b = &mce_banks[i];
1896 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1946
1947 if (b->init)
1948 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1897 } 1949 }
1898} 1950}
1899 1951
@@ -1925,7 +1977,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1925 case CPU_DOWN_FAILED: 1977 case CPU_DOWN_FAILED:
1926 case CPU_DOWN_FAILED_FROZEN: 1978 case CPU_DOWN_FAILED_FROZEN:
1927 t->expires = round_jiffies(jiffies + 1979 t->expires = round_jiffies(jiffies +
1928 __get_cpu_var(next_interval)); 1980 __get_cpu_var(mce_next_interval));
1929 add_timer_on(t, cpu); 1981 add_timer_on(t, cpu);
1930 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1982 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1931 break; 1983 break;
@@ -1941,35 +1993,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1941 .notifier_call = mce_cpu_callback, 1993 .notifier_call = mce_cpu_callback,
1942}; 1994};
1943 1995
1944static __init int mce_init_banks(void) 1996static __init void mce_init_banks(void)
1945{ 1997{
1946 int i; 1998 int i;
1947 1999
1948 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1949 GFP_KERNEL);
1950 if (!bank_attrs)
1951 return -ENOMEM;
1952
1953 for (i = 0; i < banks; i++) { 2000 for (i = 0; i < banks; i++) {
1954 struct sysdev_attribute *a = &bank_attrs[i]; 2001 struct mce_bank *b = &mce_banks[i];
2002 struct sysdev_attribute *a = &b->attr;
1955 2003
1956 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 2004 a->attr.name = b->attrname;
1957 if (!a->attr.name) 2005 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
1958 goto nomem;
1959 2006
1960 a->attr.mode = 0644; 2007 a->attr.mode = 0644;
1961 a->show = show_bank; 2008 a->show = show_bank;
1962 a->store = set_bank; 2009 a->store = set_bank;
1963 } 2010 }
1964 return 0;
1965
1966nomem:
1967 while (--i >= 0)
1968 kfree(bank_attrs[i].attr.name);
1969 kfree(bank_attrs);
1970 bank_attrs = NULL;
1971
1972 return -ENOMEM;
1973} 2011}
1974 2012
1975static __init int mce_init_device(void) 2013static __init int mce_init_device(void)
@@ -1982,9 +2020,7 @@ static __init int mce_init_device(void)
1982 2020
1983 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2021 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1984 2022
1985 err = mce_init_banks(); 2023 mce_init_banks();
1986 if (err)
1987 return err;
1988 2024
1989 err = sysdev_class_register(&mce_sysclass); 2025 err = sysdev_class_register(&mce_sysclass);
1990 if (err) 2026 if (err)
@@ -2004,57 +2040,65 @@ static __init int mce_init_device(void)
2004 2040
2005device_initcall(mce_init_device); 2041device_initcall(mce_init_device);
2006 2042
2007#else /* CONFIG_X86_OLD_MCE: */ 2043/*
2008 2044 * Old style boot options parsing. Only for compatibility.
2009int nr_mce_banks; 2045 */
2010EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2046static int __init mcheck_disable(char *str)
2047{
2048 mce_disabled = 1;
2049 return 1;
2050}
2051__setup("nomce", mcheck_disable);
2011 2052
2012/* This has to be run for each processor */ 2053#ifdef CONFIG_DEBUG_FS
2013void mcheck_init(struct cpuinfo_x86 *c) 2054struct dentry *mce_get_debugfs_dir(void)
2014{ 2055{
2015 if (mce_disabled) 2056 static struct dentry *dmce;
2016 return;
2017 2057
2018 switch (c->x86_vendor) { 2058 if (!dmce)
2019 case X86_VENDOR_AMD: 2059 dmce = debugfs_create_dir("mce", NULL);
2020 amd_mcheck_init(c);
2021 break;
2022 2060
2023 case X86_VENDOR_INTEL: 2061 return dmce;
2024 if (c->x86 == 5) 2062}
2025 intel_p5_mcheck_init(c);
2026 if (c->x86 == 6)
2027 intel_p6_mcheck_init(c);
2028 if (c->x86 == 15)
2029 intel_p4_mcheck_init(c);
2030 break;
2031 2063
2032 case X86_VENDOR_CENTAUR: 2064static void mce_reset(void)
2033 if (c->x86 == 5) 2065{
2034 winchip_mcheck_init(c); 2066 cpu_missing = 0;
2035 break; 2067 atomic_set(&mce_fake_paniced, 0);
2068 atomic_set(&mce_executing, 0);
2069 atomic_set(&mce_callin, 0);
2070 atomic_set(&global_nwo, 0);
2071}
2036 2072
2037 default: 2073static int fake_panic_get(void *data, u64 *val)
2038 break; 2074{
2039 } 2075 *val = fake_panic;
2040 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2076 return 0;
2041} 2077}
2042 2078
2043static int __init mcheck_enable(char *str) 2079static int fake_panic_set(void *data, u64 val)
2044{ 2080{
2045 mce_p5_enabled = 1; 2081 mce_reset();
2046 return 1; 2082 fake_panic = val;
2083 return 0;
2047} 2084}
2048__setup("mce", mcheck_enable);
2049 2085
2050#endif /* CONFIG_X86_OLD_MCE */ 2086DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2087 fake_panic_set, "%llu\n");
2051 2088
2052/* 2089static int __init mce_debugfs_init(void)
2053 * Old style boot options parsing. Only for compatibility.
2054 */
2055static int __init mcheck_disable(char *str)
2056{ 2090{
2057 mce_disabled = 1; 2091 struct dentry *dmce, *ffake_panic;
2058 return 1; 2092
2093 dmce = mce_get_debugfs_dir();
2094 if (!dmce)
2095 return -ENOMEM;
2096 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2097 &fake_panic_fops);
2098 if (!ffake_panic)
2099 return -ENOMEM;
2100
2101 return 0;
2059} 2102}
2060__setup("nomce", mcheck_disable); 2103late_initcall(mce_debugfs_init);
2104#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..83a3d1f4efca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
@@ -489,12 +489,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 int i, err = 0; 489 int i, err = 0;
490 struct threshold_bank *b = NULL; 490 struct threshold_bank *b = NULL;
491 char name[32]; 491 char name[32];
492#ifdef CONFIG_SMP
493 struct cpuinfo_x86 *c = &cpu_data(cpu);
494#endif
492 495
493 sprintf(name, "threshold_bank%i", bank); 496 sprintf(name, "threshold_bank%i", bank);
494 497
495#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
496 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 499 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
497 i = cpumask_first(cpu_core_mask(cpu)); 500 i = cpumask_first(c->llc_shared_map);
498 501
499 /* first core not up yet */ 502 /* first core not up yet */
500 if (cpu_data(i).cpu_core_id) 503 if (cpu_data(i).cpu_core_id)
@@ -514,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
514 if (err) 517 if (err)
515 goto out; 518 goto out;
516 519
517 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 520 cpumask_copy(b->cpus, c->llc_shared_map);
518 per_cpu(threshold_banks, cpu)[bank] = b; 521 per_cpu(threshold_banks, cpu)[bank] = b;
519 522
520 goto out; 523 goto out;
@@ -539,7 +542,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
539#ifndef CONFIG_SMP 542#ifndef CONFIG_SMP
540 cpumask_setall(b->cpus); 543 cpumask_setall(b->cpus);
541#else 544#else
542 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 545 cpumask_copy(b->cpus, c->llc_shared_map);
543#endif 546#endif
544 547
545 per_cpu(threshold_banks, cpu)[bank] = b; 548 per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a32..889f665fe93d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot)
90 if (test_bit(i, owned)) 90 if (test_bit(i, owned))
91 continue; 91 continue;
92 92
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 93 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
94 94
95 /* Already owned by someone else? */ 95 /* Already owned by someone else? */
96 if (val & CMCI_EN) { 96 if (val & CMCI_EN) {
@@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot)
101 } 101 }
102 102
103 val |= CMCI_EN | CMCI_THRESHOLD; 103 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 104 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 105 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
106 106
107 /* Did the enable bit stick? -- the bank supports CMCI */ 107 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) { 108 if (val & CMCI_EN) {
@@ -152,9 +152,9 @@ void cmci_clear(void)
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 152 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue; 153 continue;
154 /* Disable CMCI */ 154 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 155 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 157 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 158 __clear_bit(i, __get_cpu_var(mce_banks_owned));
159 } 159 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags); 160 spin_unlock_irqrestore(&cmci_discover_lock, flags);
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9#include <linux/interrupt.h>
10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17
18#include <asm/processor.h>
19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h>
22
23static int firstbank;
24
25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27static void mce_checkregs(void *info)
28{
29 u32 low, high;
30 int i;
31
32 for (i = firstbank; i < nr_mce_banks; i++) {
33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
34
35 if (!(high & (1<<31)))
36 continue;
37
38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
40 smp_processor_id());
41
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /*
45 * Scrub the error so we don't pick it up in MCE_RATE
46 * seconds time:
47 */
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
49
50 /* Serialize: */
51 wmb();
52 add_taint(TAINT_MACHINE_CHECK);
53 }
54}
55
56static void mce_work_fn(struct work_struct *work);
57static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
58
59static void mce_work_fn(struct work_struct *work)
60{
61 on_each_cpu(mce_checkregs, NULL, 1);
62 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
63}
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0;
91}
92module_init(init_nonfatal_mce_checker);
93
94MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/smp.h>
8
9#include <asm/processor.h>
10#include <asm/mce.h>
11#include <asm/msr.h>
12
13/* as supported by the P4/Xeon family */
14struct intel_mce_extended_msrs {
15 u32 eax;
16 u32 ebx;
17 u32 ecx;
18 u32 edx;
19 u32 esi;
20 u32 edi;
21 u32 ebp;
22 u32 esp;
23 u32 eflags;
24 u32 eip;
25 /* u32 *reserved[]; */
26};
27
28static int mce_num_extended_msrs;
29
30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
32{
33 u32 h;
34
35 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
36 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
37 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
38 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
39 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
40 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
41 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
42 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
43 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
44 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
45}
46
47static void intel_machine_check(struct pt_regs *regs, long error_code)
48{
49 u32 alow, ahigh, high, low;
50 u32 mcgstl, mcgsth;
51 int recover = 1;
52 int i;
53
54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
55 if (mcgstl & (1<<0)) /* Recoverable ? */
56 recover = 0;
57
58 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
59 smp_processor_id(), mcgsth, mcgstl);
60
61 if (mce_num_extended_msrs > 0) {
62 struct intel_mce_extended_msrs dbg;
63
64 intel_get_extended_msrs(&dbg);
65
66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
69 smp_processor_id(), dbg.eip, dbg.eflags,
70 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
71 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
72 }
73
74 for (i = 0; i < nr_mce_banks; i++) {
75 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
76 if (high & (1<<31)) {
77 char misc[20];
78 char addr[24];
79
80 misc[0] = addr[0] = '\0';
81 if (high & (1<<29))
82 recover |= 1;
83 if (high & (1<<25))
84 recover |= 2;
85 high &= ~(1<<31);
86 if (high & (1<<27)) {
87 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
88 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
89 }
90 if (high & (1<<26)) {
91 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
92 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
93 }
94 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
95 smp_processor_id(), i, high, low, misc, addr);
96 }
97 }
98
99 if (recover & 2)
100 panic("CPU context corrupt");
101 if (recover & 1)
102 panic("Unable to continue");
103
104 printk(KERN_EMERG "Attempting to continue.\n");
105
106 /*
107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
108 * recoverable/continuable.This will allow BIOS to look at the MSRs
109 * for errors if the OS could not log the error.
110 */
111 for (i = 0; i < nr_mce_banks; i++) {
112 u32 msr;
113 msr = MSR_IA32_MC0_STATUS+i*4;
114 rdmsr(msr, low, high);
115 if (high&(1<<31)) {
116 /* Clear it */
117 wrmsr(msr, 0UL, 0UL);
118 /* Serialize */
119 wmb();
120 add_taint(TAINT_MACHINE_CHECK);
121 }
122 }
123 mcgstl &= ~(1<<2);
124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
125}
126
127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
128{
129 u32 l, h;
130 int i;
131
132 machine_check_vector = intel_machine_check;
133 wmb();
134
135 printk(KERN_INFO "Intel machine check architecture supported.\n");
136 rdmsr(MSR_IA32_MCG_CAP, l, h);
137 if (l & (1<<8)) /* Control register present ? */
138 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
139 nr_mce_banks = l & 0xff;
140
141 for (i = 0; i < nr_mce_banks; i++) {
142 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
143 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
144 }
145
146 set_in_cr4(X86_CR4_MCE);
147 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
148 smp_processor_id());
149
150 /* Check for P4/Xeon extended MCE MSRs */
151 rdmsr(MSR_IA32_MCG_CAP, l, h);
152 if (l & (1<<9)) {/* MCG_EXT_P */
153 mce_num_extended_msrs = (l >> 16) & 0xff;
154 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
155 " available\n",
156 smp_processor_id(), mce_num_extended_msrs);
157
158#ifdef CONFIG_X86_MCE_P4THERMAL
159 /* Check for P4/Xeon Thermal monitor */
160 intel_init_thermal(c);
161#endif
162 }
163}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For PII/PIII */
17static void intel_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 0; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57 }
58 }
59
60 if (recover & 2)
61 panic("CPU context corrupt");
62 if (recover & 1)
63 panic("Unable to continue");
64
65 printk(KERN_EMERG "Attempting to continue.\n");
66 /*
67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
68 * recoverable/continuable.This will allow BIOS to look at the MSRs
69 * for errors if the OS could not log the error:
70 */
71 for (i = 0; i < nr_mce_banks; i++) {
72 unsigned int msr;
73
74 msr = MSR_IA32_MC0_STATUS+i*4;
75 rdmsr(msr, low, high);
76 if (high & (1<<31)) {
77 /* Clear it: */
78 wrmsr(msr, 0UL, 0UL);
79 /* Serialize: */
80 wmb();
81 add_taint(TAINT_MACHINE_CHECK);
82 }
83 }
84 mcgstl &= ~(1<<2);
85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
86}
87
88/* Set up machine check reporting for processors with Intel style MCE: */
89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
90{
91 u32 l, h;
92 int i;
93
94 /* Check for MCE support */
95 if (!cpu_has(c, X86_FEATURE_MCE))
96 return;
97
98 /* Check for PPro style MCA */
99 if (!cpu_has(c, X86_FEATURE_MCA))
100 return;
101
102 /* Ok machine check is available */
103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
105 wmb();
106
107 printk(KERN_INFO "Intel machine check architecture supported.\n");
108 rdmsr(MSR_IA32_MCG_CAP, l, h);
109 if (l & (1<<8)) /* Control register present ? */
110 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
111 nr_mce_banks = l & 0xff;
112
113 /*
114 * Following the example in IA-32 SDM Vol 3:
115 * - MC0_CTL should not be written
116 * - Status registers on all banks should be cleared on reset
117 */
118 for (i = 1; i < nr_mce_banks; i++)
119 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
120
121 for (i = 0; i < nr_mce_banks; i++)
122 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
123
124 set_in_cr4(X86_CR4_MCE);
125 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
126 smp_processor_id());
127}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e5173..b3a1dba75330 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,31 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37/*
38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38 * Current thermal throttling state:
39static DEFINE_PER_CPU(bool, thermal_throttle_active); 39 */
40struct thermal_state {
41 bool is_throttled;
42
43 u64 next_check;
44 unsigned long throttle_count;
45 unsigned long last_throttle_count;
46};
47
48static DEFINE_PER_CPU(struct thermal_state, thermal_state);
40 49
41static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
42 51
43#ifdef CONFIG_SYSFS 52#ifdef CONFIG_SYSFS
44#define define_therm_throt_sysdev_one_ro(_name) \ 53#define define_therm_throt_sysdev_one_ro(_name) \
45 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
46 55
47#define define_therm_throt_sysdev_show_func(name) \ 56#define define_therm_throt_sysdev_show_func(name) \
48static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 57 \
49 struct sysdev_attribute *attr, \ 58static ssize_t therm_throt_sysdev_show_##name( \
50 char *buf) \ 59 struct sys_device *dev, \
60 struct sysdev_attribute *attr, \
61 char *buf) \
51{ \ 62{ \
52 unsigned int cpu = dev->id; \ 63 unsigned int cpu = dev->id; \
53 ssize_t ret; \ 64 ssize_t ret; \
@@ -55,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
55 preempt_disable(); /* CPU hotplug */ \ 66 preempt_disable(); /* CPU hotplug */ \
56 if (cpu_online(cpu)) \ 67 if (cpu_online(cpu)) \
57 ret = sprintf(buf, "%lu\n", \ 68 ret = sprintf(buf, "%lu\n", \
58 per_cpu(thermal_throttle_##name, cpu)); \ 69 per_cpu(thermal_state, cpu).name); \
59 else \ 70 else \
60 ret = 0; \ 71 ret = 0; \
61 preempt_enable(); \ 72 preempt_enable(); \
@@ -63,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
63 return ret; \ 74 return ret; \
64} 75}
65 76
66define_therm_throt_sysdev_show_func(count); 77define_therm_throt_sysdev_show_func(throttle_count);
67define_therm_throt_sysdev_one_ro(count); 78define_therm_throt_sysdev_one_ro(throttle_count);
68 79
69static struct attribute *thermal_throttle_attrs[] = { 80static struct attribute *thermal_throttle_attrs[] = {
70 &attr_count.attr, 81 &attr_throttle_count.attr,
71 NULL 82 NULL
72}; 83};
73 84
@@ -93,33 +104,39 @@ static struct attribute_group thermal_throttle_attr_group = {
93 * 1 : Event should be logged further, and a message has been 104 * 1 : Event should be logged further, and a message has been
94 * printed to the syslog. 105 * printed to the syslog.
95 */ 106 */
96static int therm_throt_process(int curr) 107static int therm_throt_process(bool is_throttled)
97{ 108{
98 unsigned int cpu = smp_processor_id(); 109 struct thermal_state *state;
99 __u64 tmp_jiffs = get_jiffies_64(); 110 unsigned int this_cpu;
100 bool was_throttled = __get_cpu_var(thermal_throttle_active); 111 bool was_throttled;
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; 112 u64 now;
113
114 this_cpu = smp_processor_id();
115 now = get_jiffies_64();
116 state = &per_cpu(thermal_state, this_cpu);
117
118 was_throttled = state->is_throttled;
119 state->is_throttled = is_throttled;
102 120
103 if (is_throttled) 121 if (is_throttled)
104 __get_cpu_var(thermal_throttle_count)++; 122 state->throttle_count++;
105 123
106 if (!(was_throttled ^ is_throttled) && 124 if (time_before64(now, state->next_check) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check))) 125 state->throttle_count != state->last_throttle_count)
108 return 0; 126 return 0;
109 127
110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 128 state->next_check = now + CHECK_INTERVAL;
129 state->last_throttle_count = state->throttle_count;
111 130
112 /* if we just entered the thermal event */ 131 /* if we just entered the thermal event */
113 if (is_throttled) { 132 if (is_throttled) {
114 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 133 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
115 "cpu clock throttled (total events = %lu)\n",
116 cpu, __get_cpu_var(thermal_throttle_count));
117 134
118 add_taint(TAINT_MACHINE_CHECK); 135 add_taint(TAINT_MACHINE_CHECK);
119 return 1; 136 return 1;
120 } 137 }
121 if (was_throttled) { 138 if (was_throttled) {
122 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); 139 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
123 return 1; 140 return 1;
124 } 141 }
125 142
@@ -213,7 +230,7 @@ static void intel_thermal_interrupt(void)
213 __u64 msr_val; 230 __u64 msr_val;
214 231
215 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 232 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
216 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) 233 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
217 mce_log_therm_throt_event(msr_val); 234 mce_log_therm_throt_event(msr_val);
218} 235}
219 236
@@ -260,9 +277,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
260 return; 277 return;
261 } 278 }
262 279
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */ 280 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) { 281 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG 282 printk(KERN_DEBUG
@@ -271,6 +285,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
271 return; 285 return;
272 } 286 }
273 287
288 /* early Pentium M models use different method for enabling TM2 */
289 if (cpu_has(c, X86_FEATURE_TM2)) {
290 if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
291 rdmsr(MSR_THERM2_CTL, l, h);
292 if (l & MSR_THERM2_CTL_TM_SELECT)
293 tm2 = 1;
294 } else if (l & MSR_IA32_MISC_ENABLE_TM2)
295 tm2 = 1;
296 }
297
274 /* We'll mask the thermal vector in the lapic till we're ready: */ 298 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 299 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h); 300 apic_write(APIC_LVTTHMR, h);
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index ee2331b0e58f..33af14110dfd 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -7,15 +7,15 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned long *size, mtrr_type * type) 10 unsigned long *size, mtrr_type *type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
14 rdmsr(MSR_K6_UWCCR, low, high); 14 rdmsr(MSR_K6_UWCCR, low, high);
15 /* Upper dword is region 1, lower is region 0 */ 15 /* Upper dword is region 1, lower is region 0 */
16 if (reg == 1) 16 if (reg == 1)
17 low = high; 17 low = high;
18 /* The base masks off on the right alignment */ 18 /* The base masks off on the right alignment */
19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT; 19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
20 *type = 0; 20 *type = 0;
21 if (low & 1) 21 if (low & 1)
@@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base,
27 return; 27 return;
28 } 28 }
29 /* 29 /*
30 * This needs a little explaining. The size is stored as an 30 * This needs a little explaining. The size is stored as an
31 * inverted mask of bits of 128K granularity 15 bits long offset 31 * inverted mask of bits of 128K granularity 15 bits long offset
32 * 2 bits 32 * 2 bits.
33 * 33 *
34 * So to get a size we do invert the mask and add 1 to the lowest 34 * So to get a size we do invert the mask and add 1 to the lowest
35 * mask bit (4 as its 2 bits in). This gives us a size we then shift 35 * mask bit (4 as its 2 bits in). This gives us a size we then shift
36 * to turn into 128K blocks 36 * to turn into 128K blocks.
37 * 37 *
38 * eg 111 1111 1111 1100 is 512K 38 * eg 111 1111 1111 1100 is 512K
39 * 39 *
40 * invert 000 0000 0000 0011 40 * invert 000 0000 0000 0011
41 * +1 000 0000 0000 0100 41 * +1 000 0000 0000 0100
42 * *128K ... 42 * *128K ...
43 */ 43 */
44 low = (~low) & 0x1FFFC; 44 low = (~low) & 0x1FFFC;
45 *size = (low + 4) << (15 - PAGE_SHIFT); 45 *size = (low + 4) << (15 - PAGE_SHIFT);
46 return;
47} 46}
48 47
49static void amd_set_mtrr(unsigned int reg, unsigned long base, 48/**
50 unsigned long size, mtrr_type type) 49 * amd_set_mtrr - Set variable MTRR register on the local CPU.
51/* [SUMMARY] Set variable MTRR register on the local CPU. 50 *
52 <reg> The register to set. 51 * @reg The register to set.
53 <base> The base address of the region. 52 * @base The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled. 53 * @size The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region. 54 * @type The type of the region.
56 [RETURNS] Nothing. 55 *
57*/ 56 * Returns nothing.
57 */
58static void
59amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
58{ 60{
59 u32 regs[2]; 61 u32 regs[2];
60 62
61 /* 63 /*
62 * Low is MTRR0 , High MTRR 1 64 * Low is MTRR0, High MTRR 1
63 */ 65 */
64 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); 66 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
65 /* 67 /*
66 * Blank to disable 68 * Blank to disable
67 */ 69 */
68 if (size == 0) 70 if (size == 0) {
69 regs[reg] = 0; 71 regs[reg] = 0;
70 else 72 } else {
71 /* Set the register to the base, the type (off by one) and an 73 /*
72 inverted bitmask of the size The size is the only odd 74 * Set the register to the base, the type (off by one) and an
73 bit. We are fed say 512K We invert this and we get 111 1111 75 * inverted bitmask of the size The size is the only odd
74 1111 1011 but if you subtract one and invert you get the 76 * bit. We are fed say 512K We invert this and we get 111 1111
75 desired 111 1111 1111 1100 mask 77 * 1111 1011 but if you subtract one and invert you get the
76 78 * desired 111 1111 1111 1100 mask
77 But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ 79 *
80 * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
81 */
78 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) 82 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
79 | (base << PAGE_SHIFT) | (type + 1); 83 | (base << PAGE_SHIFT) | (type + 1);
84 }
80 85
81 /* 86 /*
82 * The writeback rule is quite specific. See the manual. Its 87 * The writeback rule is quite specific. See the manual. Its
83 * disable local interrupts, write back the cache, set the mtrr 88 * disable local interrupts, write back the cache, set the mtrr
84 */ 89 */
85 wbinvd(); 90 wbinvd();
86 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); 91 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
87} 92}
88 93
89static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) 94static int
95amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
90{ 96{
91 /* Apply the K6 block alignment and size rules 97 /*
92 In order 98 * Apply the K6 block alignment and size rules
93 o Uncached or gathering only 99 * In order
94 o 128K or bigger block 100 * o Uncached or gathering only
95 o Power of 2 block 101 * o 128K or bigger block
96 o base suitably aligned to the power 102 * o Power of 2 block
97 */ 103 * o base suitably aligned to the power
104 */
98 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) 105 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
99 || (size & ~(size - 1)) - size || (base & (size - 1))) 106 || (size & ~(size - 1)) - size || (base & (size - 1)))
100 return -EINVAL; 107 return -EINVAL;
@@ -115,5 +122,3 @@ int __init amd_init_mtrr(void)
115 set_mtrr_ops(&amd_mtrr_ops); 122 set_mtrr_ops(&amd_mtrr_ops);
116 return 0; 123 return 0;
117} 124}
118
119//arch_initcall(amd_mtrr_init);
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index cb9aa3a7a7ab..de89f14eff3a 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -1,7 +1,9 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3
3#include <asm/mtrr.h> 4#include <asm/mtrr.h>
4#include <asm/msr.h> 5#include <asm/msr.h>
6
5#include "mtrr.h" 7#include "mtrr.h"
6 8
7static struct { 9static struct {
@@ -12,25 +14,25 @@ static struct {
12static u8 centaur_mcr_reserved; 14static u8 centaur_mcr_reserved;
13static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ 15static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
14 16
15/* 17/**
16 * Report boot time MCR setups 18 * centaur_get_free_region - Get a free MTRR.
19 *
20 * @base: The starting (base) address of the region.
21 * @size: The size (in bytes) of the region.
22 *
23 * Returns: the index of the region on success, else -1 on error.
17 */ 24 */
18
19static int 25static int
20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) 26centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region.
24 [RETURNS] The index of the region on success, else -1 on error.
25*/
26{ 27{
27 int i, max;
28 mtrr_type ltype;
29 unsigned long lbase, lsize; 28 unsigned long lbase, lsize;
29 mtrr_type ltype;
30 int i, max;
30 31
31 max = num_var_ranges; 32 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max) 33 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg; 34 return replace_reg;
35
34 for (i = 0; i < max; ++i) { 36 for (i = 0; i < max; ++i) {
35 if (centaur_mcr_reserved & (1 << i)) 37 if (centaur_mcr_reserved & (1 << i))
36 continue; 38 continue;
@@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
38 if (lsize == 0) 40 if (lsize == 0)
39 return i; 41 return i;
40 } 42 }
43
41 return -ENOSPC; 44 return -ENOSPC;
42} 45}
43 46
44void 47/*
45mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) 48 * Report boot time MCR setups
49 */
50void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
46{ 51{
47 centaur_mcr[mcr].low = lo; 52 centaur_mcr[mcr].low = lo;
48 centaur_mcr[mcr].high = hi; 53 centaur_mcr[mcr].high = hi;
@@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base,
54{ 59{
55 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 60 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 61 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
57 *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ 62 *type = MTRR_TYPE_WRCOMB; /* write-combining */
63
58 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) 64 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
59 *type = MTRR_TYPE_UNCACHABLE; 65 *type = MTRR_TYPE_UNCACHABLE;
60 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) 66 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
61 *type = MTRR_TYPE_WRBACK; 67 *type = MTRR_TYPE_WRBACK;
62 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) 68 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
63 *type = MTRR_TYPE_WRBACK; 69 *type = MTRR_TYPE_WRBACK;
64
65} 70}
66 71
67static void centaur_set_mcr(unsigned int reg, unsigned long base, 72static void
68 unsigned long size, mtrr_type type) 73centaur_set_mcr(unsigned int reg, unsigned long base,
74 unsigned long size, mtrr_type type)
69{ 75{
70 unsigned long low, high; 76 unsigned long low, high;
71 77
72 if (size == 0) { 78 if (size == 0) {
73 /* Disable */ 79 /* Disable */
74 high = low = 0; 80 high = low = 0;
75 } else { 81 } else {
76 high = base << PAGE_SHIFT; 82 high = base << PAGE_SHIFT;
77 if (centaur_mcr_type == 0) 83 if (centaur_mcr_type == 0) {
78 low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ 84 /* Only support write-combining... */
79 else { 85 low = -size << PAGE_SHIFT | 0x1f;
86 } else {
80 if (type == MTRR_TYPE_UNCACHABLE) 87 if (type == MTRR_TYPE_UNCACHABLE)
81 low = -size << PAGE_SHIFT | 0x02; /* NC */ 88 low = -size << PAGE_SHIFT | 0x02; /* NC */
82 else 89 else
83 low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ 90 low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
84 } 91 }
85 } 92 }
86 centaur_mcr[reg].high = high; 93 centaur_mcr[reg].high = high;
@@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base,
88 wrmsr(MSR_IDT_MCR0 + reg, low, high); 95 wrmsr(MSR_IDT_MCR0 + reg, low, high);
89} 96}
90 97
91#if 0 98static int
92/* 99centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
93 * Initialise the later (saner) Winchip MCR variant. In this version
94 * the BIOS can pass us the registers it has used (but not their values)
95 * and the control register is read/write
96 */
97
98static void __init
99centaur_mcr1_init(void)
100{
101 unsigned i;
102 u32 lo, hi;
103
104 /* Unfortunately, MCR's are read-only, so there is no way to
105 * find out what the bios might have done.
106 */
107
108 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
109 if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
110 lo &= ~0x1C0; /* clear key */
111 lo |= 0x040; /* set key to 1 */
112 wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
113 }
114
115 centaur_mcr_type = 1;
116
117 /*
118 * Clear any unconfigured MCR's.
119 */
120
121 for (i = 0; i < 8; ++i) {
122 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
123 if (!(lo & (1 << (9 + i))))
124 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
125 else
126 /*
127 * If the BIOS set up an MCR we cannot see it
128 * but we don't wish to obliterate it
129 */
130 centaur_mcr_reserved |= (1 << i);
131 }
132 }
133 /*
134 * Throw the main write-combining switch...
135 * However if OOSTORE is enabled then people have already done far
136 * cleverer things and we should behave.
137 */
138
139 lo |= 15; /* Write combine enables */
140 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
141}
142
143/*
144 * Initialise the original winchip with read only MCR registers
145 * no used bitmask for the BIOS to pass on and write only control
146 */
147
148static void __init
149centaur_mcr0_init(void)
150{
151 unsigned i;
152
153 /* Unfortunately, MCR's are read-only, so there is no way to
154 * find out what the bios might have done.
155 */
156
157 /* Clear any unconfigured MCR's.
158 * This way we are sure that the centaur_mcr array contains the actual
159 * values. The disadvantage is that any BIOS tweaks are thus undone.
160 *
161 */
162 for (i = 0; i < 8; ++i) {
163 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
164 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
165 }
166
167 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
168}
169
170/*
171 * Initialise Winchip series MCR registers
172 */
173
174static void __init
175centaur_mcr_init(void)
176{
177 struct set_mtrr_context ctxt;
178
179 set_mtrr_prepare_save(&ctxt);
180 set_mtrr_cache_disable(&ctxt);
181
182 if (boot_cpu_data.x86_model == 4)
183 centaur_mcr0_init();
184 else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
185 centaur_mcr1_init();
186
187 set_mtrr_done(&ctxt);
188}
189#endif
190
191static int centaur_validate_add_page(unsigned long base,
192 unsigned long size, unsigned int type)
193{ 100{
194 /* 101 /*
195 * FIXME: Winchip2 supports uncached 102 * FIXME: Winchip2 supports uncached
196 */ 103 */
197 if (type != MTRR_TYPE_WRCOMB && 104 if (type != MTRR_TYPE_WRCOMB &&
198 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { 105 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
199 printk(KERN_WARNING 106 pr_warning("mtrr: only write-combining%s supported\n",
200 "mtrr: only write-combining%s supported\n", 107 centaur_mcr_type ? " and uncacheable are" : " is");
201 centaur_mcr_type ? " and uncacheable are"
202 : " is");
203 return -EINVAL; 108 return -EINVAL;
204 } 109 }
205 return 0; 110 return 0;
@@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base,
207 112
208static struct mtrr_ops centaur_mtrr_ops = { 113static struct mtrr_ops centaur_mtrr_ops = {
209 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
210// .init = centaur_mcr_init,
211 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
212 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
213 .get_free_region = centaur_get_free_region, 117 .get_free_region = centaur_get_free_region,
@@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void)
220 set_mtrr_ops(&centaur_mtrr_ops); 124 set_mtrr_ops(&centaur_mtrr_ops);
221 return 0; 125 return 0;
222} 126}
223
224//arch_initcall(centaur_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 1d584a18a50d..315738c74aad 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -1,51 +1,75 @@
1/* MTRR (Memory Type Range Register) cleanup 1/*
2 2 * MTRR (Memory Type Range Register) cleanup
3 Copyright (C) 2009 Yinghai Lu 3 *
4 4 * Copyright (C) 2009 Yinghai Lu
5 This library is free software; you can redistribute it and/or 5 *
6 modify it under the terms of the GNU Library General Public 6 * This library is free software; you can redistribute it and/or
7 License as published by the Free Software Foundation; either 7 * modify it under the terms of the GNU Library General Public
8 version 2 of the License, or (at your option) any later version. 8 * License as published by the Free Software Foundation; either
9 9 * version 2 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful, 10 *
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * This library is distributed in the hope that it will be useful,
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 Library General Public License for more details. 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 14 * Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public 15 *
16 License along with this library; if not, write to the Free 16 * You should have received a copy of the GNU Library General Public
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 * License along with this library; if not, write to the Free
18*/ 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 19 */
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h> 25#include <linux/sort.h>
26#include <linux/mutex.h>
27#include <linux/uaccess.h>
28#include <linux/kvm_para.h>
27 29
30#include <asm/processor.h>
28#include <asm/e820.h> 31#include <asm/e820.h>
29#include <asm/mtrr.h> 32#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h> 33#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35 34
36/* should be related to MTRR_VAR_RANGES nums */ 35#include "mtrr.h"
37#define RANGE_NUM 256
38 36
39struct res_range { 37struct res_range {
40 unsigned long start; 38 unsigned long start;
41 unsigned long end; 39 unsigned long end;
40};
41
42struct var_mtrr_range_state {
43 unsigned long base_pfn;
44 unsigned long size_pfn;
45 mtrr_type type;
46};
47
48struct var_mtrr_state {
49 unsigned long range_startk;
50 unsigned long range_sizek;
51 unsigned long chunk_sizek;
52 unsigned long gran_sizek;
53 unsigned int reg;
42}; 54};
43 55
56/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256
58
59static struct res_range __initdata range[RANGE_NUM];
60static int __initdata nr_range;
61
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
63
64static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66
67
44static int __init 68static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start, 69add_range(struct res_range *range, int nr_range,
46 unsigned long end) 70 unsigned long start, unsigned long end)
47{ 71{
48 /* out of slots */ 72 /* Out of slots: */
49 if (nr_range >= RANGE_NUM) 73 if (nr_range >= RANGE_NUM)
50 return nr_range; 74 return nr_range;
51 75
@@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start,
58} 82}
59 83
60static int __init 84static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, 85add_range_with_merge(struct res_range *range, int nr_range,
62 unsigned long end) 86 unsigned long start, unsigned long end)
63{ 87{
64 int i; 88 int i;
65 89
66 /* try to merge it with old one */ 90 /* Try to merge it with old one: */
67 for (i = 0; i < nr_range; i++) { 91 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end; 92 unsigned long final_start, final_end;
69 unsigned long common_start, common_end; 93 unsigned long common_start, common_end;
@@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
84 return nr_range; 108 return nr_range;
85 } 109 }
86 110
87 /* need to add that */ 111 /* Need to add it: */
88 return add_range(range, nr_range, start, end); 112 return add_range(range, nr_range, start, end);
89} 113}
90 114
@@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117 } 141 }
118 142
119 if (start > range[j].start && end < range[j].end) { 143 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */ 144 /* Find the new spare: */
121 for (i = 0; i < RANGE_NUM; i++) { 145 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0) 146 if (range[i].end == 0)
123 break; 147 break;
@@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2)
146 return start1 - start2; 170 return start1 - start2;
147} 171}
148 172
149struct var_mtrr_range_state { 173#define BIOS_BUG_MSG KERN_WARNING \
150 unsigned long base_pfn; 174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157 175
158static int __init 176static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 177x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
@@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
180 range[i].start, range[i].end + 1); 198 range[i].start, range[i].end + 1);
181 } 199 }
182 200
183 /* take out UC ranges */ 201 /* Take out UC ranges: */
184 for (i = 0; i < num_var_ranges; i++) { 202 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type; 203 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE && 204 if (type != MTRR_TYPE_UNCACHABLE &&
@@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && 211 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) { 212 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */ 213 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " 214 printk(BIOS_BUG_MSG, i);
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT))) 215 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue; 216 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base; 217 size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
237 return nr_range; 253 return nr_range;
238} 254}
239 255
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER 256#ifdef CONFIG_MTRR_SANITIZER
244 257
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 258static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{ 259{
247 unsigned long sum; 260 unsigned long sum = 0;
248 int i; 261 int i;
249 262
250 sum = 0;
251 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start; 264 sum += range[i].end + 1 - range[i].start;
253 265
@@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str)
278} 290}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); 291early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280 292
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init 293static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, 294set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits) 295 unsigned char type, unsigned int address_bits)
292{ 296{
293 u32 base_lo, base_hi, mask_lo, mask_hi; 297 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask; 298 u64 base, mask;
@@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
301 mask = (1ULL << address_bits) - 1; 305 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1); 306 mask &= ~((((u64)sizek) << 10) - 1);
303 307
304 base = ((u64)basek) << 10; 308 base = ((u64)basek) << 10;
305 309
306 base |= type; 310 base |= type;
307 mask |= 0x800; 311 mask |= 0x800;
@@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
317 321
318static void __init 322static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, 323save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type) 324 unsigned char type)
321{ 325{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); 326 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); 327 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type; 328 range_state[reg].type = type;
325} 329}
326 330
327static void __init 331static void __init set_var_mtrr_all(unsigned int address_bits)
328set_var_mtrr_all(unsigned int address_bits)
329{ 332{
330 unsigned long basek, sizek; 333 unsigned long basek, sizek;
331 unsigned char type; 334 unsigned char type;
@@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits)
342 345
343static unsigned long to_size_factor(unsigned long sizek, char *factorp) 346static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{ 347{
345 char factor;
346 unsigned long base = sizek; 348 unsigned long base = sizek;
349 char factor;
347 350
348 if (base & ((1<<10) - 1)) { 351 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */ 352 /* Not MB-aligned: */
350 factor = 'K'; 353 factor = 'K';
351 } else if (base & ((1<<20) - 1)) { 354 } else if (base & ((1<<20) - 1)) {
352 factor = 'M'; 355 factor = 'M';
@@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
372 unsigned long max_align, align; 375 unsigned long max_align, align;
373 unsigned long sizek; 376 unsigned long sizek;
374 377
375 /* Compute the maximum size I can make a range */ 378 /* Compute the maximum size with which we can make a range: */
376 if (range_startk) 379 if (range_startk)
377 max_align = ffs(range_startk) - 1; 380 max_align = ffs(range_startk) - 1;
378 else 381 else
379 max_align = 32; 382 max_align = 32;
383
380 align = fls(range_sizek) - 1; 384 align = fls(range_sizek) - 1;
381 if (align > max_align) 385 if (align > max_align)
382 align = max_align; 386 align = max_align;
@@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
386 char start_factor = 'K', size_factor = 'K'; 390 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base; 391 unsigned long start_base, size_base;
388 392
389 start_base = to_size_factor(range_startk, 393 start_base = to_size_factor(range_startk, &start_factor);
390 &start_factor), 394 size_base = to_size_factor(sizek, &size_factor);
391 size_base = to_size_factor(sizek, &size_factor),
392 395
393 printk(KERN_DEBUG "Setting variable MTRR %d, " 396 Dprintk("Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n", 397 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor, 398 reg, start_base, start_factor,
396 size_base, size_factor, 399 size_base, size_factor,
@@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
425 chunk_sizek = state->chunk_sizek; 428 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek; 429 gran_sizek = state->gran_sizek;
427 430
428 /* align with gran size, prevent small block used up MTRRs */ 431 /* Align with gran size, prevent small block used up MTRRs: */
429 range_basek = ALIGN(state->range_startk, gran_sizek); 432 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek) 433 if ((range_basek > basek) && basek)
431 return second_sizek; 434 return second_sizek;
435
432 state->range_sizek -= (range_basek - state->range_startk); 436 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek); 437 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434 438
@@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
439 } 443 }
440 state->range_sizek = range_sizek; 444 state->range_sizek = range_sizek;
441 445
442 /* try to append some small hole */ 446 /* Try to append some small hole: */
443 range0_basek = state->range_startk; 447 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 448 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445 449
446 /* no increase */ 450 /* No increase: */
447 if (range0_sizek == state->range_sizek) { 451 if (range0_sizek == state->range_sizek) {
448 if (debug_print) 452 Dprintk("rangeX: %016lx - %016lx\n",
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 453 range0_basek<<10,
450 range0_basek<<10, 454 (range0_basek + state->range_sizek)<<10);
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek, 455 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK); 456 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0; 457 return 0;
455 } 458 }
456 459
457 /* only cut back, when it is not the last */ 460 /* Only cut back when it is not the last: */
458 if (sizek) { 461 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) { 462 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek) 463 if (range0_sizek >= chunk_sizek)
@@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
470second_try: 473second_try:
471 range_basek = range0_basek + range0_sizek; 474 range_basek = range0_basek + range0_sizek;
472 475
473 /* one hole in the middle */ 476 /* One hole in the middle: */
474 if (range_basek > basek && range_basek <= (basek + sizek)) 477 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek; 478 second_sizek = range_basek - basek;
476 479
477 if (range0_sizek > state->range_sizek) { 480 if (range0_sizek > state->range_sizek) {
478 481
479 /* one hole in middle or at end */ 482 /* One hole in middle or at the end: */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek; 483 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481 484
482 /* hole size should be less than half of range0 size */ 485 /* Hole size should be less than half of range0 size: */
483 if (hole_sizek >= (range0_sizek >> 1) && 486 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) { 487 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek; 488 range0_sizek -= chunk_sizek;
@@ -491,32 +494,30 @@ second_try:
491 } 494 }
492 495
493 if (range0_sizek) { 496 if (range0_sizek) {
494 if (debug_print) 497 Dprintk("range0: %016lx - %016lx\n",
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n", 498 range0_basek<<10,
496 range0_basek<<10, 499 (range0_basek + range0_sizek)<<10);
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek, 500 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK); 501 range0_sizek, MTRR_TYPE_WRBACK);
500 } 502 }
501 503
502 if (range0_sizek < state->range_sizek) { 504 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */ 505 /* Need to handle left over range: */
504 range_sizek = state->range_sizek - range0_sizek; 506 range_sizek = state->range_sizek - range0_sizek;
505 507
506 if (debug_print) 508 Dprintk("range: %016lx - %016lx\n",
507 printk(KERN_DEBUG "range: %016lx - %016lx\n", 509 range_basek<<10,
508 range_basek<<10, 510 (range_basek + range_sizek)<<10);
509 (range_basek + range_sizek)<<10); 511
510 state->reg = range_to_mtrr(state->reg, range_basek, 512 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK); 513 range_sizek, MTRR_TYPE_WRBACK);
512 } 514 }
513 515
514 if (hole_sizek) { 516 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek; 517 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print) 518 Dprintk("hole: %016lx - %016lx\n",
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 519 hole_basek<<10,
518 hole_basek<<10, 520 (hole_basek + hole_sizek)<<10);
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek, 521 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE); 522 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 } 523 }
@@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
537 basek = base_pfn << (PAGE_SHIFT - 10); 538 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10); 539 sizek = size_pfn << (PAGE_SHIFT - 10);
539 540
540 /* See if I can merge with the last range */ 541 /* See if I can merge with the last range: */
541 if ((basek <= 1024) || 542 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) { 543 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek; 544 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk; 545 state->range_sizek = endk - state->range_startk;
545 return; 546 return;
546 } 547 }
547 /* Write the range mtrrs */ 548 /* Write the range mtrrs: */
548 if (state->range_sizek != 0) 549 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek); 550 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550 551
551 /* Allocate an msr */ 552 /* Allocate an msr: */
552 state->range_startk = basek + second_sizek; 553 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek; 554 state->range_sizek = sizek - second_sizek;
554} 555}
555 556
556/* mininum size of mtrr block that can take hole */ 557/* Mininum size of mtrr block that can take hole: */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20); 558static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558 559
559static int __init parse_mtrr_chunk_size_opt(char *p) 560static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p)
565} 566}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); 567early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567 568
568/* granity of mtrr of block */ 569/* Granularity of mtrr of block: */
569static u64 mtrr_gran_size __initdata; 570static u64 mtrr_gran_size __initdata;
570 571
571static int __init parse_mtrr_gran_size_opt(char *p) 572static int __init parse_mtrr_gran_size_opt(char *p)
@@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p)
577} 578}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); 579early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579 580
580static int nr_mtrr_spare_reg __initdata = 581static unsigned long nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; 582 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582 583
583static int __init parse_mtrr_spare_reg(char *arg) 584static int __init parse_mtrr_spare_reg(char *arg)
@@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); 587 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0; 588 return 0;
588} 589}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591 591
592static int __init 592static int __init
@@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size) 594 u64 chunk_size, u64 gran_size)
595{ 595{
596 struct var_mtrr_state var_state; 596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg; 597 int num_reg;
598 int i;
599 599
600 var_state.range_startk = 0; 600 var_state.range_startk = 0;
601 var_state.range_sizek = 0; 601 var_state.range_sizek = 0;
@@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
605 605
606 memset(range_state, 0, sizeof(range_state)); 606 memset(range_state, 0, sizeof(range_state));
607 607
608 /* Write the range etc */ 608 /* Write the range: */
609 for (i = 0; i < nr_range; i++) 609 for (i = 0; i < nr_range; i++) {
610 set_var_mtrr_range(&var_state, range[i].start, 610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1); 611 range[i].end - range[i].start + 1);
612 }
612 613
613 /* Write the last range */ 614 /* Write the last range: */
614 if (var_state.range_sizek != 0) 615 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0); 616 range_to_mtrr_with_hole(&var_state, 0, 0);
616 617
617 num_reg = var_state.reg; 618 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */ 619 /* Clear out the extra MTRR's: */
619 while (var_state.reg < num_var_ranges) { 620 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0); 621 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++; 622 var_state.reg++;
@@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
625} 626}
626 627
627struct mtrr_cleanup_result { 628struct mtrr_cleanup_result {
628 unsigned long gran_sizek; 629 unsigned long gran_sizek;
629 unsigned long chunk_sizek; 630 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek; 631 unsigned long lose_cover_sizek;
631 unsigned int num_reg; 632 unsigned int num_reg;
632 int bad; 633 int bad;
633}; 634};
634 635
635/* 636/*
@@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645 646
646static void __init print_out_mtrr_range_state(void) 647static void __init print_out_mtrr_range_state(void)
647{ 648{
648 int i;
649 char start_factor = 'K', size_factor = 'K'; 649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base; 650 unsigned long start_base, size_base;
651 mtrr_type type; 651 mtrr_type type;
652 int i;
652 653
653 for (i = 0; i < num_var_ranges; i++) { 654 for (i = 0; i < num_var_ranges; i++) {
654 655
@@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void)
676 int i; 677 int i;
677 mtrr_type type; 678 mtrr_type type;
678 unsigned long size; 679 unsigned long size;
679 /* extra one for all 0 */ 680 /* Extra one for all 0: */
680 int num[MTRR_NUM_TYPES + 1]; 681 int num[MTRR_NUM_TYPES + 1];
681 682
682 /* check entries number */ 683 /* Check entries number: */
683 memset(num, 0, sizeof(num)); 684 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) { 685 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type; 686 type = range_state[i].type;
@@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void)
693 num[type]++; 694 num[type]++;
694 } 695 }
695 696
696 /* check if we got UC entries */ 697 /* Check if we got UC entries: */
697 if (!num[MTRR_TYPE_UNCACHABLE]) 698 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0; 699 return 0;
699 700
700 /* check if we only had WB and UC */ 701 /* Check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != 702 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES]) 703 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0; 704 return 0;
704 705
705 return 1; 706 return 1;
706} 707}
707 708
708static unsigned long __initdata range_sums; 709static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, 710
710 unsigned long extra_remove_base, 711static void __init
711 unsigned long extra_remove_size, 712mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
712 int i) 713 unsigned long x_remove_base,
714 unsigned long x_remove_size, int i)
713{ 715{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM]; 716 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new; 717 unsigned long range_sums_new;
718 static int nr_range_new;
719 int num_reg;
718 720
719 /* convert ranges to var ranges state */ 721 /* Convert ranges to var ranges state: */
720 num_reg = x86_setup_var_mtrrs(range, nr_range, 722 num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
721 chunk_size, gran_size);
722 723
723 /* we got new setting in range_state, check it */ 724 /* We got new setting in range_state, check it: */
724 memset(range_new, 0, sizeof(range_new)); 725 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0, 726 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size); 727 x_remove_base, x_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new); 728 range_sums_new = sum_ranges(range_new, nr_range_new);
728 729
729 result[i].chunk_sizek = chunk_size >> 10; 730 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10; 731 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg; 732 result[i].num_reg = num_reg;
733
732 if (range_sums < range_sums_new) { 734 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek = 735 result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1; 736 result[i].bad = 1;
736 } else 737 } else {
737 result[i].lose_cover_sizek = 738 result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
738 (range_sums - range_sums_new) << PSHIFT; 739 }
739 740
740 /* double check it */ 741 /* Double check it: */
741 if (!result[i].bad && !result[i].lose_cover_sizek) { 742 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range || 743 if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
743 memcmp(range, range_new, sizeof(range))) 744 result[i].bad = 1;
744 result[i].bad = 1;
745 } 745 }
746 746
747 if (!result[i].bad && (range_sums - range_sums_new < 747 if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
748 min_loss_pfn[num_reg])) { 748 min_loss_pfn[num_reg] = range_sums - range_sums_new;
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752} 749}
753 750
754static void __init mtrr_print_out_one_result(int i) 751static void __init mtrr_print_out_one_result(int i)
755{ 752{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base; 753 unsigned long gran_base, chunk_base, lose_base;
754 char gran_factor, chunk_factor, lose_factor;
758 755
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 756 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 757 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 758 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", 759
763 result[i].bad ? "*BAD*" : " ", 760 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
764 gran_base, gran_factor, chunk_base, chunk_factor); 761 result[i].bad ? "*BAD*" : " ",
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", 762 gran_base, gran_factor, chunk_base, chunk_factor);
766 result[i].num_reg, result[i].bad ? "-" : "", 763 pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
767 lose_base, lose_factor); 764 result[i].num_reg, result[i].bad ? "-" : "",
765 lose_base, lose_factor);
768} 766}
769 767
770static int __init mtrr_search_optimal_index(void) 768static int __init mtrr_search_optimal_index(void)
771{ 769{
772 int i;
773 int num_reg_good; 770 int num_reg_good;
774 int index_good; 771 int index_good;
772 int i;
775 773
776 if (nr_mtrr_spare_reg >= num_var_ranges) 774 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1; 775 nr_mtrr_spare_reg = num_var_ranges - 1;
776
778 num_reg_good = -1; 777 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 778 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i]) 779 if (!min_loss_pfn[i])
@@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void)
796 return index_good; 795 return index_good;
797} 796}
798 797
799
800int __init mtrr_cleanup(unsigned address_bits) 798int __init mtrr_cleanup(unsigned address_bits)
801{ 799{
802 unsigned long extra_remove_base, extra_remove_size; 800 unsigned long x_remove_base, x_remove_size;
803 unsigned long base, size, def, dummy; 801 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size; 802 u64 chunk_size, gran_size;
803 mtrr_type type;
806 int index_good; 804 int index_good;
807 int i; 805 int i;
808 806
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 807 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 808 return 0;
809
811 rdmsr(MSR_MTRRdefType, def, dummy); 810 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 811 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 812 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 813 return 0;
815 814
816 /* get it and store it aside */ 815 /* Get it and store it aside: */
817 memset(range_state, 0, sizeof(range_state)); 816 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) { 817 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type); 818 mtrr_if->get(i, &base, &size, &type);
@@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits)
822 range_state[i].type = type; 821 range_state[i].type = type;
823 } 822 }
824 823
825 /* check if we need handle it and can handle it */ 824 /* Check if we need handle it and can handle it: */
826 if (!mtrr_need_cleanup()) 825 if (!mtrr_need_cleanup())
827 return 0; 826 return 0;
828 827
829 /* print original var MTRRs at first, for debugging: */ 828 /* Print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n"); 829 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state(); 830 print_out_mtrr_range_state();
832 831
833 memset(range, 0, sizeof(range)); 832 memset(range, 0, sizeof(range));
834 extra_remove_size = 0; 833 x_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT); 834 x_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2) 835 if (mtrr_tom2)
837 extra_remove_size = 836 x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 837
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 838 nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size);
840 extra_remove_size);
841 /* 839 /*
842 * [0, 1M) should always be coverred by var mtrr with WB 840 * [0, 1M) should always be covered by var mtrr with WB
843 * and fixed mtrrs should take effective before var mtrr for it 841 * and fixed mtrrs should take effect before var mtrr for it:
844 */ 842 */
845 nr_range = add_range_with_merge(range, nr_range, 0, 843 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1); 844 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */ 845 /* Sort the ranges: */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849 847
850 range_sums = sum_ranges(range, nr_range); 848 range_sums = sum_ranges(range, nr_range);
@@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits)
854 if (mtrr_chunk_size && mtrr_gran_size) { 852 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0; 853 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, 854 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i); 855 x_remove_base, x_remove_size, i);
858 856
859 mtrr_print_out_one_result(i); 857 mtrr_print_out_one_result(i);
860 858
@@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits)
880 continue; 878 continue;
881 879
882 mtrr_calc_range_state(chunk_size, gran_size, 880 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i); 881 x_remove_base, x_remove_size, i);
884 if (debug_print) { 882 if (debug_print) {
885 mtrr_print_out_one_result(i); 883 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n"); 884 printk(KERN_INFO "\n");
@@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits)
890 } 888 }
891 } 889 }
892 890
893 /* try to find the optimal index */ 891 /* Try to find the optimal index: */
894 index_good = mtrr_search_optimal_index(); 892 index_good = mtrr_search_optimal_index();
895 893
896 if (index_good != -1) { 894 if (index_good != -1) {
@@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits)
898 i = index_good; 896 i = index_good;
899 mtrr_print_out_one_result(i); 897 mtrr_print_out_one_result(i);
900 898
901 /* convert ranges to var ranges state */ 899 /* Convert ranges to var ranges state: */
902 chunk_size = result[i].chunk_sizek; 900 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10; 901 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek; 902 gran_size = result[i].gran_sizek;
@@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
941 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't 939 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
942 * apply to are wrong, but so far we don't know of any such case in the wild. 940 * apply to are wrong, but so far we don't know of any such case in the wild.
943 */ 941 */
944#define Tom2Enabled (1U << 21) 942#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22) 943#define Tom2ForceMemTypeWB (1U << 22)
946 944
947int __init amd_special_default_mtrr(void) 945int __init amd_special_default_mtrr(void)
948{ 946{
@@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void)
952 return 0; 950 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 951 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0; 952 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */ 953 /* In case some hypervisor doesn't pass SYSCFG through: */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 954 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0; 955 return 0;
958 /* 956 /*
@@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void)
965 return 0; 963 return 0;
966} 964}
967 965
968static u64 __init real_trim_memory(unsigned long start_pfn, 966static u64 __init
969 unsigned long limit_pfn) 967real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
970{ 968{
971 u64 trim_start, trim_size; 969 u64 trim_start, trim_size;
970
972 trim_start = start_pfn; 971 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT; 972 trim_start <<= PAGE_SHIFT;
973
974 trim_size = limit_pfn; 974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT; 975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start; 976 trim_size -= trim_start;
977 977
978 return e820_update_range(trim_start, trim_size, E820_RAM, 978 return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
979 E820_RESERVED);
980} 979}
980
981/** 981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs 982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number 983 * @end_pfn: ending page frame number
@@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn,
985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain 985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches 986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover 987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any 988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's 989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, warning the user with an obnoxious message. 990 * allocation pools, warning the user with an obnoxious message.
991 */ 991 */
@@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
994 unsigned long i, base, size, highest_pfn = 0, def, dummy; 994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type; 995 mtrr_type type;
996 u64 total_trim_size; 996 u64 total_trim_size;
997
998 /* extra one for all 0 */ 997 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1]; 998 int num[MTRR_NUM_TYPES + 1];
999
1000 /* 1000 /*
1001 * Make sure we only trim uncachable memory on machines that 1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture: 1002 * support the Intel MTRR architecture:
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006
1006 rdmsr(MSR_MTRRdefType, def, dummy); 1007 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1008 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1009 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1010 return 0;
1010 1011
1011 /* get it and store it aside */ 1012 /* Get it and store it aside: */
1012 memset(range_state, 0, sizeof(range_state)); 1013 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) { 1014 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type); 1015 mtrr_if->get(i, &base, &size, &type);
@@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1017 range_state[i].type = type; 1018 range_state[i].type = type;
1018 } 1019 }
1019 1020
1020 /* Find highest cached pfn */ 1021 /* Find highest cached pfn: */
1021 for (i = 0; i < num_var_ranges; i++) { 1022 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type; 1023 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK) 1024 if (type != MTRR_TYPE_WRBACK)
@@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1028 highest_pfn = base + size; 1029 highest_pfn = base + size;
1029 } 1030 }
1030 1031
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1032 /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
1032 if (!highest_pfn) { 1033 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); 1034 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0; 1035 return 0;
1035 } 1036 }
1036 1037
1037 /* check entries number */ 1038 /* Check entries number: */
1038 memset(num, 0, sizeof(num)); 1039 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) { 1040 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type; 1041 type = range_state[i].type;
@@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1046 num[type]++; 1047 num[type]++;
1047 } 1048 }
1048 1049
1049 /* no entry for WB? */ 1050 /* No entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK]) 1051 if (!num[MTRR_TYPE_WRBACK])
1051 return 0; 1052 return 0;
1052 1053
1053 /* check if we only had WB and UC */ 1054 /* Check if we only had WB and UC: */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != 1055 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES]) 1056 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0; 1057 return 0;
@@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1066 } 1067 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 1068 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068 1069
1070 /* Check the head: */
1069 total_trim_size = 0; 1071 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start) 1072 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start); 1073 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */ 1074
1075 /* Check the holes: */
1074 for (i = 0; i < nr_range - 1; i++) { 1076 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start) 1077 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1, 1078 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start); 1079 range[i+1].start);
1078 } 1080 }
1079 /* check the top */ 1081
1082 /* Check the top: */
1080 i = nr_range - 1; 1083 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn) 1084 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1, 1085 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn); 1086 end_pfn);
1084 1087
1085 if (total_trim_size) { 1088 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" 1089 pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089 1090
1090 if (!changed_by_mtrr_cleanup) 1091 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1); 1092 WARN_ON(1);
1092 1093
1093 printk(KERN_INFO "update e820 for mtrr\n"); 1094 pr_info("update e820 for mtrr\n");
1094 update_e820(); 1095 update_e820();
1095 1096
1096 return 1; 1097 return 1;
@@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1098 1099
1099 return 0; 1100 return 0;
1100} 1101}
1101
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ff14c320040c..228d982ce09c 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -1,38 +1,40 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/io.h>
2#include <linux/mm.h> 3#include <linux/mm.h>
3#include <asm/mtrr.h> 4
4#include <asm/msr.h>
5#include <asm/io.h>
6#include <asm/processor-cyrix.h> 5#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
8#include "mtrr.h" 10#include "mtrr.h"
9 11
10static void 12static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 13cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned long *size, mtrr_type * type) 14 unsigned long *size, mtrr_type * type)
13{ 15{
14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 16 unsigned char arr, ccr3, rcr, shift;
17 unsigned long flags;
16 18
17 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ 19 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
18 20
19 /* Save flags and disable interrupts */
20 local_irq_save(flags); 21 local_irq_save(flags);
21 22
22 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
23 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 24 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
24 ((unsigned char *) base)[3] = getCx86(arr); 25 ((unsigned char *)base)[3] = getCx86(arr);
25 ((unsigned char *) base)[2] = getCx86(arr + 1); 26 ((unsigned char *)base)[2] = getCx86(arr + 1);
26 ((unsigned char *) base)[1] = getCx86(arr + 2); 27 ((unsigned char *)base)[1] = getCx86(arr + 2);
27 rcr = getCx86(CX86_RCR_BASE + reg); 28 rcr = getCx86(CX86_RCR_BASE + reg);
28 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 29 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
29 30
30 /* Enable interrupts if it was enabled previously */
31 local_irq_restore(flags); 31 local_irq_restore(flags);
32
32 shift = ((unsigned char *) base)[1] & 0x0f; 33 shift = ((unsigned char *) base)[1] & 0x0f;
33 *base >>= PAGE_SHIFT; 34 *base >>= PAGE_SHIFT;
34 35
35 /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 36 /*
37 * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
36 * Note: shift==0xf means 4G, this is unsupported. 38 * Note: shift==0xf means 4G, this is unsupported.
37 */ 39 */
38 if (shift) 40 if (shift)
@@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
76 } 78 }
77} 79}
78 80
81/*
82 * cyrix_get_free_region - get a free ARR.
83 *
84 * @base: the starting (base) address of the region.
85 * @size: the size (in bytes) of the region.
86 *
87 * Returns: the index of the region on success, else -1 on error.
88*/
79static int 89static int
80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) 90cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region.
84 [RETURNS] The index of the region on success, else -1 on error.
85*/
86{ 91{
87 int i;
88 mtrr_type ltype;
89 unsigned long lbase, lsize; 92 unsigned long lbase, lsize;
93 mtrr_type ltype;
94 int i;
90 95
91 switch (replace_reg) { 96 switch (replace_reg) {
92 case 7: 97 case 7:
@@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
107 cyrix_get_arr(7, &lbase, &lsize, &ltype); 112 cyrix_get_arr(7, &lbase, &lsize, &ltype);
108 if (lsize == 0) 113 if (lsize == 0)
109 return 7; 114 return 7;
110 /* Else try ARR0-ARR6 first */ 115 /* Else try ARR0-ARR6 first */
111 } else { 116 } else {
112 for (i = 0; i < 7; i++) { 117 for (i = 0; i < 7; i++) {
113 cyrix_get_arr(i, &lbase, &lsize, &ltype); 118 cyrix_get_arr(i, &lbase, &lsize, &ltype);
114 if (lsize == 0) 119 if (lsize == 0)
115 return i; 120 return i;
116 } 121 }
117 /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ 122 /*
123 * ARR0-ARR6 isn't free
124 * try ARR7 but its size must be at least 256K
125 */
118 cyrix_get_arr(i, &lbase, &lsize, &ltype); 126 cyrix_get_arr(i, &lbase, &lsize, &ltype);
119 if ((lsize == 0) && (size >= 0x40)) 127 if ((lsize == 0) && (size >= 0x40))
120 return i; 128 return i;
@@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
122 return -ENOSPC; 130 return -ENOSPC;
123} 131}
124 132
125static u32 cr4 = 0; 133static u32 cr4, ccr3;
126static u32 ccr3;
127 134
128static void prepare_set(void) 135static void prepare_set(void)
129{ 136{
130 u32 cr0; 137 u32 cr0;
131 138
132 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 139 /* Save value of CR4 and clear Page Global Enable (bit 7) */
133 if ( cpu_has_pge ) { 140 if (cpu_has_pge) {
134 cr4 = read_cr4(); 141 cr4 = read_cr4();
135 write_cr4(cr4 & ~X86_CR4_PGE); 142 write_cr4(cr4 & ~X86_CR4_PGE);
136 } 143 }
137 144
138 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 145 /*
139 a side-effect */ 146 * Disable and flush caches.
147 * Note that wbinvd flushes the TLBs as a side-effect
148 */
140 cr0 = read_cr0() | X86_CR0_CD; 149 cr0 = read_cr0() | X86_CR0_CD;
141 wbinvd(); 150 wbinvd();
142 write_cr0(cr0); 151 write_cr0(cr0);
@@ -147,22 +156,21 @@ static void prepare_set(void)
147 156
148 /* Cyrix ARRs - everything else was excluded at the top */ 157 /* Cyrix ARRs - everything else was excluded at the top */
149 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); 158 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
150
151} 159}
152 160
153static void post_set(void) 161static void post_set(void)
154{ 162{
155 /* Flush caches and TLBs */ 163 /* Flush caches and TLBs */
156 wbinvd(); 164 wbinvd();
157 165
158 /* Cyrix ARRs - everything else was excluded at the top */ 166 /* Cyrix ARRs - everything else was excluded at the top */
159 setCx86(CX86_CCR3, ccr3); 167 setCx86(CX86_CCR3, ccr3);
160 168
161 /* Enable caches */ 169 /* Enable caches */
162 write_cr0(read_cr0() & 0xbfffffff); 170 write_cr0(read_cr0() & 0xbfffffff);
163 171
164 /* Restore value of CR4 */ 172 /* Restore value of CR4 */
165 if ( cpu_has_pge ) 173 if (cpu_has_pge)
166 write_cr4(cr4); 174 write_cr4(cr4);
167} 175}
168 176
@@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
178 size >>= 6; 186 size >>= 6;
179 187
180 size &= 0x7fff; /* make sure arr_size <= 14 */ 188 size &= 0x7fff; /* make sure arr_size <= 14 */
181 for (arr_size = 0; size; arr_size++, size >>= 1) ; 189 for (arr_size = 0; size; arr_size++, size >>= 1)
190 ;
182 191
183 if (reg < 7) { 192 if (reg < 7) {
184 switch (type) { 193 switch (type) {
@@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
215 prepare_set(); 224 prepare_set();
216 225
217 base <<= PAGE_SHIFT; 226 base <<= PAGE_SHIFT;
218 setCx86(arr, ((unsigned char *) &base)[3]); 227 setCx86(arr + 0, ((unsigned char *)&base)[3]);
219 setCx86(arr + 1, ((unsigned char *) &base)[2]); 228 setCx86(arr + 1, ((unsigned char *)&base)[2]);
220 setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); 229 setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
221 setCx86(CX86_RCR_BASE + reg, arr_type); 230 setCx86(CX86_RCR_BASE + reg, arr_type);
222 231
223 post_set(); 232 post_set();
224} 233}
225 234
226typedef struct { 235typedef struct {
227 unsigned long base; 236 unsigned long base;
228 unsigned long size; 237 unsigned long size;
229 mtrr_type type; 238 mtrr_type type;
230} arr_state_t; 239} arr_state_t;
231 240
232static arr_state_t arr_state[8] = { 241static arr_state_t arr_state[8] = {
@@ -247,16 +256,17 @@ static void cyrix_set_all(void)
247 setCx86(CX86_CCR0 + i, ccr_state[i]); 256 setCx86(CX86_CCR0 + i, ccr_state[i]);
248 for (; i < 7; i++) 257 for (; i < 7; i++)
249 setCx86(CX86_CCR4 + i, ccr_state[i]); 258 setCx86(CX86_CCR4 + i, ccr_state[i]);
250 for (i = 0; i < 8; i++) 259
251 cyrix_set_arr(i, arr_state[i].base, 260 for (i = 0; i < 8; i++) {
261 cyrix_set_arr(i, arr_state[i].base,
252 arr_state[i].size, arr_state[i].type); 262 arr_state[i].size, arr_state[i].type);
263 }
253 264
254 post_set(); 265 post_set();
255} 266}
256 267
257static struct mtrr_ops cyrix_mtrr_ops = { 268static struct mtrr_ops cyrix_mtrr_ops = {
258 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
259// .init = cyrix_arr_init,
260 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
261 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
262 .get = cyrix_get_arr, 272 .get = cyrix_get_arr,
@@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void)
270 set_mtrr_ops(&cyrix_mtrr_ops); 280 set_mtrr_ops(&cyrix_mtrr_ops);
271 return 0; 281 return 0;
272} 282}
273
274//arch_initcall(cyrix_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0543f69f0b27..55da0c5f68dd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,28 +1,34 @@
1/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 1/*
2 because MTRRs can span upto 40 bits (36bits on most modern x86) */ 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86)
4 */
5#define DEBUG
6
7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
4#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/io.h>
5#include <linux/mm.h> 11#include <linux/mm.h>
6#include <linux/module.h> 12
7#include <asm/io.h>
8#include <asm/mtrr.h>
9#include <asm/msr.h>
10#include <asm/system.h>
11#include <asm/cpufeature.h>
12#include <asm/processor-flags.h> 13#include <asm/processor-flags.h>
14#include <asm/cpufeature.h>
13#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
16#include <asm/system.h>
17#include <asm/mtrr.h>
18#include <asm/msr.h>
14#include <asm/pat.h> 19#include <asm/pat.h>
20
15#include "mtrr.h" 21#include "mtrr.h"
16 22
17struct fixed_range_block { 23struct fixed_range_block {
18 int base_msr; /* start address of an MTRR block */ 24 int base_msr; /* start address of an MTRR block */
19 int ranges; /* number of MTRRs in this block */ 25 int ranges; /* number of MTRRs in this block */
20}; 26};
21 27
22static struct fixed_range_block fixed_range_blocks[] = { 28static struct fixed_range_block fixed_range_blocks[] = {
23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ 29 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ 30 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ 31 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 32 {}
27}; 33};
28 34
@@ -30,10 +36,10 @@ static unsigned long smp_changes_mask;
30static int mtrr_state_set; 36static int mtrr_state_set;
31u64 mtrr_tom2; 37u64 mtrr_tom2;
32 38
33struct mtrr_state_type mtrr_state = {}; 39struct mtrr_state_type mtrr_state;
34EXPORT_SYMBOL_GPL(mtrr_state); 40EXPORT_SYMBOL_GPL(mtrr_state);
35 41
36/** 42/*
37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example 43 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD 44 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section 45 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
@@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
104 * Look of multiple ranges matching this address and pick type 110 * Look of multiple ranges matching this address and pick type
105 * as per MTRR precedence 111 * as per MTRR precedence
106 */ 112 */
107 if (!(mtrr_state.enabled & 2)) { 113 if (!(mtrr_state.enabled & 2))
108 return mtrr_state.def_type; 114 return mtrr_state.def_type;
109 }
110 115
111 prev_match = 0xFF; 116 prev_match = 0xFF;
112 for (i = 0; i < num_var_ranges; ++i) { 117 for (i = 0; i < num_var_ranges; ++i) {
@@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
125 if (start_state != end_state) 130 if (start_state != end_state)
126 return 0xFE; 131 return 0xFE;
127 132
128 if ((start & mask) != (base & mask)) { 133 if ((start & mask) != (base & mask))
129 continue; 134 continue;
130 }
131 135
132 curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; 136 curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
133 if (prev_match == 0xFF) { 137 if (prev_match == 0xFF) {
@@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
148 curr_match = MTRR_TYPE_WRTHROUGH; 152 curr_match = MTRR_TYPE_WRTHROUGH;
149 } 153 }
150 154
151 if (prev_match != curr_match) { 155 if (prev_match != curr_match)
152 return MTRR_TYPE_UNCACHABLE; 156 return MTRR_TYPE_UNCACHABLE;
153 }
154 } 157 }
155 158
156 if (mtrr_tom2) { 159 if (mtrr_tom2) {
@@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end)
164 return mtrr_state.def_type; 167 return mtrr_state.def_type;
165} 168}
166 169
167/* Get the MSR pair relating to a var range */ 170/* Get the MSR pair relating to a var range */
168static void 171static void
169get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 172get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
170{ 173{
@@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
172 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 175 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
173} 176}
174 177
175/* fill the MSR pair relating to a var range */ 178/* Fill the MSR pair relating to a var range */
176void fill_mtrr_var_range(unsigned int index, 179void fill_mtrr_var_range(unsigned int index,
177 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) 180 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
178{ 181{
@@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index,
186 vr[index].mask_hi = mask_hi; 189 vr[index].mask_hi = mask_hi;
187} 190}
188 191
189static void 192static void get_fixed_ranges(mtrr_type *frs)
190get_fixed_ranges(mtrr_type * frs)
191{ 193{
192 unsigned int *p = (unsigned int *) frs; 194 unsigned int *p = (unsigned int *)frs;
193 int i; 195 int i;
194 196
195 k8_check_syscfg_dram_mod_en(); 197 k8_check_syscfg_dram_mod_en();
@@ -217,22 +219,22 @@ static void __init print_fixed_last(void)
217 if (!last_fixed_end) 219 if (!last_fixed_end)
218 return; 220 return;
219 221
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, 222 pr_debug(" %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); 223 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222 224
223 last_fixed_end = 0; 225 last_fixed_end = 0;
224} 226}
225 227
226static void __init update_fixed_last(unsigned base, unsigned end, 228static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type) 229 mtrr_type type)
228{ 230{
229 last_fixed_start = base; 231 last_fixed_start = base;
230 last_fixed_end = end; 232 last_fixed_end = end;
231 last_fixed_type = type; 233 last_fixed_type = type;
232} 234}
233 235
234static void __init print_fixed(unsigned base, unsigned step, 236static void __init
235 const mtrr_type *types) 237print_fixed(unsigned base, unsigned step, const mtrr_type *types)
236{ 238{
237 unsigned i; 239 unsigned i;
238 240
@@ -259,54 +261,55 @@ static void __init print_mtrr_state(void)
259 unsigned int i; 261 unsigned int i;
260 int high_width; 262 int high_width;
261 263
262 printk(KERN_DEBUG "MTRR default type: %s\n", 264 pr_debug("MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type)); 265 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) { 266 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", 267 pr_debug("MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis"); 268 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); 269 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i) 270 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); 271 print_fixed(0x80000 + i * 0x20000, 0x04000,
272 mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i) 273 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); 274 print_fixed(0xC0000 + i * 0x08000, 0x01000,
275 mtrr_state.fixed_ranges + (i + 3) * 8);
272 276
273 /* tail */ 277 /* tail */
274 print_fixed_last(); 278 print_fixed_last();
275 } 279 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", 280 pr_debug("MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis"); 281 mtrr_state.enabled & 2 ? "en" : "dis");
278 if (size_or_mask & 0xffffffffUL) 282 if (size_or_mask & 0xffffffffUL)
279 high_width = ffs(size_or_mask & 0xffffffffUL) - 1; 283 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
280 else 284 else
281 high_width = ffs(size_or_mask>>32) + 32 - 1; 285 high_width = ffs(size_or_mask>>32) + 32 - 1;
282 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; 286 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
287
283 for (i = 0; i < num_var_ranges; ++i) { 288 for (i = 0; i < num_var_ranges; ++i) {
284 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 289 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
285 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", 290 pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
286 i, 291 i,
287 high_width, 292 high_width,
288 mtrr_state.var_ranges[i].base_hi, 293 mtrr_state.var_ranges[i].base_hi,
289 mtrr_state.var_ranges[i].base_lo >> 12, 294 mtrr_state.var_ranges[i].base_lo >> 12,
290 high_width, 295 high_width,
291 mtrr_state.var_ranges[i].mask_hi, 296 mtrr_state.var_ranges[i].mask_hi,
292 mtrr_state.var_ranges[i].mask_lo >> 12, 297 mtrr_state.var_ranges[i].mask_lo >> 12,
293 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); 298 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
294 else 299 else
295 printk(KERN_DEBUG " %u disabled\n", i); 300 pr_debug(" %u disabled\n", i);
296 }
297 if (mtrr_tom2) {
298 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
299 mtrr_tom2, mtrr_tom2>>20);
300 } 301 }
302 if (mtrr_tom2)
303 pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
301} 304}
302 305
303/* Grab all of the MTRR state for this CPU into *state */ 306/* Grab all of the MTRR state for this CPU into *state */
304void __init get_mtrr_state(void) 307void __init get_mtrr_state(void)
305{ 308{
306 unsigned int i;
307 struct mtrr_var_range *vrs; 309 struct mtrr_var_range *vrs;
308 unsigned lo, dummy;
309 unsigned long flags; 310 unsigned long flags;
311 unsigned lo, dummy;
312 unsigned int i;
310 313
311 vrs = mtrr_state.var_ranges; 314 vrs = mtrr_state.var_ranges;
312 315
@@ -324,6 +327,7 @@ void __init get_mtrr_state(void)
324 327
325 if (amd_special_default_mtrr()) { 328 if (amd_special_default_mtrr()) {
326 unsigned low, high; 329 unsigned low, high;
330
327 /* TOP_MEM2 */ 331 /* TOP_MEM2 */
328 rdmsr(MSR_K8_TOP_MEM2, low, high); 332 rdmsr(MSR_K8_TOP_MEM2, low, high);
329 mtrr_tom2 = high; 333 mtrr_tom2 = high;
@@ -344,10 +348,9 @@ void __init get_mtrr_state(void)
344 348
345 post_set(); 349 post_set();
346 local_irq_restore(flags); 350 local_irq_restore(flags);
347
348} 351}
349 352
350/* Some BIOS's are fucked and don't set all MTRRs the same! */ 353/* Some BIOS's are messed up and don't set all MTRRs the same! */
351void __init mtrr_state_warn(void) 354void __init mtrr_state_warn(void)
352{ 355{
353 unsigned long mask = smp_changes_mask; 356 unsigned long mask = smp_changes_mask;
@@ -355,28 +358,33 @@ void __init mtrr_state_warn(void)
355 if (!mask) 358 if (!mask)
356 return; 359 return;
357 if (mask & MTRR_CHANGE_MASK_FIXED) 360 if (mask & MTRR_CHANGE_MASK_FIXED)
358 printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); 361 pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
359 if (mask & MTRR_CHANGE_MASK_VARIABLE) 362 if (mask & MTRR_CHANGE_MASK_VARIABLE)
360 printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); 363 pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
361 if (mask & MTRR_CHANGE_MASK_DEFTYPE) 364 if (mask & MTRR_CHANGE_MASK_DEFTYPE)
362 printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); 365 pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
366
363 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); 367 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
364 printk(KERN_INFO "mtrr: corrected configuration.\n"); 368 printk(KERN_INFO "mtrr: corrected configuration.\n");
365} 369}
366 370
367/* Doesn't attempt to pass an error out to MTRR users 371/*
368 because it's quite complicated in some cases and probably not 372 * Doesn't attempt to pass an error out to MTRR users
369 worth it because the best error handling is to ignore it. */ 373 * because it's quite complicated in some cases and probably not
374 * worth it because the best error handling is to ignore it.
375 */
370void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) 376void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
371{ 377{
372 if (wrmsr_safe(msr, a, b) < 0) 378 if (wrmsr_safe(msr, a, b) < 0) {
373 printk(KERN_ERR 379 printk(KERN_ERR
374 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", 380 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
375 smp_processor_id(), msr, a, b); 381 smp_processor_id(), msr, a, b);
382 }
376} 383}
377 384
378/** 385/**
379 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 386 * set_fixed_range - checks & updates a fixed-range MTRR if it
387 * differs from the value it should have
380 * @msr: MSR address of the MTTR which should be checked and updated 388 * @msr: MSR address of the MTTR which should be checked and updated
381 * @changed: pointer which indicates whether the MTRR needed to be changed 389 * @changed: pointer which indicates whether the MTRR needed to be changed
382 * @msrwords: pointer to the MSR values which the MSR should have 390 * @msrwords: pointer to the MSR values which the MSR should have
@@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
401 * 409 *
402 * Returns: The index of the region on success, else negative on error. 410 * Returns: The index of the region on success, else negative on error.
403 */ 411 */
404int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) 412int
413generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
405{ 414{
406 int i, max;
407 mtrr_type ltype;
408 unsigned long lbase, lsize; 415 unsigned long lbase, lsize;
416 mtrr_type ltype;
417 int i, max;
409 418
410 max = num_var_ranges; 419 max = num_var_ranges;
411 if (replace_reg >= 0 && replace_reg < max) 420 if (replace_reg >= 0 && replace_reg < max)
412 return replace_reg; 421 return replace_reg;
422
413 for (i = 0; i < max; ++i) { 423 for (i = 0; i < max; ++i) {
414 mtrr_if->get(i, &lbase, &lsize, &ltype); 424 mtrr_if->get(i, &lbase, &lsize, &ltype);
415 if (lsize == 0) 425 if (lsize == 0)
416 return i; 426 return i;
417 } 427 }
428
418 return -ENOSPC; 429 return -ENOSPC;
419} 430}
420 431
@@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
434 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 445 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
435 446
436 if ((mask_lo & 0x800) == 0) { 447 if ((mask_lo & 0x800) == 0) {
437 /* Invalid (i.e. free) range */ 448 /* Invalid (i.e. free) range */
438 *base = 0; 449 *base = 0;
439 *size = 0; 450 *size = 0;
440 *type = 0; 451 *type = 0;
@@ -471,27 +482,31 @@ out_put_cpu:
471} 482}
472 483
473/** 484/**
474 * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set 485 * set_fixed_ranges - checks & updates the fixed-range MTRRs if they
486 * differ from the saved set
475 * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() 487 * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
476 */ 488 */
477static int set_fixed_ranges(mtrr_type * frs) 489static int set_fixed_ranges(mtrr_type *frs)
478{ 490{
479 unsigned long long *saved = (unsigned long long *) frs; 491 unsigned long long *saved = (unsigned long long *)frs;
480 bool changed = false; 492 bool changed = false;
481 int block=-1, range; 493 int block = -1, range;
482 494
483 k8_check_syscfg_dram_mod_en(); 495 k8_check_syscfg_dram_mod_en();
484 496
485 while (fixed_range_blocks[++block].ranges) 497 while (fixed_range_blocks[++block].ranges) {
486 for (range=0; range < fixed_range_blocks[block].ranges; range++) 498 for (range = 0; range < fixed_range_blocks[block].ranges; range++)
487 set_fixed_range(fixed_range_blocks[block].base_msr + range, 499 set_fixed_range(fixed_range_blocks[block].base_msr + range,
488 &changed, (unsigned int *) saved++); 500 &changed, (unsigned int *)saved++);
501 }
489 502
490 return changed; 503 return changed;
491} 504}
492 505
493/* Set the MSR pair relating to a var range. Returns TRUE if 506/*
494 changes are made */ 507 * Set the MSR pair relating to a var range.
508 * Returns true if changes are made.
509 */
495static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) 510static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
496{ 511{
497 unsigned int lo, hi; 512 unsigned int lo, hi;
@@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
501 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) 516 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
502 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 517 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
503 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 518 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
519
504 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); 520 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
505 changed = true; 521 changed = true;
506 } 522 }
@@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi;
526 */ 542 */
527static unsigned long set_mtrr_state(void) 543static unsigned long set_mtrr_state(void)
528{ 544{
529 unsigned int i;
530 unsigned long change_mask = 0; 545 unsigned long change_mask = 0;
546 unsigned int i;
531 547
532 for (i = 0; i < num_var_ranges; i++) 548 for (i = 0; i < num_var_ranges; i++) {
533 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 549 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
534 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 550 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
551 }
535 552
536 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) 553 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
537 change_mask |= MTRR_CHANGE_MASK_FIXED; 554 change_mask |= MTRR_CHANGE_MASK_FIXED;
538 555
539 /* Set_mtrr_restore restores the old value of MTRRdefType, 556 /*
540 so to set it we fiddle with the saved value */ 557 * Set_mtrr_restore restores the old value of MTRRdefType,
558 * so to set it we fiddle with the saved value:
559 */
541 if ((deftype_lo & 0xff) != mtrr_state.def_type 560 if ((deftype_lo & 0xff) != mtrr_state.def_type
542 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 561 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
543 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); 562
563 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
564 (mtrr_state.enabled << 10);
544 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 565 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
545 } 566 }
546 567
@@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void)
548} 569}
549 570
550 571
551static unsigned long cr4 = 0; 572static unsigned long cr4;
552static DEFINE_SPINLOCK(set_atomicity_lock); 573static DEFINE_SPINLOCK(set_atomicity_lock);
553 574
554/* 575/*
555 * Since we are disabling the cache don't allow any interrupts - they 576 * Since we are disabling the cache don't allow any interrupts,
556 * would run extremely slow and would only increase the pain. The caller must 577 * they would run extremely slow and would only increase the pain.
557 * ensure that local interrupts are disabled and are reenabled after post_set() 578 *
558 * has been called. 579 * The caller must ensure that local interrupts are disabled and
580 * are reenabled after post_set() has been called.
559 */ 581 */
560
561static void prepare_set(void) __acquires(set_atomicity_lock) 582static void prepare_set(void) __acquires(set_atomicity_lock)
562{ 583{
563 unsigned long cr0; 584 unsigned long cr0;
564 585
565 /* Note that this is not ideal, since the cache is only flushed/disabled 586 /*
566 for this CPU while the MTRRs are changed, but changing this requires 587 * Note that this is not ideal
567 more invasive changes to the way the kernel boots */ 588 * since the cache is only flushed/disabled for this CPU while the
589 * MTRRs are changed, but changing this requires more invasive
590 * changes to the way the kernel boots
591 */
568 592
569 spin_lock(&set_atomicity_lock); 593 spin_lock(&set_atomicity_lock);
570 594
571 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
572 cr0 = read_cr0() | X86_CR0_CD; 596 cr0 = read_cr0() | X86_CR0_CD;
573 write_cr0(cr0); 597 write_cr0(cr0);
574 wbinvd(); 598 wbinvd();
575 599
576 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 600 /* Save value of CR4 and clear Page Global Enable (bit 7) */
577 if ( cpu_has_pge ) { 601 if (cpu_has_pge) {
578 cr4 = read_cr4(); 602 cr4 = read_cr4();
579 write_cr4(cr4 & ~X86_CR4_PGE); 603 write_cr4(cr4 & ~X86_CR4_PGE);
580 } 604 }
@@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
582 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ 606 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
583 __flush_tlb(); 607 __flush_tlb();
584 608
585 /* Save MTRR state */ 609 /* Save MTRR state */
586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); 610 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 611
588 /* Disable MTRRs, and set the default type to uncached */ 612 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); 613 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 614}
591 615
592static void post_set(void) __releases(set_atomicity_lock) 616static void post_set(void) __releases(set_atomicity_lock)
593{ 617{
594 /* Flush TLBs (no need to flush caches - they are disabled) */ 618 /* Flush TLBs (no need to flush caches - they are disabled) */
595 __flush_tlb(); 619 __flush_tlb();
596 620
597 /* Intel (P6) standard MTRRs */ 621 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); 622 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 623
600 /* Enable caches */ 624 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 625 write_cr0(read_cr0() & 0xbfffffff);
602 626
603 /* Restore value of CR4 */ 627 /* Restore value of CR4 */
604 if ( cpu_has_pge ) 628 if (cpu_has_pge)
605 write_cr4(cr4); 629 write_cr4(cr4);
606 spin_unlock(&set_atomicity_lock); 630 spin_unlock(&set_atomicity_lock);
607} 631}
@@ -623,24 +647,27 @@ static void generic_set_all(void)
623 post_set(); 647 post_set();
624 local_irq_restore(flags); 648 local_irq_restore(flags);
625 649
626 /* Use the atomic bitops to update the global mask */ 650 /* Use the atomic bitops to update the global mask */
627 for (count = 0; count < sizeof mask * 8; ++count) { 651 for (count = 0; count < sizeof mask * 8; ++count) {
628 if (mask & 0x01) 652 if (mask & 0x01)
629 set_bit(count, &smp_changes_mask); 653 set_bit(count, &smp_changes_mask);
630 mask >>= 1; 654 mask >>= 1;
631 } 655 }
632 656
633} 657}
634 658
659/**
660 * generic_set_mtrr - set variable MTRR register on the local CPU.
661 *
662 * @reg: The register to set.
663 * @base: The base address of the region.
664 * @size: The size of the region. If this is 0 the region is disabled.
665 * @type: The type of the region.
666 *
667 * Returns nothing.
668 */
635static void generic_set_mtrr(unsigned int reg, unsigned long base, 669static void generic_set_mtrr(unsigned int reg, unsigned long base,
636 unsigned long size, mtrr_type type) 670 unsigned long size, mtrr_type type)
637/* [SUMMARY] Set variable MTRR register on the local CPU.
638 <reg> The register to set.
639 <base> The base address of the region.
640 <size> The size of the region. If this is 0 the region is disabled.
641 <type> The type of the region.
642 [RETURNS] Nothing.
643*/
644{ 671{
645 unsigned long flags; 672 unsigned long flags;
646 struct mtrr_var_range *vr; 673 struct mtrr_var_range *vr;
@@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
651 prepare_set(); 678 prepare_set();
652 679
653 if (size == 0) { 680 if (size == 0) {
654 /* The invalid bit is kept in the mask, so we simply clear the 681 /*
655 relevant mask register to disable a range. */ 682 * The invalid bit is kept in the mask, so we simply
683 * clear the relevant mask register to disable a range.
684 */
656 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); 685 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
657 memset(vr, 0, sizeof(struct mtrr_var_range)); 686 memset(vr, 0, sizeof(struct mtrr_var_range));
658 } else { 687 } else {
@@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
669 local_irq_restore(flags); 698 local_irq_restore(flags);
670} 699}
671 700
672int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) 701int generic_validate_add_page(unsigned long base, unsigned long size,
702 unsigned int type)
673{ 703{
674 unsigned long lbase, last; 704 unsigned long lbase, last;
675 705
676 /* For Intel PPro stepping <= 7, must be 4 MiB aligned 706 /*
677 and not touch 0x70000000->0x7003FFFF */ 707 * For Intel PPro stepping <= 7
708 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
709 */
678 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && 710 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
679 boot_cpu_data.x86_model == 1 && 711 boot_cpu_data.x86_model == 1 &&
680 boot_cpu_data.x86_mask <= 7) { 712 boot_cpu_data.x86_mask <= 7) {
681 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { 713 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
682 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 714 pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
683 return -EINVAL; 715 return -EINVAL;
684 } 716 }
685 if (!(base + size < 0x70000 || base > 0x7003F) && 717 if (!(base + size < 0x70000 || base > 0x7003F) &&
686 (type == MTRR_TYPE_WRCOMB 718 (type == MTRR_TYPE_WRCOMB
687 || type == MTRR_TYPE_WRBACK)) { 719 || type == MTRR_TYPE_WRBACK)) {
688 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 720 pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
689 return -EINVAL; 721 return -EINVAL;
690 } 722 }
691 } 723 }
692 724
693 /* Check upper bits of base and last are equal and lower bits are 0 725 /*
694 for base and 1 for last */ 726 * Check upper bits of base and last are equal and lower bits are 0
727 * for base and 1 for last
728 */
695 last = base + size - 1; 729 last = base + size - 1;
696 for (lbase = base; !(lbase & 1) && (last & 1); 730 for (lbase = base; !(lbase & 1) && (last & 1);
697 lbase = lbase >> 1, last = last >> 1) ; 731 lbase = lbase >> 1, last = last >> 1)
732 ;
698 if (lbase != last) { 733 if (lbase != last) {
699 printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", 734 pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
700 base, size);
701 return -EINVAL; 735 return -EINVAL;
702 } 736 }
703 return 0; 737 return 0;
704} 738}
705 739
706
707static int generic_have_wrcomb(void) 740static int generic_have_wrcomb(void)
708{ 741{
709 unsigned long config, dummy; 742 unsigned long config, dummy;
710 rdmsr(MSR_MTRRcap, config, dummy); 743 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 744 return config & (1 << 10);
712} 745}
713 746
714int positive_have_wrcomb(void) 747int positive_have_wrcomb(void)
@@ -716,14 +749,15 @@ int positive_have_wrcomb(void)
716 return 1; 749 return 1;
717} 750}
718 751
719/* generic structure... 752/*
753 * Generic structure...
720 */ 754 */
721struct mtrr_ops generic_mtrr_ops = { 755struct mtrr_ops generic_mtrr_ops = {
722 .use_intel_if = 1, 756 .use_intel_if = 1,
723 .set_all = generic_set_all, 757 .set_all = generic_set_all,
724 .get = generic_get_mtrr, 758 .get = generic_get_mtrr,
725 .get_free_region = generic_get_free_region, 759 .get_free_region = generic_get_free_region,
726 .set = generic_set_mtrr, 760 .set = generic_set_mtrr,
727 .validate_add_page = generic_validate_add_page, 761 .validate_add_page = generic_validate_add_page,
728 .have_wrcomb = generic_have_wrcomb, 762 .have_wrcomb = generic_have_wrcomb,
729}; 763};
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index fb73a52913a4..f04e72527604 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -1,27 +1,28 @@
1#include <linux/init.h>
2#include <linux/proc_fs.h>
3#include <linux/capability.h> 1#include <linux/capability.h>
4#include <linux/ctype.h>
5#include <linux/module.h>
6#include <linux/seq_file.h> 2#include <linux/seq_file.h>
7#include <asm/uaccess.h> 3#include <linux/uaccess.h>
4#include <linux/proc_fs.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/init.h>
8 8
9#define LINE_SIZE 80 9#define LINE_SIZE 80
10 10
11#include <asm/mtrr.h> 11#include <asm/mtrr.h>
12
12#include "mtrr.h" 13#include "mtrr.h"
13 14
14#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 15#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
15 16
16static const char *const mtrr_strings[MTRR_NUM_TYPES] = 17static const char *const mtrr_strings[MTRR_NUM_TYPES] =
17{ 18{
18 "uncachable", /* 0 */ 19 "uncachable", /* 0 */
19 "write-combining", /* 1 */ 20 "write-combining", /* 1 */
20 "?", /* 2 */ 21 "?", /* 2 */
21 "?", /* 3 */ 22 "?", /* 3 */
22 "write-through", /* 4 */ 23 "write-through", /* 4 */
23 "write-protect", /* 5 */ 24 "write-protect", /* 5 */
24 "write-back", /* 6 */ 25 "write-back", /* 6 */
25}; 26};
26 27
27const char *mtrr_attrib_to_str(int x) 28const char *mtrr_attrib_to_str(int x)
@@ -35,8 +36,8 @@ static int
35mtrr_file_add(unsigned long base, unsigned long size, 36mtrr_file_add(unsigned long base, unsigned long size,
36 unsigned int type, bool increment, struct file *file, int page) 37 unsigned int type, bool increment, struct file *file, int page)
37{ 38{
39 unsigned int *fcount = FILE_FCOUNT(file);
38 int reg, max; 40 int reg, max;
39 unsigned int *fcount = FILE_FCOUNT(file);
40 41
41 max = num_var_ranges; 42 max = num_var_ranges;
42 if (fcount == NULL) { 43 if (fcount == NULL) {
@@ -61,8 +62,8 @@ static int
61mtrr_file_del(unsigned long base, unsigned long size, 62mtrr_file_del(unsigned long base, unsigned long size,
62 struct file *file, int page) 63 struct file *file, int page)
63{ 64{
64 int reg;
65 unsigned int *fcount = FILE_FCOUNT(file); 65 unsigned int *fcount = FILE_FCOUNT(file);
66 int reg;
66 67
67 if (!page) { 68 if (!page) {
68 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) 69 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
@@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size,
81 return reg; 82 return reg;
82} 83}
83 84
84/* RED-PEN: seq_file can seek now. this is ignored. */ 85/*
86 * seq_file can seek but we ignore it.
87 *
88 * Format of control line:
89 * "base=%Lx size=%Lx type=%s" or "disable=%d"
90 */
85static ssize_t 91static ssize_t
86mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) 92mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
87/* Format of control line:
88 "base=%Lx size=%Lx type=%s" OR:
89 "disable=%d"
90*/
91{ 93{
92 int i, err; 94 int i, err;
93 unsigned long reg; 95 unsigned long reg;
@@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
100 return -EPERM; 102 return -EPERM;
101 if (!len) 103 if (!len)
102 return -EINVAL; 104 return -EINVAL;
105
103 memset(line, 0, LINE_SIZE); 106 memset(line, 0, LINE_SIZE);
104 if (len > LINE_SIZE) 107 if (len > LINE_SIZE)
105 len = LINE_SIZE; 108 len = LINE_SIZE;
106 if (copy_from_user(line, buf, len - 1)) 109 if (copy_from_user(line, buf, len - 1))
107 return -EFAULT; 110 return -EFAULT;
111
108 linelen = strlen(line); 112 linelen = strlen(line);
109 ptr = line + linelen - 1; 113 ptr = line + linelen - 1;
110 if (linelen && *ptr == '\n') 114 if (linelen && *ptr == '\n')
111 *ptr = '\0'; 115 *ptr = '\0';
116
112 if (!strncmp(line, "disable=", 8)) { 117 if (!strncmp(line, "disable=", 8)) {
113 reg = simple_strtoul(line + 8, &ptr, 0); 118 reg = simple_strtoul(line + 8, &ptr, 0);
114 err = mtrr_del_page(reg, 0, 0); 119 err = mtrr_del_page(reg, 0, 0);
@@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
116 return err; 121 return err;
117 return len; 122 return len;
118 } 123 }
124
119 if (strncmp(line, "base=", 5)) 125 if (strncmp(line, "base=", 5))
120 return -EINVAL; 126 return -EINVAL;
127
121 base = simple_strtoull(line + 5, &ptr, 0); 128 base = simple_strtoull(line + 5, &ptr, 0);
122 for (; isspace(*ptr); ++ptr) ; 129 while (isspace(*ptr))
130 ptr++;
131
123 if (strncmp(ptr, "size=", 5)) 132 if (strncmp(ptr, "size=", 5))
124 return -EINVAL; 133 return -EINVAL;
134
125 size = simple_strtoull(ptr + 5, &ptr, 0); 135 size = simple_strtoull(ptr + 5, &ptr, 0);
126 if ((base & 0xfff) || (size & 0xfff)) 136 if ((base & 0xfff) || (size & 0xfff))
127 return -EINVAL; 137 return -EINVAL;
128 for (; isspace(*ptr); ++ptr) ; 138 while (isspace(*ptr))
139 ptr++;
140
129 if (strncmp(ptr, "type=", 5)) 141 if (strncmp(ptr, "type=", 5))
130 return -EINVAL; 142 return -EINVAL;
131 ptr += 5; 143 ptr += 5;
132 for (; isspace(*ptr); ++ptr) ; 144 while (isspace(*ptr))
145 ptr++;
146
133 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 147 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
134 if (strcmp(ptr, mtrr_strings[i])) 148 if (strcmp(ptr, mtrr_strings[i]))
135 continue; 149 continue;
136 base >>= PAGE_SHIFT; 150 base >>= PAGE_SHIFT;
137 size >>= PAGE_SHIFT; 151 size >>= PAGE_SHIFT;
138 err = 152 err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
139 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
140 true);
141 if (err < 0) 153 if (err < 0)
142 return err; 154 return err;
143 return len; 155 return len;
@@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
181 case MTRRIOC32_SET_PAGE_ENTRY: 193 case MTRRIOC32_SET_PAGE_ENTRY:
182 case MTRRIOC32_DEL_PAGE_ENTRY: 194 case MTRRIOC32_DEL_PAGE_ENTRY:
183 case MTRRIOC32_KILL_PAGE_ENTRY: { 195 case MTRRIOC32_KILL_PAGE_ENTRY: {
184 struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; 196 struct mtrr_sentry32 __user *s32;
197
198 s32 = (struct mtrr_sentry32 __user *)__arg;
185 err = get_user(sentry.base, &s32->base); 199 err = get_user(sentry.base, &s32->base);
186 err |= get_user(sentry.size, &s32->size); 200 err |= get_user(sentry.size, &s32->size);
187 err |= get_user(sentry.type, &s32->type); 201 err |= get_user(sentry.type, &s32->type);
@@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
191 } 205 }
192 case MTRRIOC32_GET_ENTRY: 206 case MTRRIOC32_GET_ENTRY:
193 case MTRRIOC32_GET_PAGE_ENTRY: { 207 case MTRRIOC32_GET_PAGE_ENTRY: {
194 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; 208 struct mtrr_gentry32 __user *g32;
209
210 g32 = (struct mtrr_gentry32 __user *)__arg;
195 err = get_user(gentry.regnum, &g32->regnum); 211 err = get_user(gentry.regnum, &g32->regnum);
196 err |= get_user(gentry.base, &g32->base); 212 err |= get_user(gentry.base, &g32->base);
197 err |= get_user(gentry.size, &g32->size); 213 err |= get_user(gentry.size, &g32->size);
@@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
314 if (err) 330 if (err)
315 return err; 331 return err;
316 332
317 switch(cmd) { 333 switch (cmd) {
318 case MTRRIOC_GET_ENTRY: 334 case MTRRIOC_GET_ENTRY:
319 case MTRRIOC_GET_PAGE_ENTRY: 335 case MTRRIOC_GET_PAGE_ENTRY:
320 if (copy_to_user(arg, &gentry, sizeof gentry)) 336 if (copy_to_user(arg, &gentry, sizeof gentry))
@@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
323#ifdef CONFIG_COMPAT 339#ifdef CONFIG_COMPAT
324 case MTRRIOC32_GET_ENTRY: 340 case MTRRIOC32_GET_ENTRY:
325 case MTRRIOC32_GET_PAGE_ENTRY: { 341 case MTRRIOC32_GET_PAGE_ENTRY: {
326 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; 342 struct mtrr_gentry32 __user *g32;
343
344 g32 = (struct mtrr_gentry32 __user *)__arg;
327 err = put_user(gentry.base, &g32->base); 345 err = put_user(gentry.base, &g32->base);
328 err |= put_user(gentry.size, &g32->size); 346 err |= put_user(gentry.size, &g32->size);
329 err |= put_user(gentry.regnum, &g32->regnum); 347 err |= put_user(gentry.regnum, &g32->regnum);
@@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
335 return err; 353 return err;
336} 354}
337 355
338static int 356static int mtrr_close(struct inode *ino, struct file *file)
339mtrr_close(struct inode *ino, struct file *file)
340{ 357{
341 int i, max;
342 unsigned int *fcount = FILE_FCOUNT(file); 358 unsigned int *fcount = FILE_FCOUNT(file);
359 int i, max;
343 360
344 if (fcount != NULL) { 361 if (fcount != NULL) {
345 max = num_var_ranges; 362 max = num_var_ranges;
@@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset);
359 376
360static int mtrr_open(struct inode *inode, struct file *file) 377static int mtrr_open(struct inode *inode, struct file *file)
361{ 378{
362 if (!mtrr_if) 379 if (!mtrr_if)
363 return -EIO; 380 return -EIO;
364 if (!mtrr_if->get) 381 if (!mtrr_if->get)
365 return -ENXIO; 382 return -ENXIO;
366 return single_open(file, mtrr_seq_show, NULL); 383 return single_open(file, mtrr_seq_show, NULL);
367} 384}
368 385
369static const struct file_operations mtrr_fops = { 386static const struct file_operations mtrr_fops = {
370 .owner = THIS_MODULE, 387 .owner = THIS_MODULE,
371 .open = mtrr_open, 388 .open = mtrr_open,
372 .read = seq_read, 389 .read = seq_read,
373 .llseek = seq_lseek, 390 .llseek = seq_lseek,
374 .write = mtrr_write, 391 .write = mtrr_write,
375 .unlocked_ioctl = mtrr_ioctl, 392 .unlocked_ioctl = mtrr_ioctl,
376 .compat_ioctl = mtrr_ioctl, 393 .compat_ioctl = mtrr_ioctl,
377 .release = mtrr_close, 394 .release = mtrr_close,
378}; 395};
379 396
380static int mtrr_seq_show(struct seq_file *seq, void *offset) 397static int mtrr_seq_show(struct seq_file *seq, void *offset)
@@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
388 max = num_var_ranges; 405 max = num_var_ranges;
389 for (i = 0; i < max; i++) { 406 for (i = 0; i < max; i++) {
390 mtrr_if->get(i, &base, &size, &type); 407 mtrr_if->get(i, &base, &size, &type);
391 if (size == 0) 408 if (size == 0) {
392 mtrr_usage_table[i] = 0; 409 mtrr_usage_table[i] = 0;
393 else { 410 continue;
394 if (size < (0x100000 >> PAGE_SHIFT)) {
395 /* less than 1MB */
396 factor = 'K';
397 size <<= PAGE_SHIFT - 10;
398 } else {
399 factor = 'M';
400 size >>= 20 - PAGE_SHIFT;
401 }
402 /* RED-PEN: base can be > 32bit */
403 len += seq_printf(seq,
404 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
405 i, base, base >> (20 - PAGE_SHIFT), size, factor,
406 mtrr_usage_table[i], mtrr_attrib_to_str(type));
407 } 411 }
412 if (size < (0x100000 >> PAGE_SHIFT)) {
413 /* less than 1MB */
414 factor = 'K';
415 size <<= PAGE_SHIFT - 10;
416 } else {
417 factor = 'M';
418 size >>= 20 - PAGE_SHIFT;
419 }
420 /* Base can be > 32bit */
421 len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
422 "(%5luMB), size=%5lu%cB, count=%d: %s\n",
423 i, base, base >> (20 - PAGE_SHIFT), size,
424 factor, mtrr_usage_table[i],
425 mtrr_attrib_to_str(type));
408 } 426 }
409 return 0; 427 return 0;
410} 428}
@@ -422,6 +440,5 @@ static int __init mtrr_if_init(void)
422 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); 440 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
423 return 0; 441 return 0;
424} 442}
425
426arch_initcall(mtrr_if_init); 443arch_initcall(mtrr_if_init);
427#endif /* CONFIG_PROC_FS */ 444#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 8fc248b5aeaf..84e83de54575 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -25,43 +25,49 @@
25 Operating System Writer's Guide" (Intel document number 242692), 25 Operating System Writer's Guide" (Intel document number 242692),
26 section 11.11.7 26 section 11.11.7
27 27
28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> 28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29 on 6-7 March 2002. 29 on 6-7 March 2002.
30 Source: Intel Architecture Software Developers Manual, Volume 3: 30 Source: Intel Architecture Software Developers Manual, Volume 3:
31 System Programming Guide; Section 9.11. (1997 edition - PPro). 31 System Programming Guide; Section 9.11. (1997 edition - PPro).
32*/ 32*/
33 33
34#define DEBUG
35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37
38#include <linux/kvm_para.h>
39#include <linux/uaccess.h>
34#include <linux/module.h> 40#include <linux/module.h>
41#include <linux/mutex.h>
35#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/sort.h>
44#include <linux/cpu.h>
36#include <linux/pci.h> 45#include <linux/pci.h>
37#include <linux/smp.h> 46#include <linux/smp.h>
38#include <linux/cpu.h>
39#include <linux/mutex.h>
40#include <linux/sort.h>
41 47
48#include <asm/processor.h>
42#include <asm/e820.h> 49#include <asm/e820.h>
43#include <asm/mtrr.h> 50#include <asm/mtrr.h>
44#include <asm/uaccess.h>
45#include <asm/processor.h>
46#include <asm/msr.h> 51#include <asm/msr.h>
47#include <asm/kvm_para.h> 52
48#include "mtrr.h" 53#include "mtrr.h"
49 54
50u32 num_var_ranges = 0; 55u32 num_var_ranges;
51 56
52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; 57unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 58static DEFINE_MUTEX(mtrr_mutex);
54 59
55u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init;
56 62
57static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; 63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
58 64
59struct mtrr_ops * mtrr_if = NULL; 65struct mtrr_ops *mtrr_if;
60 66
61static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
62 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
63 69
64void set_mtrr_ops(struct mtrr_ops * ops) 70void set_mtrr_ops(struct mtrr_ops *ops)
65{ 71{
66 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
67 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -72,30 +78,36 @@ static int have_wrcomb(void)
72{ 78{
73 struct pci_dev *dev; 79 struct pci_dev *dev;
74 u8 rev; 80 u8 rev;
75 81
76 if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { 82 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
77 /* ServerWorks LE chipsets < rev 6 have problems with write-combining 83 if (dev != NULL) {
78 Don't allow it and leave room for other chipsets to be tagged */ 84 /*
85 * ServerWorks LE chipsets < rev 6 have problems with
86 * write-combining. Don't allow it and leave room for other
87 * chipsets to be tagged
88 */
79 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 89 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
80 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 90 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
81 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 91 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
82 if (rev <= 5) { 92 if (rev <= 5) {
83 printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
84 pci_dev_put(dev); 94 pci_dev_put(dev);
85 return 0; 95 return 0;
86 } 96 }
87 } 97 }
88 /* Intel 450NX errata # 23. Non ascending cacheline evictions to 98 /*
89 write combining memory may resulting in data corruption */ 99 * Intel 450NX errata # 23. Non ascending cacheline evictions to
100 * write combining memory may resulting in data corruption
101 */
90 if (dev->vendor == PCI_VENDOR_ID_INTEL && 102 if (dev->vendor == PCI_VENDOR_ID_INTEL &&
91 dev->device == PCI_DEVICE_ID_INTEL_82451NX) { 103 dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
92 printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); 104 pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
93 pci_dev_put(dev); 105 pci_dev_put(dev);
94 return 0; 106 return 0;
95 } 107 }
96 pci_dev_put(dev); 108 pci_dev_put(dev);
97 } 109 }
98 return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); 110 return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
99} 111}
100 112
101/* This function returns the number of variable MTRRs */ 113/* This function returns the number of variable MTRRs */
@@ -103,12 +115,13 @@ static void __init set_num_var_ranges(void)
103{ 115{
104 unsigned long config = 0, dummy; 116 unsigned long config = 0, dummy;
105 117
106 if (use_intel()) { 118 if (use_intel())
107 rdmsr(MSR_MTRRcap, config, dummy); 119 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 120 else if (is_cpu(AMD))
109 config = 2; 121 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 122 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
111 config = 8; 123 config = 8;
124
112 num_var_ranges = config & 0xff; 125 num_var_ranges = config & 0xff;
113} 126}
114 127
@@ -130,10 +143,12 @@ struct set_mtrr_data {
130 mtrr_type smp_type; 143 mtrr_type smp_type;
131}; 144};
132 145
146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 *
149 * Returns nothing.
150 */
133static void ipi_handler(void *info) 151static void ipi_handler(void *info)
134/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
135 [RETURNS] Nothing.
136*/
137{ 152{
138#ifdef CONFIG_SMP 153#ifdef CONFIG_SMP
139 struct set_mtrr_data *data = info; 154 struct set_mtrr_data *data = info;
@@ -142,18 +157,22 @@ static void ipi_handler(void *info)
142 local_irq_save(flags); 157 local_irq_save(flags);
143 158
144 atomic_dec(&data->count); 159 atomic_dec(&data->count);
145 while(!atomic_read(&data->gate)) 160 while (!atomic_read(&data->gate))
146 cpu_relax(); 161 cpu_relax();
147 162
148 /* The master has cleared me to execute */ 163 /* The master has cleared me to execute */
149 if (data->smp_reg != ~0U) 164 if (data->smp_reg != ~0U) {
150 mtrr_if->set(data->smp_reg, data->smp_base, 165 mtrr_if->set(data->smp_reg, data->smp_base,
151 data->smp_size, data->smp_type); 166 data->smp_size, data->smp_type);
152 else 167 } else if (mtrr_aps_delayed_init) {
168 /*
169 * Initialize the MTRRs inaddition to the synchronisation.
170 */
153 mtrr_if->set_all(); 171 mtrr_if->set_all();
172 }
154 173
155 atomic_dec(&data->count); 174 atomic_dec(&data->count);
156 while(atomic_read(&data->gate)) 175 while (atomic_read(&data->gate))
157 cpu_relax(); 176 cpu_relax();
158 177
159 atomic_dec(&data->count); 178 atomic_dec(&data->count);
@@ -161,7 +180,8 @@ static void ipi_handler(void *info)
161#endif 180#endif
162} 181}
163 182
164static inline int types_compatible(mtrr_type type1, mtrr_type type2) { 183static inline int types_compatible(mtrr_type type1, mtrr_type type2)
184{
165 return type1 == MTRR_TYPE_UNCACHABLE || 185 return type1 == MTRR_TYPE_UNCACHABLE ||
166 type2 == MTRR_TYPE_UNCACHABLE || 186 type2 == MTRR_TYPE_UNCACHABLE ||
167 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || 187 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
@@ -176,10 +196,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 * @type: mtrr type 196 * @type: mtrr type
177 * 197 *
178 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 198 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
179 * 199 *
180 * 1. Send IPI to do the following: 200 * 1. Send IPI to do the following:
181 * 2. Disable Interrupts 201 * 2. Disable Interrupts
182 * 3. Wait for all procs to do so 202 * 3. Wait for all procs to do so
183 * 4. Enter no-fill cache mode 203 * 4. Enter no-fill cache mode
184 * 5. Flush caches 204 * 5. Flush caches
185 * 6. Clear PGE bit 205 * 6. Clear PGE bit
@@ -189,26 +209,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
189 * 10. Enable all range registers 209 * 10. Enable all range registers
190 * 11. Flush all TLBs and caches again 210 * 11. Flush all TLBs and caches again
191 * 12. Enter normal cache mode and reenable caching 211 * 12. Enter normal cache mode and reenable caching
192 * 13. Set PGE 212 * 13. Set PGE
193 * 14. Wait for buddies to catch up 213 * 14. Wait for buddies to catch up
194 * 15. Enable interrupts. 214 * 15. Enable interrupts.
195 * 215 *
196 * What does that mean for us? Well, first we set data.count to the number 216 * What does that mean for us? Well, first we set data.count to the number
197 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 217 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
198 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 218 * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
199 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 219 * Meanwhile, they are waiting for that flag to be set. Once it's set, each
200 * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it 220 * CPU goes through the transition of updating MTRRs.
201 * differently, so we call mtrr_if->set() callback and let them take care of it. 221 * The CPU vendors may each do it differently,
202 * When they're done, they again decrement data->count and wait for data.gate to 222 * so we call mtrr_if->set() callback and let them take care of it.
203 * be reset. 223 * When they're done, they again decrement data->count and wait for data.gate
204 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. 224 * to be reset.
225 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
205 * Everyone then enables interrupts and we all continue on. 226 * Everyone then enables interrupts and we all continue on.
206 * 227 *
207 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 228 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
208 * becomes nops. 229 * becomes nops.
209 */ 230 */
210static void set_mtrr(unsigned int reg, unsigned long base, 231static void
211 unsigned long size, mtrr_type type) 232set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
212{ 233{
213 struct set_mtrr_data data; 234 struct set_mtrr_data data;
214 unsigned long flags; 235 unsigned long flags;
@@ -218,121 +239,124 @@ static void set_mtrr(unsigned int reg, unsigned long base,
218 data.smp_size = size; 239 data.smp_size = size;
219 data.smp_type = type; 240 data.smp_type = type;
220 atomic_set(&data.count, num_booting_cpus() - 1); 241 atomic_set(&data.count, num_booting_cpus() - 1);
221 /* make sure data.count is visible before unleashing other CPUs */ 242
243 /* Make sure data.count is visible before unleashing other CPUs */
222 smp_wmb(); 244 smp_wmb();
223 atomic_set(&data.gate,0); 245 atomic_set(&data.gate, 0);
224 246
225 /* Start the ball rolling on other CPUs */ 247 /* Start the ball rolling on other CPUs */
226 if (smp_call_function(ipi_handler, &data, 0) != 0) 248 if (smp_call_function(ipi_handler, &data, 0) != 0)
227 panic("mtrr: timed out waiting for other CPUs\n"); 249 panic("mtrr: timed out waiting for other CPUs\n");
228 250
229 local_irq_save(flags); 251 local_irq_save(flags);
230 252
231 while(atomic_read(&data.count)) 253 while (atomic_read(&data.count))
232 cpu_relax(); 254 cpu_relax();
233 255
234 /* ok, reset count and toggle gate */ 256 /* Ok, reset count and toggle gate */
235 atomic_set(&data.count, num_booting_cpus() - 1); 257 atomic_set(&data.count, num_booting_cpus() - 1);
236 smp_wmb(); 258 smp_wmb();
237 atomic_set(&data.gate,1); 259 atomic_set(&data.gate, 1);
238 260
239 /* do our MTRR business */ 261 /* Do our MTRR business */
240 262
241 /* HACK! 263 /*
264 * HACK!
242 * We use this same function to initialize the mtrrs on boot. 265 * We use this same function to initialize the mtrrs on boot.
243 * The state of the boot cpu's mtrrs has been saved, and we want 266 * The state of the boot cpu's mtrrs has been saved, and we want
244 * to replicate across all the APs. 267 * to replicate across all the APs.
245 * If we're doing that @reg is set to something special... 268 * If we're doing that @reg is set to something special...
246 */ 269 */
247 if (reg != ~0U) 270 if (reg != ~0U)
248 mtrr_if->set(reg,base,size,type); 271 mtrr_if->set(reg, base, size, type);
272 else if (!mtrr_aps_delayed_init)
273 mtrr_if->set_all();
249 274
250 /* wait for the others */ 275 /* Wait for the others */
251 while(atomic_read(&data.count)) 276 while (atomic_read(&data.count))
252 cpu_relax(); 277 cpu_relax();
253 278
254 atomic_set(&data.count, num_booting_cpus() - 1); 279 atomic_set(&data.count, num_booting_cpus() - 1);
255 smp_wmb(); 280 smp_wmb();
256 atomic_set(&data.gate,0); 281 atomic_set(&data.gate, 0);
257 282
258 /* 283 /*
259 * Wait here for everyone to have seen the gate change 284 * Wait here for everyone to have seen the gate change
260 * So we're the last ones to touch 'data' 285 * So we're the last ones to touch 'data'
261 */ 286 */
262 while(atomic_read(&data.count)) 287 while (atomic_read(&data.count))
263 cpu_relax(); 288 cpu_relax();
264 289
265 local_irq_restore(flags); 290 local_irq_restore(flags);
266} 291}
267 292
268/** 293/**
269 * mtrr_add_page - Add a memory type region 294 * mtrr_add_page - Add a memory type region
270 * @base: Physical base address of region in pages (in units of 4 kB!) 295 * @base: Physical base address of region in pages (in units of 4 kB!)
271 * @size: Physical size of region in pages (4 kB) 296 * @size: Physical size of region in pages (4 kB)
272 * @type: Type of MTRR desired 297 * @type: Type of MTRR desired
273 * @increment: If this is true do usage counting on the region 298 * @increment: If this is true do usage counting on the region
274 * 299 *
275 * Memory type region registers control the caching on newer Intel and 300 * Memory type region registers control the caching on newer Intel and
276 * non Intel processors. This function allows drivers to request an 301 * non Intel processors. This function allows drivers to request an
277 * MTRR is added. The details and hardware specifics of each processor's 302 * MTRR is added. The details and hardware specifics of each processor's
278 * implementation are hidden from the caller, but nevertheless the 303 * implementation are hidden from the caller, but nevertheless the
279 * caller should expect to need to provide a power of two size on an 304 * caller should expect to need to provide a power of two size on an
280 * equivalent power of two boundary. 305 * equivalent power of two boundary.
281 * 306 *
282 * If the region cannot be added either because all regions are in use 307 * If the region cannot be added either because all regions are in use
283 * or the CPU cannot support it a negative value is returned. On success 308 * or the CPU cannot support it a negative value is returned. On success
284 * the register number for this entry is returned, but should be treated 309 * the register number for this entry is returned, but should be treated
285 * as a cookie only. 310 * as a cookie only.
286 * 311 *
287 * On a multiprocessor machine the changes are made to all processors. 312 * On a multiprocessor machine the changes are made to all processors.
288 * This is required on x86 by the Intel processors. 313 * This is required on x86 by the Intel processors.
289 * 314 *
290 * The available types are 315 * The available types are
291 * 316 *
292 * %MTRR_TYPE_UNCACHABLE - No caching 317 * %MTRR_TYPE_UNCACHABLE - No caching
293 * 318 *
294 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever 319 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
295 * 320 *
296 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts 321 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
297 * 322 *
298 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes 323 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
299 * 324 *
300 * BUGS: Needs a quiet flag for the cases where drivers do not mind 325 * BUGS: Needs a quiet flag for the cases where drivers do not mind
301 * failures and do not wish system log messages to be sent. 326 * failures and do not wish system log messages to be sent.
302 */ 327 */
303 328int mtrr_add_page(unsigned long base, unsigned long size,
304int mtrr_add_page(unsigned long base, unsigned long size,
305 unsigned int type, bool increment) 329 unsigned int type, bool increment)
306{ 330{
331 unsigned long lbase, lsize;
307 int i, replace, error; 332 int i, replace, error;
308 mtrr_type ltype; 333 mtrr_type ltype;
309 unsigned long lbase, lsize;
310 334
311 if (!mtrr_if) 335 if (!mtrr_if)
312 return -ENXIO; 336 return -ENXIO;
313 337
314 if ((error = mtrr_if->validate_add_page(base,size,type))) 338 error = mtrr_if->validate_add_page(base, size, type);
339 if (error)
315 return error; 340 return error;
316 341
317 if (type >= MTRR_NUM_TYPES) { 342 if (type >= MTRR_NUM_TYPES) {
318 printk(KERN_WARNING "mtrr: type: %u invalid\n", type); 343 pr_warning("mtrr: type: %u invalid\n", type);
319 return -EINVAL; 344 return -EINVAL;
320 } 345 }
321 346
322 /* If the type is WC, check that this processor supports it */ 347 /* If the type is WC, check that this processor supports it */
323 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { 348 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
324 printk(KERN_WARNING 349 pr_warning("mtrr: your processor doesn't support write-combining\n");
325 "mtrr: your processor doesn't support write-combining\n");
326 return -ENOSYS; 350 return -ENOSYS;
327 } 351 }
328 352
329 if (!size) { 353 if (!size) {
330 printk(KERN_WARNING "mtrr: zero sized request\n"); 354 pr_warning("mtrr: zero sized request\n");
331 return -EINVAL; 355 return -EINVAL;
332 } 356 }
333 357
334 if (base & size_or_mask || size & size_or_mask) { 358 if (base & size_or_mask || size & size_or_mask) {
335 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 359 pr_warning("mtrr: base or size exceeds the MTRR width\n");
336 return -EINVAL; 360 return -EINVAL;
337 } 361 }
338 362
@@ -341,36 +365,40 @@ int mtrr_add_page(unsigned long base, unsigned long size,
341 365
342 /* No CPU hotplug when we change MTRR entries */ 366 /* No CPU hotplug when we change MTRR entries */
343 get_online_cpus(); 367 get_online_cpus();
344 /* Search for existing MTRR */ 368
369 /* Search for existing MTRR */
345 mutex_lock(&mtrr_mutex); 370 mutex_lock(&mtrr_mutex);
346 for (i = 0; i < num_var_ranges; ++i) { 371 for (i = 0; i < num_var_ranges; ++i) {
347 mtrr_if->get(i, &lbase, &lsize, &ltype); 372 mtrr_if->get(i, &lbase, &lsize, &ltype);
348 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) 373 if (!lsize || base > lbase + lsize - 1 ||
374 base + size - 1 < lbase)
349 continue; 375 continue;
350 /* At this point we know there is some kind of overlap/enclosure */ 376 /*
377 * At this point we know there is some kind of
378 * overlap/enclosure
379 */
351 if (base < lbase || base + size - 1 > lbase + lsize - 1) { 380 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
352 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { 381 if (base <= lbase &&
382 base + size - 1 >= lbase + lsize - 1) {
353 /* New region encloses an existing region */ 383 /* New region encloses an existing region */
354 if (type == ltype) { 384 if (type == ltype) {
355 replace = replace == -1 ? i : -2; 385 replace = replace == -1 ? i : -2;
356 continue; 386 continue;
357 } 387 } else if (types_compatible(type, ltype))
358 else if (types_compatible(type, ltype))
359 continue; 388 continue;
360 } 389 }
361 printk(KERN_WARNING 390 pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
362 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 391 " 0x%lx000,0x%lx000\n", base, size, lbase,
363 " 0x%lx000,0x%lx000\n", base, size, lbase, 392 lsize);
364 lsize);
365 goto out; 393 goto out;
366 } 394 }
367 /* New region is enclosed by an existing region */ 395 /* New region is enclosed by an existing region */
368 if (ltype != type) { 396 if (ltype != type) {
369 if (types_compatible(type, ltype)) 397 if (types_compatible(type, ltype))
370 continue; 398 continue;
371 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 399 pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
372 base, size, mtrr_attrib_to_str(ltype), 400 base, size, mtrr_attrib_to_str(ltype),
373 mtrr_attrib_to_str(type)); 401 mtrr_attrib_to_str(type));
374 goto out; 402 goto out;
375 } 403 }
376 if (increment) 404 if (increment)
@@ -378,7 +406,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
378 error = i; 406 error = i;
379 goto out; 407 goto out;
380 } 408 }
381 /* Search for an empty MTRR */ 409 /* Search for an empty MTRR */
382 i = mtrr_if->get_free_region(base, size, replace); 410 i = mtrr_if->get_free_region(base, size, replace);
383 if (i >= 0) { 411 if (i >= 0) {
384 set_mtrr(i, base, size, type); 412 set_mtrr(i, base, size, type);
@@ -393,8 +421,9 @@ int mtrr_add_page(unsigned long base, unsigned long size,
393 mtrr_usage_table[replace] = 0; 421 mtrr_usage_table[replace] = 0;
394 } 422 }
395 } 423 }
396 } else 424 } else {
397 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 425 pr_info("mtrr: no more MTRRs available\n");
426 }
398 error = i; 427 error = i;
399 out: 428 out:
400 mutex_unlock(&mtrr_mutex); 429 mutex_unlock(&mtrr_mutex);
@@ -405,10 +434,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
405static int mtrr_check(unsigned long base, unsigned long size) 434static int mtrr_check(unsigned long base, unsigned long size)
406{ 435{
407 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { 436 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
408 printk(KERN_WARNING 437 pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
409 "mtrr: size and base must be multiples of 4 kiB\n"); 438 pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
410 printk(KERN_DEBUG
411 "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
412 dump_stack(); 439 dump_stack();
413 return -1; 440 return -1;
414 } 441 }
@@ -416,66 +443,64 @@ static int mtrr_check(unsigned long base, unsigned long size)
416} 443}
417 444
418/** 445/**
419 * mtrr_add - Add a memory type region 446 * mtrr_add - Add a memory type region
420 * @base: Physical base address of region 447 * @base: Physical base address of region
421 * @size: Physical size of region 448 * @size: Physical size of region
422 * @type: Type of MTRR desired 449 * @type: Type of MTRR desired
423 * @increment: If this is true do usage counting on the region 450 * @increment: If this is true do usage counting on the region
424 * 451 *
425 * Memory type region registers control the caching on newer Intel and 452 * Memory type region registers control the caching on newer Intel and
426 * non Intel processors. This function allows drivers to request an 453 * non Intel processors. This function allows drivers to request an
427 * MTRR is added. The details and hardware specifics of each processor's 454 * MTRR is added. The details and hardware specifics of each processor's
428 * implementation are hidden from the caller, but nevertheless the 455 * implementation are hidden from the caller, but nevertheless the
429 * caller should expect to need to provide a power of two size on an 456 * caller should expect to need to provide a power of two size on an
430 * equivalent power of two boundary. 457 * equivalent power of two boundary.
431 * 458 *
432 * If the region cannot be added either because all regions are in use 459 * If the region cannot be added either because all regions are in use
433 * or the CPU cannot support it a negative value is returned. On success 460 * or the CPU cannot support it a negative value is returned. On success
434 * the register number for this entry is returned, but should be treated 461 * the register number for this entry is returned, but should be treated
435 * as a cookie only. 462 * as a cookie only.
436 * 463 *
437 * On a multiprocessor machine the changes are made to all processors. 464 * On a multiprocessor machine the changes are made to all processors.
438 * This is required on x86 by the Intel processors. 465 * This is required on x86 by the Intel processors.
439 * 466 *
440 * The available types are 467 * The available types are
441 * 468 *
442 * %MTRR_TYPE_UNCACHABLE - No caching 469 * %MTRR_TYPE_UNCACHABLE - No caching
443 * 470 *
444 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever 471 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
445 * 472 *
446 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts 473 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
447 * 474 *
448 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes 475 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
449 * 476 *
450 * BUGS: Needs a quiet flag for the cases where drivers do not mind 477 * BUGS: Needs a quiet flag for the cases where drivers do not mind
451 * failures and do not wish system log messages to be sent. 478 * failures and do not wish system log messages to be sent.
452 */ 479 */
453 480int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
454int 481 bool increment)
455mtrr_add(unsigned long base, unsigned long size, unsigned int type,
456 bool increment)
457{ 482{
458 if (mtrr_check(base, size)) 483 if (mtrr_check(base, size))
459 return -EINVAL; 484 return -EINVAL;
460 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, 485 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
461 increment); 486 increment);
462} 487}
488EXPORT_SYMBOL(mtrr_add);
463 489
464/** 490/**
465 * mtrr_del_page - delete a memory type region 491 * mtrr_del_page - delete a memory type region
466 * @reg: Register returned by mtrr_add 492 * @reg: Register returned by mtrr_add
467 * @base: Physical base address 493 * @base: Physical base address
468 * @size: Size of region 494 * @size: Size of region
469 * 495 *
470 * If register is supplied then base and size are ignored. This is 496 * If register is supplied then base and size are ignored. This is
471 * how drivers should call it. 497 * how drivers should call it.
472 * 498 *
473 * Releases an MTRR region. If the usage count drops to zero the 499 * Releases an MTRR region. If the usage count drops to zero the
474 * register is freed and the region returns to default state. 500 * register is freed and the region returns to default state.
475 * On success the register is returned, on failure a negative error 501 * On success the register is returned, on failure a negative error
476 * code. 502 * code.
477 */ 503 */
478
479int mtrr_del_page(int reg, unsigned long base, unsigned long size) 504int mtrr_del_page(int reg, unsigned long base, unsigned long size)
480{ 505{
481 int i, max; 506 int i, max;
@@ -500,22 +525,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
500 } 525 }
501 } 526 }
502 if (reg < 0) { 527 if (reg < 0) {
503 printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, 528 pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
504 size); 529 base, size);
505 goto out; 530 goto out;
506 } 531 }
507 } 532 }
508 if (reg >= max) { 533 if (reg >= max) {
509 printk(KERN_WARNING "mtrr: register: %d too big\n", reg); 534 pr_warning("mtrr: register: %d too big\n", reg);
510 goto out; 535 goto out;
511 } 536 }
512 mtrr_if->get(reg, &lbase, &lsize, &ltype); 537 mtrr_if->get(reg, &lbase, &lsize, &ltype);
513 if (lsize < 1) { 538 if (lsize < 1) {
514 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); 539 pr_warning("mtrr: MTRR %d not used\n", reg);
515 goto out; 540 goto out;
516 } 541 }
517 if (mtrr_usage_table[reg] < 1) { 542 if (mtrr_usage_table[reg] < 1) {
518 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); 543 pr_warning("mtrr: reg: %d has count=0\n", reg);
519 goto out; 544 goto out;
520 } 545 }
521 if (--mtrr_usage_table[reg] < 1) 546 if (--mtrr_usage_table[reg] < 1)
@@ -526,33 +551,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
526 put_online_cpus(); 551 put_online_cpus();
527 return error; 552 return error;
528} 553}
554
529/** 555/**
530 * mtrr_del - delete a memory type region 556 * mtrr_del - delete a memory type region
531 * @reg: Register returned by mtrr_add 557 * @reg: Register returned by mtrr_add
532 * @base: Physical base address 558 * @base: Physical base address
533 * @size: Size of region 559 * @size: Size of region
534 * 560 *
535 * If register is supplied then base and size are ignored. This is 561 * If register is supplied then base and size are ignored. This is
536 * how drivers should call it. 562 * how drivers should call it.
537 * 563 *
538 * Releases an MTRR region. If the usage count drops to zero the 564 * Releases an MTRR region. If the usage count drops to zero the
539 * register is freed and the region returns to default state. 565 * register is freed and the region returns to default state.
540 * On success the register is returned, on failure a negative error 566 * On success the register is returned, on failure a negative error
541 * code. 567 * code.
542 */ 568 */
543 569int mtrr_del(int reg, unsigned long base, unsigned long size)
544int
545mtrr_del(int reg, unsigned long base, unsigned long size)
546{ 570{
547 if (mtrr_check(base, size)) 571 if (mtrr_check(base, size))
548 return -EINVAL; 572 return -EINVAL;
549 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); 573 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
550} 574}
551
552EXPORT_SYMBOL(mtrr_add);
553EXPORT_SYMBOL(mtrr_del); 575EXPORT_SYMBOL(mtrr_del);
554 576
555/* HACK ALERT! 577/*
578 * HACK ALERT!
556 * These should be called implicitly, but we can't yet until all the initcall 579 * These should be called implicitly, but we can't yet until all the initcall
557 * stuff is done... 580 * stuff is done...
558 */ 581 */
@@ -576,29 +599,28 @@ struct mtrr_value {
576 599
577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 600static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 601
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 602static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
580{ 603{
581 int i; 604 int i;
582 605
583 for (i = 0; i < num_var_ranges; i++) { 606 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 607 mtrr_if->get(i, &mtrr_value[i].lbase,
585 &mtrr_value[i].lbase, 608 &mtrr_value[i].lsize,
586 &mtrr_value[i].lsize, 609 &mtrr_value[i].ltype);
587 &mtrr_value[i].ltype);
588 } 610 }
589 return 0; 611 return 0;
590} 612}
591 613
592static int mtrr_restore(struct sys_device * sysdev) 614static int mtrr_restore(struct sys_device *sysdev)
593{ 615{
594 int i; 616 int i;
595 617
596 for (i = 0; i < num_var_ranges; i++) { 618 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_value[i].lsize) 619 if (mtrr_value[i].lsize) {
598 set_mtrr(i, 620 set_mtrr(i, mtrr_value[i].lbase,
599 mtrr_value[i].lbase, 621 mtrr_value[i].lsize,
600 mtrr_value[i].lsize, 622 mtrr_value[i].ltype);
601 mtrr_value[i].ltype); 623 }
602 } 624 }
603 return 0; 625 return 0;
604} 626}
@@ -615,26 +637,29 @@ int __initdata changed_by_mtrr_cleanup;
615/** 637/**
616 * mtrr_bp_init - initialize mtrrs on the boot CPU 638 * mtrr_bp_init - initialize mtrrs on the boot CPU
617 * 639 *
618 * This needs to be called early; before any of the other CPUs are 640 * This needs to be called early; before any of the other CPUs are
619 * initialized (i.e. before smp_init()). 641 * initialized (i.e. before smp_init()).
620 * 642 *
621 */ 643 */
622void __init mtrr_bp_init(void) 644void __init mtrr_bp_init(void)
623{ 645{
624 u32 phys_addr; 646 u32 phys_addr;
647
625 init_ifs(); 648 init_ifs();
626 649
627 phys_addr = 32; 650 phys_addr = 32;
628 651
629 if (cpu_has_mtrr) { 652 if (cpu_has_mtrr) {
630 mtrr_if = &generic_mtrr_ops; 653 mtrr_if = &generic_mtrr_ops;
631 size_or_mask = 0xff000000; /* 36 bits */ 654 size_or_mask = 0xff000000; /* 36 bits */
632 size_and_mask = 0x00f00000; 655 size_and_mask = 0x00f00000;
633 phys_addr = 36; 656 phys_addr = 36;
634 657
635 /* This is an AMD specific MSR, but we assume(hope?) that 658 /*
636 Intel will implement it to when they extend the address 659 * This is an AMD specific MSR, but we assume(hope?) that
637 bus of the Xeon. */ 660 * Intel will implement it to when they extend the address
661 * bus of the Xeon.
662 */
638 if (cpuid_eax(0x80000000) >= 0x80000008) { 663 if (cpuid_eax(0x80000000) >= 0x80000008) {
639 phys_addr = cpuid_eax(0x80000008) & 0xff; 664 phys_addr = cpuid_eax(0x80000008) & 0xff;
640 /* CPUID workaround for Intel 0F33/0F34 CPU */ 665 /* CPUID workaround for Intel 0F33/0F34 CPU */
@@ -649,9 +674,11 @@ void __init mtrr_bp_init(void)
649 size_and_mask = ~size_or_mask & 0xfffff00000ULL; 674 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
650 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && 675 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
651 boot_cpu_data.x86 == 6) { 676 boot_cpu_data.x86 == 6) {
652 /* VIA C* family have Intel style MTRRs, but 677 /*
653 don't support PAE */ 678 * VIA C* family have Intel style MTRRs,
654 size_or_mask = 0xfff00000; /* 32 bits */ 679 * but don't support PAE
680 */
681 size_or_mask = 0xfff00000; /* 32 bits */
655 size_and_mask = 0; 682 size_and_mask = 0;
656 phys_addr = 32; 683 phys_addr = 32;
657 } 684 }
@@ -694,30 +721,28 @@ void __init mtrr_bp_init(void)
694 changed_by_mtrr_cleanup = 1; 721 changed_by_mtrr_cleanup = 1;
695 mtrr_if->set_all(); 722 mtrr_if->set_all();
696 } 723 }
697
698 } 724 }
699 } 725 }
700} 726}
701 727
702void mtrr_ap_init(void) 728void mtrr_ap_init(void)
703{ 729{
704 unsigned long flags; 730 if (!use_intel() || mtrr_aps_delayed_init)
705
706 if (!mtrr_if || !use_intel())
707 return; 731 return;
708 /* 732 /*
709 * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, 733 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
710 * but this routine will be called in cpu boot time, holding the lock 734 * changed, but this routine will be called in cpu boot time,
711 * breaks it. This routine is called in two cases: 1.very earily time 735 * holding the lock breaks it.
712 * of software resume, when there absolutely isn't mtrr entry changes; 736 *
713 * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to 737 * This routine is called in two cases:
714 * prevent mtrr entry changes 738 *
739 * 1. very earily time of software resume, when there absolutely
740 * isn't mtrr entry changes;
741 *
742 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
743 * lock to prevent mtrr entry changes
715 */ 744 */
716 local_irq_save(flags); 745 set_mtrr(~0U, 0, 0, 0);
717
718 mtrr_if->set_all();
719
720 local_irq_restore(flags);
721} 746}
722 747
723/** 748/**
@@ -728,23 +753,55 @@ void mtrr_save_state(void)
728 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
729} 754}
730 755
756void set_mtrr_aps_delayed_init(void)
757{
758 if (!use_intel())
759 return;
760
761 mtrr_aps_delayed_init = true;
762}
763
764/*
765 * MTRR initialization for all AP's
766 */
767void mtrr_aps_init(void)
768{
769 if (!use_intel())
770 return;
771
772 set_mtrr(~0U, 0, 0, 0);
773 mtrr_aps_delayed_init = false;
774}
775
776void mtrr_bp_restore(void)
777{
778 if (!use_intel())
779 return;
780
781 mtrr_if->set_all();
782}
783
731static int __init mtrr_init_finialize(void) 784static int __init mtrr_init_finialize(void)
732{ 785{
733 if (!mtrr_if) 786 if (!mtrr_if)
734 return 0; 787 return 0;
788
735 if (use_intel()) { 789 if (use_intel()) {
736 if (!changed_by_mtrr_cleanup) 790 if (!changed_by_mtrr_cleanup)
737 mtrr_state_warn(); 791 mtrr_state_warn();
738 } else { 792 return 0;
739 /* The CPUs haven't MTRR and seem to not support SMP. They have
740 * specific drivers, we use a tricky method to support
741 * suspend/resume for them.
742 * TBD: is there any system with such CPU which supports
743 * suspend/resume? if no, we should remove the code.
744 */
745 sysdev_driver_register(&cpu_sysdev_class,
746 &mtrr_sysdev_driver);
747 } 793 }
794
795 /*
796 * The CPU has no MTRR and seems to not support SMP. They have
797 * specific drivers, we use a tricky method to support
798 * suspend/resume for them.
799 *
800 * TBD: is there any system with such CPU which supports
801 * suspend/resume? If no, we should remove the code.
802 */
803 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver);
804
748 return 0; 805 return 0;
749} 806}
750subsys_initcall(mtrr_init_finialize); 807subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 7538b767f206..a501dee9a87a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * local mtrr defines. 2 * local MTRR defines.
3 */ 3 */
4 4
5#include <linux/types.h> 5#include <linux/types.h>
@@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
14struct mtrr_ops { 14struct mtrr_ops {
15 u32 vendor; 15 u32 vendor;
16 u32 use_intel_if; 16 u32 use_intel_if;
17// void (*init)(void);
18 void (*set)(unsigned int reg, unsigned long base, 17 void (*set)(unsigned int reg, unsigned long base,
19 unsigned long size, mtrr_type type); 18 unsigned long size, mtrr_type type);
20 void (*set_all)(void); 19 void (*set_all)(void);
21 20
22 void (*get)(unsigned int reg, unsigned long *base, 21 void (*get)(unsigned int reg, unsigned long *base,
23 unsigned long *size, mtrr_type * type); 22 unsigned long *size, mtrr_type *type);
24 int (*get_free_region)(unsigned long base, unsigned long size, 23 int (*get_free_region)(unsigned long base, unsigned long size,
25 int replace_reg); 24 int replace_reg);
26 int (*validate_add_page)(unsigned long base, unsigned long size, 25 int (*validate_add_page)(unsigned long base, unsigned long size,
@@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void);
39 38
40/* library functions for processor-specific routines */ 39/* library functions for processor-specific routines */
41struct set_mtrr_context { 40struct set_mtrr_context {
42 unsigned long flags; 41 unsigned long flags;
43 unsigned long cr4val; 42 unsigned long cr4val;
44 u32 deftype_lo; 43 u32 deftype_lo;
45 u32 deftype_hi; 44 u32 deftype_hi;
46 u32 ccr3; 45 u32 ccr3;
47}; 46};
48 47
49void set_mtrr_done(struct set_mtrr_context *ctxt); 48void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
54 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
55void get_mtrr_state(void); 54void get_mtrr_state(void);
56 55
57extern void set_mtrr_ops(struct mtrr_ops * ops); 56extern void set_mtrr_ops(struct mtrr_ops *ops);
58 57
59extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
60extern struct mtrr_ops * mtrr_if; 59extern struct mtrr_ops *mtrr_if;
61 60
62#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
63#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 1f5fb1588d1f..dfc80b4e6b0d 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -1,24 +1,25 @@
1#include <linux/mm.h>
2#include <linux/init.h> 1#include <linux/init.h>
3#include <asm/io.h> 2#include <linux/io.h>
4#include <asm/mtrr.h> 3#include <linux/mm.h>
5#include <asm/msr.h> 4
6#include <asm/processor-cyrix.h> 5#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
8#include "mtrr.h" 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "mtrr.h"
10 11
11/* Put the processor into a state where MTRRs can be safely set */ 12/* Put the processor into a state where MTRRs can be safely set */
12void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) 13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
13{ 14{
14 unsigned int cr0; 15 unsigned int cr0;
15 16
16 /* Disable interrupts locally */ 17 /* Disable interrupts locally */
17 local_irq_save(ctxt->flags); 18 local_irq_save(ctxt->flags);
18 19
19 if (use_intel() || is_cpu(CYRIX)) { 20 if (use_intel() || is_cpu(CYRIX)) {
20 21
21 /* Save value of CR4 and clear Page Global Enable (bit 7) */ 22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
22 if (cpu_has_pge) { 23 if (cpu_has_pge) {
23 ctxt->cr4val = read_cr4(); 24 ctxt->cr4val = read_cr4();
24 write_cr4(ctxt->cr4val & ~X86_CR4_PGE); 25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
@@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
33 write_cr0(cr0); 34 write_cr0(cr0);
34 wbinvd(); 35 wbinvd();
35 36
36 if (use_intel()) 37 if (use_intel()) {
37 /* Save MTRR state */ 38 /* Save MTRR state */
38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); 39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 40 } else {
40 /* Cyrix ARRs - everything else were excluded at the top */ 41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
42 } 47 }
43} 48}
44 49
45void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) 50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 51{
47 if (use_intel()) 52 if (use_intel()) {
48 /* Disable MTRRs, and set the default type to uncached */ 53 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, 54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 55 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 56 } else {
52 /* Cyrix ARRs - everything else were excluded at the top */ 57 if (is_cpu(CYRIX)) {
53 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); 58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
54} 62}
55 63
56/* Restore the processor after a set_mtrr_prepare */ 64/* Restore the processor after a set_mtrr_prepare */
57void set_mtrr_done(struct set_mtrr_context *ctxt) 65void set_mtrr_done(struct set_mtrr_context *ctxt)
58{ 66{
59 if (use_intel() || is_cpu(CYRIX)) { 67 if (use_intel() || is_cpu(CYRIX)) {
60 68
61 /* Flush caches and TLBs */ 69 /* Flush caches and TLBs */
62 wbinvd(); 70 wbinvd();
63 71
64 /* Restore MTRRdefType */ 72 /* Restore MTRRdefType */
65 if (use_intel()) 73 if (use_intel()) {
66 /* Intel (P6) standard MTRRs */ 74 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); 75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
68 else 76 ctxt->deftype_hi);
69 /* Cyrix ARRs - everything else was excluded at the top */ 77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
70 setCx86(CX86_CCR3, ctxt->ccr3); 82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
71 84
72 /* Enable caches */ 85 /* Enable caches */
73 write_cr0(read_cr0() & 0xbfffffff); 86 write_cr0(read_cr0() & 0xbfffffff);
74 87
75 /* Restore value of CR4 */ 88 /* Restore value of CR4 */
76 if (cpu_has_pge) 89 if (cpu_has_pge)
77 write_cr4(ctxt->cr4val); 90 write_cr4(ctxt->cr4val);
78 } 91 }
79 /* Re-enable interrupts locally (if enabled previously) */ 92 /* Re-enable interrupts locally (if enabled previously) */
80 local_irq_restore(ctxt->flags); 93 local_irq_restore(ctxt->flags);
81} 94}
82
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c
index 900332b800f8..b5801c311846 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1,16 +1,17 @@
1/* 1/*
2 * Performance counter x86 architecture code 2 * Performance events x86 architecture code
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput 6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
9 * 10 *
10 * For licencing details see kernel-base/COPYING 11 * For licencing details see kernel-base/COPYING
11 */ 12 */
12 13
13#include <linux/perf_counter.h> 14#include <linux/perf_event.h>
14#include <linux/capability.h> 15#include <linux/capability.h>
15#include <linux/notifier.h> 16#include <linux/notifier.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
@@ -20,19 +21,60 @@
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/uaccess.h> 22#include <linux/uaccess.h>
22#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/cpu.h>
23 25
24#include <asm/apic.h> 26#include <asm/apic.h>
25#include <asm/stacktrace.h> 27#include <asm/stacktrace.h>
26#include <asm/nmi.h> 28#include <asm/nmi.h>
27 29
28static u64 perf_counter_mask __read_mostly; 30static u64 perf_event_mask __read_mostly;
29 31
30struct cpu_hw_counters { 32/* The maximal number of PEBS events: */
31 struct perf_counter *counters[X86_PMC_IDX_MAX]; 33#define MAX_PEBS_EVENTS 4
34
35/* The size of a BTS record in bytes: */
36#define BTS_RECORD_SIZE 24
37
38/* The size of a per-cpu BTS buffer in bytes: */
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
40
41/* The BTS overflow threshold in bytes from the end of the buffer: */
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
43
44
45/*
46 * Bits in the debugctlmsr controlling branch tracing.
47 */
48#define X86_DEBUGCTL_TR (1 << 6)
49#define X86_DEBUGCTL_BTS (1 << 7)
50#define X86_DEBUGCTL_BTINT (1 << 8)
51#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
52#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
53
54/*
55 * A debug store configuration.
56 *
57 * We only support architectures that use 64bit fields.
58 */
59struct debug_store {
60 u64 bts_buffer_base;
61 u64 bts_index;
62 u64 bts_absolute_maximum;
63 u64 bts_interrupt_threshold;
64 u64 pebs_buffer_base;
65 u64 pebs_index;
66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold;
68 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69};
70
71struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX];
32 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
34 unsigned long interrupts; 75 unsigned long interrupts;
35 int enabled; 76 int enabled;
77 struct debug_store *ds;
36}; 78};
37 79
38/* 80/*
@@ -44,25 +86,27 @@ struct x86_pmu {
44 int (*handle_irq)(struct pt_regs *); 86 int (*handle_irq)(struct pt_regs *);
45 void (*disable_all)(void); 87 void (*disable_all)(void);
46 void (*enable_all)(void); 88 void (*enable_all)(void);
47 void (*enable)(struct hw_perf_counter *, int); 89 void (*enable)(struct hw_perf_event *, int);
48 void (*disable)(struct hw_perf_counter *, int); 90 void (*disable)(struct hw_perf_event *, int);
49 unsigned eventsel; 91 unsigned eventsel;
50 unsigned perfctr; 92 unsigned perfctr;
51 u64 (*event_map)(int); 93 u64 (*event_map)(int);
52 u64 (*raw_event)(u64); 94 u64 (*raw_event)(u64);
53 int max_events; 95 int max_events;
54 int num_counters; 96 int num_events;
55 int num_counters_fixed; 97 int num_events_fixed;
56 int counter_bits; 98 int event_bits;
57 u64 counter_mask; 99 u64 event_mask;
58 int apic; 100 int apic;
59 u64 max_period; 101 u64 max_period;
60 u64 intel_ctrl; 102 u64 intel_ctrl;
103 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void);
61}; 105};
62 106
63static struct x86_pmu x86_pmu __read_mostly; 107static struct x86_pmu x86_pmu __read_mostly;
64 108
65static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { 109static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
66 .enabled = 1, 110 .enabled = 1,
67}; 111};
68 112
@@ -80,35 +124,35 @@ static const u64 p6_perfmon_event_map[] =
80 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, 124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
81}; 125};
82 126
83static u64 p6_pmu_event_map(int event) 127static u64 p6_pmu_event_map(int hw_event)
84{ 128{
85 return p6_perfmon_event_map[event]; 129 return p6_perfmon_event_map[hw_event];
86} 130}
87 131
88/* 132/*
89 * Counter setting that is specified not to count anything. 133 * Event setting that is specified not to count anything.
90 * We use this to effectively disable a counter. 134 * We use this to effectively disable a counter.
91 * 135 *
92 * L2_RQSTS with 0 MESI unit mask. 136 * L2_RQSTS with 0 MESI unit mask.
93 */ 137 */
94#define P6_NOP_COUNTER 0x0000002EULL 138#define P6_NOP_EVENT 0x0000002EULL
95 139
96static u64 p6_pmu_raw_event(u64 event) 140static u64 p6_pmu_raw_event(u64 hw_event)
97{ 141{
98#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL 142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
99#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL 143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
100#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL 144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
101#define P6_EVNTSEL_INV_MASK 0x00800000ULL 145#define P6_EVNTSEL_INV_MASK 0x00800000ULL
102#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL 146#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
103 147
104#define P6_EVNTSEL_MASK \ 148#define P6_EVNTSEL_MASK \
105 (P6_EVNTSEL_EVENT_MASK | \ 149 (P6_EVNTSEL_EVENT_MASK | \
106 P6_EVNTSEL_UNIT_MASK | \ 150 P6_EVNTSEL_UNIT_MASK | \
107 P6_EVNTSEL_EDGE_MASK | \ 151 P6_EVNTSEL_EDGE_MASK | \
108 P6_EVNTSEL_INV_MASK | \ 152 P6_EVNTSEL_INV_MASK | \
109 P6_EVNTSEL_COUNTER_MASK) 153 P6_EVNTSEL_REG_MASK)
110 154
111 return event & P6_EVNTSEL_MASK; 155 return hw_event & P6_EVNTSEL_MASK;
112} 156}
113 157
114 158
@@ -126,16 +170,16 @@ static const u64 intel_perfmon_event_map[] =
126 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
127}; 171};
128 172
129static u64 intel_pmu_event_map(int event) 173static u64 intel_pmu_event_map(int hw_event)
130{ 174{
131 return intel_perfmon_event_map[event]; 175 return intel_perfmon_event_map[hw_event];
132} 176}
133 177
134/* 178/*
135 * Generalized hw caching related event table, filled 179 * Generalized hw caching related hw_event table, filled
136 * in on a per model basis. A value of 0 means 180 * in on a per model basis. A value of 0 means
137 * 'not supported', -1 means 'event makes no sense on 181 * 'not supported', -1 means 'hw_event makes no sense on
138 * this CPU', any other value means the raw event 182 * this CPU', any other value means the raw hw_event
139 * ID. 183 * ID.
140 */ 184 */
141 185
@@ -419,22 +463,22 @@ static const u64 atom_hw_cache_event_ids
419 }, 463 },
420}; 464};
421 465
422static u64 intel_pmu_raw_event(u64 event) 466static u64 intel_pmu_raw_event(u64 hw_event)
423{ 467{
424#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL 468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
425#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
426#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
427#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
428#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL 472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
429 473
430#define CORE_EVNTSEL_MASK \ 474#define CORE_EVNTSEL_MASK \
431 (CORE_EVNTSEL_EVENT_MASK | \ 475 (CORE_EVNTSEL_EVENT_MASK | \
432 CORE_EVNTSEL_UNIT_MASK | \ 476 CORE_EVNTSEL_UNIT_MASK | \
433 CORE_EVNTSEL_EDGE_MASK | \ 477 CORE_EVNTSEL_EDGE_MASK | \
434 CORE_EVNTSEL_INV_MASK | \ 478 CORE_EVNTSEL_INV_MASK | \
435 CORE_EVNTSEL_COUNTER_MASK) 479 CORE_EVNTSEL_REG_MASK)
436 480
437 return event & CORE_EVNTSEL_MASK; 481 return hw_event & CORE_EVNTSEL_MASK;
438} 482}
439 483
440static const u64 amd_hw_cache_event_ids 484static const u64 amd_hw_cache_event_ids
@@ -541,52 +585,55 @@ static const u64 amd_perfmon_event_map[] =
541 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
542}; 586};
543 587
544static u64 amd_pmu_event_map(int event) 588static u64 amd_pmu_event_map(int hw_event)
545{ 589{
546 return amd_perfmon_event_map[event]; 590 return amd_perfmon_event_map[hw_event];
547} 591}
548 592
549static u64 amd_pmu_raw_event(u64 event) 593static u64 amd_pmu_raw_event(u64 hw_event)
550{ 594{
551#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL 595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
552#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
553#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
554#define K7_EVNTSEL_INV_MASK 0x000800000ULL 598#define K7_EVNTSEL_INV_MASK 0x000800000ULL
555#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL 599#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
556 600
557#define K7_EVNTSEL_MASK \ 601#define K7_EVNTSEL_MASK \
558 (K7_EVNTSEL_EVENT_MASK | \ 602 (K7_EVNTSEL_EVENT_MASK | \
559 K7_EVNTSEL_UNIT_MASK | \ 603 K7_EVNTSEL_UNIT_MASK | \
560 K7_EVNTSEL_EDGE_MASK | \ 604 K7_EVNTSEL_EDGE_MASK | \
561 K7_EVNTSEL_INV_MASK | \ 605 K7_EVNTSEL_INV_MASK | \
562 K7_EVNTSEL_COUNTER_MASK) 606 K7_EVNTSEL_REG_MASK)
563 607
564 return event & K7_EVNTSEL_MASK; 608 return hw_event & K7_EVNTSEL_MASK;
565} 609}
566 610
567/* 611/*
568 * Propagate counter elapsed time into the generic counter. 612 * Propagate event elapsed time into the generic event.
569 * Can only be executed on the CPU where the counter is active. 613 * Can only be executed on the CPU where the event is active.
570 * Returns the delta events processed. 614 * Returns the delta events processed.
571 */ 615 */
572static u64 616static u64
573x86_perf_counter_update(struct perf_counter *counter, 617x86_perf_event_update(struct perf_event *event,
574 struct hw_perf_counter *hwc, int idx) 618 struct hw_perf_event *hwc, int idx)
575{ 619{
576 int shift = 64 - x86_pmu.counter_bits; 620 int shift = 64 - x86_pmu.event_bits;
577 u64 prev_raw_count, new_raw_count; 621 u64 prev_raw_count, new_raw_count;
578 s64 delta; 622 s64 delta;
579 623
624 if (idx == X86_PMC_IDX_FIXED_BTS)
625 return 0;
626
580 /* 627 /*
581 * Careful: an NMI might modify the previous counter value. 628 * Careful: an NMI might modify the previous event value.
582 * 629 *
583 * Our tactic to handle this is to first atomically read and 630 * Our tactic to handle this is to first atomically read and
584 * exchange a new raw count - then add that new-prev delta 631 * exchange a new raw count - then add that new-prev delta
585 * count to the generic counter atomically: 632 * count to the generic event atomically:
586 */ 633 */
587again: 634again:
588 prev_raw_count = atomic64_read(&hwc->prev_count); 635 prev_raw_count = atomic64_read(&hwc->prev_count);
589 rdmsrl(hwc->counter_base + idx, new_raw_count); 636 rdmsrl(hwc->event_base + idx, new_raw_count);
590 637
591 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 638 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
592 new_raw_count) != prev_raw_count) 639 new_raw_count) != prev_raw_count)
@@ -595,7 +642,7 @@ again:
595 /* 642 /*
596 * Now we have the new raw value and have updated the prev 643 * Now we have the new raw value and have updated the prev
597 * timestamp already. We can now calculate the elapsed delta 644 * timestamp already. We can now calculate the elapsed delta
598 * (counter-)time and add that to the generic counter. 645 * (event-)time and add that to the generic event.
599 * 646 *
600 * Careful, not all hw sign-extends above the physical width 647 * Careful, not all hw sign-extends above the physical width
601 * of the count. 648 * of the count.
@@ -603,13 +650,13 @@ again:
603 delta = (new_raw_count << shift) - (prev_raw_count << shift); 650 delta = (new_raw_count << shift) - (prev_raw_count << shift);
604 delta >>= shift; 651 delta >>= shift;
605 652
606 atomic64_add(delta, &counter->count); 653 atomic64_add(delta, &event->count);
607 atomic64_sub(delta, &hwc->period_left); 654 atomic64_sub(delta, &hwc->period_left);
608 655
609 return new_raw_count; 656 return new_raw_count;
610} 657}
611 658
612static atomic_t active_counters; 659static atomic_t active_events;
613static DEFINE_MUTEX(pmc_reserve_mutex); 660static DEFINE_MUTEX(pmc_reserve_mutex);
614 661
615static bool reserve_pmc_hardware(void) 662static bool reserve_pmc_hardware(void)
@@ -620,12 +667,12 @@ static bool reserve_pmc_hardware(void)
620 if (nmi_watchdog == NMI_LOCAL_APIC) 667 if (nmi_watchdog == NMI_LOCAL_APIC)
621 disable_lapic_nmi_watchdog(); 668 disable_lapic_nmi_watchdog();
622 669
623 for (i = 0; i < x86_pmu.num_counters; i++) { 670 for (i = 0; i < x86_pmu.num_events; i++) {
624 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 671 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
625 goto perfctr_fail; 672 goto perfctr_fail;
626 } 673 }
627 674
628 for (i = 0; i < x86_pmu.num_counters; i++) { 675 for (i = 0; i < x86_pmu.num_events; i++) {
629 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 676 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
630 goto eventsel_fail; 677 goto eventsel_fail;
631 } 678 }
@@ -638,7 +685,7 @@ eventsel_fail:
638 for (i--; i >= 0; i--) 685 for (i--; i >= 0; i--)
639 release_evntsel_nmi(x86_pmu.eventsel + i); 686 release_evntsel_nmi(x86_pmu.eventsel + i);
640 687
641 i = x86_pmu.num_counters; 688 i = x86_pmu.num_events;
642 689
643perfctr_fail: 690perfctr_fail:
644 for (i--; i >= 0; i--) 691 for (i--; i >= 0; i--)
@@ -656,7 +703,7 @@ static void release_pmc_hardware(void)
656#ifdef CONFIG_X86_LOCAL_APIC 703#ifdef CONFIG_X86_LOCAL_APIC
657 int i; 704 int i;
658 705
659 for (i = 0; i < x86_pmu.num_counters; i++) { 706 for (i = 0; i < x86_pmu.num_events; i++) {
660 release_perfctr_nmi(x86_pmu.perfctr + i); 707 release_perfctr_nmi(x86_pmu.perfctr + i);
661 release_evntsel_nmi(x86_pmu.eventsel + i); 708 release_evntsel_nmi(x86_pmu.eventsel + i);
662 } 709 }
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
666#endif 713#endif
667} 714}
668 715
669static void hw_perf_counter_destroy(struct perf_counter *counter) 716static inline bool bts_available(void)
717{
718 return x86_pmu.enable_bts != NULL;
719}
720
721static inline void init_debug_store_on_cpu(int cpu)
722{
723 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724
725 if (!ds)
726 return;
727
728 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
729 (u32)((u64)(unsigned long)ds),
730 (u32)((u64)(unsigned long)ds >> 32));
731}
732
733static inline void fini_debug_store_on_cpu(int cpu)
734{
735 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return;
737
738 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
739}
740
741static void release_bts_hardware(void)
742{
743 int cpu;
744
745 if (!bts_available())
746 return;
747
748 get_online_cpus();
749
750 for_each_online_cpu(cpu)
751 fini_debug_store_on_cpu(cpu);
752
753 for_each_possible_cpu(cpu) {
754 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
755
756 if (!ds)
757 continue;
758
759 per_cpu(cpu_hw_events, cpu).ds = NULL;
760
761 kfree((void *)(unsigned long)ds->bts_buffer_base);
762 kfree(ds);
763 }
764
765 put_online_cpus();
766}
767
768static int reserve_bts_hardware(void)
670{ 769{
671 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { 770 int cpu, err = 0;
771
772 if (!bts_available())
773 return 0;
774
775 get_online_cpus();
776
777 for_each_possible_cpu(cpu) {
778 struct debug_store *ds;
779 void *buffer;
780
781 err = -ENOMEM;
782 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
783 if (unlikely(!buffer))
784 break;
785
786 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
787 if (unlikely(!ds)) {
788 kfree(buffer);
789 break;
790 }
791
792 ds->bts_buffer_base = (u64)(unsigned long)buffer;
793 ds->bts_index = ds->bts_buffer_base;
794 ds->bts_absolute_maximum =
795 ds->bts_buffer_base + BTS_BUFFER_SIZE;
796 ds->bts_interrupt_threshold =
797 ds->bts_absolute_maximum - BTS_OVFL_TH;
798
799 per_cpu(cpu_hw_events, cpu).ds = ds;
800 err = 0;
801 }
802
803 if (err)
804 release_bts_hardware();
805 else {
806 for_each_online_cpu(cpu)
807 init_debug_store_on_cpu(cpu);
808 }
809
810 put_online_cpus();
811
812 return err;
813}
814
815static void hw_perf_event_destroy(struct perf_event *event)
816{
817 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
672 release_pmc_hardware(); 818 release_pmc_hardware();
819 release_bts_hardware();
673 mutex_unlock(&pmc_reserve_mutex); 820 mutex_unlock(&pmc_reserve_mutex);
674 } 821 }
675} 822}
@@ -680,7 +827,7 @@ static inline int x86_pmu_initialized(void)
680} 827}
681 828
682static inline int 829static inline int
683set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) 830set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
684{ 831{
685 unsigned int cache_type, cache_op, cache_result; 832 unsigned int cache_type, cache_op, cache_result;
686 u64 config, val; 833 u64 config, val;
@@ -712,13 +859,49 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
712 return 0; 859 return 0;
713} 860}
714 861
862static void intel_pmu_enable_bts(u64 config)
863{
864 unsigned long debugctlmsr;
865
866 debugctlmsr = get_debugctlmsr();
867
868 debugctlmsr |= X86_DEBUGCTL_TR;
869 debugctlmsr |= X86_DEBUGCTL_BTS;
870 debugctlmsr |= X86_DEBUGCTL_BTINT;
871
872 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
873 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
874
875 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
876 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
877
878 update_debugctlmsr(debugctlmsr);
879}
880
881static void intel_pmu_disable_bts(void)
882{
883 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr;
885
886 if (!cpuc->ds)
887 return;
888
889 debugctlmsr = get_debugctlmsr();
890
891 debugctlmsr &=
892 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
893 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
894
895 update_debugctlmsr(debugctlmsr);
896}
897
715/* 898/*
716 * Setup the hardware configuration for a given attr_type 899 * Setup the hardware configuration for a given attr_type
717 */ 900 */
718static int __hw_perf_counter_init(struct perf_counter *counter) 901static int __hw_perf_event_init(struct perf_event *event)
719{ 902{
720 struct perf_counter_attr *attr = &counter->attr; 903 struct perf_event_attr *attr = &event->attr;
721 struct hw_perf_counter *hwc = &counter->hw; 904 struct hw_perf_event *hwc = &event->hw;
722 u64 config; 905 u64 config;
723 int err; 906 int err;
724 907
@@ -726,17 +909,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
726 return -ENODEV; 909 return -ENODEV;
727 910
728 err = 0; 911 err = 0;
729 if (!atomic_inc_not_zero(&active_counters)) { 912 if (!atomic_inc_not_zero(&active_events)) {
730 mutex_lock(&pmc_reserve_mutex); 913 mutex_lock(&pmc_reserve_mutex);
731 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) 914 if (atomic_read(&active_events) == 0) {
732 err = -EBUSY; 915 if (!reserve_pmc_hardware())
733 else 916 err = -EBUSY;
734 atomic_inc(&active_counters); 917 else
918 err = reserve_bts_hardware();
919 }
920 if (!err)
921 atomic_inc(&active_events);
735 mutex_unlock(&pmc_reserve_mutex); 922 mutex_unlock(&pmc_reserve_mutex);
736 } 923 }
737 if (err) 924 if (err)
738 return err; 925 return err;
739 926
927 event->destroy = hw_perf_event_destroy;
928
740 /* 929 /*
741 * Generate PMC IRQs: 930 * Generate PMC IRQs:
742 * (keep 'enabled' bit clear for now) 931 * (keep 'enabled' bit clear for now)
@@ -759,17 +948,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
759 /* 948 /*
760 * If we have a PMU initialized but no APIC 949 * If we have a PMU initialized but no APIC
761 * interrupts, we cannot sample hardware 950 * interrupts, we cannot sample hardware
762 * counters (user-space has to fall back and 951 * events (user-space has to fall back and
763 * sample via a hrtimer based software counter): 952 * sample via a hrtimer based software event):
764 */ 953 */
765 if (!x86_pmu.apic) 954 if (!x86_pmu.apic)
766 return -EOPNOTSUPP; 955 return -EOPNOTSUPP;
767 } 956 }
768 957
769 counter->destroy = hw_perf_counter_destroy;
770
771 /* 958 /*
772 * Raw event type provide the config in the event structure 959 * Raw hw_event type provide the config in the hw_event structure
773 */ 960 */
774 if (attr->type == PERF_TYPE_RAW) { 961 if (attr->type == PERF_TYPE_RAW) {
775 hwc->config |= x86_pmu.raw_event(attr->config); 962 hwc->config |= x86_pmu.raw_event(attr->config);
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
793 if (config == -1LL) 980 if (config == -1LL)
794 return -EINVAL; 981 return -EINVAL;
795 982
983 /*
984 * Branch tracing:
985 */
986 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
987 (hwc->sample_period == 1)) {
988 /* BTS is not supported by this architecture. */
989 if (!bts_available())
990 return -EOPNOTSUPP;
991
992 /* BTS is currently only allowed for user-mode. */
993 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
994 return -EOPNOTSUPP;
995 }
996
796 hwc->config |= config; 997 hwc->config |= config;
797 998
798 return 0; 999 return 0;
@@ -800,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
800 1001
801static void p6_pmu_disable_all(void) 1002static void p6_pmu_disable_all(void)
802{ 1003{
803 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1004 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
804 u64 val; 1005 u64 val;
805 1006
806 if (!cpuc->enabled) 1007 if (!cpuc->enabled)
@@ -817,12 +1018,23 @@ static void p6_pmu_disable_all(void)
817 1018
818static void intel_pmu_disable_all(void) 1019static void intel_pmu_disable_all(void)
819{ 1020{
1021 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022
1023 if (!cpuc->enabled)
1024 return;
1025
1026 cpuc->enabled = 0;
1027 barrier();
1028
820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 1029 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
1030
1031 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1032 intel_pmu_disable_bts();
821} 1033}
822 1034
823static void amd_pmu_disable_all(void) 1035static void amd_pmu_disable_all(void)
824{ 1036{
825 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1037 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
826 int idx; 1038 int idx;
827 1039
828 if (!cpuc->enabled) 1040 if (!cpuc->enabled)
@@ -831,12 +1043,12 @@ static void amd_pmu_disable_all(void)
831 cpuc->enabled = 0; 1043 cpuc->enabled = 0;
832 /* 1044 /*
833 * ensure we write the disable before we start disabling the 1045 * ensure we write the disable before we start disabling the
834 * counters proper, so that amd_pmu_enable_counter() does the 1046 * events proper, so that amd_pmu_enable_event() does the
835 * right thing. 1047 * right thing.
836 */ 1048 */
837 barrier(); 1049 barrier();
838 1050
839 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1051 for (idx = 0; idx < x86_pmu.num_events; idx++) {
840 u64 val; 1052 u64 val;
841 1053
842 if (!test_bit(idx, cpuc->active_mask)) 1054 if (!test_bit(idx, cpuc->active_mask))
@@ -858,7 +1070,7 @@ void hw_perf_disable(void)
858 1070
859static void p6_pmu_enable_all(void) 1071static void p6_pmu_enable_all(void)
860{ 1072{
861 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1073 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
862 unsigned long val; 1074 unsigned long val;
863 1075
864 if (cpuc->enabled) 1076 if (cpuc->enabled)
@@ -875,12 +1087,30 @@ static void p6_pmu_enable_all(void)
875 1087
876static void intel_pmu_enable_all(void) 1088static void intel_pmu_enable_all(void)
877{ 1089{
1090 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1091
1092 if (cpuc->enabled)
1093 return;
1094
1095 cpuc->enabled = 1;
1096 barrier();
1097
878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1099
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1101 struct perf_event *event =
1102 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1103
1104 if (WARN_ON_ONCE(!event))
1105 return;
1106
1107 intel_pmu_enable_bts(event->hw.config);
1108 }
879} 1109}
880 1110
881static void amd_pmu_enable_all(void) 1111static void amd_pmu_enable_all(void)
882{ 1112{
883 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1113 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 int idx; 1114 int idx;
885 1115
886 if (cpuc->enabled) 1116 if (cpuc->enabled)
@@ -889,14 +1119,14 @@ static void amd_pmu_enable_all(void)
889 cpuc->enabled = 1; 1119 cpuc->enabled = 1;
890 barrier(); 1120 barrier();
891 1121
892 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1122 for (idx = 0; idx < x86_pmu.num_events; idx++) {
893 struct perf_counter *counter = cpuc->counters[idx]; 1123 struct perf_event *event = cpuc->events[idx];
894 u64 val; 1124 u64 val;
895 1125
896 if (!test_bit(idx, cpuc->active_mask)) 1126 if (!test_bit(idx, cpuc->active_mask))
897 continue; 1127 continue;
898 1128
899 val = counter->hw.config; 1129 val = event->hw.config;
900 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
901 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
902 } 1132 }
@@ -923,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack)
923 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
924} 1154}
925 1155
926static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1156static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
927{ 1157{
928 (void)checking_wrmsrl(hwc->config_base + idx, 1158 (void)checking_wrmsrl(hwc->config_base + idx,
929 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
930} 1160}
931 1161
932static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1162static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
933{ 1163{
934 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
935} 1165}
936 1166
937static inline void 1167static inline void
938intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) 1168intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
939{ 1169{
940 int idx = __idx - X86_PMC_IDX_FIXED; 1170 int idx = __idx - X86_PMC_IDX_FIXED;
941 u64 ctrl_val, mask; 1171 u64 ctrl_val, mask;
@@ -948,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
948} 1178}
949 1179
950static inline void 1180static inline void
951p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1181p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
952{ 1182{
953 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
954 u64 val = P6_NOP_COUNTER; 1184 u64 val = P6_NOP_EVENT;
955 1185
956 if (cpuc->enabled) 1186 if (cpuc->enabled)
957 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -960,36 +1190,44 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
960} 1190}
961 1191
962static inline void 1192static inline void
963intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1193intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
964{ 1194{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1196 intel_pmu_disable_bts();
1197 return;
1198 }
1199
965 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1200 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
966 intel_pmu_disable_fixed(hwc, idx); 1201 intel_pmu_disable_fixed(hwc, idx);
967 return; 1202 return;
968 } 1203 }
969 1204
970 x86_pmu_disable_counter(hwc, idx); 1205 x86_pmu_disable_event(hwc, idx);
971} 1206}
972 1207
973static inline void 1208static inline void
974amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1209amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
975{ 1210{
976 x86_pmu_disable_counter(hwc, idx); 1211 x86_pmu_disable_event(hwc, idx);
977} 1212}
978 1213
979static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 1214static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
980 1215
981/* 1216/*
982 * Set the next IRQ period, based on the hwc->period_left value. 1217 * Set the next IRQ period, based on the hwc->period_left value.
983 * To be called with the counter disabled in hw: 1218 * To be called with the event disabled in hw:
984 */ 1219 */
985static int 1220static int
986x86_perf_counter_set_period(struct perf_counter *counter, 1221x86_perf_event_set_period(struct perf_event *event,
987 struct hw_perf_counter *hwc, int idx) 1222 struct hw_perf_event *hwc, int idx)
988{ 1223{
989 s64 left = atomic64_read(&hwc->period_left); 1224 s64 left = atomic64_read(&hwc->period_left);
990 s64 period = hwc->sample_period; 1225 s64 period = hwc->sample_period;
991 int err, ret = 0; 1226 int err, ret = 0;
992 1227
1228 if (idx == X86_PMC_IDX_FIXED_BTS)
1229 return 0;
1230
993 /* 1231 /*
994 * If we are way outside a reasoable range then just skip forward: 1232 * If we are way outside a reasoable range then just skip forward:
995 */ 1233 */
@@ -1007,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1007 ret = 1; 1245 ret = 1;
1008 } 1246 }
1009 /* 1247 /*
1010 * Quirk: certain CPUs dont like it if just 1 event is left: 1248 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
1011 */ 1249 */
1012 if (unlikely(left < 2)) 1250 if (unlikely(left < 2))
1013 left = 2; 1251 left = 2;
@@ -1015,24 +1253,24 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1015 if (left > x86_pmu.max_period) 1253 if (left > x86_pmu.max_period)
1016 left = x86_pmu.max_period; 1254 left = x86_pmu.max_period;
1017 1255
1018 per_cpu(prev_left[idx], smp_processor_id()) = left; 1256 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1019 1257
1020 /* 1258 /*
1021 * The hw counter starts counting from this counter offset, 1259 * The hw event starts counting from this event offset,
1022 * mark it to be able to extra future deltas: 1260 * mark it to be able to extra future deltas:
1023 */ 1261 */
1024 atomic64_set(&hwc->prev_count, (u64)-left); 1262 atomic64_set(&hwc->prev_count, (u64)-left);
1025 1263
1026 err = checking_wrmsrl(hwc->counter_base + idx, 1264 err = checking_wrmsrl(hwc->event_base + idx,
1027 (u64)(-left) & x86_pmu.counter_mask); 1265 (u64)(-left) & x86_pmu.event_mask);
1028 1266
1029 perf_counter_update_userpage(counter); 1267 perf_event_update_userpage(event);
1030 1268
1031 return ret; 1269 return ret;
1032} 1270}
1033 1271
1034static inline void 1272static inline void
1035intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) 1273intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1036{ 1274{
1037 int idx = __idx - X86_PMC_IDX_FIXED; 1275 int idx = __idx - X86_PMC_IDX_FIXED;
1038 u64 ctrl_val, bits, mask; 1276 u64 ctrl_val, bits, mask;
@@ -1057,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
1057 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1295 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1058} 1296}
1059 1297
1060static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1298static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1061{ 1299{
1062 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1300 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1063 u64 val; 1301 u64 val;
1064 1302
1065 val = hwc->config; 1303 val = hwc->config;
@@ -1070,128 +1308,149 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1070} 1308}
1071 1309
1072 1310
1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1311static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1074{ 1312{
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1314 if (!__get_cpu_var(cpu_hw_events).enabled)
1315 return;
1316
1317 intel_pmu_enable_bts(hwc->config);
1318 return;
1319 }
1320
1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1321 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1076 intel_pmu_enable_fixed(hwc, idx); 1322 intel_pmu_enable_fixed(hwc, idx);
1077 return; 1323 return;
1078 } 1324 }
1079 1325
1080 x86_pmu_enable_counter(hwc, idx); 1326 x86_pmu_enable_event(hwc, idx);
1081} 1327}
1082 1328
1083static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1329static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1084{ 1330{
1085 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1331 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1086 1332
1087 if (cpuc->enabled) 1333 if (cpuc->enabled)
1088 x86_pmu_enable_counter(hwc, idx); 1334 x86_pmu_enable_event(hwc, idx);
1089} 1335}
1090 1336
1091static int 1337static int
1092fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) 1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1093{ 1339{
1094 unsigned int event; 1340 unsigned int hw_event;
1095 1341
1096 if (!x86_pmu.num_counters_fixed) 1342 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1097 return -1; 1343
1344 if (unlikely((hw_event ==
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1346 (hwc->sample_period == 1)))
1347 return X86_PMC_IDX_FIXED_BTS;
1098 1348
1099 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1349 if (!x86_pmu.num_events_fixed)
1350 return -1;
1100 1351
1101 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1102 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1103 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1104 return X86_PMC_IDX_FIXED_CPU_CYCLES; 1355 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1105 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) 1356 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1106 return X86_PMC_IDX_FIXED_BUS_CYCLES; 1357 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1107 1358
1108 return -1; 1359 return -1;
1109} 1360}
1110 1361
1111/* 1362/*
1112 * Find a PMC slot for the freshly enabled / scheduled in counter: 1363 * Find a PMC slot for the freshly enabled / scheduled in event:
1113 */ 1364 */
1114static int x86_pmu_enable(struct perf_counter *counter) 1365static int x86_pmu_enable(struct perf_event *event)
1115{ 1366{
1116 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1117 struct hw_perf_counter *hwc = &counter->hw; 1368 struct hw_perf_event *hwc = &event->hw;
1118 int idx; 1369 int idx;
1119 1370
1120 idx = fixed_mode_idx(counter, hwc); 1371 idx = fixed_mode_idx(event, hwc);
1121 if (idx >= 0) { 1372 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN;
1376
1377 hwc->config_base = 0;
1378 hwc->event_base = 0;
1379 hwc->idx = idx;
1380 } else if (idx >= 0) {
1122 /* 1381 /*
1123 * Try to get the fixed counter, if that is already taken 1382 * Try to get the fixed event, if that is already taken
1124 * then try to get a generic counter: 1383 * then try to get a generic event:
1125 */ 1384 */
1126 if (test_and_set_bit(idx, cpuc->used_mask)) 1385 if (test_and_set_bit(idx, cpuc->used_mask))
1127 goto try_generic; 1386 goto try_generic;
1128 1387
1129 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1130 /* 1389 /*
1131 * We set it so that counter_base + idx in wrmsr/rdmsr maps to 1390 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1132 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: 1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1133 */ 1392 */
1134 hwc->counter_base = 1393 hwc->event_base =
1135 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; 1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1136 hwc->idx = idx; 1395 hwc->idx = idx;
1137 } else { 1396 } else {
1138 idx = hwc->idx; 1397 idx = hwc->idx;
1139 /* Try to get the previous generic counter again */ 1398 /* Try to get the previous generic event again */
1140 if (test_and_set_bit(idx, cpuc->used_mask)) { 1399 if (test_and_set_bit(idx, cpuc->used_mask)) {
1141try_generic: 1400try_generic:
1142 idx = find_first_zero_bit(cpuc->used_mask, 1401 idx = find_first_zero_bit(cpuc->used_mask,
1143 x86_pmu.num_counters); 1402 x86_pmu.num_events);
1144 if (idx == x86_pmu.num_counters) 1403 if (idx == x86_pmu.num_events)
1145 return -EAGAIN; 1404 return -EAGAIN;
1146 1405
1147 set_bit(idx, cpuc->used_mask); 1406 set_bit(idx, cpuc->used_mask);
1148 hwc->idx = idx; 1407 hwc->idx = idx;
1149 } 1408 }
1150 hwc->config_base = x86_pmu.eventsel; 1409 hwc->config_base = x86_pmu.eventsel;
1151 hwc->counter_base = x86_pmu.perfctr; 1410 hwc->event_base = x86_pmu.perfctr;
1152 } 1411 }
1153 1412
1154 perf_counters_lapic_init(); 1413 perf_events_lapic_init();
1155 1414
1156 x86_pmu.disable(hwc, idx); 1415 x86_pmu.disable(hwc, idx);
1157 1416
1158 cpuc->counters[idx] = counter; 1417 cpuc->events[idx] = event;
1159 set_bit(idx, cpuc->active_mask); 1418 set_bit(idx, cpuc->active_mask);
1160 1419
1161 x86_perf_counter_set_period(counter, hwc, idx); 1420 x86_perf_event_set_period(event, hwc, idx);
1162 x86_pmu.enable(hwc, idx); 1421 x86_pmu.enable(hwc, idx);
1163 1422
1164 perf_counter_update_userpage(counter); 1423 perf_event_update_userpage(event);
1165 1424
1166 return 0; 1425 return 0;
1167} 1426}
1168 1427
1169static void x86_pmu_unthrottle(struct perf_counter *counter) 1428static void x86_pmu_unthrottle(struct perf_event *event)
1170{ 1429{
1171 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1430 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1172 struct hw_perf_counter *hwc = &counter->hw; 1431 struct hw_perf_event *hwc = &event->hw;
1173 1432
1174 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || 1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1175 cpuc->counters[hwc->idx] != counter)) 1434 cpuc->events[hwc->idx] != event))
1176 return; 1435 return;
1177 1436
1178 x86_pmu.enable(hwc, hwc->idx); 1437 x86_pmu.enable(hwc, hwc->idx);
1179} 1438}
1180 1439
1181void perf_counter_print_debug(void) 1440void perf_event_print_debug(void)
1182{ 1441{
1183 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1442 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1184 struct cpu_hw_counters *cpuc; 1443 struct cpu_hw_events *cpuc;
1185 unsigned long flags; 1444 unsigned long flags;
1186 int cpu, idx; 1445 int cpu, idx;
1187 1446
1188 if (!x86_pmu.num_counters) 1447 if (!x86_pmu.num_events)
1189 return; 1448 return;
1190 1449
1191 local_irq_save(flags); 1450 local_irq_save(flags);
1192 1451
1193 cpu = smp_processor_id(); 1452 cpu = smp_processor_id();
1194 cpuc = &per_cpu(cpu_hw_counters, cpu); 1453 cpuc = &per_cpu(cpu_hw_events, cpu);
1195 1454
1196 if (x86_pmu.version >= 2) { 1455 if (x86_pmu.version >= 2) {
1197 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1456 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
@@ -1207,11 +1466,11 @@ void perf_counter_print_debug(void)
1207 } 1466 }
1208 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1209 1468
1210 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1469 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1213 1472
1214 prev_left = per_cpu(prev_left[idx], cpu); 1473 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1215 1474
1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1217 cpu, idx, pmc_ctrl); 1476 cpu, idx, pmc_ctrl);
@@ -1220,7 +1479,7 @@ void perf_counter_print_debug(void)
1220 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1479 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1221 cpu, idx, prev_left); 1480 cpu, idx, prev_left);
1222 } 1481 }
1223 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1482 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1224 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1483 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1225 1484
1226 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1485 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1229,10 +1488,69 @@ void perf_counter_print_debug(void)
1229 local_irq_restore(flags); 1488 local_irq_restore(flags);
1230} 1489}
1231 1490
1232static void x86_pmu_disable(struct perf_counter *counter) 1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1233{ 1492{
1234 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1493 struct debug_store *ds = cpuc->ds;
1235 struct hw_perf_counter *hwc = &counter->hw; 1494 struct bts_record {
1495 u64 from;
1496 u64 to;
1497 u64 flags;
1498 };
1499 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1500 struct bts_record *at, *top;
1501 struct perf_output_handle handle;
1502 struct perf_event_header header;
1503 struct perf_sample_data data;
1504 struct pt_regs regs;
1505
1506 if (!event)
1507 return;
1508
1509 if (!ds)
1510 return;
1511
1512 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1513 top = (struct bts_record *)(unsigned long)ds->bts_index;
1514
1515 if (top <= at)
1516 return;
1517
1518 ds->bts_index = ds->bts_buffer_base;
1519
1520
1521 data.period = event->hw.last_period;
1522 data.addr = 0;
1523 regs.ip = 0;
1524
1525 /*
1526 * Prepare a generic sample, i.e. fill in the invariant fields.
1527 * We will overwrite the from and to address before we output
1528 * the sample.
1529 */
1530 perf_prepare_sample(&header, &data, event, &regs);
1531
1532 if (perf_output_begin(&handle, event,
1533 header.size * (top - at), 1, 1))
1534 return;
1535
1536 for (; at < top; at++) {
1537 data.ip = at->from;
1538 data.addr = at->to;
1539
1540 perf_output_sample(&handle, &header, &data, event);
1541 }
1542
1543 perf_output_end(&handle);
1544
1545 /* There's new data available. */
1546 event->hw.interrupts++;
1547 event->pending_kill = POLL_IN;
1548}
1549
1550static void x86_pmu_disable(struct perf_event *event)
1551{
1552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1553 struct hw_perf_event *hwc = &event->hw;
1236 int idx = hwc->idx; 1554 int idx = hwc->idx;
1237 1555
1238 /* 1556 /*
@@ -1244,59 +1562,67 @@ static void x86_pmu_disable(struct perf_counter *counter)
1244 1562
1245 /* 1563 /*
1246 * Make sure the cleared pointer becomes visible before we 1564 * Make sure the cleared pointer becomes visible before we
1247 * (potentially) free the counter: 1565 * (potentially) free the event:
1248 */ 1566 */
1249 barrier(); 1567 barrier();
1250 1568
1251 /* 1569 /*
1252 * Drain the remaining delta count out of a counter 1570 * Drain the remaining delta count out of a event
1253 * that we are disabling: 1571 * that we are disabling:
1254 */ 1572 */
1255 x86_perf_counter_update(counter, hwc, idx); 1573 x86_perf_event_update(event, hwc, idx);
1256 cpuc->counters[idx] = NULL; 1574
1575 /* Drain the remaining BTS records. */
1576 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1577 intel_pmu_drain_bts_buffer(cpuc);
1578
1579 cpuc->events[idx] = NULL;
1257 clear_bit(idx, cpuc->used_mask); 1580 clear_bit(idx, cpuc->used_mask);
1258 1581
1259 perf_counter_update_userpage(counter); 1582 perf_event_update_userpage(event);
1260} 1583}
1261 1584
1262/* 1585/*
1263 * Save and restart an expired counter. Called by NMI contexts, 1586 * Save and restart an expired event. Called by NMI contexts,
1264 * so it has to be careful about preempting normal counter ops: 1587 * so it has to be careful about preempting normal event ops:
1265 */ 1588 */
1266static int intel_pmu_save_and_restart(struct perf_counter *counter) 1589static int intel_pmu_save_and_restart(struct perf_event *event)
1267{ 1590{
1268 struct hw_perf_counter *hwc = &counter->hw; 1591 struct hw_perf_event *hwc = &event->hw;
1269 int idx = hwc->idx; 1592 int idx = hwc->idx;
1270 int ret; 1593 int ret;
1271 1594
1272 x86_perf_counter_update(counter, hwc, idx); 1595 x86_perf_event_update(event, hwc, idx);
1273 ret = x86_perf_counter_set_period(counter, hwc, idx); 1596 ret = x86_perf_event_set_period(event, hwc, idx);
1274 1597
1275 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1598 if (event->state == PERF_EVENT_STATE_ACTIVE)
1276 intel_pmu_enable_counter(hwc, idx); 1599 intel_pmu_enable_event(hwc, idx);
1277 1600
1278 return ret; 1601 return ret;
1279} 1602}
1280 1603
1281static void intel_pmu_reset(void) 1604static void intel_pmu_reset(void)
1282{ 1605{
1606 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1283 unsigned long flags; 1607 unsigned long flags;
1284 int idx; 1608 int idx;
1285 1609
1286 if (!x86_pmu.num_counters) 1610 if (!x86_pmu.num_events)
1287 return; 1611 return;
1288 1612
1289 local_irq_save(flags); 1613 local_irq_save(flags);
1290 1614
1291 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1615 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1292 1616
1293 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1617 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1294 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 1618 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1295 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 1619 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1296 } 1620 }
1297 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1621 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1298 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1622 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1299 } 1623 }
1624 if (ds)
1625 ds->bts_index = ds->bts_buffer_base;
1300 1626
1301 local_irq_restore(flags); 1627 local_irq_restore(flags);
1302} 1628}
@@ -1304,39 +1630,38 @@ static void intel_pmu_reset(void)
1304static int p6_pmu_handle_irq(struct pt_regs *regs) 1630static int p6_pmu_handle_irq(struct pt_regs *regs)
1305{ 1631{
1306 struct perf_sample_data data; 1632 struct perf_sample_data data;
1307 struct cpu_hw_counters *cpuc; 1633 struct cpu_hw_events *cpuc;
1308 struct perf_counter *counter; 1634 struct perf_event *event;
1309 struct hw_perf_counter *hwc; 1635 struct hw_perf_event *hwc;
1310 int idx, handled = 0; 1636 int idx, handled = 0;
1311 u64 val; 1637 u64 val;
1312 1638
1313 data.regs = regs;
1314 data.addr = 0; 1639 data.addr = 0;
1315 1640
1316 cpuc = &__get_cpu_var(cpu_hw_counters); 1641 cpuc = &__get_cpu_var(cpu_hw_events);
1317 1642
1318 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1643 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1319 if (!test_bit(idx, cpuc->active_mask)) 1644 if (!test_bit(idx, cpuc->active_mask))
1320 continue; 1645 continue;
1321 1646
1322 counter = cpuc->counters[idx]; 1647 event = cpuc->events[idx];
1323 hwc = &counter->hw; 1648 hwc = &event->hw;
1324 1649
1325 val = x86_perf_counter_update(counter, hwc, idx); 1650 val = x86_perf_event_update(event, hwc, idx);
1326 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1651 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1327 continue; 1652 continue;
1328 1653
1329 /* 1654 /*
1330 * counter overflow 1655 * event overflow
1331 */ 1656 */
1332 handled = 1; 1657 handled = 1;
1333 data.period = counter->hw.last_period; 1658 data.period = event->hw.last_period;
1334 1659
1335 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1660 if (!x86_perf_event_set_period(event, hwc, idx))
1336 continue; 1661 continue;
1337 1662
1338 if (perf_counter_overflow(counter, 1, &data)) 1663 if (perf_event_overflow(event, 1, &data, regs))
1339 p6_pmu_disable_counter(hwc, idx); 1664 p6_pmu_disable_event(hwc, idx);
1340 } 1665 }
1341 1666
1342 if (handled) 1667 if (handled)
@@ -1352,16 +1677,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1352static int intel_pmu_handle_irq(struct pt_regs *regs) 1677static int intel_pmu_handle_irq(struct pt_regs *regs)
1353{ 1678{
1354 struct perf_sample_data data; 1679 struct perf_sample_data data;
1355 struct cpu_hw_counters *cpuc; 1680 struct cpu_hw_events *cpuc;
1356 int bit, loops; 1681 int bit, loops;
1357 u64 ack, status; 1682 u64 ack, status;
1358 1683
1359 data.regs = regs;
1360 data.addr = 0; 1684 data.addr = 0;
1361 1685
1362 cpuc = &__get_cpu_var(cpu_hw_counters); 1686 cpuc = &__get_cpu_var(cpu_hw_events);
1363 1687
1364 perf_disable(); 1688 perf_disable();
1689 intel_pmu_drain_bts_buffer(cpuc);
1365 status = intel_pmu_get_status(); 1690 status = intel_pmu_get_status();
1366 if (!status) { 1691 if (!status) {
1367 perf_enable(); 1692 perf_enable();
@@ -1371,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1371 loops = 0; 1696 loops = 0;
1372again: 1697again:
1373 if (++loops > 100) { 1698 if (++loops > 100) {
1374 WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); 1699 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1375 perf_counter_print_debug(); 1700 perf_event_print_debug();
1376 intel_pmu_reset(); 1701 intel_pmu_reset();
1377 perf_enable(); 1702 perf_enable();
1378 return 1; 1703 return 1;
@@ -1381,19 +1706,19 @@ again:
1381 inc_irq_stat(apic_perf_irqs); 1706 inc_irq_stat(apic_perf_irqs);
1382 ack = status; 1707 ack = status;
1383 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 1708 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1384 struct perf_counter *counter = cpuc->counters[bit]; 1709 struct perf_event *event = cpuc->events[bit];
1385 1710
1386 clear_bit(bit, (unsigned long *) &status); 1711 clear_bit(bit, (unsigned long *) &status);
1387 if (!test_bit(bit, cpuc->active_mask)) 1712 if (!test_bit(bit, cpuc->active_mask))
1388 continue; 1713 continue;
1389 1714
1390 if (!intel_pmu_save_and_restart(counter)) 1715 if (!intel_pmu_save_and_restart(event))
1391 continue; 1716 continue;
1392 1717
1393 data.period = counter->hw.last_period; 1718 data.period = event->hw.last_period;
1394 1719
1395 if (perf_counter_overflow(counter, 1, &data)) 1720 if (perf_event_overflow(event, 1, &data, regs))
1396 intel_pmu_disable_counter(&counter->hw, bit); 1721 intel_pmu_disable_event(&event->hw, bit);
1397 } 1722 }
1398 1723
1399 intel_pmu_ack_status(ack); 1724 intel_pmu_ack_status(ack);
@@ -1413,39 +1738,38 @@ again:
1413static int amd_pmu_handle_irq(struct pt_regs *regs) 1738static int amd_pmu_handle_irq(struct pt_regs *regs)
1414{ 1739{
1415 struct perf_sample_data data; 1740 struct perf_sample_data data;
1416 struct cpu_hw_counters *cpuc; 1741 struct cpu_hw_events *cpuc;
1417 struct perf_counter *counter; 1742 struct perf_event *event;
1418 struct hw_perf_counter *hwc; 1743 struct hw_perf_event *hwc;
1419 int idx, handled = 0; 1744 int idx, handled = 0;
1420 u64 val; 1745 u64 val;
1421 1746
1422 data.regs = regs;
1423 data.addr = 0; 1747 data.addr = 0;
1424 1748
1425 cpuc = &__get_cpu_var(cpu_hw_counters); 1749 cpuc = &__get_cpu_var(cpu_hw_events);
1426 1750
1427 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1751 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1428 if (!test_bit(idx, cpuc->active_mask)) 1752 if (!test_bit(idx, cpuc->active_mask))
1429 continue; 1753 continue;
1430 1754
1431 counter = cpuc->counters[idx]; 1755 event = cpuc->events[idx];
1432 hwc = &counter->hw; 1756 hwc = &event->hw;
1433 1757
1434 val = x86_perf_counter_update(counter, hwc, idx); 1758 val = x86_perf_event_update(event, hwc, idx);
1435 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1759 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1436 continue; 1760 continue;
1437 1761
1438 /* 1762 /*
1439 * counter overflow 1763 * event overflow
1440 */ 1764 */
1441 handled = 1; 1765 handled = 1;
1442 data.period = counter->hw.last_period; 1766 data.period = event->hw.last_period;
1443 1767
1444 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1768 if (!x86_perf_event_set_period(event, hwc, idx))
1445 continue; 1769 continue;
1446 1770
1447 if (perf_counter_overflow(counter, 1, &data)) 1771 if (perf_event_overflow(event, 1, &data, regs))
1448 amd_pmu_disable_counter(hwc, idx); 1772 amd_pmu_disable_event(hwc, idx);
1449 } 1773 }
1450 1774
1451 if (handled) 1775 if (handled)
@@ -1459,18 +1783,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1459 irq_enter(); 1783 irq_enter();
1460 ack_APIC_irq(); 1784 ack_APIC_irq();
1461 inc_irq_stat(apic_pending_irqs); 1785 inc_irq_stat(apic_pending_irqs);
1462 perf_counter_do_pending(); 1786 perf_event_do_pending();
1463 irq_exit(); 1787 irq_exit();
1464} 1788}
1465 1789
1466void set_perf_counter_pending(void) 1790void set_perf_event_pending(void)
1467{ 1791{
1468#ifdef CONFIG_X86_LOCAL_APIC 1792#ifdef CONFIG_X86_LOCAL_APIC
1793 if (!x86_pmu.apic || !x86_pmu_initialized())
1794 return;
1795
1469 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1796 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1470#endif 1797#endif
1471} 1798}
1472 1799
1473void perf_counters_lapic_init(void) 1800void perf_events_lapic_init(void)
1474{ 1801{
1475#ifdef CONFIG_X86_LOCAL_APIC 1802#ifdef CONFIG_X86_LOCAL_APIC
1476 if (!x86_pmu.apic || !x86_pmu_initialized()) 1803 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1484,13 +1811,13 @@ void perf_counters_lapic_init(void)
1484} 1811}
1485 1812
1486static int __kprobes 1813static int __kprobes
1487perf_counter_nmi_handler(struct notifier_block *self, 1814perf_event_nmi_handler(struct notifier_block *self,
1488 unsigned long cmd, void *__args) 1815 unsigned long cmd, void *__args)
1489{ 1816{
1490 struct die_args *args = __args; 1817 struct die_args *args = __args;
1491 struct pt_regs *regs; 1818 struct pt_regs *regs;
1492 1819
1493 if (!atomic_read(&active_counters)) 1820 if (!atomic_read(&active_events))
1494 return NOTIFY_DONE; 1821 return NOTIFY_DONE;
1495 1822
1496 switch (cmd) { 1823 switch (cmd) {
@@ -1509,7 +1836,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
1509#endif 1836#endif
1510 /* 1837 /*
1511 * Can't rely on the handled return value to say it was our NMI, two 1838 * Can't rely on the handled return value to say it was our NMI, two
1512 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1839 * events could trigger 'simultaneously' raising two back-to-back NMIs.
1513 * 1840 *
1514 * If the first NMI handles both, the latter will be empty and daze 1841 * If the first NMI handles both, the latter will be empty and daze
1515 * the CPU. 1842 * the CPU.
@@ -1519,8 +1846,8 @@ perf_counter_nmi_handler(struct notifier_block *self,
1519 return NOTIFY_STOP; 1846 return NOTIFY_STOP;
1520} 1847}
1521 1848
1522static __read_mostly struct notifier_block perf_counter_nmi_notifier = { 1849static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1523 .notifier_call = perf_counter_nmi_handler, 1850 .notifier_call = perf_event_nmi_handler,
1524 .next = NULL, 1851 .next = NULL,
1525 .priority = 1 1852 .priority = 1
1526}; 1853};
@@ -1530,8 +1857,8 @@ static struct x86_pmu p6_pmu = {
1530 .handle_irq = p6_pmu_handle_irq, 1857 .handle_irq = p6_pmu_handle_irq,
1531 .disable_all = p6_pmu_disable_all, 1858 .disable_all = p6_pmu_disable_all,
1532 .enable_all = p6_pmu_enable_all, 1859 .enable_all = p6_pmu_enable_all,
1533 .enable = p6_pmu_enable_counter, 1860 .enable = p6_pmu_enable_event,
1534 .disable = p6_pmu_disable_counter, 1861 .disable = p6_pmu_disable_event,
1535 .eventsel = MSR_P6_EVNTSEL0, 1862 .eventsel = MSR_P6_EVNTSEL0,
1536 .perfctr = MSR_P6_PERFCTR0, 1863 .perfctr = MSR_P6_PERFCTR0,
1537 .event_map = p6_pmu_event_map, 1864 .event_map = p6_pmu_event_map,
@@ -1540,16 +1867,16 @@ static struct x86_pmu p6_pmu = {
1540 .apic = 1, 1867 .apic = 1,
1541 .max_period = (1ULL << 31) - 1, 1868 .max_period = (1ULL << 31) - 1,
1542 .version = 0, 1869 .version = 0,
1543 .num_counters = 2, 1870 .num_events = 2,
1544 /* 1871 /*
1545 * Counters have 40 bits implemented. However they are designed such 1872 * Events have 40 bits implemented. However they are designed such
1546 * that bits [32-39] are sign extensions of bit 31. As such the 1873 * that bits [32-39] are sign extensions of bit 31. As such the
1547 * effective width of a counter for P6-like PMU is 32 bits only. 1874 * effective width of a event for P6-like PMU is 32 bits only.
1548 * 1875 *
1549 * See IA-32 Intel Architecture Software developer manual Vol 3B 1876 * See IA-32 Intel Architecture Software developer manual Vol 3B
1550 */ 1877 */
1551 .counter_bits = 32, 1878 .event_bits = 32,
1552 .counter_mask = (1ULL << 32) - 1, 1879 .event_mask = (1ULL << 32) - 1,
1553}; 1880};
1554 1881
1555static struct x86_pmu intel_pmu = { 1882static struct x86_pmu intel_pmu = {
@@ -1557,8 +1884,8 @@ static struct x86_pmu intel_pmu = {
1557 .handle_irq = intel_pmu_handle_irq, 1884 .handle_irq = intel_pmu_handle_irq,
1558 .disable_all = intel_pmu_disable_all, 1885 .disable_all = intel_pmu_disable_all,
1559 .enable_all = intel_pmu_enable_all, 1886 .enable_all = intel_pmu_enable_all,
1560 .enable = intel_pmu_enable_counter, 1887 .enable = intel_pmu_enable_event,
1561 .disable = intel_pmu_disable_counter, 1888 .disable = intel_pmu_disable_event,
1562 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 1889 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1563 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 1890 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1564 .event_map = intel_pmu_event_map, 1891 .event_map = intel_pmu_event_map,
@@ -1568,9 +1895,11 @@ static struct x86_pmu intel_pmu = {
1568 /* 1895 /*
1569 * Intel PMCs cannot be accessed sanely above 32 bit width, 1896 * Intel PMCs cannot be accessed sanely above 32 bit width,
1570 * so we install an artificial 1<<31 period regardless of 1897 * so we install an artificial 1<<31 period regardless of
1571 * the generic counter period: 1898 * the generic event period:
1572 */ 1899 */
1573 .max_period = (1ULL << 31) - 1, 1900 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts,
1574}; 1903};
1575 1904
1576static struct x86_pmu amd_pmu = { 1905static struct x86_pmu amd_pmu = {
@@ -1578,16 +1907,16 @@ static struct x86_pmu amd_pmu = {
1578 .handle_irq = amd_pmu_handle_irq, 1907 .handle_irq = amd_pmu_handle_irq,
1579 .disable_all = amd_pmu_disable_all, 1908 .disable_all = amd_pmu_disable_all,
1580 .enable_all = amd_pmu_enable_all, 1909 .enable_all = amd_pmu_enable_all,
1581 .enable = amd_pmu_enable_counter, 1910 .enable = amd_pmu_enable_event,
1582 .disable = amd_pmu_disable_counter, 1911 .disable = amd_pmu_disable_event,
1583 .eventsel = MSR_K7_EVNTSEL0, 1912 .eventsel = MSR_K7_EVNTSEL0,
1584 .perfctr = MSR_K7_PERFCTR0, 1913 .perfctr = MSR_K7_PERFCTR0,
1585 .event_map = amd_pmu_event_map, 1914 .event_map = amd_pmu_event_map,
1586 .raw_event = amd_pmu_raw_event, 1915 .raw_event = amd_pmu_raw_event,
1587 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 1916 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1588 .num_counters = 4, 1917 .num_events = 4,
1589 .counter_bits = 48, 1918 .event_bits = 48,
1590 .counter_mask = (1ULL << 48) - 1, 1919 .event_mask = (1ULL << 48) - 1,
1591 .apic = 1, 1920 .apic = 1,
1592 /* use highest bit to detect overflow */ 1921 /* use highest bit to detect overflow */
1593 .max_period = (1ULL << 47) - 1, 1922 .max_period = (1ULL << 47) - 1,
@@ -1644,7 +1973,7 @@ static int intel_pmu_init(void)
1644 1973
1645 /* 1974 /*
1646 * Check whether the Architectural PerfMon supports 1975 * Check whether the Architectural PerfMon supports
1647 * Branch Misses Retired Event or not. 1976 * Branch Misses Retired hw_event or not.
1648 */ 1977 */
1649 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 1978 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1650 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 1979 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
@@ -1656,15 +1985,15 @@ static int intel_pmu_init(void)
1656 1985
1657 x86_pmu = intel_pmu; 1986 x86_pmu = intel_pmu;
1658 x86_pmu.version = version; 1987 x86_pmu.version = version;
1659 x86_pmu.num_counters = eax.split.num_counters; 1988 x86_pmu.num_events = eax.split.num_events;
1660 x86_pmu.counter_bits = eax.split.bit_width; 1989 x86_pmu.event_bits = eax.split.bit_width;
1661 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; 1990 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1662 1991
1663 /* 1992 /*
1664 * Quirk: v2 perfmon does not report fixed-purpose counters, so 1993 * Quirk: v2 perfmon does not report fixed-purpose events, so
1665 * assume at least 3 counters: 1994 * assume at least 3 events:
1666 */ 1995 */
1667 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 1996 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
1668 1997
1669 /* 1998 /*
1670 * Install the hw-cache-events table: 1999 * Install the hw-cache-events table:
@@ -1711,11 +2040,11 @@ static int amd_pmu_init(void)
1711 return 0; 2040 return 0;
1712} 2041}
1713 2042
1714void __init init_hw_perf_counters(void) 2043void __init init_hw_perf_events(void)
1715{ 2044{
1716 int err; 2045 int err;
1717 2046
1718 pr_info("Performance Counters: "); 2047 pr_info("Performance Events: ");
1719 2048
1720 switch (boot_cpu_data.x86_vendor) { 2049 switch (boot_cpu_data.x86_vendor) {
1721 case X86_VENDOR_INTEL: 2050 case X86_VENDOR_INTEL:
@@ -1728,45 +2057,45 @@ void __init init_hw_perf_counters(void)
1728 return; 2057 return;
1729 } 2058 }
1730 if (err != 0) { 2059 if (err != 0) {
1731 pr_cont("no PMU driver, software counters only.\n"); 2060 pr_cont("no PMU driver, software events only.\n");
1732 return; 2061 return;
1733 } 2062 }
1734 2063
1735 pr_cont("%s PMU driver.\n", x86_pmu.name); 2064 pr_cont("%s PMU driver.\n", x86_pmu.name);
1736 2065
1737 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
1738 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 2067 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1739 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 2068 x86_pmu.num_events, X86_PMC_MAX_GENERIC);
1740 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 2069 x86_pmu.num_events = X86_PMC_MAX_GENERIC;
1741 } 2070 }
1742 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 2071 perf_event_mask = (1 << x86_pmu.num_events) - 1;
1743 perf_max_counters = x86_pmu.num_counters; 2072 perf_max_events = x86_pmu.num_events;
1744 2073
1745 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 2074 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
1746 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 2075 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1747 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 2076 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
1748 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; 2077 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
1749 } 2078 }
1750 2079
1751 perf_counter_mask |= 2080 perf_event_mask |=
1752 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 2081 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
1753 x86_pmu.intel_ctrl = perf_counter_mask; 2082 x86_pmu.intel_ctrl = perf_event_mask;
1754 2083
1755 perf_counters_lapic_init(); 2084 perf_events_lapic_init();
1756 register_die_notifier(&perf_counter_nmi_notifier); 2085 register_die_notifier(&perf_event_nmi_notifier);
1757 2086
1758 pr_info("... version: %d\n", x86_pmu.version); 2087 pr_info("... version: %d\n", x86_pmu.version);
1759 pr_info("... bit width: %d\n", x86_pmu.counter_bits); 2088 pr_info("... bit width: %d\n", x86_pmu.event_bits);
1760 pr_info("... generic counters: %d\n", x86_pmu.num_counters); 2089 pr_info("... generic registers: %d\n", x86_pmu.num_events);
1761 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); 2090 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
1762 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2091 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1763 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); 2092 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
1764 pr_info("... counter mask: %016Lx\n", perf_counter_mask); 2093 pr_info("... event mask: %016Lx\n", perf_event_mask);
1765} 2094}
1766 2095
1767static inline void x86_pmu_read(struct perf_counter *counter) 2096static inline void x86_pmu_read(struct perf_event *event)
1768{ 2097{
1769 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 2098 x86_perf_event_update(event, &event->hw, event->hw.idx);
1770} 2099}
1771 2100
1772static const struct pmu pmu = { 2101static const struct pmu pmu = {
@@ -1776,13 +2105,16 @@ static const struct pmu pmu = {
1776 .unthrottle = x86_pmu_unthrottle, 2105 .unthrottle = x86_pmu_unthrottle,
1777}; 2106};
1778 2107
1779const struct pmu *hw_perf_counter_init(struct perf_counter *counter) 2108const struct pmu *hw_perf_event_init(struct perf_event *event)
1780{ 2109{
1781 int err; 2110 int err;
1782 2111
1783 err = __hw_perf_counter_init(counter); 2112 err = __hw_perf_event_init(event);
1784 if (err) 2113 if (err) {
2114 if (event->destroy)
2115 event->destroy(event);
1785 return ERR_PTR(err); 2116 return ERR_PTR(err);
2117 }
1786 2118
1787 return &pmu; 2119 return &pmu;
1788} 2120}
@@ -1798,8 +2130,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1798 entry->ip[entry->nr++] = ip; 2130 entry->ip[entry->nr++] = ip;
1799} 2131}
1800 2132
1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame); 2135static DEFINE_PER_CPU(int, in_nmi_frame);
1804 2136
1805 2137
@@ -1952,9 +2284,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1952 struct perf_callchain_entry *entry; 2284 struct perf_callchain_entry *entry;
1953 2285
1954 if (in_nmi()) 2286 if (in_nmi())
1955 entry = &__get_cpu_var(nmi_entry); 2287 entry = &__get_cpu_var(pmc_nmi_entry);
1956 else 2288 else
1957 entry = &__get_cpu_var(irq_entry); 2289 entry = &__get_cpu_var(pmc_irq_entry);
1958 2290
1959 entry->nr = 0; 2291 entry->nr = 0;
1960 2292
@@ -1962,3 +2294,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1962 2294
1963 return entry; 2295 return entry;
1964} 2296}
2297
2298void hw_perf_event_setup_online(int cpu)
2299{
2300 init_debug_store_on_cpu(cpu);
2301}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index e60ed740d2b3..fab786f60ed6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_counter.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
@@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
68 /* returns the bit offset of the performance counter register */ 68 /* returns the bit offset of the performance counter register */
69 switch (boot_cpu_data.x86_vendor) { 69 switch (boot_cpu_data.x86_vendor) {
70 case X86_VENDOR_AMD: 70 case X86_VENDOR_AMD:
71 return (msr - MSR_K7_PERFCTR0); 71 return msr - MSR_K7_PERFCTR0;
72 case X86_VENDOR_INTEL: 72 case X86_VENDOR_INTEL:
73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
74 return (msr - MSR_ARCH_PERFMON_PERFCTR0); 74 return msr - MSR_ARCH_PERFMON_PERFCTR0;
75 75
76 switch (boot_cpu_data.x86) { 76 switch (boot_cpu_data.x86) {
77 case 6: 77 case 6:
78 return (msr - MSR_P6_PERFCTR0); 78 return msr - MSR_P6_PERFCTR0;
79 case 15: 79 case 15:
80 return (msr - MSR_P4_BPU_PERFCTR0); 80 return msr - MSR_P4_BPU_PERFCTR0;
81 } 81 }
82 } 82 }
83 return 0; 83 return 0;
@@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
92 /* returns the bit offset of the event selection register */ 92 /* returns the bit offset of the event selection register */
93 switch (boot_cpu_data.x86_vendor) { 93 switch (boot_cpu_data.x86_vendor) {
94 case X86_VENDOR_AMD: 94 case X86_VENDOR_AMD:
95 return (msr - MSR_K7_EVNTSEL0); 95 return msr - MSR_K7_EVNTSEL0;
96 case X86_VENDOR_INTEL: 96 case X86_VENDOR_INTEL:
97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
98 return (msr - MSR_ARCH_PERFMON_EVENTSEL0); 98 return msr - MSR_ARCH_PERFMON_EVENTSEL0;
99 99
100 switch (boot_cpu_data.x86) { 100 switch (boot_cpu_data.x86) {
101 case 6: 101 case 6:
102 return (msr - MSR_P6_EVNTSEL0); 102 return msr - MSR_P6_EVNTSEL0;
103 case 15: 103 case 15:
104 return (msr - MSR_P4_BSU_ESCR0); 104 return msr - MSR_P4_BSU_ESCR0;
105 } 105 }
106 } 106 }
107 return 0; 107 return 0;
@@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
113{ 113{
114 BUG_ON(counter > NMI_MAX_COUNTER_BITS); 114 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115 115
116 return (!test_bit(counter, perfctr_nmi_owner)); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118 118
119/* checks the an msr for availability */ 119/* checks the an msr for availability */
@@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
124 counter = nmi_perfctr_msr_to_bit(msr); 124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS); 125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126 126
127 return (!test_bit(counter, perfctr_nmi_owner)); 127 return !test_bit(counter, perfctr_nmi_owner);
128} 128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 130
@@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
237 */ 237 */
238 counter_val = (u64)cpu_khz * 1000; 238 counter_val = (u64)cpu_khz * 1000;
239 do_div(counter_val, retval); 239 do_div(counter_val, retval);
240 if (counter_val > 0x7fffffffULL) { 240 if (counter_val > 0x7fffffffULL) {
241 u64 count = (u64)cpu_khz * 1000; 241 u64 count = (u64)cpu_khz * 1000;
242 do_div(count, 0x7fffffffUL); 242 do_div(count, 0x7fffffffUL);
243 retval = count + 1; 243 retval = count + 1;
@@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
251 u64 count = (u64)cpu_khz * 1000; 251 u64 count = (u64)cpu_khz * 1000;
252 252
253 do_div(count, nmi_hz); 253 do_div(count, nmi_hz);
254 if(descr) 254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count); 255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsrl(perfctr_msr, 0 - count); 256 wrmsrl(perfctr_msr, 0 - count);
257} 257}
@@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
262 u64 count = (u64)cpu_khz * 1000; 262 u64 count = (u64)cpu_khz * 1000;
263 263
264 do_div(count, nmi_hz); 264 do_div(count, nmi_hz);
265 if(descr) 265 if (descr)
266 pr_debug("setting %s to -0x%08Lx\n", descr, count); 266 pr_debug("setting %s to -0x%08Lx\n", descr, count);
267 wrmsr(perfctr_msr, (u32)(-count), 0); 267 wrmsr(perfctr_msr, (u32)(-count), 0);
268} 268}
@@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz)
296 296
297 /* setup the timer */ 297 /* setup the timer */
298 wrmsr(evntsel_msr, evntsel, 0); 298 wrmsr(evntsel_msr, evntsel, 0);
299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
300 300
301 /* initialize the wd struct before enabling */ 301 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
@@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz)
387 /* setup the timer */ 387 /* setup the timer */
388 wrmsr(evntsel_msr, evntsel, 0); 388 wrmsr(evntsel_msr, evntsel, 0);
389 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 389 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
391 391
392 /* initialize the wd struct before enabling */ 392 /* initialize the wd struct before enabling */
393 wd->perfctr_msr = perfctr_msr; 393 wd->perfctr_msr = perfctr_msr;
@@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
415 apic_write(APIC_LVTPC, APIC_DM_NMI); 415 apic_write(APIC_LVTPC, APIC_DM_NMI);
416 416
417 /* P6/ARCH_PERFMON has 32 bit counter write */ 417 /* P6/ARCH_PERFMON has 32 bit counter write */
418 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); 418 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
419} 419}
420 420
421static const struct wd_ops p6_wd_ops = { 421static const struct wd_ops p6_wd_ops = {
@@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz)
490 if (smp_num_siblings == 2) { 490 if (smp_num_siblings == 2) {
491 unsigned int ebx, apicid; 491 unsigned int ebx, apicid;
492 492
493 ebx = cpuid_ebx(1); 493 ebx = cpuid_ebx(1);
494 apicid = (ebx >> 24) & 0xff; 494 apicid = (ebx >> 24) & 0xff;
495 ht_num = apicid & 1; 495 ht_num = apicid & 1;
496 } else 496 } else
497#endif 497#endif
498 ht_num = 0; 498 ht_num = 0;
@@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz)
544 } 544 }
545 545
546 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 546 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
547 | P4_ESCR_OS 547 | P4_ESCR_OS
548 | P4_ESCR_USR; 548 | P4_ESCR_USR;
549 549
550 cccr_val |= P4_CCCR_THRESHOLD(15) 550 cccr_val |= P4_CCCR_THRESHOLD(15)
@@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
612{ 612{
613 unsigned dummy; 613 unsigned dummy;
614 /* 614 /*
615 * P4 quirks: 615 * P4 quirks:
616 * - An overflown perfctr will assert its interrupt 616 * - An overflown perfctr will assert its interrupt
617 * until the OVF flag in its CCCR is cleared. 617 * until the OVF flag in its CCCR is cleared.
618 * - LVTPC is masked on interrupt and must be 618 * - LVTPC is masked on interrupt and must be
@@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
662 * NOTE: Corresponding bit = 0 in ebx indicates event present. 662 * NOTE: Corresponding bit = 0 in ebx indicates event present.
663 */ 663 */
664 cpuid(10, &(eax.full), &ebx, &unused, &unused); 664 cpuid(10, &(eax.full), &ebx, &unused, &unused);
665 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || 665 if ((eax.split.mask_length <
666 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
666 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 667 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
667 return 0; 668 return 0;
668 669
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index d5e30397246b..62ac8cb6ba27 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
116 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); 116 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
117#endif 117#endif
118 seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); 118 seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
119#ifdef CONFIG_X86_64
120 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); 119 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
121 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 120 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
122 c->x86_phys_bits, c->x86_virt_bits); 121 c->x86_phys_bits, c->x86_virt_bits);
123#endif
124 122
125 seq_printf(m, "power management:"); 123 seq_printf(m, "power management:");
126 for (i = 0; i < 32; i++) { 124 for (i = 0; i < 32; i++) {
@@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
128 if (i < ARRAY_SIZE(x86_power_flags) && 126 if (i < ARRAY_SIZE(x86_power_flags) &&
129 x86_power_flags[i]) 127 x86_power_flags[i])
130 seq_printf(m, "%s%s", 128 seq_printf(m, "%s%s",
131 x86_power_flags[i][0]?" ":"", 129 x86_power_flags[i][0] ? " " : "",
132 x86_power_flags[i]); 130 x86_power_flags[i]);
133 else 131 else
134 seq_printf(m, " [%d]", i); 132 seq_printf(m, " [%d]", i);
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 284c399e3234..1cbed97b59cf 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,6 +24,7 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <asm/div64.h> 25#include <asm/div64.h>
26#include <asm/vmware.h> 26#include <asm/vmware.h>
27#include <asm/x86_init.h>
27 28
28#define CPUID_VMWARE_INFO_LEAF 0x40000000 29#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 30#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -47,19 +48,33 @@ static inline int __vmware_platform(void)
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; 48 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48} 49}
49 50
50static unsigned long __vmware_get_tsc_khz(void) 51static unsigned long vmware_get_tsc_khz(void)
51{ 52{
52 uint64_t tsc_hz; 53 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx; 54 uint32_t eax, ebx, ecx, edx;
55
56 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
57
58 tsc_hz = eax | (((uint64_t)ebx) << 32);
59 do_div(tsc_hz, 1000);
60 BUG_ON(tsc_hz >> 32);
61 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
62 (unsigned long) tsc_hz / 1000,
63 (unsigned long) tsc_hz % 1000);
64 return tsc_hz;
65}
66
67void __init vmware_platform_setup(void)
68{
69 uint32_t eax, ebx, ecx, edx;
54 70
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 71 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56 72
57 if (ebx == UINT_MAX) 73 if (ebx != UINT_MAX)
58 return 0; 74 x86_platform.calibrate_tsc = vmware_get_tsc_khz;
59 tsc_hz = eax | (((uint64_t)ebx) << 32); 75 else
60 do_div(tsc_hz, 1000); 76 printk(KERN_WARNING
61 BUG_ON(tsc_hz >> 32); 77 "Failed to get TSC freq from the hypervisor\n");
62 return tsc_hz;
63} 78}
64 79
65/* 80/*
@@ -87,12 +102,6 @@ int vmware_platform(void)
87 return 0; 102 return 0;
88} 103}
89 104
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/* 105/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 106 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can 107 * Still, due to timing difference when running on virtual cpus, the TSC can
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..6a52d4b36a30 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev) 185static char *cpuid_devnode(struct device *dev, mode_t *mode)
186{ 186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188} 188}
@@ -203,7 +203,7 @@ static int __init cpuid_init(void)
203 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
204 goto out_chrdev; 204 goto out_chrdev;
205 } 205 }
206 cpuid_class->nodename = cpuid_nodename; 206 cpuid_class->devnode = cpuid_devnode;
207 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
208 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
209 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index b4f14c6c09d9..37250fe490b1 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -27,9 +27,7 @@ static void doublefault_fn(void)
27 27
28 if (ptr_ok(gdt)) { 28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3; 29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2); 30 tss = get_desc_base((struct desc_struct *)gdt);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss); 31 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34 32
35 if (ptr_ok(tss)) { 33 if (ptr_ok(tss)) {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 48bfe1386038..ef42a038f1a6 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -509,15 +509,15 @@ enum bts_field {
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask) 509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510}; 510};
511 511
512static inline unsigned long bts_get(const char *base, enum bts_field field) 512static inline unsigned long bts_get(const char *base, unsigned long field)
513{ 513{
514 base += (ds_cfg.sizeof_ptr_field * field); 514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base; 515 return *(unsigned long *)base;
516} 516}
517 517
518static inline void bts_set(char *base, enum bts_field field, unsigned long val) 518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{ 519{
520 base += (ds_cfg.sizeof_ptr_field * field);; 520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val; 521 (*(unsigned long *)base) = val;
522} 522}
523 523
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c8405718a4c3..2d8a371d4339 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -15,7 +15,6 @@
15#include <linux/bug.h> 15#include <linux/bug.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/ftrace.h>
19 18
20#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
21 20
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..f7dd2a7c3bf4 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..a071e6be177e 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5cb5725b2bae..85419bb7d4ab 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
115{ 115{
116 int x = e820x->nr_map; 116 int x = e820x->nr_map;
117 117
118 if (x == ARRAY_SIZE(e820x->map)) { 118 if (x >= ARRAY_SIZE(e820x->map)) {
119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
120 return; 120 return;
121 } 121 }
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)
1331 struct resource *res; 1331 struct resource *res;
1332 u64 end; 1332 u64 end;
1333 1333
1334 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1334 res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
1335 e820_res = res; 1335 e820_res = res;
1336 for (i = 0; i < e820.nr_map; i++) { 1336 for (i = 0; i < e820.nr_map; i++) {
1337 end = e820.map[i].addr + e820.map[i].size - 1; 1337 end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)
1455 return who; 1455 return who;
1456} 1456}
1457 1457
1458char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1459{
1460 if (x86_quirks->arch_memory_setup) {
1461 char *who = x86_quirks->arch_memory_setup();
1462
1463 if (who)
1464 return who;
1465 }
1466 return default_machine_specific_memory_setup();
1467}
1468
1469/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1470char * __init __attribute__((weak)) memory_setup(void)
1471{
1472 return machine_specific_memory_setup();
1473}
1474
1475void __init setup_memory_map(void) 1458void __init setup_memory_map(void)
1476{ 1459{
1477 char *who; 1460 char *who;
1478 1461
1479 who = memory_setup(); 1462 who = x86_init.resources.memory_setup();
1480 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1463 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1481 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1464 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1482 e820_print_map(who); 1465 e820_print_map(who);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..41fd965c80c6 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
160 .index = -1, 160 .index = -1,
161}; 161};
162 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void __init dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void __init dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int request, int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int __init ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happend */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int __init ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void __init default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565
566static void __init nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
878/* Direct interface for emergencies */ 163/* Direct interface for emergencies */
879static struct console *early_console = &early_vga_console; 164static struct console *early_console = &early_vga_console;
880static int __initdata early_console_initialized; 165static int __initdata early_console_initialized;
@@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...)
891 va_end(ap); 176 va_end(ap);
892} 177}
893 178
179static inline void early_console_register(struct console *con, int keep_early)
180{
181 if (early_console->index != -1) {
182 printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
183 con->name);
184 return;
185 }
186 early_console = con;
187 if (keep_early)
188 early_console->flags &= ~CON_BOOT;
189 else
190 early_console->flags |= CON_BOOT;
191 register_console(early_console);
192}
894 193
895static int __init setup_early_printk(char *buf) 194static int __init setup_early_printk(char *buf)
896{ 195{
897 int keep_early; 196 int keep;
898 197
899 if (!buf) 198 if (!buf)
900 return 0; 199 return 0;
@@ -903,42 +202,34 @@ static int __init setup_early_printk(char *buf)
903 return 0; 202 return 0;
904 early_console_initialized = 1; 203 early_console_initialized = 1;
905 204
906 keep_early = (strstr(buf, "keep") != NULL); 205 keep = (strstr(buf, "keep") != NULL);
907 206
908 if (!strncmp(buf, "serial", 6)) { 207 while (*buf != '\0') {
909 early_serial_init(buf + 6); 208 if (!strncmp(buf, "serial", 6)) {
910 early_console = &early_serial_console; 209 early_serial_init(buf + 6);
911 } else if (!strncmp(buf, "ttyS", 4)) { 210 early_console_register(&early_serial_console, keep);
912 early_serial_init(buf); 211 }
913 early_console = &early_serial_console; 212 if (!strncmp(buf, "ttyS", 4)) {
914 } else if (!strncmp(buf, "vga", 3) 213 early_serial_init(buf + 4);
915 && boot_params.screen_info.orig_video_isVGA == 1) { 214 early_console_register(&early_serial_console, keep);
916 max_xpos = boot_params.screen_info.orig_video_cols; 215 }
917 max_ypos = boot_params.screen_info.orig_video_lines; 216 if (!strncmp(buf, "vga", 3) &&
918 current_ypos = boot_params.screen_info.orig_y; 217 boot_params.screen_info.orig_video_isVGA == 1) {
919 early_console = &early_vga_console; 218 max_xpos = boot_params.screen_info.orig_video_cols;
219 max_ypos = boot_params.screen_info.orig_video_lines;
220 current_ypos = boot_params.screen_info.orig_y;
221 early_console_register(&early_vga_console, keep);
222 }
920#ifdef CONFIG_EARLY_PRINTK_DBGP 223#ifdef CONFIG_EARLY_PRINTK_DBGP
921 } else if (!strncmp(buf, "dbgp", 4)) { 224 if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
922 if (early_dbgp_init(buf+4) < 0) 225 early_console_register(&early_dbgp_console, keep);
923 return 0;
924 early_console = &early_dbgp_console;
925 /*
926 * usb subsys will reset ehci controller, so don't keep
927 * that early console
928 */
929 keep_early = 0;
930#endif 226#endif
931#ifdef CONFIG_HVC_XEN 227#ifdef CONFIG_HVC_XEN
932 } else if (!strncmp(buf, "xen", 3)) { 228 if (!strncmp(buf, "xen", 3))
933 early_console = &xenboot_console; 229 early_console_register(&xenboot_console, keep);
934#endif 230#endif
231 buf++;
935 } 232 }
936
937 if (keep_early)
938 early_console->flags &= ~CON_BOOT;
939 else
940 early_console->flags |= CON_BOOT;
941 register_console(early_console);
942 return 0; 233 return 0;
943} 234}
944 235
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index fe26ba3e3451..ad5bd988fb79 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -42,6 +42,7 @@
42#include <asm/time.h> 42#include <asm/time.h>
43#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
45#include <asm/x86_init.h>
45 46
46#define EFI_DEBUG 1 47#define EFI_DEBUG 1
47#define PFX "EFI: " 48#define PFX "EFI: "
@@ -453,6 +454,9 @@ void __init efi_init(void)
453 if (add_efi_memmap) 454 if (add_efi_memmap)
454 do_add_efi_memmap(); 455 do_add_efi_memmap();
455 456
457 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss;
459
456 /* Setup for EFI runtime service */ 460 /* Setup for EFI runtime service */
457 reboot_type = BOOT_EFI; 461 reboot_type = BOOT_EFI;
458 462
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be745107..b5c061f8f358 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
146END(ftrace_graph_caller) 146END(ftrace_graph_caller)
147 147
148GLOBAL(return_to_handler) 148GLOBAL(return_to_handler)
149 subq $80, %rsp 149 subq $24, %rsp
150 150
151 /* Save the return values */ 151 /* Save the return values */
152 movq %rax, (%rsp) 152 movq %rax, (%rsp)
@@ -155,10 +155,10 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 72(%rsp) 158 movq %rax, 16(%rsp)
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $72, %rsp 161 addq $16, %rsp
162 retq 162 retq
163#endif 163#endif
164 164
@@ -536,20 +536,13 @@ sysret_signal:
536 bt $TIF_SYSCALL_AUDIT,%edx 536 bt $TIF_SYSCALL_AUDIT,%edx
537 jc sysret_audit 537 jc sysret_audit
538#endif 538#endif
539 /* edx: work flags (arg3) */ 539 /*
540 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 540 * We have a signal, or exit tracing or single-step.
541 xorl %esi,%esi # oldset -> arg2 541 * These all wind up with the iret return path anyway,
542 SAVE_REST 542 * so just join that path right now.
543 FIXUP_TOP_OF_STACK %r11 543 */
544 call do_notify_resume 544 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
545 RESTORE_TOP_OF_STACK %r11 545 jmp int_check_syscall_exit_work
546 RESTORE_REST
547 movl $_TIF_WORK_MASK,%edi
548 /* Use IRET because user could have changed frame. This
549 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
550 DISABLE_INTERRUPTS(CLBR_NONE)
551 TRACE_IRQS_OFF
552 jmp int_with_check
553 546
554badsys: 547badsys:
555 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 548 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -654,6 +647,7 @@ int_careful:
654int_very_careful: 647int_very_careful:
655 TRACE_IRQS_ON 648 TRACE_IRQS_ON
656 ENABLE_INTERRUPTS(CLBR_NONE) 649 ENABLE_INTERRUPTS(CLBR_NONE)
650int_check_syscall_exit_work:
657 SAVE_REST 651 SAVE_REST
658 /* Check for syscall exit trace */ 652 /* Check for syscall exit trace */
659 testl $_TIF_WORK_SYSCALL_EXIT,%edx 653 testl $_TIF_WORK_SYSCALL_EXIT,%edx
@@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \
1021apicinterrupt SPURIOUS_APIC_VECTOR \ 1015apicinterrupt SPURIOUS_APIC_VECTOR \
1022 spurious_interrupt smp_spurious_interrupt 1016 spurious_interrupt smp_spurious_interrupt
1023 1017
1024#ifdef CONFIG_PERF_COUNTERS 1018#ifdef CONFIG_PERF_EVENTS
1025apicinterrupt LOCAL_PENDING_VECTOR \ 1019apicinterrupt LOCAL_PENDING_VECTOR \
1026 perf_pending_interrupt smp_perf_pending_interrupt 1020 perf_pending_interrupt smp_perf_pending_interrupt
1027#endif 1021#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
417 unsigned long return_hooker = (unsigned long) 417 unsigned long return_hooker = (unsigned long)
418 &return_to_handler; 418 &return_to_handler;
419 419
420 /* Nmi's are currently unsupported */
421 if (unlikely(in_nmi()))
422 return;
423
424 if (unlikely(atomic_read(&current->tracing_graph_pause))) 420 if (unlikely(atomic_read(&current->tracing_graph_pause)))
425 return; 421 return;
426 422
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
498 494
499struct syscall_metadata *syscall_nr_to_meta(int nr) 495struct syscall_metadata *syscall_nr_to_meta(int nr)
500{ 496{
501 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) 497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
502 return NULL; 498 return NULL;
503 499
504 return syscalls_metadata[nr]; 500 return syscalls_metadata[nr];
505} 501}
506 502
507void arch_init_ftrace_syscalls(void) 503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
508{ 530{
509 int i; 531 int i;
510 struct syscall_metadata *meta; 532 struct syscall_metadata *meta;
511 unsigned long **psys_syscall_table = &sys_call_table; 533 unsigned long **psys_syscall_table = &sys_call_table;
512 static atomic_t refs;
513
514 if (atomic_inc_return(&refs) != 1)
515 goto end;
516 534
517 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
518 FTRACE_SYSCALL_MAX, GFP_KERNEL); 536 NR_syscalls, GFP_KERNEL);
519 if (!syscalls_metadata) { 537 if (!syscalls_metadata) {
520 WARN_ON(1); 538 WARN_ON(1);
521 return; 539 return -ENOMEM;
522 } 540 }
523 541
524 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { 542 for (i = 0; i < NR_syscalls; i++) {
525 meta = find_syscall_meta(psys_syscall_table[i]); 543 meta = find_syscall_meta(psys_syscall_table[i]);
526 syscalls_metadata[i] = meta; 544 syscalls_metadata[i] = meta;
527 } 545 }
528 return; 546 return 0;
529
530 /* Paranoid: avoid overflow */
531end:
532 atomic_dec(&refs);
533} 547}
548arch_initcall(arch_init_ftrace_syscalls);
534#endif 549#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42c..4f8e2507e8f3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -11,8 +11,21 @@
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/page.h>
15#include <asm/trampoline.h> 15#include <asm/trampoline.h>
16#include <asm/apic.h>
17#include <asm/io_apic.h>
18#include <asm/bios_ebda.h>
19
20static void __init i386_default_early_setup(void)
21{
22 /* Initilize 32bit specific setup functions */
23 x86_init.resources.probe_roms = probe_roms;
24 x86_init.resources.reserve_resources = i386_reserve_resources;
25 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
26
27 reserve_ebda_region();
28}
16 29
17void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
18{ 31{
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 42 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 43 }
31#endif 44#endif
32 reserve_ebda_region(); 45
46 /* Call the subarch specific early setup function */
47 switch (boot_params.hdr.hardware_subarch) {
48 case X86_SUBARCH_MRST:
49 x86_mrst_early_setup();
50 break;
51 default:
52 i386_default_early_setup();
53 break;
54 }
33 55
34 /* 56 /*
35 * At this point everything still needed from the boot loader 57 * At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..0b06cd778fd9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -23,8 +23,8 @@
23#include <asm/sections.h> 23#include <asm/sections.h>
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 26#include <asm/trampoline.h>
27#include <asm/bios_ebda.h>
28 28
29static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
30{ 30{
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index cc827ac9e8d3..050c278481b1 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
79 * any particular GDT layout, because we load our own as soon as we 79 * any particular GDT layout, because we load our own as soon as we
80 * can. 80 * can.
81 */ 81 */
82.section .text.head,"ax",@progbits 82__HEAD
83ENTRY(startup_32) 83ENTRY(startup_32)
84 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 84 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
85 us to not reload segments */ 85 us to not reload segments */
@@ -157,6 +157,7 @@ subarch_entries:
157 .long default_entry /* normal x86/PC */ 157 .long default_entry /* normal x86/PC */
158 .long lguest_entry /* lguest hypervisor */ 158 .long lguest_entry /* lguest hypervisor */
159 .long xen_entry /* Xen hypervisor */ 159 .long xen_entry /* Xen hypervisor */
160 .long default_entry /* Moorestown MID */
160num_subarch_entries = (. - subarch_entries) / 4 161num_subarch_entries = (. - subarch_entries) / 4
161.previous 162.previous
162#endif /* CONFIG_PARAVIRT */ 163#endif /* CONFIG_PARAVIRT */
@@ -439,7 +440,6 @@ is386: movl $2,%ecx # set MP
439 jne 1f 440 jne 1f
440 movl $per_cpu__gdt_page,%eax 441 movl $per_cpu__gdt_page,%eax
441 movl $per_cpu__stack_canary,%ecx 442 movl $per_cpu__stack_canary,%ecx
442 subl $20, %ecx
443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
444 shrl $16, %ecx 444 shrl $16, %ecx
445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -608,7 +608,7 @@ ENTRY(initial_code)
608/* 608/*
609 * BSS section 609 * BSS section
610 */ 610 */
611.section ".bss.page_aligned","wa" 611__PAGE_ALIGNED_BSS
612 .align PAGE_SIZE_asm 612 .align PAGE_SIZE_asm
613#ifdef CONFIG_X86_PAE 613#ifdef CONFIG_X86_PAE
614swapper_pg_pmd: 614swapper_pg_pmd:
@@ -626,7 +626,7 @@ ENTRY(empty_zero_page)
626 * This starts the data section. 626 * This starts the data section.
627 */ 627 */
628#ifdef CONFIG_X86_PAE 628#ifdef CONFIG_X86_PAE
629.section ".data.page_aligned","wa" 629__PAGE_ALIGNED_DATA
630 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
631 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
632ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a05..780cd928fcd5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map)
40L3_START_KERNEL = pud_index(__START_KERNEL_map) 40L3_START_KERNEL = pud_index(__START_KERNEL_map)
41 41
42 .text 42 .text
43 .section .text.head 43 __HEAD
44 .code64 44 .code64
45 .globl startup_64 45 .globl startup_64
46startup_64: 46startup_64:
@@ -418,7 +418,7 @@ ENTRY(phys_base)
418ENTRY(idt_table) 418ENTRY(idt_table)
419 .skip IDT_ENTRIES * 16 419 .skip IDT_ENTRIES * 16
420 420
421 .section .bss.page_aligned, "aw", @nobits 421 __PAGE_ALIGNED_BSS
422 .align PAGE_SIZE 422 .align PAGE_SIZE
423ENTRY(empty_zero_page) 423ENTRY(empty_zero_page)
424 .skip PAGE_SIZE 424 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 43cec6bdda63..1736c5a725aa 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -10,6 +10,14 @@
10EXPORT_SYMBOL(mcount); 10EXPORT_SYMBOL(mcount);
11#endif 11#endif
12 12
13/*
14 * Note, this is a prototype to get at the symbol for
15 * the export, but dont use it from C code, it is used
16 * by assembly code and is not using C calling convention!
17 */
18extern void cmpxchg8b_emu(void);
19EXPORT_SYMBOL(cmpxchg8b_emu);
20
13/* Networking helper routines. */ 21/* Networking helper routines. */
14EXPORT_SYMBOL(csum_partial_copy_generic); 22EXPORT_SYMBOL(csum_partial_copy_generic);
15 23
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac4..23c167925a5c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -19,12 +19,6 @@
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22#ifdef CONFIG_X86_32
23static void pit_disable_clocksource(void);
24#else
25static inline void pit_disable_clocksource(void) { }
26#endif
27
28/* 22/*
29 * HPET replaces the PIT, when enabled. So we need to know, which of 23 * HPET replaces the PIT, when enabled. So we need to know, which of
30 * the two timers is used 24 * the two timers is used
@@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,
57 outb_pit(0, PIT_CH0); 51 outb_pit(0, PIT_CH0);
58 outb_pit(0, PIT_CH0); 52 outb_pit(0, PIT_CH0);
59 } 53 }
60 pit_disable_clocksource();
61 break; 54 break;
62 55
63 case CLOCK_EVT_MODE_ONESHOT: 56 case CLOCK_EVT_MODE_ONESHOT:
64 /* One shot setup */ 57 /* One shot setup */
65 pit_disable_clocksource();
66 outb_pit(0x38, PIT_MODE); 58 outb_pit(0x38, PIT_MODE);
67 break; 59 break;
68 60
@@ -200,17 +192,6 @@ static struct clocksource pit_cs = {
200 .shift = 20, 192 .shift = 20,
201}; 193};
202 194
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (pit_cs.mult) {
209 clocksource_unregister(&pit_cs);
210 pit_cs.mult = 0;
211 }
212}
213
214static int __init init_pit_clocksource(void) 195static int __init init_pit_clocksource(void)
215{ 196{
216 /* 197 /*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc11..3a54dcb9cd0e 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
20 * way process stacks are handled. This is done by having a special 20 * way process stacks are handled. This is done by having a special
21 * "init_task" linker map entry.. 21 * "init_task" linker map entry..
22 */ 22 */
23union thread_union init_thread_union 23union thread_union init_thread_union __init_task_data =
24 __attribute__((__section__(".data.init_task"))) = 24 { INIT_THREAD_INFO(init_task) };
25 { INIT_THREAD_INFO(init_task) };
26 25
27/* 26/*
28 * Initial task structure. 27 * Initial task structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..74656d1d4e30 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
104 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
105# endif 105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j) 109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); 110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
200 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif 201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu); 205 sum += per_cpu(mce_poll_count, cpu);
206#endif 206#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 3b09634a5153..7d35d0fe2329 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
218void fixup_irqs(void) 218void fixup_irqs(void)
219{ 219{
220 unsigned int irq; 220 unsigned int irq;
221 static int warned;
222 struct irq_desc *desc; 221 struct irq_desc *desc;
223 222
224 for_each_irq_desc(irq, desc) { 223 for_each_irq_desc(irq, desc) {
@@ -236,8 +235,8 @@ void fixup_irqs(void)
236 } 235 }
237 if (desc->chip->set_affinity) 236 if (desc->chip->set_affinity)
238 desc->chip->set_affinity(irq, affinity); 237 desc->chip->set_affinity(irq, affinity);
239 else if (desc->action && !(warned++)) 238 else if (desc->action)
240 printk("Cannot set affinity for irq %i\n", irq); 239 printk_once("Cannot set affinity for irq %i\n", irq);
241 } 240 }
242 241
243#if 0 242#if 0
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..40f30773fb29 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
116 return 0; 116 return 0;
117} 117}
118 118
119static void __init init_ISA_irqs(void) 119void __init init_ISA_irqs(void)
120{ 120{
121 int i; 121 int i;
122 122
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)
140 } 140 }
141} 141}
142 142
143/* Overridden in paravirt.c */ 143void __init init_IRQ(void)
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 144{
145 x86_init.irqs.intr_init();
146}
145 147
146static void __init smp_intr_init(void) 148static void __init smp_intr_init(void)
147{ 149{
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void)
190#ifdef CONFIG_X86_MCE_THRESHOLD 192#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 193 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 194#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 195#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); 196 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif 197#endif
196 198
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void)
206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 208 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207 209
208 /* Performance monitoring interrupts: */ 210 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS 211# ifdef CONFIG_PERF_EVENTS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 212 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif 213# endif
212 214
213#endif 215#endif
214} 216}
215 217
216/**
217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void) 218void __init native_init_IRQ(void)
237{ 219{
238 int i; 220 int i;
239 221
240 /* Execute any quirks before the call gates are initialised: */ 222 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init(); 223 x86_init.irqs.pre_vector_init();
242 224
243 apic_intr_init(); 225 apic_intr_init();
244 226
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void)
258 240
259#ifdef CONFIG_X86_32 241#ifdef CONFIG_X86_32
260 /* 242 /*
261 * Call quirks after call gates are initialised (usually add in
262 * the architecture specific gates):
263 */
264 x86_quirk_intr_init();
265
266 /*
267 * External FPU? Set up irq13 if so, for 243 * External FPU? Set up irq13 if so, for
268 * original braindamaged IBM FERR coupling. 244 * original braindamaged IBM FERR coupling.
269 */ 245 */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f613..63b0ec8d3d4a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
34struct kvm_para_state { 34struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 35 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 36 int mmu_queue_len;
37 enum paravirt_lazy_mode mode;
38}; 37};
39 38
40static DEFINE_PER_CPU(struct kvm_para_state, para_state); 39static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
77{ 76{
78 struct kvm_para_state *state = kvm_para_state(); 77 struct kvm_para_state *state = kvm_para_state();
79 78
80 if (state->mode != PARAVIRT_LAZY_MMU) { 79 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
81 kvm_mmu_op(buffer, len); 80 kvm_mmu_op(buffer, len);
82 return; 81 return;
83 } 82 }
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
185 184
186static void kvm_enter_lazy_mmu(void) 185static void kvm_enter_lazy_mmu(void)
187{ 186{
188 struct kvm_para_state *state = kvm_para_state();
189
190 paravirt_enter_lazy_mmu(); 187 paravirt_enter_lazy_mmu();
191 state->mode = paravirt_get_lazy_mode();
192} 188}
193 189
194static void kvm_leave_lazy_mmu(void) 190static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
197 193
198 mmu_queue_flush(state); 194 mmu_queue_flush(state);
199 paravirt_leave_lazy_mmu(); 195 paravirt_leave_lazy_mmu();
200 state->mode = paravirt_get_lazy_mode();
201} 196}
202 197
203static void __init paravirt_ops_setup(void) 198static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f1526..feaeb0d3aa4f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25
26#include <asm/x86_init.h>
25#include <asm/reboot.h> 27#include <asm/reboot.h>
26 28
27#define KVM_SCALE 22 29#define KVM_SCALE 22
@@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void)
50 struct timespec ts; 52 struct timespec ts;
51 int low, high; 53 int low, high;
52 54
53 low = (int)__pa(&wall_clock); 55 low = (int)__pa_symbol(&wall_clock);
54 high = ((u64)__pa(&wall_clock) >> 32); 56 high = ((u64)__pa_symbol(&wall_clock) >> 32);
55 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
56 58
57 vcpu_time = &get_cpu_var(hv_clock); 59 vcpu_time = &get_cpu_var(hv_clock);
@@ -182,12 +184,13 @@ void __init kvmclock_init(void)
182 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
183 if (kvm_register_clock("boot clock")) 185 if (kvm_register_clock("boot clock"))
184 return; 186 return;
185 pv_time_ops.get_wallclock = kvm_get_wallclock;
186 pv_time_ops.set_wallclock = kvm_set_wallclock;
187 pv_time_ops.sched_clock = kvm_clock_read; 187 pv_time_ops.sched_clock = kvm_clock_read;
188 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; 188 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
189 x86_platform.get_wallclock = kvm_get_wallclock;
190 x86_platform.set_wallclock = kvm_set_wallclock;
189#ifdef CONFIG_X86_LOCAL_APIC 191#ifdef CONFIG_X86_LOCAL_APIC
190 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 192 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock;
191#endif 194#endif
192#ifdef CONFIG_SMP 195#ifdef CONFIG_SMP
193 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635d..ec6ef60cbd17 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
67#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
68 preempt_disable(); 68 preempt_disable();
69 load_LDT(pc); 69 load_LDT(pc);
70 if (!cpus_equal(current->mm->cpu_vm_mask, 70 if (!cpumask_equal(mm_cpumask(current->mm),
71 cpumask_of_cpu(smp_processor_id()))) 71 cpumask_of(smp_processor_id())))
72 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
73 preempt_enable(); 73 preempt_enable();
74#else 74#else
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290ac..378e9a8f1bf8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
210{ 210{
211 ssize_t ret = -EINVAL; 211 ssize_t ret = -EINVAL;
212 212
213 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages); 214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
215 return ret; 215 return ret;
216 } 216 }
217 217
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {
236static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
237 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
238 .name = "microcode", 238 .name = "microcode",
239 .devnode = "cpu/microcode", 239 .nodename = "cpu/microcode",
240 .fops = &microcode_fops, 240 .fops = &microcode_fops,
241}; 241};
242 242
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 651c93b28862..5be95ef4ffec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
45 return sum & 0xFF; 45 return sum & 0xFF;
46} 46}
47 47
48int __init default_mpc_apic_id(struct mpc_cpu *m)
49{
50 return m->apicid;
51}
52
48static void __init MP_processor_info(struct mpc_cpu *m) 53static void __init MP_processor_info(struct mpc_cpu *m)
49{ 54{
50 int apicid; 55 int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
55 return; 60 return;
56 } 61 }
57 62
58 if (x86_quirks->mpc_apic_id) 63 apicid = x86_init.mpparse.mpc_apic_id(m);
59 apicid = x86_quirks->mpc_apic_id(m);
60 else
61 apicid = m->apicid;
62 64
63 if (m->cpuflag & CPU_BOOTPROCESSOR) { 65 if (m->cpuflag & CPU_BOOTPROCESSOR) {
64 bootup_cpu = " (Bootup-CPU)"; 66 bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
70} 72}
71 73
72#ifdef CONFIG_X86_IO_APIC 74#ifdef CONFIG_X86_IO_APIC
73static void __init MP_bus_info(struct mpc_bus *m) 75void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
74{ 76{
75 char str[7];
76 memcpy(str, m->bustype, 6); 77 memcpy(str, m->bustype, 6);
77 str[6] = 0; 78 str[6] = 0;
79 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
80}
78 81
79 if (x86_quirks->mpc_oem_bus_info) 82static void __init MP_bus_info(struct mpc_bus *m)
80 x86_quirks->mpc_oem_bus_info(m, str); 83{
81 else 84 char str[7];
82 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); 85
86 x86_init.mpparse.mpc_oem_bus_info(m, str);
83 87
84#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
85 if (m->busid >= MAX_MP_BUSSES) { 89 if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
96 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
97#endif 101#endif
98 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
99 if (x86_quirks->mpc_oem_pci_bus) 103 if (x86_init.mpparse.mpc_oem_pci_bus)
100 x86_quirks->mpc_oem_pci_bus(m); 104 x86_init.mpparse.mpc_oem_pci_bus(m);
101 105
102 clear_bit(m->busid, mp_bus_not_pci); 106 clear_bit(m->busid, mp_bus_not_pci);
103#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
291 1, mpc, mpc->length, 1); 295 1, mpc, mpc->length, 1);
292} 296}
293 297
298void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
299
294static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 300static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295{ 301{
296 char str[16]; 302 char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
312 if (early) 318 if (early)
313 return 1; 319 return 1;
314 320
315 if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { 321 if (mpc->oemptr)
316 struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; 322 x86_init.mpparse.smp_read_mpc_oem(mpc);
317 x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
318 }
319 323
320 /* 324 /*
321 * Now process the configuration blocks. 325 * Now process the configuration blocks.
322 */ 326 */
323 if (x86_quirks->mpc_record) 327 x86_init.mpparse.mpc_record(0);
324 *x86_quirks->mpc_record = 0;
325 328
326 while (count < mpc->length) { 329 while (count < mpc->length) {
327 switch (*mpt) { 330 switch (*mpt) {
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
353 count = mpc->length; 356 count = mpc->length;
354 break; 357 break;
355 } 358 }
356 if (x86_quirks->mpc_record) 359 x86_init.mpparse.mpc_record(1);
357 (*x86_quirks->mpc_record)++;
358 } 360 }
359 361
360#ifdef CONFIG_X86_BIGSMP 362#ifdef CONFIG_X86_BIGSMP
@@ -482,11 +484,11 @@ static void __init construct_ioapic_table(int mpc_default_type)
482 MP_bus_info(&bus); 484 MP_bus_info(&bus);
483 } 485 }
484 486
485 ioapic.type = MP_IOAPIC; 487 ioapic.type = MP_IOAPIC;
486 ioapic.apicid = 2; 488 ioapic.apicid = 2;
487 ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; 489 ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
488 ioapic.flags = MPC_APIC_USABLE; 490 ioapic.flags = MPC_APIC_USABLE;
489 ioapic.apicaddr = 0xFEC00000; 491 ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
490 MP_ioapic_info(&ioapic); 492 MP_ioapic_info(&ioapic);
491 493
492 /* 494 /*
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
608/* 610/*
609 * Scan the memory blocks for an SMP configuration block. 611 * Scan the memory blocks for an SMP configuration block.
610 */ 612 */
611static void __init __get_smp_config(unsigned int early) 613void __init default_get_smp_config(unsigned int early)
612{ 614{
613 struct mpf_intel *mpf = mpf_found; 615 struct mpf_intel *mpf = mpf_found;
614 616
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)
625 if (acpi_lapic && acpi_ioapic) 627 if (acpi_lapic && acpi_ioapic)
626 return; 628 return;
627 629
628 if (x86_quirks->mach_get_smp_config) {
629 if (x86_quirks->mach_get_smp_config(early))
630 return;
631 }
632
633 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 630 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
634 mpf->specification); 631 mpf->specification);
635#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 632#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)
670 */ 667 */
671} 668}
672 669
673void __init early_get_smp_config(void)
674{
675 __get_smp_config(1);
676}
677
678void __init get_smp_config(void)
679{
680 __get_smp_config(0);
681}
682
683static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
684{ 671{
685 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
745 return 0; 732 return 0;
746} 733}
747 734
748static void __init __find_smp_config(unsigned int reserve) 735void __init default_find_smp_config(unsigned int reserve)
749{ 736{
750 unsigned int address; 737 unsigned int address;
751 738
752 if (x86_quirks->mach_find_smp_config) {
753 if (x86_quirks->mach_find_smp_config(reserve))
754 return;
755 }
756 /* 739 /*
757 * FIXME: Linux assumes you have 640K of base ram.. 740 * FIXME: Linux assumes you have 640K of base ram..
758 * this continues the error... 741 * this continues the error...
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve)
787 smp_scan_config(address, 0x400, reserve); 770 smp_scan_config(address, 0x400, reserve);
788} 771}
789 772
790void __init early_find_smp_config(void)
791{
792 __find_smp_config(0);
793}
794
795void __init find_smp_config(void)
796{
797 __find_smp_config(1);
798}
799
800#ifdef CONFIG_X86_IO_APIC 773#ifdef CONFIG_X86_IO_APIC
801static u8 __initdata irq_used[MAX_IRQ_SOURCES]; 774static u8 __initdata irq_used[MAX_IRQ_SOURCES];
802 775
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
new file mode 100644
index 000000000000..3b7078abc871
--- /dev/null
+++ b/arch/x86/kernel/mrst.c
@@ -0,0 +1,24 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16/*
17 * Moorestown specific x86_init function overrides and early setup
18 * calls.
19 */
20void __init x86_mrst_early_setup(void)
21{
22 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop;
24}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 98fd6cd4e3a4..6a3cefc7dda1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,6 +1,7 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * Copyright 2009 Intel Corporation; author: H. Peter Anvin
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf,
80 81
81 for (; count; count -= 8) { 82 for (; count; count -= 8) {
82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 83 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
83 if (err) { 84 if (err)
84 if (err == -EFAULT) /* Fix idiotic error code */
85 err = -EIO;
86 break; 85 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) { 86 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT; 87 err = -EFAULT;
90 break; 88 break;
@@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
115 break; 113 break;
116 } 114 }
117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 115 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
118 if (err) { 116 if (err)
119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break; 117 break;
122 }
123 tmp += 2; 118 tmp += 2;
124 bytes += 8; 119 bytes += 8;
125 } 120 }
@@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
127 return bytes ? bytes : err; 122 return bytes ? bytes : err;
128} 123}
129 124
125static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
126{
127 u32 __user *uregs = (u32 __user *)arg;
128 u32 regs[8];
129 int cpu = iminor(file->f_path.dentry->d_inode);
130 int err;
131
132 switch (ioc) {
133 case X86_IOC_RDMSR_REGS:
134 if (!(file->f_mode & FMODE_READ)) {
135 err = -EBADF;
136 break;
137 }
138 if (copy_from_user(&regs, uregs, sizeof regs)) {
139 err = -EFAULT;
140 break;
141 }
142 err = rdmsr_safe_regs_on_cpu(cpu, regs);
143 if (err)
144 break;
145 if (copy_to_user(uregs, &regs, sizeof regs))
146 err = -EFAULT;
147 break;
148
149 case X86_IOC_WRMSR_REGS:
150 if (!(file->f_mode & FMODE_WRITE)) {
151 err = -EBADF;
152 break;
153 }
154 if (copy_from_user(&regs, uregs, sizeof regs)) {
155 err = -EFAULT;
156 break;
157 }
158 err = wrmsr_safe_regs_on_cpu(cpu, regs);
159 if (err)
160 break;
161 if (copy_to_user(uregs, &regs, sizeof regs))
162 err = -EFAULT;
163 break;
164
165 default:
166 err = -ENOTTY;
167 break;
168 }
169
170 return err;
171}
172
130static int msr_open(struct inode *inode, struct file *file) 173static int msr_open(struct inode *inode, struct file *file)
131{ 174{
132 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
@@ -157,6 +200,8 @@ static const struct file_operations msr_fops = {
157 .read = msr_read, 200 .read = msr_read,
158 .write = msr_write, 201 .write = msr_write,
159 .open = msr_open, 202 .open = msr_open,
203 .unlocked_ioctl = msr_ioctl,
204 .compat_ioctl = msr_ioctl,
160}; 205};
161 206
162static int __cpuinit msr_device_create(int cpu) 207static int __cpuinit msr_device_create(int cpu)
@@ -196,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 241 .notifier_call = msr_class_cpu_callback,
197}; 242};
198 243
199static char *msr_nodename(struct device *dev) 244static char *msr_devnode(struct device *dev, mode_t *mode)
200{ 245{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 246 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202} 247}
@@ -217,7 +262,7 @@ static int __init msr_init(void)
217 err = PTR_ERR(msr_class); 262 err = PTR_ERR(msr_class);
218 goto out_chrdev; 263 goto out_chrdev;
219 } 264 }
220 msr_class->nodename = msr_nodename; 265 msr_class->devnode = msr_devnode;
221 for_each_online_cpu(i) { 266 for_each_online_cpu(i) {
222 err = msr_device_create(i); 267 err = msr_device_create(i);
223 if (err != 0) 268 if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 70ec9b951d76..1b1739d16310 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)
54 return x; 54 return x;
55} 55}
56 56
57static void __init default_banner(void) 57void __init default_banner(void)
58{ 58{
59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
60 pv_info.name); 60 pv_info.name);
61} 61}
62 62
63char *memory_setup(void)
64{
65 return pv_init_ops.memory_setup();
66}
67
68/* Simple instruction patching code. */ 63/* Simple instruction patching code. */
69#define DEF_NATIVE(ops, name, code) \ 64#define DEF_NATIVE(ops, name, code) \
70 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 65 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
188 return insn_len; 183 return insn_len;
189} 184}
190 185
191void init_IRQ(void)
192{
193 pv_irq_ops.init_IRQ();
194}
195
196static void native_flush_tlb(void) 186static void native_flush_tlb(void)
197{ 187{
198 __native_flush_tlb(); 188 __native_flush_tlb();
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);
218extern void native_usergs_sysret32(void); 208extern void native_usergs_sysret32(void);
219extern void native_usergs_sysret64(void); 209extern void native_usergs_sysret64(void);
220 210
221static int __init print_banner(void)
222{
223 pv_init_ops.banner();
224 return 0;
225}
226core_initcall(print_banner);
227
228static struct resource reserve_ioports = { 211static struct resource reserve_ioports = {
229 .start = 0, 212 .start = 0,
230 .end = IO_SPACE_LIMIT, 213 .end = IO_SPACE_LIMIT,
@@ -320,21 +303,13 @@ struct pv_info pv_info = {
320 303
321struct pv_init_ops pv_init_ops = { 304struct pv_init_ops pv_init_ops = {
322 .patch = native_patch, 305 .patch = native_patch,
323 .banner = default_banner,
324 .arch_setup = paravirt_nop,
325 .memory_setup = machine_specific_memory_setup,
326}; 306};
327 307
328struct pv_time_ops pv_time_ops = { 308struct pv_time_ops pv_time_ops = {
329 .time_init = hpet_time_init,
330 .get_wallclock = native_get_wallclock,
331 .set_wallclock = native_set_wallclock,
332 .sched_clock = native_sched_clock, 309 .sched_clock = native_sched_clock,
333 .get_tsc_khz = native_calibrate_tsc,
334}; 310};
335 311
336struct pv_irq_ops pv_irq_ops = { 312struct pv_irq_ops pv_irq_ops = {
337 .init_IRQ = native_init_IRQ,
338 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
339 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
340 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -362,8 +337,9 @@ struct pv_cpu_ops pv_cpu_ops = {
362#endif 337#endif
363 .wbinvd = native_wbinvd, 338 .wbinvd = native_wbinvd,
364 .read_msr = native_read_msr_safe, 339 .read_msr = native_read_msr_safe,
365 .read_msr_amd = native_read_msr_amd_safe, 340 .rdmsr_regs = native_rdmsr_safe_regs,
366 .write_msr = native_write_msr_safe, 341 .write_msr = native_write_msr_safe,
342 .wrmsr_regs = native_wrmsr_safe_regs,
367 .read_tsc = native_read_tsc, 343 .read_tsc = native_read_tsc,
368 .read_pmc = native_read_pmc, 344 .read_pmc = native_read_pmc,
369 .read_tscp = native_read_tscp, 345 .read_tscp = native_read_tscp,
@@ -408,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {
408 384
409struct pv_apic_ops pv_apic_ops = { 385struct pv_apic_ops pv_apic_ops = {
410#ifdef CONFIG_X86_LOCAL_APIC 386#ifdef CONFIG_X86_LOCAL_APIC
411 .setup_boot_clock = setup_boot_APIC_clock,
412 .setup_secondary_clock = setup_secondary_APIC_clock,
413 .startup_ipi_hook = paravirt_nop, 387 .startup_ipi_hook = paravirt_nop,
414#endif 388#endif
415}; 389};
@@ -423,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {
423#endif 397#endif
424 398
425struct pv_mmu_ops pv_mmu_ops = { 399struct pv_mmu_ops pv_mmu_ops = {
426#ifndef CONFIG_X86_64
427 .pagetable_setup_start = native_pagetable_setup_start,
428 .pagetable_setup_done = native_pagetable_setup_done,
429#else
430 .pagetable_setup_start = paravirt_nop,
431 .pagetable_setup_done = paravirt_nop,
432#endif
433 400
434 .read_cr2 = native_read_cr2, 401 .read_cr2 = native_read_cr2,
435 .write_cr2 = native_write_cr2, 402 .write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a041bcf506b..64b838eac18c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -3,6 +3,7 @@
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/kmemleak.h>
6 7
7#include <asm/proto.h> 8#include <asm/proto.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
@@ -32,7 +33,14 @@ int no_iommu __read_mostly;
32/* Set this to 1 if there is a HW IOMMU in the system */ 33/* Set this to 1 if there is a HW IOMMU in the system */
33int iommu_detected __read_mostly = 0; 34int iommu_detected __read_mostly = 0;
34 35
35int iommu_pass_through; 36/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA ranslation for
39 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation.
42 */
43int iommu_pass_through __read_mostly;
36 44
37dma_addr_t bad_dma_address __read_mostly = 0; 45dma_addr_t bad_dma_address __read_mostly = 0;
38EXPORT_SYMBOL(bad_dma_address); 46EXPORT_SYMBOL(bad_dma_address);
@@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void)
88 size = roundup(dma32_bootmem_size, align); 96 size = roundup(dma32_bootmem_size, align);
89 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 97 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
90 512ULL<<20); 98 512ULL<<20);
99 /*
100 * Kmemleak should not scan this block as it may not be mapped via the
101 * kernel direct mapping.
102 */
103 kmemleak_ignore(dma32_bootmem_ptr);
91 if (dma32_bootmem_ptr) 104 if (dma32_bootmem_ptr)
92 dma32_bootmem_size = size; 105 dma32_bootmem_size = size;
93 else 106 else
@@ -147,7 +160,7 @@ again:
147 return NULL; 160 return NULL;
148 161
149 addr = page_to_phys(page); 162 addr = page_to_phys(page);
150 if (!is_buffer_dma_capable(dma_mask, addr, size)) { 163 if (addr + size > dma_mask) {
151 __free_pages(page, get_order(size)); 164 __free_pages(page, get_order(size));
152 165
153 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { 166 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
@@ -212,10 +225,8 @@ static __init int iommu_setup(char *p)
212 if (!strncmp(p, "soft", 4)) 225 if (!strncmp(p, "soft", 4))
213 swiotlb = 1; 226 swiotlb = 1;
214#endif 227#endif
215 if (!strncmp(p, "pt", 2)) { 228 if (!strncmp(p, "pt", 2))
216 iommu_pass_through = 1; 229 iommu_pass_through = 1;
217 return 1;
218 }
219 230
220 gart_parse_options(p); 231 gart_parse_options(p);
221 232
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d2e56b8f48e7..98a827ee9ed7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
190static inline int 190static inline int
191need_iommu(struct device *dev, unsigned long addr, size_t size) 191need_iommu(struct device *dev, unsigned long addr, size_t size)
192{ 192{
193 return force_iommu || 193 return force_iommu || !dma_capable(dev, addr, size);
194 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
195} 194}
196 195
197static inline int 196static inline int
198nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 197nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
199{ 198{
200 return !is_buffer_dma_capable(*dev->dma_mask, addr, size); 199 return !dma_capable(dev, addr, size);
201} 200}
202 201
203/* Map a single continuous physical area into the IOMMU. 202/* Map a single continuous physical area into the IOMMU.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 71d412a09f30..a3933d4330cd 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { 17 if (hwdev && !dma_capable(hwdev, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) 18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
79 free_pages((unsigned long)vaddr, get_order(size)); 79 free_pages((unsigned long)vaddr, get_order(size));
80} 80}
81 81
82static void nommu_sync_single_for_device(struct device *dev,
83 dma_addr_t addr, size_t size,
84 enum dma_data_direction dir)
85{
86 flush_write_buffers();
87}
88
89
90static void nommu_sync_sg_for_device(struct device *dev,
91 struct scatterlist *sg, int nelems,
92 enum dma_data_direction dir)
93{
94 flush_write_buffers();
95}
96
82struct dma_map_ops nommu_dma_ops = { 97struct dma_map_ops nommu_dma_ops = {
83 .alloc_coherent = dma_generic_alloc_coherent, 98 .alloc_coherent = dma_generic_alloc_coherent,
84 .free_coherent = nommu_free_coherent, 99 .free_coherent = nommu_free_coherent,
85 .map_sg = nommu_map_sg, 100 .map_sg = nommu_map_sg,
86 .map_page = nommu_map_page, 101 .map_page = nommu_map_page,
87 .is_phys = 1, 102 .sync_single_for_device = nommu_sync_single_for_device,
103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1,
88}; 105};
89 106
90void __init no_iommu_init(void) 107void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6af96ee44200..aaa6b7839f1e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -13,31 +13,6 @@
13 13
14int swiotlb __read_mostly; 14int swiotlb __read_mostly;
15 15
16void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
37{
38 return 0;
39}
40
41static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
42 dma_addr_t *dma_handle, gfp_t flags) 17 dma_addr_t *dma_handle, gfp_t flags)
43{ 18{
@@ -71,9 +46,8 @@ void __init pci_swiotlb_init(void)
71{ 46{
72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73#ifdef CONFIG_X86_64 48#ifdef CONFIG_X86_64
74 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || 49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
75 iommu_pass_through) 50 swiotlb = 1;
76 swiotlb = 1;
77#endif 51#endif
78 if (swiotlb_force) 52 if (swiotlb_force)
79 swiotlb = 1; 53 swiotlb = 1;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a4ba83..5284cd2b5776 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <trace/power.h> 12#include <trace/events/power.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h> 15#include <asm/syscalls.h>
@@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait);
25 25
26struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
27 27
28DEFINE_TRACE(power_start);
29DEFINE_TRACE(power_end);
30
31int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 28int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
32{ 29{
33 *dst = *src; 30 *dst = *src;
@@ -299,9 +296,7 @@ static inline int hlt_use_halt(void)
299void default_idle(void) 296void default_idle(void)
300{ 297{
301 if (hlt_use_halt()) { 298 if (hlt_use_halt()) {
302 struct power_trace it; 299 trace_power_start(POWER_CSTATE, 1);
303
304 trace_power_start(&it, POWER_CSTATE, 1);
305 current_thread_info()->status &= ~TS_POLLING; 300 current_thread_info()->status &= ~TS_POLLING;
306 /* 301 /*
307 * TS_POLLING-cleared state must be visible before we 302 * TS_POLLING-cleared state must be visible before we
@@ -314,7 +309,6 @@ void default_idle(void)
314 else 309 else
315 local_irq_enable(); 310 local_irq_enable();
316 current_thread_info()->status |= TS_POLLING; 311 current_thread_info()->status |= TS_POLLING;
317 trace_power_end(&it);
318 } else { 312 } else {
319 local_irq_enable(); 313 local_irq_enable();
320 /* loop is done by the caller */ 314 /* loop is done by the caller */
@@ -372,9 +366,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
372 */ 366 */
373void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 367void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
374{ 368{
375 struct power_trace it; 369 trace_power_start(POWER_CSTATE, (ax>>4)+1);
376
377 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
378 if (!need_resched()) { 370 if (!need_resched()) {
379 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 371 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
380 clflush((void *)&current_thread_info()->flags); 372 clflush((void *)&current_thread_info()->flags);
@@ -384,15 +376,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
384 if (!need_resched()) 376 if (!need_resched())
385 __mwait(ax, cx); 377 __mwait(ax, cx);
386 } 378 }
387 trace_power_end(&it);
388} 379}
389 380
390/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 381/* Default MONITOR/MWAIT with no hints, used for default C1 state */
391static void mwait_idle(void) 382static void mwait_idle(void)
392{ 383{
393 struct power_trace it;
394 if (!need_resched()) { 384 if (!need_resched()) {
395 trace_power_start(&it, POWER_CSTATE, 1); 385 trace_power_start(POWER_CSTATE, 1);
396 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 386 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
397 clflush((void *)&current_thread_info()->flags); 387 clflush((void *)&current_thread_info()->flags);
398 388
@@ -402,7 +392,6 @@ static void mwait_idle(void)
402 __sti_mwait(0, 0); 392 __sti_mwait(0, 0);
403 else 393 else
404 local_irq_enable(); 394 local_irq_enable();
405 trace_power_end(&it);
406 } else 395 } else
407 local_irq_enable(); 396 local_irq_enable();
408} 397}
@@ -414,13 +403,11 @@ static void mwait_idle(void)
414 */ 403 */
415static void poll_idle(void) 404static void poll_idle(void)
416{ 405{
417 struct power_trace it; 406 trace_power_start(POWER_CSTATE, 0);
418
419 trace_power_start(&it, POWER_CSTATE, 0);
420 local_irq_enable(); 407 local_irq_enable();
421 while (!need_resched()) 408 while (!need_resched())
422 cpu_relax(); 409 cpu_relax();
423 trace_power_end(&it); 410 trace_power_end(0);
424} 411}
425 412
426/* 413/*
@@ -568,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
568void __init init_c1e_mask(void) 555void __init init_c1e_mask(void)
569{ 556{
570 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 557 /* If we're using c1e_idle, we need to allocate c1e_mask. */
571 if (pm_idle == c1e_idle) { 558 if (pm_idle == c1e_idle)
572 alloc_cpumask_var(&c1e_mask, GFP_KERNEL); 559 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
573 cpumask_clear(c1e_mask);
574 }
575} 560}
576 561
577static int __init idle_setup(char *str) 562static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524984af..4cf79567cdab 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -61,9 +61,6 @@
61 61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 63
64DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
65EXPORT_PER_CPU_SYMBOL(current_task);
66
67/* 64/*
68 * Return saved PC of a blocked thread. 65 * Return saved PC of a blocked thread.
69 */ 66 */
@@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
350 *next = &next_p->thread; 347 *next = &next_p->thread;
351 int cpu = smp_processor_id(); 348 int cpu = smp_processor_id();
352 struct tss_struct *tss = &per_cpu(init_tss, cpu); 349 struct tss_struct *tss = &per_cpu(init_tss, cpu);
350 bool preload_fpu;
353 351
354 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 352 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
355 353
356 __unlazy_fpu(prev_p); 354 /*
355 * If the task has used fpu the last 5 timeslices, just do a full
356 * restore of the math state immediately to avoid the trap; the
357 * chances of needing FPU soon are obviously high now
358 */
359 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
357 360
361 __unlazy_fpu(prev_p);
358 362
359 /* we're going to use this soon, after a few expensive things */ 363 /* we're going to use this soon, after a few expensive things */
360 if (next_p->fpu_counter > 5) 364 if (preload_fpu)
361 prefetch(next->xstate); 365 prefetch(next->xstate);
362 366
363 /* 367 /*
@@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
398 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 402 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
399 __switch_to_xtra(prev_p, next_p, tss); 403 __switch_to_xtra(prev_p, next_p, tss);
400 404
405 /* If we're going to preload the fpu context, make sure clts
406 is run while we're batching the cpu state updates. */
407 if (preload_fpu)
408 clts();
409
401 /* 410 /*
402 * Leave lazy mode, flushing any hypercalls made here. 411 * Leave lazy mode, flushing any hypercalls made here.
403 * This must be done before restoring TLS segments so 412 * This must be done before restoring TLS segments so
@@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
407 */ 416 */
408 arch_end_context_switch(next_p); 417 arch_end_context_switch(next_p);
409 418
410 /* If the task has used fpu the last 5 timeslices, just do a full 419 if (preload_fpu)
411 * restore of the math state immediately to avoid the trap; the 420 __math_state_restore();
412 * chances of needing FPU soon are obviously high now
413 *
414 * tsk_used_math() checks prevent calling math_state_restore(),
415 * which can sleep in the case of !tsk_used_math()
416 */
417 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
418 math_state_restore();
419 421
420 /* 422 /*
421 * Restore %gs if needed (which is common) 423 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb5407b9d..ad535b683170 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -55,9 +55,6 @@
55 55
56asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
57 57
58DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
59EXPORT_PER_CPU_SYMBOL(current_task);
60
61DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
62static DEFINE_PER_CPU(unsigned char, is_idle); 59static DEFINE_PER_CPU(unsigned char, is_idle);
63 60
@@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
386 int cpu = smp_processor_id(); 383 int cpu = smp_processor_id();
387 struct tss_struct *tss = &per_cpu(init_tss, cpu); 384 struct tss_struct *tss = &per_cpu(init_tss, cpu);
388 unsigned fsindex, gsindex; 385 unsigned fsindex, gsindex;
386 bool preload_fpu;
387
388 /*
389 * If the task has used fpu the last 5 timeslices, just do a full
390 * restore of the math state immediately to avoid the trap; the
391 * chances of needing FPU soon are obviously high now
392 */
393 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
389 394
390 /* we're going to use this soon, after a few expensive things */ 395 /* we're going to use this soon, after a few expensive things */
391 if (next_p->fpu_counter > 5) 396 if (preload_fpu)
392 prefetch(next->xstate); 397 prefetch(next->xstate);
393 398
394 /* 399 /*
@@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
419 424
420 load_TLS(next, cpu); 425 load_TLS(next, cpu);
421 426
427 /* Must be after DS reload */
428 unlazy_fpu(prev_p);
429
430 /* Make sure cpu is ready for new context */
431 if (preload_fpu)
432 clts();
433
422 /* 434 /*
423 * Leave lazy mode, flushing any hypercalls made here. 435 * Leave lazy mode, flushing any hypercalls made here.
424 * This must be done before restoring TLS segments so 436 * This must be done before restoring TLS segments so
@@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
459 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 471 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
460 prev->gsindex = gsindex; 472 prev->gsindex = gsindex;
461 473
462 /* Must be after DS reload */
463 unlazy_fpu(prev_p);
464
465 /* 474 /*
466 * Switch the PDA and FPU contexts. 475 * Switch the PDA and FPU contexts.
467 */ 476 */
@@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
480 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 489 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
481 __switch_to_xtra(prev_p, next_p, tss); 490 __switch_to_xtra(prev_p, next_p, tss);
482 491
483 /* If the task has used fpu the last 5 timeslices, just do a full 492 /*
484 * restore of the math state immediately to avoid the trap; the 493 * Preload the FPU context, now that we've determined that the
485 * chances of needing FPU soon are obviously high now 494 * task is likely to be using it.
486 *
487 * tsk_used_math() checks prevent calling math_state_restore(),
488 * which can sleep in the case of !tsk_used_math()
489 */ 495 */
490 if (tsk_used_math(next_p) && next_p->fpu_counter > 5) 496 if (preload_fpu)
491 math_state_restore(); 497 __math_state_restore();
492 return prev_p; 498 return prev_p;
493} 499}
494 500
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..7b058a2dc66a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
35#include <asm/proto.h> 35#include <asm/proto.h>
36#include <asm/ds.h> 36#include <asm/ds.h>
37 37
38#include <trace/syscall.h>
39
40#include "tls.h" 38#include "tls.h"
41 39
40#define CREATE_TRACE_POINTS
41#include <trace/events/syscalls.h>
42
42enum x86_regset { 43enum x86_regset {
43 REGSET_GENERAL, 44 REGSET_GENERAL,
44 REGSET_FP, 45 REGSET_FP,
@@ -324,16 +325,6 @@ static int putreg(struct task_struct *child,
324 return set_flags(child, value); 325 return set_flags(child, value);
325 326
326#ifdef CONFIG_X86_64 327#ifdef CONFIG_X86_64
327 /*
328 * Orig_ax is really just a flag with small positive and
329 * negative values, so make sure to always sign-extend it
330 * from 32 bits so that it works correctly regardless of
331 * whether we come from a 32-bit environment or not.
332 */
333 case offsetof(struct user_regs_struct, orig_ax):
334 value = (long) (s32) value;
335 break;
336
337 case offsetof(struct user_regs_struct,fs_base): 328 case offsetof(struct user_regs_struct,fs_base):
338 if (value >= TASK_SIZE_OF(child)) 329 if (value >= TASK_SIZE_OF(child))
339 return -EIO; 330 return -EIO;
@@ -1125,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1125 1116
1126 case offsetof(struct user32, regs.orig_eax): 1117 case offsetof(struct user32, regs.orig_eax):
1127 /* 1118 /*
1128 * Sign-extend the value so that orig_eax = -1 1119 * A 32-bit debugger setting orig_eax means to restore
1129 * causes (long)orig_ax < 0 tests to fire correctly. 1120 * the state of the task restarting a 32-bit syscall.
1121 * Make sure we interpret the -ERESTART* codes correctly
1122 * in case the task is not actually still sitting at the
1123 * exit from a 32-bit syscall with TS_COMPAT still set.
1130 */ 1124 */
1131 regs->orig_ax = (long) (s32) value; 1125 regs->orig_ax = value;
1126 if (syscall_get_nr(child, regs) >= 0)
1127 task_thread_info(child)->status |= TS_COMPAT;
1132 break; 1128 break;
1133 1129
1134 case offsetof(struct user32, regs.eflags): 1130 case offsetof(struct user32, regs.eflags):
@@ -1497,8 +1493,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1497 tracehook_report_syscall_entry(regs)) 1493 tracehook_report_syscall_entry(regs))
1498 ret = -1L; 1494 ret = -1L;
1499 1495
1500 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1496 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1501 ftrace_syscall_enter(regs); 1497 trace_sys_enter(regs, regs->orig_ax);
1502 1498
1503 if (unlikely(current->audit_context)) { 1499 if (unlikely(current->audit_context)) {
1504 if (IS_IA32) 1500 if (IS_IA32)
@@ -1523,8 +1519,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1523 if (unlikely(current->audit_context)) 1519 if (unlikely(current->audit_context))
1524 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1525 1521
1526 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1527 ftrace_syscall_exit(regs); 1523 trace_sys_exit(regs, regs->ax);
1528 1524
1529 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1530 tracehook_report_syscall_exit(regs, 0); 1526 tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..6c3b2c6fd772 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
508 508
509 pci_read_config_dword(nb_ht, 0x60, &val); 509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev); 511 pci_dev_put(nb_ht);
512} 512}
513 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..27349f92a6d7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/tboot.h>
7#include <acpi/reboot.h> 8#include <acpi/reboot.h>
8#include <asm/io.h> 9#include <asm/io.h>
9#include <asm/apic.h> 10#include <asm/apic.h>
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void)
508 if (reboot_emergency) 509 if (reboot_emergency)
509 emergency_vmx_disable_all(); 510 emergency_vmx_disable_all();
510 511
512 tboot_shutdown(TB_SHUTDOWN_REBOOT);
513
511 /* Tell the BIOS if we want cold or warm reboot */ 514 /* Tell the BIOS if we want cold or warm reboot */
512 *((unsigned short *)__va(0x472)) = reboot_mode; 515 *((unsigned short *)__va(0x472)) = reboot_mode;
513 516
@@ -634,6 +637,8 @@ static void native_machine_halt(void)
634 /* stop other cpus and apics */ 637 /* stop other cpus and apics */
635 machine_shutdown(); 638 machine_shutdown();
636 639
640 tboot_shutdown(TB_SHUTDOWN_HALT);
641
637 /* stop this cpu */ 642 /* stop this cpu */
638 stop_this_cpu(NULL); 643 stop_this_cpu(NULL);
639} 644}
@@ -645,6 +650,8 @@ static void native_machine_power_off(void)
645 machine_shutdown(); 650 machine_shutdown();
646 pm_power_off(); 651 pm_power_off();
647 } 652 }
653 /* a fallback in case there is no PM info available */
654 tboot_shutdown(TB_SHUTDOWN_HALT);
648} 655}
649 656
650struct machine_ops machine_ops = { 657struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..1cfbbfc3ae26 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -8,6 +8,7 @@
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/x86_init.h>
11#include <asm/time.h> 12#include <asm/time.h>
12 13
13#ifdef CONFIG_X86_32 14#ifdef CONFIG_X86_32
@@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
165} 166}
166EXPORT_SYMBOL(rtc_cmos_write); 167EXPORT_SYMBOL(rtc_cmos_write);
167 168
168static int set_rtc_mmss(unsigned long nowtime) 169int update_persistent_clock(struct timespec now)
169{ 170{
170 unsigned long flags; 171 unsigned long flags;
171 int retval; 172 int retval;
172 173
173 spin_lock_irqsave(&rtc_lock, flags); 174 spin_lock_irqsave(&rtc_lock, flags);
174 retval = set_wallclock(nowtime); 175 retval = x86_platform.set_wallclock(now.tv_sec);
175 spin_unlock_irqrestore(&rtc_lock, flags); 176 spin_unlock_irqrestore(&rtc_lock, flags);
176 177
177 return retval; 178 return retval;
178} 179}
179 180
180/* not static: needed by APM */ 181/* not static: needed by APM */
181unsigned long read_persistent_clock(void) 182void read_persistent_clock(struct timespec *ts)
182{ 183{
183 unsigned long retval, flags; 184 unsigned long retval, flags;
184 185
185 spin_lock_irqsave(&rtc_lock, flags); 186 spin_lock_irqsave(&rtc_lock, flags);
186 retval = get_wallclock(); 187 retval = x86_platform.get_wallclock();
187 spin_unlock_irqrestore(&rtc_lock, flags); 188 spin_unlock_irqrestore(&rtc_lock, flags);
188 189
189 return retval; 190 ts->tv_sec = retval;
190} 191 ts->tv_nsec = 0;
191
192int update_persistent_clock(struct timespec now)
193{
194 return set_rtc_mmss(now.tv_sec);
195} 192}
196 193
197unsigned long long native_read_tsc(void) 194unsigned long long native_read_tsc(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..e09f0e2c14b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -27,6 +27,7 @@
27#include <linux/screen_info.h> 27#include <linux/screen_info.h>
28#include <linux/ioport.h> 28#include <linux/ioport.h>
29#include <linux/acpi.h> 29#include <linux/acpi.h>
30#include <linux/sfi.h>
30#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
31#include <linux/initrd.h> 32#include <linux/initrd.h>
32#include <linux/bootmem.h> 33#include <linux/bootmem.h>
@@ -66,6 +67,7 @@
66 67
67#include <linux/percpu.h> 68#include <linux/percpu.h>
68#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h>
69 71
70#include <video/edid.h> 72#include <video/edid.h>
71 73
@@ -108,10 +110,6 @@
108#include <asm/numa_64.h> 110#include <asm/numa_64.h>
109#endif 111#endif
110 112
111#ifndef ARCH_SETUP
112#define ARCH_SETUP
113#endif
114
115/* 113/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access 115 * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu)
133 return __default_cpu_present_to_apicid(mps_cpu); 131 return __default_cpu_present_to_apicid(mps_cpu);
134} 132}
135 133
136int default_check_phys_apicid_present(int boot_cpu_physical_apicid) 134int default_check_phys_apicid_present(int phys_apicid)
137{ 135{
138 return __default_check_phys_apicid_present(boot_cpu_physical_apicid); 136 return __default_check_phys_apicid_present(phys_apicid);
139} 137}
140#endif 138#endif
141 139
@@ -171,13 +169,6 @@ static struct resource bss_resource = {
171 169
172 170
173#ifdef CONFIG_X86_32 171#ifdef CONFIG_X86_32
174static struct resource video_ram_resource = {
175 .name = "Video RAM area",
176 .start = 0xa0000,
177 .end = 0xbffff,
178 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
179};
180
181/* cpu data as detected by the assembly code in head.S */ 172/* cpu data as detected by the assembly code in head.S */
182struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; 173struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
183/* common cpu data for all cpus */ 174/* common cpu data for all cpus */
@@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = {
605 .flags = IORESOURCE_BUSY | IORESOURCE_IO } 596 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
606}; 597};
607 598
608static void __init reserve_standard_io_resources(void) 599void __init reserve_standard_io_resources(void)
609{ 600{
610 int i; 601 int i;
611 602
@@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg)
637early_param("elfcorehdr", setup_elfcorehdr); 628early_param("elfcorehdr", setup_elfcorehdr);
638#endif 629#endif
639 630
640static struct x86_quirks default_x86_quirks __initdata;
641
642struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
643
644#ifdef CONFIG_X86_RESERVE_LOW_64K 631#ifdef CONFIG_X86_RESERVE_LOW_64K
645static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 632static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
646{ 633{
@@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p)
757 } 744 }
758#endif 745#endif
759 746
760 ARCH_SETUP 747 x86_init.oem.arch_setup();
761 748
762 setup_memory_map(); 749 setup_memory_map();
763 parse_setup_data(); 750 parse_setup_data();
@@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p)
796 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
797 *cmdline_p = command_line; 784 *cmdline_p = command_line;
798 785
786#ifdef CONFIG_X86_64
787 /*
788 * Must call this twice: Once just to detect whether hardware doesn't
789 * support NX (so that the early EHCI debug console setup can safely
790 * call set_fixmap(), and then again after parsing early parameters to
791 * honor the respective command line option.
792 */
793 check_efer();
794#endif
795
799 parse_early_param(); 796 parse_early_param();
800 797
801#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p)
833 * VMware detection requires dmi to be available, so this 830 * VMware detection requires dmi to be available, so this
834 * needs to be done after dmi_scan_machine, for the BP. 831 * needs to be done after dmi_scan_machine, for the BP.
835 */ 832 */
836 init_hypervisor(&boot_cpu_data); 833 init_hypervisor_platform();
837 834
838#ifdef CONFIG_X86_32 835 x86_init.resources.probe_roms();
839 probe_roms();
840#endif
841 836
842 /* after parse_early_param, so could debug it */ 837 /* after parse_early_param, so could debug it */
843 insert_resource(&iomem_resource, &code_resource); 838 insert_resource(&iomem_resource, &code_resource);
@@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p)
972 kvmclock_init(); 967 kvmclock_init();
973#endif 968#endif
974 969
975 paravirt_pagetable_setup_start(swapper_pg_dir); 970 x86_init.paging.pagetable_setup_start(swapper_pg_dir);
976 paging_init(); 971 paging_init();
977 paravirt_pagetable_setup_done(swapper_pg_dir); 972 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
978 paravirt_post_allocator_init(); 973
974 tboot_probe();
979 975
980#ifdef CONFIG_X86_64 976#ifdef CONFIG_X86_64
981 map_vsyscall(); 977 map_vsyscall();
@@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p)
990 */ 986 */
991 acpi_boot_init(); 987 acpi_boot_init();
992 988
993#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) 989 sfi_init();
990
994 /* 991 /*
995 * get boot-time SMP configuration: 992 * get boot-time SMP configuration:
996 */ 993 */
997 if (smp_found_config) 994 if (smp_found_config)
998 get_smp_config(); 995 get_smp_config();
999#endif
1000 996
1001 prefill_possible_map(); 997 prefill_possible_map();
1002 998
@@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p)
1015 e820_reserve_resources(); 1011 e820_reserve_resources();
1016 e820_mark_nosave_regions(max_low_pfn); 1012 e820_mark_nosave_regions(max_low_pfn);
1017 1013
1018#ifdef CONFIG_X86_32 1014 x86_init.resources.reserve_resources();
1019 request_resource(&iomem_resource, &video_ram_resource);
1020#endif
1021 reserve_standard_io_resources();
1022 1015
1023 e820_setup_gap(); 1016 e820_setup_gap();
1024 1017
@@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p)
1030 conswitchp = &dummy_con; 1023 conswitchp = &dummy_con;
1031#endif 1024#endif
1032#endif 1025#endif
1026 x86_init.oem.banner();
1033} 1027}
1034 1028
1035#ifdef CONFIG_X86_32 1029#ifdef CONFIG_X86_32
1036 1030
1037/** 1031static struct resource video_ram_resource = {
1038 * x86_quirk_intr_init - post gate setup interrupt initialisation 1032 .name = "Video RAM area",
1039 * 1033 .start = 0xa0000,
1040 * Description: 1034 .end = 0xbffff,
1041 * Fill in any interrupts that may have been left out by the general 1035 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1042 * init_IRQ() routine. interrupts having to do with the machine rather
1043 * than the devices on the I/O bus (like APIC interrupts in intel MP
1044 * systems) are started here.
1045 **/
1046void __init x86_quirk_intr_init(void)
1047{
1048 if (x86_quirks->arch_intr_init) {
1049 if (x86_quirks->arch_intr_init())
1050 return;
1051 }
1052}
1053
1054/**
1055 * x86_quirk_trap_init - initialise system specific traps
1056 *
1057 * Description:
1058 * Called as the final act of trap_init(). Used in VISWS to initialise
1059 * the various board specific APIC traps.
1060 **/
1061void __init x86_quirk_trap_init(void)
1062{
1063 if (x86_quirks->arch_trap_init) {
1064 if (x86_quirks->arch_trap_init())
1065 return;
1066 }
1067}
1068
1069static struct irqaction irq0 = {
1070 .handler = timer_interrupt,
1071 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1072 .name = "timer"
1073}; 1036};
1074 1037
1075/** 1038void __init i386_reserve_resources(void)
1076 * x86_quirk_pre_time_init - do any specific initialisations before.
1077 *
1078 **/
1079void __init x86_quirk_pre_time_init(void)
1080{ 1039{
1081 if (x86_quirks->arch_pre_time_init) 1040 request_resource(&iomem_resource, &video_ram_resource);
1082 x86_quirks->arch_pre_time_init(); 1041 reserve_standard_io_resources();
1083} 1042}
1084 1043
1085/**
1086 * x86_quirk_time_init - do any specific initialisations for the system timer.
1087 *
1088 * Description:
1089 * Must plug the system timer interrupt source at HZ into the IRQ listed
1090 * in irq_vectors.h:TIMER_IRQ
1091 **/
1092void __init x86_quirk_time_init(void)
1093{
1094 if (x86_quirks->arch_time_init) {
1095 /*
1096 * A nonzero return code does not mean failure, it means
1097 * that the architecture quirk does not want any
1098 * generic (timer) setup to be performed after this:
1099 */
1100 if (x86_quirks->arch_time_init())
1101 return;
1102 }
1103
1104 irq0.mask = cpumask_of_cpu(0);
1105 setup_irq(0, &irq0);
1106}
1107#endif /* CONFIG_X86_32 */ 1044#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
new file mode 100644
index 000000000000..34e099382651
--- /dev/null
+++ b/arch/x86/kernel/sfi.c
@@ -0,0 +1,122 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85
86static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{
88 struct sfi_table_simple *sb;
89 struct sfi_apic_table_entry *pentry;
90 int i, num;
91
92 sb = (struct sfi_table_simple *)table;
93 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
94 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95
96 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++;
100 }
101
102 WARN(pic_mode, KERN_WARNING
103 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
104 pic_mode = 0;
105 return 0;
106}
107#endif /* CONFIG_X86_IO_APIC */
108
109/*
110 * sfi_platform_init(): register lapics & io-apics
111 */
112int __init sfi_platform_init(void)
113{
114#ifdef CONFIG_X86_LOCAL_APIC
115 mp_sfi_register_lapic_address(sfi_lapic_addr);
116 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
117#endif
118#ifdef CONFIG_X86_IO_APIC
119 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
120#endif
121 return 0;
122}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94e..6a44a76055ad 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs)
856void 856void
857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
858{ 858{
859#ifdef CONFIG_X86_NEW_MCE 859#ifdef CONFIG_X86_MCE
860 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
861 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
862 mce_notify_process(); 862 mce_notify_process();
@@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
869 if (thread_info_flags & _TIF_NOTIFY_RESUME) { 869 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
870 clear_thread_flag(TIF_NOTIFY_RESUME); 870 clear_thread_flag(TIF_NOTIFY_RESUME);
871 tracehook_notify_resume(regs); 871 tracehook_notify_resume(regs);
872 if (current->replacement_session_keyring)
873 key_replace_session_keyring();
872 } 874 }
873 875
874#ifdef CONFIG_X86_32 876#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2fecda69ee64..565ebc65920e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h>
50 51
51#include <asm/acpi.h> 52#include <asm/acpi.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
@@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused)
323 /* enable local interrupts */ 324 /* enable local interrupts */
324 local_irq_enable(); 325 local_irq_enable();
325 326
326 setup_secondary_clock(); 327 x86_cpuinit.setup_percpu_clockev();
327 328
328 wmb(); 329 wmb();
329 cpu_idle(); 330 cpu_idle();
@@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
434 * For perf, we return last level cache shared map. 435 * For perf, we return last level cache shared map.
435 * And for power savings, we return cpu_core_map 436 * And for power savings, we return cpu_core_map
436 */ 437 */
437 if (sched_mc_power_savings || sched_smt_power_savings) 438 if ((sched_mc_power_savings || sched_smt_power_savings) &&
439 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
438 return cpu_core_mask(cpu); 440 return cpu_core_mask(cpu);
439 else 441 else
440 return c->llc_shared_map; 442 return c->llc_shared_map;
@@ -1057,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1057#endif 1059#endif
1058 current_thread_info()->cpu = 0; /* needed? */ 1060 current_thread_info()->cpu = 0; /* needed? */
1059 for_each_possible_cpu(i) { 1061 for_each_possible_cpu(i) {
1060 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1062 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1061 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1063 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1062 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1064 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1063 cpumask_clear(per_cpu(cpu_core_map, i));
1064 cpumask_clear(per_cpu(cpu_sibling_map, i));
1065 cpumask_clear(cpu_data(i).llc_shared_map);
1066 } 1065 }
1067 set_cpu_sibling_map(0); 1066 set_cpu_sibling_map(0);
1068 1067
@@ -1112,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1112 1111
1113 printk(KERN_INFO "CPU%d: ", 0); 1112 printk(KERN_INFO "CPU%d: ", 0);
1114 print_cpu_info(&cpu_data(0)); 1113 print_cpu_info(&cpu_data(0));
1115 setup_boot_clock(); 1114 x86_init.timers.setup_percpu_clockev();
1116 1115
1117 if (is_uv_system()) 1116 if (is_uv_system())
1118 uv_system_init(); 1117 uv_system_init();
1118
1119 set_mtrr_aps_delayed_init();
1119out: 1120out:
1120 preempt_enable(); 1121 preempt_enable();
1121} 1122}
1123
1124void arch_enable_nonboot_cpus_begin(void)
1125{
1126 set_mtrr_aps_delayed_init();
1127}
1128
1129void arch_enable_nonboot_cpus_end(void)
1130{
1131 mtrr_aps_init();
1132}
1133
1122/* 1134/*
1123 * Early setup to make printk work. 1135 * Early setup to make printk work.
1124 */ 1136 */
@@ -1140,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1140 setup_ioapic_dest(); 1152 setup_ioapic_dest();
1141#endif 1153#endif
1142 check_nmi_watchdog(); 1154 check_nmi_watchdog();
1155 mtrr_aps_init();
1143} 1156}
1144 1157
1145static int __initdata setup_possible_cpus = -1; 1158static int __initdata setup_possible_cpus = -1;
@@ -1317,6 +1330,7 @@ void play_dead_common(void)
1317void native_play_dead(void) 1330void native_play_dead(void)
1318{ 1331{
1319 play_dead_common(); 1332 play_dead_common();
1333 tboot_shutdown(TB_SHUTDOWN_WFS);
1320 wbinvd_halt(); 1334 wbinvd_halt();
1321} 1335}
1322 1336
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index e8b9863ef8c4..3149032ff107 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -4,6 +4,7 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <asm/desc.h>
7 8
8unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) 9unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
9{ 10{
@@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
23 * and APM bios ones we just ignore here. 24 * and APM bios ones we just ignore here.
24 */ 25 */
25 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { 26 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
26 u32 *desc; 27 struct desc_struct *desc;
27 unsigned long base; 28 unsigned long base;
28 29
29 seg &= ~7UL; 30 seg &= ~7UL;
@@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
33 addr = -1L; /* bogus selector, access would fault */ 34 addr = -1L; /* bogus selector, access would fault */
34 else { 35 else {
35 desc = child->mm->context.ldt + seg; 36 desc = child->mm->context.ldt + seg;
36 base = ((desc[0] >> 16) | 37 base = get_desc_base(desc);
37 ((desc[1] & 0xff) << 16) |
38 (desc[1] & 0xff000000));
39 38
40 /* 16-bit code segment? */ 39 /* 16-bit code segment? */
41 if (!((desc[1] >> 22) & 1)) 40 if (!desc->d)
42 addr &= 0xffff; 41 addr &= 0xffff;
43 addr += base; 42 addr += base;
44 } 43 }
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, 21SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
22 unsigned long prot, unsigned long flags, 22 unsigned long, prot, unsigned long, flags,
23 unsigned long fd, unsigned long off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file; 26 struct file *file;
@@ -226,7 +226,7 @@ bottomup:
226} 226}
227 227
228 228
229asmlinkage long sys_uname(struct new_utsname __user *name) 229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{ 230{
231 int err; 231 int err;
232 down_read(&uts_sem); 232 down_read(&uts_sem);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafda..0157cd26d7cc 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open 338 .long sys_perf_event_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
1/*
2 * tboot.c: main implementation of helper functions used by kernel for
3 * runtime support of Intel(R) Trusted Execution Technology
4 *
5 * Copyright (c) 2006-2009, Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 */
21
22#include <linux/dma_remapping.h>
23#include <linux/init_task.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26#include <linux/sched.h>
27#include <linux/init.h>
28#include <linux/dmar.h>
29#include <linux/cpu.h>
30#include <linux/pfn.h>
31#include <linux/mm.h>
32#include <linux/tboot.h>
33
34#include <asm/trampoline.h>
35#include <asm/processor.h>
36#include <asm/bootparam.h>
37#include <asm/pgtable.h>
38#include <asm/pgalloc.h>
39#include <asm/fixmap.h>
40#include <asm/proto.h>
41#include <asm/setup.h>
42#include <asm/e820.h>
43#include <asm/io.h>
44
45#include "acpi/realmode/wakeup.h"
46
47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly;
49
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1
52
53#undef pr_fmt
54#define pr_fmt(fmt) "tboot: " fmt
55
56static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
57
58void __init tboot_probe(void)
59{
60 /* Look for valid page-aligned address for shared page. */
61 if (!boot_params.tboot_addr)
62 return;
63 /*
64 * also verify that it is mapped as we expect it before calling
65 * set_fixmap(), to reduce chance of garbage value causing crash
66 */
67 if (!e820_any_mapped(boot_params.tboot_addr,
68 boot_params.tboot_addr, E820_RESERVED)) {
69 pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
70 return;
71 }
72
73 /* only a natively booted kernel should be using TXT */
74 if (paravirt_enabled()) {
75 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
76 return;
77 }
78
79 /* Map and check for tboot UUID. */
80 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
81 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
82 if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
83 pr_warning("tboot at 0x%llx is invalid\n",
84 boot_params.tboot_addr);
85 tboot = NULL;
86 return;
87 }
88 if (tboot->version < 5) {
89 pr_warning("tboot version is invalid: %u\n", tboot->version);
90 tboot = NULL;
91 return;
92 }
93
94 pr_info("found shared page at phys addr 0x%llx:\n",
95 boot_params.tboot_addr);
96 pr_debug("version: %d\n", tboot->version);
97 pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
98 pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
99 pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
100 pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
101}
102
103static pgd_t *tboot_pg_dir;
104static struct mm_struct tboot_mm = {
105 .mm_rb = RB_ROOT,
106 .pgd = swapper_pg_dir,
107 .mm_users = ATOMIC_INIT(2),
108 .mm_count = ATOMIC_INIT(1),
109 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
110 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
111 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
112 .cpu_vm_mask = CPU_MASK_ALL,
113};
114
115static inline void switch_to_tboot_pt(void)
116{
117 write_cr3(virt_to_phys(tboot_pg_dir));
118}
119
120static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
121 pgprot_t prot)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = pgd_offset(&tboot_mm, vaddr);
129 pud = pud_alloc(&tboot_mm, pgd, vaddr);
130 if (!pud)
131 return -1;
132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
133 if (!pmd)
134 return -1;
135 pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
136 if (!pte)
137 return -1;
138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
139 pte_unmap(pte);
140 return 0;
141}
142
143static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
144 unsigned long nr)
145{
146 /* Reuse the original kernel mapping */
147 tboot_pg_dir = pgd_alloc(&tboot_mm);
148 if (!tboot_pg_dir)
149 return -1;
150
151 for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
152 if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
153 return -1;
154 }
155
156 return 0;
157}
158
159static void tboot_create_trampoline(void)
160{
161 u32 map_base, map_size;
162
163 /* Create identity map for tboot shutdown code. */
164 map_base = PFN_DOWN(tboot->tboot_base);
165 map_size = PFN_UP(tboot->tboot_size);
166 if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
167 panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
168 map_base, map_size);
169}
170
171#ifdef CONFIG_ACPI_SLEEP
172
173static void add_mac_region(phys_addr_t start, unsigned long size)
174{
175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size;
177
178 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE);
181 mr->size = round_up(end, PAGE_SIZE) - mr->start;
182 }
183}
184
185static int tboot_setup_sleep(void)
186{
187 tboot->num_mac_regions = 0;
188
189 /* S3 resume code */
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
191
192#ifdef CONFIG_X86_TRAMPOLINE
193 /* AP trampoline code */
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201
202 return 0;
203}
204
205#else /* no CONFIG_ACPI_SLEEP */
206
207static int tboot_setup_sleep(void)
208{
209 /* S3 shutdown requested, but S3 not supported by the kernel... */
210 BUG();
211 return -1;
212}
213
214#endif
215
216void tboot_shutdown(u32 shutdown_type)
217{
218 void (*shutdown)(void);
219
220 if (!tboot_enabled())
221 return;
222
223 /*
224 * if we're being called before the 1:1 mapping is set up then just
225 * return and let the normal shutdown happen; this should only be
226 * due to very early panic()
227 */
228 if (!tboot_pg_dir)
229 return;
230
231 /* if this is S3 then set regions to MAC */
232 if (shutdown_type == TB_SHUTDOWN_S3)
233 if (tboot_setup_sleep())
234 return;
235
236 tboot->shutdown_type = shutdown_type;
237
238 switch_to_tboot_pt();
239
240 shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
241 shutdown();
242
243 /* should not reach here */
244 while (1)
245 halt();
246}
247
248static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
249{
250#define TB_COPY_GAS(tbg, g) \
251 tbg.space_id = g.space_id; \
252 tbg.bit_width = g.bit_width; \
253 tbg.bit_offset = g.bit_offset; \
254 tbg.access_width = g.access_width; \
255 tbg.address = g.address;
256
257 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
258 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
259 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
260 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
261
262 /*
263 * We need phys addr of waking vector, but can't use virt_to_phys() on
264 * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
265 * addr.
266 */
267 tboot->acpi_sinfo.wakeup_vector = fadt->facs +
268 offsetof(struct acpi_table_facs, firmware_waking_vector);
269}
270
271void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
272{
273 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
274 /* S0,1,2: */ -1, -1, -1,
275 /* S3: */ TB_SHUTDOWN_S3,
276 /* S4: */ TB_SHUTDOWN_S4,
277 /* S5: */ TB_SHUTDOWN_S5 };
278
279 if (!tboot_enabled())
280 return;
281
282 tboot_copy_fadt(&acpi_gbl_FADT);
283 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
284 tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
285 /* we always use the 32b wakeup vector */
286 tboot->acpi_sinfo.vector_width = 32;
287
288 if (sleep_state >= ACPI_S_STATE_COUNT ||
289 acpi_shutdown_map[sleep_state] == -1) {
290 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
291 return;
292 }
293
294 tboot_shutdown(acpi_shutdown_map[sleep_state]);
295}
296
297static atomic_t ap_wfs_count;
298
299static int tboot_wait_for_aps(int num_aps)
300{
301 unsigned long timeout;
302
303 timeout = AP_WAIT_TIMEOUT*HZ;
304 while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
305 timeout) {
306 mdelay(1);
307 timeout--;
308 }
309
310 if (timeout)
311 pr_warning("tboot wait for APs timeout\n");
312
313 return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
314}
315
316static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
317 unsigned long action, void *hcpu)
318{
319 switch (action) {
320 case CPU_DYING:
321 atomic_inc(&ap_wfs_count);
322 if (num_online_cpus() == 1)
323 if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
324 return NOTIFY_BAD;
325 break;
326 }
327 return NOTIFY_OK;
328}
329
330static struct notifier_block tboot_cpu_notifier __cpuinitdata =
331{
332 .notifier_call = tboot_cpu_callback,
333};
334
335static __init int tboot_late_init(void)
336{
337 if (!tboot_enabled())
338 return 0;
339
340 tboot_create_trampoline();
341
342 atomic_set(&ap_wfs_count, 0);
343 register_hotcpu_notifier(&tboot_cpu_notifier);
344 return 0;
345}
346
347late_initcall(tboot_late_init);
348
349/*
350 * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
351 */
352
353#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
354#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
355
356/* # pages for each config regs space - used by fixmap */
357#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
358 TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
359
360/* offsets from pub/priv config space */
361#define TXTCR_HEAP_BASE 0x0300
362#define TXTCR_HEAP_SIZE 0x0308
363
364#define SHA1_SIZE 20
365
366struct sha1_hash {
367 u8 hash[SHA1_SIZE];
368};
369
370struct sinit_mle_data {
371 u32 version; /* currently 6 */
372 struct sha1_hash bios_acm_id;
373 u32 edx_senter_flags;
374 u64 mseg_valid;
375 struct sha1_hash sinit_hash;
376 struct sha1_hash mle_hash;
377 struct sha1_hash stm_hash;
378 struct sha1_hash lcp_policy_hash;
379 u32 lcp_policy_control;
380 u32 rlp_wakeup_addr;
381 u32 reserved;
382 u32 num_mdrs;
383 u32 mdrs_off;
384 u32 num_vtd_dmars;
385 u32 vtd_dmars_off;
386} __packed;
387
388struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
389{
390 void *heap_base, *heap_ptr, *config;
391
392 if (!tboot_enabled())
393 return dmar_tbl;
394
395 /*
396 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
397 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
398 */
399
400 /* map config space in order to get heap addr */
401 config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
402 PAGE_SIZE);
403 if (!config)
404 return NULL;
405
406 /* now map TXT heap */
407 heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
408 *(u64 *)(config + TXTCR_HEAP_SIZE));
409 iounmap(config);
410 if (!heap_base)
411 return NULL;
412
413 /* walk heap to SinitMleData */
414 /* skip BiosData */
415 heap_ptr = heap_base + *(u64 *)heap_base;
416 /* skip OsMleData */
417 heap_ptr += *(u64 *)heap_ptr;
418 /* skip OsSinitData */
419 heap_ptr += *(u64 *)heap_ptr;
420 /* now points to SinitMleDataSize; set to SinitMleData */
421 heap_ptr += sizeof(u64);
422 /* get addr of DMAR table */
423 dmar_tbl = (struct acpi_table_header *)(heap_ptr +
424 ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
425 sizeof(u64));
426
427 /* don't unmap heap because dmar.c needs access to this */
428
429 return dmar_tbl;
430}
431
432int tboot_force_iommu(void)
433{
434 if (!tboot_enabled())
435 return 0;
436
437 if (no_iommu || swiotlb || dmar_disabled)
438 pr_warning("Forcing Intel-IOMMU to enabled\n");
439
440 dmar_disabled = 0;
441#ifdef CONFIG_SWIOTLB
442 swiotlb = 0;
443#endif
444 no_iommu = 0;
445
446 return 1;
447}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000000..dcb00d278512
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,120 @@
1/*
2 * Copyright (c) 1991,1992,1995 Linus Torvalds
3 * Copyright (c) 1994 Alan Modra
4 * Copyright (c) 1995 Markus Kuhn
5 * Copyright (c) 1996 Ingo Molnar
6 * Copyright (c) 1998 Andrea Arcangeli
7 * Copyright (c) 2002,2006 Vojtech Pavlik
8 * Copyright (c) 2003 Andi Kleen
9 *
10 */
11
12#include <linux/clockchips.h>
13#include <linux/interrupt.h>
14#include <linux/time.h>
15#include <linux/mca.h>
16
17#include <asm/vsyscall.h>
18#include <asm/x86_init.h>
19#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h>
22#include <asm/hpet.h>
23#include <asm/time.h>
24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif
32
33unsigned long profile_pc(struct pt_regs *regs)
34{
35 unsigned long pc = instruction_pointer(regs);
36
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 /*
43 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero,
45 * kernel addresses don't.
46 */
47 if (sp[0] >> 22)
48 return sp[0];
49 if (sp[1] >> 22)
50 return sp[1];
51#endif
52 }
53 return pc;
54}
55EXPORT_SYMBOL(profile_pc);
56
57/*
58 * Default timer interrupt handler for PIT/HPET
59 */
60static irqreturn_t timer_interrupt(int irq, void *dev_id)
61{
62 /* Keep nmi watchdog up to date */
63 inc_irq_stat(irq0_irqs);
64
65 /* Optimized out for !IO_APIC and x86_64 */
66 if (timer_ack) {
67 /*
68 * Subtle, when I/O APICs are used we have to ack timer IRQ
69 * manually to deassert NMI lines for the watchdog if run
70 * on an 82489DX-based system.
71 */
72 spin_lock(&i8259A_lock);
73 outb(0x0c, PIC_MASTER_OCW3);
74 /* Ack the IRQ; AEOI will end it automatically. */
75 inb(PIC_MASTER_POLL);
76 spin_unlock(&i8259A_lock);
77 }
78
79 global_clock_event->event_handler(global_clock_event);
80
81 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
82 if (MCA_bus)
83 outb_p(inb_p(0x61)| 0x80, 0x61);
84
85 return IRQ_HANDLED;
86}
87
88static struct irqaction irq0 = {
89 .handler = timer_interrupt,
90 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
91 .name = "timer"
92};
93
94void __init setup_default_timer_irq(void)
95{
96 setup_irq(0, &irq0);
97}
98
99/* Default timer init function */
100void __init hpet_time_init(void)
101{
102 if (!hpet_enable())
103 setup_pit_timer();
104 setup_default_timer_irq();
105}
106
107static __init void x86_late_time_init(void)
108{
109 x86_init.timers.timer_init();
110 tsc_init();
111}
112
113/*
114 * Initialize TSC and delay the periodic timer init to
115 * late x86_late_time_init() so ioremap works.
116 */
117void __init time_init(void)
118{
119 late_time_init = x86_late_time_init;
120}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e1..000000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
3 *
4 * This file contains the PC-specific time handling details:
5 * reading the RTC at bootup, etc..
6 * 1994-07-02 Alan Modra
7 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
8 * 1995-03-26 Markus Kuhn
9 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
10 * precision CMOS clock update
11 * 1996-05-03 Ingo Molnar
12 * fixed time warps in do_[slow|fast]_gettimeoffset()
13 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14 * "A Kernel Model for Precision Timekeeping" by Dave Mills
15 * 1998-09-05 (Various)
16 * More robust do_fast_gettimeoffset() algorithm implemented
17 * (works with APM, Cyrix 6x86MX and Centaur C6),
18 * monotonic gettimeofday() with fast_get_timeoffset(),
19 * drift-proof precision TSC calibration on boot
20 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
21 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
22 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
23 * 1998-12-16 Andrea Arcangeli
24 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
25 * because was not accounting lost_ticks.
26 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
27 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
28 * serialize accesses to xtime/lost_ticks).
29 */
30
31#include <linux/init.h>
32#include <linux/interrupt.h>
33#include <linux/time.h>
34#include <linux/mca.h>
35
36#include <asm/setup.h>
37#include <asm/hpet.h>
38#include <asm/time.h>
39#include <asm/timer.h>
40
41#include <asm/do_timer.h>
42
43int timer_ack;
44
45unsigned long profile_pc(struct pt_regs *regs)
46{
47 unsigned long pc = instruction_pointer(regs);
48
49#ifdef CONFIG_SMP
50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else
54 unsigned long *sp = (unsigned long *)&regs->sp;
55
56 /* Return address is either directly at stack pointer
57 or above a saved flags. Eflags has bits 22-31 zero,
58 kernel addresses don't. */
59 if (sp[0] >> 22)
60 return sp[0];
61 if (sp[1] >> 22)
62 return sp[1];
63#endif
64 }
65#endif
66 return pc;
67}
68EXPORT_SYMBOL(profile_pc);
69
70/*
71 * This is the same as the above, except we _also_ save the current
72 * Time Stamp Counter value at the time of the timer interrupt, so that
73 * we later on can estimate the time of day more exactly.
74 */
75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{
77 /* Keep nmi watchdog up to date */
78 inc_irq_stat(irq0_irqs);
79
80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) {
82 /*
83 * Subtle, when I/O APICs are used we have to ack timer IRQ
84 * manually to deassert NMI lines for the watchdog if run
85 * on an 82489DX-based system.
86 */
87 spin_lock(&i8259A_lock);
88 outb(0x0c, PIC_MASTER_OCW3);
89 /* Ack the IRQ; AEOI will end it automatically. */
90 inb(PIC_MASTER_POLL);
91 spin_unlock(&i8259A_lock);
92 }
93#endif
94
95 do_timer_interrupt_hook();
96
97#ifdef CONFIG_MCA
98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to
101 enable edge-triggered interrupts usually gets intercepted by a
102 special hardware circuit). Hence we have to acknowledge
103 the timer interrupt. Through some incredibly stupid
104 design idea, the reset for IRQ 0 is done by setting the
105 high bit of the PPI port B (0x61). Note that some PS/2s,
106 notably the 55SX, work fine if this is removed. */
107
108 u8 irq_v = inb_p(0x61); /* read the current state */
109 outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
110 }
111#endif
112
113 return IRQ_HANDLED;
114}
115
116/* Duplicate of time_init() below, with hpet_enable part added */
117void __init hpet_time_init(void)
118{
119 if (!hpet_enable())
120 setup_pit_timer();
121 x86_quirk_time_init();
122}
123
124/*
125 * This is called directly from init code; we must delay timer setup in the
126 * HPET case as we can't make the decision to turn on HPET this early in the
127 * boot process.
128 *
129 * The chosen time_init function will usually be hpet_time_init, above, but
130 * in the case of virtual hardware, an alternative function may be substituted.
131 */
132void __init time_init(void)
133{
134 x86_quirk_pre_time_init();
135 tsc_init();
136 late_time_init = choose_time_init();
137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e61844..000000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
1/*
2 * "High Precision Event Timer" based timekeeping.
3 *
4 * Copyright (c) 1991,1992,1995 Linus Torvalds
5 * Copyright (c) 1994 Alan Modra
6 * Copyright (c) 1995 Markus Kuhn
7 * Copyright (c) 1996 Ingo Molnar
8 * Copyright (c) 1998 Andrea Arcangeli
9 * Copyright (c) 2002,2006 Vojtech Pavlik
10 * Copyright (c) 2003 Andi Kleen
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */
13
14#include <linux/clockchips.h>
15#include <linux/init.h>
16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/time.h>
19#include <linux/mca.h>
20#include <linux/nmi.h>
21
22#include <asm/i8253.h>
23#include <asm/hpet.h>
24#include <asm/vgtod.h>
25#include <asm/time.h>
26#include <asm/timer.h>
27
28volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
29
30unsigned long profile_pc(struct pt_regs *regs)
31{
32 unsigned long pc = instruction_pointer(regs);
33
34 /* Assume the lock function has either no stack frame or a copy
35 of flags from PUSHF
36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 if (sp[0] >> 22)
43 return sp[0];
44 if (sp[1] >> 22)
45 return sp[1];
46#endif
47 }
48 return pc;
49}
50EXPORT_SYMBOL(profile_pc);
51
52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{
54 inc_irq_stat(irq0_irqs);
55
56 global_clock_event->event_handler(global_clock_event);
57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
65 return IRQ_HANDLED;
66}
67
68/* calibrate_cpu is used on systems with fixed rate TSCs to determine
69 * processor frequency */
70#define TICK_COUNT 100000000
71unsigned long __init calibrate_cpu(void)
72{
73 int tsc_start, tsc_now;
74 int i, no_ctr_free;
75 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
76 unsigned long flags;
77
78 for (i = 0; i < 4; i++)
79 if (avail_to_resrv_perfctr_nmi_bit(i))
80 break;
81 no_ctr_free = (i == 4);
82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
85 i = 3;
86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
87 wrmsrl(MSR_K7_EVNTSEL3, 0);
88 rdmsrl(MSR_K7_PERFCTR3, pmc3);
89 } else {
90 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
91 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
92 }
93 local_irq_save(flags);
94 /* start measuring cycles, incrementing from 0 */
95 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
96 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
97 rdtscl(tsc_start);
98 do {
99 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
100 tsc_now = get_cycles();
101 } while ((tsc_now - tsc_start) < TICK_COUNT);
102
103 local_irq_restore(flags);
104 if (no_ctr_free) {
105 wrmsrl(MSR_K7_EVNTSEL3, 0);
106 wrmsrl(MSR_K7_PERFCTR3, pmc3);
107 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
108 } else {
109 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
110 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
111 }
112
113 return pmc_now * tsc_khz / (tsc_now - tsc_start);
114}
115
116static struct irqaction irq0 = {
117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .name = "timer"
120};
121
122void __init hpet_time_init(void)
123{
124 if (!hpet_enable())
125 setup_pit_timer();
126
127 setup_irq(0, &irq0);
128}
129
130void __init time_init(void)
131{
132 tsc_init();
133
134 late_time_init = choose_time_init();
135}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 77b9689f8edb..503c1f2e8835 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -640,13 +640,13 @@ static int __init uv_ptc_init(void)
640 if (!is_uv_system()) 640 if (!is_uv_system())
641 return 0; 641 return 0;
642 642
643 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 643 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
644 &proc_uv_ptc_operations);
644 if (!proc_uv_ptc) { 645 if (!proc_uv_ptc) {
645 printk(KERN_ERR "unable to create %s proc entry\n", 646 printk(KERN_ERR "unable to create %s proc entry\n",
646 UV_PTC_BASENAME); 647 UV_PTC_BASENAME);
647 return -EINVAL; 648 return -EINVAL;
648 } 649 }
649 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
650 return 0; 650 return 0;
651} 651}
652 652
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 808031a5ba19..699f7eeb896a 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -4,7 +4,7 @@
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE);
8 8
9void __init reserve_trampoline_memory(void) 9void __init reserve_trampoline_memory(void)
10{ 10{
@@ -26,7 +26,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 27 * has made sure it's suitably aligned.
28 */ 28 */
29unsigned long setup_trampoline(void) 29unsigned long __cpuinit setup_trampoline(void)
30{ 30{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 66d874e5404c..8508237e8e43 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -28,16 +28,12 @@
28 */ 28 */
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <linux/init.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/page_types.h> 33#include <asm/page_types.h>
33 34
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 36__CPUINITRODATA
36.section ".cpuinit.data","aw",@progbits
37#else
38.section .rodata,"a",@progbits
39#endif
40
41.code16 37.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index cddfb8d386b9..596d54c660a5 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,14 +25,15 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/init.h>
28#include <asm/pgtable_types.h> 29#include <asm/pgtable_types.h>
29#include <asm/page_types.h> 30#include <asm/page_types.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
33 34
34.section .rodata, "a", @progbits 35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
35 36__CPUINITRODATA
36.code16 37.code16
37 38
38ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5204332f475d..7e37dcee0cc3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/utsname.h>
18#include <linux/kdebug.h> 17#include <linux/kdebug.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/module.h> 19#include <linux/module.h>
@@ -59,12 +58,12 @@
59#include <asm/mach_traps.h> 58#include <asm/mach_traps.h>
60 59
61#ifdef CONFIG_X86_64 60#ifdef CONFIG_X86_64
61#include <asm/x86_init.h>
62#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
63#include <asm/proto.h> 63#include <asm/proto.h>
64#else 64#else
65#include <asm/processor-flags.h> 65#include <asm/processor-flags.h>
66#include <asm/setup.h> 66#include <asm/setup.h>
67#include <asm/traps.h>
68 67
69asmlinkage int system_call(void); 68asmlinkage int system_call(void);
70 69
@@ -73,11 +72,9 @@ char ignore_fpu_irq;
73 72
74/* 73/*
75 * The IDT has to be page-aligned to simplify the Pentium 74 * The IDT has to be page-aligned to simplify the Pentium
76 * F0 0F bug workaround.. We have a special link segment 75 * F0 0F bug workaround.
77 * for this.
78 */ 76 */
79gate_desc idt_table[256] 77gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
81#endif 78#endif
82 79
83DECLARE_BITMAP(used_vectors, NR_VECTORS); 80DECLARE_BITMAP(used_vectors, NR_VECTORS);
@@ -786,33 +783,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
786#endif 783#endif
787} 784}
788 785
789#ifdef CONFIG_X86_32 786asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
790unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
791{ 787{
792 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
793 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
794 unsigned long new_kesp = kesp - base;
795 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
796 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
797
798 /* Set up base for espfix segment */
799 desc &= 0x00f0ff0000000000ULL;
800 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
801 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
802 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
803 (lim_pages & 0xffff);
804 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
805
806 return new_kesp;
807} 788}
808#endif
809 789
810asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 790asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
811{ 791{
812} 792}
813 793
814asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) 794/*
795 * __math_state_restore assumes that cr0.TS is already clear and the
796 * fpu state is all ready for use. Used during context switch.
797 */
798void __math_state_restore(void)
815{ 799{
800 struct thread_info *thread = current_thread_info();
801 struct task_struct *tsk = thread->task;
802
803 /*
804 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
805 */
806 if (unlikely(restore_fpu_checking(tsk))) {
807 stts();
808 force_sig(SIGSEGV, tsk);
809 return;
810 }
811
812 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
813 tsk->fpu_counter++;
816} 814}
817 815
818/* 816/*
@@ -846,17 +844,8 @@ asmlinkage void math_state_restore(void)
846 } 844 }
847 845
848 clts(); /* Allow maths ops (or we recurse) */ 846 clts(); /* Allow maths ops (or we recurse) */
849 /*
850 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
851 */
852 if (unlikely(restore_fpu_checking(tsk))) {
853 stts();
854 force_sig(SIGSEGV, tsk);
855 return;
856 }
857 847
858 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 848 __math_state_restore();
859 tsk->fpu_counter++;
860} 849}
861EXPORT_SYMBOL_GPL(math_state_restore); 850EXPORT_SYMBOL_GPL(math_state_restore);
862 851
@@ -980,7 +969,5 @@ void __init trap_init(void)
980 */ 969 */
981 cpu_init(); 970 cpu_init();
982 971
983#ifdef CONFIG_X86_32 972 x86_init.irqs.trap_init();
984 x86_quirk_trap_init();
985#endif
986} 973}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357e..cd982f48e23e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,6 +17,8 @@
17#include <asm/time.h> 17#include <asm/time.h>
18#include <asm/delay.h> 18#include <asm/delay.h>
19#include <asm/hypervisor.h> 19#include <asm/hypervisor.h>
20#include <asm/nmi.h>
21#include <asm/x86_init.h>
20 22
21unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ 23unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
22EXPORT_SYMBOL(cpu_khz); 24EXPORT_SYMBOL(cpu_khz);
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)
400{ 402{
401 u64 tsc1, tsc2, delta, ref1, ref2; 403 u64 tsc1, tsc2, delta, ref1, ref2;
402 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 404 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
403 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; 405 unsigned long flags, latch, ms, fast_calibrate;
404 int hpet = is_hpet_enabled(), i, loopmin; 406 int hpet = is_hpet_enabled(), i, loopmin;
405 407
406 hv_tsc_khz = get_hypervisor_tsc_freq();
407 if (hv_tsc_khz) {
408 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
409 return hv_tsc_khz;
410 }
411
412 local_irq_save(flags); 408 local_irq_save(flags);
413 fast_calibrate = quick_pit_calibrate(); 409 fast_calibrate = quick_pit_calibrate();
414 local_irq_restore(flags); 410 local_irq_restore(flags);
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)
566 unsigned long cpu_khz_old = cpu_khz; 562 unsigned long cpu_khz_old = cpu_khz;
567 563
568 if (cpu_has_tsc) { 564 if (cpu_has_tsc) {
569 tsc_khz = calibrate_tsc(); 565 tsc_khz = x86_platform.calibrate_tsc();
570 cpu_khz = tsc_khz; 566 cpu_khz = tsc_khz;
571 cpu_data(0).loops_per_jiffy = 567 cpu_data(0).loops_per_jiffy =
572 cpufreq_scale(cpu_data(0).loops_per_jiffy, 568 cpufreq_scale(cpu_data(0).loops_per_jiffy,
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
670 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 666 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
671 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 667 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
672 (val == CPUFREQ_RESUMECHANGE)) { 668 (val == CPUFREQ_RESUMECHANGE)) {
673 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 669 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
674 670
675 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 671 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
676 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) 672 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)
744} 740}
745#endif 741#endif
746 742
743static void resume_tsc(void)
744{
745 clocksource_tsc.cycle_last = 0;
746}
747
747static struct clocksource clocksource_tsc = { 748static struct clocksource clocksource_tsc = {
748 .name = "tsc", 749 .name = "tsc",
749 .rating = 300, 750 .rating = 300,
750 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc,
751 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
752 .shift = 22, 754 .shift = 22,
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 755 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)
761{ 763{
762 if (!tsc_unstable) { 764 if (!tsc_unstable) {
763 tsc_unstable = 1; 765 tsc_unstable = 1;
764 printk("Marking TSC unstable due to %s\n", reason); 766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
765 /* Change only the rating, when not registered */ 767 /* Change only the rating, when not registered */
766 if (clocksource_tsc.mult) 768 if (clocksource_tsc.mult)
767 clocksource_change_rating(&clocksource_tsc, 0); 769 clocksource_mark_unstable(&clocksource_tsc);
768 else 770 else {
771 clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
769 clocksource_tsc.rating = 0; 772 clocksource_tsc.rating = 0;
773 }
770 } 774 }
771} 775}
772 776
@@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)
852 clocksource_register(&clocksource_tsc); 856 clocksource_register(&clocksource_tsc);
853} 857}
854 858
859#ifdef CONFIG_X86_64
860/*
861 * calibrate_cpu is used on systems with fixed rate TSCs to determine
862 * processor frequency
863 */
864#define TICK_COUNT 100000000
865static unsigned long __init calibrate_cpu(void)
866{
867 int tsc_start, tsc_now;
868 int i, no_ctr_free;
869 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
870 unsigned long flags;
871
872 for (i = 0; i < 4; i++)
873 if (avail_to_resrv_perfctr_nmi_bit(i))
874 break;
875 no_ctr_free = (i == 4);
876 if (no_ctr_free) {
877 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
878 "cpu_khz value may be incorrect.\n");
879 i = 3;
880 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
881 wrmsrl(MSR_K7_EVNTSEL3, 0);
882 rdmsrl(MSR_K7_PERFCTR3, pmc3);
883 } else {
884 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
885 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
886 }
887 local_irq_save(flags);
888 /* start measuring cycles, incrementing from 0 */
889 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
890 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
891 rdtscl(tsc_start);
892 do {
893 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
894 tsc_now = get_cycles();
895 } while ((tsc_now - tsc_start) < TICK_COUNT);
896
897 local_irq_restore(flags);
898 if (no_ctr_free) {
899 wrmsrl(MSR_K7_EVNTSEL3, 0);
900 wrmsrl(MSR_K7_PERFCTR3, pmc3);
901 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
902 } else {
903 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
904 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
905 }
906
907 return pmc_now * tsc_khz / (tsc_now - tsc_start);
908}
909#else
910static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
911#endif
912
855void __init tsc_init(void) 913void __init tsc_init(void)
856{ 914{
857 u64 lpj; 915 u64 lpj;
858 int cpu; 916 int cpu;
859 917
918 x86_init.timers.tsc_pre_init();
919
860 if (!cpu_has_tsc) 920 if (!cpu_has_tsc)
861 return; 921 return;
862 922
863 tsc_khz = calibrate_tsc(); 923 tsc_khz = x86_platform.calibrate_tsc();
864 cpu_khz = tsc_khz; 924 cpu_khz = tsc_khz;
865 925
866 if (!tsc_khz) { 926 if (!tsc_khz) {
@@ -868,11 +928,9 @@ void __init tsc_init(void)
868 return; 928 return;
869 } 929 }
870 930
871#ifdef CONFIG_X86_64
872 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 931 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
873 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) 932 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
874 cpu_khz = calibrate_cpu(); 933 cpu_khz = calibrate_cpu();
875#endif
876 934
877 printk("Detected %lu.%03lu MHz processor.\n", 935 printk("Detected %lu.%03lu MHz processor.\n",
878 (unsigned long)cpu_khz / 1000, 936 (unsigned long)cpu_khz / 1000,
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 027b5b498993..f37930954d15 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 pr_info("Skipping synchronization checks as TSC is reliable.\n"); 117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 31ffc24eec4d..f068553a1b17 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -30,6 +30,7 @@
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/time.h>
33#include <asm/io.h> 34#include <asm/io.h>
34 35
35#include <linux/kernel_stat.h> 36#include <linux/kernel_stat.h>
@@ -53,7 +54,7 @@ int is_visws_box(void)
53 return visws_board_type >= 0; 54 return visws_board_type >= 0;
54} 55}
55 56
56static int __init visws_time_init(void) 57static void __init visws_time_init(void)
57{ 58{
58 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 59 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
59 60
@@ -66,21 +67,13 @@ static int __init visws_time_init(void)
66 /* Enable (unmask) the timer interrupt */ 67 /* Enable (unmask) the timer interrupt */
67 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); 68 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
68 69
69 /* 70 setup_default_timer_irq();
70 * Zero return means the generic timer setup code will set up
71 * the standard vector:
72 */
73 return 0;
74} 71}
75 72
76static int __init visws_pre_intr_init(void) 73/* Replaces the default init_ISA_irqs in the generic setup */
74static void __init visws_pre_intr_init(void)
77{ 75{
78 init_VISWS_APIC_irqs(); 76 init_VISWS_APIC_irqs();
79
80 /*
81 * We dont want ISA irqs to be set up by the generic code:
82 */
83 return 1;
84} 77}
85 78
86/* Quirk for machine specific memory setup. */ 79/* Quirk for machine specific memory setup. */
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void)
156 outl(PIIX_SPECIAL_STOP, 0xCFC); 149 outl(PIIX_SPECIAL_STOP, 0xCFC);
157} 150}
158 151
159static int __init visws_get_smp_config(unsigned int early) 152static void __init visws_get_smp_config(unsigned int early)
160{ 153{
161 /*
162 * Prevent MP-table parsing by the generic code:
163 */
164 return 1;
165} 154}
166 155
167/* 156/*
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
208 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
209} 198}
210 199
211static int __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(unsigned int reserve)
212{ 201{
213 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
214 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)
230 MP_processor_info(mp++); 219 MP_processor_info(mp++);
231 220
232 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 221 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
233
234 return 1;
235} 222}
236 223
237static int visws_trap_init(void); 224static void visws_trap_init(void);
238
239static struct x86_quirks visws_x86_quirks __initdata = {
240 .arch_time_init = visws_time_init,
241 .arch_pre_intr_init = visws_pre_intr_init,
242 .arch_memory_setup = visws_memory_setup,
243 .arch_intr_init = NULL,
244 .arch_trap_init = visws_trap_init,
245 .mach_get_smp_config = visws_get_smp_config,
246 .mach_find_smp_config = visws_find_smp_config,
247};
248 225
249void __init visws_early_detect(void) 226void __init visws_early_detect(void)
250{ 227{
@@ -257,11 +234,14 @@ void __init visws_early_detect(void)
257 return; 234 return;
258 235
259 /* 236 /*
260 * Install special quirks for timer, interrupt and memory setup: 237 * Override the default platform setup functions
261 * Fall back to generic behavior for traps:
262 * Override generic MP-table parsing:
263 */ 238 */
264 x86_quirks = &visws_x86_quirks; 239 x86_init.resources.memory_setup = visws_memory_setup;
240 x86_init.mpparse.get_smp_config = visws_get_smp_config;
241 x86_init.mpparse.find_smp_config = visws_find_smp_config;
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init;
265 245
266 /* 246 /*
267 * Install reboot quirks: 247 * Install reboot quirks:
@@ -400,12 +380,10 @@ static __init void cobalt_init(void)
400 co_apic_read(CO_APIC_ID)); 380 co_apic_read(CO_APIC_ID));
401} 381}
402 382
403static int __init visws_trap_init(void) 383static void __init visws_trap_init(void)
404{ 384{
405 lithium_init(); 385 lithium_init();
406 cobalt_init(); 386 cobalt_init();
407
408 return 1;
409} 387}
410 388
411/* 389/*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95a7289e4b0c..31e6f6cfe53e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)
817 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); 817 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
818 vmi_timer_ops.cancel_alarm = 818 vmi_timer_ops.cancel_alarm =
819 vmi_get_function(VMI_CALL_CancelAlarm); 819 vmi_get_function(VMI_CALL_CancelAlarm);
820 pv_time_ops.time_init = vmi_time_init; 820 x86_init.timers.timer_init = vmi_time_init;
821 pv_time_ops.get_wallclock = vmi_get_wallclock;
822 pv_time_ops.set_wallclock = vmi_set_wallclock;
823#ifdef CONFIG_X86_LOCAL_APIC 821#ifdef CONFIG_X86_LOCAL_APIC
824 pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; 822 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
825 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; 823 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
826#endif 824#endif
827 pv_time_ops.sched_clock = vmi_sched_clock; 825 pv_time_ops.sched_clock = vmi_sched_clock;
828 pv_time_ops.get_tsc_khz = vmi_tsc_khz; 826 x86_platform.calibrate_tsc = vmi_tsc_khz;
827 x86_platform.get_wallclock = vmi_get_wallclock;
828 x86_platform.set_wallclock = vmi_set_wallclock;
829 829
830 /* We have true wallclock functions; disable CMOS clock sync */ 830 /* We have true wallclock functions; disable CMOS clock sync */
831 no_sync_cmos_clock = 1; 831 no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2b3eb82efeeb..611b9e2360d3 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69} 69}
70 70
71/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ 71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void) 72unsigned long vmi_tsc_khz(void)
73{ 73{
74 unsigned long long khz; 74 unsigned long long khz;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c04..92929fb3f9fa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -45,9 +45,9 @@ PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64 47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */ 48 user PT_LOAD FLAGS(5); /* R_E */
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 percpu PT_LOAD FLAGS(7); /* RWE */ 50 percpu PT_LOAD FLAGS(6); /* RW_ */
51#endif 51#endif
52 init PT_LOAD FLAGS(7); /* RWE */ 52 init PT_LOAD FLAGS(7); /* RWE */
53#endif 53#endif
@@ -65,17 +65,11 @@ SECTIONS
65#endif 65#endif
66 66
67 /* Text and read-only data */ 67 /* Text and read-only data */
68
69 /* bootstrapping code */
70 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
71 _text = .;
72 *(.text.head)
73 } :text = 0x9090
74
75 /* The rest of the text */
76 .text : AT(ADDR(.text) - LOAD_OFFSET) { 68 .text : AT(ADDR(.text) - LOAD_OFFSET) {
69 _text = .;
70 /* bootstrapping code */
71 HEAD_TEXT
77#ifdef CONFIG_X86_32 72#ifdef CONFIG_X86_32
78 /* not really needed, already page aligned */
79 . = ALIGN(PAGE_SIZE); 73 . = ALIGN(PAGE_SIZE);
80 *(.text.page_aligned) 74 *(.text.page_aligned)
81#endif 75#endif
@@ -94,13 +88,7 @@ SECTIONS
94 88
95 NOTES :text :note 89 NOTES :text :note
96 90
97 /* Exception table */ 91 EXCEPTION_TABLE(16) :text = 0x9090
98 . = ALIGN(16);
99 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
100 __start___ex_table = .;
101 *(__ex_table)
102 __stop___ex_table = .;
103 } :text = 0x9090
104 92
105 RO_DATA(PAGE_SIZE) 93 RO_DATA(PAGE_SIZE)
106 94
@@ -118,7 +106,6 @@ SECTIONS
118#endif 106#endif
119 107
120 PAGE_ALIGNED_DATA(PAGE_SIZE) 108 PAGE_ALIGNED_DATA(PAGE_SIZE)
121 *(.data.idt)
122 109
123 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
124 111
@@ -135,24 +122,21 @@ SECTIONS
135#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
136 123
137#define VSYSCALL_ADDR (-10*1024*1024) 124#define VSYSCALL_ADDR (-10*1024*1024)
138#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
139 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
140#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
141 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
142 125
143#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) 126#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
144#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) 127#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
145 128
146#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) 129#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
147#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 130#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
148 131
132 . = ALIGN(4096);
133 __vsyscall_0 = .;
134
149 . = VSYSCALL_ADDR; 135 . = VSYSCALL_ADDR;
150 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { 136 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
151 *(.vsyscall_0) 137 *(.vsyscall_0)
152 } :user 138 } :user
153 139
154 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
155
156 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
157 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
158 *(.vsyscall_fn) 142 *(.vsyscall_fn)
@@ -192,11 +176,9 @@ SECTIONS
192 *(.vsyscall_3) 176 *(.vsyscall_3)
193 } 177 }
194 178
195 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; 179 . = __vsyscall_0 + PAGE_SIZE;
196 180
197#undef VSYSCALL_ADDR 181#undef VSYSCALL_ADDR
198#undef VSYSCALL_PHYS_ADDR
199#undef VSYSCALL_VIRT_ADDR
200#undef VLOAD_OFFSET 182#undef VLOAD_OFFSET
201#undef VLOAD 183#undef VLOAD
202#undef VVIRT_OFFSET 184#undef VVIRT_OFFSET
@@ -219,36 +201,12 @@ SECTIONS
219 PERCPU_VADDR(0, :percpu) 201 PERCPU_VADDR(0, :percpu)
220#endif 202#endif
221 203
222 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 204 INIT_TEXT_SECTION(PAGE_SIZE)
223 _sinittext = .;
224 INIT_TEXT
225 _einittext = .;
226 }
227#ifdef CONFIG_X86_64 205#ifdef CONFIG_X86_64
228 :init 206 :init
229#endif 207#endif
230 208
231 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { 209 INIT_DATA_SECTION(16)
232 INIT_DATA
233 }
234
235 . = ALIGN(16);
236 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
237 __setup_start = .;
238 *(.init.setup)
239 __setup_end = .;
240 }
241 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
242 __initcall_start = .;
243 INITCALLS
244 __initcall_end = .;
245 }
246
247 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
248 __con_initcall_start = .;
249 *(.con_initcall.init)
250 __con_initcall_end = .;
251 }
252 210
253 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 211 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
254 __x86_cpu_dev_start = .; 212 __x86_cpu_dev_start = .;
@@ -256,8 +214,6 @@ SECTIONS
256 __x86_cpu_dev_end = .; 214 __x86_cpu_dev_end = .;
257 } 215 }
258 216
259 SECURITY_INIT
260
261 . = ALIGN(8); 217 . = ALIGN(8);
262 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 218 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
263 __parainstructions = .; 219 __parainstructions = .;
@@ -288,15 +244,6 @@ SECTIONS
288 EXIT_DATA 244 EXIT_DATA
289 } 245 }
290 246
291#ifdef CONFIG_BLK_DEV_INITRD
292 . = ALIGN(PAGE_SIZE);
293 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
294 __initramfs_start = .;
295 *(.init.ramfs)
296 __initramfs_end = .;
297 }
298#endif
299
300#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 247#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
301 PERCPU(PAGE_SIZE) 248 PERCPU(PAGE_SIZE)
302#endif 249#endif
@@ -348,15 +295,12 @@ SECTIONS
348 _end = .; 295 _end = .;
349 } 296 }
350 297
351 /* Sections to be discarded */
352 /DISCARD/ : {
353 *(.exitcall.exit)
354 *(.eh_frame)
355 *(.discard)
356 }
357
358 STABS_DEBUG 298 STABS_DEBUG
359 DWARF_DEBUG 299 DWARF_DEBUG
300
301 /* Sections to be discarded */
302 DISCARDS
303 /DISCARD/ : { *(.eh_frame) }
360} 304}
361 305
362 306
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aad..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 89 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
90 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
90 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 91 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
91} 92}
92 93
@@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
227} 228}
228 229
229#ifdef CONFIG_SYSCTL 230#ifdef CONFIG_SYSCTL
230
231static int
232vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
233 void __user *buffer, size_t *lenp, loff_t *ppos)
234{
235 return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
236}
237
238static ctl_table kernel_table2[] = { 231static ctl_table kernel_table2[] = {
239 { .procname = "vsyscall64", 232 { .procname = "vsyscall64",
240 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), 233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
241 .mode = 0644, 234 .mode = 0644,
242 .proc_handler = vsyscall_sysctl_change }, 235 .proc_handler = proc_dointvec },
243 {} 236 {}
244}; 237};
245 238
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000000..4449a4a2c2ed
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,75 @@
1/*
2 * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
3 *
4 * For licencing details see kernel-base/COPYING
5 */
6#include <linux/init.h>
7
8#include <asm/bios_ebda.h>
9#include <asm/paravirt.h>
10#include <asm/mpspec.h>
11#include <asm/setup.h>
12#include <asm/apic.h>
13#include <asm/e820.h>
14#include <asm/time.h>
15#include <asm/irq.h>
16#include <asm/tsc.h>
17
18void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { }
21
22/*
23 * The platform setup functions are preset with the default functions
24 * for standard PC hardware.
25 */
26struct x86_init_ops x86_init __initdata = {
27
28 .resources = {
29 .probe_roms = x86_init_noop,
30 .reserve_resources = reserve_standard_io_resources,
31 .memory_setup = default_machine_specific_memory_setup,
32 },
33
34 .mpparse = {
35 .mpc_record = x86_init_uint_noop,
36 .setup_ioapic_ids = x86_init_noop,
37 .mpc_apic_id = default_mpc_apic_id,
38 .smp_read_mpc_oem = default_smp_read_mpc_oem,
39 .mpc_oem_bus_info = default_mpc_oem_bus_info,
40 .find_smp_config = default_find_smp_config,
41 .get_smp_config = default_get_smp_config,
42 },
43
44 .irqs = {
45 .pre_vector_init = init_ISA_irqs,
46 .intr_init = native_init_IRQ,
47 .trap_init = x86_init_noop,
48 },
49
50 .oem = {
51 .arch_setup = x86_init_noop,
52 .banner = default_banner,
53 },
54
55 .paging = {
56 .pagetable_setup_start = native_pagetable_setup_start,
57 .pagetable_setup_done = native_pagetable_setup_done,
58 },
59
60 .timers = {
61 .setup_percpu_clockev = setup_boot_APIC_clock,
62 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init,
64 },
65};
66
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
68 .setup_percpu_clockev = setup_secondary_APIC_clock,
69};
70
71struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss,
75};