Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 80
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 16
-rw-r--r--  arch/x86/kernel/acpi/processor.c | 13
-rw-r--r--  arch/x86/kernel/acpi/realmode/Makefile | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 20
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 39
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 3
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 62
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 3
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 3
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 11
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 10
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 42
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/common.c | 50
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 221
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 60
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 93
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 272
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 38
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c (renamed from arch/x86/kernel/cpu/mcheck/mce_amd_64.c) | 0
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 250
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 248
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 48
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 129
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 445
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 17
-rw-r--r--  arch/x86/kernel/crash.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 22
-rw-r--r--  arch/x86/kernel/e820.c | 23
-rw-r--r--  arch/x86/kernel/efi.c | 35
-rw-r--r--  arch/x86/kernel/efi_64.c | 6
-rw-r--r--  arch/x86/kernel/entry_32.S | 66
-rw-r--r--  arch/x86/kernel/entry_64.S | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 57
-rw-r--r--  arch/x86/kernel/head_32.S | 7
-rw-r--r--  arch/x86/kernel/head_64.S | 1
-rw-r--r--  arch/x86/kernel/hpet.c | 3
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 2
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 6
-rw-r--r--  arch/x86/kernel/ptrace.c | 13
-rw-r--r--  arch/x86/kernel/pvclock.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 50
-rw-r--r--  arch/x86/kernel/setup.c | 29
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 221
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 8
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 10
-rw-r--r--  arch/x86/kernel/traps.c | 6
-rw-r--r--  arch/x86/kernel/tsc.c | 45
-rw-r--r--  arch/x86/kernel/vmi_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 147
74 files changed, 1772 insertions, 1293 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b67efd1cf59b..bf04201b6575 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,6 +24,10 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
 CFLAGS_paravirt.o := $(nostackp)
+GCOV_PROFILE_vsyscall_64.o := n
+GCOV_PROFILE_hpet.o := n
+GCOV_PROFILE_tsc.o := n
+GCOV_PROFILE_paravirt.o := n
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 631086159c53..6b8ca3a0285d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -44,11 +44,7 @@
 
 static int __initdata acpi_force = 0;
 u32 acpi_rsdt_forced;
-#ifdef CONFIG_ACPI
-int acpi_disabled = 0;
-#else
-int acpi_disabled = 1;
-#endif
+int acpi_disabled;
 EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
@@ -122,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
 	early_iounmap(map, size);
 }
 
-#ifdef CONFIG_PCI_MMCONFIG
-
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
-/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
-struct acpi_mcfg_allocation *pci_mmcfg_config;
-int pci_mmcfg_config_num;
-
-static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
-{
-	if (!strcmp(mcfg->header.oem_id, "SGI"))
-		acpi_mcfg_64bit_base_addr = TRUE;
-
-	return 0;
-}
-
-int __init acpi_parse_mcfg(struct acpi_table_header *header)
-{
-	struct acpi_table_mcfg *mcfg;
-	unsigned long i;
-	int config_size;
-
-	if (!header)
-		return -EINVAL;
-
-	mcfg = (struct acpi_table_mcfg *)header;
-
-	/* how many config structures do we have */
-	pci_mmcfg_config_num = 0;
-	i = header->length - sizeof(struct acpi_table_mcfg);
-	while (i >= sizeof(struct acpi_mcfg_allocation)) {
-		++pci_mmcfg_config_num;
-		i -= sizeof(struct acpi_mcfg_allocation);
-	};
-	if (pci_mmcfg_config_num == 0) {
-		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
-		return -ENODEV;
-	}
-
-	config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
-	pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
-	if (!pci_mmcfg_config) {
-		printk(KERN_WARNING PREFIX
-			"No memory for MCFG config tables\n");
-		return -ENOMEM;
-	}
-
-	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
-
-	acpi_mcfg_oem_check(mcfg);
-
-	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
-			!acpi_mcfg_64bit_base_addr) {
-			printk(KERN_ERR PREFIX
-				"MMCONFIG not in low 4GB of memory\n");
-			kfree(pci_mmcfg_config);
-			pci_mmcfg_config_num = 0;
-			return -ENODEV;
-		}
-	}
-
-	return 0;
-}
-#endif /* CONFIG_PCI_MMCONFIG */
-
 #ifdef CONFIG_X86_LOCAL_APIC
 static int __init acpi_parse_madt(struct acpi_table_header *table)
 {
@@ -1519,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	},
 	{
 		.callback = force_acpi_ht,
-		.ident = "ASUS P4B266",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
-			DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
-		},
-	},
-	{
-		.callback = force_acpi_ht,
 		.ident = "ASUS P2B-DS",
 		.matches = {
 			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bbbe4bbb6f34..8c44c232efcb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 		flags->bm_check = 1;
 	else if (c->x86_vendor == X86_VENDOR_INTEL) {
 		/*
-		 * Today all CPUs that support C3 share cache.
-		 * TBD: This needs to look at cache shared map, once
-		 * multi-core detection patch makes to the base.
+		 * Today all MP CPUs that support C3 share cache.
+		 * And caches should not be flushed by software while
+		 * entering C3 type state.
 		 */
 		flags->bm_check = 1;
 	}
+
+	/*
+	 * On all recent Intel platforms, ARB_DISABLE is a nop.
+	 * So, set bm_control to zero to indicate that ARB_DISABLE
+	 * is not required while entering C3 type state on
+	 * P4, Core and beyond CPUs
+	 */
+	if (c->x86_vendor == X86_VENDOR_INTEL &&
+	    (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
+		flags->bm_control = 0;
 }
 EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
 
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index 7c074eec39fb..d296f4a195c9 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
 	return;
 }
 
+
 /* Initialize _PDC data based on the CPU vendor */
 void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
 {
@@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
 }
 
 EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
+
+void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
+{
+	if (pr->pdc) {
+		kfree(pr->pdc->pointer->buffer.pointer);
+		kfree(pr->pdc->pointer);
+		kfree(pr->pdc);
+		pr->pdc = NULL;
+	}
+}
+
+EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 167bc16ce0e5..6a564ac67ef5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
 		$(call cc-option, -mpreferred-stack-boundary=2)
 KBUILD_CFLAGS += $(call cc-option, -m32)
 KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
 
 WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
 
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 1c60554537c3..6c99f5037801 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -434,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
 	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
 }
 
+/* Flush the whole IO/TLB for a given protection domain - including PDE */
+static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
+{
+	u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+	INC_STATS_COUNTER(domain_flush_single);
+
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
+}
+
 /*
  * This function is used to flush the IO/TLB for a given protection domain
  * on every IOMMU in the system
@@ -1078,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu,
 	amd_iommu_pd_table[devid] = domain;
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
+	/*
+	 * We might boot into a crash-kernel here. The crashed kernel
+	 * left the caches in the IOMMU dirty. So we have to flush
+	 * here to evict all dirty stuff.
+	 */
 	iommu_queue_inv_dev_entry(iommu, devid);
+	iommu_flush_tlb_pde(iommu, domain->id);
 }
 
 /*
@@ -1176,7 +1192,7 @@ out:
 	return 0;
 }
 
-struct notifier_block device_nb = {
+static struct notifier_block device_nb = {
 	.notifier_call = device_change_notifier,
 };
 
@@ -1747,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 		flag |= __GFP_ZERO;
 	virt_addr = (void *)__get_free_pages(flag, get_order(size));
 	if (!virt_addr)
-		return 0;
+		return NULL;
 
 	paddr = virt_to_phys(virt_addr);
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 238989ec077d..c1b17e97252e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -260,6 +260,14 @@ static void iommu_enable(struct amd_iommu *iommu)
 
 static void iommu_disable(struct amd_iommu *iommu)
 {
+	/* Disable command buffer */
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	/* Disable event logging and event interrupts */
+	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
+	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+
+	/* Disable IOMMU hardware itself */
 	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
 }
 
@@ -464,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
 	if (iommu->evt_buf == NULL)
 		return NULL;
 
+	iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
 	return iommu->evt_buf;
 }
 
@@ -478,6 +488,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
 	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
 		    &entry, sizeof(entry));
 
+	/* set head and tail to zero manually */
+	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
 	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
 }
 
@@ -679,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 
 			devid = e->devid;
 			devid_to = e->ext >> 8;
+			set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
 			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
@@ -737,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 
 			devid = e->devid;
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
-				if (alias)
+				if (alias) {
 					amd_iommu_alias_table[dev_i] = devid_to;
 					set_dev_entry_from_acpi(iommu,
-						amd_iommu_alias_table[dev_i],
-						flags, ext_flags);
+						devid_to, flags, ext_flags);
+				}
+				set_dev_entry_from_acpi(iommu, dev_i,
+						flags, ext_flags);
 			}
 			break;
 		default:
@@ -1042,6 +1059,7 @@ static void enable_iommus(void)
 	struct amd_iommu *iommu;
 
 	for_each_iommu(iommu) {
+		iommu_disable(iommu);
 		iommu_set_device_table(iommu);
 		iommu_enable_command_buffer(iommu);
 		iommu_enable_event_buffer(iommu);
@@ -1066,12 +1084,6 @@ static void disable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
-	/*
-	 * Disable IOMMUs before reprogramming the hardware registers.
-	 * IOMMU is still enabled from the resume kernel.
-	 */
-	disable_iommus();
-
 	/* re-load the hardware */
 	enable_iommus();
 
@@ -1079,8 +1091,8 @@ static int amd_iommu_resume(struct sys_device *dev)
 	 * we have to flush after the IOMMUs are enabled because a
 	 * disabled IOMMU will never execute the commands we send
 	 */
-	amd_iommu_flush_all_domains();
 	amd_iommu_flush_all_devices();
+	amd_iommu_flush_all_domains();
 
 	return 0;
 }
@@ -1273,6 +1285,11 @@ free:
 	goto out;
 }
 
+void amd_iommu_shutdown(void)
+{
+	disable_iommus();
+}
+
 /****************************************************************************
  *
  * Early detect code. This code runs at IOMMU detection time in the DMA
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042ecad1..0a1c2830ec66 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -140,7 +140,6 @@ int x2apic_mode;
 #ifdef CONFIG_X86_X2APIC
 /* x2apic enabled before OS handover */
 static int x2apic_preenabled;
-static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
 	if (x2apic_enabled()) {
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str)
 		return 0;
 	}
 
-	disable_x2apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 	return 0;
 }
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 69328ac8de9c..8952a5890281 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
 	return ret && es7000_apic_is_cluster();
 }
 
-struct apic apic_es7000_cluster = {
+/* We've been warned by a false positive warning.Use __refdata to keep calm. */
+struct apic __refdata apic_es7000_cluster = {
 
 	.name = "es7000",
 	.probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ef8d9290c7ea..d2ed6c5ddc80 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -462,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
 static void
 __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
-	union entry_union eu;
+	union entry_union eu = {{0, 0}};
+
 	eu.entry = e;
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -1413,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq,
 		irte.vector = vector;
 		irte.dest_id = IRTE_DEST(destination);
 
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(&irte, apic_id);
+
 		modify_irte(irq, &irte);
 
 		ir_entry->index2 = (index >> 15) & 0x1;
@@ -1712,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void)
 	return;
 }
 
-__apicdebuginit(void) print_APIC_bitfield(int base)
+__apicdebuginit(void) print_APIC_field(int base)
 {
-	unsigned int v;
-	int i, j;
+	int i;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
 
-	printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
-	for (i = 0; i < 8; i++) {
-		v = apic_read(base + i*0x10);
-		for (j = 0; j < 32; j++) {
-			if (v & (1<<j))
-				printk("1");
-			else
-				printk("0");
-		}
-		printk("\n");
-	}
+	printk(KERN_DEBUG);
+
+	for (i = 0; i < 8; i++)
+		printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+
+	printk(KERN_CONT "\n");
 }
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1741,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	if (apic_verbosity == APIC_QUIET)
 		return;
 
-	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+	printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
 	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1782,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
 
 	printk(KERN_DEBUG "... APIC ISR field:\n");
-	print_APIC_bitfield(APIC_ISR);
+	print_APIC_field(APIC_ISR);
 	printk(KERN_DEBUG "... APIC TMR field:\n");
-	print_APIC_bitfield(APIC_TMR);
+	print_APIC_field(APIC_TMR);
 	printk(KERN_DEBUG "... APIC IRR field:\n");
-	print_APIC_bitfield(APIC_IRR);
+	print_APIC_field(APIC_IRR);
 
 	if (APIC_INTEGRATED(ver)) { /* !82489DX */
 		if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -2003,7 +2001,9 @@ void disable_IO_APIC(void)
 	/*
 	 * Use virtual wire A mode when interrupt remapping is enabled.
 	 */
-	disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
+	if (cpu_has_apic)
+		disconnect_bsp_APIC(!intr_remapping_enabled &&
+				ioapic_i8259.pin != -1);
 }
 
 #ifdef CONFIG_X86_32
@@ -3287,6 +3287,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		irte.vector = cfg->vector;
 		irte.dest_id = IRTE_DEST(dest);
 
+		/* Set source-id of interrupt request */
+		set_msi_sid(&irte, pdev);
+
 		modify_irte(irq, &irte);
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
@@ -3567,7 +3570,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 #endif /* CONFIG_SMP */
 
-struct irq_chip dmar_msi_type = {
+static struct irq_chip dmar_msi_type = {
 	.name = "DMAR_MSI",
 	.unmask = dmar_msi_unmask,
 	.mask = dmar_msi_mask,
@@ -3790,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
 	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
 
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+
 	return irq;
 }
 
@@ -4178,28 +4184,20 @@ fake_ioapic_page:
 	}
 }
 
-static int __init ioapic_insert_resources(void)
+void __init ioapic_insert_resources(void)
 {
 	int i;
 	struct resource *r = ioapic_resources;
 
 	if (!r) {
-		if (nr_ioapics > 0) {
+		if (nr_ioapics > 0)
 			printk(KERN_ERR
 				"IO APIC resources couldn't be allocated.\n");
-			return -1;
-		}
-		return 0;
+		return;
 	}
 
 	for (i = 0; i < nr_ioapics; i++) {
 		insert_resource(&iomem_resource, r);
 		r++;
 	}
-
-	return 0;
 }
-
-/* Insert the IO APIC resources after PCI initialization has occured to handle
- * IO APICS that are mapped in on a BAR in PCI space. */
-late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a9..6ef00ba4c886 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
 	unsigned long mask = cpumask_bits(cpumask)[0];
 	unsigned long flags;
 
+	if (WARN_ONCE(!mask, "empty IPI mask"))
+		return;
+
 	local_irq_save(flags);
 	WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
 	__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
 		(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
 }
 
-struct apic apic_numaq = {
+/* Use __refdata to keep false positive warning calm. */
+struct apic __refdata apic_numaq = {
 
 	.name = "NUMAQ",
 	.probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 440a8bccd91a..0c0182cc947d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -20,23 +20,12 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
 #include <linux/smp.h>
-#include <linux/init.h>
 #include <asm/ipi.h>
 
-#include <linux/smp.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <asm/acpi.h>
 #include <asm/e820.h>
-#include <asm/setup.h>
 
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI (1)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880f9b82..fcec2f1d34a1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -44,6 +44,11 @@ static struct apic *apic_probe[] __initdata = {
 	NULL,
 };
 
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
@@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void)
 		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
 
+	if (is_vsmp_box()) {
+		/* need to update phys_pkg_id */
+		apic->phys_pkg_id = apicid_phys_pkg_id;
+	}
+
 	/*
 	 * Now that apic routing model is selected, configure the
 	 * fault handling for intr remapping.
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 344eee4ac0a4..eafdfbd1ea95 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -44,7 +44,6 @@
 #include <asm/ipi.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/init.h>
 #include <linux/gfp.h>
 #include <linux/smp.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..a5371ec36776 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	return x2apic_enabled();
 }
 
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 /*
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id)
 
 static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
 {
-	return current_cpu_data.initial_apicid >> index_msb;
+	return initial_apicid >> index_msb;
 }
 
 static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..a8989aadc99a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		return 0;
 }
 
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id)
 
 static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
 {
-	return current_cpu_data.initial_apicid >> index_msb;
+	return initial_apicid >> index_msb;
 }
 
 static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f7..601159374e87 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -46,7 +46,7 @@ static int early_get_nodeid(void)
 	return node_id.s.node_id;
 }
 
-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
 		if (!strcmp(oem_table_id, "UVL"))
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector)
 	apic_write(APIC_SELF_IPI, vector);
 }
 
-struct apic apic_x2apic_uv_x = {
+struct apic __refdata apic_x2apic_uv_x = {
 
 	.name = "UV large system",
 	.probe = NULL,
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
 	.apic_id_registered = uv_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
-	.irq_dest_mode = 1, /* logical */
+	.irq_dest_mode = 0, /* physical */
 
 	.target_cpus = uv_target_cpus,
 	.disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
 	BUG();
 }
 
-static __init void map_low_mmrs(void)
-{
-	init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
-	init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
-}
-
 enum map_type {map_wb, map_uc};
 
 static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
 		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
 }
 
-static __init void map_config_high(int max_pnode)
-{
-	union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
-	int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
-	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
-	if (cfg.s.enable)
-		map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
-}
-
-static __init void map_mmr_high(int max_pnode)
-{
-	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
-	int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
-	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
-	if (mmr.s.enable)
-		map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
-}
-
 static __init void map_mmioh_high(int max_pnode)
 {
 	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask;
 
-	map_low_mmrs();
-
 	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
 	BUG_ON(!uv_blade_info);
+	for (blade = 0; blade < uv_num_possible_blades(); blade++)
+		uv_blade_info[blade].memory_nid = -1;
 
 	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
 
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
 		lcpu = uv_blade_info[blade].nr_possible_cpus;
 		uv_blade_info[blade].nr_possible_cpus++;
 
+		/* Any node on the blade, else will contain -1. */
+		uv_blade_info[blade].memory_nid = nid;
+
 		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
 		uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
 		uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
+		max_pnode = max(pnode, max_pnode);
 	}
 
 	map_gru_high(max_pnode);
-	map_mmr_high(max_pnode);
-	map_config_high(max_pnode);
 	map_mmioh_high(max_pnode);
 
 	uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
 	u8 ret = 0;
 	int idled = 0;
 	int polling;
-	int err;
+	int err = 0;
 
 	polling = !!(current_thread_info()->status & TS_POLLING);
 	if (polling) {
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
  * This code generates raw asm output which is post-processed to extract
  * and format the required data.
  */
+#define COMPILE_OFFSETS
 
 #include <linux/crypto.h>
 #include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3efcb2b96a15..c1f253dac155 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
 endif
 
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o := $(nostackp)
+
 obj-y := intel_cacheinfo.o addon_cpuid_features.o
 obj-y += proc.o capflags.o powerflags.o common.o
 obj-y += vmware.o hypervisor.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e5b27d8f1b47..63fddcd082cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_HT
 	unsigned bits;
+	int cpu = smp_processor_id();
 
 	bits = c->x86_coreid_bits;
-
 	/* Low order bits define the core id (index of core in socket) */
 	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
 	/* Convert the initial APIC ID into the socket ID */
 	c->phys_proc_id = c->initial_apicid >> bits;
+	/* use socket ID also for last level cache */
+	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
 #endif
 }
 
@@ -354,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
 	/* check CPU config space for extended APIC ID */
-	if (c->x86 >= 0xf) {
+	if (cpu_has_apic && c->x86 >= 0xf) {
 		unsigned int val;
 		val = read_pci_config(0, 24, 0, 0x68);
 		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
@@ -398,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		level = cpuid_eax(1);
 		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
 			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+		/*
+		 * Some BIOSes incorrectly force this feature, but only K8
+		 * revision D (model = 0x14) and later actually support it.
+		 */
+		if (c->x86_model < 0x14)
+			clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
 	}
 	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9fa33886c0d7..5ce60a88027b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
 	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
-static const struct cpu_dev *this_cpu __cpuinitdata;
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+	display_cacheinfo(c);
+#else
+	/* Not much we can do here... */
+	/* Check if at least it has cpuid */
+	if (c->cpuid_level == -1) {
+		/* No cpuid. It must be an ancient CPU */
+		if (c->x86 == 4)
+			strcpy(c->x86_model_id, "486");
+		else if (c->x86 == 3)
+			strcpy(c->x86_model_id, "386");
+	}
+#endif
+}
+
+static const struct cpu_dev __cpuinitconst default_cpu = {
+	.c_init = default_init,
+	.c_vendor = "Unknown",
+	.c_x86_vendor = X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -108,7 +131,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	/* data */
 	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
-	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+	[GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
 	GDT_STACK_CANARY_INIT
 #endif
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
 
 static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_64
-	display_cacheinfo(c);
-#else
-	/* Not much we can do here... */
-	/* Check if at least it has cpuid */
-	if (c->cpuid_level == -1) {
-		/* No cpuid. It must be an ancient CPU */
-		if (c->x86 == 4)
-			strcpy(c->x86_model_id, "486");
-		else if (c->x86 == 3)
-			strcpy(c->x86_model_id, "386");
-	}
-#endif
-}
-
-static const struct cpu_dev __cpuinitconst default_cpu = {
-	.c_init = default_init,
-	.c_vendor = "Unknown",
-	.c_x86_vendor = X86_VENDOR_UNKNOWN,
-};
-
 static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index cf52215d9eb1..2a50ef891000 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,3 +1,4 @@
+
 /*
  * (c) 2003-2006 Advanced Micro Devices, Inc.
  * Your use of this code is subject to the terms and conditions of the
@@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
 	u32 i = 0;
 
 	if (cpu_family == CPU_HW_PSTATE) {
-		if (data->currpstate == HW_PSTATE_INVALID) {
-			/* read (initial) hw pstate if not yet set */
-			rdmsr(MSR_PSTATE_STATUS, lo, hi);
-			i = lo & HW_PSTATE_MASK;
-
-			/*
-			 * a workaround for family 11h erratum 311 might cause
-			 * an "out-of-range Pstate if the core is in Pstate-0
-			 */
-			if (i >= data->numps)
-				data->currpstate = HW_PSTATE_0;
-			else
-				data->currpstate = i;
-		}
+		rdmsr(MSR_PSTATE_STATUS, lo, hi);
+		i = lo & HW_PSTATE_MASK;
+		data->currpstate = i;
+
+		/*
+		 * a workaround for family 11h erratum 311 might cause
+		 * an "out-of-range Pstate if the core is in Pstate-0
+		 */
+		if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
+			data->currpstate = HW_PSTATE_0;
+
 		return 0;
 	}
 	do {
@@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
 static int transition_fid_vid(struct powernow_k8_data *data,
 		u32 reqfid, u32 reqvid)
 {
-	if (core_voltage_pre_transition(data, reqvid))
+	if (core_voltage_pre_transition(data, reqvid, reqfid))
 		return 1;
 
 	if (core_frequency_transition(data, reqfid))
@@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
 
 /* Phase 1 - core voltage transition ... setup voltage */
 static int core_voltage_pre_transition(struct powernow_k8_data *data,
-		u32 reqvid)
+		u32 reqvid, u32 reqfid)
 {
 	u32 rvosteps = data->rvo;
 	u32 savefid = data->currfid;
-	u32 maxvid, lo;
+	u32 maxvid, lo, rvomult = 1;
 
 	dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
 		"reqvid 0x%x, rvo 0x%x\n",
 		smp_processor_id(),
 		data->currfid, data->currvid, reqvid, data->rvo);
 
+	if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
+		rvomult = 2;
+	rvosteps *= rvomult;
 	rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
 	maxvid = 0x1f & (maxvid >> 16);
 	dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
 		return 1;
 	}
 
-	while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
+	while ((rvosteps > 0) &&
+		((rvomult * data->rvo + data->currvid) > reqvid)) {
 		if (data->currvid == maxvid) {
 			rvosteps = 0;
 		} else {
@@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 	u32 vcoreqfid, vcocurrfid, vcofiddiff;
 	u32 fid_interval, savevid = data->currvid;
 
-	if ((reqfid < HI_FID_TABLE_BOTTOM) &&
-	    (data->currfid < HI_FID_TABLE_BOTTOM)) {
-		printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
-			"0x%x 0x%x\n", reqfid, data->currfid);
-		return 1;
-	}
-
 	if (data->currfid == reqfid) {
 		printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
 			data->currfid);
@@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 	vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
 		: vcoreqfid - vcocurrfid;
 
+	if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
+		vcofiddiff = 0;
+
 	while (vcofiddiff > 2) {
 		(data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
 
@@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,
 	return 0;
 }
 
-static int check_supported_cpu(unsigned int cpu)
+static void check_supported_cpu(void *_rc)
 {
-	cpumask_t oldmask;
 	u32 eax, ebx, ecx, edx;
-	unsigned int rc = 0;
+	int *rc = _rc;
 
-	oldmask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
-	if (smp_processor_id() != cpu) {
-		printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
-		goto out;
-	}
+	*rc = -ENODEV;
 
 	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
-		goto out;
+		return;
 
 	eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
 	if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
 	    ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
-		goto out;
+		return;
 
 	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
 		if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
 		    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
 			printk(KERN_INFO PFX
 				"Processor cpuid %x not supported\n", eax);
-			goto out;
+			return;
 		}
 
 		eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
 		if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
 			printk(KERN_INFO PFX
 				"No frequency change capabilities detected\n");
-			goto out;
+			return;
 		}
 
 		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
@@ -552,21 +543,17 @@
 		    != P_STATE_TRANSITION_CAPABLE) {
 			printk(KERN_INFO PFX
 				"Power state transitions not supported\n");
-			goto out;
+			return;
 		}
 	} else { /* must be a HW Pstate capable processor */
 		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
 		if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
 			cpu_family = CPU_HW_PSTATE;
 		else
-			goto out;
+			return;
 	}
 
-	rc = 1;
-
-out:
-	set_cpus_allowed_ptr(current, &oldmask);
-	return rc;
+	*rc = 0;
 }
 
 static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
@@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
 	if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
 		return;
 
-	control = data->acpi_data.states[index].control; data->irt = (control
-		>> IRT_SHIFT) & IRT_MASK; data->rvo = (control >>
-		RVO_SHIFT) & RVO_MASK; data->exttype = (control
-		>> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
-	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1
-		<< ((control >> MVS_SHIFT) & MVS_MASK); data->vstable =
-		(control >> VST_SHIFT) & VST_MASK; }
+	control = data->acpi_data.states[index].control;
+	data->irt = (control >> IRT_SHIFT) & IRT_MASK;
+	data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
+	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
+	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
+	data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
+	data->vstable = (control >> VST_SHIFT) & VST_MASK;
+}
 
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 {
@@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)
 		if (cur_latency > max_latency)
 			max_latency = cur_latency;
 	}
+	if (max_latency == 0) {
+		/*
+		 * Fam 11h always returns 0 as transition latency.
+		 * This is intended and means "very fast". While cpufreq core
+		 * and governors currently can handle that gracefully, better
+		 * set it to 1 to avoid problems in the future.
+		 * For all others it's a BIOS bug.
+		 */
+		if (!boot_cpu_data.x86 == 0x11)
+			printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
+				"latency\n");
+		max_latency = 1;
+	}
 	/* value in usecs, needs to be in nanoseconds */
 	return 1000 * max_latency;
 }
@@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
 		return 0;
 	}
 
-	if ((fid < HI_FID_TABLE_BOTTOM) &&
-	    (data->currfid < HI_FID_TABLE_BOTTOM)) {
-		printk(KERN_ERR PFX
-			"ignoring illegal change in lo freq table-%x to 0x%x\n",
-			data->currfid, fid);
-		return 1;
-	}
-
 	dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
 		smp_processor_id(), fid, vid);
 	freqs.old = find_khz_freq_from_fid(data->currfid);
 	freqs.new = find_khz_freq_from_fid(fid);
 
-	for_each_cpu_mask_nr(i, *(data->available_cores)) {
+	for_each_cpu(i, data->available_cores) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
 	res = transition_fid_vid(data, fid, vid);
 	freqs.new = find_khz_freq_from_fid(data->currfid);
 
-	for_each_cpu_mask_nr(i, *(data->available_cores)) {
+	for_each_cpu(i, data->available_cores) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
 			data->currpstate);
 	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
 
-	for_each_cpu_mask_nr(i, *(data->available_cores)) {
+	for_each_cpu(i, data->available_cores) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
 	res = transition_pstate(data, pstate);
 	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
 
-	for_each_cpu_mask_nr(i, *(data->available_cores)) {
+	for_each_cpu(i, data->available_cores) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
 	return cpufreq_frequency_table_verify(pol, data->powernow_table);
 }
 
-static const char ACPI_PSS_BIOS_BUG_MSG[] =
-	KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
-	KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
+struct init_on_cpu {
+	struct powernow_k8_data *data;
+	int rc;
+};
+
+static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
+{
+	struct init_on_cpu *init_on_cpu = _init_on_cpu;
+
+	if (pending_bit_stuck()) {
+		printk(KERN_ERR PFX "failing init, change pending bit set\n");
+		init_on_cpu->rc = -ENODEV;
+		return;
+	}
+
+	if (query_current_values_with_pending_wait(init_on_cpu->data)) {
+		init_on_cpu->rc = -ENODEV;
+		return;
+	}
+
+	if (cpu_family == CPU_OPTERON)
+		fidvid_msr_init();
+
+	init_on_cpu->rc = 0;
+}
 
 /* per CPU init entry point to the driver */
 static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 {
+	static const char ACPI_PSS_BIOS_BUG_MSG[] =
+		KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+		FW_BUG PFX "Try again with latest BIOS.\n";
 	struct powernow_k8_data *data;
-	cpumask_t oldmask;
+	struct init_on_cpu init_on_cpu;
 	int rc;
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
 
-	if (!check_supported_cpu(pol->cpu))
+	smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
+	if (rc)
 		return -ENODEV;
 
 	data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
@@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1289 pol->cpuinfo.transition_latency = get_transition_latency(data); 1308 pol->cpuinfo.transition_latency = get_transition_latency(data);
1290 1309
1291 /* only run on specific CPU from here on */ 1310 /* only run on specific CPU from here on */
1292 oldmask = current->cpus_allowed; 1311 init_on_cpu.data = data;
1293 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1312 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1294 1313 &init_on_cpu, 1);
1295 if (smp_processor_id() != pol->cpu) { 1314 rc = init_on_cpu.rc;
1296 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1315 if (rc != 0)
1297 goto err_out_unmask; 1316 goto err_out_exit_acpi;
1298 }
1299
1300 if (pending_bit_stuck()) {
1301 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1302 goto err_out_unmask;
1303 }
1304
1305 if (query_current_values_with_pending_wait(data))
1306 goto err_out_unmask;
1307
1308 if (cpu_family == CPU_OPTERON)
1309 fidvid_msr_init();
1310
1311 /* run on any CPU again */
1312 set_cpus_allowed_ptr(current, &oldmask);
1313 1317
1314 if (cpu_family == CPU_HW_PSTATE) 1318 if (cpu_family == CPU_HW_PSTATE)
1315 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1319 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
@@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1346 1350
1347 return 0; 1351 return 0;
1348 1352
1349err_out_unmask: 1353err_out_exit_acpi:
1350 set_cpus_allowed_ptr(current, &oldmask);
1351 powernow_k8_cpu_exit_acpi(data); 1354 powernow_k8_cpu_exit_acpi(data);
1352 1355
1353err_out: 1356err_out:
@@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1372 return 0; 1375 return 0;
1373} 1376}
1374 1377
1378static void query_values_on_cpu(void *_err)
1379{
1380 int *err = _err;
1381 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1382
1383 *err = query_current_values_with_pending_wait(data);
1384}
1385
1375static unsigned int powernowk8_get(unsigned int cpu) 1386static unsigned int powernowk8_get(unsigned int cpu)
1376{ 1387{
1377 struct powernow_k8_data *data; 1388 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1378 cpumask_t oldmask = current->cpus_allowed;
1379 unsigned int khz = 0; 1389 unsigned int khz = 0;
1380 unsigned int first; 1390 int err;
1381
1382 first = cpumask_first(cpu_core_mask(cpu));
1383 data = per_cpu(powernow_data, first);
1384 1391
1385 if (!data) 1392 if (!data)
1386 return -EINVAL; 1393 return -EINVAL;
1387 1394
1388 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1395 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1389 if (smp_processor_id() != cpu) { 1396 if (err)
1390 printk(KERN_ERR PFX
1391 "limiting to CPU %d failed in powernowk8_get\n", cpu);
1392 set_cpus_allowed_ptr(current, &oldmask);
1393 return 0;
1394 }
1395
1396 if (query_current_values_with_pending_wait(data))
1397 goto out; 1397 goto out;
1398 1398
1399 if (cpu_family == CPU_HW_PSTATE) 1399 if (cpu_family == CPU_HW_PSTATE)
@@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu)
1404 1404
1405 1405
1406out: 1406out:
1407 set_cpus_allowed_ptr(current, &oldmask);
1408 return khz; 1407 return khz;
1409} 1408}
1410 1409
@@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void)
1430 unsigned int i, supported_cpus = 0; 1429 unsigned int i, supported_cpus = 0;
1431 1430
1432 for_each_online_cpu(i) { 1431 for_each_online_cpu(i) {
1433 if (check_supported_cpu(i)) 1432 int rc;
1433 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1434 if (rc == 0)
1434 supported_cpus++; 1435 supported_cpus++;
1435 } 1436 }
1436 1437
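
The powernow-k8 hunks above replace the old "migrate current onto the target CPU with set_cpus_allowed_ptr()" trick with smp_call_function_single(), which runs a short callback directly on that CPU. A hedged sketch of the pattern, not driver code: the inputs and result are bundled in a struct, and remote_read_state() is a hypothetical stand-in for the MSR reads done in powernowk8_cpu_init_on_cpu().

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/smp.h>

struct remote_init {
	u32 value;	/* filled in on the target CPU */
	int rc;
};

static u32 remote_read_state(void)
{
	return 0;	/* placeholder for the driver's rdmsr-based query */
}

static void remote_init_fn(void *_ri)
{
	struct remote_init *ri = _ri;

	/* Runs on the CPU handed to smp_call_function_single(); must not sleep. */
	ri->value = remote_read_state();
	ri->rc = 0;
}

static int init_on(unsigned int cpu)
{
	struct remote_init ri = { .rc = -ENODEV };

	/* wait == 1: do not return until the callback has finished remotely */
	if (smp_call_function_single(cpu, remote_init_fn, &ri, 1))
		return -ENODEV;
	return ri.rc;
}

The same shape appears in powernowk8_get(), the check_supported_cpu() callback and the speedstep drivers below; because the callback runs in IPI context, it may only touch per-CPU registers and flags, never sleep or take the hotplug lock.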
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 6c6698feade1..02ce824073cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
215 215
216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) 216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
217 217
218static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); 218static int core_voltage_pre_transition(struct powernow_k8_data *data,
219 u32 reqvid, u32 regfid);
219static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); 220static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
220static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); 221static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
221 222
@@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
223 224
224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 225static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 226static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
226
227#ifdef CONFIG_SMP
228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
229{
230}
231#else
232static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
233{
234 cpu_set(0, cpu_sharedcore_mask[0]);
235}
236#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 55c831ed71ce..8d672ef162ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)
323{ 323{
324 unsigned l, h; 324 unsigned l, h;
325 unsigned clock_freq; 325 unsigned clock_freq;
326 cpumask_t saved_mask;
327 326
328 saved_mask = current->cpus_allowed; 327 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
330 if (smp_processor_id() != cpu)
331 return 0;
332
333 rdmsr(MSR_IA32_PERF_STATUS, l, h);
334 clock_freq = extract_clock(l, cpu, 0); 328 clock_freq = extract_clock(l, cpu, 0);
335 329
336 if (unlikely(clock_freq == 0)) { 330 if (unlikely(clock_freq == 0)) {
@@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
340 * P-state transition (like TM2). Get the last freq set 334 * P-state transition (like TM2). Get the last freq set
341 * in PERF_CTL. 335 * in PERF_CTL.
342 */ 336 */
343 rdmsr(MSR_IA32_PERF_CTL, l, h); 337 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
344 clock_freq = extract_clock(l, cpu, 1); 338 clock_freq = extract_clock(l, cpu, 1);
345 } 339 }
346
347 set_cpus_allowed_ptr(current, &saved_mask);
348 return clock_freq; 340 return clock_freq;
349} 341}
350 342
@@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,
467 struct cpufreq_freqs freqs; 459 struct cpufreq_freqs freqs;
468 int retval = 0; 460 int retval = 0;
469 unsigned int j, k, first_cpu, tmp; 461 unsigned int j, k, first_cpu, tmp;
470 cpumask_var_t saved_mask, covered_cpus; 462 cpumask_var_t covered_cpus;
471 463
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 464 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
473 return -ENOMEM;
474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 465 return -ENOMEM;
477 }
478 cpumask_copy(saved_mask, &current->cpus_allowed);
479 466
480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { 467 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
481 retval = -ENODEV; 468 retval = -ENODEV;
@@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,
493 480
494 first_cpu = 1; 481 first_cpu = 1;
495 for_each_cpu(j, policy->cpus) { 482 for_each_cpu(j, policy->cpus) {
496 const struct cpumask *mask; 483 int good_cpu;
497 484
498 /* cpufreq holds the hotplug lock, so we are safe here */ 485 /* cpufreq holds the hotplug lock, so we are safe here */
499 if (!cpu_online(j)) 486 if (!cpu_online(j))
@@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,
504 * Make sure we are running on CPU that wants to change freq 491 * Make sure we are running on CPU that wants to change freq
505 */ 492 */
506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 493 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
507 mask = policy->cpus; 494 good_cpu = cpumask_any_and(policy->cpus,
495 cpu_online_mask);
508 else 496 else
509 mask = cpumask_of(j); 497 good_cpu = j;
510 498
511 set_cpus_allowed_ptr(current, mask); 499 if (good_cpu >= nr_cpu_ids) {
512 preempt_disable();
513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
514 dprintk("couldn't limit to CPUs in this domain\n"); 500 dprintk("couldn't limit to CPUs in this domain\n");
515 retval = -EAGAIN; 501 retval = -EAGAIN;
516 if (first_cpu) { 502 if (first_cpu) {
517 /* We haven't started the transition yet. */ 503 /* We haven't started the transition yet. */
518 goto migrate_end; 504 goto out;
519 } 505 }
520 preempt_enable();
521 break; 506 break;
522 } 507 }
523 508
524 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; 509 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
525 510
526 if (first_cpu) { 511 if (first_cpu) {
527 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 512 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
528 if (msr == (oldmsr & 0xffff)) { 513 if (msr == (oldmsr & 0xffff)) {
529 dprintk("no change needed - msr was and needs " 514 dprintk("no change needed - msr was and needs "
530 "to be %x\n", oldmsr); 515 "to be %x\n", oldmsr);
531 retval = 0; 516 retval = 0;
532 goto migrate_end; 517 goto out;
533 } 518 }
534 519
535 freqs.old = extract_clock(oldmsr, cpu, 0); 520 freqs.old = extract_clock(oldmsr, cpu, 0);
@@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,
553 oldmsr |= msr; 538 oldmsr |= msr;
554 } 539 }
555 540
556 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 541 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
557 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 542 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
558 preempt_enable();
559 break; 543 break;
560 }
561 544
562 cpu_set(j, *covered_cpus); 545 cpumask_set_cpu(j, covered_cpus);
563 preempt_enable();
564 } 546 }
565 547
566 for_each_cpu(k, policy->cpus) { 548 for_each_cpu(k, policy->cpus) {
@@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,
578 * Best effort undo.. 560 * Best effort undo..
579 */ 561 */
580 562
581 for_each_cpu_mask_nr(j, *covered_cpus) { 563 for_each_cpu(j, covered_cpus)
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); 564 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
584 }
585 565
586 tmp = freqs.new; 566 tmp = freqs.new;
587 freqs.new = freqs.old; 567 freqs.new = freqs.old;
@@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,
593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 573 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
594 } 574 }
595 } 575 }
596 set_cpus_allowed_ptr(current, saved_mask);
597 retval = 0; 576 retval = 0;
598 goto out;
599 577
600migrate_end:
601 preempt_enable();
602 set_cpus_allowed_ptr(current, saved_mask);
603out: 578out:
604 free_cpumask_var(saved_mask);
605 free_cpumask_var(covered_cpus); 579 free_cpumask_var(covered_cpus);
606 return retval; 580 return retval;
607} 581}
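
The centrino hunks above swap rdmsr()/wrmsr() plus task migration for the msr-on-cpu helpers, which perform the access on the named CPU via an IPI. A minimal sketch of the read-modify-write shape used for MSR_IA32_PERF_CTL; the MSR number and bit mask here are placeholders, not the driver's values.

#include <linux/types.h>
#include <asm/msr.h>

/* Set 'bits' in a control MSR on 'cpu' without migrating the caller. */
static int set_ctl_bits_on_cpu(unsigned int cpu, u32 msr_no, u32 bits)
{
	u32 lo, hi;
	int err;

	err = rdmsr_on_cpu(cpu, msr_no, &lo, &hi);	/* IPI + rdmsr on 'cpu' */
	if (err)
		return err;

	lo |= bits;
	return wrmsr_on_cpu(cpu, msr_no, lo, hi);	/* IPI + wrmsr on 'cpu' */
}

Both helpers wait for the remote access to complete and must not be called with interrupts disabled, which matches cpufreq's process-context ->get and ->target hooks.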
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 016c1a4fa3fc..6911e91fb4f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -89,7 +89,8 @@ static int speedstep_find_register(void)
89 * speedstep_set_state - set the SpeedStep state 89 * speedstep_set_state - set the SpeedStep state
90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
91 * 91 *
92 * Tries to change the SpeedStep state. 92 * Tries to change the SpeedStep state. Can be called from
93 * smp_call_function_single.
93 */ 94 */
94static void speedstep_set_state(unsigned int state) 95static void speedstep_set_state(unsigned int state)
95{ 96{
@@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)
143 return; 144 return;
144} 145}
145 146
147/* Wrapper for smp_call_function_single. */
148static void _speedstep_set_state(void *_state)
149{
150 speedstep_set_state(*(unsigned int *)_state);
151}
146 152
147/** 153/**
148 * speedstep_activate - activate SpeedStep control in the chipset 154 * speedstep_activate - activate SpeedStep control in the chipset
@@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)
226 return 0; 232 return 0;
227} 233}
228 234
229static unsigned int _speedstep_get(const struct cpumask *cpus) 235struct get_freq_data {
230{
231 unsigned int speed; 236 unsigned int speed;
232 cpumask_t cpus_allowed; 237 unsigned int processor;
233 238};
234 cpus_allowed = current->cpus_allowed; 239
235 set_cpus_allowed_ptr(current, cpus); 240static void get_freq_data(void *_data)
236 speed = speedstep_get_frequency(speedstep_processor); 241{
237 set_cpus_allowed_ptr(current, &cpus_allowed); 242 struct get_freq_data *data = _data;
238 dprintk("detected %u kHz as current frequency\n", speed); 243
239 return speed; 244 data->speed = speedstep_get_frequency(data->processor);
240} 245}
241 246
242static unsigned int speedstep_get(unsigned int cpu) 247static unsigned int speedstep_get(unsigned int cpu)
243{ 248{
244 return _speedstep_get(cpumask_of(cpu)); 249 struct get_freq_data data = { .processor = cpu };
250
251 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
253 BUG();
254
255 dprintk("detected %u kHz as current frequency\n", data.speed);
256 return data.speed;
245} 257}
246 258
247/** 259/**
@@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,
257 unsigned int target_freq, 269 unsigned int target_freq,
258 unsigned int relation) 270 unsigned int relation)
259{ 271{
260 unsigned int newstate = 0; 272 unsigned int newstate = 0, policy_cpu;
261 struct cpufreq_freqs freqs; 273 struct cpufreq_freqs freqs;
262 cpumask_t cpus_allowed;
263 int i; 274 int i;
264 275
265 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], 276 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
266 target_freq, relation, &newstate)) 277 target_freq, relation, &newstate))
267 return -EINVAL; 278 return -EINVAL;
268 279
269 freqs.old = _speedstep_get(policy->cpus); 280 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
281 freqs.old = speedstep_get(policy_cpu);
270 freqs.new = speedstep_freqs[newstate].frequency; 282 freqs.new = speedstep_freqs[newstate].frequency;
271 freqs.cpu = policy->cpu; 283 freqs.cpu = policy->cpu;
272 284
@@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,
276 if (freqs.old == freqs.new) 288 if (freqs.old == freqs.new)
277 return 0; 289 return 0;
278 290
279 cpus_allowed = current->cpus_allowed;
280
281 for_each_cpu(i, policy->cpus) { 291 for_each_cpu(i, policy->cpus) {
282 freqs.cpu = i; 292 freqs.cpu = i;
283 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 293 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
284 } 294 }
285 295
286 /* switch to physical CPU where state is to be changed */ 296 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
287 set_cpus_allowed_ptr(current, policy->cpus); 297 true);
288
289 speedstep_set_state(newstate);
290
291 /* allow to be run on all CPUs */
292 set_cpus_allowed_ptr(current, &cpus_allowed);
293 298
294 for_each_cpu(i, policy->cpus) { 299 for_each_cpu(i, policy->cpus) {
295 freqs.cpu = i; 300 freqs.cpu = i;
@@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)
312 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 317 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
313} 318}
314 319
320struct get_freqs {
321 struct cpufreq_policy *policy;
322 int ret;
323};
324
325static void get_freqs_on_cpu(void *_get_freqs)
326{
327 struct get_freqs *get_freqs = _get_freqs;
328
329 get_freqs->ret =
330 speedstep_get_freqs(speedstep_processor,
331 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
332 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
333 &get_freqs->policy->cpuinfo.transition_latency,
334 &speedstep_set_state);
335}
315 336
316static int speedstep_cpu_init(struct cpufreq_policy *policy) 337static int speedstep_cpu_init(struct cpufreq_policy *policy)
317{ 338{
318 int result = 0; 339 int result;
319 unsigned int speed; 340 unsigned int policy_cpu, speed;
320 cpumask_t cpus_allowed; 341 struct get_freqs gf;
321 342
322 /* only run on CPU to be set, or on its sibling */ 343 /* only run on CPU to be set, or on its sibling */
323#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
324 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 345 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
325#endif 346#endif
326 347 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
327 cpus_allowed = current->cpus_allowed;
328 set_cpus_allowed_ptr(current, policy->cpus);
329 348
330 /* detect low and high frequency and transition latency */ 349 /* detect low and high frequency and transition latency */
331 result = speedstep_get_freqs(speedstep_processor, 350 gf.policy = policy;
332 &speedstep_freqs[SPEEDSTEP_LOW].frequency, 351 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
333 &speedstep_freqs[SPEEDSTEP_HIGH].frequency, 352 if (gf.ret)
334 &policy->cpuinfo.transition_latency, 353 return gf.ret;
335 &speedstep_set_state);
336 set_cpus_allowed_ptr(current, &cpus_allowed);
337 if (result)
338 return result;
339 354
340 /* get current speed setting */ 355 /* get current speed setting */
341 speed = _speedstep_get(policy->cpus); 356 speed = speedstep_get(policy_cpu);
342 if (!speed) 357 if (!speed)
343 return -EIO; 358 return -EIO;
344 359
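
speedstep-ich and speedstep-centrino now pick an online CPU out of the policy mask with cpumask_any_and() instead of re-affining the caller onto policy->cpus. A small sketch of that selection feeding smp_call_function_single(); do_on_policy_cpu() is an illustrative callback, not a driver function.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/smp.h>

static void do_on_policy_cpu(void *unused)
{
	/* hardware access that must run on one CPU of the policy */
}

static int run_on_policy_cpu(const struct cpumask *policy_cpus)
{
	unsigned int cpu = cpumask_any_and(policy_cpus, cpu_online_mask);

	/* cpumask_any_and() returns >= nr_cpu_ids if the masks do not intersect */
	if (cpu >= nr_cpu_ids)
		return -EAGAIN;

	return smp_call_function_single(cpu, do_on_policy_cpu, NULL, 1);
}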
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 2e3c6862657b..f4c290b8482f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)
226} 226}
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(unsigned int processor)
230{ 231{
231 switch (processor) { 232 switch (processor) {
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 45004faf67ea..188a1ca5ad2b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,12 @@
1obj-y = mce.o therm_throt.o 1obj-y = mce.o
2 2
3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o 5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o 6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o 7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
8obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
9obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
10obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
11obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11
12obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index 89e510424152..b945d5dbc609 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -10,10 +10,9 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* Machine Check Handler For AMD Athlon/Duron: */ 16/* Machine Check Handler For AMD Athlon/Duron: */
18static void k7_machine_check(struct pt_regs *regs, long error_code) 17static void k7_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index fabba15e4558..01213048f62f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -44,7 +44,6 @@
44#include <asm/msr.h> 44#include <asm/msr.h>
45 45
46#include "mce-internal.h" 46#include "mce-internal.h"
47#include "mce.h"
48 47
49/* Handle unconfigured int18 (should never happen) */ 48/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code) 49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
@@ -57,7 +56,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
57void (*machine_check_vector)(struct pt_regs *, long error_code) = 56void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check; 57 unexpected_machine_check;
59 58
60int mce_disabled; 59int mce_disabled __read_mostly;
61 60
62#ifdef CONFIG_X86_NEW_MCE 61#ifdef CONFIG_X86_NEW_MCE
63 62
@@ -76,21 +75,22 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 75 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only) 76 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */ 77 */
79static int tolerant = 1; 78static int tolerant __read_mostly = 1;
80static int banks; 79static int banks __read_mostly;
81static u64 *bank; 80static u64 *bank __read_mostly;
82static unsigned long notify_user; 81static int rip_msr __read_mostly;
83static int rip_msr; 82static int mce_bootlog __read_mostly = -1;
84static int mce_bootlog = -1; 83static int monarch_timeout __read_mostly = -1;
85static int monarch_timeout = -1; 84static int mce_panic_timeout __read_mostly;
86static int mce_panic_timeout; 85static int mce_dont_log_ce __read_mostly;
87static int mce_dont_log_ce; 86int mce_cmci_disabled __read_mostly;
88int mce_cmci_disabled; 87int mce_ignore_ce __read_mostly;
89int mce_ignore_ce; 88int mce_ser __read_mostly;
90int mce_ser; 89
91 90/* User mode helper program triggered by machine check event */
92static char trigger[128]; 91static unsigned long mce_need_notify;
93static char *trigger_argv[2] = { trigger, NULL }; 92static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL };
94 94
95static unsigned long dont_init_banks; 95static unsigned long dont_init_banks;
96 96
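
The tunables above gain __read_mostly, which moves them into the read-mostly data section so values consulted on every poll or exception do not share cache lines with frequently written data. A one-line illustration with a made-up variable:

#include <linux/cache.h>

/* Read in the MCE hot path, written only from sysfs or boot parameters. */
static int example_tolerance __read_mostly = 1;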
@@ -180,7 +180,7 @@ void mce_log(struct mce *mce)
180 wmb(); 180 wmb();
181 181
182 mce->finished = 1; 182 mce->finished = 1;
183 set_bit(0, &notify_user); 183 set_bit(0, &mce_need_notify);
184} 184}
185 185
186static void print_mce(struct mce *m) 186static void print_mce(struct mce *m)
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m)
194 m->cs, m->ip); 194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS) 195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip); 196 print_symbol("{%s}", m->ip);
197 printk("\n"); 197 printk(KERN_CONT "\n");
198 } 198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc); 199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr) 200 if (m->addr)
201 printk("ADDR %llx ", m->addr); 201 printk(KERN_CONT "ADDR %llx ", m->addr);
202 if (m->misc) 202 if (m->misc)
203 printk("MISC %llx ", m->misc); 203 printk(KERN_CONT "MISC %llx ", m->misc);
204 printk("\n"); 204 printk(KERN_CONT "\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 207 m->apicid);
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m)
209 209
210static void print_mce_head(void) 210static void print_mce_head(void)
211{ 211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 212 printk(KERN_EMERG "\nHARDWARE ERROR\n");
213} 213}
214 214
215static void print_mce_tail(void) 215static void print_mce_tail(void)
216{ 216{
217 printk(KERN_EMERG "This is not a software problem!\n" 217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 218 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219} 219}
220 220
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 221#define PANIC_TIMEOUT 5 /* 5 seconds */
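
The print_mce()/print_mce_head() changes replace bare printk() continuations and the broken "\n" KERN_EMERG concatenation with explicit KERN_CONT, so one logical report keeps a single log level across fragments. A tiny hedged illustration with made-up fields:

#include <linux/kernel.h>
#include <linux/types.h>

static void print_error_record(u64 status, u64 addr)
{
	printk(KERN_EMERG "BANK STATUS %llx ", (unsigned long long)status);
	if (addr)
		printk(KERN_CONT "ADDR %llx ", (unsigned long long)addr);
	printk(KERN_CONT "\n");		/* terminate the continued line */
}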
@@ -691,18 +691,21 @@ static atomic_t global_nwo;
691 * in the entry order. 691 * in the entry order.
692 * TBD double check parallel CPU hotunplug 692 * TBD double check parallel CPU hotunplug
693 */ 693 */
694static int mce_start(int no_way_out, int *order) 694static int mce_start(int *no_way_out)
695{ 695{
696 int nwo; 696 int order;
697 int cpus = num_online_cpus(); 697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699 699
700 if (!timeout) { 700 if (!timeout)
701 *order = -1; 701 return -1;
702 return no_way_out;
703 }
704 702
705 atomic_add(no_way_out, &global_nwo); 703 atomic_add(*no_way_out, &global_nwo);
704 /*
705 * global_nwo should be updated before mce_callin
706 */
707 smp_wmb();
708 order = atomic_add_return(1, &mce_callin);
706 709
707 /* 710 /*
708 * Wait for everyone. 711 * Wait for everyone.
@@ -710,40 +713,43 @@ static int mce_start(int no_way_out, int *order)
710 while (atomic_read(&mce_callin) != cpus) { 713 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) { 714 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0); 715 atomic_set(&global_nwo, 0);
713 *order = -1; 716 return -1;
714 return no_way_out;
715 } 717 }
716 ndelay(SPINUNIT); 718 ndelay(SPINUNIT);
717 } 719 }
718 720
719 /* 721 /*
720 * Cache the global no_way_out state. 722 * mce_callin should be read before global_nwo
721 */ 723 */
722 nwo = atomic_read(&global_nwo); 724 smp_rmb();
723 725
724 /* 726 if (order == 1) {
725 * Monarch starts executing now, the others wait. 727 /*
726 */ 728 * Monarch: Starts executing now, the others wait.
727 if (*order == 1) { 729 */
728 atomic_set(&mce_executing, 1); 730 atomic_set(&mce_executing, 1);
729 return nwo; 731 } else {
732 /*
733 * Subject: Now start the scanning loop one by one in
734 * the original callin order.
735 * This way when there are any shared banks it will be
736 * only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 return -1;
742 }
743 ndelay(SPINUNIT);
744 }
730 } 745 }
731 746
732 /* 747 /*
733 * Now start the scanning loop one by one 748 * Cache the global no_way_out state.
734 * in the original callin order.
735 * This way when there are any shared banks it will
736 * be only seen by one CPU before cleared, avoiding duplicates.
737 */ 749 */
738 while (atomic_read(&mce_executing) < *order) { 750 *no_way_out = atomic_read(&global_nwo);
739 if (mce_timed_out(&timeout)) { 751
740 atomic_set(&global_nwo, 0); 752 return order;
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747} 753}
748 754
749/* 755/*
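
The rewritten mce_start() publishes this CPU's no_way_out vote before announcing itself on mce_callin, with an smp_wmb()/smp_rmb() pair guaranteeing that whoever observes the full callin count also observes every vote. A stripped-down sketch of that publish-then-announce ordering; the names are illustrative, not the mce code, and the real function also bounds the wait with monarch_timeout.

#include <asm/atomic.h>
#include <asm/system.h>		/* smp_wmb()/smp_rmb() in this kernel */
#include <asm/processor.h>	/* cpu_relax() */

static atomic_t votes = ATOMIC_INIT(0);
static atomic_t callin = ATOMIC_INIT(0);

/* Each CPU: publish its vote, then take a slot in the callin order. */
static int rendezvous_enter(int my_vote)
{
	atomic_add(my_vote, &votes);
	smp_wmb();				/* votes visible before callin */
	return atomic_add_return(1, &callin);	/* 1-based position */
}

/* Wait for everyone; afterwards the summed votes are stable and visible. */
static int rendezvous_wait(int cpus)
{
	while (atomic_read(&callin) != cpus)
		cpu_relax();			/* real code also honours a timeout */
	smp_rmb();				/* pairs with smp_wmb() above */
	return atomic_read(&votes);
}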
@@ -863,7 +869,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
863 * check handler. 869 * check handler.
864 */ 870 */
865 int order; 871 int order;
866
867 /* 872 /*
868 * If no_way_out gets set, there is no safe way to recover from this 873 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway. 874 * MCE. If tolerant is cranked up, we'll try anyway.
@@ -887,7 +892,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
887 if (!banks) 892 if (!banks)
888 goto out; 893 goto out;
889 894
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m); 895 mce_setup(&m);
892 896
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -909,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
909 * This way we don't report duplicated events on shared banks 913 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it. 914 * because the first one to see it will clear it.
911 */ 915 */
912 no_way_out = mce_start(no_way_out, &order); 916 order = mce_start(&no_way_out);
913 for (i = 0; i < banks; i++) { 917 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear); 918 __clear_bit(i, toclear);
915 if (!bank[i]) 919 if (!bank[i])
@@ -1113,12 +1117,12 @@ static void mcheck_timer(unsigned long data)
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114 1118
1115 t->expires = jiffies + *n; 1119 t->expires = jiffies + *n;
1116 add_timer(t); 1120 add_timer_on(t, smp_processor_id());
1117} 1121}
1118 1122
1119static void mce_do_trigger(struct work_struct *work) 1123static void mce_do_trigger(struct work_struct *work)
1120{ 1124{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 1125 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1122} 1126}
1123 1127
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1128static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
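
mcheck_timer() and mce_init_timer() now arm the polling timer with add_timer_on() so it always fires on the CPU whose banks it polls, instead of wherever add_timer() happens to queue it. A hedged sketch of the pattern; poll_this_cpu() stands in for machine_check_poll(), and start_poll_timer() is assumed to run on the CPU that should own the timer.

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(struct timer_list, poll_timer);

static void poll_this_cpu(unsigned long data)
{
	struct timer_list *t = &__get_cpu_var(poll_timer);

	/* ... inspect this CPU's state ... */

	t->expires = jiffies + HZ;
	add_timer_on(t, smp_processor_id());	/* re-arm on the same CPU */
}

/* Called on the CPU that owns the timer, e.g. from its per-CPU init path. */
static void start_poll_timer(void)
{
	struct timer_list *t = &__get_cpu_var(poll_timer);

	setup_timer(t, poll_this_cpu, (unsigned long)smp_processor_id());
	t->expires = round_jiffies(jiffies + HZ);
	add_timer_on(t, smp_processor_id());
}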
@@ -1135,7 +1139,7 @@ int mce_notify_irq(void)
1135 1139
1136 clear_thread_flag(TIF_MCE_NOTIFY); 1140 clear_thread_flag(TIF_MCE_NOTIFY);
1137 1141
1138 if (test_and_clear_bit(0, &notify_user)) { 1142 if (test_and_clear_bit(0, &mce_need_notify)) {
1139 wake_up_interruptible(&mce_wait); 1143 wake_up_interruptible(&mce_wait);
1140 1144
1141 /* 1145 /*
@@ -1143,7 +1147,7 @@ int mce_notify_irq(void)
1143 * work_pending is always cleared before the function is 1147 * work_pending is always cleared before the function is
1144 * executed. 1148 * executed.
1145 */ 1149 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work)) 1150 if (mce_helper[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work); 1151 schedule_work(&mce_trigger_work);
1148 1152
1149 if (__ratelimit(&ratelimit)) 1153 if (__ratelimit(&ratelimit))
@@ -1222,8 +1226,13 @@ static void mce_init(void)
1222} 1226}
1223 1227
1224/* Add per CPU specific workarounds here */ 1228/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1229static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{ 1230{
1231 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1232 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1233 return -EOPNOTSUPP;
1234 }
1235
1227 /* This should be disabled by the BIOS, but isn't always */ 1236 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) { 1237 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) { 1238 if (c->x86 == 15 && banks > 4) {
@@ -1245,7 +1254,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1245 * Various K7s with broken bank 0 around. Always disable 1254 * Various K7s with broken bank 0 around. Always disable
1246 * by default. 1255 * by default.
1247 */ 1256 */
1248 if (c->x86 == 6) 1257 if (c->x86 == 6 && banks > 0)
1249 bank[0] = 0; 1258 bank[0] = 0;
1250 } 1259 }
1251 1260
@@ -1269,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1278 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0) 1279 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC; 1280 monarch_timeout = USEC_PER_SEC;
1281
1282 /*
1283 * There are also broken BIOSes on some Pentium M and
1284 * earlier systems:
1285 */
1286 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1287 mce_bootlog = 0;
1272 } 1288 }
1273 if (monarch_timeout < 0) 1289 if (monarch_timeout < 0)
1274 monarch_timeout = 0; 1290 monarch_timeout = 0;
1275 if (mce_bootlog != 0) 1291 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30; 1292 mce_panic_timeout = 30;
1293
1294 return 0;
1277} 1295}
1278 1296
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1297static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1282,8 +1300,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1282 return; 1300 return;
1283 switch (c->x86_vendor) { 1301 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL: 1302 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled()) 1303 intel_p5_mcheck_init(c);
1286 intel_p5_mcheck_init(c);
1287 break; 1304 break;
1288 case X86_VENDOR_CENTAUR: 1305 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c); 1306 winchip_mcheck_init(c);
@@ -1318,7 +1335,7 @@ static void mce_init_timer(void)
1318 return; 1335 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id()); 1336 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n); 1337 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t); 1338 add_timer_on(t, smp_processor_id());
1322} 1339}
1323 1340
1324/* 1341/*
@@ -1335,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1335 if (!mce_available(c)) 1352 if (!mce_available(c))
1336 return; 1353 return;
1337 1354
1338 if (mce_cap_init() < 0) { 1355 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
1339 mce_disabled = 1; 1356 mce_disabled = 1;
1340 return; 1357 return;
1341 } 1358 }
1342 mce_cpu_quirks(c);
1343 1359
1344 machine_check_vector = do_machine_check; 1360 machine_check_vector = do_machine_check;
1345 1361
@@ -1609,8 +1625,9 @@ static int mce_resume(struct sys_device *dev)
1609static void mce_cpu_restart(void *data) 1625static void mce_cpu_restart(void *data)
1610{ 1626{
1611 del_timer_sync(&__get_cpu_var(mce_timer)); 1627 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data)) 1628 if (!mce_available(&current_cpu_data))
1613 mce_init(); 1629 return;
1630 mce_init();
1614 mce_init_timer(); 1631 mce_init_timer();
1615} 1632}
1616 1633
@@ -1620,6 +1637,26 @@ static void mce_restart(void)
1620 on_each_cpu(mce_cpu_restart, NULL, 1); 1637 on_each_cpu(mce_cpu_restart, NULL, 1);
1621} 1638}
1622 1639
1640/* Toggle features for corrected errors */
1641static void mce_disable_ce(void *all)
1642{
1643 if (!mce_available(&current_cpu_data))
1644 return;
1645 if (all)
1646 del_timer_sync(&__get_cpu_var(mce_timer));
1647 cmci_clear();
1648}
1649
1650static void mce_enable_ce(void *all)
1651{
1652 if (!mce_available(&current_cpu_data))
1653 return;
1654 cmci_reenable();
1655 cmci_recheck();
1656 if (all)
1657 mce_init_timer();
1658}
1659
1623static struct sysdev_class mce_sysclass = { 1660static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend, 1661 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown, 1662 .shutdown = mce_shutdown,
@@ -1659,26 +1696,70 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1659static ssize_t 1696static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1697show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{ 1698{
1662 strcpy(buf, trigger); 1699 strcpy(buf, mce_helper);
1663 strcat(buf, "\n"); 1700 strcat(buf, "\n");
1664 return strlen(trigger) + 1; 1701 return strlen(mce_helper) + 1;
1665} 1702}
1666 1703
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1704static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz) 1705 const char *buf, size_t siz)
1669{ 1706{
1670 char *p; 1707 char *p;
1671 int len;
1672 1708
1673 strncpy(trigger, buf, sizeof(trigger)); 1709 strncpy(mce_helper, buf, sizeof(mce_helper));
1674 trigger[sizeof(trigger)-1] = 0; 1710 mce_helper[sizeof(mce_helper)-1] = 0;
1675 len = strlen(trigger); 1711 p = strchr(mce_helper, '\n');
1676 p = strchr(trigger, '\n');
1677 1712
1678 if (*p) 1713 if (p)
1679 *p = 0; 1714 *p = 0;
1680 1715
1681 return len; 1716 return strlen(mce_helper) + !!p;
1717}
1718
1719static ssize_t set_ignore_ce(struct sys_device *s,
1720 struct sysdev_attribute *attr,
1721 const char *buf, size_t size)
1722{
1723 u64 new;
1724
1725 if (strict_strtoull(buf, 0, &new) < 0)
1726 return -EINVAL;
1727
1728 if (mce_ignore_ce ^ !!new) {
1729 if (new) {
1730 /* disable ce features */
1731 on_each_cpu(mce_disable_ce, (void *)1, 1);
1732 mce_ignore_ce = 1;
1733 } else {
1734 /* enable ce features */
1735 mce_ignore_ce = 0;
1736 on_each_cpu(mce_enable_ce, (void *)1, 1);
1737 }
1738 }
1739 return size;
1740}
1741
1742static ssize_t set_cmci_disabled(struct sys_device *s,
1743 struct sysdev_attribute *attr,
1744 const char *buf, size_t size)
1745{
1746 u64 new;
1747
1748 if (strict_strtoull(buf, 0, &new) < 0)
1749 return -EINVAL;
1750
1751 if (mce_cmci_disabled ^ !!new) {
1752 if (new) {
1753 /* disable cmci */
1754 on_each_cpu(mce_disable_ce, NULL, 1);
1755 mce_cmci_disabled = 1;
1756 } else {
1757 /* enable cmci */
1758 mce_cmci_disabled = 0;
1759 on_each_cpu(mce_enable_ce, NULL, 1);
1760 }
1761 }
1762 return size;
1682} 1763}
1683 1764
1684static ssize_t store_int_with_restart(struct sys_device *s, 1765static ssize_t store_int_with_restart(struct sys_device *s,
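
The new ignore_ce and cmci_disabled attributes share one shape: parse the write with strict_strtoull(), and only when the boolean actually flips run a per-CPU enable or disable callback everywhere with on_each_cpu(). A reduced sketch outside the sysdev plumbing; feature_enabled and the two callbacks are illustrative.

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/smp.h>

static int feature_enabled;

/* Both callbacks run with interrupts disabled on each CPU; no sleeping. */
static void feature_enable(void *info)  { /* per-CPU enable work */ }
static void feature_disable(void *info) { /* per-CPU disable work */ }

static ssize_t store_feature(const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (feature_enabled ^ !!new) {		/* only act on a real change */
		if (new)
			on_each_cpu(feature_enable, NULL, 1);
		else
			on_each_cpu(feature_disable, NULL, 1);
		feature_enabled = !!new;
	}
	return size;
}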
@@ -1693,6 +1774,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1774static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1775static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1776static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1777static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1696 1778
1697static struct sysdev_ext_attribute attr_check_interval = { 1779static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1780 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1700,9 +1782,24 @@ static struct sysdev_ext_attribute attr_check_interval = {
1700 &check_interval 1782 &check_interval
1701}; 1783};
1702 1784
1785static struct sysdev_ext_attribute attr_ignore_ce = {
1786 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1787 &mce_ignore_ce
1788};
1789
1790static struct sysdev_ext_attribute attr_cmci_disabled = {
1791 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1792 &mce_cmci_disabled
1793};
1794
1703static struct sysdev_attribute *mce_attrs[] = { 1795static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1796 &attr_tolerant.attr,
1797 &attr_check_interval.attr,
1798 &attr_trigger,
1705 &attr_monarch_timeout.attr, 1799 &attr_monarch_timeout.attr,
1800 &attr_dont_log_ce.attr,
1801 &attr_ignore_ce.attr,
1802 &attr_cmci_disabled.attr,
1706 NULL 1803 NULL
1707}; 1804};
1708 1805
@@ -1712,7 +1809,7 @@ static cpumask_var_t mce_dev_initialized;
1712static __cpuinit int mce_create_device(unsigned int cpu) 1809static __cpuinit int mce_create_device(unsigned int cpu)
1713{ 1810{
1714 int err; 1811 int err;
1715 int i; 1812 int i, j;
1716 1813
1717 if (!mce_available(&boot_cpu_data)) 1814 if (!mce_available(&boot_cpu_data))
1718 return -EIO; 1815 return -EIO;
@@ -1730,9 +1827,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1730 if (err) 1827 if (err)
1731 goto error; 1828 goto error;
1732 } 1829 }
1733 for (i = 0; i < banks; i++) { 1830 for (j = 0; j < banks; j++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1831 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]); 1832 &bank_attrs[j]);
1736 if (err) 1833 if (err)
1737 goto error2; 1834 goto error2;
1738 } 1835 }
@@ -1740,8 +1837,8 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1740 1837
1741 return 0; 1838 return 0;
1742error2: 1839error2:
1743 while (--i >= 0) 1840 while (--j >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1841 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
1745error: 1842error:
1746 while (--i >= 0) 1843 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1844 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
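
mce_create_device() previously reused i for both the attribute loop and the bank loop, so the bank error path rewound the wrong counter over the wrong array; the fix gives the second loop its own index j. The generic two-counter unwind idiom, with hypothetical create/destroy helpers:

static int create_a(int i) { return 0; }	/* hypothetical */
static int create_b(int i) { return 0; }	/* hypothetical */
static void destroy_a(int i) { }
static void destroy_b(int i) { }

static int create_all(int na, int nb)
{
	int i, j, err;

	for (i = 0; i < na; i++) {
		err = create_a(i);
		if (err)
			goto undo_a;
	}
	for (j = 0; j < nb; j++) {
		err = create_b(j);
		if (err)
			goto undo_b;
	}
	return 0;

undo_b:
	while (--j >= 0)	/* unwind only what the second loop created */
		destroy_b(j);
undo_a:
	while (--i >= 0)	/* i still counts the first loop's objects */
		destroy_a(i);
	return err;
}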
@@ -1883,7 +1980,7 @@ static __init int mce_init_device(void)
1883 if (!mce_available(&boot_cpu_data)) 1980 if (!mce_available(&boot_cpu_data))
1884 return -EIO; 1981 return -EIO;
1885 1982
1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1983 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887 1984
1888 err = mce_init_banks(); 1985 err = mce_init_banks();
1889 if (err) 1986 if (err)
@@ -1915,7 +2012,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1915/* This has to be run for each processor */ 2012/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c) 2013void mcheck_init(struct cpuinfo_x86 *c)
1917{ 2014{
1918 if (mce_disabled == 1) 2015 if (mce_disabled)
1919 return; 2016 return;
1920 2017
1921 switch (c->x86_vendor) { 2018 switch (c->x86_vendor) {
@@ -1945,10 +2042,9 @@ void mcheck_init(struct cpuinfo_x86 *c)
1945 2042
1946static int __init mcheck_enable(char *str) 2043static int __init mcheck_enable(char *str)
1947{ 2044{
1948 mce_disabled = -1; 2045 mce_p5_enabled = 1;
1949 return 1; 2046 return 1;
1950} 2047}
1951
1952__setup("mce", mcheck_enable); 2048__setup("mce", mcheck_enable);
1953 2049
1954#endif /* CONFIG_X86_OLD_MCE */ 2050#endif /* CONFIG_X86_OLD_MCE */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
deleted file mode 100644
index 84a552b458c8..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4#ifdef CONFIG_X86_OLD_MCE
5void amd_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8#endif
9
10#ifdef CONFIG_X86_ANCIENT_MCE
11void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
12void winchip_mcheck_init(struct cpuinfo_x86 *c);
13extern int mce_p5_enable;
14static inline int mce_p5_enabled(void) { return mce_p5_enable; }
15static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
16#else
17static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
18static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
19static inline int mce_p5_enabled(void) { return 0; }
20static inline void enable_p5_mce(void) { }
21#endif
22
23/* Call the installed machine check handler for this CPU setup. */
24extern void (*machine_check_vector)(struct pt_regs *, long error_code);
25
26#ifdef CONFIG_X86_OLD_MCE
27
28extern int nr_mce_banks;
29
30void intel_set_thermal_handler(void);
31
32#else
33
34static inline void intel_set_thermal_handler(void) { }
35
36#endif
37
38void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 2b011d2d8579..e1acec0f7a32 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -1,74 +1,226 @@
1/* 1/*
2 * Common code for Intel machine checks 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
3 */ 6 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9 7
10#include <asm/therm_throt.h> 8#include <linux/init.h>
11#include <asm/processor.h> 9#include <linux/interrupt.h>
12#include <asm/system.h> 10#include <linux/percpu.h>
13#include <asm/apic.h> 11#include <asm/apic.h>
12#include <asm/processor.h>
14#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/mce.h>
15
16/*
17 * Support for Intel Correct Machine Check Interrupts. This allows
18 * the CPU to raise an interrupt when a corrected machine check happened.
19 * Normally we pick those up using a regular polling timer.
20 * Also supports reliable discovery of shared banks.
21 */
15 22
16#include "mce.h" 23static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
17 24
18void intel_init_thermal(struct cpuinfo_x86 *c) 25/*
26 * cmci_discover_lock protects against parallel discovery attempts
27 * which could race against each other.
28 */
29static DEFINE_SPINLOCK(cmci_discover_lock);
30
31#define CMCI_THRESHOLD 1
32
33static int cmci_supported(int *banks)
19{ 34{
20 unsigned int cpu = smp_processor_id(); 35 u64 cap;
21 int tm2 = 0;
22 u32 l, h;
23 36
24 /* Thermal monitoring depends on ACPI and clock modulation*/ 37 if (mce_cmci_disabled || mce_ignore_ce)
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) 38 return 0;
26 return;
27 39
28 /* 40 /*
29 * First check if its enabled already, in which case there might 41 * Vendor check is not strictly needed, but the initial
30 * be some SMM goo which handles it, so we can't even put a handler 42 * initialization is vendor keyed and this
31 * since it might be delivered via SMI already: 43 * makes sure none of the backdoors are entered otherwise.
32 */ 44 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 45 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
34 h = apic_read(APIC_LVTTHMR); 46 return 0;
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 47 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
36 printk(KERN_DEBUG 48 return 0;
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 49 rdmsrl(MSR_IA32_MCG_CAP, cap);
38 return; 50 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
51 return !!(cap & MCG_CMCI_P);
52}
53
54/*
55 * The interrupt handler. This is called on every event.
56 * Just call the poller directly to log any events.
57 * This could in theory increase the threshold under high load,
58 * but doesn't for now.
59 */
60static void intel_threshold_interrupt(void)
61{
62 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
63 mce_notify_irq();
64}
65
66static void print_update(char *type, int *hdr, int num)
67{
68 if (*hdr == 0)
69 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
70 *hdr = 1;
71 printk(KERN_CONT " %s:%d", type, num);
72}
73
74/*
75 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
76 * on this CPU. Use the algorithm recommended in the SDM to discover shared
77 * banks.
78 */
79static void cmci_discover(int banks, int boot)
80{
81 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
82 unsigned long flags;
83 int hdr = 0;
84 int i;
85
86 spin_lock_irqsave(&cmci_discover_lock, flags);
87 for (i = 0; i < banks; i++) {
88 u64 val;
89
90 if (test_bit(i, owned))
91 continue;
92
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
94
95 /* Already owned by someone else? */
96 if (val & CMCI_EN) {
97 if (test_and_clear_bit(i, owned) || boot)
98 print_update("SHD", &hdr, i);
99 __clear_bit(i, __get_cpu_var(mce_poll_banks));
100 continue;
101 }
102
103 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
106
107 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) {
109 if (!test_and_set_bit(i, owned) || boot)
110 print_update("CMCI", &hdr, i);
111 __clear_bit(i, __get_cpu_var(mce_poll_banks));
112 } else {
113 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
114 }
39 } 115 }
116 spin_unlock_irqrestore(&cmci_discover_lock, flags);
117 if (hdr)
118 printk(KERN_CONT "\n");
119}
120
121/*
122 * Just in case we missed an event during initialization check
123 * all the CMCI owned banks.
124 */
125void cmci_recheck(void)
126{
127 unsigned long flags;
128 int banks;
129
130 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
131 return;
132 local_irq_save(flags);
133 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
134 local_irq_restore(flags);
135}
40 136
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) 137/*
42 tm2 = 1; 138 * Disable CMCI on this CPU for all banks it owns when it goes down.
139 * This allows other CPUs to claim the banks on rediscovery.
140 */
141void cmci_clear(void)
142{
143 unsigned long flags;
144 int i;
145 int banks;
146 u64 val;
43 147
44 /* Check whether a vector already exists */ 148 if (!cmci_supported(&banks))
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return; 149 return;
150 spin_lock_irqsave(&cmci_discover_lock, flags);
151 for (i = 0; i < banks; i++) {
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue;
154 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned));
50 } 159 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags);
161}
162
163/*
164 * After a CPU went down cycle through all the others and rediscover
165 * Must run in process context.
166 */
167void cmci_rediscover(int dying)
168{
169 int banks;
170 int cpu;
171 cpumask_var_t old;
172
173 if (!cmci_supported(&banks))
174 return;
175 if (!alloc_cpumask_var(&old, GFP_KERNEL))
176 return;
177 cpumask_copy(old, &current->cpus_allowed);
51 178
52 /* We'll mask the thermal vector in the lapic till we're ready: */ 179 for_each_online_cpu(cpu) {
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 180 if (cpu == dying)
54 apic_write(APIC_LVTTHMR, h); 181 continue;
182 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
183 continue;
184 /* Recheck banks in case CPUs don't all have the same */
185 if (cmci_supported(&banks))
186 cmci_discover(banks, 0);
187 }
55 188
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 189 set_cpus_allowed_ptr(current, old);
57 wrmsr(MSR_IA32_THERM_INTERRUPT, 190 free_cpumask_var(old);
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 191}
59 192
60 intel_set_thermal_handler(); 193/*
194 * Reenable CMCI on this CPU in case a CPU down failed.
195 */
196void cmci_reenable(void)
197{
198 int banks;
199 if (cmci_supported(&banks))
200 cmci_discover(banks, 0);
201}
61 202
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 203static void intel_init_cmci(void)
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); 204{
205 int banks;
64 206
65 /* Unmask the thermal vector: */ 207 if (!cmci_supported(&banks))
66 l = apic_read(APIC_LVTTHMR); 208 return;
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68 209
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 210 mce_threshold_vector = intel_threshold_interrupt;
70 cpu, tm2 ? "TM2" : "TM1"); 211 cmci_discover(banks, 1);
212 /*
213 * For CPU #0 this runs with still disabled APIC, but that's
214 * ok because only the vector is set up. We still do another
215 * check for the banks later for CPU #0 just to make sure
216 * to not miss any events.
217 */
218 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
219 cmci_recheck();
220}
71 221
72 /* enable thermal throttle processing */ 222void mce_intel_feature_init(struct cpuinfo_x86 *c)
73 atomic_set(&therm_throt_en, 1); 223{
224 intel_init_thermal(c);
225 intel_init_cmci();
74} 226}
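
cmci_discover() in the consolidated mce_intel.c claims a bank by writing the CMCI enable bit and reading IA32_MCn_CTL2 back: if the bit is already set the bank belongs to another CPU, and if the written bit does not stick the bank has no CMCI and stays with the polling timer. A hedged sketch of that test for a single bank; the two constants are defined locally here only to mirror the values the driver uses.

#include <linux/types.h>
#include <asm/msr.h>

#define SKETCH_CMCI_EN		(1ULL << 30)	/* IA32_MCi_CTL2 enable bit */
#define SKETCH_CMCI_THRESHOLD	1		/* interrupt after one corrected error */

/* Returns 1 if this CPU now owns CMCI delivery for 'bank', 0 otherwise. */
static int try_claim_cmci(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	if (val & SKETCH_CMCI_EN)
		return 0;			/* already owned elsewhere */

	wrmsrl(MSR_IA32_MC0_CTL2 + bank,
	       val | SKETCH_CMCI_EN | SKETCH_CMCI_THRESHOLD);
	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);

	return !!(val & SKETCH_CMCI_EN);	/* stuck => bank supports CMCI */
}

The real cmci_discover() performs this under cmci_discover_lock and also updates mce_banks_owned and mce_poll_banks so the polling timer skips banks that now raise interrupts.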
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
deleted file mode 100644
index f2ef6952c400..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ /dev/null
@@ -1,248 +0,0 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
6 */
7
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/percpu.h>
11#include <asm/processor.h>
12#include <asm/apic.h>
13#include <asm/msr.h>
14#include <asm/mce.h>
15#include <asm/hw_irq.h>
16#include <asm/idle.h>
17#include <asm/therm_throt.h>
18
19#include "mce.h"
20
21asmlinkage void smp_thermal_interrupt(void)
22{
23 __u64 msr_val;
24
25 ack_APIC_irq();
26
27 exit_idle();
28 irq_enter();
29
30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
32 mce_log_therm_throt_event(msr_val);
33
34 inc_irq_stat(irq_thermal_count);
35 irq_exit();
36}
37
38/*
39 * Support for Intel Correct Machine Check Interrupts. This allows
40 * the CPU to raise an interrupt when a corrected machine check happened.
41 * Normally we pick those up using a regular polling timer.
42 * Also supports reliable discovery of shared banks.
43 */
44
45static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
46
47/*
48 * cmci_discover_lock protects against parallel discovery attempts
49 * which could race against each other.
50 */
51static DEFINE_SPINLOCK(cmci_discover_lock);
52
53#define CMCI_THRESHOLD 1
54
55static int cmci_supported(int *banks)
56{
57 u64 cap;
58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
62 /*
63 * Vendor check is not strictly needed, but the initial
64 * initialization is vendor keyed and this
65 * makes sure none of the backdoors are entered otherwise.
66 */
67 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
68 return 0;
69 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
70 return 0;
71 rdmsrl(MSR_IA32_MCG_CAP, cap);
72 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
73 return !!(cap & MCG_CMCI_P);
74}
75
76/*
77 * The interrupt handler. This is called on every event.
78 * Just call the poller directly to log any events.
79 * This could in theory increase the threshold under high load,
80 * but doesn't for now.
81 */
82static void intel_threshold_interrupt(void)
83{
84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
85 mce_notify_irq();
86}
87
88static void print_update(char *type, int *hdr, int num)
89{
90 if (*hdr == 0)
91 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
92 *hdr = 1;
93 printk(KERN_CONT " %s:%d", type, num);
94}
95
96/*
97 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
98 * on this CPU. Use the algorithm recommended in the SDM to discover shared
99 * banks.
100 */
101static void cmci_discover(int banks, int boot)
102{
103 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
104 unsigned long flags;
105 int hdr = 0;
106 int i;
107
108 spin_lock_irqsave(&cmci_discover_lock, flags);
109 for (i = 0; i < banks; i++) {
110 u64 val;
111
112 if (test_bit(i, owned))
113 continue;
114
115 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
116
117 /* Already owned by someone else? */
118 if (val & CMCI_EN) {
119 if (test_and_clear_bit(i, owned) || boot)
120 print_update("SHD", &hdr, i);
121 __clear_bit(i, __get_cpu_var(mce_poll_banks));
122 continue;
123 }
124
125 val |= CMCI_EN | CMCI_THRESHOLD;
126 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
127 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
128
129 /* Did the enable bit stick? -- the bank supports CMCI */
130 if (val & CMCI_EN) {
131 if (!test_and_set_bit(i, owned) || boot)
132 print_update("CMCI", &hdr, i);
133 __clear_bit(i, __get_cpu_var(mce_poll_banks));
134 } else {
135 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
136 }
137 }
138 spin_unlock_irqrestore(&cmci_discover_lock, flags);
139 if (hdr)
140 printk(KERN_CONT "\n");
141}
142
143/*
144 * Just in case we missed an event during initialization check
145 * all the CMCI owned banks.
146 */
147void cmci_recheck(void)
148{
149 unsigned long flags;
150 int banks;
151
152 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
153 return;
154 local_irq_save(flags);
155 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
156 local_irq_restore(flags);
157}
158
159/*
160 * Disable CMCI on this CPU for all banks it owns when it goes down.
161 * This allows other CPUs to claim the banks on rediscovery.
162 */
163void cmci_clear(void)
164{
165 unsigned long flags;
166 int i;
167 int banks;
168 u64 val;
169
170 if (!cmci_supported(&banks))
171 return;
172 spin_lock_irqsave(&cmci_discover_lock, flags);
173 for (i = 0; i < banks; i++) {
174 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
175 continue;
176 /* Disable CMCI */
177 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
178 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
179 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
180 __clear_bit(i, __get_cpu_var(mce_banks_owned));
181 }
182 spin_unlock_irqrestore(&cmci_discover_lock, flags);
183}
184
185/*
186 * After a CPU went down cycle through all the others and rediscover
187 * Must run in process context.
188 */
189void cmci_rediscover(int dying)
190{
191 int banks;
192 int cpu;
193 cpumask_var_t old;
194
195 if (!cmci_supported(&banks))
196 return;
197 if (!alloc_cpumask_var(&old, GFP_KERNEL))
198 return;
199 cpumask_copy(old, &current->cpus_allowed);
200
201 for_each_online_cpu(cpu) {
202 if (cpu == dying)
203 continue;
204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
205 continue;
206 /* Recheck banks in case CPUs don't all have the same */
207 if (cmci_supported(&banks))
208 cmci_discover(banks, 0);
209 }
210
211 set_cpus_allowed_ptr(current, old);
212 free_cpumask_var(old);
213}
214
215/*
216 * Reenable CMCI on this CPU in case a CPU down failed.
217 */
218void cmci_reenable(void)
219{
220 int banks;
221 if (cmci_supported(&banks))
222 cmci_discover(banks, 0);
223}
224
225static void intel_init_cmci(void)
226{
227 int banks;
228
229 if (!cmci_supported(&banks))
230 return;
231
232 mce_threshold_vector = intel_threshold_interrupt;
233 cmci_discover(banks, 1);
234 /*
235 * For CPU #0 this runs with still disabled APIC, but that's
236 * ok because only the vector is set up. We still do another
237 * check for the banks later for CPU #0 just to make sure
238 * to not miss any events.
239 */
240 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
241 cmci_recheck();
242}
243
244void mce_intel_feature_init(struct cpuinfo_x86 *c)
245{
246 intel_init_thermal(c);
247 intel_init_cmci();
248}
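
Note: cmci_discover() in the file above claims a bank by setting the enable bit in that bank's CTL2 MSR and reading it back: if the bit was already set, another CPU owns the bank (shared); if the write does not stick, the bank has no CMCI and stays on the polling path. A stand-alone model of that write/read-back test over a simulated MSR array; the array, its initial values, and the "not implemented" marker are assumptions for the sketch, not real hardware behaviour.

#include <stdint.h>
#include <stdio.h>

#define CMCI_EN        (1ULL << 30)   /* enable bit, as in the patch */
#define CMCI_THRESHOLD 1ULL
#define NR_BANKS       6

/* Simulated MSR_IA32_MCx_CTL2 registers; bit 63 marks "CMCI not implemented". */
static uint64_t ctl2[NR_BANKS] = {
	0, CMCI_EN, 0, 1ULL << 63, 0, CMCI_EN,
};

static uint64_t rdmsr(int i)             { return ctl2[i] & ~(1ULL << 63); }
static void     wrmsr(int i, uint64_t v) { if (!(ctl2[i] >> 63)) ctl2[i] = v; }

int main(void)
{
	for (int i = 0; i < NR_BANKS; i++) {
		uint64_t val = rdmsr(i);

		if (val & CMCI_EN) {            /* already owned by another CPU */
			printf("bank %d: SHD (shared)\n", i);
			continue;
		}
		wrmsr(i, val | CMCI_EN | CMCI_THRESHOLD);
		val = rdmsr(i);
		if (val & CMCI_EN)              /* the enable bit stuck */
			printf("bank %d: CMCI (owned)\n", i);
		else                            /* no CMCI, keep polling this bank */
			printf("bank %d: poll only\n", i);
	}
	return 0;
}
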
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 70b710420f74..f5f2d6f71fb6 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -17,10 +17,9 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/mce.h>
20#include <asm/msr.h> 21#include <asm/msr.h>
21 22
22#include "mce.h"
23
24static int firstbank; 23static int firstbank;
25 24
26#define MCE_RATE (15*HZ) /* timer rate is 15s */ 25#define MCE_RATE (15*HZ) /* timer rate is 15s */
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index 82cee108a2d3..4482aea9aa2e 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -1,21 +1,15 @@
1/* 1/*
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4
5#include <linux/interrupt.h>
6#include <linux/kernel.h> 4#include <linux/kernel.h>
7#include <linux/types.h> 5#include <linux/types.h>
8#include <linux/init.h> 6#include <linux/init.h>
9#include <linux/smp.h> 7#include <linux/smp.h>
10 8
11#include <asm/therm_throt.h>
12#include <asm/processor.h> 9#include <asm/processor.h>
13#include <asm/system.h> 10#include <asm/mce.h>
14#include <asm/apic.h>
15#include <asm/msr.h> 11#include <asm/msr.h>
16 12
17#include "mce.h"
18
19/* as supported by the P4/Xeon family */ 13/* as supported by the P4/Xeon family */
20struct intel_mce_extended_msrs { 14struct intel_mce_extended_msrs {
21 u32 eax; 15 u32 eax;
@@ -33,46 +27,6 @@ struct intel_mce_extended_msrs {
33 27
34static int mce_num_extended_msrs; 28static int mce_num_extended_msrs;
35 29
36
37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55}
56
57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
60
61void smp_thermal_interrupt(struct pt_regs *regs)
62{
63 irq_enter();
64 vendor_thermal_interrupt(regs);
65 __get_cpu_var(irq_stat).irq_thermal_count++;
66 irq_exit();
67}
68
69void intel_set_thermal_handler(void)
70{
71 vendor_thermal_interrupt = intel_thermal_interrupt;
72}
73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
75
76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
78{ 32{
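
Note: the block removed from p4.c used a function pointer that defaults to an "unexpected interrupt" handler and is only pointed at the real handler once the vector is set up; the same dispatch now lives in therm_throt.c as smp_thermal_vector (see that hunk later on this page). A tiny stand-alone sketch of the pattern; the function names and messages are illustrative only.

#include <stdio.h>

static void unexpected_thermal_interrupt(void)
{
	printf("unexpected LVT TMR interrupt!\n");
}

static void intel_thermal_interrupt(void)
{
	printf("handling thermal event\n");
}

/* Default to the "unexpected" handler until the real one is installed. */
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;

int main(void)
{
	smp_thermal_vector();                         /* before setup: complains    */
	smp_thermal_vector = intel_thermal_interrupt; /* the init step in the patch */
	smp_thermal_vector();                         /* after setup: real handler  */
	return 0;
}
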
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 015f481ab1b0..5c0e6533d9bc 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -10,12 +10,11 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* By default disabled */ 16/* By default disabled */
18int mce_p5_enable; 17int mce_p5_enabled __read_mostly;
19 18
20/* Machine check handler for Pentium class Intel CPUs: */ 19/* Machine check handler for Pentium class Intel CPUs: */
21static void pentium_machine_check(struct pt_regs *regs, long error_code) 20static void pentium_machine_check(struct pt_regs *regs, long error_code)
@@ -43,15 +42,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
43{ 42{
44 u32 l, h; 43 u32 l, h;
45 44
46 /* Check for MCE support: */ 45 /* Default P5 to off as its often misconnected: */
47 if (!cpu_has(c, X86_FEATURE_MCE)) 46 if (!mce_p5_enabled)
48 return; 47 return;
49 48
50#ifdef CONFIG_X86_OLD_MCE 49 /* Check for MCE support: */
51 /* Default P5 to off as its often misconnected: */ 50 if (!cpu_has(c, X86_FEATURE_MCE))
52 if (mce_disabled != -1)
53 return; 51 return;
54#endif
55 52
56 machine_check_vector = pentium_machine_check; 53 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */ 54 /* Make sure the vector pointer is visible before we enable MCEs: */
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 43c24e667457..01e4f8178183 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -10,10 +10,9 @@
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
15#include "mce.h"
16
17/* Machine Check Handler For PII/PIII */ 16/* Machine Check Handler For PII/PIII */
18static void intel_machine_check(struct pt_regs *regs, long error_code) 17static void intel_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 7b1ae2e20ba5..5957a93e5173 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -13,21 +13,32 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16#include <linux/interrupt.h>
16#include <linux/notifier.h> 17#include <linux/notifier.h>
17#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/kernel.h>
18#include <linux/percpu.h> 20#include <linux/percpu.h>
19#include <linux/sysdev.h> 21#include <linux/sysdev.h>
22#include <linux/types.h>
23#include <linux/init.h>
24#include <linux/smp.h>
20#include <linux/cpu.h> 25#include <linux/cpu.h>
21 26
22#include <asm/therm_throt.h> 27#include <asm/processor.h>
28#include <asm/system.h>
29#include <asm/apic.h>
30#include <asm/idle.h>
31#include <asm/mce.h>
32#include <asm/msr.h>
23 33
24/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
25#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
26 36
27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
39static DEFINE_PER_CPU(bool, thermal_throttle_active);
29 40
30atomic_t therm_throt_en = ATOMIC_INIT(0); 41static atomic_t therm_throt_en = ATOMIC_INIT(0);
31 42
32#ifdef CONFIG_SYSFS 43#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 44#define define_therm_throt_sysdev_one_ro(_name) \
@@ -82,31 +93,37 @@ static struct attribute_group thermal_throttle_attr_group = {
82 * 1 : Event should be logged further, and a message has been 93 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog. 94 * printed to the syslog.
84 */ 95 */
85int therm_throt_process(int curr) 96static int therm_throt_process(int curr)
86{ 97{
87 unsigned int cpu = smp_processor_id(); 98 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64(); 99 __u64 tmp_jiffs = get_jiffies_64();
100 bool was_throttled = __get_cpu_var(thermal_throttle_active);
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
89 102
90 if (curr) 103 if (is_throttled)
91 __get_cpu_var(thermal_throttle_count)++; 104 __get_cpu_var(thermal_throttle_count)++;
92 105
93 if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) 106 if (!(was_throttled ^ is_throttled) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check)))
94 return 0; 108 return 0;
95 109
96 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
97 111
98 /* if we just entered the thermal event */ 112 /* if we just entered the thermal event */
99 if (curr) { 113 if (is_throttled) {
100 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 114 printk(KERN_CRIT "CPU%d: Temperature above threshold, "
101 "cpu clock throttled (total events = %lu)\n", cpu, 115 "cpu clock throttled (total events = %lu)\n",
102 __get_cpu_var(thermal_throttle_count)); 116 cpu, __get_cpu_var(thermal_throttle_count));
103 117
104 add_taint(TAINT_MACHINE_CHECK); 118 add_taint(TAINT_MACHINE_CHECK);
105 } else { 119 return 1;
106 printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); 120 }
121 if (was_throttled) {
122 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
123 return 1;
107 } 124 }
108 125
109 return 1; 126 return 0;
110} 127}
111 128
112#ifdef CONFIG_SYSFS 129#ifdef CONFIG_SYSFS
@@ -186,6 +203,94 @@ static __init int thermal_throttle_init_device(void)
186 203
187 return 0; 204 return 0;
188} 205}
189
190device_initcall(thermal_throttle_init_device); 206device_initcall(thermal_throttle_init_device);
207
191#endif /* CONFIG_SYSFS */ 208#endif /* CONFIG_SYSFS */
209
210/* Thermal transition interrupt handler */
211static void intel_thermal_interrupt(void)
212{
213 __u64 msr_val;
214
215 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
216 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
217 mce_log_therm_throt_event(msr_val);
218}
219
220static void unexpected_thermal_interrupt(void)
221{
222 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
223 smp_processor_id());
224 add_taint(TAINT_MACHINE_CHECK);
225}
226
227static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
228
229asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
230{
231 exit_idle();
232 irq_enter();
233 inc_irq_stat(irq_thermal_count);
234 smp_thermal_vector();
235 irq_exit();
236 /* Ack only at the end to avoid potential reentry */
237 ack_APIC_irq();
238}
239
240void intel_init_thermal(struct cpuinfo_x86 *c)
241{
242 unsigned int cpu = smp_processor_id();
243 int tm2 = 0;
244 u32 l, h;
245
246 /* Thermal monitoring depends on ACPI and clock modulation*/
247 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
248 return;
249
250 /*
251 * First check if its enabled already, in which case there might
252 * be some SMM goo which handles it, so we can't even put a handler
253 * since it might be delivered via SMI already:
254 */
255 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
256 h = apic_read(APIC_LVTTHMR);
257 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
258 printk(KERN_DEBUG
259 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
260 return;
261 }
262
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG
269 "CPU%d: Thermal LVT vector (%#x) already installed\n",
270 cpu, (h & APIC_VECTOR_MASK));
271 return;
272 }
273
274 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h);
277
278 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
279 wrmsr(MSR_IA32_THERM_INTERRUPT,
280 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
281
282 smp_thermal_vector = intel_thermal_interrupt;
283
284 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
285 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
286
287 /* Unmask the thermal vector: */
288 l = apic_read(APIC_LVTTHMR);
289 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
290
291 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
292 cpu, tm2 ? "TM2" : "TM1");
293
294 /* enable thermal throttle processing */
295 atomic_set(&therm_throt_en, 1);
296}
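
Note: the reworked therm_throt_process() logs on every throttle state transition and otherwise rate-limits reporting to one message per CHECK_INTERVAL. A compact user-space model of that decision logic, assuming a simulated time counter in place of jiffies and printf in place of printk.

#include <stdbool.h>
#include <stdio.h>

#define CHECK_INTERVAL 300        /* simulated time units between forced reports */

static unsigned long long next_check;
static unsigned long throttle_count;
static bool was_throttled;

/* Returns 1 when the event should be logged, mirroring therm_throt_process(). */
static int throttle_event(bool is_throttled, unsigned long long now)
{
	bool changed = was_throttled != is_throttled;

	was_throttled = is_throttled;
	if (is_throttled)
		throttle_count++;

	if (!changed && now < next_check)
		return 0;                 /* no transition and still rate-limited */

	next_check = now + CHECK_INTERVAL;
	if (is_throttled) {
		printf("t=%llu: throttled (total events = %lu)\n", now, throttle_count);
		return 1;
	}
	if (changed) {
		printf("t=%llu: temperature/speed back to normal\n", now);
		return 1;
	}
	return 0;
}

int main(void)
{
	throttle_event(true, 10);     /* logged: entered throttling        */
	throttle_event(true, 20);     /* silent: within CHECK_INTERVAL     */
	throttle_event(false, 30);    /* logged: left throttling           */
	throttle_event(false, 40);    /* silent: still normal              */
	throttle_event(true, 500);    /* logged: throttling again          */
	return 0;
}
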
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 81b02487090b..54060f565974 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -9,10 +9,9 @@
9 9
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/mce.h>
12#include <asm/msr.h> 13#include <asm/msr.h>
13 14
14#include "mce.h"
15
16/* Machine check handler for WinChip C6: */ 15/* Machine check handler for WinChip C6: */
17static void winchip_machine_check(struct pt_regs *regs, long error_code) 16static void winchip_machine_check(struct pt_regs *regs, long error_code)
18{ 17{
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 275bc142cd5d..900332b800f8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -19,6 +19,7 @@
19#include <linux/kdebug.h> 19#include <linux/kdebug.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/highmem.h>
22 23
23#include <asm/apic.h> 24#include <asm/apic.h>
24#include <asm/stacktrace.h> 25#include <asm/stacktrace.h>
@@ -54,6 +55,7 @@ struct x86_pmu {
54 int num_counters_fixed; 55 int num_counters_fixed;
55 int counter_bits; 56 int counter_bits;
56 u64 counter_mask; 57 u64 counter_mask;
58 int apic;
57 u64 max_period; 59 u64 max_period;
58 u64 intel_ctrl; 60 u64 intel_ctrl;
59}; 61};
@@ -65,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
65}; 67};
66 68
67/* 69/*
70 * Not sure about some of these
71 */
72static const u64 p6_perfmon_event_map[] =
73{
74 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
75 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
76 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
77 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
78 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
79 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
80 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
81};
82
83static u64 p6_pmu_event_map(int event)
84{
85 return p6_perfmon_event_map[event];
86}
87
88/*
89 * Counter setting that is specified not to count anything.
90 * We use this to effectively disable a counter.
91 *
92 * L2_RQSTS with 0 MESI unit mask.
93 */
94#define P6_NOP_COUNTER 0x0000002EULL
95
96static u64 p6_pmu_raw_event(u64 event)
97{
98#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
99#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
100#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
101#define P6_EVNTSEL_INV_MASK 0x00800000ULL
102#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
103
104#define P6_EVNTSEL_MASK \
105 (P6_EVNTSEL_EVENT_MASK | \
106 P6_EVNTSEL_UNIT_MASK | \
107 P6_EVNTSEL_EDGE_MASK | \
108 P6_EVNTSEL_INV_MASK | \
109 P6_EVNTSEL_COUNTER_MASK)
110
111 return event & P6_EVNTSEL_MASK;
112}
113
114
115/*
68 * Intel PerfMon v3. Used on Core2 and later. 116 * Intel PerfMon v3. Used on Core2 and later.
69 */ 117 */
70static const u64 intel_perfmon_event_map[] = 118static const u64 intel_perfmon_event_map[] =
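
Note: p6_pmu_raw_event() above sanitizes a user-supplied raw config by keeping only the event-select, unit-mask, edge, invert and counter-mask fields. A stand-alone check using the same mask values; the sample raw value is made up for the demonstration.

#include <stdint.h>
#include <stdio.h>

#define P6_EVNTSEL_EVENT_MASK   0x000000FFULL
#define P6_EVNTSEL_UNIT_MASK    0x0000FF00ULL
#define P6_EVNTSEL_EDGE_MASK    0x00040000ULL
#define P6_EVNTSEL_INV_MASK     0x00800000ULL
#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL

#define P6_EVNTSEL_MASK						\
	(P6_EVNTSEL_EVENT_MASK | P6_EVNTSEL_UNIT_MASK |		\
	 P6_EVNTSEL_EDGE_MASK  | P6_EVNTSEL_INV_MASK  |		\
	 P6_EVNTSEL_COUNTER_MASK)

static uint64_t p6_raw_event(uint64_t event)
{
	return event & P6_EVNTSEL_MASK;
}

int main(void)
{
	/* 0x79 is the CPU_CLK_UNHALTED event; the stray high bits get stripped. */
	uint64_t raw = 0xDEAD00000F000079ULL;

	printf("raw 0x%016llx -> config 0x%016llx\n",
	       (unsigned long long)raw,
	       (unsigned long long)p6_raw_event(raw));
	return 0;
}
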
@@ -389,23 +437,23 @@ static u64 intel_pmu_raw_event(u64 event)
389 return event & CORE_EVNTSEL_MASK; 437 return event & CORE_EVNTSEL_MASK;
390} 438}
391 439
392static const u64 amd_0f_hw_cache_event_ids 440static const u64 amd_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX] 441 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX] 442 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 443 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{ 444{
397 [ C(L1D) ] = { 445 [ C(L1D) ] = {
398 [ C(OP_READ) ] = { 446 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0, 447 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
400 [ C(RESULT_MISS) ] = 0, 448 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
401 }, 449 },
402 [ C(OP_WRITE) ] = { 450 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0, 451 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
404 [ C(RESULT_MISS) ] = 0, 452 [ C(RESULT_MISS) ] = 0,
405 }, 453 },
406 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0, 455 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
408 [ C(RESULT_MISS) ] = 0, 456 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
409 }, 457 },
410 }, 458 },
411 [ C(L1I ) ] = { 459 [ C(L1I ) ] = {
@@ -418,17 +466,17 @@ static const u64 amd_0f_hw_cache_event_ids
418 [ C(RESULT_MISS) ] = -1, 466 [ C(RESULT_MISS) ] = -1,
419 }, 467 },
420 [ C(OP_PREFETCH) ] = { 468 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0, 469 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
422 [ C(RESULT_MISS) ] = 0, 470 [ C(RESULT_MISS) ] = 0,
423 }, 471 },
424 }, 472 },
425 [ C(LL ) ] = { 473 [ C(LL ) ] = {
426 [ C(OP_READ) ] = { 474 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0, 475 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
428 [ C(RESULT_MISS) ] = 0, 476 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
429 }, 477 },
430 [ C(OP_WRITE) ] = { 478 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0, 479 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
432 [ C(RESULT_MISS) ] = 0, 480 [ C(RESULT_MISS) ] = 0,
433 }, 481 },
434 [ C(OP_PREFETCH) ] = { 482 [ C(OP_PREFETCH) ] = {
@@ -438,8 +486,8 @@ static const u64 amd_0f_hw_cache_event_ids
438 }, 486 },
439 [ C(DTLB) ] = { 487 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = { 488 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0, 489 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
442 [ C(RESULT_MISS) ] = 0, 490 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
443 }, 491 },
444 [ C(OP_WRITE) ] = { 492 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0, 493 [ C(RESULT_ACCESS) ] = 0,
@@ -566,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
566 614
567static bool reserve_pmc_hardware(void) 615static bool reserve_pmc_hardware(void)
568{ 616{
617#ifdef CONFIG_X86_LOCAL_APIC
569 int i; 618 int i;
570 619
571 if (nmi_watchdog == NMI_LOCAL_APIC) 620 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -580,9 +629,11 @@ static bool reserve_pmc_hardware(void)
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 629 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail; 630 goto eventsel_fail;
582 } 631 }
632#endif
583 633
584 return true; 634 return true;
585 635
636#ifdef CONFIG_X86_LOCAL_APIC
586eventsel_fail: 637eventsel_fail:
587 for (i--; i >= 0; i--) 638 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i); 639 release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -597,10 +648,12 @@ perfctr_fail:
597 enable_lapic_nmi_watchdog(); 648 enable_lapic_nmi_watchdog();
598 649
599 return false; 650 return false;
651#endif
600} 652}
601 653
602static void release_pmc_hardware(void) 654static void release_pmc_hardware(void)
603{ 655{
656#ifdef CONFIG_X86_LOCAL_APIC
604 int i; 657 int i;
605 658
606 for (i = 0; i < x86_pmu.num_counters; i++) { 659 for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -610,6 +663,7 @@ static void release_pmc_hardware(void)
610 663
611 if (nmi_watchdog == NMI_LOCAL_APIC) 664 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog(); 665 enable_lapic_nmi_watchdog();
666#endif
613} 667}
614 668
615static void hw_perf_counter_destroy(struct perf_counter *counter) 669static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -665,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
665{ 719{
666 struct perf_counter_attr *attr = &counter->attr; 720 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw; 721 struct hw_perf_counter *hwc = &counter->hw;
722 u64 config;
668 int err; 723 int err;
669 724
670 if (!x86_pmu_initialized()) 725 if (!x86_pmu_initialized())
@@ -700,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
700 hwc->sample_period = x86_pmu.max_period; 755 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period; 756 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period); 757 atomic64_set(&hwc->period_left, hwc->sample_period);
758 } else {
759 /*
760 * If we have a PMU initialized but no APIC
761 * interrupts, we cannot sample hardware
762 * counters (user-space has to fall back and
763 * sample via a hrtimer based software counter):
764 */
765 if (!x86_pmu.apic)
766 return -EOPNOTSUPP;
703 } 767 }
704 768
705 counter->destroy = hw_perf_counter_destroy; 769 counter->destroy = hw_perf_counter_destroy;
@@ -717,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
717 781
718 if (attr->config >= x86_pmu.max_events) 782 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL; 783 return -EINVAL;
784
720 /* 785 /*
721 * The generic map: 786 * The generic map:
722 */ 787 */
723 hwc->config |= x86_pmu.event_map(attr->config); 788 config = x86_pmu.event_map(attr->config);
789
790 if (config == 0)
791 return -ENOENT;
792
793 if (config == -1LL)
794 return -EINVAL;
795
796 hwc->config |= config;
724 797
725 return 0; 798 return 0;
726} 799}
727 800
801static void p6_pmu_disable_all(void)
802{
803 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
804 u64 val;
805
806 if (!cpuc->enabled)
807 return;
808
809 cpuc->enabled = 0;
810 barrier();
811
812 /* p6 only has one enable register */
813 rdmsrl(MSR_P6_EVNTSEL0, val);
814 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
815 wrmsrl(MSR_P6_EVNTSEL0, val);
816}
817
728static void intel_pmu_disable_all(void) 818static void intel_pmu_disable_all(void)
729{ 819{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -766,6 +856,23 @@ void hw_perf_disable(void)
766 return x86_pmu.disable_all(); 856 return x86_pmu.disable_all();
767} 857}
768 858
859static void p6_pmu_enable_all(void)
860{
861 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
862 unsigned long val;
863
864 if (cpuc->enabled)
865 return;
866
867 cpuc->enabled = 1;
868 barrier();
869
870 /* p6 only has one enable register */
871 rdmsrl(MSR_P6_EVNTSEL0, val);
872 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
873 wrmsrl(MSR_P6_EVNTSEL0, val);
874}
875
769static void intel_pmu_enable_all(void) 876static void intel_pmu_enable_all(void)
770{ 877{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -783,13 +890,13 @@ static void amd_pmu_enable_all(void)
783 barrier(); 890 barrier();
784 891
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 892 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
893 struct perf_counter *counter = cpuc->counters[idx];
786 u64 val; 894 u64 val;
787 895
788 if (!test_bit(idx, cpuc->active_mask)) 896 if (!test_bit(idx, cpuc->active_mask))
789 continue; 897 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 898
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) 899 val = counter->hw.config;
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 900 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 901 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 } 902 }
@@ -818,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack)
818 925
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 926static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{ 927{
821 int err; 928 (void)checking_wrmsrl(hwc->config_base + idx,
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 929 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824} 930}
825 931
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 932static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{ 933{
828 int err; 934 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831} 935}
832 936
833static inline void 937static inline void
@@ -835,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{ 939{
836 int idx = __idx - X86_PMC_IDX_FIXED; 940 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask; 941 u64 ctrl_val, mask;
838 int err;
839 942
840 mask = 0xfULL << (idx * 4); 943 mask = 0xfULL << (idx * 4);
841 944
842 rdmsrl(hwc->config_base, ctrl_val); 945 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask; 946 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val); 947 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
948}
949
950static inline void
951p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
952{
953 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
954 u64 val = P6_NOP_COUNTER;
955
956 if (cpuc->enabled)
957 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
958
959 (void)checking_wrmsrl(hwc->config_base + idx, val);
845} 960}
846 961
847static inline void 962static inline void
@@ -911,6 +1026,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
911 err = checking_wrmsrl(hwc->counter_base + idx, 1026 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask); 1027 (u64)(-left) & x86_pmu.counter_mask);
913 1028
1029 perf_counter_update_userpage(counter);
1030
914 return ret; 1031 return ret;
915} 1032}
916 1033
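
Note: x86_perf_counter_set_period() arms a counter by writing the two's-complement of the remaining period, masked to the implemented counter width, so the hardware counter wraps to zero after exactly "left" increments. A quick model with a 32-bit effective width, the value used for the P6 PMU added by this patch; the period chosen is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define COUNTER_BITS 32
#define COUNTER_MASK ((1ULL << COUNTER_BITS) - 1)

int main(void)
{
	uint64_t left = 100000;                      /* events until next sample */
	uint64_t hw   = (uint64_t)(-(int64_t)left) & COUNTER_MASK;

	printf("programmed value: 0x%llx\n", (unsigned long long)hw);

	/* Simulate the hardware counting 'left' events. */
	hw = (hw + left) & COUNTER_MASK;

	/* The counter wrapped to 0: this is where the overflow interrupt fires. */
	/* The IRQ handlers in this patch skip counters whose top implemented    */
	/* bit is still set, i.e. the value is still negative and no overflow    */
	/* has happened yet.                                                     */
	printf("after %llu events: 0x%llx\n",
	       (unsigned long long)left, (unsigned long long)hw);
	return 0;
}
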
@@ -940,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
940 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1057 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941} 1058}
942 1059
1060static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1061{
1062 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1063 u64 val;
1064
1065 val = hwc->config;
1066 if (cpuc->enabled)
1067 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1068
1069 (void)checking_wrmsrl(hwc->config_base + idx, val);
1070}
1071
1072
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{ 1074{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -956,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
956 1086
957 if (cpuc->enabled) 1087 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx); 1088 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961} 1089}
962 1090
963static int 1091static int
@@ -968,13 +1096,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
968 if (!x86_pmu.num_counters_fixed) 1096 if (!x86_pmu.num_counters_fixed)
969 return -1; 1097 return -1;
970 1098
971 /*
972 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
973 */
974 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
975 boot_cpu_data.x86_model == 28)
976 return -1;
977
978 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1099 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
979 1100
980 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1101 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
@@ -1040,6 +1161,8 @@ try_generic:
1040 x86_perf_counter_set_period(counter, hwc, idx); 1161 x86_perf_counter_set_period(counter, hwc, idx);
1041 x86_pmu.enable(hwc, idx); 1162 x86_pmu.enable(hwc, idx);
1042 1163
1164 perf_counter_update_userpage(counter);
1165
1043 return 0; 1166 return 0;
1044} 1167}
1045 1168
@@ -1132,6 +1255,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
1132 x86_perf_counter_update(counter, hwc, idx); 1255 x86_perf_counter_update(counter, hwc, idx);
1133 cpuc->counters[idx] = NULL; 1256 cpuc->counters[idx] = NULL;
1134 clear_bit(idx, cpuc->used_mask); 1257 clear_bit(idx, cpuc->used_mask);
1258
1259 perf_counter_update_userpage(counter);
1135} 1260}
1136 1261
1137/* 1262/*
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void)
1176 local_irq_restore(flags); 1301 local_irq_restore(flags);
1177} 1302}
1178 1303
1304static int p6_pmu_handle_irq(struct pt_regs *regs)
1305{
1306 struct perf_sample_data data;
1307 struct cpu_hw_counters *cpuc;
1308 struct perf_counter *counter;
1309 struct hw_perf_counter *hwc;
1310 int idx, handled = 0;
1311 u64 val;
1312
1313 data.regs = regs;
1314 data.addr = 0;
1315
1316 cpuc = &__get_cpu_var(cpu_hw_counters);
1317
1318 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1319 if (!test_bit(idx, cpuc->active_mask))
1320 continue;
1321
1322 counter = cpuc->counters[idx];
1323 hwc = &counter->hw;
1324
1325 val = x86_perf_counter_update(counter, hwc, idx);
1326 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1327 continue;
1328
1329 /*
1330 * counter overflow
1331 */
1332 handled = 1;
1333 data.period = counter->hw.last_period;
1334
1335 if (!x86_perf_counter_set_period(counter, hwc, idx))
1336 continue;
1337
1338 if (perf_counter_overflow(counter, 1, &data))
1339 p6_pmu_disable_counter(hwc, idx);
1340 }
1341
1342 if (handled)
1343 inc_irq_stat(apic_perf_irqs);
1344
1345 return handled;
1346}
1179 1347
1180/* 1348/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling 1349 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{ 1353{
1186 struct perf_sample_data data; 1354 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc; 1355 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops; 1356 int bit, loops;
1189 u64 ack, status; 1357 u64 ack, status;
1190 1358
1191 data.regs = regs; 1359 data.regs = regs;
1192 data.addr = 0; 1360 data.addr = 0;
1193 1361
1194 cpu = smp_processor_id(); 1362 cpuc = &__get_cpu_var(cpu_hw_counters);
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196 1363
1197 perf_disable(); 1364 perf_disable();
1198 status = intel_pmu_get_status(); 1365 status = intel_pmu_get_status();
@@ -1223,6 +1390,8 @@ again:
1223 if (!intel_pmu_save_and_restart(counter)) 1390 if (!intel_pmu_save_and_restart(counter))
1224 continue; 1391 continue;
1225 1392
1393 data.period = counter->hw.last_period;
1394
1226 if (perf_counter_overflow(counter, 1, &data)) 1395 if (perf_counter_overflow(counter, 1, &data))
1227 intel_pmu_disable_counter(&counter->hw, bit); 1396 intel_pmu_disable_counter(&counter->hw, bit);
1228 } 1397 }
@@ -1247,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1247 struct cpu_hw_counters *cpuc; 1416 struct cpu_hw_counters *cpuc;
1248 struct perf_counter *counter; 1417 struct perf_counter *counter;
1249 struct hw_perf_counter *hwc; 1418 struct hw_perf_counter *hwc;
1250 int cpu, idx, handled = 0; 1419 int idx, handled = 0;
1251 u64 val; 1420 u64 val;
1252 1421
1253 data.regs = regs; 1422 data.regs = regs;
1254 data.addr = 0; 1423 data.addr = 0;
1255 1424
1256 cpu = smp_processor_id(); 1425 cpuc = &__get_cpu_var(cpu_hw_counters);
1257 cpuc = &per_cpu(cpu_hw_counters, cpu);
1258 1426
1259 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1427 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1260 if (!test_bit(idx, cpuc->active_mask)) 1428 if (!test_bit(idx, cpuc->active_mask))
@@ -1297,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1297 1465
1298void set_perf_counter_pending(void) 1466void set_perf_counter_pending(void)
1299{ 1467{
1468#ifdef CONFIG_X86_LOCAL_APIC
1300 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1469 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1470#endif
1301} 1471}
1302 1472
1303void perf_counters_lapic_init(void) 1473void perf_counters_lapic_init(void)
1304{ 1474{
1305 if (!x86_pmu_initialized()) 1475#ifdef CONFIG_X86_LOCAL_APIC
1476 if (!x86_pmu.apic || !x86_pmu_initialized())
1306 return; 1477 return;
1307 1478
1308 /* 1479 /*
1309 * Always use NMI for PMU 1480 * Always use NMI for PMU
1310 */ 1481 */
1311 apic_write(APIC_LVTPC, APIC_DM_NMI); 1482 apic_write(APIC_LVTPC, APIC_DM_NMI);
1483#endif
1312} 1484}
1313 1485
1314static int __kprobes 1486static int __kprobes
@@ -1332,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
1332 1504
1333 regs = args->regs; 1505 regs = args->regs;
1334 1506
1507#ifdef CONFIG_X86_LOCAL_APIC
1335 apic_write(APIC_LVTPC, APIC_DM_NMI); 1508 apic_write(APIC_LVTPC, APIC_DM_NMI);
1509#endif
1336 /* 1510 /*
1337 * Can't rely on the handled return value to say it was our NMI, two 1511 * Can't rely on the handled return value to say it was our NMI, two
1338 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1512 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1351,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1351 .priority = 1 1525 .priority = 1
1352}; 1526};
1353 1527
1528static struct x86_pmu p6_pmu = {
1529 .name = "p6",
1530 .handle_irq = p6_pmu_handle_irq,
1531 .disable_all = p6_pmu_disable_all,
1532 .enable_all = p6_pmu_enable_all,
1533 .enable = p6_pmu_enable_counter,
1534 .disable = p6_pmu_disable_counter,
1535 .eventsel = MSR_P6_EVNTSEL0,
1536 .perfctr = MSR_P6_PERFCTR0,
1537 .event_map = p6_pmu_event_map,
1538 .raw_event = p6_pmu_raw_event,
1539 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1540 .apic = 1,
1541 .max_period = (1ULL << 31) - 1,
1542 .version = 0,
1543 .num_counters = 2,
1544 /*
1545 * Counters have 40 bits implemented. However they are designed such
1546 * that bits [32-39] are sign extensions of bit 31. As such the
1547 * effective width of a counter for P6-like PMU is 32 bits only.
1548 *
1549 * See IA-32 Intel Architecture Software developer manual Vol 3B
1550 */
1551 .counter_bits = 32,
1552 .counter_mask = (1ULL << 32) - 1,
1553};
1554
1354static struct x86_pmu intel_pmu = { 1555static struct x86_pmu intel_pmu = {
1355 .name = "Intel", 1556 .name = "Intel",
1356 .handle_irq = intel_pmu_handle_irq, 1557 .handle_irq = intel_pmu_handle_irq,
@@ -1363,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
1363 .event_map = intel_pmu_event_map, 1564 .event_map = intel_pmu_event_map,
1364 .raw_event = intel_pmu_raw_event, 1565 .raw_event = intel_pmu_raw_event,
1365 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 1566 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1567 .apic = 1,
1366 /* 1568 /*
1367 * Intel PMCs cannot be accessed sanely above 32 bit width, 1569 * Intel PMCs cannot be accessed sanely above 32 bit width,
1368 * so we install an artificial 1<<31 period regardless of 1570 * so we install an artificial 1<<31 period regardless of
@@ -1386,10 +1588,43 @@ static struct x86_pmu amd_pmu = {
1386 .num_counters = 4, 1588 .num_counters = 4,
1387 .counter_bits = 48, 1589 .counter_bits = 48,
1388 .counter_mask = (1ULL << 48) - 1, 1590 .counter_mask = (1ULL << 48) - 1,
1591 .apic = 1,
1389 /* use highest bit to detect overflow */ 1592 /* use highest bit to detect overflow */
1390 .max_period = (1ULL << 47) - 1, 1593 .max_period = (1ULL << 47) - 1,
1391}; 1594};
1392 1595
1596static int p6_pmu_init(void)
1597{
1598 switch (boot_cpu_data.x86_model) {
1599 case 1:
1600 case 3: /* Pentium Pro */
1601 case 5:
1602 case 6: /* Pentium II */
1603 case 7:
1604 case 8:
1605 case 11: /* Pentium III */
1606 break;
1607 case 9:
1608 case 13:
1609 /* Pentium M */
1610 break;
1611 default:
1612 pr_cont("unsupported p6 CPU model %d ",
1613 boot_cpu_data.x86_model);
1614 return -ENODEV;
1615 }
1616
1617 x86_pmu = p6_pmu;
1618
1619 if (!cpu_has_apic) {
1620 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1621 pr_info("no hardware sampling interrupt available.\n");
1622 x86_pmu.apic = 0;
1623 }
1624
1625 return 0;
1626}
1627
1393static int intel_pmu_init(void) 1628static int intel_pmu_init(void)
1394{ 1629{
1395 union cpuid10_edx edx; 1630 union cpuid10_edx edx;
@@ -1398,8 +1633,14 @@ static int intel_pmu_init(void)
1398 unsigned int ebx; 1633 unsigned int ebx;
1399 int version; 1634 int version;
1400 1635
1401 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 1636 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1637 /* check for P6 processor family */
1638 if (boot_cpu_data.x86 == 6) {
1639 return p6_pmu_init();
1640 } else {
1402 return -ENODEV; 1641 return -ENODEV;
1642 }
1643 }
1403 1644
1404 /* 1645 /*
1405 * Check whether the Architectural PerfMon supports 1646 * Check whether the Architectural PerfMon supports
@@ -1425,8 +1666,6 @@ static int intel_pmu_init(void)
1425 */ 1666 */
1426 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 1667 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1427 1668
1428 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1429
1430 /* 1669 /*
1431 * Install the hw-cache-events table: 1670 * Install the hw-cache-events table:
1432 */ 1671 */
@@ -1459,18 +1698,16 @@ static int intel_pmu_init(void)
1459 1698
1460static int amd_pmu_init(void) 1699static int amd_pmu_init(void)
1461{ 1700{
1701 /* Performance-monitoring supported from K7 and later: */
1702 if (boot_cpu_data.x86 < 6)
1703 return -ENODEV;
1704
1462 x86_pmu = amd_pmu; 1705 x86_pmu = amd_pmu;
1463 1706
1464 switch (boot_cpu_data.x86) { 1707 /* Events are common for all AMDs */
1465 case 0x0f: 1708 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
1466 case 0x10: 1709 sizeof(hw_cache_event_ids));
1467 case 0x11:
1468 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1469 sizeof(hw_cache_event_ids));
1470 1710
1471 pr_cont("AMD Family 0f/10/11 events, ");
1472 break;
1473 }
1474 return 0; 1711 return 0;
1475} 1712}
1476 1713
@@ -1498,21 +1735,22 @@ void __init init_hw_perf_counters(void)
1498 pr_cont("%s PMU driver.\n", x86_pmu.name); 1735 pr_cont("%s PMU driver.\n", x86_pmu.name);
1499 1736
1500 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1737 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1501 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1502 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 1738 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1503 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 1739 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1740 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1504 } 1741 }
1505 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 1742 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1506 perf_max_counters = x86_pmu.num_counters; 1743 perf_max_counters = x86_pmu.num_counters;
1507 1744
1508 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1745 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1509 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1510 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 1746 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1511 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 1747 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1748 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1512 } 1749 }
1513 1750
1514 perf_counter_mask |= 1751 perf_counter_mask |=
1515 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 1752 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1753 x86_pmu.intel_ctrl = perf_counter_mask;
1516 1754
1517 perf_counters_lapic_init(); 1755 perf_counters_lapic_init();
1518 register_die_notifier(&perf_counter_nmi_notifier); 1756 register_die_notifier(&perf_counter_nmi_notifier);
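
Note: init_hw_perf_counters() builds perf_counter_mask as one bit per generic counter plus one bit per fixed counter shifted up to X86_PMC_IDX_FIXED, and after this patch reuses that combined mask as intel_ctrl instead of reading MSR_CORE_PERF_GLOBAL_CTRL. A small sketch of the mask arithmetic with example values (4 generic and 3 fixed counters, fixed index 32); the counts are illustrative.

#include <stdint.h>
#include <stdio.h>

#define X86_PMC_IDX_FIXED 32      /* fixed counters live in the upper bit range */

int main(void)
{
	int num_counters = 4;       /* generic counters */
	int num_fixed    = 3;       /* fixed counters   */
	uint64_t mask;

	mask  = (1ULL << num_counters) - 1;                       /* bits 0..3   */
	mask |= ((1ULL << num_fixed) - 1) << X86_PMC_IDX_FIXED;   /* bits 32..34 */

	printf("perf_counter_mask = 0x%016llx\n", (unsigned long long)mask);
	return 0;
}
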
@@ -1554,14 +1792,15 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1554 */ 1792 */
1555 1793
1556static inline 1794static inline
1557void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) 1795void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1558{ 1796{
1559 if (entry->nr < MAX_STACK_DEPTH) 1797 if (entry->nr < PERF_MAX_STACK_DEPTH)
1560 entry->ip[entry->nr++] = ip; 1798 entry->ip[entry->nr++] = ip;
1561} 1799}
1562 1800
1563static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1564static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame);
1565 1804
1566 1805
1567static void 1806static void
@@ -1577,14 +1816,19 @@ static void backtrace_warning(void *data, char *msg)
1577 1816
1578static int backtrace_stack(void *data, char *name) 1817static int backtrace_stack(void *data, char *name)
1579{ 1818{
1580 /* Don't bother with IRQ stacks for now */ 1819 per_cpu(in_nmi_frame, smp_processor_id()) =
1581 return -1; 1820 x86_is_stack_id(NMI_STACK, name);
1821
1822 return 0;
1582} 1823}
1583 1824
1584static void backtrace_address(void *data, unsigned long addr, int reliable) 1825static void backtrace_address(void *data, unsigned long addr, int reliable)
1585{ 1826{
1586 struct perf_callchain_entry *entry = data; 1827 struct perf_callchain_entry *entry = data;
1587 1828
1829 if (per_cpu(in_nmi_frame, smp_processor_id()))
1830 return;
1831
1588 if (reliable) 1832 if (reliable)
1589 callchain_store(entry, addr); 1833 callchain_store(entry, addr);
1590} 1834}
@@ -1596,47 +1840,59 @@ static const struct stacktrace_ops backtrace_ops = {
1596 .address = backtrace_address, 1840 .address = backtrace_address,
1597}; 1841};
1598 1842
1843#include "../dumpstack.h"
1844
1599static void 1845static void
1600perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1846perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1601{ 1847{
1602 unsigned long bp; 1848 callchain_store(entry, PERF_CONTEXT_KERNEL);
1603 char *stack; 1849 callchain_store(entry, regs->ip);
1604 int nr = entry->nr;
1605 1850
1606 callchain_store(entry, instruction_pointer(regs)); 1851 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1852}
1607 1853
1608 stack = ((char *)regs + sizeof(struct pt_regs)); 1854/*
1609#ifdef CONFIG_FRAME_POINTER 1855 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1610 bp = frame_pointer(regs); 1856 */
1611#else 1857static unsigned long
1612 bp = 0; 1858copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1613#endif 1859{
1860 unsigned long offset, addr = (unsigned long)from;
1861 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1862 unsigned long size, len = 0;
1863 struct page *page;
1864 void *map;
1865 int ret;
1614 1866
1615 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); 1867 do {
1868 ret = __get_user_pages_fast(addr, 1, 0, &page);
1869 if (!ret)
1870 break;
1616 1871
1617 entry->kernel = entry->nr - nr; 1872 offset = addr & (PAGE_SIZE - 1);
1618} 1873 size = min(PAGE_SIZE - offset, n - len);
1619 1874
1875 map = kmap_atomic(page, type);
1876 memcpy(to, map+offset, size);
1877 kunmap_atomic(map, type);
1878 put_page(page);
1620 1879
1621struct stack_frame { 1880 len += size;
1622 const void __user *next_fp; 1881 to += size;
1623 unsigned long return_address; 1882 addr += size;
1624}; 1883
1884 } while (len < n);
1885
1886 return len;
1887}
1625 1888
1626static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1889static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1627{ 1890{
1628 int ret; 1891 unsigned long bytes;
1629
1630 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1631 return 0;
1632 1892
1633 ret = 1; 1893 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
1634 pagefault_disable();
1635 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1636 ret = 0;
1637 pagefault_enable();
1638 1894
1639 return ret; 1895 return bytes == sizeof(*frame);
1640} 1896}
1641 1897
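
Note: copy_from_user_nmi() above copies an arbitrary user range by splitting it at page boundaries: each iteration handles at most PAGE_SIZE minus the offset within the current page, maps the page, copies, and advances. The per-page splitting loop, extracted into a stand-alone form that copies between ordinary buffers; the page pinning and kmap_atomic() steps are replaced by a plain memcpy for the sketch.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* Copy n bytes, never crossing a page boundary within one memcpy call. */
static unsigned long copy_in_page_chunks(void *to, const void *from, unsigned long n)
{
	unsigned long addr = (unsigned long)from;
	unsigned long len = 0;
	char *dst = to;

	while (len < n) {
		unsigned long offset = addr & (PAGE_SIZE - 1);
		unsigned long size = PAGE_SIZE - offset;

		if (size > n - len)
			size = n - len;

		/* In the kernel this is __get_user_pages_fast() + kmap_atomic(). */
		memcpy(dst, (const char *)addr, size);

		len  += size;
		dst  += size;
		addr += size;
	}
	return len;
}

int main(void)
{
	static char src[3 * 4096], dst[3 * 4096];
	unsigned long copied;

	memset(src, 'x', sizeof(src));
	/* Start 100 bytes before a page-sized boundary to force a split. */
	copied = copy_in_page_chunks(dst, src + 4096 - 100, 8192);
	printf("copied %lu bytes\n", copied);
	return 0;
}
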
1642static void 1898static void
@@ -1644,28 +1900,28 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1644{ 1900{
1645 struct stack_frame frame; 1901 struct stack_frame frame;
1646 const void __user *fp; 1902 const void __user *fp;
1647 int nr = entry->nr;
1648 1903
1649 regs = (struct pt_regs *)current->thread.sp0 - 1; 1904 if (!user_mode(regs))
1650 fp = (void __user *)regs->bp; 1905 regs = task_pt_regs(current);
1906
1907 fp = (void __user *)regs->bp;
1651 1908
1909 callchain_store(entry, PERF_CONTEXT_USER);
1652 callchain_store(entry, regs->ip); 1910 callchain_store(entry, regs->ip);
1653 1911
1654 while (entry->nr < MAX_STACK_DEPTH) { 1912 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1655 frame.next_fp = NULL; 1913 frame.next_frame = NULL;
1656 frame.return_address = 0; 1914 frame.return_address = 0;
1657 1915
1658 if (!copy_stack_frame(fp, &frame)) 1916 if (!copy_stack_frame(fp, &frame))
1659 break; 1917 break;
1660 1918
1661 if ((unsigned long)fp < user_stack_pointer(regs)) 1919 if ((unsigned long)fp < regs->sp)
1662 break; 1920 break;
1663 1921
1664 callchain_store(entry, frame.return_address); 1922 callchain_store(entry, frame.return_address);
1665 fp = frame.next_fp; 1923 fp = frame.next_frame;
1666 } 1924 }
1667
1668 entry->user = entry->nr - nr;
1669} 1925}
1670 1926
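
Note: perf_callchain_user() above walks the user stack by following saved frame pointers: each stack_frame holds the previous frame pointer and a return address, and the walk stops when a frame cannot be copied or the pointer drops below the stack pointer. A self-contained simulation over a fabricated chain of frames; a real walk would read the traced task's stack with copy_from_user_nmi(), and the addresses here are made up.

#include <stddef.h>
#include <stdio.h>

struct stack_frame {
	const struct stack_frame *next_frame;
	unsigned long return_address;
};

int main(void)
{
	/* Fabricated frames, innermost first, linked like saved frame pointers. */
	struct stack_frame f3 = { NULL, 0 };
	struct stack_frame f2 = { &f3, 0x400300 };
	struct stack_frame f1 = { &f2, 0x400200 };
	struct stack_frame f0 = { &f1, 0x400100 };

	const struct stack_frame *fp = &f0;
	int depth = 0, max_depth = 16;

	while (fp && depth < max_depth) {
		if (!fp->return_address)      /* copy failed / end of chain */
			break;
		printf("callchain[%d] = 0x%lx\n", depth++, fp->return_address);
		fp = fp->next_frame;          /* follow the saved frame pointer */
	}
	return 0;
}
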
1671static void 1927static void
@@ -1701,9 +1957,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1701 entry = &__get_cpu_var(irq_entry); 1957 entry = &__get_cpu_var(irq_entry);
1702 1958
1703 entry->nr = 0; 1959 entry->nr = 0;
1704 entry->hv = 0;
1705 entry->kernel = 0;
1706 entry->user = 0;
1707 1960
1708 perf_do_callchain(regs, entry); 1961 perf_do_callchain(regs, entry);
1709 1962
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d6f5b9fbde32..e60ed740d2b3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)
716 wd_ops = &k7_wd_ops; 716 wd_ops = &k7_wd_ops;
717 break; 717 break;
718 case X86_VENDOR_INTEL: 718 case X86_VENDOR_INTEL:
719 /* 719 /* Work around where perfctr1 doesn't have a working enable
720 * Work around Core Duo (Yonah) errata AE49 where perfctr1 720 * bit as described in the following errata:
721 * doesn't have a working enable bit. 721 * AE49 Core Duo and Intel Core Solo 65 nm
722 * AN49 Intel Pentium Dual-Core
723 * AF49 Dual-Core Intel Xeon Processor LV
722 */ 724 */
723 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { 725 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
726 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
727 boot_cpu_data.x86_mask == 4))) {
724 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; 728 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
725 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; 729 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
726 } 730 }
@@ -799,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
799 wd_ops->rearm(wd, nmi_hz); 803 wd_ops->rearm(wd, nmi_hz);
800 return 1; 804 return 1;
801} 805}
802
803int lapic_watchdog_ok(void)
804{
805 return wd_ops != NULL;
806}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff958248e61d..5e409dc298a4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,6 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h>
30 31
31 32
32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
103#ifdef CONFIG_HPET_TIMER 104#ifdef CONFIG_HPET_TIMER
104 hpet_disable(); 105 hpet_disable();
105#endif 106#endif
107
108#ifdef CONFIG_X86_64
109 pci_iommu_shutdown();
110#endif
111
106 crash_save_cpu(regs, safe_smp_processor_id()); 112 crash_save_cpu(regs, safe_smp_processor_id());
107} 113}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 95ea5fa7d444..c8405718a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,6 +22,7 @@
22#include "dumpstack.h" 22#include "dumpstack.h"
23 23
24int panic_on_unrecovered_nmi; 24int panic_on_unrecovered_nmi;
25int panic_on_io_nmi;
25unsigned int code_bytes = 64; 26unsigned int code_bytes = 64;
26int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; 27int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
27static int die_counter; 28static int die_counter;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58dc..bca5fba91c9e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22/* Just a stub for now */
23int x86_is_stack_id(int id, char *name)
24{
25 return 0;
26}
27
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 28void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 29 unsigned long *stack, unsigned long bp,
24 const struct stacktrace_ops *ops, void *data) 30 const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd6..54b0a3276766 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22
23 unsigned *usedp, char **idp) 23static char x86_stack_ids[][8] = {
24{
25 static char ids[][8] = {
26 [DEBUG_STACK - 1] = "#DB", 24 [DEBUG_STACK - 1] = "#DB",
27 [NMI_STACK - 1] = "NMI", 25 [NMI_STACK - 1] = "NMI",
28 [DOUBLEFAULT_STACK - 1] = "#DF", 26 [DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
33 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 31 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
34#endif 32#endif
35 }; 33 };
34
35int x86_is_stack_id(int id, char *name)
36{
37 return x86_stack_ids[id - 1] == name;
38}
39
40static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
41 unsigned *usedp, char **idp)
42{
36 unsigned k; 43 unsigned k;
37 44
38 /* 45 /*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
61 if (*usedp & (1U << k)) 68 if (*usedp & (1U << k))
62 break; 69 break;
63 *usedp |= 1U << k; 70 *usedp |= 1U << k;
64 *idp = ids[k]; 71 *idp = x86_stack_ids[k];
65 return (unsigned long *)end; 72 return (unsigned long *)end;
66 } 73 }
67 /* 74 /*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
81 do { 88 do {
82 ++j; 89 ++j;
83 end -= EXCEPTION_STKSZ; 90 end -= EXCEPTION_STKSZ;
84 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 91 x86_stack_ids[j][4] = '1' +
92 (j - N_EXCEPTION_STACKS);
85 } while (stack < end - EXCEPTION_STKSZ); 93 } while (stack < end - EXCEPTION_STKSZ);
86 if (*usedp & (1U << j)) 94 if (*usedp & (1U << j))
87 break; 95 break;
88 *usedp |= 1U << j; 96 *usedp |= 1U << j;
89 *idp = ids[j]; 97 *idp = x86_stack_ids[j];
90 return (unsigned long *)end; 98 return (unsigned long *)end;
91 } 99 }
92#endif 100#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7271fa33d791..5cb5725b2bae 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void)
627#ifdef CONFIG_X86_64 627#ifdef CONFIG_X86_64
628 if (!found) { 628 if (!found) {
629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
630 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " 630 printk(KERN_ERR
631 "address range\n" 631 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
632 KERN_ERR "PCI: Unassigned devices with 32bit resource " 632 "PCI: Unassigned devices with 32bit resource registers may break!\n");
633 "registers may break!\n");
634 } 633 }
635#endif 634#endif
636 635
@@ -1383,6 +1382,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1383 return 32*1024*1024; 1382 return 32*1024*1024;
1384} 1383}
1385 1384
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1386
1386void __init e820_reserve_resources_late(void) 1387void __init e820_reserve_resources_late(void)
1387{ 1388{
1388 int i; 1389 int i;
@@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void)
1400 * avoid stolen RAM: 1401 * avoid stolen RAM:
1401 */ 1402 */
1402 for (i = 0; i < e820.nr_map; i++) { 1403 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i]; 1404 struct e820entry *entry = &e820.map[i];
1404 resource_size_t start, end; 1405 u64 start, end;
1405 1406
1406 if (entry->type != E820_RAM) 1407 if (entry->type != E820_RAM)
1407 continue; 1408 continue;
1408 start = entry->addr + entry->size; 1409 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start)); 1410 end = round_up(start, ram_alignment(start)) - 1;
1410 if (start == end) 1411 if (end > MAX_RESOURCE_SIZE)
1412 end = MAX_RESOURCE_SIZE;
1413 if (start >= end)
1411 continue; 1414 continue;
1412 reserve_region_with_split(&iomem_resource, start, 1415 reserve_region_with_split(&iomem_resource, start, end,
1413 end - 1, "RAM buffer"); 1416 "RAM buffer");
1414 } 1417 }
1415} 1418}
1416 1419
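
For reference, the rounding and clamping arithmetic performed by the reworked e820_reserve_resources_late() hunk, as a standalone C sketch; MAX_RESOURCE_SIZE models a 32-bit resource_size_t and the ram_alignment() threshold is invented for the example:

#include <stdint.h>
#include <stdio.h>

#define MAX_RESOURCE_SIZE ((uint32_t)-1)        /* 32-bit resource_size_t case */

static uint64_t round_up(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

static uint64_t ram_alignment(uint64_t pos)
{
	/* illustrative thresholds: 1MB low in memory, 32MB above 64MB */
	return pos < (64ULL << 20) ? (1ULL << 20) : (32ULL << 20);
}

int main(void)
{
	uint64_t start = 0xcff80000ULL;         /* end of a hypothetical RAM entry */
	uint64_t end = round_up(start, ram_alignment(start)) - 1;

	if (end > MAX_RESOURCE_SIZE)            /* clamp so the resource fits */
		end = MAX_RESOURCE_SIZE;
	if (start >= end)
		return 0;                       /* nothing left to reserve */

	printf("RAM buffer %#llx-%#llx\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

With these inputs the reserved gap is 0xcff80000-0xcfffffff, i.e. the tail of the partially used 32MB-aligned block.
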
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1736acc4d7aa..fe26ba3e3451 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void)
240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; 240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
241 int e820_type; 241 int e820_type;
242 242
243 if (md->attribute & EFI_MEMORY_WB) 243 switch (md->type) {
244 e820_type = E820_RAM; 244 case EFI_LOADER_CODE:
245 else 245 case EFI_LOADER_DATA:
246 case EFI_BOOT_SERVICES_CODE:
247 case EFI_BOOT_SERVICES_DATA:
248 case EFI_CONVENTIONAL_MEMORY:
249 if (md->attribute & EFI_MEMORY_WB)
250 e820_type = E820_RAM;
251 else
252 e820_type = E820_RESERVED;
253 break;
254 case EFI_ACPI_RECLAIM_MEMORY:
255 e820_type = E820_ACPI;
256 break;
257 case EFI_ACPI_MEMORY_NVS:
258 e820_type = E820_NVS;
259 break;
260 case EFI_UNUSABLE_MEMORY:
261 e820_type = E820_UNUSABLE;
262 break;
263 default:
264 /*
265 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
266 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
267 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
268 */
246 e820_type = E820_RESERVED; 269 e820_type = E820_RESERVED;
270 break;
271 }
247 e820_add_region(start, size, e820_type); 272 e820_add_region(start, size, e820_type);
248 } 273 }
249 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 274 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
@@ -329,7 +354,7 @@ void __init efi_init(void)
329 */ 354 */
330 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); 355 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
331 if (c16) { 356 if (c16) {
332 for (i = 0; i < sizeof(vendor) && *c16; ++i) 357 for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
333 vendor[i] = *c16++; 358 vendor[i] = *c16++;
334 vendor[i] = '\0'; 359 vendor[i] = '\0';
335 } else 360 } else
@@ -487,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
487 && end_pfn <= max_pfn_mapped)) 512 && end_pfn <= max_pfn_mapped))
488 va = __va(md->phys_addr); 513 va = __va(md->phys_addr);
489 else 514 else
490 va = efi_ioremap(md->phys_addr, size); 515 va = efi_ioremap(md->phys_addr, size, md->type);
491 516
492 md->virt_addr = (u64) (unsigned long) va; 517 md->virt_addr = (u64) (unsigned long) va;
493 518
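
The do_add_efi_memmap() switch above boils down to a small type-mapping table. A compact C restatement under local stand-in enums (the constants are not the kernel's EFI/e820 definitions):

#include <stdio.h>

enum efi_mem { LOADER_CODE, LOADER_DATA, BOOT_CODE, BOOT_DATA, CONVENTIONAL,
	       ACPI_RECLAIM, ACPI_NVS, UNUSABLE, RUNTIME_CODE, MMIO };
enum e820_mem { RAM, RESERVED, ACPI, NVS, UNUSABLE_RAM };

static enum e820_mem efi_to_e820(enum efi_mem type, int write_back)
{
	switch (type) {
	case LOADER_CODE:
	case LOADER_DATA:
	case BOOT_CODE:
	case BOOT_DATA:
	case CONVENTIONAL:
		/* usable as RAM only when write-back cacheable */
		return write_back ? RAM : RESERVED;
	case ACPI_RECLAIM:
		return ACPI;
	case ACPI_NVS:
		return NVS;
	case UNUSABLE:
		return UNUSABLE_RAM;
	default:
		/* runtime services code/data, MMIO, MMIO port space, PAL code */
		return RESERVED;
	}
}

int main(void)
{
	printf("%d %d\n", efi_to_e820(CONVENTIONAL, 1), efi_to_e820(MMIO, 0));
	return 0;                               /* prints "0 1": RAM, RESERVED */
}
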
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
98 early_runtime_code_mapping_set_exec(0); 98 early_runtime_code_mapping_set_exec(0);
99} 99}
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
102{ 103{
103 unsigned long last_map_pfn; 104 unsigned long last_map_pfn;
104 105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
107 return NULL; 111 return NULL;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add475c9..c097e7d607c6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
48#include <asm/segment.h> 48#include <asm/segment.h>
49#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/page_types.h> 50#include <asm/page_types.h>
51#include <asm/desc.h>
52#include <asm/percpu.h> 51#include <asm/percpu.h>
53#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
54#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
@@ -84,7 +83,7 @@
84#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 83#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
85#else 84#else
86#define preempt_stop(clobbers) 85#define preempt_stop(clobbers)
87#define resume_kernel restore_nocheck 86#define resume_kernel restore_all
88#endif 87#endif
89 88
90.macro TRACE_IRQS_IRET 89.macro TRACE_IRQS_IRET
@@ -372,7 +371,7 @@ END(ret_from_exception)
372ENTRY(resume_kernel) 371ENTRY(resume_kernel)
373 DISABLE_INTERRUPTS(CLBR_ANY) 372 DISABLE_INTERRUPTS(CLBR_ANY)
374 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 373 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
375 jnz restore_nocheck 374 jnz restore_all
376need_resched: 375need_resched:
377 movl TI_flags(%ebp), %ecx # need_resched set ? 376 movl TI_flags(%ebp), %ecx # need_resched set ?
378 testb $_TIF_NEED_RESCHED, %cl 377 testb $_TIF_NEED_RESCHED, %cl
@@ -540,6 +539,8 @@ syscall_exit:
540 jne syscall_exit_work 539 jne syscall_exit_work
541 540
542restore_all: 541restore_all:
542 TRACE_IRQS_IRET
543restore_all_notrace:
543 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 544 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
544 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 545 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
545 # are returning to the kernel. 546 # are returning to the kernel.
@@ -551,8 +552,6 @@ restore_all:
551 CFI_REMEMBER_STATE 552 CFI_REMEMBER_STATE
552 je ldt_ss # returning to user-space with LDT SS 553 je ldt_ss # returning to user-space with LDT SS
553restore_nocheck: 554restore_nocheck:
554 TRACE_IRQS_IRET
555restore_nocheck_notrace:
556 RESTORE_REGS 4 # skip orig_eax/error_code 555 RESTORE_REGS 4 # skip orig_eax/error_code
557 CFI_ADJUST_CFA_OFFSET -4 556 CFI_ADJUST_CFA_OFFSET -4
558irq_return: 557irq_return:
@@ -588,22 +587,34 @@ ldt_ss:
588 jne restore_nocheck 587 jne restore_nocheck
589#endif 588#endif
590 589
591 /* If returning to userspace with 16bit stack, 590/*
592 * try to fix the higher word of ESP, as the CPU 591 * Setup and switch to ESPFIX stack
593 * won't restore it. 592 *
594 * This is an "official" bug of all the x86-compatible 593 * We're returning to userspace with a 16 bit stack. The CPU will not
595 * CPUs, which we can try to work around to make 594 * restore the high word of ESP for us on executing iret... This is an
596 * dosemu and wine happy. */ 595 * "official" bug of all the x86-compatible CPUs, which we can work
597 movl PT_OLDESP(%esp), %eax 596 * around to make dosemu and wine happy. We do this by preloading the
598 movl %esp, %edx 597 * high word of ESP with the high word of the userspace ESP while
599 call patch_espfix_desc 598 * compensating for the offset by changing to the ESPFIX segment with
599 * a base address that matches for the difference.
600 */
601 mov %esp, %edx /* load kernel esp */
602 mov PT_OLDESP(%esp), %eax /* load userspace esp */
603 mov %dx, %ax /* eax: new kernel esp */
604 sub %eax, %edx /* offset (low word is 0) */
605 PER_CPU(gdt_page, %ebx)
606 shr $16, %edx
607 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
608 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
600 pushl $__ESPFIX_SS 609 pushl $__ESPFIX_SS
601 CFI_ADJUST_CFA_OFFSET 4 610 CFI_ADJUST_CFA_OFFSET 4
602 pushl %eax 611 push %eax /* new kernel esp */
603 CFI_ADJUST_CFA_OFFSET 4 612 CFI_ADJUST_CFA_OFFSET 4
613 /* Disable interrupts, but do not irqtrace this section: we
614 * will soon execute iret and the tracer was already set to
615 * the irqstate after the iret */
604 DISABLE_INTERRUPTS(CLBR_EAX) 616 DISABLE_INTERRUPTS(CLBR_EAX)
605 TRACE_IRQS_OFF 617 lss (%esp), %esp /* switch to espfix segment */
606 lss (%esp), %esp
607 CFI_ADJUST_CFA_OFFSET -8 618 CFI_ADJUST_CFA_OFFSET -8
608 jmp restore_nocheck 619 jmp restore_nocheck
609 CFI_ENDPROC 620 CFI_ENDPROC
@@ -716,15 +727,24 @@ PTREGSCALL(vm86)
716PTREGSCALL(vm86old) 727PTREGSCALL(vm86old)
717 728
718.macro FIXUP_ESPFIX_STACK 729.macro FIXUP_ESPFIX_STACK
719 /* since we are on a wrong stack, we cant make it a C code :( */ 730/*
 731 * Switch back from the ESPFIX stack to the normal zero-based stack
732 *
733 * We can't call C functions using the ESPFIX stack. This code reads
 734 * the high word of the segment base from the GDT and switches to the
735 * normal stack and adjusts ESP with the matching offset.
736 */
737 /* fixup the stack */
720 PER_CPU(gdt_page, %ebx) 738 PER_CPU(gdt_page, %ebx)
721 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) 739 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
722 addl %esp, %eax 740 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
741 shl $16, %eax
742 addl %esp, %eax /* the adjusted stack pointer */
723 pushl $__KERNEL_DS 743 pushl $__KERNEL_DS
724 CFI_ADJUST_CFA_OFFSET 4 744 CFI_ADJUST_CFA_OFFSET 4
725 pushl %eax 745 pushl %eax
726 CFI_ADJUST_CFA_OFFSET 4 746 CFI_ADJUST_CFA_OFFSET 4
727 lss (%esp), %esp 747 lss (%esp), %esp /* switch to the normal stack segment */
728 CFI_ADJUST_CFA_OFFSET -8 748 CFI_ADJUST_CFA_OFFSET -8
729.endm 749.endm
730.macro UNWIND_ESPFIX_STACK 750.macro UNWIND_ESPFIX_STACK
@@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller)
1154 pushl %edx 1174 pushl %edx
1155 movl 0xc(%esp), %edx 1175 movl 0xc(%esp), %edx
1156 lea 0x4(%ebp), %eax 1176 lea 0x4(%ebp), %eax
1177 movl (%ebp), %ecx
1157 subl $MCOUNT_INSN_SIZE, %edx 1178 subl $MCOUNT_INSN_SIZE, %edx
1158 call prepare_ftrace_return 1179 call prepare_ftrace_return
1159 popl %edx 1180 popl %edx
@@ -1168,6 +1189,7 @@ return_to_handler:
1168 pushl %eax 1189 pushl %eax
1169 pushl %ecx 1190 pushl %ecx
1170 pushl %edx 1191 pushl %edx
1192 movl %ebp, %eax
1171 call ftrace_return_to_handler 1193 call ftrace_return_to_handler
1172 movl %eax, 0xc(%esp) 1194 movl %eax, 0xc(%esp)
1173 popl %edx 1195 popl %edx
@@ -1329,7 +1351,7 @@ nmi_stack_correct:
1329 xorl %edx,%edx # zero error code 1351 xorl %edx,%edx # zero error code
1330 movl %esp,%eax # pt_regs pointer 1352 movl %esp,%eax # pt_regs pointer
1331 call do_nmi 1353 call do_nmi
1332 jmp restore_nocheck_notrace 1354 jmp restore_all_notrace
1333 CFI_ENDPROC 1355 CFI_ENDPROC
1334 1356
1335nmi_stack_fixup: 1357nmi_stack_fixup:
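
The arithmetic behind the rewritten ldt_ss path can be checked in isolation: build the "new kernel esp" whose high word comes from userspace, then scatter the compensating offset into the descriptor's base bytes at offsets +4 (bits 16..23) and +7 (bits 24..31), as the two mov instructions above do. A standalone C sketch with invented register values:

#include <stdint.h>
#include <stdio.h>

static void set_base_high(uint8_t desc[8], uint32_t base)
{
	desc[4] = (base >> 16) & 0xff;  /* descriptor byte +4: base bits 16..23 */
	desc[7] = (base >> 24) & 0xff;  /* descriptor byte +7: base bits 24..31 */
}

int main(void)
{
	uint8_t espfix_desc[8] = { 0 };
	uint32_t kernel_esp = 0xc15f3fd0, user_esp = 0x0009fff0;
	uint32_t new_esp = (user_esp & 0xffff0000) | (kernel_esp & 0xffff);
	uint32_t offset  = kernel_esp - new_esp;  /* low word is always zero */

	set_base_high(espfix_desc, offset);
	/* base + new esp lands back on the kernel stack: 0xc15f3fd0 */
	printf("base bytes %02x %02x, base+esp=%#x\n",
	       espfix_desc[4], espfix_desc[7], offset + new_esp);
	return 0;
}
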
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index de74f0a3e0ed..c251be745107 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller)
135 135
136 leaq 8(%rbp), %rdi 136 leaq 8(%rbp), %rdi
137 movq 0x38(%rsp), %rsi 137 movq 0x38(%rsp), %rsi
138 movq (%rbp), %rdx
138 subq $MCOUNT_INSN_SIZE, %rsi 139 subq $MCOUNT_INSN_SIZE, %rsi
139 140
140 call prepare_ftrace_return 141 call prepare_ftrace_return
@@ -150,6 +151,7 @@ GLOBAL(return_to_handler)
150 /* Save the return values */ 151 /* Save the return values */
151 movq %rax, (%rsp) 152 movq %rax, (%rsp)
152 movq %rdx, 8(%rsp) 153 movq %rdx, 8(%rsp)
154 movq %rbp, %rdi
153 155
154 call ftrace_return_to_handler 156 call ftrace_return_to_handler
155 157
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index b79c5533c421..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void)
408 * Hook the return address and push it in the stack of return addrs 408 * Hook the return address and push it in the stack of return addrs
409 * in current thread info. 409 * in current thread info.
410 */ 410 */
411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) 411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
412 unsigned long frame_pointer)
412{ 413{
413 unsigned long old; 414 unsigned long old;
414 int faulted; 415 int faulted;
@@ -416,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
416 unsigned long return_hooker = (unsigned long) 417 unsigned long return_hooker = (unsigned long)
417 &return_to_handler; 418 &return_to_handler;
418 419
419 /* Nmi's are currently unsupported */
420 if (unlikely(in_nmi()))
421 return;
422
423 if (unlikely(atomic_read(&current->tracing_graph_pause))) 420 if (unlikely(atomic_read(&current->tracing_graph_pause)))
424 return; 421 return;
425 422
@@ -453,7 +450,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
453 return; 450 return;
454 } 451 }
455 452
456 if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { 453 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
454 frame_pointer) == -EBUSY) {
457 *parent = old; 455 *parent = old;
458 return; 456 return;
459 } 457 }
@@ -496,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
496 494
497struct syscall_metadata *syscall_nr_to_meta(int nr) 495struct syscall_metadata *syscall_nr_to_meta(int nr)
498{ 496{
499 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) 497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
500 return NULL; 498 return NULL;
501 499
502 return syscalls_metadata[nr]; 500 return syscalls_metadata[nr];
503} 501}
504 502
505void arch_init_ftrace_syscalls(void) 503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
506{ 530{
507 int i; 531 int i;
508 struct syscall_metadata *meta; 532 struct syscall_metadata *meta;
509 unsigned long **psys_syscall_table = &sys_call_table; 533 unsigned long **psys_syscall_table = &sys_call_table;
510 static atomic_t refs;
511
512 if (atomic_inc_return(&refs) != 1)
513 goto end;
514 534
515 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
516 FTRACE_SYSCALL_MAX, GFP_KERNEL); 536 NR_syscalls, GFP_KERNEL);
517 if (!syscalls_metadata) { 537 if (!syscalls_metadata) {
518 WARN_ON(1); 538 WARN_ON(1);
519 return; 539 return -ENOMEM;
520 } 540 }
521 541
522 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { 542 for (i = 0; i < NR_syscalls; i++) {
523 meta = find_syscall_meta(psys_syscall_table[i]); 543 meta = find_syscall_meta(psys_syscall_table[i]);
524 syscalls_metadata[i] = meta; 544 syscalls_metadata[i] = meta;
525 } 545 }
526 return; 546 return 0;
527
528 /* Paranoid: avoid overflow */
529end:
530 atomic_dec(&refs);
531} 547}
548arch_initcall(arch_init_ftrace_syscalls);
532#endif 549#endif
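
syscall_name_to_nr() above is a linear scan of the metadata table, skipping syscalls that have no metadata entry. A self-contained C sketch of the same lookup, with a toy table standing in for syscalls_metadata[]:

#include <stdio.h>
#include <string.h>

struct meta { const char *name; };

#define NR_CALLS 3
static struct meta m_read = { "sys_read" }, m_write = { "sys_write" };
static struct meta *table[NR_CALLS] = { &m_read, &m_write, NULL };

static int name_to_nr(const char *name)
{
	int i;

	for (i = 0; i < NR_CALLS; i++)
		if (table[i] && !strcmp(table[i]->name, name))
			return i;
	return -1;      /* unknown syscall or no metadata */
}

int main(void)
{
	printf("%d %d\n", name_to_nr("sys_write"), name_to_nr("sys_close"));
	return 0;       /* prints "1 -1" */
}
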
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index dc5ed4bdd88d..cc827ac9e8d3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/page_types.h> 14#include <asm/page_types.h>
15#include <asm/pgtable_types.h> 15#include <asm/pgtable_types.h>
16#include <asm/desc.h>
17#include <asm/cache.h> 16#include <asm/cache.h>
18#include <asm/thread_info.h> 17#include <asm/thread_info.h>
19#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
@@ -262,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
262 * which will be freed later 261 * which will be freed later
263 */ 262 */
264 263
265#ifndef CONFIG_HOTPLUG_CPU 264__CPUINIT
266.section .init.text,"ax",@progbits
267#endif
268 265
269#ifdef CONFIG_SMP 266#ifdef CONFIG_SMP
270ENTRY(startup_32_smp) 267ENTRY(startup_32_smp)
@@ -603,7 +600,7 @@ ignore_int:
603#endif 600#endif
604 iret 601 iret
605 602
606.section .cpuinit.data,"wa" 603 __REFDATA
607.align 4 604.align 4
608ENTRY(initial_code) 605ENTRY(initial_code)
609 .long i386_start_kernel 606 .long i386_start_kernel
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 54b29bb24e71..fa54f78e2a05 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -12,7 +12,6 @@
12#include <linux/linkage.h> 12#include <linux/linkage.h>
13#include <linux/threads.h> 13#include <linux/threads.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/pgtable.h> 16#include <asm/pgtable.h>
18#include <asm/page.h> 17#include <asm/page.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 81408b93f887..dedc2bddf7a5 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev)
510{ 510{
511 511
512 if (request_irq(dev->irq, hpet_interrupt_handler, 512 if (request_irq(dev->irq, hpet_interrupt_handler,
513 IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) 513 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
514 dev->name, dev))
514 return -1; 515 return -1;
515 516
516 disable_irq(dev->irq); 517 disable_irq(dev->irq);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
187#ifdef CONFIG_X86_THERMAL_VECTOR 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif 189#endif
190#ifdef CONFIG_X86_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a78ecad0c900..c664d515f613 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void)
200 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
201} 201}
202 202
203static void paravirt_ops_setup(void) 203static void __init paravirt_ops_setup(void)
204{ 204{
205 pv_info.name = "KVM"; 205 pv_info.name = "KVM";
206 pv_info.paravirt_enabled = 1; 206 pv_info.paravirt_enabled = 1;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
347 347
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
352}; 352};
353 353
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 745579bc8256..1a041bcf506b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int no_iommu __read_mostly;
32/* Set this to 1 if there is a HW IOMMU in the system */ 32/* Set this to 1 if there is a HW IOMMU in the system */
33int iommu_detected __read_mostly = 0; 33int iommu_detected __read_mostly = 0;
34 34
35int iommu_pass_through;
36
35dma_addr_t bad_dma_address __read_mostly = 0; 37dma_addr_t bad_dma_address __read_mostly = 0;
36EXPORT_SYMBOL(bad_dma_address); 38EXPORT_SYMBOL(bad_dma_address);
37 39
@@ -210,6 +212,10 @@ static __init int iommu_setup(char *p)
210 if (!strncmp(p, "soft", 4)) 212 if (!strncmp(p, "soft", 4))
211 swiotlb = 1; 213 swiotlb = 1;
212#endif 214#endif
215 if (!strncmp(p, "pt", 2)) {
216 iommu_pass_through = 1;
217 return 1;
218 }
213 219
214 gart_parse_options(p); 220 gart_parse_options(p);
215 221
@@ -290,6 +296,8 @@ static int __init pci_iommu_init(void)
290void pci_iommu_shutdown(void) 296void pci_iommu_shutdown(void)
291{ 297{
292 gart_iommu_shutdown(); 298 gart_iommu_shutdown();
299
300 amd_iommu_shutdown();
293} 301}
294/* Must execute after PCI subsystem */ 302/* Must execute after PCI subsystem */
295fs_initcall(pci_iommu_init); 303fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index cfd9f9063896..d2e56b8f48e7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
675 nommu: 675 nommu:
676 /* Should not happen anymore */ 676 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 KERN_WARNING "falling back to iommu=soft.\n"); 678 "falling back to iommu=soft.\n");
679 return -1; 679 return -1;
680} 680}
681 681
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a1712f2b50f1..6af96ee44200 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
71{ 71{
72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 74 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
75 iommu_pass_through)
75 swiotlb = 1; 76 swiotlb = 1;
76#endif 77#endif
77 if (swiotlb_force) 78 if (swiotlb_force)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fc6e4b773fc4..1092a1a2fbe6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -509,16 +509,12 @@ static void c1e_idle(void)
509 if (!cpumask_test_cpu(cpu, c1e_mask)) { 509 if (!cpumask_test_cpu(cpu, c1e_mask)) {
510 cpumask_set_cpu(cpu, c1e_mask); 510 cpumask_set_cpu(cpu, c1e_mask);
511 /* 511 /*
512 * Force broadcast so ACPI can not interfere. Needs 512 * Force broadcast so ACPI can not interfere.
513 * to run with interrupts enabled as it uses
514 * smp_function_call.
515 */ 513 */
516 local_irq_enable();
517 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, 514 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
518 &cpu); 515 &cpu);
519 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", 516 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
520 cpu); 517 cpu);
521 local_irq_disable();
522 } 518 }
523 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 519 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
524 520
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index cabdabce3cb2..113b8927c822 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -36,10 +36,11 @@
36#include <asm/ds.h> 36#include <asm/ds.h>
37#include <asm/hw_breakpoint.h> 37#include <asm/hw_breakpoint.h>
38 38
39#include <trace/syscall.h>
40
41#include "tls.h" 39#include "tls.h"
42 40
41#define CREATE_TRACE_POINTS
42#include <trace/events/syscalls.h>
43
43enum x86_regset { 44enum x86_regset {
44 REGSET_GENERAL, 45 REGSET_GENERAL,
45 REGSET_FP, 46 REGSET_FP,
@@ -1548,8 +1549,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1548 tracehook_report_syscall_entry(regs)) 1549 tracehook_report_syscall_entry(regs))
1549 ret = -1L; 1550 ret = -1L;
1550 1551
1551 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1552 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1552 ftrace_syscall_enter(regs); 1553 trace_sys_enter(regs, regs->orig_ax);
1553 1554
1554 if (unlikely(current->audit_context)) { 1555 if (unlikely(current->audit_context)) {
1555 if (IS_IA32) 1556 if (IS_IA32)
@@ -1574,8 +1575,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1574 if (unlikely(current->audit_context)) 1575 if (unlikely(current->audit_context))
1575 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1576 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1576 1577
1577 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1578 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1578 ftrace_syscall_exit(regs); 1579 trace_sys_exit(regs, regs->ax);
1579 1580
1580 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1581 if (test_thread_flag(TIF_SYSCALL_TRACE))
1581 tracehook_report_syscall_exit(regs, 0); 1582 tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
60 "adc %5,%%edx ; " 60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2) 61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__ 63#elif defined(__x86_64__)
64 __asm__ ( 64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax" 65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); 66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
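
The inline assembly in scale_delta() computes (delta * mul_frac) >> 32, a 32.32 fixed-point multiply that discards the low 32 bits of the 96-bit product. A portable C restatement that splits delta instead of relying on a 128-bit type (the sample frequency ratio is illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac)
{
	uint64_t lo = (uint32_t)delta;  /* low 32 bits  */
	uint64_t hi = delta >> 32;      /* high 32 bits */

	/* ((hi << 32 | lo) * mul_frac) >> 32 == hi*mul_frac + (lo*mul_frac >> 32) */
	return hi * mul_frac + ((lo * mul_frac) >> 32);
}

int main(void)
{
	uint32_t mul_frac = 0x55555555; /* ~2^32/3: scale a 3 GHz TSC to ns */

	printf("%llu\n",                /* ~1e9 ns for 3e9 cycles */
	       (unsigned long long)scale_delta(3000000000ULL, mul_frac));
	return 0;
}
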
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..a06e8d101844 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h>
6#include <acpi/reboot.h> 7#include <acpi/reboot.h>
7#include <asm/io.h> 8#include <asm/io.h>
8#include <asm/apic.h> 9#include <asm/apic.h>
@@ -17,7 +18,6 @@
17#include <asm/cpu.h> 18#include <asm/cpu.h>
18 19
19#ifdef CONFIG_X86_32 20#ifdef CONFIG_X86_32
20# include <linux/dmi.h>
21# include <linux/ctype.h> 21# include <linux/ctype.h>
22# include <linux/mc146818rtc.h> 22# include <linux/mc146818rtc.h>
23#else 23#else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 }, 250 },
251 }, 251 },
252 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
253 .callback = set_bios_reboot,
254 .ident = "CompuLab SBC-FITPC2",
255 .matches = {
256 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
257 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
258 },
259 },
252 { } 260 { }
253}; 261};
254 262
@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);
396 404
397#endif /* CONFIG_X86_32 */ 405#endif /* CONFIG_X86_32 */
398 406
407/*
 408 * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot
409 */
410static int __init set_pci_reboot(const struct dmi_system_id *d)
411{
412 if (reboot_type != BOOT_CF9) {
413 reboot_type = BOOT_CF9;
414 printk(KERN_INFO "%s series board detected. "
415 "Selecting PCI-method for reboots.\n", d->ident);
416 }
417 return 0;
418}
419
420static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
421 { /* Handle problems with rebooting on Apple MacBook5 */
422 .callback = set_pci_reboot,
423 .ident = "Apple MacBook5",
424 .matches = {
425 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
426 DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
427 },
428 },
429 { /* Handle problems with rebooting on Apple MacBookPro5 */
430 .callback = set_pci_reboot,
431 .ident = "Apple MacBookPro5",
432 .matches = {
433 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
434 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
435 },
436 },
437 { }
438};
439
440static int __init pci_reboot_init(void)
441{
442 dmi_check_system(pci_reboot_dmi_table);
443 return 0;
444}
445core_initcall(pci_reboot_init);
446
399static inline void kb_wait(void) 447static inline void kb_wait(void)
400{ 448{
401 int i; 449 int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index be5ae80f897f..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align)
289 return ret; 289 return ret;
290} 290}
291 291
292#ifdef CONFIG_X86_64
293static void __init init_gbpages(void)
294{
295 if (direct_gbpages && cpu_has_gbpages)
296 printk(KERN_INFO "Using GB pages for direct mapping\n");
297 else
298 direct_gbpages = 0;
299}
300#else
301static inline void init_gbpages(void)
302{
303}
304#endif
305
292static void __init reserve_brk(void) 306static void __init reserve_brk(void)
293{ 307{
294 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
@@ -658,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
658 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), 672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
659 }, 673 },
660 }, 674 },
675 {
676 /*
677 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
 677 * It has a different DMI_BIOS_VENDOR = "Intel Corp."; for now we will
 678 * match only DMI_BOARD_NAME and see if there are more bad products
680 * with this vendor.
681 */
682 .callback = dmi_low_memory_corruption,
683 .ident = "AMI BIOS",
684 .matches = {
685 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
686 },
687 },
661#endif 688#endif
662 {} 689 {}
663}; 690};
@@ -871,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
871 898
872 reserve_brk(); 899 reserve_brk();
873 900
901 init_gbpages();
902
874 /* max_pfn_mapped is updated here */ 903 /* max_pfn_mapped is updated here */
875 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 904 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
876 max_pfn_mapped = max_low_pfn_mapped; 905 max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9c3f0823e6aa..07d81916f212 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 124}
125 125
126/* 126/*
127 * Remap allocator 127 * Large page remap allocator
128 * 128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for 129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping. 130 * each cpu and each is remapped into vmalloc area using PMD mapping.
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
137 * better than only using 4k mappings while still being NUMA friendly. 137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata; 140struct pcpul_ent {
141static void **pcpur_ptrs __initdata; 141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
142 148
143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) 149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
144{ 150{
145 size_t off = (size_t)pageno << PAGE_SHIFT; 151 size_t off = (size_t)pageno << PAGE_SHIFT;
146 152
147 if (off >= pcpur_size) 153 if (off >= pcpul_size)
148 return NULL; 154 return NULL;
149 155
150 return virt_to_page(pcpur_ptrs[cpu] + off); 156 return virt_to_page(pcpul_map[cpu].ptr + off);
151} 157}
152 158
153static ssize_t __init setup_pcpu_remap(size_t static_size) 159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
154{ 160{
155 static struct vm_struct vm; 161 size_t map_size, dyn_size;
156 size_t ptrs_size, dyn_size;
157 unsigned int cpu; 162 unsigned int cpu;
163 int i, j;
158 ssize_t ret; 164 ssize_t ret;
159 165
160 /* 166 if (!chosen) {
161 * If large page isn't supported, there's no benefit in doing 167 size_t vm_size = VMALLOC_END - VMALLOC_START;
162 * this. Also, on non-NUMA, embedding is better. 168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
163 * 169
164 * NOTE: disabled for now. 170 /* on non-NUMA, embedding is better */
165 */ 171 if (!pcpu_need_numa())
166 if (true || !cpu_has_pse || !pcpu_need_numa()) 172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
167 return -EINVAL; 185 return -EINVAL;
186 }
168 187
169 /* 188 /*
170 * Currently supports only single page. Supporting multiple 189 * Currently supports only single page. Supporting multiple
171 * pages won't be too difficult if it ever becomes necessary. 190 * pages won't be too difficult if it ever becomes necessary.
172 */ 191 */
173 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + 192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
174 PERCPU_DYNAMIC_RESERVE); 193 PERCPU_DYNAMIC_RESERVE);
175 if (pcpur_size > PMD_SIZE) { 194 if (pcpul_size > PMD_SIZE) {
176 pr_warning("PERCPU: static data is larger than large page, " 195 pr_warning("PERCPU: static data is larger than large page, "
177 "can't use large page\n"); 196 "can't use large page\n");
178 return -EINVAL; 197 return -EINVAL;
179 } 198 }
180 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; 199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
181 200
182 /* allocate pointer array and alloc large pages */ 201 /* allocate pointer array and alloc large pages */
183 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); 202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
184 pcpur_ptrs = alloc_bootmem(ptrs_size); 203 pcpul_map = alloc_bootmem(map_size);
185 204
186 for_each_possible_cpu(cpu) { 205 for_each_possible_cpu(cpu) {
187 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); 206 pcpul_map[cpu].cpu = cpu;
188 if (!pcpur_ptrs[cpu]) 207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
189 goto enomem; 212 goto enomem;
213 }
190 214
191 /* 215 /*
192 * Only use pcpur_size bytes and give back the rest. 216 * Only use pcpul_size bytes and give back the rest.
193 * 217 *
194 * Ingo: The 2MB up-rounding bootmem is needed to make 218 * Ingo: The 2MB up-rounding bootmem is needed to make
195 * sure the partial 2MB page is still fully RAM - it's 219 * sure the partial 2MB page is still fully RAM - it's
196 * not well-specified to have a PAT-incompatible area 220 * not well-specified to have a PAT-incompatible area
197 * (unmapped RAM, device memory, etc.) in that hole. 221 * (unmapped RAM, device memory, etc.) in that hole.
198 */ 222 */
199 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), 223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
200 PMD_SIZE - pcpur_size); 224 PMD_SIZE - pcpul_size);
201 225
202 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); 226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
203 } 227 }
204 228
205 /* allocate address and map */ 229 /* allocate address and map */
206 vm.flags = VM_ALLOC; 230 pcpul_vm.flags = VM_ALLOC;
207 vm.size = num_possible_cpus() * PMD_SIZE; 231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
208 vm_area_register_early(&vm, PMD_SIZE); 232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
209 233
210 for_each_possible_cpu(cpu) { 234 for_each_possible_cpu(cpu) {
211 pmd_t *pmd; 235 pmd_t *pmd, pmd_v;
212 236
213 pmd = populate_extra_pmd((unsigned long)vm.addr 237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
214 + cpu * PMD_SIZE); 238 cpu * PMD_SIZE);
215 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), 239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
216 PAGE_KERNEL_LARGE)); 240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
217 } 242 }
218 243
219 /* we're ready, commit */ 244 /* we're ready, commit */
220 pr_info("PERCPU: Remapped at %p with large pages, static data " 245 pr_info("PERCPU: Remapped at %p with large pages, static data "
221 "%zu bytes\n", vm.addr, static_size); 246 "%zu bytes\n", pcpul_vm.addr, static_size);
222 247
223 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
224 PERCPU_FIRST_CHUNK_RESERVE, dyn_size, 249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
225 PMD_SIZE, vm.addr, NULL); 250 PMD_SIZE, pcpul_vm.addr, NULL);
226 goto out_free_ar; 251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
227 262
228enomem: 263enomem:
229 for_each_possible_cpu(cpu) 264 for_each_possible_cpu(cpu)
230 if (pcpur_ptrs[cpu]) 265 if (pcpul_map[cpu].ptr)
231 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); 266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
232 ret = -ENOMEM; 267 free_bootmem(__pa(pcpul_map), map_size);
233out_free_ar: 268 return -ENOMEM;
234 free_bootmem(__pa(pcpur_ptrs), ptrs_size); 269}
235 return ret; 270
271/**
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
291 int left = 0, right = nr_cpu_ids - 1;
292 int pos;
293
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
236} 316}
237#else 317#else
238static ssize_t __init setup_pcpu_remap(size_t static_size) 318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
239{ 319{
240 return -EINVAL; 320 return -EINVAL;
241} 321}
@@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
249 * mapping so that it can use PMD mapping without additional TLB 329 * mapping so that it can use PMD mapping without additional TLB
250 * pressure. 330 * pressure.
251 */ 331 */
252static ssize_t __init setup_pcpu_embed(size_t static_size) 332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
253{ 333{
254 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; 334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
255 335
@@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
258 * this. Also, embedding allocation doesn't play well with 338 * this. Also, embedding allocation doesn't play well with
259 * NUMA. 339 * NUMA.
260 */ 340 */
261 if (!cpu_has_pse || pcpu_need_numa()) 341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
262 return -EINVAL; 342 return -EINVAL;
263 343
264 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, 344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
@@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
297 pcpu4k_nr_static_pages = PFN_UP(static_size); 377 pcpu4k_nr_static_pages = PFN_UP(static_size);
298 378
299 /* unaligned allocations can't be freed, round up to page size */ 379 /* unaligned allocations can't be freed, round up to page size */
300 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() 380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
301 * sizeof(pcpu4k_pages[0])); 381 * sizeof(pcpu4k_pages[0]));
302 pcpu4k_pages = alloc_bootmem(pages_size); 382 pcpu4k_pages = alloc_bootmem(pages_size);
303 383
@@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
308 void *ptr; 388 void *ptr;
309 389
310 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); 390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
311 if (!ptr) 391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
312 goto enomem; 394 goto enomem;
395 }
313 396
314 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); 397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
315 pcpu4k_pages[j++] = virt_to_page(ptr); 398 pcpu4k_pages[j++] = virt_to_page(ptr);
@@ -333,6 +416,16 @@ out_free_ar:
333 return ret; 416 return ret;
334} 417}
335 418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
336static inline void setup_percpu_segment(int cpu) 429static inline void setup_percpu_segment(int cpu)
337{ 430{
338#ifdef CONFIG_X86_32 431#ifdef CONFIG_X86_32
@@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu)
346#endif 439#endif
347} 440}
348 441
349/*
350 * Great future plan:
351 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
352 * Always point %gs to its beginning
353 */
354void __init setup_per_cpu_areas(void) 442void __init setup_per_cpu_areas(void)
355{ 443{
356 size_t static_size = __per_cpu_end - __per_cpu_start; 444 size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void)
367 * of large page mappings. Please read comments on top of 455 * of large page mappings. Please read comments on top of
368 * each allocator for details. 456 * each allocator for details.
369 */ 457 */
370 ret = setup_pcpu_remap(static_size); 458 ret = -EINVAL;
371 if (ret < 0) 459 if (strlen(pcpu_chosen_alloc)) {
372 ret = setup_pcpu_embed(static_size); 460 if (strcmp(pcpu_chosen_alloc, "4k")) {
461 if (!strcmp(pcpu_chosen_alloc, "lpage"))
462 ret = setup_pcpu_lpage(static_size, true);
463 else if (!strcmp(pcpu_chosen_alloc, "embed"))
464 ret = setup_pcpu_embed(static_size, true);
465 else
466 pr_warning("PERCPU: unknown allocator %s "
467 "specified\n", pcpu_chosen_alloc);
468 if (ret < 0)
469 pr_warning("PERCPU: %s allocator failed (%zd), "
470 "falling back to 4k\n",
471 pcpu_chosen_alloc, ret);
472 }
473 } else {
474 ret = setup_pcpu_lpage(static_size, false);
475 if (ret < 0)
476 ret = setup_pcpu_embed(static_size, false);
477 }
373 if (ret < 0) 478 if (ret < 0)
374 ret = setup_pcpu_4k(static_size); 479 ret = setup_pcpu_4k(static_size);
375 if (ret < 0) 480 if (ret < 0)
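
pcpul_map is sorted by chunk address precisely so that pcpu_lpage_remapped() above can binary-search it: mask the address down to its PMD, find the owning chunk, and rebase the offset into the remapped vmalloc area. A standalone C sketch with invented addresses and a 2MB PMD:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (2UL * 1024 * 1024)
#define PMD_MASK (~(PMD_SIZE - 1))

struct ent { unsigned int cpu; uintptr_t ptr; };    /* ptr is PMD aligned */

static struct ent map[] = {                         /* sorted by ptr */
	{ 1, 0x40000000 }, { 0, 0x40200000 }, { 2, 0x7fe00000 },
};
static uintptr_t vm_addr = 0xf0000000;              /* start of remap area */

static uintptr_t lpage_remapped(uintptr_t kaddr)
{
	uintptr_t pmd_addr = kaddr & PMD_MASK;
	uintptr_t offset = kaddr & ~PMD_MASK;
	int left = 0, right = (int)(sizeof(map) / sizeof(map[0])) - 1;

	while (left <= right) {
		int pos = (left + right) / 2;

		if (map[pos].ptr < pmd_addr)
			left = pos + 1;
		else if (map[pos].ptr > pmd_addr)
			right = pos - 1;
		else
			return vm_addr + map[pos].cpu * PMD_SIZE + offset;
	}
	return 0;       /* not inside a recycled large-page chunk */
}

int main(void)
{
	printf("%#lx\n", (unsigned long)lpage_remapped(0x40212345));
	return 0;       /* prints 0xf0012345: cpu 0's chunk, offset 0x12345 */
}
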
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, 21SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
22 unsigned long prot, unsigned long flags, 22 unsigned long, prot, unsigned long, flags,
23 unsigned long fd, unsigned long off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file; 26 struct file *file;
@@ -226,7 +226,7 @@ bottomup:
226} 226}
227 227
228 228
229asmlinkage long sys_uname(struct new_utsname __user *name) 229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{ 230{
231 int err; 231 int err;
232 down_read(&uts_sem); 232 down_read(&uts_sem);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 124d40c575df..77b9689f8edb 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode)
711 unsigned long pa; 711 unsigned long pa;
712 unsigned long m; 712 unsigned long m;
713 unsigned long n; 713 unsigned long n;
714 unsigned long mmr_image;
715 struct bau_desc *adp; 714 struct bau_desc *adp;
716 struct bau_desc *ad2; 715 struct bau_desc *ad2;
717 716
@@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode)
727 n = pa >> uv_nshift; 726 n = pa >> uv_nshift;
728 m = pa & uv_mmask; 727 m = pa & uv_mmask;
729 728
730 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); 729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
731 if (mmr_image) { 730 (n << UV_DESC_BASE_PNODE_SHIFT | m));
732 uv_write_global_mmr64(pnode, (unsigned long)
733 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
735 }
736 731
737 /* 732 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 733 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
@@ -749,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode)
749 * note that base_dest_nodeid is actually a nasid. 744 * note that base_dest_nodeid is actually a nasid.
750 */ 745 */
751 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 746 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
747 ad2->header.dest_subnodeid = 0x10; /* the LB */
752 ad2->header.command = UV_NET_ENDPOINT_INTD; 748 ad2->header.command = UV_NET_ENDPOINT_INTD;
753 ad2->header.int_both = 1; 749 ad2->header.int_both = 1;
754 /* 750 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 286d64eba31b..ae04589a579b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,6 +54,7 @@
54#include <asm/traps.h> 54#include <asm/traps.h>
55#include <asm/desc.h> 55#include <asm/desc.h>
56#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/mce.h>
57 58
58#include <asm/mach_traps.h> 59#include <asm/mach_traps.h>
59 60
@@ -65,8 +66,6 @@
65#include <asm/setup.h> 66#include <asm/setup.h>
66#include <asm/traps.h> 67#include <asm/traps.h>
67 68
68#include "cpu/mcheck/mce.h"
69
70asmlinkage int system_call(void); 69asmlinkage int system_call(void);
71 70
72/* Do we ignore FPU interrupts ? */ 71/* Do we ignore FPU interrupts ? */
@@ -347,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
347 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 346 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
348 show_registers(regs); 347 show_registers(regs);
349 348
349 if (panic_on_io_nmi)
350 panic("NMI IOCK error: Not continuing");
351
350 /* Re-enable the IOCK line, wait for a few seconds */ 352 /* Re-enable the IOCK line, wait for a few seconds */
351 reason = (reason & 0xf) | 8; 353 reason = (reason & 0xf) | 8;
352 outb(reason, 0x61); 354 outb(reason, 0x61);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ae3180c506a6..71f4368b357e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
275 * use the TSC value at the transitions to calculate a pretty 275 * use the TSC value at the transitions to calculate a pretty
276 * good value for the TSC frequency. 276 * good value for the TSC frequency.
277 */ 277 */
278static inline int pit_verify_msb(unsigned char val)
279{
280 /* Ignore LSB */
281 inb(0x42);
282 return inb(0x42) == val;
283}
284
278static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 285static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
279{ 286{
280 int count; 287 int count;
281 u64 tsc = 0; 288 u64 tsc = 0;
282 289
283 for (count = 0; count < 50000; count++) { 290 for (count = 0; count < 50000; count++) {
284 /* Ignore LSB */ 291 if (!pit_verify_msb(val))
285 inb(0x42);
286 if (inb(0x42) != val)
287 break; 292 break;
288 tsc = get_cycles(); 293 tsc = get_cycles();
289 } 294 }
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
336 * to do that is to just read back the 16-bit counter 341 * to do that is to just read back the 16-bit counter
337 * once from the PIT. 342 * once from the PIT.
338 */ 343 */
339 inb(0x42); 344 pit_verify_msb(0);
340 inb(0x42);
341 345
342 if (pit_expect_msb(0xff, &tsc, &d1)) { 346 if (pit_expect_msb(0xff, &tsc, &d1)) {
343 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { 347 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
348 * Iterate until the error is less than 500 ppm 352 * Iterate until the error is less than 500 ppm
349 */ 353 */
350 delta -= tsc; 354 delta -= tsc;
351 if (d1+d2 < delta >> 11) 355 if (d1+d2 >= delta >> 11)
352 goto success; 356 continue;
357
358 /*
359 * Check the PIT one more time to verify that
360 * all TSC reads were stable wrt the PIT.
361 *
362 * This also guarantees serialization of the
363 * last cycle read ('d2') in pit_expect_msb.
364 */
365 if (!pit_verify_msb(0xfe - i))
366 break;
367 goto success;
353 } 368 }
354 } 369 }
355 printk("Fast TSC calibration failed\n"); 370 printk("Fast TSC calibration failed\n");
@@ -590,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
590 */ 605 */
591 606
592DEFINE_PER_CPU(unsigned long, cyc2ns); 607DEFINE_PER_CPU(unsigned long, cyc2ns);
608DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
593 609
594static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) 610static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
595{ 611{
596 unsigned long long tsc_now, ns_now; 612 unsigned long long tsc_now, ns_now, *offset;
597 unsigned long flags, *scale; 613 unsigned long flags, *scale;
598 614
599 local_irq_save(flags); 615 local_irq_save(flags);
600 sched_clock_idle_sleep_event(); 616 sched_clock_idle_sleep_event();
601 617
602 scale = &per_cpu(cyc2ns, cpu); 618 scale = &per_cpu(cyc2ns, cpu);
619 offset = &per_cpu(cyc2ns_offset, cpu);
603 620
604 rdtscll(tsc_now); 621 rdtscll(tsc_now);
605 ns_now = __cycles_2_ns(tsc_now); 622 ns_now = __cycles_2_ns(tsc_now);
606 623
607 if (cpu_khz) 624 if (cpu_khz) {
608 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 625 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
626 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
627 }
609 628
610 sched_clock_idle_wakeup_event(0); 629 sched_clock_idle_wakeup_event(0);
611 local_irq_restore(flags); 630 local_irq_restore(flags);
@@ -632,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
632 void *data) 651 void *data)
633{ 652{
634 struct cpufreq_freqs *freq = data; 653 struct cpufreq_freqs *freq = data;
635 unsigned long *lpj, dummy; 654 unsigned long *lpj;
636 655
637 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) 656 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
638 return 0; 657 return 0;
639 658
640 lpj = &dummy; 659 lpj = &boot_cpu_data.loops_per_jiffy;
641 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
642#ifdef CONFIG_SMP 660#ifdef CONFIG_SMP
661 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
643 lpj = &cpu_data(freq->cpu).loops_per_jiffy; 662 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
644#else
645 lpj = &boot_cpu_data.loops_per_jiffy;
646#endif 663#endif
647 664
648 if (!ref_freq) { 665 if (!ref_freq) {
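
The new cyc2ns_offset above exists so that the converted time stays continuous when cpufreq changes the scale factor: the offset is chosen so cycles-to-ns at tsc_now is identical before and after the update. A standalone C sketch; the frequencies and the CYC2NS_SCALE_FACTOR value here are chosen for the example:

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10
#define NSEC_PER_MSEC 1000000ULL

static uint64_t scale, offset;

static uint64_t cycles_2_ns(uint64_t tsc)
{
	return offset + ((tsc * scale) >> CYC2NS_SCALE_FACTOR);
}

static void set_cyc2ns_scale(uint64_t cpu_khz, uint64_t tsc_now)
{
	uint64_t ns_now = cycles_2_ns(tsc_now); /* time under the old scale */

	scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
	offset = ns_now - ((tsc_now * scale) >> CYC2NS_SCALE_FACTOR);
}

int main(void)
{
	uint64_t t;

	set_cyc2ns_scale(2000000, 0);             /* boot: 2 GHz, tsc = 0 */
	t = cycles_2_ns(4000000000ULL);
	set_cyc2ns_scale(1000000, 4000000000ULL); /* cpufreq drops to 1 GHz */

	/* both print 2000000000: no jump at the switch point */
	printf("%llu %llu\n", (unsigned long long)t,
	       (unsigned long long)cycles_2_ns(4000000000ULL));
	return 0;
}
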
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423fbe2a..95a7289e4b0c 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
441 ap.ds = __USER_DS; 441 ap.ds = __USER_DS;
442 ap.es = __USER_DS; 442 ap.es = __USER_DS;
443 ap.fs = __KERNEL_PERCPU; 443 ap.fs = __KERNEL_PERCPU;
444 ap.gs = 0; 444 ap.gs = __KERNEL_STACK_CANARY;
445 445
446 ap.eflags = 0; 446 ap.eflags = 0;
447 447
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e87882041..9fc178255c04 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,11 +46,10 @@ PHDRS {
46 data PT_LOAD FLAGS(7); /* RWE */ 46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64 47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */ 48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */ 50 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif 51#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */ 52 init PT_LOAD FLAGS(7); /* RWE */
54#endif 53#endif
55 note PT_NOTE FLAGS(0); /* ___ */ 54 note PT_NOTE FLAGS(0); /* ___ */
56} 55}
@@ -103,72 +102,43 @@ SECTIONS
103 __stop___ex_table = .; 102 __stop___ex_table = .;
104 } :text = 0x9090 103 } :text = 0x9090
105 104
106 RODATA 105 RO_DATA(PAGE_SIZE)
107 106
108 /* Data */ 107 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) { 108 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 /* Start of data section */ 109 /* Start of data section */
112 _sdata = .; 110 _sdata = .;
113 DATA_DATA
114 CONSTRUCTORS
115 111
116#ifdef CONFIG_X86_64 112 /* init_task */
117 /* End of data section */ 113 INIT_TASK_DATA(THREAD_SIZE)
118 _edata = .;
119#endif
120 } :data
121 114
122#ifdef CONFIG_X86_32 115#ifdef CONFIG_X86_32
123 /* 32 bit has nosave before _edata */ 116 /* 32 bit has nosave before _edata */
124 . = ALIGN(PAGE_SIZE); 117 NOSAVE_DATA
125 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
126 __nosave_begin = .;
127 *(.data.nosave)
128 . = ALIGN(PAGE_SIZE);
129 __nosave_end = .;
130 }
131#endif 118#endif
132 119
133 . = ALIGN(PAGE_SIZE); 120 PAGE_ALIGNED_DATA(PAGE_SIZE)
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
135 *(.data.page_aligned)
136 *(.data.idt) 121 *(.data.idt)
137 }
138 122
139#ifdef CONFIG_X86_32 123 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
140 . = ALIGN(32);
141#else
142 . = ALIGN(PAGE_SIZE);
143 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
144#endif
145 .data.cacheline_aligned :
146 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
147 *(.data.cacheline_aligned)
148 }
149 124
150 /* rarely changed data like cpu maps */ 125 DATA_DATA
151#ifdef CONFIG_X86_32 126 CONSTRUCTORS
152 . = ALIGN(32); 127
153#else 128 /* rarely changed data like cpu maps */
154 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); 129 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
155#endif
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly)
158 130
159#ifdef CONFIG_X86_32
160 /* End of data section */ 131 /* End of data section */
161 _edata = .; 132 _edata = .;
162#endif 133 } :data
163 }
164 134
165#ifdef CONFIG_X86_64 135#ifdef CONFIG_X86_64
166 136
167#define VSYSCALL_ADDR (-10*1024*1024) 137#define VSYSCALL_ADDR (-10*1024*1024)
168#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ 138#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095)) 139 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
170#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ 140#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
171 SIZEOF(.data.read_mostly) + 4095) & ~(4095)) 141 PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
172 142
173#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) 143#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
174#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) 144#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -234,35 +204,29 @@ SECTIONS
 
 #endif /* CONFIG_X86_64 */
 
-	/* init_task */
-	. = ALIGN(THREAD_SIZE);
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		*(.data.init_task)
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+		__init_begin = .; /* paired with __init_end */
 	}
-#ifdef CONFIG_X86_64
-	:data.init
-#endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
 	/*
-	 * smp_locks might be freed after init
-	 * start/end must be page aligned
+	 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - .init.text - should
+	 * start another segment - init.
 	 */
-	. = ALIGN(PAGE_SIZE);
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-		. = ALIGN(PAGE_SIZE);
-	}
+	PERCPU_VADDR(0, :percpu)
+#endif
 
-	/* Init code and data - will be freed after init */
-	. = ALIGN(PAGE_SIZE);
 	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .; /* paired with __init_end */
 		_sinittext = .;
 		INIT_TEXT
 		_einittext = .;
 	}
+#ifdef CONFIG_X86_64
+	:init
+#endif
 
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
 		INIT_DATA
@@ -333,17 +297,7 @@ SECTIONS
 	}
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
-	/*
-	 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
-	 * output PHDR, so the next output section - __data_nosave - should
-	 * start another section data.init2. Also, pda should be at the head of
-	 * percpu area. Preallocate it and define the percpu offset symbol
-	 * so that it can be accessed as a percpu variable.
-	 */
-	. = ALIGN(PAGE_SIZE);
-	PERCPU_VADDR(0, :percpu)
-#else
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
 	PERCPU(PAGE_SIZE)
 #endif
 
@@ -354,15 +308,22 @@ SECTIONS
 		__init_end = .;
 	}
 
+	/*
+	 * smp_locks might be freed after init
+	 * start/end must be page aligned
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+	}
+
 #ifdef CONFIG_X86_64
 	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	} :data.init2
-	/* use another section data.init2, see PERCPU_VADDR() above */
+		NOSAVE_DATA
+	}
 #endif
 
 	/* BSS */
@@ -400,8 +361,8 @@ SECTIONS
 
 
 #ifdef CONFIG_X86_32
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 #else
 /*
  * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -414,12 +375,12 @@ INIT_PER_CPU(irq_stack_union);
 /*
  * Build-time check on the image size:
  */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 
 #ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((per_cpu__irq_stack_union == 0),
 	   "irq_stack_union is not at start of per-cpu area");
 #endif
 
 #endif /* CONFIG_X86_32 */
@@ -427,7 +388,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
 #ifdef CONFIG_KEXEC
 #include <asm/kexec.h>
 
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-	"kexec control code size is too big")
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+	   "kexec control code size is too big");
 #endif
 
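The last three hunks above rewrite the build-time checks from bare ASSERT() statements into assignments of the form ". = ASSERT(...);". As a rough sketch only (not part of the patch; the MAX_IMAGE symbol and the section layout below are made up for illustration), ASSERT(exp, message) in a GNU ld script aborts the link with the given message when exp is zero, and since it evaluates to exp it can also appear on the right-hand side of an assignment to the location counter, which is the form the patch adopts:

/* sketch.lds - illustrative only, not from the kernel tree */
MAX_IMAGE = 512M;		/* hypothetical size limit */

SECTIONS
{
	.text : { *(.text) }
	_end = .;
}

/* Bare statement form, as used before this patch: */
/* ASSERT(_end <= MAX_IMAGE, "image too big") */

/* Assignment form, as used after this patch; note the trailing
 * semicolon that an assignment statement requires: */
. = ASSERT(_end <= MAX_IMAGE, "image too big");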