Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 42
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 1
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 1
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 400
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 1
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/check.c | 20
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 4
-rw-r--r--  arch/x86/kernel/cpu/match.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 79
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 65
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 7
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 22
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c | 570
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.c | 23
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 2
-rw-r--r--  arch/x86/kernel/e820.c | 53
-rw-r--r--  arch/x86/kernel/entry_32.S | 47
-rw-r--r--  arch/x86/kernel/entry_64.S | 16
-rw-r--r--  arch/x86/kernel/ftrace.c | 500
-rw-r--r--  arch/x86/kernel/head_32.S | 223
-rw-r--r--  arch/x86/kernel/head_64.S | 80
-rw-r--r--  arch/x86/kernel/hpet.c | 66
-rw-r--r--  arch/x86/kernel/i387.c | 2
-rw-r--r--  arch/x86/kernel/init_task.c | 42
-rw-r--r--  arch/x86/kernel/irq_32.c | 8
-rw-r--r--  arch/x86/kernel/kprobes.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 9
-rw-r--r--  arch/x86/kernel/kvmclock.c | 20
-rw-r--r--  arch/x86/kernel/mca_32.c | 476
-rw-r--r--  arch/x86/kernel/microcode_core.c | 9
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 14
-rw-r--r--  arch/x86/kernel/mpparse.c | 21
-rw-r--r--  arch/x86/kernel/nmi.c | 107
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 13
-rw-r--r--  arch/x86/kernel/paravirt.c | 12
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 8
-rw-r--r--  arch/x86/kernel/pci-dma.c | 18
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 8
-rw-r--r--  arch/x86/kernel/process.c | 73
-rw-r--r--  arch/x86/kernel/process_32.c | 11
-rw-r--r--  arch/x86/kernel/process_64.c | 20
-rw-r--r--  arch/x86/kernel/ptrace.c | 7
-rw-r--r--  arch/x86/kernel/reboot.c | 237
-rw-r--r--  arch/x86/kernel/setup.c | 36
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 14
-rw-r--r--  arch/x86/kernel/signal.c | 18
-rw-r--r--  arch/x86/kernel/smp.c | 100
-rw-r--r--  arch/x86/kernel/smpboot.c | 191
-rw-r--r--  arch/x86/kernel/test_rodata.c | 10
-rw-r--r--  arch/x86/kernel/time.c | 6
-rw-r--r--  arch/x86/kernel/traps.c | 12
-rw-r--r--  arch/x86/kernel/uprobes.c | 674
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 40
-rw-r--r--  arch/x86/kernel/x86_init.c | 8
-rw-r--r--  arch/x86/kernel/xsave.c | 2
73 files changed, 2697 insertions, 1826 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 08484332f32..8215e5652d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds 5extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
@@ -47,7 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
47obj-y += cpu/ 47obj-y += cpu/
48obj-y += acpi/ 48obj-y += acpi/
49obj-y += reboot.o 49obj-y += reboot.o
50obj-$(CONFIG_MCA) += mca_32.o
51obj-$(CONFIG_X86_MSR) += msr.o 50obj-$(CONFIG_X86_MSR) += msr.o
52obj-$(CONFIG_X86_CPUID) += cpuid.o 51obj-$(CONFIG_X86_CPUID) += cpuid.o
53obj-$(CONFIG_PCI) += early-quirks.o 52obj-$(CONFIG_PCI) += early-quirks.o
@@ -99,6 +98,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
99 98
100obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 99obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
101obj-$(CONFIG_OF) += devicetree.o 100obj-$(CONFIG_OF) += devicetree.o
101obj-$(CONFIG_UPROBES) += uprobes.o
102 102
103### 103###
104# 64 bit specific files 104# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a415b1f4436..8afb6931981 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
593#ifdef CONFIG_ACPI_HOTPLUG_CPU 593#ifdef CONFIG_ACPI_HOTPLUG_CPU
594#include <acpi/processor.h> 594#include <acpi/processor.h>
595 595
596static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) 596static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
597{ 597{
598#ifdef CONFIG_ACPI_NUMA 598#ifdef CONFIG_ACPI_NUMA
599 int nid; 599 int nid;
@@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void)
990 int i; 990 int i;
991 struct mpc_intsrc mp_irq; 991 struct mpc_intsrc mp_irq;
992 992
993#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 993#ifdef CONFIG_EISA
994 /* 994 /*
995 * Fabricate the legacy ISA bus (bus #31). 995 * Fabricate the legacy ISA bus (bus #31).
996 */ 996 */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index edc24480469..39a222e094a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37 37
38#include <asm/irq_remapping.h>
38#include <asm/perf_event.h> 39#include <asm/perf_event.h>
39#include <asm/x86_init.h> 40#include <asm/x86_init.h>
40#include <asm/pgalloc.h> 41#include <asm/pgalloc.h>
@@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)
1325 acked); 1326 acked);
1326 break; 1327 break;
1327 } 1328 }
1328 if (cpu_has_tsc) { 1329 if (queued) {
1329 rdtscll(ntsc); 1330 if (cpu_has_tsc) {
1330 max_loops = (cpu_khz << 10) - (ntsc - tsc); 1331 rdtscll(ntsc);
1331 } else 1332 max_loops = (cpu_khz << 10) - (ntsc - tsc);
1332 max_loops--; 1333 } else
1334 max_loops--;
1335 }
1333 } while (queued && max_loops > 0); 1336 } while (queued && max_loops > 0);
1334 WARN_ON(max_loops <= 0); 1337 WARN_ON(max_loops <= 0);
1335 1338
@@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void)
1441 * Now that local APIC setup is completed for BP, configure the fault 1444 * Now that local APIC setup is completed for BP, configure the fault
1442 * handling for interrupt remapping. 1445 * handling for interrupt remapping.
1443 */ 1446 */
1444 if (intr_remapping_enabled) 1447 if (irq_remapping_enabled)
1445 enable_drhd_fault_handling(); 1448 irq_remap_enable_fault_handling();
1446 1449
1447} 1450}
1448 1451
@@ -1517,7 +1520,7 @@ void enable_x2apic(void)
1517int __init enable_IR(void) 1520int __init enable_IR(void)
1518{ 1521{
1519#ifdef CONFIG_IRQ_REMAP 1522#ifdef CONFIG_IRQ_REMAP
1520 if (!intr_remapping_supported()) { 1523 if (!irq_remapping_supported()) {
1521 pr_debug("intr-remapping not supported\n"); 1524 pr_debug("intr-remapping not supported\n");
1522 return -1; 1525 return -1;
1523 } 1526 }
@@ -1528,7 +1531,7 @@ int __init enable_IR(void)
1528 return -1; 1531 return -1;
1529 } 1532 }
1530 1533
1531 return enable_intr_remapping(); 1534 return irq_remapping_enable();
1532#endif 1535#endif
1533 return -1; 1536 return -1;
1534} 1537}
@@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void)
1537{ 1540{
1538 unsigned long flags; 1541 unsigned long flags;
1539 int ret, x2apic_enabled = 0; 1542 int ret, x2apic_enabled = 0;
1540 int dmar_table_init_ret; 1543 int hardware_init_ret;
1544
1545 /* Make sure irq_remap_ops are initialized */
1546 setup_irq_remapping_ops();
1541 1547
1542 dmar_table_init_ret = dmar_table_init(); 1548 hardware_init_ret = irq_remapping_prepare();
1543 if (dmar_table_init_ret && !x2apic_supported()) 1549 if (hardware_init_ret && !x2apic_supported())
1544 return; 1550 return;
1545 1551
1546 ret = save_ioapic_entries(); 1552 ret = save_ioapic_entries();
@@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void)
1556 if (x2apic_preenabled && nox2apic) 1562 if (x2apic_preenabled && nox2apic)
1557 disable_x2apic(); 1563 disable_x2apic();
1558 1564
1559 if (dmar_table_init_ret) 1565 if (hardware_init_ret)
1560 ret = -1; 1566 ret = -1;
1561 else 1567 else
1562 ret = enable_IR(); 1568 ret = enable_IR();
@@ -2176,8 +2182,8 @@ static int lapic_suspend(void)
2176 local_irq_save(flags); 2182 local_irq_save(flags);
2177 disable_local_APIC(); 2183 disable_local_APIC();
2178 2184
2179 if (intr_remapping_enabled) 2185 if (irq_remapping_enabled)
2180 disable_intr_remapping(); 2186 irq_remapping_disable();
2181 2187
2182 local_irq_restore(flags); 2188 local_irq_restore(flags);
2183 return 0; 2189 return 0;
@@ -2193,7 +2199,7 @@ static void lapic_resume(void)
2193 return; 2199 return;
2194 2200
2195 local_irq_save(flags); 2201 local_irq_save(flags);
2196 if (intr_remapping_enabled) { 2202 if (irq_remapping_enabled) {
2197 /* 2203 /*
2198 * IO-APIC and PIC have their own resume routines. 2204 * IO-APIC and PIC have their own resume routines.
2199 * We just mask them here to make sure the interrupt 2205 * We just mask them here to make sure the interrupt
@@ -2245,8 +2251,8 @@ static void lapic_resume(void)
2245 apic_write(APIC_ESR, 0); 2251 apic_write(APIC_ESR, 0);
2246 apic_read(APIC_ESR); 2252 apic_read(APIC_ESR);
2247 2253
2248 if (intr_remapping_enabled) 2254 if (irq_remapping_enabled)
2249 reenable_intr_remapping(x2apic_mode); 2255 irq_remapping_reenable(x2apic_mode);
2250 2256
2251 local_irq_restore(flags); 2257 local_irq_restore(flags);
2252} 2258}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 359b6899a36..0e881c46e8c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -227,6 +227,7 @@ static struct apic apic_flat = {
227 227
228 .read = native_apic_mem_read, 228 .read = native_apic_mem_read,
229 .write = native_apic_mem_write, 229 .write = native_apic_mem_write,
230 .eoi_write = native_apic_mem_write,
230 .icr_read = native_apic_icr_read, 231 .icr_read = native_apic_icr_read,
231 .icr_write = native_apic_icr_write, 232 .icr_write = native_apic_icr_write,
232 .wait_icr_idle = native_apic_wait_icr_idle, 233 .wait_icr_idle = native_apic_wait_icr_idle,
@@ -386,6 +387,7 @@ static struct apic apic_physflat = {
386 387
387 .read = native_apic_mem_read, 388 .read = native_apic_mem_read,
388 .write = native_apic_mem_write, 389 .write = native_apic_mem_write,
390 .eoi_write = native_apic_mem_write,
389 .icr_read = native_apic_icr_read, 391 .icr_read = native_apic_icr_read,
390 .icr_write = native_apic_icr_write, 392 .icr_write = native_apic_icr_write,
391 .wait_icr_idle = native_apic_wait_icr_idle, 393 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 634ae6cdd5c..a6e4c6e06c0 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -181,6 +181,7 @@ struct apic apic_noop = {
181 181
182 .read = noop_apic_read, 182 .read = noop_apic_read,
183 .write = noop_apic_write, 183 .write = noop_apic_write,
184 .eoi_write = noop_apic_write,
184 .icr_read = noop_apic_icr_read, 185 .icr_read = noop_apic_icr_read,
185 .icr_write = noop_apic_icr_write, 186 .icr_write = noop_apic_icr_write,
186 .wait_icr_idle = noop_apic_wait_icr_idle, 187 .wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 23e75422e01..6ec6d5d297c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = {
295 295
296 .read = native_apic_mem_read, 296 .read = native_apic_mem_read,
297 .write = native_apic_mem_write, 297 .write = native_apic_mem_write,
298 .eoi_write = native_apic_mem_write,
298 .icr_read = native_apic_icr_read, 299 .icr_read = native_apic_icr_read,
299 .icr_write = native_apic_icr_write, 300 .icr_write = native_apic_icr_write,
300 .wait_icr_idle = native_apic_wait_icr_idle, 301 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0cdec7065af..31fbdbfbf96 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {
248 248
249 .read = native_apic_mem_read, 249 .read = native_apic_mem_read,
250 .write = native_apic_mem_write, 250 .write = native_apic_mem_write,
251 .eoi_write = native_apic_mem_write,
251 .icr_read = native_apic_icr_read, 252 .icr_read = native_apic_icr_read,
252 .icr_write = native_apic_icr_write, 253 .icr_write = native_apic_icr_write,
253 .wait_icr_idle = native_apic_wait_icr_idle, 254 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e42d1d3b913..db4ab1be3c7 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
678 678
679 .read = native_apic_mem_read, 679 .read = native_apic_mem_read,
680 .write = native_apic_mem_write, 680 .write = native_apic_mem_write,
681 .eoi_write = native_apic_mem_write,
681 .icr_read = native_apic_icr_read, 682 .icr_read = native_apic_icr_read,
682 .icr_write = native_apic_icr_write, 683 .icr_write = native_apic_icr_write,
683 .wait_icr_idle = native_apic_wait_icr_idle, 684 .wait_icr_idle = native_apic_wait_icr_idle,
@@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
742 743
743 .read = native_apic_mem_read, 744 .read = native_apic_mem_read,
744 .write = native_apic_mem_write, 745 .write = native_apic_mem_write,
746 .eoi_write = native_apic_mem_write,
745 .icr_read = native_apic_icr_read, 747 .icr_read = native_apic_icr_read,
746 .icr_write = native_apic_icr_write, 748 .icr_write = native_apic_icr_write,
747 .wait_icr_idle = native_apic_wait_icr_idle, 749 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80..ac96561d1a9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,23 +68,21 @@
68#define for_each_irq_pin(entry, head) \ 68#define for_each_irq_pin(entry, head) \
69 for (entry = head; entry; entry = entry->next) 69 for (entry = head; entry; entry = entry->next)
70 70
71static void __init __ioapic_init_mappings(void); 71#ifdef CONFIG_IRQ_REMAP
72 72static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
73static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); 73static inline bool irq_remapped(struct irq_cfg *cfg)
74static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); 74{
75static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); 75 return cfg->irq_2_iommu.iommu != NULL;
76 76}
77static struct io_apic_ops io_apic_ops = { 77#else
78 .init = __ioapic_init_mappings, 78static inline bool irq_remapped(struct irq_cfg *cfg)
79 .read = __io_apic_read, 79{
80 .write = __io_apic_write, 80 return false;
81 .modify = __io_apic_modify, 81}
82}; 82static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
83
84void __init set_io_apic_ops(const struct io_apic_ops *ops)
85{ 83{
86 io_apic_ops = *ops;
87} 84}
85#endif
88 86
89/* 87/*
90 * Is the SiS APIC rmw bug present ? 88 * Is the SiS APIC rmw bug present ?
@@ -142,7 +140,7 @@ int mp_irq_entries;
142/* GSI interrupts */ 140/* GSI interrupts */
143static int nr_irqs_gsi = NR_IRQS_LEGACY; 141static int nr_irqs_gsi = NR_IRQS_LEGACY;
144 142
145#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 143#ifdef CONFIG_EISA
146int mp_bus_id_to_type[MAX_MP_BUSSES]; 144int mp_bus_id_to_type[MAX_MP_BUSSES];
147#endif 145#endif
148 146
@@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
313 irq_free_desc(at); 311 irq_free_desc(at);
314} 312}
315 313
316static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
317{
318 return io_apic_ops.read(apic, reg);
319}
320
321static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
322{
323 io_apic_ops.write(apic, reg, value);
324}
325
326static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
327{
328 io_apic_ops.modify(apic, reg, value);
329}
330
331 314
332struct io_apic { 315struct io_apic {
333 unsigned int index; 316 unsigned int index;
@@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
349 writel(vector, &io_apic->eoi); 332 writel(vector, &io_apic->eoi);
350} 333}
351 334
352static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) 335unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
353{ 336{
354 struct io_apic __iomem *io_apic = io_apic_base(apic); 337 struct io_apic __iomem *io_apic = io_apic_base(apic);
355 writel(reg, &io_apic->index); 338 writel(reg, &io_apic->index);
356 return readl(&io_apic->data); 339 return readl(&io_apic->data);
357} 340}
358 341
359static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) 342void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
360{ 343{
361 struct io_apic __iomem *io_apic = io_apic_base(apic); 344 struct io_apic __iomem *io_apic = io_apic_base(apic);
362 345
@@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va
370 * 353 *
371 * Older SiS APIC requires we rewrite the index register 354 * Older SiS APIC requires we rewrite the index register
372 */ 355 */
373static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) 356void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
374{ 357{
375 struct io_apic __iomem *io_apic = io_apic_base(apic); 358 struct io_apic __iomem *io_apic = io_apic_base(apic);
376 359
@@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v
379 writel(value, &io_apic->data); 362 writel(value, &io_apic->data);
380} 363}
381 364
382static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
383{
384 struct irq_pin_list *entry;
385 unsigned long flags;
386
387 raw_spin_lock_irqsave(&ioapic_lock, flags);
388 for_each_irq_pin(entry, cfg->irq_2_pin) {
389 unsigned int reg;
390 int pin;
391
392 pin = entry->pin;
393 reg = io_apic_read(entry->apic, 0x10 + pin*2);
394 /* Is the remote IRR bit set? */
395 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
396 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
397 return true;
398 }
399 }
400 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
401
402 return false;
403}
404
405union entry_union { 365union entry_union {
406 struct { u32 w1, w2; }; 366 struct { u32 w1, w2; };
407 struct IO_APIC_route_entry entry; 367 struct IO_APIC_route_entry entry;
@@ -875,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type)
875 return -1; 835 return -1;
876} 836}
877 837
878#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 838#ifdef CONFIG_EISA
879/* 839/*
880 * EISA Edge/Level control register, ELCR 840 * EISA Edge/Level control register, ELCR
881 */ 841 */
@@ -912,12 +872,6 @@ static int EISA_ELCR(unsigned int irq)
912#define default_PCI_trigger(idx) (1) 872#define default_PCI_trigger(idx) (1)
913#define default_PCI_polarity(idx) (1) 873#define default_PCI_polarity(idx) (1)
914 874
915/* MCA interrupts are always polarity zero level triggered,
916 * when listed as conforming in the MP table. */
917
918#define default_MCA_trigger(idx) (1)
919#define default_MCA_polarity(idx) default_ISA_polarity(idx)
920
921static int irq_polarity(int idx) 875static int irq_polarity(int idx)
922{ 876{
923 int bus = mp_irqs[idx].srcbus; 877 int bus = mp_irqs[idx].srcbus;
@@ -975,7 +929,7 @@ static int irq_trigger(int idx)
975 trigger = default_ISA_trigger(idx); 929 trigger = default_ISA_trigger(idx);
976 else 930 else
977 trigger = default_PCI_trigger(idx); 931 trigger = default_PCI_trigger(idx);
978#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 932#ifdef CONFIG_EISA
979 switch (mp_bus_id_to_type[bus]) { 933 switch (mp_bus_id_to_type[bus]) {
980 case MP_BUS_ISA: /* ISA pin */ 934 case MP_BUS_ISA: /* ISA pin */
981 { 935 {
@@ -992,11 +946,6 @@ static int irq_trigger(int idx)
992 /* set before the switch */ 946 /* set before the switch */
993 break; 947 break;
994 } 948 }
995 case MP_BUS_MCA: /* MCA pin */
996 {
997 trigger = default_MCA_trigger(idx);
998 break;
999 }
1000 default: 949 default:
1001 { 950 {
1002 printk(KERN_WARNING "broken BIOS!!\n"); 951 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1361,77 +1310,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1361 fasteoi ? "fasteoi" : "edge"); 1310 fasteoi ? "fasteoi" : "edge");
1362} 1311}
1363 1312
1364
1365static int setup_ir_ioapic_entry(int irq,
1366 struct IR_IO_APIC_route_entry *entry,
1367 unsigned int destination, int vector,
1368 struct io_apic_irq_attr *attr)
1369{
1370 int index;
1371 struct irte irte;
1372 int ioapic_id = mpc_ioapic_id(attr->ioapic);
1373 struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
1374
1375 if (!iommu) {
1376 pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
1377 return -ENODEV;
1378 }
1379
1380 index = alloc_irte(iommu, irq, 1);
1381 if (index < 0) {
1382 pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
1383 return -ENOMEM;
1384 }
1385
1386 prepare_irte(&irte, vector, destination);
1387
1388 /* Set source-id of interrupt request */
1389 set_ioapic_sid(&irte, ioapic_id);
1390
1391 modify_irte(irq, &irte);
1392
1393 apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
1394 "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
1395 "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
1396 "Avail:%X Vector:%02X Dest:%08X "
1397 "SID:%04X SQ:%X SVT:%X)\n",
1398 attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
1399 irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
1400 irte.avail, irte.vector, irte.dest_id,
1401 irte.sid, irte.sq, irte.svt);
1402
1403 memset(entry, 0, sizeof(*entry));
1404
1405 entry->index2 = (index >> 15) & 0x1;
1406 entry->zero = 0;
1407 entry->format = 1;
1408 entry->index = (index & 0x7fff);
1409 /*
1410 * IO-APIC RTE will be configured with virtual vector.
1411 * irq handler will do the explicit EOI to the io-apic.
1412 */
1413 entry->vector = attr->ioapic_pin;
1414 entry->mask = 0; /* enable IRQ */
1415 entry->trigger = attr->trigger;
1416 entry->polarity = attr->polarity;
1417
1418 /* Mask level triggered irqs.
1419 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1420 */
1421 if (attr->trigger)
1422 entry->mask = 1;
1423
1424 return 0;
1425}
1426
1427static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, 1313static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
1428 unsigned int destination, int vector, 1314 unsigned int destination, int vector,
1429 struct io_apic_irq_attr *attr) 1315 struct io_apic_irq_attr *attr)
1430{ 1316{
1431 if (intr_remapping_enabled) 1317 if (irq_remapping_enabled)
1432 return setup_ir_ioapic_entry(irq, 1318 return setup_ioapic_remapped_entry(irq, entry, destination,
1433 (struct IR_IO_APIC_route_entry *)entry, 1319 vector, attr);
1434 destination, vector, attr);
1435 1320
1436 memset(entry, 0, sizeof(*entry)); 1321 memset(entry, 0, sizeof(*entry));
1437 1322
@@ -1588,7 +1473,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1588{ 1473{
1589 struct IO_APIC_route_entry entry; 1474 struct IO_APIC_route_entry entry;
1590 1475
1591 if (intr_remapping_enabled) 1476 if (irq_remapping_enabled)
1592 return; 1477 return;
1593 1478
1594 memset(&entry, 0, sizeof(entry)); 1479 memset(&entry, 0, sizeof(entry));
@@ -1674,7 +1559,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1674 1559
1675 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1560 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1676 1561
1677 if (intr_remapping_enabled) { 1562 if (irq_remapping_enabled) {
1678 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" 1563 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1679 " Pol Stat Indx2 Zero Vect:\n"); 1564 " Pol Stat Indx2 Zero Vect:\n");
1680 } else { 1565 } else {
@@ -1683,7 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1683 } 1568 }
1684 1569
1685 for (i = 0; i <= reg_01.bits.entries; i++) { 1570 for (i = 0; i <= reg_01.bits.entries; i++) {
1686 if (intr_remapping_enabled) { 1571 if (irq_remapping_enabled) {
1687 struct IO_APIC_route_entry entry; 1572 struct IO_APIC_route_entry entry;
1688 struct IR_IO_APIC_route_entry *ir_entry; 1573 struct IR_IO_APIC_route_entry *ir_entry;
1689 1574
@@ -2050,7 +1935,7 @@ void disable_IO_APIC(void)
2050 * IOAPIC RTE as well as interrupt-remapping table entry). 1935 * IOAPIC RTE as well as interrupt-remapping table entry).
2051 * As this gets called during crash dump, keep this simple for now. 1936 * As this gets called during crash dump, keep this simple for now.
2052 */ 1937 */
2053 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1938 if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
2054 struct IO_APIC_route_entry entry; 1939 struct IO_APIC_route_entry entry;
2055 1940
2056 memset(&entry, 0, sizeof(entry)); 1941 memset(&entry, 0, sizeof(entry));
@@ -2074,7 +1959,7 @@ void disable_IO_APIC(void)
2074 * Use virtual wire A mode when interrupt remapping is enabled. 1959 * Use virtual wire A mode when interrupt remapping is enabled.
2075 */ 1960 */
2076 if (cpu_has_apic || apic_from_smp_config()) 1961 if (cpu_has_apic || apic_from_smp_config())
2077 disconnect_bsp_APIC(!intr_remapping_enabled && 1962 disconnect_bsp_APIC(!irq_remapping_enabled &&
2078 ioapic_i8259.pin != -1); 1963 ioapic_i8259.pin != -1);
2079} 1964}
2080 1965
@@ -2390,71 +2275,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2390 return ret; 2275 return ret;
2391} 2276}
2392 2277
2393#ifdef CONFIG_IRQ_REMAP
2394
2395/*
2396 * Migrate the IO-APIC irq in the presence of intr-remapping.
2397 *
2398 * For both level and edge triggered, irq migration is a simple atomic
2399 * update(of vector and cpu destination) of IRTE and flush the hardware cache.
2400 *
2401 * For level triggered, we eliminate the io-apic RTE modification (with the
2402 * updated vector information), by using a virtual vector (io-apic pin number).
2403 * Real vector that is used for interrupting cpu will be coming from
2404 * the interrupt-remapping table entry.
2405 *
2406 * As the migration is a simple atomic update of IRTE, the same mechanism
2407 * is used to migrate MSI irq's in the presence of interrupt-remapping.
2408 */
2409static int
2410ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2411 bool force)
2412{
2413 struct irq_cfg *cfg = data->chip_data;
2414 unsigned int dest, irq = data->irq;
2415 struct irte irte;
2416
2417 if (!cpumask_intersects(mask, cpu_online_mask))
2418 return -EINVAL;
2419
2420 if (get_irte(irq, &irte))
2421 return -EBUSY;
2422
2423 if (assign_irq_vector(irq, cfg, mask))
2424 return -EBUSY;
2425
2426 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2427
2428 irte.vector = cfg->vector;
2429 irte.dest_id = IRTE_DEST(dest);
2430
2431 /*
2432 * Atomically updates the IRTE with the new destination, vector
2433 * and flushes the interrupt entry cache.
2434 */
2435 modify_irte(irq, &irte);
2436
2437 /*
2438 * After this point, all the interrupts will start arriving
2439 * at the new destination. So, time to cleanup the previous
2440 * vector allocation.
2441 */
2442 if (cfg->move_in_progress)
2443 send_cleanup_vector(cfg);
2444
2445 cpumask_copy(data->affinity, mask);
2446 return 0;
2447}
2448
2449#else
2450static inline int
2451ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2452 bool force)
2453{
2454 return 0;
2455}
2456#endif
2457
2458asmlinkage void smp_irq_move_cleanup_interrupt(void) 2278asmlinkage void smp_irq_move_cleanup_interrupt(void)
2459{ 2279{
2460 unsigned vector, me; 2280 unsigned vector, me;
@@ -2552,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data)
2552atomic_t irq_mis_count; 2372atomic_t irq_mis_count;
2553 2373
2554#ifdef CONFIG_GENERIC_PENDING_IRQ 2374#ifdef CONFIG_GENERIC_PENDING_IRQ
2375static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
2376{
2377 struct irq_pin_list *entry;
2378 unsigned long flags;
2379
2380 raw_spin_lock_irqsave(&ioapic_lock, flags);
2381 for_each_irq_pin(entry, cfg->irq_2_pin) {
2382 unsigned int reg;
2383 int pin;
2384
2385 pin = entry->pin;
2386 reg = io_apic_read(entry->apic, 0x10 + pin*2);
2387 /* Is the remote IRR bit set? */
2388 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
2389 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2390 return true;
2391 }
2392 }
2393 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2394
2395 return false;
2396}
2397
2555static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) 2398static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
2556{ 2399{
2557 /* If we are moving the irq we need to mask it */ 2400 /* If we are moving the irq we need to mask it */
@@ -2699,7 +2542,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2699 chip->irq_eoi = ir_ack_apic_level; 2542 chip->irq_eoi = ir_ack_apic_level;
2700 2543
2701#ifdef CONFIG_SMP 2544#ifdef CONFIG_SMP
2702 chip->irq_set_affinity = ir_ioapic_set_affinity; 2545 chip->irq_set_affinity = set_remapped_irq_affinity;
2703#endif 2546#endif
2704} 2547}
2705#endif /* CONFIG_IRQ_REMAP */ 2548#endif /* CONFIG_IRQ_REMAP */
@@ -2912,7 +2755,7 @@ static inline void __init check_timer(void)
2912 * 8259A. 2755 * 8259A.
2913 */ 2756 */
2914 if (pin1 == -1) { 2757 if (pin1 == -1) {
2915 if (intr_remapping_enabled) 2758 if (irq_remapping_enabled)
2916 panic("BIOS bug: timer not connected to IO-APIC"); 2759 panic("BIOS bug: timer not connected to IO-APIC");
2917 pin1 = pin2; 2760 pin1 = pin2;
2918 apic1 = apic2; 2761 apic1 = apic2;
@@ -2945,7 +2788,7 @@ static inline void __init check_timer(void)
2945 clear_IO_APIC_pin(0, pin1); 2788 clear_IO_APIC_pin(0, pin1);
2946 goto out; 2789 goto out;
2947 } 2790 }
2948 if (intr_remapping_enabled) 2791 if (irq_remapping_enabled)
2949 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2792 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2950 local_irq_disable(); 2793 local_irq_disable();
2951 clear_IO_APIC_pin(apic1, pin1); 2794 clear_IO_APIC_pin(apic1, pin1);
@@ -3169,7 +3012,7 @@ void destroy_irq(unsigned int irq)
3169 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3012 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3170 3013
3171 if (irq_remapped(cfg)) 3014 if (irq_remapped(cfg))
3172 free_irte(irq); 3015 free_remapped_irq(irq);
3173 raw_spin_lock_irqsave(&vector_lock, flags); 3016 raw_spin_lock_irqsave(&vector_lock, flags);
3174 __clear_irq_vector(irq, cfg); 3017 __clear_irq_vector(irq, cfg);
3175 raw_spin_unlock_irqrestore(&vector_lock, flags); 3018 raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3198,54 +3041,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3198 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3041 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3199 3042
3200 if (irq_remapped(cfg)) { 3043 if (irq_remapped(cfg)) {
3201 struct irte irte; 3044 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
3202 int ir_index; 3045 return err;
3203 u16 sub_handle; 3046 }
3204
3205 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3206 BUG_ON(ir_index == -1);
3207
3208 prepare_irte(&irte, cfg->vector, dest);
3209
3210 /* Set source-id of interrupt request */
3211 if (pdev)
3212 set_msi_sid(&irte, pdev);
3213 else
3214 set_hpet_sid(&irte, hpet_id);
3215
3216 modify_irte(irq, &irte);
3217 3047
3048 if (x2apic_enabled())
3049 msg->address_hi = MSI_ADDR_BASE_HI |
3050 MSI_ADDR_EXT_DEST_ID(dest);
3051 else
3218 msg->address_hi = MSI_ADDR_BASE_HI; 3052 msg->address_hi = MSI_ADDR_BASE_HI;
3219 msg->data = sub_handle;
3220 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
3221 MSI_ADDR_IR_SHV |
3222 MSI_ADDR_IR_INDEX1(ir_index) |
3223 MSI_ADDR_IR_INDEX2(ir_index);
3224 } else {
3225 if (x2apic_enabled())
3226 msg->address_hi = MSI_ADDR_BASE_HI |
3227 MSI_ADDR_EXT_DEST_ID(dest);
3228 else
3229 msg->address_hi = MSI_ADDR_BASE_HI;
3230 3053
3231 msg->address_lo = 3054 msg->address_lo =
3232 MSI_ADDR_BASE_LO | 3055 MSI_ADDR_BASE_LO |
3233 ((apic->irq_dest_mode == 0) ? 3056 ((apic->irq_dest_mode == 0) ?
3234 MSI_ADDR_DEST_MODE_PHYSICAL: 3057 MSI_ADDR_DEST_MODE_PHYSICAL:
3235 MSI_ADDR_DEST_MODE_LOGICAL) | 3058 MSI_ADDR_DEST_MODE_LOGICAL) |
3236 ((apic->irq_delivery_mode != dest_LowestPrio) ? 3059 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3237 MSI_ADDR_REDIRECTION_CPU: 3060 MSI_ADDR_REDIRECTION_CPU:
3238 MSI_ADDR_REDIRECTION_LOWPRI) | 3061 MSI_ADDR_REDIRECTION_LOWPRI) |
3239 MSI_ADDR_DEST_ID(dest); 3062 MSI_ADDR_DEST_ID(dest);
3063
3064 msg->data =
3065 MSI_DATA_TRIGGER_EDGE |
3066 MSI_DATA_LEVEL_ASSERT |
3067 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3068 MSI_DATA_DELIVERY_FIXED:
3069 MSI_DATA_DELIVERY_LOWPRI) |
3070 MSI_DATA_VECTOR(cfg->vector);
3240 3071
3241 msg->data =
3242 MSI_DATA_TRIGGER_EDGE |
3243 MSI_DATA_LEVEL_ASSERT |
3244 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3245 MSI_DATA_DELIVERY_FIXED:
3246 MSI_DATA_DELIVERY_LOWPRI) |
3247 MSI_DATA_VECTOR(cfg->vector);
3248 }
3249 return err; 3072 return err;
3250} 3073}
3251 3074
@@ -3288,33 +3111,6 @@ static struct irq_chip msi_chip = {
3288 .irq_retrigger = ioapic_retrigger_irq, 3111 .irq_retrigger = ioapic_retrigger_irq,
3289}; 3112};
3290 3113
3291/*
3292 * Map the PCI dev to the corresponding remapping hardware unit
3293 * and allocate 'nvec' consecutive interrupt-remapping table entries
3294 * in it.
3295 */
3296static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3297{
3298 struct intel_iommu *iommu;
3299 int index;
3300
3301 iommu = map_dev_to_ir(dev);
3302 if (!iommu) {
3303 printk(KERN_ERR
3304 "Unable to map PCI %s to iommu\n", pci_name(dev));
3305 return -ENOENT;
3306 }
3307
3308 index = alloc_irte(iommu, irq, nvec);
3309 if (index < 0) {
3310 printk(KERN_ERR
3311 "Unable to allocate %d IRTE for PCI %s\n", nvec,
3312 pci_name(dev));
3313 return -ENOSPC;
3314 }
3315 return index;
3316}
3317
3318static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3114static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3319{ 3115{
3320 struct irq_chip *chip = &msi_chip; 3116 struct irq_chip *chip = &msi_chip;
@@ -3345,7 +3141,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3345 int node, ret, sub_handle, index = 0; 3141 int node, ret, sub_handle, index = 0;
3346 unsigned int irq, irq_want; 3142 unsigned int irq, irq_want;
3347 struct msi_desc *msidesc; 3143 struct msi_desc *msidesc;
3348 struct intel_iommu *iommu = NULL;
3349 3144
3350 /* x86 doesn't support multiple MSI yet */ 3145 /* x86 doesn't support multiple MSI yet */
3351 if (type == PCI_CAP_ID_MSI && nvec > 1) 3146 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3359,7 +3154,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3359 if (irq == 0) 3154 if (irq == 0)
3360 return -1; 3155 return -1;
3361 irq_want = irq + 1; 3156 irq_want = irq + 1;
3362 if (!intr_remapping_enabled) 3157 if (!irq_remapping_enabled)
3363 goto no_ir; 3158 goto no_ir;
3364 3159
3365 if (!sub_handle) { 3160 if (!sub_handle) {
@@ -3367,23 +3162,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3367 * allocate the consecutive block of IRTE's 3162 * allocate the consecutive block of IRTE's
3368 * for 'nvec' 3163 * for 'nvec'
3369 */ 3164 */
3370 index = msi_alloc_irte(dev, irq, nvec); 3165 index = msi_alloc_remapped_irq(dev, irq, nvec);
3371 if (index < 0) { 3166 if (index < 0) {
3372 ret = index; 3167 ret = index;
3373 goto error; 3168 goto error;
3374 } 3169 }
3375 } else { 3170 } else {
3376 iommu = map_dev_to_ir(dev); 3171 ret = msi_setup_remapped_irq(dev, irq, index,
3377 if (!iommu) { 3172 sub_handle);
3378 ret = -ENOENT; 3173 if (ret < 0)
3379 goto error; 3174 goto error;
3380 }
3381 /*
3382 * setup the mapping between the irq and the IRTE
3383 * base index, the sub_handle pointing to the
3384 * appropriate interrupt remap table entry.
3385 */
3386 set_irte_irq(irq, iommu, index, sub_handle);
3387 } 3175 }
3388no_ir: 3176no_ir:
3389 ret = setup_msi_irq(dev, msidesc, irq); 3177 ret = setup_msi_irq(dev, msidesc, irq);
@@ -3501,15 +3289,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3501 struct msi_msg msg; 3289 struct msi_msg msg;
3502 int ret; 3290 int ret;
3503 3291
3504 if (intr_remapping_enabled) { 3292 if (irq_remapping_enabled) {
3505 struct intel_iommu *iommu = map_hpet_to_ir(id); 3293 if (!setup_hpet_msi_remapped(irq, id))
3506 int index;
3507
3508 if (!iommu)
3509 return -1;
3510
3511 index = alloc_irte(iommu, irq, 1);
3512 if (index < 0)
3513 return -1; 3294 return -1;
3514 } 3295 }
3515 3296
@@ -3888,8 +3669,8 @@ void __init setup_ioapic_dest(void)
3888 else 3669 else
3889 mask = apic->target_cpus(); 3670 mask = apic->target_cpus();
3890 3671
3891 if (intr_remapping_enabled) 3672 if (irq_remapping_enabled)
3892 ir_ioapic_set_affinity(idata, mask, false); 3673 set_remapped_irq_affinity(idata, mask, false);
3893 else 3674 else
3894 ioapic_set_affinity(idata, mask, false); 3675 ioapic_set_affinity(idata, mask, false);
3895 } 3676 }
@@ -3931,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
3931 return res; 3712 return res;
3932} 3713}
3933 3714
3934void __init ioapic_and_gsi_init(void) 3715void __init native_io_apic_init_mappings(void)
3935{
3936 io_apic_ops.init();
3937}
3938
3939static void __init __ioapic_init_mappings(void)
3940{ 3716{
3941 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3717 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3942 struct resource *ioapic_res; 3718 struct resource *ioapic_res;
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 00d2422ca7c..f00a68cca37 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {
530 530
531 .read = native_apic_mem_read, 531 .read = native_apic_mem_read,
532 .write = native_apic_mem_write, 532 .write = native_apic_mem_write,
533 .eoi_write = native_apic_mem_write,
533 .icr_read = native_apic_icr_read, 534 .icr_read = native_apic_icr_read,
534 .icr_write = native_apic_icr_write, 535 .icr_write = native_apic_icr_write,
535 .wait_icr_idle = native_apic_wait_icr_idle, 536 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ff2c1b9aac4..1b291da09e6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -142,6 +142,7 @@ static struct apic apic_default = {
142 142
143 .read = native_apic_mem_read, 143 .read = native_apic_mem_read,
144 .write = native_apic_mem_write, 144 .write = native_apic_mem_write,
145 .eoi_write = native_apic_mem_write,
145 .icr_read = native_apic_icr_read, 146 .icr_read = native_apic_icr_read,
146 .icr_write = native_apic_icr_write, 147 .icr_write = native_apic_icr_write,
147 .wait_icr_idle = native_apic_wait_icr_idle, 148 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index fea000b27f0..659897c0075 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -546,6 +546,7 @@ static struct apic apic_summit = {
546 546
547 .read = native_apic_mem_read, 547 .read = native_apic_mem_read,
548 .write = native_apic_mem_write, 548 .write = native_apic_mem_write,
549 .eoi_write = native_apic_mem_write,
549 .icr_read = native_apic_icr_read, 550 .icr_read = native_apic_icr_read,
550 .icr_write = native_apic_icr_write, 551 .icr_write = native_apic_icr_write,
551 .wait_icr_idle = native_apic_wait_icr_idle, 552 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 48f3103b3c9..ff35cff0e1a 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
260 260
261 .read = native_apic_msr_read, 261 .read = native_apic_msr_read,
262 .write = native_apic_msr_write, 262 .write = native_apic_msr_write,
263 .eoi_write = native_apic_msr_eoi_write,
263 .icr_read = native_x2apic_icr_read, 264 .icr_read = native_x2apic_icr_read,
264 .icr_write = native_x2apic_icr_write, 265 .icr_write = native_x2apic_icr_write,
265 .wait_icr_idle = native_x2apic_wait_icr_idle, 266 .wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315f422..c17e982db27 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = {
172 172
173 .read = native_apic_msr_read, 173 .read = native_apic_msr_read,
174 .write = native_apic_msr_write, 174 .write = native_apic_msr_write,
175 .eoi_write = native_apic_msr_eoi_write,
175 .icr_read = native_x2apic_icr_read, 176 .icr_read = native_x2apic_icr_read,
176 .icr_write = native_x2apic_icr_write, 177 .icr_write = native_x2apic_icr_write,
177 .wait_icr_idle = native_x2apic_wait_icr_idle, 178 .wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 87bfa69e216..c6d03f7a440 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
404 404
405 .read = native_apic_msr_read, 405 .read = native_apic_msr_read,
406 .write = native_apic_msr_write, 406 .write = native_apic_msr_write,
407 .eoi_write = native_apic_msr_eoi_write,
407 .icr_read = native_x2apic_icr_read, 408 .icr_read = native_x2apic_icr_read,
408 .icr_write = native_x2apic_icr_write, 409 .icr_write = native_x2apic_icr_write,
409 .wait_icr_idle = native_x2apic_wait_icr_idle, 410 .wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 459e78cbf61..07b0c0db466 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2401,7 +2401,7 @@ static void __exit apm_exit(void)
2401 * (pm_idle), Wait for all processors to update cached/local 2401 * (pm_idle), Wait for all processors to update cached/local
2402 * copies of pm_idle before proceeding. 2402 * copies of pm_idle before proceeding.
2403 */ 2403 */
2404 cpu_idle_wait(); 2404 kick_all_cpus_sync();
2405 } 2405 }
2406 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) 2406 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2407 && (apm_info.connection_version > 0x0100)) { 2407 && (apm_info.connection_version > 0x0100)) {
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8dd..e2dbcb7dabd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
27 27
28static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
29{ 29{
30 char *end; 30 ssize_t ret;
31 unsigned long val;
31 32
32 memory_corruption_check = simple_strtol(arg, &end, 10); 33 ret = kstrtoul(arg, 10, &val);
34 if (ret)
35 return ret;
33 36
34 return (*end == 0) ? 0 : -EINVAL; 37 memory_corruption_check = val;
38 return 0;
35} 39}
36early_param("memory_corruption_check", set_corruption_check); 40early_param("memory_corruption_check", set_corruption_check);
37 41
38static __init int set_corruption_check_period(char *arg) 42static __init int set_corruption_check_period(char *arg)
39{ 43{
40 char *end; 44 ssize_t ret;
45 unsigned long val;
41 46
42 corruption_check_period = simple_strtoul(arg, &end, 10); 47 ret = kstrtoul(arg, 10, &val);
48 if (ret)
49 return ret;
43 50
44 return (*end == 0) ? 0 : -EINVAL; 51 corruption_check_period = val;
52 return 0;
45} 53}
46early_param("memory_corruption_check_period", set_corruption_check_period); 54early_param("memory_corruption_check_period", set_corruption_check_period);
47 55
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cf79302198a..82f29e70d05 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void)
1185 oist = &per_cpu(orig_ist, cpu); 1185 oist = &per_cpu(orig_ist, cpu);
1186 1186
1187#ifdef CONFIG_NUMA 1187#ifdef CONFIG_NUMA
1188 if (cpu != 0 && percpu_read(numa_node) == 0 && 1188 if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
1189 early_cpu_to_node(cpu) != NUMA_NO_NODE) 1189 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1190 set_numa_node(early_cpu_to_node(cpu)); 1190 set_numa_node(early_cpu_to_node(cpu));
1191#endif 1191#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b8f3653dddb..9a7c90d80bc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
615 new_l2 = this_leaf.size/1024; 615 new_l2 = this_leaf.size/1024;
616 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 616 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
617 index_msb = get_count_order(num_threads_sharing); 617 index_msb = get_count_order(num_threads_sharing);
618 l2_id = c->apicid >> index_msb; 618 l2_id = c->apicid & ~((1 << index_msb) - 1);
619 break; 619 break;
620 case 3: 620 case 3:
621 new_l3 = this_leaf.size/1024; 621 new_l3 = this_leaf.size/1024;
622 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 622 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
623 index_msb = get_count_order( 623 index_msb = get_count_order(
624 num_threads_sharing); 624 num_threads_sharing);
625 l3_id = c->apicid >> index_msb; 625 l3_id = c->apicid & ~((1 << index_msb) - 1);
626 break; 626 break;
627 default: 627 default:
628 break; 628 break;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 5502b289341..36565373af8 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -23,7 +23,7 @@
23 * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) 23 * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
24 * 24 *
25 * Arrays used to match for this should also be declared using 25 * Arrays used to match for this should also be declared using
26 * MODULE_DEVICE_TABLE(x86_cpu, ...) 26 * MODULE_DEVICE_TABLE(x86cpu, ...)
27 * 27 *
28 * This always matches against the boot cpu, assuming models and features are 28 * This always matches against the boot cpu, assuming models and features are
29 * consistent over all CPUs. 29 * consistent over all CPUs.
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e..cd8b166a173 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
42 struct mce m; 42 struct mce m;
43 43
44 /* Only corrected MC is reported */ 44 /* Only corrected MC is reported */
45 if (!corrected) 45 if (!corrected || !(mem_err->validation_bits &
46 CPER_MEM_VALID_PHYSICAL_ADDRESS))
46 return; 47 return;
47 48
48 mce_setup(&m); 49 mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 0c82091b165..413c2ced887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
127 USER 127 USER
128 ), 128 ),
129 MCESEV(
130 KEEP, "HT thread notices Action required: instruction fetch error",
131 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
132 MCGMASK(MCG_STATUS_EIPV, 0)
133 ),
134 MCESEV(
135 AR, "Action required: instruction fetch error",
136 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
137 USER
138 ),
129#endif 139#endif
130 MCESEV( 140 MCESEV(
131 PANIC, "Action required: unknown MCACOD", 141 PANIC, "Action required: unknown MCACOD",
@@ -165,15 +175,19 @@ static struct severity {
165}; 175};
166 176
167/* 177/*
168 * If the EIPV bit is set, it means the saved IP is the 178 * If mcgstatus indicated that ip/cs on the stack were
169 * instruction which caused the MCE. 179 * no good, then "m->cs" will be zero and we will have
180 * to assume the worst case (IN_KERNEL) as we actually
181 * have no idea what we were executing when the machine
182 * check hit.
183 * If we do have a good "m->cs" (or a faked one in the
184 * case we were executing in VM86 mode) we can use it to
185 * distinguish an exception taken in user from from one
186 * taken in the kernel.
170 */ 187 */
171static int error_context(struct mce *m) 188static int error_context(struct mce *m)
172{ 189{
173 if (m->mcgstatus & MCG_STATUS_EIPV) 190 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
174 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
175 /* Unknown, assume kernel */
176 return IN_KERNEL;
177} 191}
178 192
179int mce_severity(struct mce *m, int tolerant, char **msg) 193int mce_severity(struct mce *m, int tolerant, char **msg)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087..b772dd6ad45 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
437 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 437 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
438 m->ip = regs->ip; 438 m->ip = regs->ip;
439 m->cs = regs->cs; 439 m->cs = regs->cs;
440
441 /*
442 * When in VM86 mode make the cs look like ring 3
443 * always. This is a lie, but it's better than passing
444 * the additional vm86 bit around everywhere.
445 */
446 if (v8086_mode(regs))
447 m->cs |= 3;
440 } 448 }
441 /* Use accurate RIP reporting if available. */ 449 /* Use accurate RIP reporting if available. */
442 if (rip_msr) 450 if (rip_msr)
@@ -583,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
583 struct mce m; 591 struct mce m;
584 int i; 592 int i;
585 593
586 percpu_inc(mce_poll_count); 594 this_cpu_inc(mce_poll_count);
587 595
588 mce_gather_info(&m, NULL); 596 mce_gather_info(&m, NULL);
589 597
@@ -641,16 +649,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
641 * Do a quick check if any of the events requires a panic. 649 * Do a quick check if any of the events requires a panic.
642 * This decides if we keep the events around or clear them. 650 * This decides if we keep the events around or clear them.
643 */ 651 */
644static int mce_no_way_out(struct mce *m, char **msg) 652static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
645{ 653{
646 int i; 654 int i, ret = 0;
647 655
648 for (i = 0; i < banks; i++) { 656 for (i = 0; i < banks; i++) {
649 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 657 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
658 if (m->status & MCI_STATUS_VAL)
659 __set_bit(i, validp);
650 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 660 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
651 return 1; 661 ret = 1;
652 } 662 }
653 return 0; 663 return ret;
654} 664}
655 665
656/* 666/*
@@ -945,9 +955,10 @@ struct mce_info {
945 atomic_t inuse; 955 atomic_t inuse;
946 struct task_struct *t; 956 struct task_struct *t;
947 __u64 paddr; 957 __u64 paddr;
958 int restartable;
948} mce_info[MCE_INFO_MAX]; 959} mce_info[MCE_INFO_MAX];
949 960
950static void mce_save_info(__u64 addr) 961static void mce_save_info(__u64 addr, int c)
951{ 962{
952 struct mce_info *mi; 963 struct mce_info *mi;
953 964
@@ -955,6 +966,7 @@ static void mce_save_info(__u64 addr)
955 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { 966 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
956 mi->t = current; 967 mi->t = current;
957 mi->paddr = addr; 968 mi->paddr = addr;
969 mi->restartable = c;
958 return; 970 return;
959 } 971 }
960 } 972 }
@@ -1011,11 +1023,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1011 */ 1023 */
1012 int kill_it = 0; 1024 int kill_it = 0;
1013 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1025 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1026 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1014 char *msg = "Unknown"; 1027 char *msg = "Unknown";
1015 1028
1016 atomic_inc(&mce_entry); 1029 atomic_inc(&mce_entry);
1017 1030
1018 percpu_inc(mce_exception_count); 1031 this_cpu_inc(mce_exception_count);
1019 1032
1020 if (!banks) 1033 if (!banks)
1021 goto out; 1034 goto out;
@@ -1025,7 +1038,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1025 final = &__get_cpu_var(mces_seen); 1038 final = &__get_cpu_var(mces_seen);
1026 *final = m; 1039 *final = m;
1027 1040
1028 no_way_out = mce_no_way_out(&m, &msg); 1041 memset(valid_banks, 0, sizeof(valid_banks));
1042 no_way_out = mce_no_way_out(&m, &msg, valid_banks);
1029 1043
1030 barrier(); 1044 barrier();
1031 1045
@@ -1045,6 +1059,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1045 order = mce_start(&no_way_out); 1059 order = mce_start(&no_way_out);
1046 for (i = 0; i < banks; i++) { 1060 for (i = 0; i < banks; i++) {
1047 __clear_bit(i, toclear); 1061 __clear_bit(i, toclear);
1062 if (!test_bit(i, valid_banks))
1063 continue;
1048 if (!mce_banks[i].ctl) 1064 if (!mce_banks[i].ctl)
1049 continue; 1065 continue;
1050 1066
@@ -1130,7 +1146,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1130 mce_panic("Fatal machine check on current CPU", &m, msg); 1146 mce_panic("Fatal machine check on current CPU", &m, msg);
1131 if (worst == MCE_AR_SEVERITY) { 1147 if (worst == MCE_AR_SEVERITY) {
1132 /* schedule action before return to userland */ 1148 /* schedule action before return to userland */
1133 mce_save_info(m.addr); 1149 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1134 set_thread_flag(TIF_MCE_NOTIFY); 1150 set_thread_flag(TIF_MCE_NOTIFY);
1135 } else if (kill_it) { 1151 } else if (kill_it) {
1136 force_sig(SIGBUS, current); 1152 force_sig(SIGBUS, current);
@@ -1179,7 +1195,13 @@ void mce_notify_process(void)
1179 1195
1180 pr_err("Uncorrected hardware memory error in user-access at %llx", 1196 pr_err("Uncorrected hardware memory error in user-access at %llx",
1181 mi->paddr); 1197 mi->paddr);
1182 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { 1198 /*
1199 * We must call memory_failure() here even if the current process is
1200 * doomed. We still need to mark the page as poisoned and alert any
1201 * other users of the page.
1202 */
1203 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
1204 mi->restartable == 0) {
1183 pr_err("Memory error not recovered"); 1205 pr_err("Memory error not recovered");
1184 force_sig(SIGBUS, current); 1206 force_sig(SIGBUS, current);
1185 } 1207 }
@@ -1423,6 +1445,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1423 */ 1445 */
1424 if (c->x86 == 6 && banks > 0) 1446 if (c->x86 == 6 && banks > 0)
1425 mce_banks[0].ctl = 0; 1447 mce_banks[0].ctl = 0;
1448
1449 /*
1450 * Turn off MC4_MISC thresholding banks on those models since
1451 * they're not supported there.
1452 */
1453 if (c->x86 == 0x15 &&
1454 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1455 int i;
1456 u64 val, hwcr;
1457 bool need_toggle;
1458 u32 msrs[] = {
1459 0x00000413, /* MC4_MISC0 */
1460 0xc0000408, /* MC4_MISC1 */
1461 };
1462
1463 rdmsrl(MSR_K7_HWCR, hwcr);
1464
1465 /* McStatusWrEn has to be set */
1466 need_toggle = !(hwcr & BIT(18));
1467
1468 if (need_toggle)
1469 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1470
1471 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1472 rdmsrl(msrs[i], val);
1473
1474 /* CntP bit set? */
1475 if (val & BIT(62)) {
1476 val &= ~BIT(62);
1477 wrmsrl(msrs[i], val);
1478 }
1479 }
1480
1481 /* restore old settings */
1482 if (need_toggle)
1483 wrmsrl(MSR_K7_HWCR, hwcr);
1484 }
1426 } 1485 }
1427 1486
1428 if (c->x86_vendor == X86_VENDOR_INTEL) { 1487 if (c->x86_vendor == X86_VENDOR_INTEL) {
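
The MC4_MISC quirk added above temporarily sets McStatusWrEn (HWCR bit 18) so that the CntP bit (bit 62) can be cleared in the two MC4_MISC thresholding registers. As an illustration only (not part of the patch), the same bits can be inspected from user space through the msr driver; the sketch below assumes /dev/cpu/0/msr is available (CONFIG_X86_MSR) and uses MSR_K7_HWCR = 0xc0010015 plus the two MC4_MISC addresses listed in the hunk.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative sketch: dump HWCR.McStatusWrEn and MC4_MISC.CntP on CPU 0. */
static int rdmsr(int fd, uint32_t msr, uint64_t *val)
{
	return pread(fd, val, sizeof(*val), msr) == sizeof(*val) ? 0 : -1;
}

int main(void)
{
	uint32_t msrs[] = { 0x00000413, 0xc0000408 };	/* MC4_MISC0, MC4_MISC1 */
	uint64_t val;
	unsigned int i;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}

	if (!rdmsr(fd, 0xc0010015, &val))	/* MSR_K7_HWCR */
		printf("McStatusWrEn: %u\n", (unsigned int)((val >> 18) & 1));

	for (i = 0; i < sizeof(msrs) / sizeof(msrs[0]); i++)
		if (!rdmsr(fd, msrs[i], &val))	/* may fail on other families */
			printf("MSR %#x CntP: %u\n", (unsigned int)msrs[i],
			       (unsigned int)((val >> 62) & 1));

	close(fd);
	return 0;
}
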
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 99b57179f91..f4873a64f46 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -51,6 +51,7 @@ struct threshold_block {
51 unsigned int cpu; 51 unsigned int cpu;
52 u32 address; 52 u32 address;
53 u16 interrupt_enable; 53 u16 interrupt_enable;
54 bool interrupt_capable;
54 u16 threshold_limit; 55 u16 threshold_limit;
55 struct kobject kobj; 56 struct kobject kobj;
56 struct list_head miscj; 57 struct list_head miscj;
@@ -83,6 +84,21 @@ struct thresh_restart {
83 u16 old_limit; 84 u16 old_limit;
84}; 85};
85 86
87static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
88{
89 /*
90 * bank 4 supports APIC LVT interrupts implicitly since forever.
91 */
92 if (bank == 4)
93 return true;
94
95 /*
96 * IntP: interrupt present; if this bit is set, the thresholding
97 * bank can generate APIC LVT interrupts
98 */
99 return msr_high_bits & BIT(28);
100}
101
86static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) 102static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
87{ 103{
88 int msr = (hi & MASK_LVTOFF_HI) >> 20; 104 int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
104 return 1; 120 return 1;
105}; 121};
106 122
107/* must be called with correct cpu affinity */ 123/*
108/* Called via smp_call_function_single() */ 124 * Called via smp_call_function_single(), must be called with correct
125 * cpu affinity.
126 */
109static void threshold_restart_bank(void *_tr) 127static void threshold_restart_bank(void *_tr)
110{ 128{
111 struct thresh_restart *tr = _tr; 129 struct thresh_restart *tr = _tr;
@@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr)
128 (new_count & THRESHOLD_MAX); 146 (new_count & THRESHOLD_MAX);
129 } 147 }
130 148
149 /* clear IntType */
150 hi &= ~MASK_INT_TYPE_HI;
151
152 if (!tr->b->interrupt_capable)
153 goto done;
154
131 if (tr->set_lvt_off) { 155 if (tr->set_lvt_off) {
132 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { 156 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
133 /* set new lvt offset */ 157 /* set new lvt offset */
@@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr)
136 } 160 }
137 } 161 }
138 162
139 tr->b->interrupt_enable ? 163 if (tr->b->interrupt_enable)
140 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 164 hi |= INT_TYPE_APIC;
141 (hi &= ~MASK_INT_TYPE_HI); 165
166 done:
142 167
143 hi |= MASK_COUNT_EN_HI; 168 hi |= MASK_COUNT_EN_HI;
144 wrmsr(tr->b->address, lo, hi); 169 wrmsr(tr->b->address, lo, hi);
@@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
202 if (shared_bank[bank] && c->cpu_core_id) 227 if (shared_bank[bank] && c->cpu_core_id)
203 break; 228 break;
204 229
205 offset = setup_APIC_mce(offset,
206 (high & MASK_LVTOFF_HI) >> 20);
207
208 memset(&b, 0, sizeof(b)); 230 memset(&b, 0, sizeof(b));
209 b.cpu = cpu; 231 b.cpu = cpu;
210 b.bank = bank; 232 b.bank = bank;
211 b.block = block; 233 b.block = block;
212 b.address = address; 234 b.address = address;
235 b.interrupt_capable = lvt_interrupt_supported(bank, high);
236
237 if (b.interrupt_capable) {
238 int new = (high & MASK_LVTOFF_HI) >> 20;
239 offset = setup_APIC_mce(offset, new);
240 }
213 241
214 mce_threshold_block_init(&b, offset); 242 mce_threshold_block_init(&b, offset);
215 mce_threshold_vector = amd_threshold_interrupt; 243 mce_threshold_vector = amd_threshold_interrupt;
@@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
309 struct thresh_restart tr; 337 struct thresh_restart tr;
310 unsigned long new; 338 unsigned long new;
311 339
340 if (!b->interrupt_capable)
341 return -EINVAL;
342
312 if (strict_strtoul(buf, 0, &new) < 0) 343 if (strict_strtoul(buf, 0, &new) < 0)
313 return -EINVAL; 344 return -EINVAL;
314 345
@@ -390,10 +421,10 @@ RW_ATTR(threshold_limit);
390RW_ATTR(error_count); 421RW_ATTR(error_count);
391 422
392static struct attribute *default_attrs[] = { 423static struct attribute *default_attrs[] = {
393 &interrupt_enable.attr,
394 &threshold_limit.attr, 424 &threshold_limit.attr,
395 &error_count.attr, 425 &error_count.attr,
396 NULL 426 NULL, /* possibly interrupt_enable if supported, see below */
427 NULL,
397}; 428};
398 429
399#define to_block(k) container_of(k, struct threshold_block, kobj) 430#define to_block(k) container_of(k, struct threshold_block, kobj)
@@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
467 b->cpu = cpu; 498 b->cpu = cpu;
468 b->address = address; 499 b->address = address;
469 b->interrupt_enable = 0; 500 b->interrupt_enable = 0;
501 b->interrupt_capable = lvt_interrupt_supported(bank, high);
470 b->threshold_limit = THRESHOLD_MAX; 502 b->threshold_limit = THRESHOLD_MAX;
471 503
504 if (b->interrupt_capable)
505 threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
506 else
507 threshold_ktype.default_attrs[2] = NULL;
508
472 INIT_LIST_HEAD(&b->miscj); 509 INIT_LIST_HEAD(&b->miscj);
473 510
474 if (per_cpu(threshold_banks, cpu)[bank]->blocks) { 511 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bb8e03407e1..e049d6da018 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)
484 484
485 /* mark unused */ 485 /* mark unused */
486 event->hw.extra_reg.idx = EXTRA_REG_NONE; 486 event->hw.extra_reg.idx = EXTRA_REG_NONE;
487
488 /* mark not used */
489 event->hw.extra_reg.idx = EXTRA_REG_NONE;
490 event->hw.branch_reg.idx = EXTRA_REG_NONE; 487 event->hw.branch_reg.idx = EXTRA_REG_NONE;
491 488
492 return x86_pmu.hw_config(event); 489 return x86_pmu.hw_config(event);
@@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
1186 int idx, handled = 0; 1183 int idx, handled = 0;
1187 u64 val; 1184 u64 val;
1188 1185
1189 perf_sample_data_init(&data, 0);
1190
1191 cpuc = &__get_cpu_var(cpu_hw_events); 1186 cpuc = &__get_cpu_var(cpu_hw_events);
1192 1187
1193 /* 1188 /*
@@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
1222 * event overflow 1217 * event overflow
1223 */ 1218 */
1224 handled++; 1219 handled++;
1225 data.period = event->hw.last_period; 1220 perf_sample_data_init(&data, 0, event->hw.last_period);
1226 1221
1227 if (!x86_perf_event_set_period(event)) 1222 if (!x86_perf_event_set_period(event))
1228 continue; 1223 continue;
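
This hunk is the first of several identical conversions in this diff (the intel, intel_ds and p4 handlers below follow the same pattern): the sampling period is now handed to perf_sample_data_init() at initialization time rather than being patched into data.period afterwards. A minimal sketch of the resulting overflow path, assuming the three-argument perf_sample_data_init() helper these call sites rely on:

#include <linux/perf_event.h>
#include <linux/printk.h>

/* Sketch only: report an overflow with the event's last sampling period. */
static void report_overflow(struct perf_event *event, struct pt_regs *regs)
{
	struct perf_sample_data data;

	/* old style: perf_sample_data_init(&data, 0); data.period = ...; */
	perf_sample_data_init(&data, 0, event->hw.last_period);

	if (perf_event_overflow(event, &data, regs))
		pr_debug("overflow handler asked us to stop the event\n");
}
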
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 95e7fe1c5f0..11a4eb9131d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)
134 134
135static int amd_pmu_hw_config(struct perf_event *event) 135static int amd_pmu_hw_config(struct perf_event *event)
136{ 136{
137 int ret = x86_pmu_hw_config(event); 137 int ret;
138 138
139 /* pass precise event sampling to ibs: */
140 if (event->attr.precise_ip && get_ibs_caps())
141 return -ENOENT;
142
143 ret = x86_pmu_hw_config(event);
139 if (ret) 144 if (ret)
140 return ret; 145 return ret;
141 146
@@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
205 * when we come here 210 * when we come here
206 */ 211 */
207 for (i = 0; i < x86_pmu.num_counters; i++) { 212 for (i = 0; i < x86_pmu.num_counters; i++) {
208 if (nb->owners[i] == event) { 213 if (cmpxchg(nb->owners + i, event, NULL) == event)
209 cmpxchg(nb->owners+i, event, NULL);
210 break; 214 break;
211 }
212 } 215 }
213} 216}
214 217
@@ -493,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = {
493 * 0x023 DE PERF_CTL[2:0] 496 * 0x023 DE PERF_CTL[2:0]
494 * 0x02D LS PERF_CTL[3] 497 * 0x02D LS PERF_CTL[3]
495 * 0x02E LS PERF_CTL[3,0] 498 * 0x02E LS PERF_CTL[3,0]
499 * 0x031 LS PERF_CTL[2:0] (**)
496 * 0x043 CU PERF_CTL[2:0] 500 * 0x043 CU PERF_CTL[2:0]
497 * 0x045 CU PERF_CTL[2:0] 501 * 0x045 CU PERF_CTL[2:0]
498 * 0x046 CU PERF_CTL[2:0] 502 * 0x046 CU PERF_CTL[2:0]
@@ -506,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = {
506 * 0x0DD LS PERF_CTL[5:0] 510 * 0x0DD LS PERF_CTL[5:0]
507 * 0x0DE LS PERF_CTL[5:0] 511 * 0x0DE LS PERF_CTL[5:0]
508 * 0x0DF LS PERF_CTL[5:0] 512 * 0x0DF LS PERF_CTL[5:0]
513 * 0x1C0 EX PERF_CTL[5:3]
509 * 0x1D6 EX PERF_CTL[5:0] 514 * 0x1D6 EX PERF_CTL[5:0]
510 * 0x1D8 EX PERF_CTL[5:0] 515 * 0x1D8 EX PERF_CTL[5:0]
511 * 516 *
512 * (*) depending on the umask all FPU counters may be used 517 * (*) depending on the umask all FPU counters may be used
518 * (**) only one unitmask enabled at a time
513 */ 519 */
514 520
515static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 521static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -559,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
559 return &amd_f15_PMC3; 565 return &amd_f15_PMC3;
560 case 0x02E: 566 case 0x02E:
561 return &amd_f15_PMC30; 567 return &amd_f15_PMC30;
568 case 0x031:
569 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
570 return &amd_f15_PMC20;
571 return &emptyconstraint;
572 case 0x1C0:
573 return &amd_f15_PMC53;
562 default: 574 default:
563 return &amd_f15_PMC50; 575 return &amd_f15_PMC50;
564 } 576 }
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d30d14..da9bcdcd985 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,6 +9,7 @@
9#include <linux/perf_event.h> 9#include <linux/perf_event.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/ptrace.h>
12 13
13#include <asm/apic.h> 14#include <asm/apic.h>
14 15
@@ -16,36 +17,591 @@ static u32 ibs_caps;
16 17
17#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 18#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
18 19
19static struct pmu perf_ibs; 20#include <linux/kprobes.h>
21#include <linux/hardirq.h>
22
23#include <asm/nmi.h>
24
25#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
26#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
27
28enum ibs_states {
29 IBS_ENABLED = 0,
30 IBS_STARTED = 1,
31 IBS_STOPPING = 2,
32
33 IBS_MAX_STATES,
34};
35
36struct cpu_perf_ibs {
37 struct perf_event *event;
38 unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
39};
40
41struct perf_ibs {
42 struct pmu pmu;
43 unsigned int msr;
44 u64 config_mask;
45 u64 cnt_mask;
46 u64 enable_mask;
47 u64 valid_mask;
48 u64 max_period;
49 unsigned long offset_mask[1];
50 int offset_max;
51 struct cpu_perf_ibs __percpu *pcpu;
52 u64 (*get_count)(u64 config);
53};
54
55struct perf_ibs_data {
56 u32 size;
57 union {
58 u32 data[0]; /* data buffer starts here */
59 u32 caps;
60 };
61 u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
62};
63
64static int
65perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
66{
67 s64 left = local64_read(&hwc->period_left);
68 s64 period = hwc->sample_period;
69 int overflow = 0;
70
71 /*
72 * If we are way outside a reasonable range then just skip forward:
73 */
74 if (unlikely(left <= -period)) {
75 left = period;
76 local64_set(&hwc->period_left, left);
77 hwc->last_period = period;
78 overflow = 1;
79 }
80
81 if (unlikely(left < (s64)min)) {
82 left += period;
83 local64_set(&hwc->period_left, left);
84 hwc->last_period = period;
85 overflow = 1;
86 }
87
88 /*
89 * If the hw period that triggers the sw overflow is too short
90 * we might hit the irq handler. This biases the results.
91 * Thus we shorten the next-to-last period and set the last
92 * period to the max period.
93 */
94 if (left > max) {
95 left -= max;
96 if (left > max)
97 left = max;
98 else if (left < min)
99 left = min;
100 }
101
102 *hw_period = (u64)left;
103
104 return overflow;
105}
106
107static int
108perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
109{
110 struct hw_perf_event *hwc = &event->hw;
111 int shift = 64 - width;
112 u64 prev_raw_count;
113 u64 delta;
114
115 /*
116 * Careful: an NMI might modify the previous event value.
117 *
118 * Our tactic to handle this is to first atomically read and
119 * exchange a new raw count - then add that new-prev delta
120 * count to the generic event atomically:
121 */
122 prev_raw_count = local64_read(&hwc->prev_count);
123 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
124 new_raw_count) != prev_raw_count)
125 return 0;
126
127 /*
128 * Now we have the new raw value and have updated the prev
129 * timestamp already. We can now calculate the elapsed delta
130 * (event-)time and add that to the generic event.
131 *
132 * Careful, not all hw sign-extends above the physical width
133 * of the count.
134 */
135 delta = (new_raw_count << shift) - (prev_raw_count << shift);
136 delta >>= shift;
137
138 local64_add(delta, &event->count);
139 local64_sub(delta, &hwc->period_left);
140
141 return 1;
142}
143
144static struct perf_ibs perf_ibs_fetch;
145static struct perf_ibs perf_ibs_op;
146
147static struct perf_ibs *get_ibs_pmu(int type)
148{
149 if (perf_ibs_fetch.pmu.type == type)
150 return &perf_ibs_fetch;
151 if (perf_ibs_op.pmu.type == type)
152 return &perf_ibs_op;
153 return NULL;
154}
155
156/*
157 * Use IBS for precise event sampling:
158 *
159 * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
160 * perf record -a -e r076:p ... # same as -e cpu-cycles:p
161 * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
162 *
163 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
164 * MSRC001_1033) is used to select either cycle or micro-ops counting
165 * mode.
166 *
167 * The rip of IBS samples has skid 0. Thus, IBS supports precise
168 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
169 * rip is invalid when IBS was not able to record the rip correctly.
170 * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
171 *
172 */
173static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
174{
175 switch (event->attr.precise_ip) {
176 case 0:
177 return -ENOENT;
178 case 1:
179 case 2:
180 break;
181 default:
182 return -EOPNOTSUPP;
183 }
184
185 switch (event->attr.type) {
186 case PERF_TYPE_HARDWARE:
187 switch (event->attr.config) {
188 case PERF_COUNT_HW_CPU_CYCLES:
189 *config = 0;
190 return 0;
191 }
192 break;
193 case PERF_TYPE_RAW:
194 switch (event->attr.config) {
195 case 0x0076:
196 *config = 0;
197 return 0;
198 case 0x00C1:
199 *config = IBS_OP_CNT_CTL;
200 return 0;
201 }
202 break;
203 default:
204 return -ENOENT;
205 }
206
207 return -EOPNOTSUPP;
208}
20 209
21static int perf_ibs_init(struct perf_event *event) 210static int perf_ibs_init(struct perf_event *event)
22{ 211{
23 if (perf_ibs.type != event->attr.type) 212 struct hw_perf_event *hwc = &event->hw;
213 struct perf_ibs *perf_ibs;
214 u64 max_cnt, config;
215 int ret;
216
217 perf_ibs = get_ibs_pmu(event->attr.type);
218 if (perf_ibs) {
219 config = event->attr.config;
220 } else {
221 perf_ibs = &perf_ibs_op;
222 ret = perf_ibs_precise_event(event, &config);
223 if (ret)
224 return ret;
225 }
226
227 if (event->pmu != &perf_ibs->pmu)
24 return -ENOENT; 228 return -ENOENT;
229
230 if (config & ~perf_ibs->config_mask)
231 return -EINVAL;
232
233 if (hwc->sample_period) {
234 if (config & perf_ibs->cnt_mask)
235 /* raw max_cnt may not be set */
236 return -EINVAL;
237 if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
238 /*
239 * the lower 4 bits cannot be set in ibs max cnt,
240 * but we allow it in case we adjust the
241 * sample period to set a frequency.
242 */
243 return -EINVAL;
244 hwc->sample_period &= ~0x0FULL;
245 if (!hwc->sample_period)
246 hwc->sample_period = 0x10;
247 } else {
248 max_cnt = config & perf_ibs->cnt_mask;
249 config &= ~perf_ibs->cnt_mask;
250 event->attr.sample_period = max_cnt << 4;
251 hwc->sample_period = event->attr.sample_period;
252 }
253
254 if (!hwc->sample_period)
255 return -EINVAL;
256
257 /*
258 * If we modify hwc->sample_period, we also need to update
259 * hwc->last_period and hwc->period_left.
260 */
261 hwc->last_period = hwc->sample_period;
262 local64_set(&hwc->period_left, hwc->sample_period);
263
264 hwc->config_base = perf_ibs->msr;
265 hwc->config = config;
266
25 return 0; 267 return 0;
26} 268}
27 269
270static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
271 struct hw_perf_event *hwc, u64 *period)
272{
273 int overflow;
274
275 /* ignore lower 4 bits in min count: */
276 overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
277 local64_set(&hwc->prev_count, 0);
278
279 return overflow;
280}
281
282static u64 get_ibs_fetch_count(u64 config)
283{
284 return (config & IBS_FETCH_CNT) >> 12;
285}
286
287static u64 get_ibs_op_count(u64 config)
288{
289 u64 count = 0;
290
291 if (config & IBS_OP_VAL)
292 count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
293
294 if (ibs_caps & IBS_CAPS_RDWROPCNT)
295 count += (config & IBS_OP_CUR_CNT) >> 32;
296
297 return count;
298}
299
300static void
301perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
302 u64 *config)
303{
304 u64 count = perf_ibs->get_count(*config);
305
306 /*
307 * Set width to 64 since we do not overflow on max width but
308 * instead on max count. In perf_ibs_set_period() we clear
309 * prev count manually on overflow.
310 */
311 while (!perf_event_try_update(event, count, 64)) {
312 rdmsrl(event->hw.config_base, *config);
313 count = perf_ibs->get_count(*config);
314 }
315}
316
317static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
318 struct hw_perf_event *hwc, u64 config)
319{
320 wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
321}
322
323/*
324 * Erratum #420 Instruction-Based Sampling Engine May Generate
325 * Interrupt that Cannot Be Cleared:
326 *
327 * Must clear counter mask first, then clear the enable bit. See
328 * Revision Guide for AMD Family 10h Processors, Publication #41322.
329 */
330static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
331 struct hw_perf_event *hwc, u64 config)
332{
333 config &= ~perf_ibs->cnt_mask;
334 wrmsrl(hwc->config_base, config);
335 config &= ~perf_ibs->enable_mask;
336 wrmsrl(hwc->config_base, config);
337}
338
339/*
340 * We cannot restore the ibs pmu state, so we always need to update
341 * the event while stopping it and then reset the state when starting
342 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
343 * perf_ibs_start()/perf_ibs_stop() and instead always do the update.
344 */
345static void perf_ibs_start(struct perf_event *event, int flags)
346{
347 struct hw_perf_event *hwc = &event->hw;
348 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
349 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
350 u64 period;
351
352 if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
353 return;
354
355 WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
356 hwc->state = 0;
357
358 perf_ibs_set_period(perf_ibs, hwc, &period);
359 set_bit(IBS_STARTED, pcpu->state);
360 perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
361
362 perf_event_update_userpage(event);
363}
364
365static void perf_ibs_stop(struct perf_event *event, int flags)
366{
367 struct hw_perf_event *hwc = &event->hw;
368 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
369 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
370 u64 config;
371 int stopping;
372
373 stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
374
375 if (!stopping && (hwc->state & PERF_HES_UPTODATE))
376 return;
377
378 rdmsrl(hwc->config_base, config);
379
380 if (stopping) {
381 set_bit(IBS_STOPPING, pcpu->state);
382 perf_ibs_disable_event(perf_ibs, hwc, config);
383 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
384 hwc->state |= PERF_HES_STOPPED;
385 }
386
387 if (hwc->state & PERF_HES_UPTODATE)
388 return;
389
390 /*
391 * Clear valid bit to not count rollovers on update, rollovers
392 * are only updated in the irq handler.
393 */
394 config &= ~perf_ibs->valid_mask;
395
396 perf_ibs_event_update(perf_ibs, event, &config);
397 hwc->state |= PERF_HES_UPTODATE;
398}
399
28static int perf_ibs_add(struct perf_event *event, int flags) 400static int perf_ibs_add(struct perf_event *event, int flags)
29{ 401{
402 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
403 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
404
405 if (test_and_set_bit(IBS_ENABLED, pcpu->state))
406 return -ENOSPC;
407
408 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
409
410 pcpu->event = event;
411
412 if (flags & PERF_EF_START)
413 perf_ibs_start(event, PERF_EF_RELOAD);
414
30 return 0; 415 return 0;
31} 416}
32 417
33static void perf_ibs_del(struct perf_event *event, int flags) 418static void perf_ibs_del(struct perf_event *event, int flags)
34{ 419{
420 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
421 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
422
423 if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
424 return;
425
426 perf_ibs_stop(event, PERF_EF_UPDATE);
427
428 pcpu->event = NULL;
429
430 perf_event_update_userpage(event);
35} 431}
36 432
37static struct pmu perf_ibs = { 433static void perf_ibs_read(struct perf_event *event) { }
38 .event_init= perf_ibs_init, 434
39 .add= perf_ibs_add, 435static struct perf_ibs perf_ibs_fetch = {
40 .del= perf_ibs_del, 436 .pmu = {
437 .task_ctx_nr = perf_invalid_context,
438
439 .event_init = perf_ibs_init,
440 .add = perf_ibs_add,
441 .del = perf_ibs_del,
442 .start = perf_ibs_start,
443 .stop = perf_ibs_stop,
444 .read = perf_ibs_read,
445 },
446 .msr = MSR_AMD64_IBSFETCHCTL,
447 .config_mask = IBS_FETCH_CONFIG_MASK,
448 .cnt_mask = IBS_FETCH_MAX_CNT,
449 .enable_mask = IBS_FETCH_ENABLE,
450 .valid_mask = IBS_FETCH_VAL,
451 .max_period = IBS_FETCH_MAX_CNT << 4,
452 .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
453 .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
454
455 .get_count = get_ibs_fetch_count,
41}; 456};
42 457
458static struct perf_ibs perf_ibs_op = {
459 .pmu = {
460 .task_ctx_nr = perf_invalid_context,
461
462 .event_init = perf_ibs_init,
463 .add = perf_ibs_add,
464 .del = perf_ibs_del,
465 .start = perf_ibs_start,
466 .stop = perf_ibs_stop,
467 .read = perf_ibs_read,
468 },
469 .msr = MSR_AMD64_IBSOPCTL,
470 .config_mask = IBS_OP_CONFIG_MASK,
471 .cnt_mask = IBS_OP_MAX_CNT,
472 .enable_mask = IBS_OP_ENABLE,
473 .valid_mask = IBS_OP_VAL,
474 .max_period = IBS_OP_MAX_CNT << 4,
475 .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
476 .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
477
478 .get_count = get_ibs_op_count,
479};
480
481static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
482{
483 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
484 struct perf_event *event = pcpu->event;
485 struct hw_perf_event *hwc = &event->hw;
486 struct perf_sample_data data;
487 struct perf_raw_record raw;
488 struct pt_regs regs;
489 struct perf_ibs_data ibs_data;
490 int offset, size, check_rip, offset_max, throttle = 0;
491 unsigned int msr;
492 u64 *buf, *config, period;
493
494 if (!test_bit(IBS_STARTED, pcpu->state)) {
495 /*
496 * Catch spurious interrupts after stopping IBS: After
497 * disabling IBS there could still be incoming NMIs
498 * with samples that even have the valid bit cleared.
499 * Mark all these NMIs as handled.
500 */
501 return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
502 }
503
504 msr = hwc->config_base;
505 buf = ibs_data.regs;
506 rdmsrl(msr, *buf);
507 if (!(*buf++ & perf_ibs->valid_mask))
508 return 0;
509
510 config = &ibs_data.regs[0];
511 perf_ibs_event_update(perf_ibs, event, config);
512 perf_sample_data_init(&data, 0, hwc->last_period);
513 if (!perf_ibs_set_period(perf_ibs, hwc, &period))
514 goto out; /* no sw counter overflow */
515
516 ibs_data.caps = ibs_caps;
517 size = 1;
518 offset = 1;
519 check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
520 if (event->attr.sample_type & PERF_SAMPLE_RAW)
521 offset_max = perf_ibs->offset_max;
522 else if (check_rip)
523 offset_max = 2;
524 else
525 offset_max = 1;
526 do {
527 rdmsrl(msr + offset, *buf++);
528 size++;
529 offset = find_next_bit(perf_ibs->offset_mask,
530 perf_ibs->offset_max,
531 offset + 1);
532 } while (offset < offset_max);
533 ibs_data.size = sizeof(u64) * size;
534
535 regs = *iregs;
536 if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
537 regs.flags &= ~PERF_EFLAGS_EXACT;
538 } else {
539 instruction_pointer_set(&regs, ibs_data.regs[1]);
540 regs.flags |= PERF_EFLAGS_EXACT;
541 }
542
543 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
544 raw.size = sizeof(u32) + ibs_data.size;
545 raw.data = ibs_data.data;
546 data.raw = &raw;
547 }
548
549 throttle = perf_event_overflow(event, &data, &regs);
550out:
551 if (throttle)
552 perf_ibs_disable_event(perf_ibs, hwc, *config);
553 else
554 perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
555
556 perf_event_update_userpage(event);
557
558 return 1;
559}
560
561static int __kprobes
562perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
563{
564 int handled = 0;
565
566 handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
567 handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
568
569 if (handled)
570 inc_irq_stat(apic_perf_irqs);
571
572 return handled;
573}
574
575static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
576{
577 struct cpu_perf_ibs __percpu *pcpu;
578 int ret;
579
580 pcpu = alloc_percpu(struct cpu_perf_ibs);
581 if (!pcpu)
582 return -ENOMEM;
583
584 perf_ibs->pcpu = pcpu;
585
586 ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
587 if (ret) {
588 perf_ibs->pcpu = NULL;
589 free_percpu(pcpu);
590 }
591
592 return ret;
593}
594
43static __init int perf_event_ibs_init(void) 595static __init int perf_event_ibs_init(void)
44{ 596{
45 if (!ibs_caps) 597 if (!ibs_caps)
46 return -ENODEV; /* ibs not supported by the cpu */ 598 return -ENODEV; /* ibs not supported by the cpu */
47 599
48 perf_pmu_register(&perf_ibs, "ibs", -1); 600 perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
601 if (ibs_caps & IBS_CAPS_OPCNT)
602 perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
603 perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
604 register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
49 printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); 605 printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
50 606
51 return 0; 607 return 0;
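
The comment block in perf_ibs_precise_event() above shows the perf-record usage. For completeness, a user-space sketch that opens the newly registered ibs_op PMU directly is given below. It assumes the dynamic PMU type is exported at /sys/bus/event_source/devices/ibs_op/type (the generic behaviour for PMUs registered with perf_pmu_register()); the sample period and config value are only examples.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	FILE *f;
	int type, fd;

	/* dynamic type assigned when the ibs_op pmu was registered */
	f = fopen("/sys/bus/event_source/devices/ibs_op/type", "r");
	if (!f) {
		perror("ibs_op PMU not available");
		return 1;
	}
	if (fscanf(f, "%d", &type) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0;		/* op counting in cycle mode (IbsOpCntCtl clear) */
	attr.sample_period = 100000;	/* driver masks off the low 4 bits */

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* ... mmap a ring buffer and consume PERF_RECORD_SAMPLE records ... */
	close(fd);
	return 0;
}
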
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 26b3e2fef10..166546ec6ae 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1027 u64 status; 1027 u64 status;
1028 int handled; 1028 int handled;
1029 1029
1030 perf_sample_data_init(&data, 0);
1031
1032 cpuc = &__get_cpu_var(cpu_hw_events); 1030 cpuc = &__get_cpu_var(cpu_hw_events);
1033 1031
1034 /* 1032 /*
@@ -1082,7 +1080,7 @@ again:
1082 if (!intel_pmu_save_and_restart(event)) 1080 if (!intel_pmu_save_and_restart(event))
1083 continue; 1081 continue;
1084 1082
1085 data.period = event->hw.last_period; 1083 perf_sample_data_init(&data, 0, event->hw.last_period);
1086 1084
1087 if (has_branch_stack(event)) 1085 if (has_branch_stack(event))
1088 data.br_stack = &cpuc->lbr_stack; 1086 data.br_stack = &cpuc->lbr_stack;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 7f64df19e7d..5a3edc27f6e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
316 316
317 ds->bts_index = ds->bts_buffer_base; 317 ds->bts_index = ds->bts_buffer_base;
318 318
319 perf_sample_data_init(&data, 0); 319 perf_sample_data_init(&data, 0, event->hw.last_period);
320 data.period = event->hw.last_period;
321 regs.ip = 0; 320 regs.ip = 0;
322 321
323 /* 322 /*
@@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
564 if (!intel_pmu_save_and_restart(event)) 563 if (!intel_pmu_save_and_restart(event))
565 return; 564 return;
566 565
567 perf_sample_data_init(&data, 0); 566 perf_sample_data_init(&data, 0, event->hw.last_period);
568 data.period = event->hw.last_period;
569 567
570 /* 568 /*
571 * We use the interrupt regs as a base because the PEBS record 569 * We use the interrupt regs as a base because the PEBS record
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a2dfacfd710..47124a73dd7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
1005 int idx, handled = 0; 1005 int idx, handled = 0;
1006 u64 val; 1006 u64 val;
1007 1007
1008 perf_sample_data_init(&data, 0);
1009
1010 cpuc = &__get_cpu_var(cpu_hw_events); 1008 cpuc = &__get_cpu_var(cpu_hw_events);
1011 1009
1012 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1010 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
1034 handled += overflow; 1032 handled += overflow;
1035 1033
1036 /* event overflow for sure */ 1034 /* event overflow for sure */
1037 data.period = event->hw.last_period; 1035 perf_sample_data_init(&data, 0, hwc->last_period);
1038 1036
1039 if (!x86_perf_event_set_period(event)) 1037 if (!x86_perf_event_set_period(event))
1040 continue; 1038 continue;
1039
1040
1041 if (perf_event_overflow(event, &data, regs)) 1041 if (perf_event_overflow(event, &data, regs))
1042 x86_pmu_stop(event, 0); 1042 x86_pmu_stop(event, 0);
1043 } 1043 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1b81839b6c8..571246d81ed 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) 271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
272 return 1; 272 return 1;
273 273
274 show_registers(regs); 274 show_regs(regs);
275#ifdef CONFIG_X86_32 275#ifdef CONFIG_X86_32
276 if (user_mode_vm(regs)) { 276 if (user_mode_vm(regs)) {
277 sp = regs->sp; 277 sp = regs->sp;
@@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)
311 311
312static int __init kstack_setup(char *s) 312static int __init kstack_setup(char *s)
313{ 313{
314 ssize_t ret;
315 unsigned long val;
316
314 if (!s) 317 if (!s)
315 return -EINVAL; 318 return -EINVAL;
316 kstack_depth_to_print = simple_strtoul(s, NULL, 0); 319
320 ret = kstrtoul(s, 0, &val);
321 if (ret)
322 return ret;
323 kstack_depth_to_print = val;
317 return 0; 324 return 0;
318} 325}
319early_param("kstack", kstack_setup); 326early_param("kstack", kstack_setup);
320 327
321static int __init code_bytes_setup(char *s) 328static int __init code_bytes_setup(char *s)
322{ 329{
323 code_bytes = simple_strtoul(s, NULL, 0); 330 ssize_t ret;
331 unsigned long val;
332
333 if (!s)
334 return -EINVAL;
335
336 ret = kstrtoul(s, 0, &val);
337 if (ret)
338 return ret;
339
340 code_bytes = val;
324 if (code_bytes > 8192) 341 if (code_bytes > 8192)
325 code_bytes = 8192; 342 code_bytes = 8192;
326 343
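
Both early_param handlers above move from simple_strtoul(), which silently ignores parse errors and trailing garbage, to kstrtoul(), which reports them. A small in-kernel sketch of the difference (the literal strings are illustrative only):

#include <linux/kernel.h>
#include <linux/init.h>

static int __init kstrtoul_examples(void)
{
	unsigned long val;
	int ret;

	ret = kstrtoul("16", 0, &val);		/* ret == 0, val == 16 */
	ret = kstrtoul("16\n", 0, &val);	/* one trailing newline is tolerated */
	ret = kstrtoul("16foo", 0, &val);	/* ret == -EINVAL, trailing junk rejected */
	ret = kstrtoul("99999999999999999999", 0, &val); /* ret == -ERANGE */

	return ret;
}
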
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 88ec9129271..e0b1d783daa 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
82} 82}
83 83
84 84
85void show_registers(struct pt_regs *regs) 85void show_regs(struct pt_regs *regs)
86{ 86{
87 int i; 87 int i;
88 88
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd6e1f..791b76122aa 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
245 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 245 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
246} 246}
247 247
248void show_registers(struct pt_regs *regs) 248void show_regs(struct pt_regs *regs)
249{ 249{
250 int i; 250 int i;
251 unsigned long sp; 251 unsigned long sp;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976e..41857970517 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
113 int x = e820x->nr_map; 113 int x = e820x->nr_map;
114 114
115 if (x >= ARRAY_SIZE(e820x->map)) { 115 if (x >= ARRAY_SIZE(e820x->map)) {
116 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 116 printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
117 (unsigned long long) start,
118 (unsigned long long) (start + size - 1));
117 return; 119 return;
118 } 120 }
119 121
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
133 switch (type) { 135 switch (type) {
134 case E820_RAM: 136 case E820_RAM:
135 case E820_RESERVED_KERN: 137 case E820_RESERVED_KERN:
136 printk(KERN_CONT "(usable)"); 138 printk(KERN_CONT "usable");
137 break; 139 break;
138 case E820_RESERVED: 140 case E820_RESERVED:
139 printk(KERN_CONT "(reserved)"); 141 printk(KERN_CONT "reserved");
140 break; 142 break;
141 case E820_ACPI: 143 case E820_ACPI:
142 printk(KERN_CONT "(ACPI data)"); 144 printk(KERN_CONT "ACPI data");
143 break; 145 break;
144 case E820_NVS: 146 case E820_NVS:
145 printk(KERN_CONT "(ACPI NVS)"); 147 printk(KERN_CONT "ACPI NVS");
146 break; 148 break;
147 case E820_UNUSABLE: 149 case E820_UNUSABLE:
148 printk(KERN_CONT "(unusable)"); 150 printk(KERN_CONT "unusable");
149 break; 151 break;
150 default: 152 default:
151 printk(KERN_CONT "type %u", type); 153 printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
158 int i; 160 int i;
159 161
160 for (i = 0; i < e820.nr_map; i++) { 162 for (i = 0; i < e820.nr_map; i++) {
161 printk(KERN_INFO " %s: %016Lx - %016Lx ", who, 163 printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
162 (unsigned long long) e820.map[i].addr, 164 (unsigned long long) e820.map[i].addr,
163 (unsigned long long) 165 (unsigned long long)
164 (e820.map[i].addr + e820.map[i].size)); 166 (e820.map[i].addr + e820.map[i].size - 1));
165 e820_print_type(e820.map[i].type); 167 e820_print_type(e820.map[i].type);
166 printk(KERN_CONT "\n"); 168 printk(KERN_CONT "\n");
167 } 169 }
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
428 size = ULLONG_MAX - start; 430 size = ULLONG_MAX - start;
429 431
430 end = start + size; 432 end = start + size;
431 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", 433 printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
432 (unsigned long long) start, 434 (unsigned long long) start, (unsigned long long) (end - 1));
433 (unsigned long long) end);
434 e820_print_type(old_type); 435 e820_print_type(old_type);
435 printk(KERN_CONT " ==> "); 436 printk(KERN_CONT " ==> ");
436 e820_print_type(new_type); 437 e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
509 size = ULLONG_MAX - start; 510 size = ULLONG_MAX - start;
510 511
511 end = start + size; 512 end = start + size;
512 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", 513 printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
513 (unsigned long long) start, 514 (unsigned long long) start, (unsigned long long) (end - 1));
514 (unsigned long long) end);
515 if (checktype) 515 if (checktype)
516 e820_print_type(old_type); 516 e820_print_type(old_type);
517 printk(KERN_CONT "\n"); 517 printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
567 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) 567 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
568 return; 568 return;
569 e820.nr_map = nr_map; 569 e820.nr_map = nr_map;
570 printk(KERN_INFO "modified physical RAM map:\n"); 570 printk(KERN_INFO "e820: modified physical RAM map:\n");
571 e820_print_map("modified"); 571 e820_print_map("modified");
572} 572}
573static void __init update_e820_saved(void) 573static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
637 if (!found) { 637 if (!found) {
638 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 638 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
639 printk(KERN_ERR 639 printk(KERN_ERR
640 "PCI: Warning: Cannot find a gap in the 32bit address range\n" 640 "e820: cannot find a gap in the 32bit address range\n"
641 "PCI: Unassigned devices with 32bit resource registers may break!\n"); 641 "e820: PCI devices with unassigned 32bit BARs may break!\n");
642 } 642 }
643#endif 643#endif
644 644
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
648 pci_mem_start = gapstart; 648 pci_mem_start = gapstart;
649 649
650 printk(KERN_INFO 650 printk(KERN_INFO
651 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 651 "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
652 pci_mem_start, gapstart, gapsize); 652 gapstart, gapstart + gapsize - 1);
653} 653}
654 654
655/** 655/**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
667 extmap = (struct e820entry *)(sdata->data); 667 extmap = (struct e820entry *)(sdata->data);
668 __append_e820_map(extmap, entries); 668 __append_e820_map(extmap, entries);
669 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 669 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
670 printk(KERN_INFO "extended physical RAM map:\n"); 670 printk(KERN_INFO "e820: extended physical RAM map:\n");
671 e820_print_map("extended"); 671 e820_print_map("extended");
672} 672}
673 673
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
734 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 734 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
735 if (addr) { 735 if (addr) {
736 e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); 736 e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
737 printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); 737 printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
738 update_e820_saved(); 738 update_e820_saved();
739 } 739 }
740 740
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
784 if (last_pfn > max_arch_pfn) 784 if (last_pfn > max_arch_pfn)
785 last_pfn = max_arch_pfn; 785 last_pfn = max_arch_pfn;
786 786
787 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", 787 printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
788 last_pfn, max_arch_pfn); 788 last_pfn, max_arch_pfn);
789 return last_pfn; 789 return last_pfn;
790} 790}
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
888 early_panic("Invalid user supplied memory map"); 888 early_panic("Invalid user supplied memory map");
889 e820.nr_map = nr; 889 e820.nr_map = nr;
890 890
891 printk(KERN_INFO "user-defined physical RAM map:\n"); 891 printk(KERN_INFO "e820: user-defined physical RAM map:\n");
892 e820_print_map("user"); 892 e820_print_map("user");
893 } 893 }
894} 894}
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
996 end = MAX_RESOURCE_SIZE; 996 end = MAX_RESOURCE_SIZE;
997 if (start >= end) 997 if (start >= end)
998 continue; 998 continue;
999 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", 999 printk(KERN_DEBUG
1000 start, end); 1000 "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
1001 start, end);
1001 reserve_region_with_split(&iomem_resource, start, end, 1002 reserve_region_with_split(&iomem_resource, start, end,
1002 "RAM buffer"); 1003 "RAM buffer");
1003 } 1004 }
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
1047 1048
1048 who = x86_init.resources.memory_setup(); 1049 who = x86_init.resources.memory_setup();
1049 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1050 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1050 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1051 printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
1051 e820_print_map(who); 1052 e820_print_map(who);
1052} 1053}
1053 1054
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b784f4ef1e..01ccf9b7147 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -56,6 +56,7 @@
56#include <asm/irq_vectors.h> 56#include <asm/irq_vectors.h>
57#include <asm/cpufeature.h> 57#include <asm/cpufeature.h>
58#include <asm/alternative-asm.h> 58#include <asm/alternative-asm.h>
59#include <asm/asm.h>
59 60
60/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
61#include <linux/elf-em.h> 62#include <linux/elf-em.h>
@@ -151,10 +152,8 @@
151.pushsection .fixup, "ax" 152.pushsection .fixup, "ax"
15299: movl $0, (%esp) 15399: movl $0, (%esp)
153 jmp 98b 154 jmp 98b
154.section __ex_table, "a"
155 .align 4
156 .long 98b, 99b
157.popsection 155.popsection
156 _ASM_EXTABLE(98b,99b)
158.endm 157.endm
159 158
160.macro PTGS_TO_GS 159.macro PTGS_TO_GS
@@ -164,10 +163,8 @@
164.pushsection .fixup, "ax" 163.pushsection .fixup, "ax"
16599: movl $0, PT_GS(%esp) 16499: movl $0, PT_GS(%esp)
166 jmp 98b 165 jmp 98b
167.section __ex_table, "a"
168 .align 4
169 .long 98b, 99b
170.popsection 166.popsection
167 _ASM_EXTABLE(98b,99b)
171.endm 168.endm
172 169
173.macro GS_TO_REG reg 170.macro GS_TO_REG reg
@@ -249,12 +246,10 @@
249 jmp 2b 246 jmp 2b
2506: movl $0, (%esp) 2476: movl $0, (%esp)
251 jmp 3b 248 jmp 3b
252.section __ex_table, "a"
253 .align 4
254 .long 1b, 4b
255 .long 2b, 5b
256 .long 3b, 6b
257.popsection 249.popsection
250 _ASM_EXTABLE(1b,4b)
251 _ASM_EXTABLE(2b,5b)
252 _ASM_EXTABLE(3b,6b)
258 POP_GS_EX 253 POP_GS_EX
259.endm 254.endm
260 255
@@ -415,10 +410,7 @@ sysenter_past_esp:
415 jae syscall_fault 410 jae syscall_fault
4161: movl (%ebp),%ebp 4111: movl (%ebp),%ebp
417 movl %ebp,PT_EBP(%esp) 412 movl %ebp,PT_EBP(%esp)
418.section __ex_table,"a" 413 _ASM_EXTABLE(1b,syscall_fault)
419 .align 4
420 .long 1b,syscall_fault
421.previous
422 414
423 GET_THREAD_INFO(%ebp) 415 GET_THREAD_INFO(%ebp)
424 416
@@ -485,10 +477,8 @@ sysexit_audit:
485.pushsection .fixup,"ax" 477.pushsection .fixup,"ax"
4862: movl $0,PT_FS(%esp) 4782: movl $0,PT_FS(%esp)
487 jmp 1b 479 jmp 1b
488.section __ex_table,"a"
489 .align 4
490 .long 1b,2b
491.popsection 480.popsection
481 _ASM_EXTABLE(1b,2b)
492 PTGS_TO_GS_EX 482 PTGS_TO_GS_EX
493ENDPROC(ia32_sysenter_target) 483ENDPROC(ia32_sysenter_target)
494 484
@@ -543,10 +533,7 @@ ENTRY(iret_exc)
543 pushl $do_iret_error 533 pushl $do_iret_error
544 jmp error_code 534 jmp error_code
545.previous 535.previous
546.section __ex_table,"a" 536 _ASM_EXTABLE(irq_return,iret_exc)
547 .align 4
548 .long irq_return,iret_exc
549.previous
550 537
551 CFI_RESTORE_STATE 538 CFI_RESTORE_STATE
552ldt_ss: 539ldt_ss:
@@ -901,10 +888,7 @@ END(device_not_available)
901#ifdef CONFIG_PARAVIRT 888#ifdef CONFIG_PARAVIRT
902ENTRY(native_iret) 889ENTRY(native_iret)
903 iret 890 iret
904.section __ex_table,"a" 891 _ASM_EXTABLE(native_iret, iret_exc)
905 .align 4
906 .long native_iret, iret_exc
907.previous
908END(native_iret) 892END(native_iret)
909 893
910ENTRY(native_irq_enable_sysexit) 894ENTRY(native_irq_enable_sysexit)
@@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback)
1093 movl %eax,16(%esp) 1077 movl %eax,16(%esp)
1094 jmp 4b 1078 jmp 4b
1095.previous 1079.previous
1096.section __ex_table,"a" 1080 _ASM_EXTABLE(1b,6b)
1097 .align 4 1081 _ASM_EXTABLE(2b,7b)
1098 .long 1b,6b 1082 _ASM_EXTABLE(3b,8b)
1099 .long 2b,7b 1083 _ASM_EXTABLE(4b,9b)
1100 .long 3b,8b
1101 .long 4b,9b
1102.previous
1103ENDPROC(xen_failsafe_callback) 1084ENDPROC(xen_failsafe_callback)
1104 1085
1105BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, 1086BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cdc79b5cfcd..320852d0202 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
55#include <asm/paravirt.h> 55#include <asm/paravirt.h>
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <asm/asm.h>
58#include <linux/err.h> 59#include <linux/err.h>
59 60
60/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -900,18 +901,12 @@ restore_args:
900 901
901irq_return: 902irq_return:
902 INTERRUPT_RETURN 903 INTERRUPT_RETURN
903 904 _ASM_EXTABLE(irq_return, bad_iret)
904 .section __ex_table, "a"
905 .quad irq_return, bad_iret
906 .previous
907 905
908#ifdef CONFIG_PARAVIRT 906#ifdef CONFIG_PARAVIRT
909ENTRY(native_iret) 907ENTRY(native_iret)
910 iretq 908 iretq
911 909 _ASM_EXTABLE(native_iret, bad_iret)
912 .section __ex_table,"a"
913 .quad native_iret, bad_iret
914 .previous
915#endif 910#endif
916 911
917 .section .fixup,"ax" 912 .section .fixup,"ax"
@@ -1181,10 +1176,7 @@ gs_change:
1181 CFI_ENDPROC 1176 CFI_ENDPROC
1182END(native_load_gs_index) 1177END(native_load_gs_index)
1183 1178
1184 .section __ex_table,"a" 1179 _ASM_EXTABLE(gs_change,bad_gs)
1185 .align 8
1186 .quad gs_change,bad_gs
1187 .previous
1188 .section .fixup,"ax" 1180 .section .fixup,"ax"
1189 /* running with kernelgs */ 1181 /* running with kernelgs */
1190bad_gs: 1182bad_gs:
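
Both entry_32.S and entry_64.S now emit their exception-table entries through _ASM_EXTABLE() from <asm/asm.h> instead of open-coding .section __ex_table runs, so the entry layout is defined in a single place. The macro also has a C-string form for inline assembly; the sketch below (a simplified __get_user-style fixup, not code from this patch) shows the same pairing of a possibly-faulting instruction with its fixup label:

#include <linux/errno.h>
#include <asm/asm.h>

/* Sketch only: on a fault at label 1, jump to the fixup at label 3. */
static inline int read_int_with_fixup(const int *p, int *val)
{
	int err = 0;

	asm volatile("1:	movl %2, %1\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	movl %3, %0\n"
		     "	jmp 2b\n"
		     ".previous\n"
		     _ASM_EXTABLE(1b, 3b)
		     : "+r" (err), "=r" (*val)	/* *val is unspecified on fault */
		     : "m" (*p), "i" (-EFAULT));

	return err;
}
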
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f272f..32ff36596ab 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,40 +24,21 @@
24#include <trace/syscall.h> 24#include <trace/syscall.h>
25 25
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/kprobes.h>
27#include <asm/ftrace.h> 28#include <asm/ftrace.h>
28#include <asm/nops.h> 29#include <asm/nops.h>
29#include <asm/nmi.h>
30
31 30
32#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
33 32
34/*
35 * modifying_code is set to notify NMIs that they need to use
36 * memory barriers when entering or exiting. But we don't want
37 * to burden NMIs with unnecessary memory barriers when code
38 * modification is not being done (which is most of the time).
39 *
40 * A mutex is already held when ftrace_arch_code_modify_prepare
41 * and post_process are called. No locks need to be taken here.
42 *
43 * Stop machine will make sure currently running NMIs are done
44 * and new NMIs will see the updated variable before we need
45 * to worry about NMIs doing memory barriers.
46 */
47static int modifying_code __read_mostly;
48static DEFINE_PER_CPU(int, save_modifying_code);
49
50int ftrace_arch_code_modify_prepare(void) 33int ftrace_arch_code_modify_prepare(void)
51{ 34{
52 set_kernel_text_rw(); 35 set_kernel_text_rw();
53 set_all_modules_text_rw(); 36 set_all_modules_text_rw();
54 modifying_code = 1;
55 return 0; 37 return 0;
56} 38}
57 39
58int ftrace_arch_code_modify_post_process(void) 40int ftrace_arch_code_modify_post_process(void)
59{ 41{
60 modifying_code = 0;
61 set_all_modules_text_ro(); 42 set_all_modules_text_ro();
62 set_kernel_text_ro(); 43 set_kernel_text_ro();
63 return 0; 44 return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
90 return calc.code; 71 return calc.code;
91} 72}
92 73
93/*
94 * Modifying code must take extra care. On an SMP machine, if
95 * the code being modified is also being executed on another CPU
96 * that CPU will have undefined results and possibly take a GPF.
97 * We use kstop_machine to stop other CPUs from executing code.
98 * But this does not stop NMIs from happening. We still need
99 * to protect against that. We separate out the modification of
100 * the code to take care of this.
101 *
102 * Two buffers are added: An IP buffer and a "code" buffer.
103 *
104 * 1) Put the instruction pointer into the IP buffer
105 * and the new code into the "code" buffer.
106 * 2) Wait for any running NMIs to finish and set a flag that says
107 * we are modifying code, it is done in an atomic operation.
108 * 3) Write the code
109 * 4) clear the flag.
110 * 5) Wait for any running NMIs to finish.
111 *
112 * If an NMI is executed, the first thing it does is to call
113 * "ftrace_nmi_enter". This will check if the flag is set to write
114 * and if it is, it will write what is in the IP and "code" buffers.
115 *
116 * The trick is, it does not matter if everyone is writing the same
117 * content to the code location. Also, if a CPU is executing code
118 * it is OK to write to that code location if the contents being written
119 * are the same as what exists.
120 */
121
122#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
123static atomic_t nmi_running = ATOMIC_INIT(0);
124static int mod_code_status; /* holds return value of text write */
125static void *mod_code_ip; /* holds the IP to write to */
126static const void *mod_code_newcode; /* holds the text to write to the IP */
127
128static unsigned nmi_wait_count;
129static atomic_t nmi_update_count = ATOMIC_INIT(0);
130
131int ftrace_arch_read_dyn_info(char *buf, int size)
132{
133 int r;
134
135 r = snprintf(buf, size, "%u %u",
136 nmi_wait_count,
137 atomic_read(&nmi_update_count));
138 return r;
139}
140
141static void clear_mod_flag(void)
142{
143 int old = atomic_read(&nmi_running);
144
145 for (;;) {
146 int new = old & ~MOD_CODE_WRITE_FLAG;
147
148 if (old == new)
149 break;
150
151 old = atomic_cmpxchg(&nmi_running, old, new);
152 }
153}
154
155static void ftrace_mod_code(void)
156{
157 /*
158 * Yes, more than one CPU process can be writing to mod_code_status.
159 * (and the code itself)
160 * But if one were to fail, then they all should, and if one were
161 * to succeed, then they all should.
162 */
163 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
164 MCOUNT_INSN_SIZE);
165
166 /* if we fail, then kill any new writers */
167 if (mod_code_status)
168 clear_mod_flag();
169}
170
171void ftrace_nmi_enter(void)
172{
173 __this_cpu_write(save_modifying_code, modifying_code);
174
175 if (!__this_cpu_read(save_modifying_code))
176 return;
177
178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
179 smp_rmb();
180 ftrace_mod_code();
181 atomic_inc(&nmi_update_count);
182 }
183 /* Must have previous changes seen before executions */
184 smp_mb();
185}
186
187void ftrace_nmi_exit(void)
188{
189 if (!__this_cpu_read(save_modifying_code))
190 return;
191
192 /* Finish all executions before clearing nmi_running */
193 smp_mb();
194 atomic_dec(&nmi_running);
195}
196
197static void wait_for_nmi_and_set_mod_flag(void)
198{
199 if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
200 return;
201
202 do {
203 cpu_relax();
204 } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
205
206 nmi_wait_count++;
207}
208
209static void wait_for_nmi(void)
210{
211 if (!atomic_read(&nmi_running))
212 return;
213
214 do {
215 cpu_relax();
216 } while (atomic_read(&nmi_running));
217
218 nmi_wait_count++;
219}
220
221static inline int 74static inline int
222within(unsigned long addr, unsigned long start, unsigned long end) 75within(unsigned long addr, unsigned long start, unsigned long end)
223{ 76{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
238 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
239 ip = (unsigned long)__va(__pa(ip)); 92 ip = (unsigned long)__va(__pa(ip));
240 93
241 mod_code_ip = (void *)ip; 94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
242 mod_code_newcode = new_code;
243
244 /* The buffers need to be visible before we let NMIs write them */
245 smp_mb();
246
247 wait_for_nmi_and_set_mod_flag();
248
249 /* Make sure all running NMIs have finished before we write the code */
250 smp_mb();
251
252 ftrace_mod_code();
253
254 /* Make sure the write happens before clearing the bit */
255 smp_mb();
256
257 clear_mod_flag();
258 wait_for_nmi();
259
260 return mod_code_status;
261} 95}
262 96
263static const unsigned char *ftrace_nop_replace(void) 97static const unsigned char *ftrace_nop_replace(void)
@@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
334 return ret; 168 return ret;
335} 169}
336 170
171int modifying_ftrace_code __read_mostly;
172
173/*
174 * A breakpoint was added to the code address we are about to
175 * modify, and this is the handler that will just skip over it.
176 * We are either changing a nop into a trace call, or a trace
177 * call to a nop. While the change is taking place, we treat
178 * it just like it was a nop.
179 */
180int ftrace_int3_handler(struct pt_regs *regs)
181{
182 if (WARN_ON_ONCE(!regs))
183 return 0;
184
185 if (!ftrace_location(regs->ip - 1))
186 return 0;
187
188 regs->ip += MCOUNT_INSN_SIZE - 1;
189
190 return 1;
191}
192
193static int ftrace_write(unsigned long ip, const char *val, int size)
194{
195 /*
196 * On x86_64, kernel text mappings are mapped read-only with
197 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
198 * of the kernel text mapping to modify the kernel text.
199 *
200 * For 32bit kernels, these mappings are the same and we can use
201 * kernel identity mapping to modify code.
202 */
203 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
204 ip = (unsigned long)__va(__pa(ip));
205
206 return probe_kernel_write((void *)ip, val, size);
207}
208
209static int add_break(unsigned long ip, const char *old)
210{
211 unsigned char replaced[MCOUNT_INSN_SIZE];
212 unsigned char brk = BREAKPOINT_INSTRUCTION;
213
214 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
215 return -EFAULT;
216
217 /* Make sure it is what we expect it to be */
218 if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
219 return -EINVAL;
220
221 if (ftrace_write(ip, &brk, 1))
222 return -EPERM;
223
224 return 0;
225}
226
227static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
228{
229 unsigned const char *old;
230 unsigned long ip = rec->ip;
231
232 old = ftrace_call_replace(ip, addr);
233
234 return add_break(rec->ip, old);
235}
236
237
238static int add_brk_on_nop(struct dyn_ftrace *rec)
239{
240 unsigned const char *old;
241
242 old = ftrace_nop_replace();
243
244 return add_break(rec->ip, old);
245}
246
247static int add_breakpoints(struct dyn_ftrace *rec, int enable)
248{
249 unsigned long ftrace_addr;
250 int ret;
251
252 ret = ftrace_test_record(rec, enable);
253
254 ftrace_addr = (unsigned long)FTRACE_ADDR;
255
256 switch (ret) {
257 case FTRACE_UPDATE_IGNORE:
258 return 0;
259
260 case FTRACE_UPDATE_MAKE_CALL:
261 /* converting nop to call */
262 return add_brk_on_nop(rec);
263
264 case FTRACE_UPDATE_MAKE_NOP:
265 /* converting a call to a nop */
266 return add_brk_on_call(rec, ftrace_addr);
267 }
268 return 0;
269}
270
271/*
272 * On error, we need to remove breakpoints. This needs to
273 * be done carefully. If the address does not currently have a
274 * breakpoint, we know we are done. Otherwise, we look at the
275 * remaining 4 bytes of the instruction. If it matches a nop
276 * we replace the breakpoint with the nop. Otherwise we replace
277 * it with the call instruction.
278 */
279static int remove_breakpoint(struct dyn_ftrace *rec)
280{
281 unsigned char ins[MCOUNT_INSN_SIZE];
282 unsigned char brk = BREAKPOINT_INSTRUCTION;
283 const unsigned char *nop;
284 unsigned long ftrace_addr;
285 unsigned long ip = rec->ip;
286
287 /* If we fail the read, just give up */
288 if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
289 return -EFAULT;
290
291 /* If this does not have a breakpoint, we are done */
292 if (ins[0] != brk)
293 return -1;
294
295 nop = ftrace_nop_replace();
296
297 /*
298 * If the last 4 bytes of the instruction do not match
299 * a nop, then we assume that this is a call to ftrace_addr.
300 */
301 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
302 /*
303 * For extra paranoia, we check if the breakpoint is on
304 * a call that would actually jump to the ftrace_addr.
305 * If not, don't touch the breakpoint; we may just create
306 * a disaster.
307 */
308 ftrace_addr = (unsigned long)FTRACE_ADDR;
309 nop = ftrace_call_replace(ip, ftrace_addr);
310
311 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
312 return -EINVAL;
313 }
314
315 return probe_kernel_write((void *)ip, &nop[0], 1);
316}
317
318static int add_update_code(unsigned long ip, unsigned const char *new)
319{
320 /* skip breakpoint */
321 ip++;
322 new++;
323 if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
324 return -EPERM;
325 return 0;
326}
327
328static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
329{
330 unsigned long ip = rec->ip;
331 unsigned const char *new;
332
333 new = ftrace_call_replace(ip, addr);
334 return add_update_code(ip, new);
335}
336
337static int add_update_nop(struct dyn_ftrace *rec)
338{
339 unsigned long ip = rec->ip;
340 unsigned const char *new;
341
342 new = ftrace_nop_replace();
343 return add_update_code(ip, new);
344}
345
346static int add_update(struct dyn_ftrace *rec, int enable)
347{
348 unsigned long ftrace_addr;
349 int ret;
350
351 ret = ftrace_test_record(rec, enable);
352
353 ftrace_addr = (unsigned long)FTRACE_ADDR;
354
355 switch (ret) {
356 case FTRACE_UPDATE_IGNORE:
357 return 0;
358
359 case FTRACE_UPDATE_MAKE_CALL:
360 /* converting nop to call */
361 return add_update_call(rec, ftrace_addr);
362
363 case FTRACE_UPDATE_MAKE_NOP:
364 /* converting a call to a nop */
365 return add_update_nop(rec);
366 }
367
368 return 0;
369}
370
371static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
372{
373 unsigned long ip = rec->ip;
374 unsigned const char *new;
375
376 new = ftrace_call_replace(ip, addr);
377
378 if (ftrace_write(ip, new, 1))
379 return -EPERM;
380
381 return 0;
382}
383
384static int finish_update_nop(struct dyn_ftrace *rec)
385{
386 unsigned long ip = rec->ip;
387 unsigned const char *new;
388
389 new = ftrace_nop_replace();
390
391 if (ftrace_write(ip, new, 1))
392 return -EPERM;
393 return 0;
394}
395
396static int finish_update(struct dyn_ftrace *rec, int enable)
397{
398 unsigned long ftrace_addr;
399 int ret;
400
401 ret = ftrace_update_record(rec, enable);
402
403 ftrace_addr = (unsigned long)FTRACE_ADDR;
404
405 switch (ret) {
406 case FTRACE_UPDATE_IGNORE:
407 return 0;
408
409 case FTRACE_UPDATE_MAKE_CALL:
410 /* converting nop to call */
411 return finish_update_call(rec, ftrace_addr);
412
413 case FTRACE_UPDATE_MAKE_NOP:
414 /* converting a call to a nop */
415 return finish_update_nop(rec);
416 }
417
418 return 0;
419}
420
421static void do_sync_core(void *data)
422{
423 sync_core();
424}
425
426static void run_sync(void)
427{
428 int enable_irqs = irqs_disabled();
429
430 /* We may be called with interrupts disabled (on bootup). */
431 if (enable_irqs)
432 local_irq_enable();
433 on_each_cpu(do_sync_core, NULL, 1);
434 if (enable_irqs)
435 local_irq_disable();
436}
437
438void ftrace_replace_code(int enable)
439{
440 struct ftrace_rec_iter *iter;
441 struct dyn_ftrace *rec;
442 const char *report = "adding breakpoints";
443 int count = 0;
444 int ret;
445
446 for_ftrace_rec_iter(iter) {
447 rec = ftrace_rec_iter_record(iter);
448
449 ret = add_breakpoints(rec, enable);
450 if (ret)
451 goto remove_breakpoints;
452 count++;
453 }
454
455 run_sync();
456
457 report = "updating code";
458
459 for_ftrace_rec_iter(iter) {
460 rec = ftrace_rec_iter_record(iter);
461
462 ret = add_update(rec, enable);
463 if (ret)
464 goto remove_breakpoints;
465 }
466
467 run_sync();
468
469 report = "removing breakpoints";
470
471 for_ftrace_rec_iter(iter) {
472 rec = ftrace_rec_iter_record(iter);
473
474 ret = finish_update(rec, enable);
475 if (ret)
476 goto remove_breakpoints;
477 }
478
479 run_sync();
480
481 return;
482
483 remove_breakpoints:
484 ftrace_bug(ret, rec ? rec->ip : 0);
485 printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
486 for_ftrace_rec_iter(iter) {
487 rec = ftrace_rec_iter_record(iter);
488 remove_breakpoint(rec);
489 }
490}
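Editorial note: to make the three passes above concrete, here is a hedged timeline for one record being converted from a nop to a call, with run_sync() between stages. The concrete byte values (0f 1f 44 00 00 as a 5-byte nop, e8 as a near call opcode, cc as int3, aa bb cc dd as a stand-in rel32) are this reader's assumptions, not something the hunk spells out:

/*
 * Illustrative timeline for a single 5-byte mcount site (nop -> call):
 *
 *   initial             0f 1f 44 00 00    5-byte nop
 *   add_breakpoints()   cc 1f 44 00 00    byte 0 becomes int3; run_sync()
 *   add_update()        cc aa bb cc dd    bytes 1-4 get the new rel32; run_sync()
 *   finish_update()     e8 aa bb cc dd    byte 0 becomes the call opcode; run_sync()
 *
 * No CPU can ever see a half-formed instruction: while byte 0 is 0xcc,
 * any CPU that hits the site traps into ftrace_int3_handler() and skips
 * the whole thing, and the sync_core() IPIs make sure every CPU has
 * serialized before the next byte pattern is exposed.
 */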
491
492void arch_ftrace_update_code(int command)
493{
494 modifying_ftrace_code++;
495
496 ftrace_modify_all_code(command);
497
498 modifying_ftrace_code--;
499}
500
337int __init ftrace_dyn_arch_init(void *data) 501int __init ftrace_dyn_arch_init(void *data)
338{ 502{
339 /* The return code is returned via data */ 503 /* The return code is returned via data */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a3c2b4ffebc..d42ab17b739 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,6 +21,7 @@
21#include <asm/msr-index.h> 21#include <asm/msr-index.h>
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h>
24 25
25/* Physical address */ 26/* Physical address */
26#define pa(X) ((X) - __PAGE_OFFSET) 27#define pa(X) ((X) - __PAGE_OFFSET)
@@ -360,28 +361,23 @@ default_entry:
360 pushl $0 361 pushl $0
361 popfl 362 popfl
362 363
363#ifdef CONFIG_SMP
364 cmpb $0, ready
365 jnz checkCPUtype
366#endif /* CONFIG_SMP */
367
368/* 364/*
369 * start system 32-bit setup. We need to re-do some of the things done 365 * start system 32-bit setup. We need to re-do some of the things done
370 * in 16-bit mode for the "real" operations. 366 * in 16-bit mode for the "real" operations.
371 */ 367 */
372 call setup_idt 368 movl setup_once_ref,%eax
373 369 andl %eax,%eax
374checkCPUtype: 370 jz 1f # Did we do this already?
375 371 call *%eax
376 movl $-1,X86_CPUID # -1 for no CPUID initially 3721:
377 373
378/* check if it is 486 or 386. */ 374/* check if it is 486 or 386. */
379/* 375/*
380 * XXX - this does a lot of unnecessary setup. Alignment checks don't 376 * XXX - this does a lot of unnecessary setup. Alignment checks don't
381 * apply at our cpl of 0 and the stack ought to be aligned already, and 377 * apply at our cpl of 0 and the stack ought to be aligned already, and
382 * we don't need to preserve eflags. 378 * we don't need to preserve eflags.
383 */ 379 */
384 380 movl $-1,X86_CPUID # -1 for no CPUID initially
385 movb $3,X86 # at least 386 381 movb $3,X86 # at least 386
386 pushfl # push EFLAGS 382 pushfl # push EFLAGS
387 popl %eax # get EFLAGS 383 popl %eax # get EFLAGS
@@ -447,21 +443,6 @@ is386: movl $2,%ecx # set MP
447 movl $(__KERNEL_PERCPU), %eax 443 movl $(__KERNEL_PERCPU), %eax
448 movl %eax,%fs # set this cpu's percpu 444 movl %eax,%fs # set this cpu's percpu
449 445
450#ifdef CONFIG_CC_STACKPROTECTOR
451 /*
452 * The linker can't handle this by relocation. Manually set
453 * base address in stack canary segment descriptor.
454 */
455 cmpb $0,ready
456 jne 1f
457 movl $gdt_page,%eax
458 movl $stack_canary,%ecx
459 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
460 shrl $16, %ecx
461 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
462 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
4631:
464#endif
465 movl $(__KERNEL_STACK_CANARY),%eax 446 movl $(__KERNEL_STACK_CANARY),%eax
466 movl %eax,%gs 447 movl %eax,%gs
467 448
@@ -470,7 +451,6 @@ is386: movl $2,%ecx # set MP
470 451
471 cld # gcc2 wants the direction flag cleared at all times 452 cld # gcc2 wants the direction flag cleared at all times
472 pushl $0 # fake return address for unwinder 453 pushl $0 # fake return address for unwinder
473 movb $1, ready
474 jmp *(initial_code) 454 jmp *(initial_code)
475 455
476/* 456/*
@@ -492,81 +472,122 @@ check_x87:
492 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ 472 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
493 ret 473 ret
494 474
475
476#include "verify_cpu.S"
477
495/* 478/*
496 * setup_idt 479 * setup_once
497 * 480 *
498 * sets up a idt with 256 entries pointing to 481 * The setup work we only want to run on the BSP.
499 * ignore_int, interrupt gates. It doesn't actually load
500 * idt - that can be done only after paging has been enabled
501 * and the kernel moved to PAGE_OFFSET. Interrupts
502 * are enabled elsewhere, when we can be relatively
503 * sure everything is ok.
504 * 482 *
505 * Warning: %esi is live across this function. 483 * Warning: %esi is live across this function.
506 */ 484 */
507setup_idt: 485__INIT
508 lea ignore_int,%edx 486setup_once:
509 movl $(__KERNEL_CS << 16),%eax 487 /*
510 movw %dx,%ax /* selector = 0x0010 = cs */ 488 * Set up an idt with 256 entries pointing to ignore_int,
511 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ 489 * interrupt gates. It doesn't actually load idt - that needs
490 * to be done on each CPU. Interrupts are enabled elsewhere,
491 * when we can be relatively sure everything is ok.
492 */
512 493
513 lea idt_table,%edi 494 movl $idt_table,%edi
514 mov $256,%ecx 495 movl $early_idt_handlers,%eax
515rp_sidt: 496 movl $NUM_EXCEPTION_VECTORS,%ecx
4971:
516 movl %eax,(%edi) 498 movl %eax,(%edi)
517 movl %edx,4(%edi) 499 movl %eax,4(%edi)
500 /* interrupt gate, dpl=0, present */
501 movl $(0x8E000000 + __KERNEL_CS),2(%edi)
502 addl $9,%eax
518 addl $8,%edi 503 addl $8,%edi
519 dec %ecx 504 loop 1b
520 jne rp_sidt
521 505
522.macro set_early_handler handler,trapno 506 movl $256 - NUM_EXCEPTION_VECTORS,%ecx
523 lea \handler,%edx 507 movl $ignore_int,%edx
524 movl $(__KERNEL_CS << 16),%eax 508 movl $(__KERNEL_CS << 16),%eax
525 movw %dx,%ax 509 movw %dx,%ax /* selector = 0x0010 = cs */
526 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ 510 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
527 lea idt_table,%edi 5112:
528 movl %eax,8*\trapno(%edi) 512 movl %eax,(%edi)
529 movl %edx,8*\trapno+4(%edi) 513 movl %edx,4(%edi)
530.endm 514 addl $8,%edi
515 loop 2b
531 516
532 set_early_handler handler=early_divide_err,trapno=0 517#ifdef CONFIG_CC_STACKPROTECTOR
533 set_early_handler handler=early_illegal_opcode,trapno=6 518 /*
534 set_early_handler handler=early_protection_fault,trapno=13 519 * Configure the stack canary. The linker can't handle this by
535 set_early_handler handler=early_page_fault,trapno=14 520 * relocation. Manually set base address in stack canary
521 * segment descriptor.
522 */
523 movl $gdt_page,%eax
524 movl $stack_canary,%ecx
525 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
526 shrl $16, %ecx
527 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
528 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
529#endif
536 530
531 andl $0,setup_once_ref /* Once is enough, thanks */
537 ret 532 ret
538 533
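Editorial note: two pieces of arithmetic in setup_once are easy to miss, so a short annotated sketch may help. The byte layout is the standard 32-bit interrupt-gate format and the stub size follows from the instructions emitted in early_idt_handlers just below; treat the numbers as a reader's reconstruction rather than something the patch states:

/*
 * (a) Gate packing: each 8-byte IDT entry is built with three stores.
 *
 *     movl %eax,(%edi)   bytes 0-3 = handler addr -> 0-1 hold offset[15:0]
 *     movl %eax,4(%edi)  bytes 4-7 = handler addr -> 6-7 hold offset[31:16]
 *     movl $(0x8E000000 + __KERNEL_CS),2(%edi)
 *                        bytes 2-5 -> 2-3 = __KERNEL_CS, 4 = 0, 5 = 0x8E
 *                        (present, DPL 0, 32-bit interrupt gate)
 *
 *     The "wrong" halves left by the first two stores are exactly the
 *     four bytes the third store overwrites.
 *
 * (b) Stub stride: "addl $9,%eax" steps through early_idt_handlers in
 *     9-byte stubs: 2 bytes (pushl $0 or ASM_NOP2) + 2 bytes (pushl $i,
 *     an imm8 push since i < 32) + 5 bytes (non-relaxed jmp rel32) = 9.
 *     Keeping early_idt_handler global stops gas from shrinking the jmp
 *     and breaking that stride.
 */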
539early_divide_err: 534ENTRY(early_idt_handlers)
540 xor %edx,%edx 535 # 36(%esp) %eflags
541 pushl $0 /* fake errcode */ 536 # 32(%esp) %cs
542 jmp early_fault 537 # 28(%esp) %eip
538 # 24(%esp) error code
539 i = 0
540 .rept NUM_EXCEPTION_VECTORS
541 .if (EXCEPTION_ERRCODE_MASK >> i) & 1
542 ASM_NOP2
543 .else
544 pushl $0 # Dummy error code, to make stack frame uniform
545 .endif
546 pushl $i # 20(%esp) Vector number
547 jmp early_idt_handler
548 i = i + 1
549 .endr
550ENDPROC(early_idt_handlers)
551
552 /* This is global to keep gas from relaxing the jumps */
553ENTRY(early_idt_handler)
554 cld
555 cmpl $2,%ss:early_recursion_flag
556 je hlt_loop
557 incl %ss:early_recursion_flag
543 558
544early_illegal_opcode: 559 push %eax # 16(%esp)
545 movl $6,%edx 560 push %ecx # 12(%esp)
546 pushl $0 /* fake errcode */ 561 push %edx # 8(%esp)
547 jmp early_fault 562 push %ds # 4(%esp)
563 push %es # 0(%esp)
564 movl $(__KERNEL_DS),%eax
565 movl %eax,%ds
566 movl %eax,%es
548 567
549early_protection_fault: 568 cmpl $(__KERNEL_CS),32(%esp)
550 movl $13,%edx 569 jne 10f
551 jmp early_fault
552 570
553early_page_fault: 571 leal 28(%esp),%eax # Pointer to %eip
554 movl $14,%edx 572 call early_fixup_exception
555 jmp early_fault 573 andl %eax,%eax
574 jnz ex_entry /* found an exception entry */
556 575
557early_fault: 57610:
558 cld
559#ifdef CONFIG_PRINTK 577#ifdef CONFIG_PRINTK
560 pusha 578 xorl %eax,%eax
561 movl $(__KERNEL_DS),%eax 579 movw %ax,2(%esp) /* clean up the segment values on some cpus */
562 movl %eax,%ds 580 movw %ax,6(%esp)
563 movl %eax,%es 581 movw %ax,34(%esp)
564 cmpl $2,early_recursion_flag 582 leal 40(%esp),%eax
565 je hlt_loop 583 pushl %eax /* %esp before the exception */
566 incl early_recursion_flag 584 pushl %ebx
585 pushl %ebp
586 pushl %esi
587 pushl %edi
567 movl %cr2,%eax 588 movl %cr2,%eax
568 pushl %eax 589 pushl %eax
569 pushl %edx /* trapno */ 590 pushl (20+6*4)(%esp) /* trapno */
570 pushl $fault_msg 591 pushl $fault_msg
571 call printk 592 call printk
572#endif 593#endif
@@ -575,6 +596,17 @@ hlt_loop:
575 hlt 596 hlt
576 jmp hlt_loop 597 jmp hlt_loop
577 598
599ex_entry:
600 pop %es
601 pop %ds
602 pop %edx
603 pop %ecx
604 pop %eax
605 addl $8,%esp /* drop vector number and error code */
606 decl %ss:early_recursion_flag
607 iret
608ENDPROC(early_idt_handler)
609
578/* This is the default interrupt "handler" :-) */ 610/* This is the default interrupt "handler" :-) */
579 ALIGN 611 ALIGN
580ignore_int: 612ignore_int:
@@ -608,13 +640,18 @@ ignore_int:
608 popl %eax 640 popl %eax
609#endif 641#endif
610 iret 642 iret
643ENDPROC(ignore_int)
644__INITDATA
645 .align 4
646early_recursion_flag:
647 .long 0
611 648
612#include "verify_cpu.S" 649__REFDATA
613 650 .align 4
614 __REFDATA
615.align 4
616ENTRY(initial_code) 651ENTRY(initial_code)
617 .long i386_start_kernel 652 .long i386_start_kernel
653ENTRY(setup_once_ref)
654 .long setup_once
618 655
619/* 656/*
620 * BSS section 657 * BSS section
@@ -667,22 +704,19 @@ ENTRY(initial_page_table)
667ENTRY(stack_start) 704ENTRY(stack_start)
668 .long init_thread_union+THREAD_SIZE 705 .long init_thread_union+THREAD_SIZE
669 706
670early_recursion_flag: 707__INITRODATA
671 .long 0
672
673ready: .byte 0
674
675int_msg: 708int_msg:
676 .asciz "Unknown interrupt or fault at: %p %p %p\n" 709 .asciz "Unknown interrupt or fault at: %p %p %p\n"
677 710
678fault_msg: 711fault_msg:
679/* fault info: */ 712/* fault info: */
680 .ascii "BUG: Int %d: CR2 %p\n" 713 .ascii "BUG: Int %d: CR2 %p\n"
681/* pusha regs: */ 714/* regs pushed in early_idt_handler: */
682 .ascii " EDI %p ESI %p EBP %p ESP %p\n" 715 .ascii " EDI %p ESI %p EBP %p EBX %p\n"
683 .ascii " EBX %p EDX %p ECX %p EAX %p\n" 716 .ascii " ESP %p ES %p DS %p\n"
717 .ascii " EDX %p ECX %p EAX %p\n"
684/* fault frame: */ 718/* fault frame: */
685 .ascii " err %p EIP %p CS %p flg %p\n" 719 .ascii " vec %p err %p EIP %p CS %p flg %p\n"
686 .ascii "Stack: %p %p %p %p %p %p %p %p\n" 720 .ascii "Stack: %p %p %p %p %p %p %p %p\n"
687 .ascii " %p %p %p %p %p %p %p %p\n" 721 .ascii " %p %p %p %p %p %p %p %p\n"
688 .asciz " %p %p %p %p %p %p %p %p\n" 722 .asciz " %p %p %p %p %p %p %p %p\n"
@@ -696,6 +730,7 @@ fault_msg:
696 * segment size, and 32-bit linear address value: 730 * segment size, and 32-bit linear address value:
697 */ 731 */
698 732
733 .data
699.globl boot_gdt_descr 734.globl boot_gdt_descr
700.globl idt_descr 735.globl idt_descr
701 736
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d70bc2eb202..94bf9cc2c7e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,12 +19,15 @@
19#include <asm/cache.h> 19#include <asm/cache.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/percpu.h> 21#include <asm/percpu.h>
22#include <asm/nops.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
25#include <asm/paravirt.h> 26#include <asm/paravirt.h>
27#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
26#else 28#else
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 29#define GET_CR2_INTO(reg) movq %cr2, reg
30#define INTERRUPT_RETURN iretq
28#endif 31#endif
29 32
30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE 33/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -266,36 +269,56 @@ bad_address:
266 jmp bad_address 269 jmp bad_address
267 270
268 .section ".init.text","ax" 271 .section ".init.text","ax"
269#ifdef CONFIG_EARLY_PRINTK
270 .globl early_idt_handlers 272 .globl early_idt_handlers
271early_idt_handlers: 273early_idt_handlers:
274 # 104(%rsp) %rflags
275 # 96(%rsp) %cs
276 # 88(%rsp) %rip
277 # 80(%rsp) error code
272 i = 0 278 i = 0
273 .rept NUM_EXCEPTION_VECTORS 279 .rept NUM_EXCEPTION_VECTORS
274 movl $i, %esi 280 .if (EXCEPTION_ERRCODE_MASK >> i) & 1
281 ASM_NOP2
282 .else
283 pushq $0 # Dummy error code, to make stack frame uniform
284 .endif
285 pushq $i # 72(%rsp) Vector number
275 jmp early_idt_handler 286 jmp early_idt_handler
276 i = i + 1 287 i = i + 1
277 .endr 288 .endr
278#endif
279 289
280ENTRY(early_idt_handler) 290ENTRY(early_idt_handler)
281#ifdef CONFIG_EARLY_PRINTK 291 cld
292
282 cmpl $2,early_recursion_flag(%rip) 293 cmpl $2,early_recursion_flag(%rip)
283 jz 1f 294 jz 1f
284 incl early_recursion_flag(%rip) 295 incl early_recursion_flag(%rip)
285 GET_CR2_INTO_RCX 296
286 movq %rcx,%r9 297 pushq %rax # 64(%rsp)
287 xorl %r8d,%r8d # zero for error code 298 pushq %rcx # 56(%rsp)
288 movl %esi,%ecx # get vector number 299 pushq %rdx # 48(%rsp)
289 # Test %ecx against mask of vectors that push error code. 300 pushq %rsi # 40(%rsp)
290 cmpl $31,%ecx 301 pushq %rdi # 32(%rsp)
291 ja 0f 302 pushq %r8 # 24(%rsp)
292 movl $1,%eax 303 pushq %r9 # 16(%rsp)
293 salq %cl,%rax 304 pushq %r10 # 8(%rsp)
294 testl $0x27d00,%eax 305 pushq %r11 # 0(%rsp)
295 je 0f 306
296 popq %r8 # get error code 307 cmpl $__KERNEL_CS,96(%rsp)
2970: movq 0(%rsp),%rcx # get ip 308 jne 10f
298 movq 8(%rsp),%rdx # get cs 309
310 leaq 88(%rsp),%rdi # Pointer to %rip
311 call early_fixup_exception
312 andl %eax,%eax
313 jnz 20f # Found an exception entry
314
31510:
316#ifdef CONFIG_EARLY_PRINTK
317 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
318 movl 80(%rsp),%r8d # error code
319 movl 72(%rsp),%esi # vector number
320 movl 96(%rsp),%edx # %cs
321 movq 88(%rsp),%rcx # %rip
299 xorl %eax,%eax 322 xorl %eax,%eax
300 leaq early_idt_msg(%rip),%rdi 323 leaq early_idt_msg(%rip),%rdi
301 call early_printk 324 call early_printk
@@ -304,17 +327,32 @@ ENTRY(early_idt_handler)
304 call dump_stack 327 call dump_stack
305#ifdef CONFIG_KALLSYMS 328#ifdef CONFIG_KALLSYMS
306 leaq early_idt_ripmsg(%rip),%rdi 329 leaq early_idt_ripmsg(%rip),%rdi
307 movq 0(%rsp),%rsi # get rip again 330 movq 40(%rsp),%rsi # %rip again
308 call __print_symbol 331 call __print_symbol
309#endif 332#endif
310#endif /* EARLY_PRINTK */ 333#endif /* EARLY_PRINTK */
3111: hlt 3341: hlt
312 jmp 1b 335 jmp 1b
313 336
314#ifdef CONFIG_EARLY_PRINTK 33720: # Exception table entry found
338 popq %r11
339 popq %r10
340 popq %r9
341 popq %r8
342 popq %rdi
343 popq %rsi
344 popq %rdx
345 popq %rcx
346 popq %rax
347 addq $16,%rsp # drop vector number and error code
348 decl early_recursion_flag(%rip)
349 INTERRUPT_RETURN
350
351 .balign 4
315early_recursion_flag: 352early_recursion_flag:
316 .long 0 353 .long 0
317 354
355#ifdef CONFIG_EARLY_PRINTK
318early_idt_msg: 356early_idt_msg:
319 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" 357 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
320early_idt_ripmsg: 358early_idt_ripmsg:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714..9cc7b4392f7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -94,13 +94,18 @@ static int hpet_verbose;
94 94
95static int __init hpet_setup(char *str) 95static int __init hpet_setup(char *str)
96{ 96{
97 if (str) { 97 while (str) {
98 char *next = strchr(str, ',');
99
100 if (next)
101 *next++ = 0;
98 if (!strncmp("disable", str, 7)) 102 if (!strncmp("disable", str, 7))
99 boot_hpet_disable = 1; 103 boot_hpet_disable = 1;
100 if (!strncmp("force", str, 5)) 104 if (!strncmp("force", str, 5))
101 hpet_force_user = 1; 105 hpet_force_user = 1;
102 if (!strncmp("verbose", str, 7)) 106 if (!strncmp("verbose", str, 7))
103 hpet_verbose = 1; 107 hpet_verbose = 1;
108 str = next;
104 } 109 }
105 return 1; 110 return 1;
106} 111}
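Editorial note: since the handler now walks a comma-separated list instead of matching a single word, the practical effect (assuming this is still wired to the usual hpet= boot parameter) is that options can be combined on the kernel command line. A hypothetical example:

    hpet=force,verbose    /* allow force-enabling the HPET and print its configuration */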
@@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
319 now = hpet_readl(HPET_COUNTER); 324 now = hpet_readl(HPET_COUNTER);
320 cmp = now + (unsigned int) delta; 325 cmp = now + (unsigned int) delta;
321 cfg = hpet_readl(HPET_Tn_CFG(timer)); 326 cfg = hpet_readl(HPET_Tn_CFG(timer));
322 /* Make sure we use edge triggered interrupts */
323 cfg &= ~HPET_TN_LEVEL;
324 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 327 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
325 HPET_TN_SETVAL | HPET_TN_32BIT; 328 HPET_TN_SETVAL | HPET_TN_32BIT;
326 hpet_writel(cfg, HPET_Tn_CFG(timer)); 329 hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -787,15 +790,16 @@ static int hpet_clocksource_register(void)
787 return 0; 790 return 0;
788} 791}
789 792
793static u32 *hpet_boot_cfg;
794
790/** 795/**
791 * hpet_enable - Try to setup the HPET timer. Returns 1 on success. 796 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
792 */ 797 */
793int __init hpet_enable(void) 798int __init hpet_enable(void)
794{ 799{
795 unsigned long hpet_period; 800 u32 hpet_period, cfg, id;
796 unsigned int id;
797 u64 freq; 801 u64 freq;
798 int i; 802 unsigned int i, last;
799 803
800 if (!is_hpet_capable()) 804 if (!is_hpet_capable())
801 return 0; 805 return 0;
@@ -847,15 +851,45 @@ int __init hpet_enable(void)
847 id = hpet_readl(HPET_ID); 851 id = hpet_readl(HPET_ID);
848 hpet_print_config(); 852 hpet_print_config();
849 853
854 last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
855
850#ifdef CONFIG_HPET_EMULATE_RTC 856#ifdef CONFIG_HPET_EMULATE_RTC
851 /* 857 /*
852 * The legacy routing mode needs at least two channels, tick timer 858 * The legacy routing mode needs at least two channels, tick timer
853 * and the rtc emulation channel. 859 * and the rtc emulation channel.
854 */ 860 */
855 if (!(id & HPET_ID_NUMBER)) 861 if (!last)
856 goto out_nohpet; 862 goto out_nohpet;
857#endif 863#endif
858 864
865 cfg = hpet_readl(HPET_CFG);
866 hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
867 GFP_KERNEL);
868 if (hpet_boot_cfg)
869 *hpet_boot_cfg = cfg;
870 else
871 pr_warn("HPET initial state will not be saved\n");
872 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
873 hpet_writel(cfg, HPET_CFG);
874 if (cfg)
875 pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
876 cfg);
877
878 for (i = 0; i <= last; ++i) {
879 cfg = hpet_readl(HPET_Tn_CFG(i));
880 if (hpet_boot_cfg)
881 hpet_boot_cfg[i + 1] = cfg;
882 cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
883 hpet_writel(cfg, HPET_Tn_CFG(i));
884 cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
885 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
886 | HPET_TN_FSB | HPET_TN_FSB_CAP);
887 if (cfg)
888 pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
889 cfg, i);
890 }
891 hpet_print_config();
892
859 if (hpet_clocksource_register()) 893 if (hpet_clocksource_register())
860 goto out_nohpet; 894 goto out_nohpet;
861 895
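Editorial note: the layout of the saved boot state is implicit in the "last + 2" allocation and the "i + 1" indexing above, so spelling it out may help; this follows directly from the code shown, nothing extra is assumed:

/*
 * hpet_boot_cfg has last + 2 elements:
 *
 *   hpet_boot_cfg[0]      boot-time value of the global HPET_CFG register
 *   hpet_boot_cfg[i + 1]  boot-time HPET_Tn_CFG(i) for timer i, 0 <= i <= last
 *
 * hpet_disable() below uses the same indexing to put each comparator back,
 * and re-enables the block at the end only if HPET_CFG_ENABLE was set at boot.
 */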
@@ -923,14 +957,28 @@ fs_initcall(hpet_late_init);
923void hpet_disable(void) 957void hpet_disable(void)
924{ 958{
925 if (is_hpet_capable() && hpet_virt_address) { 959 if (is_hpet_capable() && hpet_virt_address) {
926 unsigned int cfg = hpet_readl(HPET_CFG); 960 unsigned int cfg = hpet_readl(HPET_CFG), id, last;
927 961
928 if (hpet_legacy_int_enabled) { 962 if (hpet_boot_cfg)
963 cfg = *hpet_boot_cfg;
964 else if (hpet_legacy_int_enabled) {
929 cfg &= ~HPET_CFG_LEGACY; 965 cfg &= ~HPET_CFG_LEGACY;
930 hpet_legacy_int_enabled = 0; 966 hpet_legacy_int_enabled = 0;
931 } 967 }
932 cfg &= ~HPET_CFG_ENABLE; 968 cfg &= ~HPET_CFG_ENABLE;
933 hpet_writel(cfg, HPET_CFG); 969 hpet_writel(cfg, HPET_CFG);
970
971 if (!hpet_boot_cfg)
972 return;
973
974 id = hpet_readl(HPET_ID);
975 last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
976
977 for (id = 0; id <= last; ++id)
978 hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
979
980 if (*hpet_boot_cfg & HPET_CFG_ENABLE)
981 hpet_writel(*hpet_boot_cfg, HPET_CFG);
934 } 982 }
935} 983}
936 984
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 2d6e6498c17..f250431fb50 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -88,7 +88,7 @@ void kernel_fpu_begin(void)
88 __thread_clear_has_fpu(me); 88 __thread_clear_has_fpu(me);
89 /* We do 'stts()' in kernel_fpu_end() */ 89 /* We do 'stts()' in kernel_fpu_end() */
90 } else { 90 } else {
91 percpu_write(fpu_owner_task, NULL); 91 this_cpu_write(fpu_owner_task, NULL);
92 clts(); 92 clts();
93 } 93 }
94} 94}
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf4494..00000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15
16/*
17 * Initial thread structure.
18 *
19 * We need to make sure that this is THREAD_SIZE aligned due to the
20 * way process stacks are handled. This is done by having a special
21 * "init_task" linker map entry..
22 */
23union thread_union init_thread_union __init_task_data =
24 { INIT_THREAD_INFO(init_task) };
25
26/*
27 * Initial task structure.
28 *
29 * All other task structs will be allocated on slabs in fork.c
30 */
31struct task_struct init_task = INIT_TASK(init_task);
32EXPORT_SYMBOL(init_task);
33
34/*
35 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
36 * no more per-task TSS's. The TSS size is kept cacheline-aligned
37 * so they are allowed to end up in the .data..cacheline_aligned
38 * section. Since TSS's are completely CPU-local, we want them
39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
40 */
41DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
42
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 58b7f27cb3e..344faf8d0d6 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
127 return; 127 return;
128 128
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREAD_FLAGS, 130 THREADINFO_GFP,
131 THREAD_ORDER)); 131 THREAD_SIZE_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
137 per_cpu(hardirq_ctx, cpu) = irqctx; 137 per_cpu(hardirq_ctx, cpu) = irqctx;
138 138
139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
140 THREAD_FLAGS, 140 THREADINFO_GFP,
141 THREAD_ORDER)); 141 THREAD_SIZE_ORDER));
142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
143 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e213fc8408d..e2f751efb7b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1037 "current sp %p does not match saved sp %p\n", 1037 "current sp %p does not match saved sp %p\n",
1038 stack_addr(regs), kcb->jprobe_saved_sp); 1038 stack_addr(regs), kcb->jprobe_saved_sp);
1039 printk(KERN_ERR "Saved registers for jprobe %p\n", jp); 1039 printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1040 show_registers(saved_regs); 1040 show_regs(saved_regs);
1041 printk(KERN_ERR "Current registers\n"); 1041 printk(KERN_ERR "Current registers\n");
1042 show_registers(regs); 1042 show_regs(regs);
1043 BUG(); 1043 BUG();
1044 } 1044 }
1045 *regs = kcb->jprobe_saved_regs; 1045 *regs = kcb->jprobe_saved_regs;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ba6e4a27e..e554e5ad2fe 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -79,7 +79,6 @@ struct kvm_task_sleep_node {
79 u32 token; 79 u32 token;
80 int cpu; 80 int cpu;
81 bool halted; 81 bool halted;
82 struct mm_struct *mm;
83}; 82};
84 83
85static struct kvm_task_sleep_head { 84static struct kvm_task_sleep_head {
@@ -126,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token)
126 125
127 n.token = token; 126 n.token = token;
128 n.cpu = smp_processor_id(); 127 n.cpu = smp_processor_id();
129 n.mm = current->active_mm;
130 n.halted = idle || preempt_count() > 1; 128 n.halted = idle || preempt_count() > 1;
131 atomic_inc(&n.mm->mm_count);
132 init_waitqueue_head(&n.wq); 129 init_waitqueue_head(&n.wq);
133 hlist_add_head(&n.link, &b->list); 130 hlist_add_head(&n.link, &b->list);
134 spin_unlock(&b->lock); 131 spin_unlock(&b->lock);
@@ -161,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161static void apf_task_wake_one(struct kvm_task_sleep_node *n) 158static void apf_task_wake_one(struct kvm_task_sleep_node *n)
162{ 159{
163 hlist_del_init(&n->link); 160 hlist_del_init(&n->link);
164 if (!n->mm)
165 return;
166 mmdrop(n->mm);
167 if (n->halted) 161 if (n->halted)
168 smp_send_reschedule(n->cpu); 162 smp_send_reschedule(n->cpu);
169 else if (waitqueue_active(&n->wq)) 163 else if (waitqueue_active(&n->wq))
@@ -207,7 +201,7 @@ again:
207 * async PF was not yet handled. 201 * async PF was not yet handled.
208 * Add dummy entry for the token. 202 * Add dummy entry for the token.
209 */ 203 */
210 n = kmalloc(sizeof(*n), GFP_ATOMIC); 204 n = kzalloc(sizeof(*n), GFP_ATOMIC);
211 if (!n) { 205 if (!n) {
212 /* 206 /*
213 * Allocation failed! Busy wait while other cpu 207 * Allocation failed! Busy wait while other cpu
@@ -219,7 +213,6 @@ again:
219 } 213 }
220 n->token = token; 214 n->token = token;
221 n->cpu = smp_processor_id(); 215 n->cpu = smp_processor_id();
222 n->mm = NULL;
223 init_waitqueue_head(&n->wq); 216 init_waitqueue_head(&n->wq);
224 hlist_add_head(&n->link, &b->list); 217 hlist_add_head(&n->link, &b->list);
225 } else 218 } else
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da65bf..086eb58c6e8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h>
25 26
26#include <asm/x86_init.h> 27#include <asm/x86_init.h>
27#include <asm/reboot.h> 28#include <asm/reboot.h>
@@ -114,6 +115,25 @@ static void kvm_get_preset_lpj(void)
114 preset_lpj = lpj; 115 preset_lpj = lpj;
115} 116}
116 117
118bool kvm_check_and_clear_guest_paused(void)
119{
120 bool ret = false;
121 struct pvclock_vcpu_time_info *src;
122
123 /*
124 * per_cpu() is safe here because this function is only called from
125 * timer functions where preemption is already disabled.
126 */
127 WARN_ON(!in_atomic());
128 src = &__get_cpu_var(hv_clock);
129 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
130 __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
131 ret = true;
132 }
133
134 return ret;
135}
136
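Editorial note: the consumer of kvm_check_and_clear_guest_paused() is not in this hunk; presumably it is a watchdog-style periodic check elsewhere in the series. A hedged caller sketch under that assumption (the surrounding function is hypothetical):

/* Hypothetical caller, for illustration only. */
static void my_watchdog_tick(void)
{
	/*
	 * If the host paused the whole guest, the apparent stall is an
	 * artifact of lost time, not a stuck CPU -- skip the report once.
	 */
	if (kvm_check_and_clear_guest_paused())
		return;

	/* ... normal stall detection would continue here ... */
}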
117static struct clocksource kvm_clock = { 137static struct clocksource kvm_clock = {
118 .name = "kvm-clock", 138 .name = "kvm-clock",
119 .read = kvm_clock_get_cycles, 139 .read = kvm_clock_get_cycles,
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 7eb1e2b9782..00000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,476 +0,0 @@
1/*
2 * Written by Martin Kolinek, February 1996
3 *
4 * Changes:
5 *
6 * Chris Beauregard July 28th, 1996
7 * - Fixed up integrated SCSI detection
8 *
9 * Chris Beauregard August 3rd, 1996
10 * - Made mca_info local
11 * - Made integrated registers accessible through standard function calls
12 * - Added name field
13 * - More sanity checking
14 *
15 * Chris Beauregard August 9th, 1996
16 * - Rewrote /proc/mca
17 *
18 * Chris Beauregard January 7th, 1997
19 * - Added basic NMI-processing
20 * - Added more information to mca_info structure
21 *
22 * David Weinehall October 12th, 1998
23 * - Made a lot of cleaning up in the source
24 * - Added use of save_flags / restore_flags
25 * - Added the 'driver_loaded' flag in MCA_adapter
26 * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
27 *
28 * David Weinehall March 24th, 1999
29 * - Fixed the output of 'Driver Installed' in /proc/mca/pos
30 * - Made the Integrated Video & SCSI show up even if they have id 0000
31 *
32 * Alexander Viro November 9th, 1999
33 * - Switched to regular procfs methods
34 *
35 * Alfred Arnold & David Weinehall August 23rd, 2000
36 * - Added support for Planar POS-registers
37 */
38
39#include <linux/module.h>
40#include <linux/types.h>
41#include <linux/errno.h>
42#include <linux/kernel.h>
43#include <linux/mca.h>
44#include <linux/kprobes.h>
45#include <linux/slab.h>
46#include <asm/io.h>
47#include <linux/proc_fs.h>
48#include <linux/mman.h>
49#include <linux/mm.h>
50#include <linux/pagemap.h>
51#include <linux/ioport.h>
52#include <asm/uaccess.h>
53#include <linux/init.h>
54
55static unsigned char which_scsi;
56
57int MCA_bus;
58EXPORT_SYMBOL(MCA_bus);
59
60/*
61 * Motherboard register spinlock. Untested on SMP at the moment, but
62 * are there any MCA SMP boxes?
63 *
64 * Yes - Alan
65 */
66static DEFINE_SPINLOCK(mca_lock);
67
68/* Build the status info for the adapter */
69
70static void mca_configure_adapter_status(struct mca_device *mca_dev)
71{
72 mca_dev->status = MCA_ADAPTER_NONE;
73
74 mca_dev->pos_id = mca_dev->pos[0]
75 + (mca_dev->pos[1] << 8);
76
77 if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
78
79 /*
80 * id = 0x0000 usually indicates hardware failure,
81 * however, ZP Gu (zpg@castle.net> reports that his 9556
82 * has 0x0000 as id and everything still works. There
83 * also seem to be an adapter with id = 0x0000; the
84 * NCR Parallel Bus Memory Card. Until this is confirmed,
85 * however, this code will stay.
86 */
87
88 mca_dev->status = MCA_ADAPTER_ERROR;
89
90 return;
91 } else if (mca_dev->pos_id != 0xffff) {
92
93 /*
94 * 0xffff usually indicates that there's no adapter,
95 * however, some integrated adapters may have 0xffff as
96 * their id and still be valid. Examples are on-board
97 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
98 * and possibly also the 95 ULTIMEDIA.
99 */
100
101 mca_dev->status = MCA_ADAPTER_NORMAL;
102 }
103
104 if ((mca_dev->pos_id == 0xffff ||
105 mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
106 int j;
107
108 for (j = 2; j < 8; j++) {
109 if (mca_dev->pos[j] != 0xff) {
110 mca_dev->status = MCA_ADAPTER_NORMAL;
111 break;
112 }
113 }
114 }
115
116 if (!(mca_dev->pos[2] & MCA_ENABLED)) {
117
118 /* enabled bit is in POS 2 */
119
120 mca_dev->status = MCA_ADAPTER_DISABLED;
121 }
122} /* mca_configure_adapter_status */
123
124/*--------------------------------------------------------------------*/
125
126static struct resource mca_standard_resources[] = {
127 { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
128 { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
129 { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
130 { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
131 { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
132 { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
133 { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
134};
135
136#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
137
138/*
139 * mca_read_and_store_pos - read the POS registers into a memory buffer
140 * @pos: a char pointer to 8 bytes, contains the POS register value on
141 * successful return
142 *
143 * Returns 1 if a card actually exists (i.e. the pos isn't
144 * all 0xff) or 0 otherwise
145 */
146static int mca_read_and_store_pos(unsigned char *pos)
147{
148 int j;
149 int found = 0;
150
151 for (j = 0; j < 8; j++) {
152 pos[j] = inb_p(MCA_POS_REG(j));
153 if (pos[j] != 0xff) {
154 /* 0xff all across means no device. 0x00 means
155 * something's broken, but a device is
156 * probably there. However, if you get 0x00
157 * from a motherboard register it won't matter
158 * what we find. For the record, on the
159 * 57SLC, the integrated SCSI adapter has
160 * 0xffff for the adapter ID, but nonzero for
161 * other registers. */
162
163 found = 1;
164 }
165 }
166 return found;
167}
168
169static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
170{
171 unsigned char byte;
172 unsigned long flags;
173
174 if (reg < 0 || reg >= 8)
175 return 0;
176
177 spin_lock_irqsave(&mca_lock, flags);
178 if (mca_dev->pos_register) {
179 /* Disable adapter setup, enable motherboard setup */
180
181 outb_p(0, MCA_ADAPTER_SETUP_REG);
182 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
183
184 byte = inb_p(MCA_POS_REG(reg));
185 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
186 } else {
187
188 /* Make sure motherboard setup is off */
189
190 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
191
192 /* Read the appropriate register */
193
194 outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
195 byte = inb_p(MCA_POS_REG(reg));
196 outb_p(0, MCA_ADAPTER_SETUP_REG);
197 }
198 spin_unlock_irqrestore(&mca_lock, flags);
199
200 mca_dev->pos[reg] = byte;
201
202 return byte;
203}
204
205static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
206 unsigned char byte)
207{
208 unsigned long flags;
209
210 if (reg < 0 || reg >= 8)
211 return;
212
213 spin_lock_irqsave(&mca_lock, flags);
214
215 /* Make sure motherboard setup is off */
216
217 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
218
219 /* Read in the appropriate register */
220
221 outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
222 outb_p(byte, MCA_POS_REG(reg));
223 outb_p(0, MCA_ADAPTER_SETUP_REG);
224
225 spin_unlock_irqrestore(&mca_lock, flags);
226
227 /* Update the global register list, while we have the byte */
228
229 mca_dev->pos[reg] = byte;
230
231}
232
233/* for the primary MCA bus, we have identity transforms */
234static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
235{
236 return irq;
237}
238
239static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
240{
241 return port;
242}
243
244static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
245{
246 return mem;
247}
248
249
250static int __init mca_init(void)
251{
252 unsigned int i, j;
253 struct mca_device *mca_dev;
254 unsigned char pos[8];
255 short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
256 struct mca_bus *bus;
257
258 /*
259 * WARNING: Be careful when making changes here. Putting an adapter
260 * and the motherboard simultaneously into setup mode may result in
261 * damage to chips (according to The Indispensable PC Hardware Book
262 * by Hans-Peter Messmer). Also, we disable system interrupts (so
263 * that we are not disturbed in the middle of this).
264 */
265
266 /* Make sure the MCA bus is present */
267
268 if (mca_system_init()) {
269 printk(KERN_ERR "MCA bus system initialisation failed\n");
270 return -ENODEV;
271 }
272
273 if (!MCA_bus)
274 return -ENODEV;
275
276 printk(KERN_INFO "Micro Channel bus detected.\n");
277
278 /* All MCA systems have at least a primary bus */
279 bus = mca_attach_bus(MCA_PRIMARY_BUS);
280 if (!bus)
281 goto out_nomem;
282 bus->default_dma_mask = 0xffffffffLL;
283 bus->f.mca_write_pos = mca_pc_write_pos;
284 bus->f.mca_read_pos = mca_pc_read_pos;
285 bus->f.mca_transform_irq = mca_dummy_transform_irq;
286 bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
287 bus->f.mca_transform_memory = mca_dummy_transform_memory;
288
289 /* get the motherboard device */
290 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
291 if (unlikely(!mca_dev))
292 goto out_nomem;
293
294 /*
295 * We do not expect many MCA interrupts during initialization,
296 * but let us be safe:
297 */
298 spin_lock_irq(&mca_lock);
299
300 /* Make sure adapter setup is off */
301
302 outb_p(0, MCA_ADAPTER_SETUP_REG);
303
304 /* Read motherboard POS registers */
305
306 mca_dev->pos_register = 0x7f;
307 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
308 mca_dev->name[0] = 0;
309 mca_read_and_store_pos(mca_dev->pos);
310 mca_configure_adapter_status(mca_dev);
311 /* fake POS and slot for a motherboard */
312 mca_dev->pos_id = MCA_MOTHERBOARD_POS;
313 mca_dev->slot = MCA_MOTHERBOARD;
314 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
315
316 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
317 if (unlikely(!mca_dev))
318 goto out_unlock_nomem;
319
320 /* Put motherboard into video setup mode, read integrated video
321 * POS registers, and turn motherboard setup off.
322 */
323
324 mca_dev->pos_register = 0xdf;
325 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
326 mca_dev->name[0] = 0;
327 mca_read_and_store_pos(mca_dev->pos);
328 mca_configure_adapter_status(mca_dev);
329 /* fake POS and slot for the integrated video */
330 mca_dev->pos_id = MCA_INTEGVIDEO_POS;
331 mca_dev->slot = MCA_INTEGVIDEO;
332 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
333
334 /*
335 * Put motherboard into scsi setup mode, read integrated scsi
336 * POS registers, and turn motherboard setup off.
337 *
338 * It seems there are two possible SCSI registers. Martin says that
339 * for the 56,57, 0xf7 is the one, but fails on the 76.
340 * Alfredo (apena@vnet.ibm.com) says
341 * 0xfd works on his machine. We'll try both of them. I figure it's
342 * a good bet that only one could be valid at a time. This could
343 * screw up though if one is used for something else on the other
344 * machine.
345 */
346
347 for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
348 outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
349 if (mca_read_and_store_pos(pos))
350 break;
351 }
352 if (which_scsi) {
353 /* found a scsi card */
354 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
355 if (unlikely(!mca_dev))
356 goto out_unlock_nomem;
357
358 for (j = 0; j < 8; j++)
359 mca_dev->pos[j] = pos[j];
360
361 mca_configure_adapter_status(mca_dev);
362 /* fake POS and slot for integrated SCSI controller */
363 mca_dev->pos_id = MCA_INTEGSCSI_POS;
364 mca_dev->slot = MCA_INTEGSCSI;
365 mca_dev->pos_register = which_scsi;
366 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
367 }
368
369 /* Turn off motherboard setup */
370
371 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
372
373 /*
374 * Now loop over MCA slots: put each adapter into setup mode, and
375 * read its POS registers. Then put adapter setup off.
376 */
377
378 for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
379 outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
380 if (!mca_read_and_store_pos(pos))
381 continue;
382
383 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
384 if (unlikely(!mca_dev))
385 goto out_unlock_nomem;
386
387 for (j = 0; j < 8; j++)
388 mca_dev->pos[j] = pos[j];
389
390 mca_dev->driver_loaded = 0;
391 mca_dev->slot = i;
392 mca_dev->pos_register = 0;
393 mca_configure_adapter_status(mca_dev);
394 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
395 }
396 outb_p(0, MCA_ADAPTER_SETUP_REG);
397
398 /* Enable interrupts and return memory start */
399 spin_unlock_irq(&mca_lock);
400
401 for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
402 request_resource(&ioport_resource, mca_standard_resources + i);
403
404 mca_do_proc_init();
405
406 return 0;
407
408 out_unlock_nomem:
409 spin_unlock_irq(&mca_lock);
410 out_nomem:
411 printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
412 return -ENOMEM;
413}
414
415subsys_initcall(mca_init);
416
417/*--------------------------------------------------------------------*/
418
419static __kprobes void
420mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
421{
422 int slot = mca_dev->slot;
423
424 if (slot == MCA_INTEGSCSI) {
425 printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
426 mca_dev->name);
427 } else if (slot == MCA_INTEGVIDEO) {
428 printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
429 mca_dev->name);
430 } else if (slot == MCA_MOTHERBOARD) {
431 printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
432 mca_dev->name);
433 }
434
435 /* More info available in POS 6 and 7? */
436
437 if (check_flag) {
438 unsigned char pos6, pos7;
439
440 pos6 = mca_device_read_pos(mca_dev, 6);
441 pos7 = mca_device_read_pos(mca_dev, 7);
442
443 printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
444 }
445
446} /* mca_handle_nmi_slot */
447
448/*--------------------------------------------------------------------*/
449
450static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
451{
452 struct mca_device *mca_dev = to_mca_device(dev);
453 unsigned char pos5;
454
455 pos5 = mca_device_read_pos(mca_dev, 5);
456
457 if (!(pos5 & 0x80)) {
458 /*
459 * Bit 7 of POS 5 is reset when this adapter has a hardware
460 * error. Bit 7 it reset if there's error information
461 * available in POS 6 and 7.
462 */
463 mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
464 return 1;
465 }
466 return 0;
467}
468
469void __kprobes mca_handle_nmi(void)
470{
471 /*
472 * First try - scan the various adapters and see if a specific
473 * adapter was responsible for the error.
474 */
475 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
476}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9bda6d6035..fbdfc691718 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev,
299{ 299{
300 unsigned long val; 300 unsigned long val;
301 int cpu = dev->id; 301 int cpu = dev->id;
302 int ret = 0; 302 ssize_t ret = 0;
303 char *end;
304 303
305 val = simple_strtoul(buf, &end, 0); 304 ret = kstrtoul(buf, 0, &val);
306 if (end == buf) 305 if (ret)
307 return -EINVAL; 306 return ret;
308 307
309 if (val == 1) { 308 if (val == 1) {
310 get_online_cpus(); 309 get_online_cpus();
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3ca42d0e43a..0327e2b3c40 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
147 147
148 memset(csig, 0, sizeof(*csig)); 148 memset(csig, 0, sizeof(*csig));
149 149
150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
151 cpu_has(c, X86_FEATURE_IA64)) {
152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
153 return -1;
154 }
155
156 csig->sig = cpuid_eax(0x00000001); 150 csig->sig = cpuid_eax(0x00000001);
157 151
158 if ((c->x86_model >= 5) || (c->x86 > 6)) { 152 if ((c->x86_model >= 5) || (c->x86 > 6)) {
@@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = {
463 457
464struct microcode_ops * __init init_intel_microcode(void) 458struct microcode_ops * __init init_intel_microcode(void)
465{ 459{
460 struct cpuinfo_x86 *c = &cpu_data(0);
461
462 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
463 cpu_has(c, X86_FEATURE_IA64)) {
464 pr_err("Intel CPU family 0x%x not supported\n", c->x86);
465 return NULL;
466 }
467
466 return &microcode_intel_ops; 468 return &microcode_intel_ops;
467} 469}
468 470
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index f44d3115735..d2b56489d70 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -96,7 +96,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
96 96
97 set_bit(m->busid, mp_bus_not_pci); 97 set_bit(m->busid, mp_bus_not_pci);
98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
99#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 99#ifdef CONFIG_EISA
100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
101#endif 101#endif
102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,12 +104,10 @@ static void __init MP_bus_info(struct mpc_bus *m)
104 x86_init.mpparse.mpc_oem_pci_bus(m); 104 x86_init.mpparse.mpc_oem_pci_bus(m);
105 105
106 clear_bit(m->busid, mp_bus_not_pci); 106 clear_bit(m->busid, mp_bus_not_pci);
107#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 107#ifdef CONFIG_EISA
108 mp_bus_id_to_type[m->busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
110 mp_bus_id_to_type[m->busid] = MP_BUS_EISA; 110 mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
111 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
112 mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
113#endif 111#endif
114 } else 112 } else
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 113 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -367,9 +365,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
367 case 3: 365 case 3:
368 memcpy(bus.bustype, "EISA ", 6); 366 memcpy(bus.bustype, "EISA ", 6);
369 break; 367 break;
370 case 4:
371 case 7:
372 memcpy(bus.bustype, "MCA ", 6);
373 } 368 }
374 MP_bus_info(&bus); 369 MP_bus_info(&bus);
375 if (mpc_default_type > 4) { 370 if (mpc_default_type > 4) {
@@ -572,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
572 struct mpf_intel *mpf; 567 struct mpf_intel *mpf;
573 unsigned long mem; 568 unsigned long mem;
574 569
575 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 570 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
576 bp, length); 571 base, base + length - 1);
577 BUILD_BUG_ON(sizeof(*mpf) != 16); 572 BUILD_BUG_ON(sizeof(*mpf) != 16);
578 573
579 while (length > 0) { 574 while (length > 0) {
@@ -588,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
588#endif 583#endif
589 mpf_found = mpf; 584 mpf_found = mpf;
590 585
591 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 586 printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
592 mpf, (u64)virt_to_phys(mpf)); 587 (unsigned long long) virt_to_phys(mpf),
588 (unsigned long long) virt_to_phys(mpf) +
589 sizeof(*mpf) - 1, mpf);
593 590
594 mem = virt_to_phys(mpf); 591 mem = virt_to_phys(mpf);
595 memblock_reserve(mem, sizeof(*mpf)); 592 memblock_reserve(mem, sizeof(*mpf));
@@ -622,7 +619,7 @@ void __init default_find_smp_config(void)
622 return; 619 return;
623 /* 620 /*
624 * If it is an SMP machine we should know now, unless the 621 * If it is an SMP machine we should know now, unless the
625 * configuration is in an EISA/MCA bus machine with an 622 * configuration is in an EISA bus machine with an
626 * extended bios data area. 623 * extended bios data area.
627 * 624 *
628 * there is a real-mode segmented pointer pointing to the 625 * there is a real-mode segmented pointer pointing to the
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf31916..90875279ef3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -19,8 +19,6 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/export.h> 20#include <linux/export.h>
21 21
22#include <linux/mca.h>
23
24#if defined(CONFIG_EDAC) 22#if defined(CONFIG_EDAC)
25#include <linux/edac.h> 23#include <linux/edac.h>
26#endif 24#endif
@@ -31,14 +29,6 @@
31#include <asm/nmi.h> 29#include <asm/nmi.h>
32#include <asm/x86_init.h> 30#include <asm/x86_init.h>
33 31
34#define NMI_MAX_NAMELEN 16
35struct nmiaction {
36 struct list_head list;
37 nmi_handler_t handler;
38 unsigned int flags;
39 char *name;
40};
41
42struct nmi_desc { 32struct nmi_desc {
43 spinlock_t lock; 33 spinlock_t lock;
44 struct list_head head; 34 struct list_head head;
@@ -54,6 +44,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
54 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), 44 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
55 .head = LIST_HEAD_INIT(nmi_desc[1].head), 45 .head = LIST_HEAD_INIT(nmi_desc[1].head),
56 }, 46 },
47 {
48 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
49 .head = LIST_HEAD_INIT(nmi_desc[2].head),
50 },
51 {
52 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
53 .head = LIST_HEAD_INIT(nmi_desc[3].head),
54 },
57 55
58}; 56};
59 57
@@ -84,7 +82,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
84 82
85#define nmi_to_desc(type) (&nmi_desc[type]) 83#define nmi_to_desc(type) (&nmi_desc[type])
86 84
87static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 85static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
88{ 86{
89 struct nmi_desc *desc = nmi_to_desc(type); 87 struct nmi_desc *desc = nmi_to_desc(type);
90 struct nmiaction *a; 88 struct nmiaction *a;
@@ -107,11 +105,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
107 return handled; 105 return handled;
108} 106}
109 107
110static int __setup_nmi(unsigned int type, struct nmiaction *action) 108int __register_nmi_handler(unsigned int type, struct nmiaction *action)
111{ 109{
112 struct nmi_desc *desc = nmi_to_desc(type); 110 struct nmi_desc *desc = nmi_to_desc(type);
113 unsigned long flags; 111 unsigned long flags;
114 112
113 if (!action->handler)
114 return -EINVAL;
115
115 spin_lock_irqsave(&desc->lock, flags); 116 spin_lock_irqsave(&desc->lock, flags);
116 117
117 /* 118 /*
@@ -120,6 +121,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
120 * to manage expectations 121 * to manage expectations
121 */ 122 */
122 WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); 123 WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
124 WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
125 WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
123 126
124 /* 127 /*
125 * some handlers need to be executed first otherwise a fake 128 * some handlers need to be executed first otherwise a fake
@@ -133,8 +136,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
133 spin_unlock_irqrestore(&desc->lock, flags); 136 spin_unlock_irqrestore(&desc->lock, flags);
134 return 0; 137 return 0;
135} 138}
139EXPORT_SYMBOL(__register_nmi_handler);
136 140
137static struct nmiaction *__free_nmi(unsigned int type, const char *name) 141void unregister_nmi_handler(unsigned int type, const char *name)
138{ 142{
139 struct nmi_desc *desc = nmi_to_desc(type); 143 struct nmi_desc *desc = nmi_to_desc(type);
140 struct nmiaction *n; 144 struct nmiaction *n;
@@ -157,61 +161,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
157 161
158 spin_unlock_irqrestore(&desc->lock, flags); 162 spin_unlock_irqrestore(&desc->lock, flags);
159 synchronize_rcu(); 163 synchronize_rcu();
160 return (n);
161} 164}
162
163int register_nmi_handler(unsigned int type, nmi_handler_t handler,
164 unsigned long nmiflags, const char *devname)
165{
166 struct nmiaction *action;
167 int retval = -ENOMEM;
168
169 if (!handler)
170 return -EINVAL;
171
172 action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
173 if (!action)
174 goto fail_action;
175
176 action->handler = handler;
177 action->flags = nmiflags;
178 action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
179 if (!action->name)
180 goto fail_action_name;
181
182 retval = __setup_nmi(type, action);
183
184 if (retval)
185 goto fail_setup_nmi;
186
187 return retval;
188
189fail_setup_nmi:
190 kfree(action->name);
191fail_action_name:
192 kfree(action);
193fail_action:
194
195 return retval;
196}
197EXPORT_SYMBOL_GPL(register_nmi_handler);
198
199void unregister_nmi_handler(unsigned int type, const char *name)
200{
201 struct nmiaction *a;
202
203 a = __free_nmi(type, name);
204 if (a) {
205 kfree(a->name);
206 kfree(a);
207 }
208}
209
210EXPORT_SYMBOL_GPL(unregister_nmi_handler); 165EXPORT_SYMBOL_GPL(unregister_nmi_handler);
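Editorial note: with the allocation gone from nmi.c, __register_nmi_handler() now expects the caller to own the struct nmiaction; the header side of this series (not shown here) presumably wraps that in a register_nmi_handler() helper that supplies a static nmiaction. A hedged caller-side sketch under that assumption, with a made-up device name:

#include <linux/init.h>
#include <asm/nmi.h>

/* Hypothetical handler: a real one would first check that the NMI was ours
 * and return 0 (not handled) otherwise. */
static int my_dev_nmi(unsigned int cmd, struct pt_regs *regs)
{
	return NMI_HANDLED;
}

static int __init my_dev_nmi_init(void)
{
	/* assumed wrapper: builds a static struct nmiaction for my_dev_nmi
	 * and hands it to __register_nmi_handler(NMI_LOCAL, &action) */
	return register_nmi_handler(NMI_LOCAL, my_dev_nmi, 0, "my_dev");
}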
211 166
212static notrace __kprobes void 167static __kprobes void
213pci_serr_error(unsigned char reason, struct pt_regs *regs) 168pci_serr_error(unsigned char reason, struct pt_regs *regs)
214{ 169{
170 /* check to see if anyone registered against these types of errors */
171 if (nmi_handle(NMI_SERR, regs, false))
172 return;
173
215 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", 174 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
216 reason, smp_processor_id()); 175 reason, smp_processor_id());
217 176
@@ -236,15 +195,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
236 outb(reason, NMI_REASON_PORT); 195 outb(reason, NMI_REASON_PORT);
237} 196}
238 197
239static notrace __kprobes void 198static __kprobes void
240io_check_error(unsigned char reason, struct pt_regs *regs) 199io_check_error(unsigned char reason, struct pt_regs *regs)
241{ 200{
242 unsigned long i; 201 unsigned long i;
243 202
203 /* check to see if anyone registered against these types of errors */
204 if (nmi_handle(NMI_IO_CHECK, regs, false))
205 return;
206
244 pr_emerg( 207 pr_emerg(
245 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", 208 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
246 reason, smp_processor_id()); 209 reason, smp_processor_id());
247 show_registers(regs); 210 show_regs(regs);
248 211
249 if (panic_on_io_nmi) 212 if (panic_on_io_nmi)
250 panic("NMI IOCK error: Not continuing"); 213 panic("NMI IOCK error: Not continuing");
@@ -263,7 +226,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
263 outb(reason, NMI_REASON_PORT); 226 outb(reason, NMI_REASON_PORT);
264} 227}
265 228
266static notrace __kprobes void 229static __kprobes void
267unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 230unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
268{ 231{
269 int handled; 232 int handled;
@@ -282,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
282 245
283 __this_cpu_add(nmi_stats.unknown, 1); 246 __this_cpu_add(nmi_stats.unknown, 1);
284 247
285#ifdef CONFIG_MCA
286 /*
287 * Might actually be able to figure out what the guilty party
288 * is:
289 */
290 if (MCA_bus) {
291 mca_handle_nmi();
292 return;
293 }
294#endif
295 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 248 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
296 reason, smp_processor_id()); 249 reason, smp_processor_id());
297 250
@@ -305,7 +258,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
305static DEFINE_PER_CPU(bool, swallow_nmi); 258static DEFINE_PER_CPU(bool, swallow_nmi);
306static DEFINE_PER_CPU(unsigned long, last_nmi_rip); 259static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
307 260
308static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 261static __kprobes void default_do_nmi(struct pt_regs *regs)
309{ 262{
310 unsigned char reason = 0; 263 unsigned char reason = 0;
311 int handled; 264 int handled;
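The nmi.c side of this series routes PCI SERR and IOCK NMIs through nmi_handle() before the legacy reporting path, so a platform driver can now claim NMI_SERR or NMI_IO_CHECK, and __register_nmi_handler() takes a caller-supplied struct nmiaction instead of allocating one (presumably through a wrapper in <asm/nmi.h> that builds the static action; that part is outside this hunk, but the four-argument register_nmi_handler() call form is unchanged, as the crash handler registration later in this patch shows). A minimal sketch of a handler claiming SERR NMIs, using only calls visible in the diff; the driver and its hardware poke are hypothetical:

#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/nmi.h>

static int board_serr_handler(unsigned int type, struct pt_regs *regs)
{
	/* Hypothetical platform-specific handling would go here. */
	pr_warn("board: PCI SERR NMI handled by platform driver\n");
	return NMI_HANDLED;	/* NMI_DONE would fall through to pci_serr_error() */
}

static int __init board_nmi_init(void)
{
	/* Only one handler may claim NMI_SERR -- see the WARN_ON_ONCE above. */
	return register_nmi_handler(NMI_SERR, board_serr_handler, 0, "board_serr");
}

static void board_nmi_exit(void)
{
	unregister_nmi_handler(NMI_SERR, "board_serr");
}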
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 2c39dcd510f..e31bf8d5c4d 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -13,6 +13,7 @@
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/delay.h> 14#include <linux/delay.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/percpu.h>
16 17
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/nmi.h> 19#include <asm/nmi.h>
@@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected)
117 unexpected_testcase_failures++; 118 unexpected_testcase_failures++;
118 119
119 if (nmi_fail == FAILURE) 120 if (nmi_fail == FAILURE)
120 printk("FAILED |"); 121 printk(KERN_CONT "FAILED |");
121 else if (nmi_fail == TIMEOUT) 122 else if (nmi_fail == TIMEOUT)
122 printk("TIMEOUT|"); 123 printk(KERN_CONT "TIMEOUT|");
123 else 124 else
124 printk("ERROR |"); 125 printk(KERN_CONT "ERROR |");
125 dump_stack(); 126 dump_stack();
126 } else { 127 } else {
127 testcase_successes++; 128 testcase_successes++;
128 printk(" ok |"); 129 printk(KERN_CONT " ok |");
129 } 130 }
130 testcase_total++; 131 testcase_total++;
131 132
@@ -150,10 +151,10 @@ void __init nmi_selftest(void)
150 151
151 print_testname("remote IPI"); 152 print_testname("remote IPI");
152 dotest(remote_ipi, SUCCESS); 153 dotest(remote_ipi, SUCCESS);
153 printk("\n"); 154 printk(KERN_CONT "\n");
154 print_testname("local IPI"); 155 print_testname("local IPI");
155 dotest(local_ipi, SUCCESS); 156 dotest(local_ipi, SUCCESS);
156 printk("\n"); 157 printk(KERN_CONT "\n");
157 158
158 cleanup_nmi_testsuite(); 159 cleanup_nmi_testsuite();
159 160
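The nmi_selftest.c change is purely about log formatting: continuation printks now say KERN_CONT explicitly, since without it each call may start a new log record at the default loglevel and the "ok |" markers can end up on their own lines. The pattern, in isolation:

	printk(KERN_INFO "nmi selftest: remote IPI");
	printk(KERN_CONT " ok |");	/* KERN_CONT appends to the previous record */
	printk(KERN_CONT "\n");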
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab137605e69..9ce885996fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
241 241
242static inline void enter_lazy(enum paravirt_lazy_mode mode) 242static inline void enter_lazy(enum paravirt_lazy_mode mode)
243{ 243{
244 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 244 BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
245 245
246 percpu_write(paravirt_lazy_mode, mode); 246 this_cpu_write(paravirt_lazy_mode, mode);
247} 247}
248 248
249static void leave_lazy(enum paravirt_lazy_mode mode) 249static void leave_lazy(enum paravirt_lazy_mode mode)
250{ 250{
251 BUG_ON(percpu_read(paravirt_lazy_mode) != mode); 251 BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
252 252
253 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); 253 this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
254} 254}
255 255
256void paravirt_enter_lazy_mmu(void) 256void paravirt_enter_lazy_mmu(void)
@@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
267{ 267{
268 BUG_ON(preemptible()); 268 BUG_ON(preemptible());
269 269
270 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { 270 if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
271 arch_leave_lazy_mmu_mode(); 271 arch_leave_lazy_mmu_mode();
272 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); 272 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
273 } 273 }
@@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
289 if (in_interrupt()) 289 if (in_interrupt())
290 return PARAVIRT_LAZY_NONE; 290 return PARAVIRT_LAZY_NONE;
291 291
292 return percpu_read(paravirt_lazy_mode); 292 return this_cpu_read(paravirt_lazy_mode);
293} 293}
294 294
295void arch_flush_lazy_mmu_mode(void) 295void arch_flush_lazy_mmu_mode(void)
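The paravirt.c hunk is a mechanical conversion from the x86-only percpu_read()/percpu_write() accessors to the generic this_cpu_read()/this_cpu_write(); the semantics (touch this CPU's copy of a per-CPU variable) are unchanged. The same accessors on an illustrative variable of my own, not from the patch:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_state);	/* illustrative variable only */

static void example_update(int new_state)
{
	/* Preemption rules are the caller's problem, exactly as before. */
	if (this_cpu_read(example_state) != new_state)
		this_cpu_write(example_state, new_state);
}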
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d0b2fb9ccbb..b72838bae64 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1480,8 +1480,9 @@ cleanup:
1480static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
1481{ 1481{
1482 unsigned int bridge; 1482 unsigned int bridge;
1483 unsigned long val;
1483 size_t len; 1484 size_t len;
1484 char* endp; 1485 ssize_t ret;
1485 1486
1486 while (*p) { 1487 while (*p) {
1487 if (!strncmp(p, "64k", 3)) 1488 if (!strncmp(p, "64k", 3))
@@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p)
1512 ++p; 1513 ++p;
1513 if (*p == '\0') 1514 if (*p == '\0')
1514 break; 1515 break;
1515 bridge = simple_strtoul(p, &endp, 0); 1516 ret = kstrtoul(p, 0, &val);
1516 if (p == endp) 1517 if (ret)
1517 break; 1518 break;
1518 1519
1520 bridge = val;
1519 if (bridge < MAX_PHB_BUS_NUM) { 1521 if (bridge < MAX_PHB_BUS_NUM) {
1520 printk(KERN_INFO "Calgary: disabling " 1522 printk(KERN_INFO "Calgary: disabling "
1521 "translation for PHB %#x\n", bridge); 1523 "translation for PHB %#x\n", bridge);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3003250ac51..62c9457ccd2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
100 struct dma_attrs *attrs) 100 struct dma_attrs *attrs)
101{ 101{
102 unsigned long dma_mask; 102 unsigned long dma_mask;
103 struct page *page; 103 struct page *page = NULL;
104 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
104 dma_addr_t addr; 105 dma_addr_t addr;
105 106
106 dma_mask = dma_alloc_coherent_mask(dev, flag); 107 dma_mask = dma_alloc_coherent_mask(dev, flag);
107 108
108 flag |= __GFP_ZERO; 109 flag |= __GFP_ZERO;
109again: 110again:
110 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); 111 if (!(flag & GFP_ATOMIC))
112 page = dma_alloc_from_contiguous(dev, count, get_order(size));
113 if (!page)
114 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
111 if (!page) 115 if (!page)
112 return NULL; 116 return NULL;
113 117
@@ -127,6 +131,16 @@ again:
127 return page_address(page); 131 return page_address(page);
128} 132}
129 133
134void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
135 dma_addr_t dma_addr, struct dma_attrs *attrs)
136{
137 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
138 struct page *page = virt_to_page(vaddr);
139
140 if (!dma_release_from_contiguous(dev, page, count))
141 free_pages((unsigned long)vaddr, get_order(size));
142}
143
130/* 144/*
131 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel 145 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
132 * parameter documentation. 146 * parameter documentation.
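dma_generic_alloc_coherent() now tries the contiguous memory allocator first for sleepable allocations and only then falls back to alloc_pages_node(), and the new dma_generic_free_coherent() undoes either path. Condensed into one allocate/free pair -- a sketch of the logic above, assuming dma-contiguous/CMA is configured, with hypothetical helper names:

#include <linux/dma-contiguous.h>
#include <linux/device.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *coherent_alloc_pages(struct device *dev, size_t size, gfp_t flag)
{
	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
	struct page *page = NULL;

	if (!(flag & GFP_ATOMIC))	/* CMA may sleep, so skip it for atomic callers */
		page = dma_alloc_from_contiguous(dev, count, get_order(size));
	if (!page)
		page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
	return page;
}

static void coherent_free_pages(struct device *dev, struct page *page, size_t size)
{
	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;

	/* Returns false if the pages did not come from the CMA area. */
	if (!dma_release_from_contiguous(dev, page, count))
		__free_pages(page, get_order(size));
}

The point of the pairing is that anything allocated from CMA must be released through dma_release_from_contiguous(), which is why pci-nommu.c below drops its private nommu_free_coherent() in favour of the generic helper.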
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index f96050685b4..871be4a84c7 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
74 return nents; 74 return nents;
75} 75}
76 76
77static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 dma_addr_t dma_addr, struct dma_attrs *attrs)
79{
80 free_pages((unsigned long)vaddr, get_order(size));
81}
82
83static void nommu_sync_single_for_device(struct device *dev, 77static void nommu_sync_single_for_device(struct device *dev,
84 dma_addr_t addr, size_t size, 78 dma_addr_t addr, size_t size,
85 enum dma_data_direction dir) 79 enum dma_data_direction dir)
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
97 91
98struct dma_map_ops nommu_dma_ops = { 92struct dma_map_ops nommu_dma_ops = {
99 .alloc = dma_generic_alloc_coherent, 93 .alloc = dma_generic_alloc_coherent,
100 .free = nommu_free_coherent, 94 .free = dma_generic_free_coherent,
101 .map_sg = nommu_map_sg, 95 .map_sg = nommu_map_sg,
102 .map_page = nommu_map_page, 96 .map_page = nommu_map_page,
103 .sync_single_for_device = nommu_sync_single_for_device, 97 .sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8..735279e54e5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -27,6 +27,15 @@
27#include <asm/debugreg.h> 27#include <asm/debugreg.h>
28#include <asm/nmi.h> 28#include <asm/nmi.h>
29 29
30/*
31 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
32 * no more per-task TSS's. The TSS size is kept cacheline-aligned
33 * so they are allowed to end up in the .data..cacheline_aligned
34 * section. Since TSS's are completely CPU-local, we want them
35 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
36 */
37DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
38
30#ifdef CONFIG_X86_64 39#ifdef CONFIG_X86_64
31static DEFINE_PER_CPU(unsigned char, is_idle); 40static DEFINE_PER_CPU(unsigned char, is_idle);
32static ATOMIC_NOTIFIER_HEAD(idle_notifier); 41static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -47,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
47struct kmem_cache *task_xstate_cachep; 56struct kmem_cache *task_xstate_cachep;
48EXPORT_SYMBOL_GPL(task_xstate_cachep); 57EXPORT_SYMBOL_GPL(task_xstate_cachep);
49 58
59/*
60 * this gets called so that we can store lazy state into memory and copy the
61 * current task into the new thread.
62 */
50int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 63int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
51{ 64{
52 int ret; 65 int ret;
53 66
67 unlazy_fpu(src);
68
54 *dst = *src; 69 *dst = *src;
55 if (fpu_allocated(&src->thread.fpu)) { 70 if (fpu_allocated(&src->thread.fpu)) {
56 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); 71 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
@@ -67,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk)
67 fpu_free(&tsk->thread.fpu); 82 fpu_free(&tsk->thread.fpu);
68} 83}
69 84
70void free_thread_info(struct thread_info *ti) 85void arch_release_task_struct(struct task_struct *tsk)
71{ 86{
72 free_thread_xstate(ti->task); 87 free_thread_xstate(tsk);
73 free_pages((unsigned long)ti, THREAD_ORDER);
74} 88}
75 89
76void arch_task_cache_init(void) 90void arch_task_cache_init(void)
@@ -81,6 +95,16 @@ void arch_task_cache_init(void)
81 SLAB_PANIC | SLAB_NOTRACK, NULL); 95 SLAB_PANIC | SLAB_NOTRACK, NULL);
82} 96}
83 97
98static inline void drop_fpu(struct task_struct *tsk)
99{
100 /*
101 * Forget coprocessor state..
102 */
103 tsk->fpu_counter = 0;
104 clear_fpu(tsk);
105 clear_used_math();
106}
107
84/* 108/*
85 * Free current thread data structures etc.. 109 * Free current thread data structures etc..
86 */ 110 */
@@ -103,12 +127,8 @@ void exit_thread(void)
103 put_cpu(); 127 put_cpu();
104 kfree(bp); 128 kfree(bp);
105 } 129 }
106}
107 130
108void show_regs(struct pt_regs *regs) 131 drop_fpu(me);
109{
110 show_registers(regs);
111 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
112} 132}
113 133
114void show_regs_common(void) 134void show_regs_common(void)
@@ -143,12 +163,7 @@ void flush_thread(void)
143 163
144 flush_ptrace_hw_breakpoint(tsk); 164 flush_ptrace_hw_breakpoint(tsk);
145 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 165 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
146 /* 166 drop_fpu(tsk);
147 * Forget coprocessor state..
148 */
149 tsk->fpu_counter = 0;
150 clear_fpu(tsk);
151 clear_used_math();
152} 167}
153 168
154static void hard_disable_TSC(void) 169static void hard_disable_TSC(void)
@@ -377,7 +392,7 @@ static inline void play_dead(void)
377#ifdef CONFIG_X86_64 392#ifdef CONFIG_X86_64
378void enter_idle(void) 393void enter_idle(void)
379{ 394{
380 percpu_write(is_idle, 1); 395 this_cpu_write(is_idle, 1);
381 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 396 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
382} 397}
383 398
@@ -516,26 +531,6 @@ void stop_this_cpu(void *dummy)
516 } 531 }
517} 532}
518 533
519static void do_nothing(void *unused)
520{
521}
522
523/*
524 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
525 * pm_idle and update to new pm_idle value. Required while changing pm_idle
526 * handler on SMP systems.
527 *
528 * Caller must have changed pm_idle to the new value before the call. Old
529 * pm_idle value will not be used by any CPU after the return of this function.
530 */
531void cpu_idle_wait(void)
532{
533 smp_mb();
534 /* kick all the CPUs so that they exit out of pm_idle */
535 smp_call_function(do_nothing, NULL, 1);
536}
537EXPORT_SYMBOL_GPL(cpu_idle_wait);
538
539/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 534/* Default MONITOR/MWAIT with no hints, used for default C1 state */
540static void mwait_idle(void) 535static void mwait_idle(void)
541{ 536{
@@ -594,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
594{ 589{
595 u32 eax, ebx, ecx, edx; 590 u32 eax, ebx, ecx, edx;
596 591
592 /* Use mwait if idle=mwait boot option is given */
597 if (boot_option_idle_override == IDLE_FORCE_MWAIT) 593 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
598 return 1; 594 return 1;
599 595
596 /*
597 * Any idle= boot option other than idle=mwait means that we must not
598 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
599 */
600 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
601 return 0;
602
600 if (c->cpuid_level < MWAIT_INFO) 603 if (c->cpuid_level < MWAIT_INFO)
601 return 0; 604 return 0;
602 605
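process.c now carries the per-CPU init_tss definition and folds the old prepare_to_copy() hook (removed from process_32.c and process_64.c below) into arch_dup_task_struct() as the unlazy_fpu(src) call. The cacheline-aligned per-CPU definition used for init_tss follows the generic pattern; here it is on an example structure of my own, not from the patch:

#include <linux/percpu.h>

struct hot_counters {
	unsigned long hits;
	unsigned long misses;
};

/* One copy per CPU, aligned so that no two CPUs' copies share a cache line. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct hot_counters, hot_counters);

static void note_hit(void)
{
	this_cpu_inc(hot_counters.hits);	/* touch only this CPU's copy */
}

static unsigned long hits_on_cpu(int cpu)
{
	return per_cpu(hot_counters, cpu).hits;	/* cross-CPU read, e.g. for stats */
}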
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ae6847303e2..516fa186121 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task)
126 release_vm86_irqs(dead_task); 126 release_vm86_irqs(dead_task);
127} 127}
128 128
129/*
130 * This gets called before we allocate a new thread and copy
131 * the current task into it.
132 */
133void prepare_to_copy(struct task_struct *tsk)
134{
135 unlazy_fpu(tsk);
136}
137
138int copy_thread(unsigned long clone_flags, unsigned long sp, 129int copy_thread(unsigned long clone_flags, unsigned long sp,
139 unsigned long unused, 130 unsigned long unused,
140 struct task_struct *p, struct pt_regs *regs) 131 struct task_struct *p, struct pt_regs *regs)
@@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
302 293
303 switch_fpu_finish(next_p, fpu); 294 switch_fpu_finish(next_p, fpu);
304 295
305 percpu_write(current_task, next_p); 296 this_cpu_write(current_task, next_p);
306 297
307 return prev_p; 298 return prev_p;
308} 299}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 733ca39f367..61cdf7fdf09 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
145 return get_desc_base(&t->thread.tls_array[tls]); 145 return get_desc_base(&t->thread.tls_array[tls]);
146} 146}
147 147
148/*
149 * This gets called before we allocate a new thread and copy
150 * the current task into it.
151 */
152void prepare_to_copy(struct task_struct *tsk)
153{
154 unlazy_fpu(tsk);
155}
156
157int copy_thread(unsigned long clone_flags, unsigned long sp, 148int copy_thread(unsigned long clone_flags, unsigned long sp,
158 unsigned long unused, 149 unsigned long unused,
159 struct task_struct *p, struct pt_regs *regs) 150 struct task_struct *p, struct pt_regs *regs)
@@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
237 current->thread.usersp = new_sp; 228 current->thread.usersp = new_sp;
238 regs->ip = new_ip; 229 regs->ip = new_ip;
239 regs->sp = new_sp; 230 regs->sp = new_sp;
240 percpu_write(old_rsp, new_sp); 231 this_cpu_write(old_rsp, new_sp);
241 regs->cs = _cs; 232 regs->cs = _cs;
242 regs->ss = _ss; 233 regs->ss = _ss;
243 regs->flags = X86_EFLAGS_IF; 234 regs->flags = X86_EFLAGS_IF;
@@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
359 /* 350 /*
360 * Switch the PDA and FPU contexts. 351 * Switch the PDA and FPU contexts.
361 */ 352 */
362 prev->usersp = percpu_read(old_rsp); 353 prev->usersp = this_cpu_read(old_rsp);
363 percpu_write(old_rsp, next->usersp); 354 this_cpu_write(old_rsp, next->usersp);
364 percpu_write(current_task, next_p); 355 this_cpu_write(current_task, next_p);
365 356
366 percpu_write(kernel_stack, 357 this_cpu_write(kernel_stack,
367 (unsigned long)task_stack_page(next_p) + 358 (unsigned long)task_stack_page(next_p) +
368 THREAD_SIZE - KERNEL_STACK_OFFSET); 359 THREAD_SIZE - KERNEL_STACK_OFFSET);
369 360
@@ -423,6 +414,7 @@ void set_personality_ia32(bool x32)
423 current_thread_info()->status |= TS_COMPAT; 414 current_thread_info()->status |= TS_COMPAT;
424 } 415 }
425} 416}
417EXPORT_SYMBOL_GPL(set_personality_ia32);
426 418
427unsigned long get_wchan(struct task_struct *p) 419unsigned long get_wchan(struct task_struct *p)
428{ 420{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 685845cf16e..13b1990c7c5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs)
1480 regs->flags |= X86_EFLAGS_TF; 1480 regs->flags |= X86_EFLAGS_TF;
1481 1481
1482 /* do the secure computing check first */ 1482 /* do the secure computing check first */
1483 secure_computing(regs->orig_ax); 1483 if (secure_computing(regs->orig_ax)) {
1484 /* seccomp failures shouldn't expose any additional code. */
1485 ret = -1L;
1486 goto out;
1487 }
1484 1488
1485 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1489 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1486 ret = -1L; 1490 ret = -1L;
@@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs)
1505 regs->dx, regs->r10); 1509 regs->dx, regs->r10);
1506#endif 1510#endif
1507 1511
1512out:
1508 return ret ?: regs->orig_ax; 1513 return ret ?: regs->orig_ax;
1509} 1514}
1510 1515
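In ptrace.c, syscall_trace_enter() now acts on the return value of secure_computing(): a non-zero return means seccomp has already disposed of the syscall, so the function bails out before the ptrace and audit hooks run ("seccomp failures shouldn't expose any additional code"). This matches the seccomp-filter work landing in the same cycle, where secure_computing() gained a meaningful return value. Reduced to its control flow -- a sketch with helpers elided, not a drop-in:

long trace_enter(struct pt_regs *regs)
{
	long ret = 0;

	if (secure_computing(regs->orig_ax)) {
		ret = -1L;	/* tell the entry code not to run a real syscall */
		goto out;
	}

	/* ... TIF_SYSCALL_TRACE / audit handling as before ... */

out:
	return ret ?: regs->orig_ax;	/* -1 aborts, otherwise the syscall number */
}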
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 658f856f09a..79c45af8160 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -40,7 +40,8 @@ static int reboot_mode;
40enum reboot_type reboot_type = BOOT_ACPI; 40enum reboot_type reboot_type = BOOT_ACPI;
41int reboot_force; 41int reboot_force;
42 42
43/* This variable is used privately to keep track of whether or not 43/*
44 * This variable is used privately to keep track of whether or not
44 * reboot_type is still set to its default value (i.e., reboot= hasn't 45 * reboot_type is still set to its default value (i.e., reboot= hasn't
45 * been set on the command line). This is needed so that we can 46 * been set on the command line). This is needed so that we can
46 * suppress DMI scanning for reboot quirks. Without it, it's 47 * suppress DMI scanning for reboot quirks. Without it, it's
@@ -52,7 +53,8 @@ static int reboot_default = 1;
52static int reboot_cpu = -1; 53static int reboot_cpu = -1;
53#endif 54#endif
54 55
55/* This is set if we need to go through the 'emergency' path. 56/*
57 * This is set if we need to go through the 'emergency' path.
56 * When machine_emergency_restart() is called, we may be on 58 * When machine_emergency_restart() is called, we may be on
57 * an inconsistent state and won't be able to do a clean cleanup 59 * an inconsistent state and won't be able to do a clean cleanup
58 */ 60 */
@@ -61,22 +63,24 @@ static int reboot_emergency;
61/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ 63/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
62bool port_cf9_safe = false; 64bool port_cf9_safe = false;
63 65
64/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] 66/*
65 warm Don't set the cold reboot flag 67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
66 cold Set the cold reboot flag 68 * warm Don't set the cold reboot flag
67 bios Reboot by jumping through the BIOS (only for X86_32) 69 * cold Set the cold reboot flag
68 smp Reboot by executing reset on BSP or other CPU (only for X86_32) 70 * bios Reboot by jumping through the BIOS (only for X86_32)
69 triple Force a triple fault (init) 71 * smp Reboot by executing reset on BSP or other CPU (only for X86_32)
70 kbd Use the keyboard controller. cold reset (default) 72 * triple Force a triple fault (init)
71 acpi Use the RESET_REG in the FADT 73 * kbd Use the keyboard controller. cold reset (default)
72 efi Use efi reset_system runtime service 74 * acpi Use the RESET_REG in the FADT
73 pci Use the so-called "PCI reset register", CF9 75 * efi Use efi reset_system runtime service
74 force Avoid anything that could hang. 76 * pci Use the so-called "PCI reset register", CF9
77 * force Avoid anything that could hang.
75 */ 78 */
76static int __init reboot_setup(char *str) 79static int __init reboot_setup(char *str)
77{ 80{
78 for (;;) { 81 for (;;) {
79 /* Having anything passed on the command line via 82 /*
83 * Having anything passed on the command line via
80 * reboot= will cause us to disable DMI checking 84 * reboot= will cause us to disable DMI checking
81 * below. 85 * below.
82 */ 86 */
@@ -99,9 +103,11 @@ static int __init reboot_setup(char *str)
99 if (isdigit(*(str+2))) 103 if (isdigit(*(str+2)))
100 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); 104 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
101 } 105 }
102 /* we will leave sorting out the final value 106 /*
103 when we are ready to reboot, since we might not 107 * We will leave sorting out the final value
104 have detected BSP APIC ID or smp_num_cpu */ 108 * when we are ready to reboot, since we might not
109 * have detected BSP APIC ID or smp_num_cpu
110 */
105 break; 111 break;
106#endif /* CONFIG_SMP */ 112#endif /* CONFIG_SMP */
107 113
@@ -151,6 +157,62 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
151 return 0; 157 return 0;
152} 158}
153 159
160void machine_real_restart(unsigned int type)
161{
162 void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
163 real_mode_header->machine_real_restart_asm;
164
165 local_irq_disable();
166
167 /*
168 * Write zero to CMOS register number 0x0f, which the BIOS POST
169 * routine will recognize as telling it to do a proper reboot. (Well
170 * that's what this book in front of me says -- it may only apply to
171 * the Phoenix BIOS though, it's not clear). At the same time,
172 * disable NMIs by setting the top bit in the CMOS address register,
173 * as we're about to do peculiar things to the CPU. I'm not sure if
174 * `outb_p' is needed instead of just `outb'. Use it to be on the
175 * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
176 */
177 spin_lock(&rtc_lock);
178 CMOS_WRITE(0x00, 0x8f);
179 spin_unlock(&rtc_lock);
180
181 /*
182 * Switch back to the initial page table.
183 */
184 load_cr3(initial_page_table);
185
186 /*
187 * Write 0x1234 to absolute memory location 0x472. The BIOS reads
188 * this on booting to tell it to "Bypass memory test (also warm
189 * boot)". This seems like a fairly standard thing that gets set by
190 * REBOOT.COM programs, and the previous reset routine did this
191 * too. */
192 *((unsigned short *)0x472) = reboot_mode;
193
194 /* Jump to the identity-mapped low memory code */
195 restart_lowmem(type);
196}
197#ifdef CONFIG_APM_MODULE
198EXPORT_SYMBOL(machine_real_restart);
199#endif
200
201#endif /* CONFIG_X86_32 */
202
203/*
204 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
205 */
206static int __init set_pci_reboot(const struct dmi_system_id *d)
207{
208 if (reboot_type != BOOT_CF9) {
209 reboot_type = BOOT_CF9;
210 printk(KERN_INFO "%s series board detected. "
211 "Selecting PCI-method for reboots.\n", d->ident);
212 }
213 return 0;
214}
215
154static int __init set_kbd_reboot(const struct dmi_system_id *d) 216static int __init set_kbd_reboot(const struct dmi_system_id *d)
155{ 217{
156 if (reboot_type != BOOT_KBD) { 218 if (reboot_type != BOOT_KBD) {
@@ -160,7 +222,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
160 return 0; 222 return 0;
161} 223}
162 224
225/*
226 * This is a single dmi_table handling all reboot quirks. Note that
227 * REBOOT_BIOS is only available for 32bit
228 */
163static struct dmi_system_id __initdata reboot_dmi_table[] = { 229static struct dmi_system_id __initdata reboot_dmi_table[] = {
230#ifdef CONFIG_X86_32
164 { /* Handle problems with rebooting on Dell E520's */ 231 { /* Handle problems with rebooting on Dell E520's */
165 .callback = set_bios_reboot, 232 .callback = set_bios_reboot,
166 .ident = "Dell E520", 233 .ident = "Dell E520",
@@ -185,7 +252,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
185 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), 252 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
186 }, 253 },
187 }, 254 },
188 { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ 255 { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
189 .callback = set_bios_reboot, 256 .callback = set_bios_reboot,
190 .ident = "Dell OptiPlex 745", 257 .ident = "Dell OptiPlex 745",
191 .matches = { 258 .matches = {
@@ -193,7 +260,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
193 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), 260 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
194 }, 261 },
195 }, 262 },
196 { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ 263 { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
197 .callback = set_bios_reboot, 264 .callback = set_bios_reboot,
198 .ident = "Dell OptiPlex 745", 265 .ident = "Dell OptiPlex 745",
199 .matches = { 266 .matches = {
@@ -202,7 +269,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
202 DMI_MATCH(DMI_BOARD_NAME, "0MM599"), 269 DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
203 }, 270 },
204 }, 271 },
205 { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ 272 { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
206 .callback = set_bios_reboot, 273 .callback = set_bios_reboot,
207 .ident = "Dell OptiPlex 745", 274 .ident = "Dell OptiPlex 745",
208 .matches = { 275 .matches = {
@@ -211,7 +278,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
211 DMI_MATCH(DMI_BOARD_NAME, "0KW626"), 278 DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
212 }, 279 },
213 }, 280 },
214 { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ 281 { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
215 .callback = set_bios_reboot, 282 .callback = set_bios_reboot,
216 .ident = "Dell OptiPlex 330", 283 .ident = "Dell OptiPlex 330",
217 .matches = { 284 .matches = {
@@ -220,7 +287,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
220 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 287 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
221 }, 288 },
222 }, 289 },
223 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ 290 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
224 .callback = set_bios_reboot, 291 .callback = set_bios_reboot,
225 .ident = "Dell OptiPlex 360", 292 .ident = "Dell OptiPlex 360",
226 .matches = { 293 .matches = {
@@ -229,7 +296,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
229 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 296 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
230 }, 297 },
231 }, 298 },
232 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ 299 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
233 .callback = set_bios_reboot, 300 .callback = set_bios_reboot,
234 .ident = "Dell OptiPlex 760", 301 .ident = "Dell OptiPlex 760",
235 .matches = { 302 .matches = {
@@ -302,7 +369,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
302 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 369 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
303 }, 370 },
304 }, 371 },
305 { /* Handle problems with rebooting on ASUS P4S800 */ 372 { /* Handle problems with rebooting on ASUS P4S800 */
306 .callback = set_bios_reboot, 373 .callback = set_bios_reboot,
307 .ident = "ASUS P4S800", 374 .ident = "ASUS P4S800",
308 .matches = { 375 .matches = {
@@ -310,7 +377,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
310 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 377 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
311 }, 378 },
312 }, 379 },
313 { /* Handle reboot issue on Acer Aspire one */ 380#endif /* CONFIG_X86_32 */
381
382 { /* Handle reboot issue on Acer Aspire one */
314 .callback = set_kbd_reboot, 383 .callback = set_kbd_reboot,
315 .ident = "Acer Aspire One A110", 384 .ident = "Acer Aspire One A110",
316 .matches = { 385 .matches = {
@@ -318,76 +387,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
318 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), 387 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
319 }, 388 },
320 }, 389 },
321 { }
322};
323
324static int __init reboot_init(void)
325{
326 /* Only do the DMI check if reboot_type hasn't been overridden
327 * on the command line
328 */
329 if (reboot_default) {
330 dmi_check_system(reboot_dmi_table);
331 }
332 return 0;
333}
334core_initcall(reboot_init);
335
336void machine_real_restart(unsigned int type)
337{
338 void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
339 real_mode_header->machine_real_restart_asm;
340
341 local_irq_disable();
342
343 /* Write zero to CMOS register number 0x0f, which the BIOS POST
344 routine will recognize as telling it to do a proper reboot. (Well
345 that's what this book in front of me says -- it may only apply to
346 the Phoenix BIOS though, it's not clear). At the same time,
347 disable NMIs by setting the top bit in the CMOS address register,
348 as we're about to do peculiar things to the CPU. I'm not sure if
349 `outb_p' is needed instead of just `outb'. Use it to be on the
350 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
351 */
352 spin_lock(&rtc_lock);
353 CMOS_WRITE(0x00, 0x8f);
354 spin_unlock(&rtc_lock);
355
356 /*
357 * Switch back to the initial page table.
358 */
359 load_cr3(initial_page_table);
360
361 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
362 this on booting to tell it to "Bypass memory test (also warm
363 boot)". This seems like a fairly standard thing that gets set by
364 REBOOT.COM programs, and the previous reset routine did this
365 too. */
366 *((unsigned short *)0x472) = reboot_mode;
367
368 /* Jump to the identity-mapped low memory code */
369 restart_lowmem(type);
370}
371#ifdef CONFIG_APM_MODULE
372EXPORT_SYMBOL(machine_real_restart);
373#endif
374
375#endif /* CONFIG_X86_32 */
376
377/*
378 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
379 */
380static int __init set_pci_reboot(const struct dmi_system_id *d)
381{
382 if (reboot_type != BOOT_CF9) {
383 reboot_type = BOOT_CF9;
384 printk(KERN_INFO "%s series board detected. "
385 "Selecting PCI-method for reboots.\n", d->ident);
386 }
387 return 0;
388}
389
390static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
391 { /* Handle problems with rebooting on Apple MacBook5 */ 390 { /* Handle problems with rebooting on Apple MacBook5 */
392 .callback = set_pci_reboot, 391 .callback = set_pci_reboot,
393 .ident = "Apple MacBook5", 392 .ident = "Apple MacBook5",
@@ -455,17 +454,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
455 { } 454 { }
456}; 455};
457 456
458static int __init pci_reboot_init(void) 457static int __init reboot_init(void)
459{ 458{
460 /* Only do the DMI check if reboot_type hasn't been overridden 459 /*
460 * Only do the DMI check if reboot_type hasn't been overridden
461 * on the command line 461 * on the command line
462 */ 462 */
463 if (reboot_default) { 463 if (reboot_default)
464 dmi_check_system(pci_reboot_dmi_table); 464 dmi_check_system(reboot_dmi_table);
465 }
466 return 0; 465 return 0;
467} 466}
468core_initcall(pci_reboot_init); 467core_initcall(reboot_init);
469 468
470static inline void kb_wait(void) 469static inline void kb_wait(void)
471{ 470{
@@ -483,14 +482,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)
483 cpu_emergency_vmxoff(); 482 cpu_emergency_vmxoff();
484} 483}
485 484
486/* Use NMIs as IPIs to tell all CPUs to disable virtualization 485/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
487 */
488static void emergency_vmx_disable_all(void) 486static void emergency_vmx_disable_all(void)
489{ 487{
490 /* Just make sure we won't change CPUs while doing this */ 488 /* Just make sure we won't change CPUs while doing this */
491 local_irq_disable(); 489 local_irq_disable();
492 490
493 /* We need to disable VMX on all CPUs before rebooting, otherwise 491 /*
492 * We need to disable VMX on all CPUs before rebooting, otherwise
494 * we risk hanging up the machine, because the CPU ignore INIT 493 * we risk hanging up the machine, because the CPU ignore INIT
495 * signals when VMX is enabled. 494 * signals when VMX is enabled.
496 * 495 *
@@ -509,8 +508,7 @@ static void emergency_vmx_disable_all(void)
509 * is still enabling VMX. 508 * is still enabling VMX.
510 */ 509 */
511 if (cpu_has_vmx() && cpu_vmx_enabled()) { 510 if (cpu_has_vmx() && cpu_vmx_enabled()) {
512 /* Disable VMX on this CPU. 511 /* Disable VMX on this CPU. */
513 */
514 cpu_vmxoff(); 512 cpu_vmxoff();
515 513
516 /* Halt and disable VMX on the other CPUs */ 514 /* Halt and disable VMX on the other CPUs */
@@ -555,12 +553,12 @@ static void native_machine_emergency_restart(void)
555 /* Could also try the reset bit in the Hammer NB */ 553 /* Could also try the reset bit in the Hammer NB */
556 switch (reboot_type) { 554 switch (reboot_type) {
557 case BOOT_KBD: 555 case BOOT_KBD:
558 mach_reboot_fixups(); /* for board specific fixups */ 556 mach_reboot_fixups(); /* For board specific fixups */
559 557
560 for (i = 0; i < 10; i++) { 558 for (i = 0; i < 10; i++) {
561 kb_wait(); 559 kb_wait();
562 udelay(50); 560 udelay(50);
563 outb(0xfe, 0x64); /* pulse reset low */ 561 outb(0xfe, 0x64); /* Pulse reset low */
564 udelay(50); 562 udelay(50);
565 } 563 }
566 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { 564 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
@@ -602,7 +600,7 @@ static void native_machine_emergency_restart(void)
602 600
603 case BOOT_CF9: 601 case BOOT_CF9:
604 port_cf9_safe = true; 602 port_cf9_safe = true;
605 /* fall through */ 603 /* Fall through */
606 604
607 case BOOT_CF9_COND: 605 case BOOT_CF9_COND:
608 if (port_cf9_safe) { 606 if (port_cf9_safe) {
@@ -640,7 +638,8 @@ void native_machine_shutdown(void)
640 /* Make certain I only run on the appropriate processor */ 638 /* Make certain I only run on the appropriate processor */
641 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); 639 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
642 640
643 /* O.K Now that I'm on the appropriate processor, 641 /*
642 * O.K Now that I'm on the appropriate processor,
644 * stop all of the others. 643 * stop all of the others.
645 */ 644 */
646 stop_other_cpus(); 645 stop_other_cpus();
@@ -678,12 +677,11 @@ static void native_machine_restart(char *__unused)
678 677
679static void native_machine_halt(void) 678static void native_machine_halt(void)
680{ 679{
681 /* stop other cpus and apics */ 680 /* Stop other cpus and apics */
682 machine_shutdown(); 681 machine_shutdown();
683 682
684 tboot_shutdown(TB_SHUTDOWN_HALT); 683 tboot_shutdown(TB_SHUTDOWN_HALT);
685 684
686 /* stop this cpu */
687 stop_this_cpu(NULL); 685 stop_this_cpu(NULL);
688} 686}
689 687
@@ -694,7 +692,7 @@ static void native_machine_power_off(void)
694 machine_shutdown(); 692 machine_shutdown();
695 pm_power_off(); 693 pm_power_off();
696 } 694 }
697 /* a fallback in case there is no PM info available */ 695 /* A fallback in case there is no PM info available */
698 tboot_shutdown(TB_SHUTDOWN_HALT); 696 tboot_shutdown(TB_SHUTDOWN_HALT);
699} 697}
700 698
@@ -756,7 +754,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
756 754
757 cpu = raw_smp_processor_id(); 755 cpu = raw_smp_processor_id();
758 756
759 /* Don't do anything if this handler is invoked on crashing cpu. 757 /*
758 * Don't do anything if this handler is invoked on crashing cpu.
760 * Otherwise, system will completely hang. Crashing cpu can get 759 * Otherwise, system will completely hang. Crashing cpu can get
761 * an NMI if system was initially booted with nmi_watchdog parameter. 760 * an NMI if system was initially booted with nmi_watchdog parameter.
762 */ 761 */
@@ -780,7 +779,8 @@ static void smp_send_nmi_allbutself(void)
780 apic->send_IPI_allbutself(NMI_VECTOR); 779 apic->send_IPI_allbutself(NMI_VECTOR);
781} 780}
782 781
783/* Halt all other CPUs, calling the specified function on each of them 782/*
783 * Halt all other CPUs, calling the specified function on each of them
784 * 784 *
785 * This function can be used to halt all other CPUs on crash 785 * This function can be used to halt all other CPUs on crash
786 * or emergency reboot time. The function passed as parameter 786 * or emergency reboot time. The function passed as parameter
@@ -791,7 +791,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
791 unsigned long msecs; 791 unsigned long msecs;
792 local_irq_disable(); 792 local_irq_disable();
793 793
794 /* Make a note of crashing cpu. Will be used in NMI callback.*/ 794 /* Make a note of crashing cpu. Will be used in NMI callback. */
795 crashing_cpu = safe_smp_processor_id(); 795 crashing_cpu = safe_smp_processor_id();
796 796
797 shootdown_callback = callback; 797 shootdown_callback = callback;
@@ -800,8 +800,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
800 /* Would it be better to replace the trap vector here? */ 800 /* Would it be better to replace the trap vector here? */
801 if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, 801 if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
802 NMI_FLAG_FIRST, "crash")) 802 NMI_FLAG_FIRST, "crash"))
803 return; /* return what? */ 803 return; /* Return what? */
804 /* Ensure the new callback function is set before sending 804 /*
805 * Ensure the new callback function is set before sending
805 * out the NMI 806 * out the NMI
806 */ 807 */
807 wmb(); 808 wmb();
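reboot.c folds the old pci_reboot_dmi_table into the single reboot_dmi_table (with the BIOS-method entries now under CONFIG_X86_32) and keeps one reboot_init() to run the DMI scan, so future quirks of either kind are added to one list. A new entry keeps the usual dmi_system_id shape; for a hypothetical board needing the CF9 method it would look like this (vendor and product names invented for illustration):

	{	/* Handle problems with rebooting on Example Vendor X100 */
		.callback = set_pci_reboot,
		.ident = "Example Vendor X100",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_PRODUCT_NAME, "X100"),
		},
	},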
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index efcf305210a..16be6dc14db 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,7 +34,6 @@
34#include <linux/memblock.h> 34#include <linux/memblock.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/console.h> 36#include <linux/console.h>
37#include <linux/mca.h>
38#include <linux/root_dev.h> 37#include <linux/root_dev.h>
39#include <linux/highmem.h> 38#include <linux/highmem.h>
40#include <linux/module.h> 39#include <linux/module.h>
@@ -50,6 +49,7 @@
50#include <asm/pci-direct.h> 49#include <asm/pci-direct.h>
51#include <linux/init_ohci1394_dma.h> 50#include <linux/init_ohci1394_dma.h>
52#include <linux/kvm_para.h> 51#include <linux/kvm_para.h>
52#include <linux/dma-contiguous.h>
53 53
54#include <linux/errno.h> 54#include <linux/errno.h>
55#include <linux/kernel.h> 55#include <linux/kernel.h>
@@ -179,12 +179,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
179/* common cpu data for all cpus */ 179/* common cpu data for all cpus */
180struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; 180struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
181EXPORT_SYMBOL(boot_cpu_data); 181EXPORT_SYMBOL(boot_cpu_data);
182static void set_mca_bus(int x)
183{
184#ifdef CONFIG_MCA
185 MCA_bus = x;
186#endif
187}
188 182
189unsigned int def_to_bigsmp; 183unsigned int def_to_bigsmp;
190 184
@@ -340,8 +334,8 @@ static void __init relocate_initrd(void)
340 memblock_reserve(ramdisk_here, area_size); 334 memblock_reserve(ramdisk_here, area_size);
341 initrd_start = ramdisk_here + PAGE_OFFSET; 335 initrd_start = ramdisk_here + PAGE_OFFSET;
342 initrd_end = initrd_start + ramdisk_size; 336 initrd_end = initrd_start + ramdisk_size;
343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 337 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
344 ramdisk_here, ramdisk_here + ramdisk_size); 338 ramdisk_here, ramdisk_here + ramdisk_size - 1);
345 339
346 q = (char *)initrd_start; 340 q = (char *)initrd_start;
347 341
@@ -372,8 +366,8 @@ static void __init relocate_initrd(void)
372 /* high pages is not converted by early_res_to_bootmem */ 366 /* high pages is not converted by early_res_to_bootmem */
373 ramdisk_image = boot_params.hdr.ramdisk_image; 367 ramdisk_image = boot_params.hdr.ramdisk_image;
374 ramdisk_size = boot_params.hdr.ramdisk_size; 368 ramdisk_size = boot_params.hdr.ramdisk_size;
375 printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" 369 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
376 " %08llx - %08llx\n", 370 " [mem %#010llx-%#010llx]\n",
377 ramdisk_image, ramdisk_image + ramdisk_size - 1, 371 ramdisk_image, ramdisk_image + ramdisk_size - 1,
378 ramdisk_here, ramdisk_here + ramdisk_size - 1); 372 ramdisk_here, ramdisk_here + ramdisk_size - 1);
379} 373}
@@ -393,14 +387,13 @@ static void __init reserve_initrd(void)
393 initrd_start = 0; 387 initrd_start = 0;
394 388
395 if (ramdisk_size >= (end_of_lowmem>>1)) { 389 if (ramdisk_size >= (end_of_lowmem>>1)) {
396 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 390 panic("initrd too large to handle, "
397 printk(KERN_ERR "initrd too large to handle, " 391 "disabling initrd (%lld needed, %lld available)\n",
398 "disabling initrd\n"); 392 ramdisk_size, end_of_lowmem>>1);
399 return;
400 } 393 }
401 394
402 printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, 395 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
403 ramdisk_end); 396 ramdisk_end - 1);
404 397
405 398
406 if (ramdisk_end <= end_of_lowmem) { 399 if (ramdisk_end <= end_of_lowmem) {
@@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p)
717 apm_info.bios = boot_params.apm_bios_info; 710 apm_info.bios = boot_params.apm_bios_info;
718 ist_info = boot_params.ist_info; 711 ist_info = boot_params.ist_info;
719 if (boot_params.sys_desc_table.length != 0) { 712 if (boot_params.sys_desc_table.length != 0) {
720 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
721 machine_id = boot_params.sys_desc_table.table[0]; 713 machine_id = boot_params.sys_desc_table.table[0];
722 machine_submodel_id = boot_params.sys_desc_table.table[1]; 714 machine_submodel_id = boot_params.sys_desc_table.table[1];
723 BIOS_revision = boot_params.sys_desc_table.table[2]; 715 BIOS_revision = boot_params.sys_desc_table.table[2];
@@ -914,8 +906,8 @@ void __init setup_arch(char **cmdline_p)
914 setup_bios_corruption_check(); 906 setup_bios_corruption_check();
915#endif 907#endif
916 908
917 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 909 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
918 max_pfn_mapped<<PAGE_SHIFT); 910 (max_pfn_mapped<<PAGE_SHIFT) - 1);
919 911
920 setup_real_mode(); 912 setup_real_mode();
921 913
@@ -934,6 +926,7 @@ void __init setup_arch(char **cmdline_p)
934 } 926 }
935#endif 927#endif
936 memblock.current_limit = get_max_mapped(); 928 memblock.current_limit = get_max_mapped();
929 dma_contiguous_reserve(0);
937 930
938 /* 931 /*
939 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 932 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -1014,7 +1007,8 @@ void __init setup_arch(char **cmdline_p)
1014 init_cpu_to_node(); 1007 init_cpu_to_node();
1015 1008
1016 init_apic_mappings(); 1009 init_apic_mappings();
1017 ioapic_and_gsi_init(); 1010 if (x86_io_apic_ops.init)
1011 x86_io_apic_ops.init();
1018 1012
1019 kvm_guest_init(); 1013 kvm_guest_init();
1020 1014
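Besides dropping the MCA hooks and calling dma_contiguous_reserve(0) (which, as I read the dma-contiguous API, reserves the default CMA area with no placement limit once memblock is ready), setup.c converts its memory-range printks to the "[mem %#010llx-%#010llx]" convention with an inclusive end address. For example, with made-up values:

	u64 start = 0x37a00000ULL, size = 0x200000ULL;	/* example values */

	/* New style: 0x-prefixed, zero-padded, end address inclusive. */
	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
	       start, start + size - 1);
	/* prints: Allocated new RAMDISK: [mem 0x37a00000-0x37bfffff] */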
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727da37..5a98aa27218 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
185#endif 185#endif
186 rc = -EINVAL; 186 rc = -EINVAL;
187 if (pcpu_chosen_fc != PCPU_FC_PAGE) { 187 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
188 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
189 const size_t dyn_size = PERCPU_MODULE_RESERVE + 188 const size_t dyn_size = PERCPU_MODULE_RESERVE +
190 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; 189 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
190 size_t atom_size;
191 191
192 /*
193 * On 64bit, use PMD_SIZE for atom_size so that embedded
194 * percpu areas are aligned to PMD. This, in the future,
195 * can also allow using PMD mappings in vmalloc area. Use
196 * PAGE_SIZE on 32bit as vmalloc space is highly contended
197 * and large vmalloc area allocs can easily fail.
198 */
199#ifdef CONFIG_X86_64
200 atom_size = PMD_SIZE;
201#else
202 atom_size = PAGE_SIZE;
203#endif
192 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, 204 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
193 dyn_size, atom_size, 205 dyn_size, atom_size,
194 pcpu_cpu_distance, 206 pcpu_cpu_distance,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 115eac43148..965dfda0fd5 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -18,6 +18,7 @@
18#include <linux/personality.h> 18#include <linux/personality.h>
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/user-return-notifier.h> 20#include <linux/user-return-notifier.h>
21#include <linux/uprobes.h>
21 22
22#include <asm/processor.h> 23#include <asm/processor.h>
23#include <asm/ucontext.h> 24#include <asm/ucontext.h>
@@ -478,18 +479,8 @@ asmlinkage int
478sys_sigsuspend(int history0, int history1, old_sigset_t mask) 479sys_sigsuspend(int history0, int history1, old_sigset_t mask)
479{ 480{
480 sigset_t blocked; 481 sigset_t blocked;
481
482 current->saved_sigmask = current->blocked;
483
484 mask &= _BLOCKABLE;
485 siginitset(&blocked, mask); 482 siginitset(&blocked, mask);
486 set_current_blocked(&blocked); 483 return sigsuspend(&blocked);
487
488 current->state = TASK_INTERRUPTIBLE;
489 schedule();
490
491 set_restore_sigmask();
492 return -ERESTARTNOHAND;
493} 484}
494 485
495asmlinkage int 486asmlinkage int
@@ -824,6 +815,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
824 mce_notify_process(); 815 mce_notify_process();
825#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 816#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
826 817
818 if (thread_info_flags & _TIF_UPROBE) {
819 clear_thread_flag(TIF_UPROBE);
820 uprobe_notify_resume(regs);
821 }
822
827 /* deal with pending signal delivery */ 823 /* deal with pending signal delivery */
828 if (thread_info_flags & _TIF_SIGPENDING) 824 if (thread_info_flags & _TIF_SIGPENDING)
829 do_signal(regs); 825 do_signal(regs);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481ca..48d2b7ded42 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -109,6 +109,9 @@
109 * about nothing of note with C stepping upwards. 109 * about nothing of note with C stepping upwards.
110 */ 110 */
111 111
112static atomic_t stopping_cpu = ATOMIC_INIT(-1);
113static bool smp_no_nmi_ipi = false;
114
112/* 115/*
113 * this function sends a 'reschedule' IPI to another CPU. 116 * this function sends a 'reschedule' IPI to another CPU.
114 * it goes straight through and wastes no time serializing 117 * it goes straight through and wastes no time serializing
@@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
149 free_cpumask_var(allbutself); 152 free_cpumask_var(allbutself);
150} 153}
151 154
152static atomic_t stopping_cpu = ATOMIC_INIT(-1);
153
154static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) 155static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
155{ 156{
156 /* We are registered on stopping cpu too, avoid spurious NMI */ 157 /* We are registered on stopping cpu too, avoid spurious NMI */
@@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
162 return NMI_HANDLED; 163 return NMI_HANDLED;
163} 164}
164 165
165static void native_nmi_stop_other_cpus(int wait) 166/*
167 * this function calls the 'stop' function on all other CPUs in the system.
168 */
169
170asmlinkage void smp_reboot_interrupt(void)
171{
172 ack_APIC_irq();
173 irq_enter();
174 stop_this_cpu(NULL);
175 irq_exit();
176}
177
178static void native_stop_other_cpus(int wait)
166{ 179{
167 unsigned long flags; 180 unsigned long flags;
168 unsigned long timeout; 181 unsigned long timeout;
@@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait)
174 * Use an own vector here because smp_call_function 187 * Use an own vector here because smp_call_function
175 * does lots of things not suitable in a panic situation. 188 * does lots of things not suitable in a panic situation.
176 */ 189 */
190
191 /*
192 * We start by using the REBOOT_VECTOR irq.
193 * The irq is treated as a sync point to allow critical
194 * regions of code on other cpus to release their spin locks
195 * and re-enable irqs. Jumping straight to an NMI might
196 * accidentally cause deadlocks with further shutdown/panic
197 * code. By syncing, we give the cpus up to one second to
198 * finish their work before we force them off with the NMI.
199 */
177 if (num_online_cpus() > 1) { 200 if (num_online_cpus() > 1) {
178 /* did someone beat us here? */ 201 /* did someone beat us here? */
179 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) 202 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
180 return; 203 return;
181 204
182 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, 205 /* sync above data before sending IRQ */
183 NMI_FLAG_FIRST, "smp_stop"))
184 /* Note: we ignore failures here */
185 return;
186
187 /* sync above data before sending NMI */
188 wmb(); 206 wmb();
189 207
190 apic->send_IPI_allbutself(NMI_VECTOR); 208 apic->send_IPI_allbutself(REBOOT_VECTOR);
191 209
192 /* 210 /*
193 * Don't wait longer than a second if the caller 211 * Don't wait longer than a second if the caller
@@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait)
197 while (num_online_cpus() > 1 && (wait || timeout--)) 215 while (num_online_cpus() > 1 && (wait || timeout--))
198 udelay(1); 216 udelay(1);
199 } 217 }
218
219 /* if the REBOOT_VECTOR didn't work, try with the NMI */
220 if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
221 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
222 NMI_FLAG_FIRST, "smp_stop"))
223 /* Note: we ignore failures here */
224 /* Hope the REBOOT_IRQ is good enough */
225 goto finish;
200 226
201 local_irq_save(flags); 227 /* sync above data before sending IRQ */
202 disable_local_APIC(); 228 wmb();
203 local_irq_restore(flags);
204}
205
206/*
207 * this function calls the 'stop' function on all other CPUs in the system.
208 */
209
210asmlinkage void smp_reboot_interrupt(void)
211{
212 ack_APIC_irq();
213 irq_enter();
214 stop_this_cpu(NULL);
215 irq_exit();
216}
217
218static void native_irq_stop_other_cpus(int wait)
219{
220 unsigned long flags;
221 unsigned long timeout;
222 229
223 if (reboot_force) 230 pr_emerg("Shutting down cpus with NMI\n");
224 return;
225 231
226 /* 232 apic->send_IPI_allbutself(NMI_VECTOR);
227 * Use an own vector here because smp_call_function
228 * does lots of things not suitable in a panic situation.
229 * On most systems we could also use an NMI here,
230 * but there are a few systems around where NMI
231 * is problematic so stay with an non NMI for now
232 * (this implies we cannot stop CPUs spinning with irq off
233 * currently)
234 */
235 if (num_online_cpus() > 1) {
236 apic->send_IPI_allbutself(REBOOT_VECTOR);
237 233
238 /* 234 /*
239 * Don't wait longer than a second if the caller 235 * Don't wait longer than a 10 ms if the caller
240 * didn't ask us to wait. 236 * didn't ask us to wait.
241 */ 237 */
242 timeout = USEC_PER_SEC; 238 timeout = USEC_PER_MSEC * 10;
243 while (num_online_cpus() > 1 && (wait || timeout--)) 239 while (num_online_cpus() > 1 && (wait || timeout--))
244 udelay(1); 240 udelay(1);
245 } 241 }
246 242
243finish:
247 local_irq_save(flags); 244 local_irq_save(flags);
248 disable_local_APIC(); 245 disable_local_APIC();
249 local_irq_restore(flags); 246 local_irq_restore(flags);
250} 247}
251 248
252static void native_smp_disable_nmi_ipi(void)
253{
254 smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
255}
256
257/* 249/*
258 * Reschedule call back. 250 * Reschedule call back.
259 */ 251 */
@@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
287 279
288static int __init nonmi_ipi_setup(char *str) 280static int __init nonmi_ipi_setup(char *str)
289{ 281{
290 native_smp_disable_nmi_ipi(); 282 smp_no_nmi_ipi = true;
291 return 1; 283 return 1;
292} 284}
293 285
294__setup("nonmi_ipi", nonmi_ipi_setup); 286__setup("nonmi_ipi", nonmi_ipi_setup);
@@ -298,7 +290,7 @@ struct smp_ops smp_ops = {
298 .smp_prepare_cpus = native_smp_prepare_cpus, 290 .smp_prepare_cpus = native_smp_prepare_cpus,
299 .smp_cpus_done = native_smp_cpus_done, 291 .smp_cpus_done = native_smp_cpus_done,
300 292
301 .stop_other_cpus = native_nmi_stop_other_cpus, 293 .stop_other_cpus = native_stop_other_cpus,
302 .smp_send_reschedule = native_smp_send_reschedule, 294 .smp_send_reschedule = native_smp_send_reschedule,
303 295
304 .cpu_up = native_cpu_up, 296 .cpu_up = native_cpu_up,
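smp.c now has a single native_stop_other_cpus() that sends REBOOT_VECTOR first, waits up to a second, and only escalates to an NMI (for at most another 10 ms) if CPUs are still online and the nonmi_ipi parameter was not given; the parameter now just sets a file-local flag through __setup() instead of swapping smp_ops.stop_other_cpus at parse time. For reference, the boot-parameter plumbing mirrors nonmi_ipi_setup() above; names here are hypothetical:

#include <linux/init.h>

static bool no_nmi_fallback;	/* consulted later by the stop path */

static int __init no_nmi_setup(char *str)
{
	no_nmi_fallback = true;
	return 1;	/* __setup() handlers return 1 once the option is consumed */
}
__setup("nonmi_ipi", no_nmi_setup);

Booting with "nonmi_ipi" on the kernel command line then keeps the shutdown path on the REBOOT_VECTOR IPI only.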
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 757c4b1d0a0..f56f96da77f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -78,20 +78,8 @@
78/* State of each CPU */ 78/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 79DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 80
81/* Store all idle threads, this can be reused instead of creating
82* a new thread. Also avoids complicated thread destroy functionality
83* for idle threads.
84*/
85#ifdef CONFIG_HOTPLUG_CPU 81#ifdef CONFIG_HOTPLUG_CPU
86/* 82/*
87 * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
88 * removed after init for !CONFIG_HOTPLUG_CPU.
89 */
90static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
91#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
92#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
93
94/*
95 * We need this for trampoline_base protection from concurrent accesses when 83 * We need this for trampoline_base protection from concurrent accesses when
96 * off- and onlining cores wildly. 84 * off- and onlining cores wildly.
97 */ 85 */
@@ -99,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99 87
100void cpu_hotplug_driver_lock(void) 88void cpu_hotplug_driver_lock(void)
101{ 89{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex); 90 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103} 91}
104 92
105void cpu_hotplug_driver_unlock(void) 93void cpu_hotplug_driver_unlock(void)
106{ 94{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 95 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108} 96}
109 97
110ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } 98ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
111ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } 99ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
112#else
113static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
114#define get_idle_for_cpu(x) (idle_thread_array[(x)])
115#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
116#endif 100#endif
117 101
118/* Number of siblings per CPU package */ 102/* Number of siblings per CPU package */
@@ -317,59 +301,90 @@ void __cpuinit smp_store_cpu_info(int id)
317 identify_secondary_cpu(c); 301 identify_secondary_cpu(c);
318} 302}
319 303
320static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 304static bool __cpuinit
305topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
321{ 306{
322 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 307 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
323 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 308
324 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 309 return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
325 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 310 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
326 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); 311 "[node: %d != %d]. Ignoring dependency.\n",
327 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); 312 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
328} 313}
329 314
315#define link_mask(_m, c1, c2) \
316do { \
317 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
318 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
319} while (0)
320
321static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
322{
323 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
324 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
325
326 if (c->phys_proc_id == o->phys_proc_id &&
327 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
328 c->compute_unit_id == o->compute_unit_id)
329 return topology_sane(c, o, "smt");
330
331 } else if (c->phys_proc_id == o->phys_proc_id &&
332 c->cpu_core_id == o->cpu_core_id) {
333 return topology_sane(c, o, "smt");
334 }
335
336 return false;
337}
338
339static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
340{
341 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
342
343 if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
344 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
345 return topology_sane(c, o, "llc");
346
347 return false;
348}
349
350static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
351{
352 if (c->phys_proc_id == o->phys_proc_id)
353 return topology_sane(c, o, "mc");
354
355 return false;
356}
330 357
331void __cpuinit set_cpu_sibling_map(int cpu) 358void __cpuinit set_cpu_sibling_map(int cpu)
332{ 359{
333 int i; 360 bool has_mc = boot_cpu_data.x86_max_cores > 1;
361 bool has_smt = smp_num_siblings > 1;
334 struct cpuinfo_x86 *c = &cpu_data(cpu); 362 struct cpuinfo_x86 *c = &cpu_data(cpu);
363 struct cpuinfo_x86 *o;
364 int i;
335 365
336 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 366 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
337 367
338 if (smp_num_siblings > 1) { 368 if (!has_smt && !has_mc) {
339 for_each_cpu(i, cpu_sibling_setup_mask) {
340 struct cpuinfo_x86 *o = &cpu_data(i);
341
342 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
343 if (c->phys_proc_id == o->phys_proc_id &&
344 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
345 c->compute_unit_id == o->compute_unit_id)
346 link_thread_siblings(cpu, i);
347 } else if (c->phys_proc_id == o->phys_proc_id &&
348 c->cpu_core_id == o->cpu_core_id) {
349 link_thread_siblings(cpu, i);
350 }
351 }
352 } else {
353 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 369 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
354 } 370 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
355 371 cpumask_set_cpu(cpu, cpu_core_mask(cpu));
356 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
357
358 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
359 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
360 c->booted_cores = 1; 372 c->booted_cores = 1;
361 return; 373 return;
362 } 374 }
363 375
364 for_each_cpu(i, cpu_sibling_setup_mask) { 376 for_each_cpu(i, cpu_sibling_setup_mask) {
365 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 377 o = &cpu_data(i);
366 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 378
367 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); 379 if ((i == cpu) || (has_smt && match_smt(c, o)))
368 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); 380 link_mask(sibling, cpu, i);
369 } 381
370 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 382 if ((i == cpu) || (has_mc && match_llc(c, o)))
371 cpumask_set_cpu(i, cpu_core_mask(cpu)); 383 link_mask(llc_shared, cpu, i);
372 cpumask_set_cpu(cpu, cpu_core_mask(i)); 384
385 if ((i == cpu) || (has_mc && match_mc(c, o))) {
386 link_mask(core, cpu, i);
387
373 /* 388 /*
374 * Does this new cpu bringup a new core? 389 * Does this new cpu bringup a new core?
375 */ 390 */
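The rework above replaces the open-coded link_thread_siblings() with a symmetric link_mask() macro plus one predicate per topology level (match_smt/match_llc/match_mc), each passed through topology_sane() so that a supposed sibling on a different NUMA node is warned about and then ignored. A compact user-space sketch of the same pattern, modelling only two of the three levels; the cpuinfo layout and the bitmasks are stand-ins, not kernel types:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_CPUS 8

    struct cpuinfo { int node, pkg, core; };

    static struct cpuinfo cpu[NR_CPUS];
    static unsigned int sibling_mask[NR_CPUS], core_mask[NR_CPUS];

    /* Symmetric link, mirroring link_mask(_m, c1, c2) in the hunk above. */
    #define link_mask(m, c1, c2) do {               \
            m##_mask[c1] |= 1u << (c2);             \
            m##_mask[c2] |= 1u << (c1);             \
    } while (0)

    static bool topology_sane(int c1, int c2, const char *name)
    {
            if (cpu[c1].node != cpu[c2].node) {
                    fprintf(stderr, "%s-siblings %d/%d sit on different nodes, ignoring\n",
                            name, c1, c2);
                    return false;
            }
            return true;
    }

    static bool match_smt(int c1, int c2)
    {
            return cpu[c1].pkg == cpu[c2].pkg && cpu[c1].core == cpu[c2].core &&
                   topology_sane(c1, c2, "smt");
    }

    static bool match_mc(int c1, int c2)
    {
            return cpu[c1].pkg == cpu[c2].pkg && topology_sane(c1, c2, "mc");
    }

    int main(void)
    {
            for (int i = 0; i < NR_CPUS; i++)
                    cpu[i] = (struct cpuinfo){ .node = i / 4, .pkg = i / 4, .core = (i % 4) / 2 };

            for (int c = 0; c < NR_CPUS; c++)
                    for (int i = 0; i <= c; i++) {
                            if (i == c || match_smt(c, i))
                                    link_mask(sibling, c, i);
                            if (i == c || match_mc(c, i))
                                    link_mask(core, c, i);
                    }

            printf("cpu0 siblings: 0x%x, cpu0 core mask: 0x%x\n",
                   sibling_mask[0], core_mask[0]);
            return 0;
    }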
@@ -400,8 +415,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
400 * For perf, we return last level cache shared map. 415 * For perf, we return last level cache shared map.
401 * And for power savings, we return cpu_core_map 416 * And for power savings, we return cpu_core_map
402 */ 417 */
403 if ((sched_mc_power_savings || sched_smt_power_savings) && 418 if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
404 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
405 return cpu_core_mask(cpu); 419 return cpu_core_mask(cpu);
406 else 420 else
407 return cpu_llc_shared_mask(cpu); 421 return cpu_llc_shared_mask(cpu);
@@ -620,22 +634,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
620 return (send_status | accept_status); 634 return (send_status | accept_status);
621} 635}
622 636
623struct create_idle {
624 struct work_struct work;
625 struct task_struct *idle;
626 struct completion done;
627 int cpu;
628};
629
630static void __cpuinit do_fork_idle(struct work_struct *work)
631{
632 struct create_idle *c_idle =
633 container_of(work, struct create_idle, work);
634
635 c_idle->idle = fork_idle(c_idle->cpu);
636 complete(&c_idle->done);
637}
638
639/* reduce the number of lines printed when booting a large cpu count system */ 637/* reduce the number of lines printed when booting a large cpu count system */
640static void __cpuinit announce_cpu(int cpu, int apicid) 638static void __cpuinit announce_cpu(int cpu, int apicid)
641{ 639{
@@ -662,7 +660,7 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
662 * Returns zero if CPU booted OK, else error code from 660 * Returns zero if CPU booted OK, else error code from
663 * ->wakeup_secondary_cpu. 661 * ->wakeup_secondary_cpu.
664 */ 662 */
665static int __cpuinit do_boot_cpu(int apicid, int cpu) 663static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
666{ 664{
667 volatile u32 *trampoline_status = 665 volatile u32 *trampoline_status =
668 (volatile u32 *) __va(real_mode_header->trampoline_status); 666 (volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -671,53 +669,26 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
671 669
672 unsigned long boot_error = 0; 670 unsigned long boot_error = 0;
673 int timeout; 671 int timeout;
674 struct create_idle c_idle = {
675 .cpu = cpu,
676 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
677 };
678
679 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
680 672
681 alternatives_smp_switch(1); 673 alternatives_smp_switch(1);
682 674
683 c_idle.idle = get_idle_for_cpu(cpu); 675 idle->thread.sp = (unsigned long) (((struct pt_regs *)
684 676 (THREAD_SIZE + task_stack_page(idle))) - 1);
685 /* 677 per_cpu(current_task, cpu) = idle;
686 * We can't use kernel_thread since we must avoid to
687 * reschedule the child.
688 */
689 if (c_idle.idle) {
690 c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
691 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
692 init_idle(c_idle.idle, cpu);
693 goto do_rest;
694 }
695
696 schedule_work(&c_idle.work);
697 wait_for_completion(&c_idle.done);
698 678
699 if (IS_ERR(c_idle.idle)) {
700 printk("failed fork for CPU %d\n", cpu);
701 destroy_work_on_stack(&c_idle.work);
702 return PTR_ERR(c_idle.idle);
703 }
704
705 set_idle_for_cpu(cpu, c_idle.idle);
706do_rest:
707 per_cpu(current_task, cpu) = c_idle.idle;
708#ifdef CONFIG_X86_32 679#ifdef CONFIG_X86_32
709 /* Stack for startup_32 can be just as for start_secondary onwards */ 680 /* Stack for startup_32 can be just as for start_secondary onwards */
710 irq_ctx_init(cpu); 681 irq_ctx_init(cpu);
711#else 682#else
712 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 683 clear_tsk_thread_flag(idle, TIF_FORK);
713 initial_gs = per_cpu_offset(cpu); 684 initial_gs = per_cpu_offset(cpu);
714 per_cpu(kernel_stack, cpu) = 685 per_cpu(kernel_stack, cpu) =
715 (unsigned long)task_stack_page(c_idle.idle) - 686 (unsigned long)task_stack_page(idle) -
716 KERNEL_STACK_OFFSET + THREAD_SIZE; 687 KERNEL_STACK_OFFSET + THREAD_SIZE;
717#endif 688#endif
718 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 689 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
719 initial_code = (unsigned long)start_secondary; 690 initial_code = (unsigned long)start_secondary;
720 stack_start = c_idle.idle->thread.sp; 691 stack_start = idle->thread.sp;
721 692
722 /* So we see what's up */ 693 /* So we see what's up */
723 announce_cpu(cpu, apicid); 694 announce_cpu(cpu, apicid);
@@ -815,12 +786,10 @@ do_rest:
815 */ 786 */
816 smpboot_restore_warm_reset_vector(); 787 smpboot_restore_warm_reset_vector();
817 } 788 }
818
819 destroy_work_on_stack(&c_idle.work);
820 return boot_error; 789 return boot_error;
821} 790}
822 791
823int __cpuinit native_cpu_up(unsigned int cpu) 792int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
824{ 793{
825 int apicid = apic->cpu_present_to_apicid(cpu); 794 int apicid = apic->cpu_present_to_apicid(cpu);
826 unsigned long flags; 795 unsigned long flags;
@@ -853,7 +822,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
853 822
854 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 823 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
855 824
856 err = do_boot_cpu(apicid, cpu); 825 err = do_boot_cpu(apicid, cpu, tidle);
857 if (err) { 826 if (err) {
858 pr_debug("do_boot_cpu failed %d\n", err); 827 pr_debug("do_boot_cpu failed %d\n", err);
859 return -EIO; 828 return -EIO;
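With the create_idle/fork_idle workqueue removed, do_boot_cpu() now receives the idle task from the generic hotplug code via native_cpu_up(cpu, tidle) and only has to point the boot stack at it. A tiny sketch of the initial-stack arithmetic used above, with made-up sizes (the real THREAD_SIZE and pt_regs layout differ):

    #include <stdio.h>

    struct pt_regs { unsigned long regs[21]; };   /* stand-in layout */
    #define THREAD_SIZE (16 * 1024UL)

    int main(void)
    {
            unsigned char stack[THREAD_SIZE];     /* task_stack_page(idle) stand-in */
            struct pt_regs *top = (struct pt_regs *)(stack + THREAD_SIZE) - 1;

            /* idle->thread.sp ends up just below a pt_regs carved off the stack top */
            printf("stack base %p, initial sp %p (%zu bytes below the top)\n",
                   (void *)stack, (void *)top, sizeof(struct pt_regs));
            return 0;
    }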
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792a..b79133abda4 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <asm/cacheflush.h> 13#include <asm/cacheflush.h>
14#include <asm/sections.h> 14#include <asm/sections.h>
15#include <asm/asm.h>
15 16
16int rodata_test(void) 17int rodata_test(void)
17{ 18{
@@ -42,14 +43,7 @@ int rodata_test(void)
42 ".section .fixup,\"ax\"\n" 43 ".section .fixup,\"ax\"\n"
43 "2: jmp 1b\n" 44 "2: jmp 1b\n"
44 ".previous\n" 45 ".previous\n"
45 ".section __ex_table,\"a\"\n" 46 _ASM_EXTABLE(0b,2b)
46 " .align 16\n"
47#ifdef CONFIG_X86_32
48 " .long 0b,2b\n"
49#else
50 " .quad 0b,2b\n"
51#endif
52 ".previous"
53 : [rslt] "=r" (result) 47 : [rslt] "=r" (result)
54 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) 48 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
55 ); 49 );
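The open-coded __ex_table fragment (with its 32/64-bit .long/.quad split) is replaced by the _ASM_EXTABLE() helper, which records the same kind of (faulting insn, fixup) pair in the exception table. A user-space analog of what such a table holds and how a fault handler would consult it; nothing below is kernel API and the addresses are invented:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct exception_entry { uintptr_t insn, fixup; };

    static const struct exception_entry ex_table[] = {
            { 0x1000, 0x2000 },     /* "0b" -> "2b" from the hunk above */
    };

    static uintptr_t search_exception_table(uintptr_t fault_ip)
    {
            for (size_t i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
                    if (ex_table[i].insn == fault_ip)
                            return ex_table[i].fixup;
            return 0;                       /* no entry: the fault is fatal */
    }

    int main(void)
    {
            printf("fixup for 0x1000: %#lx\n",
                   (unsigned long)search_exception_table(0x1000));
            return 0;
    }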
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index c6eba2b4267..24d3c91e981 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
14#include <linux/i8253.h> 14#include <linux/i8253.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/mca.h>
18 17
19#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
20#include <asm/x86_init.h> 19#include <asm/x86_init.h>
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc);
58static irqreturn_t timer_interrupt(int irq, void *dev_id) 57static irqreturn_t timer_interrupt(int irq, void *dev_id)
59{ 58{
60 global_clock_event->event_handler(global_clock_event); 59 global_clock_event->event_handler(global_clock_event);
61
62 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
63 if (MCA_bus)
64 outb_p(inb_p(0x61)| 0x80, 0x61);
65
66 return IRQ_HANDLED; 60 return IRQ_HANDLED;
67} 61}
68 62
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f1602..ff08457a025 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,6 @@
37#include <linux/eisa.h> 37#include <linux/eisa.h>
38#endif 38#endif
39 39
40#ifdef CONFIG_MCA
41#include <linux/mca.h>
42#endif
43
44#if defined(CONFIG_EDAC) 40#if defined(CONFIG_EDAC)
45#include <linux/edac.h> 41#include <linux/edac.h>
46#endif 42#endif
@@ -50,6 +46,7 @@
50#include <asm/processor.h> 46#include <asm/processor.h>
51#include <asm/debugreg.h> 47#include <asm/debugreg.h>
52#include <linux/atomic.h> 48#include <linux/atomic.h>
49#include <asm/ftrace.h>
53#include <asm/traps.h> 50#include <asm/traps.h>
54#include <asm/desc.h> 51#include <asm/desc.h>
55#include <asm/i387.h> 52#include <asm/i387.h>
@@ -303,8 +300,13 @@ gp_in_kernel:
303} 300}
304 301
305/* May run on IST stack. */ 302/* May run on IST stack. */
306dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) 303dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
307{ 304{
305#ifdef CONFIG_DYNAMIC_FTRACE
306 /* ftrace must be first, everything else may cause a recursive crash */
307 if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
308 return;
309#endif
308#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 310#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
309 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 311 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
310 SIGTRAP) == NOTIFY_STOP) 312 SIGTRAP) == NOTIFY_STOP)
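The do_int3() change gives ftrace_int3_handler() first look at every breakpoint while modifying_ftrace_code is set, because any later handler (kgdb, kprobes, notifier chains) might itself be traced and recurse. A rough user-space model of that ordering; modifying_code, patch_start and patch_end are invented stand-ins, not the kernel's symbols:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static volatile int modifying_code;            /* like modifying_ftrace_code */
    static uintptr_t patch_start, patch_end;

    static bool patching_int3_handler(uintptr_t *ip)
    {
            if (*ip < patch_start || *ip >= patch_end)
                    return false;
            *ip = patch_end;                       /* skip the site being rewritten */
            return true;
    }

    static void do_int3(uintptr_t *ip)
    {
            /* the patching check runs first: everything below may itself be traced */
            if (modifying_code && patching_int3_handler(ip))
                    return;
            printf("int3 at %#lx handed to the normal debug path\n", (unsigned long)*ip);
    }

    int main(void)
    {
            uintptr_t ip = 0x100;
            modifying_code = 1; patch_start = 0x100; patch_end = 0x105;
            do_int3(&ip);
            printf("ip advanced to %#lx\n", (unsigned long)ip);
            return 0;
    }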
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 00000000000..dc4e910a7d9
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,674 @@
1/*
2 * User-space Probes (UProbes) for x86
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2011
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 */
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/ptrace.h>
26#include <linux/uprobes.h>
27#include <linux/uaccess.h>
28
29#include <linux/kdebug.h>
30#include <asm/processor.h>
31#include <asm/insn.h>
32
33/* Post-execution fixups. */
34
35/* No fixup needed */
36#define UPROBE_FIX_NONE 0x0
37
38/* Adjust IP back to vicinity of actual insn */
39#define UPROBE_FIX_IP 0x1
40
41/* Adjust the return address of a call insn */
42#define UPROBE_FIX_CALL 0x2
43
44#define UPROBE_FIX_RIP_AX 0x8000
45#define UPROBE_FIX_RIP_CX 0x4000
46
47#define UPROBE_TRAP_NR UINT_MAX
48
49/* Adaptations for mhiramat x86 decoder v14. */
50#define OPCODE1(insn) ((insn)->opcode.bytes[0])
51#define OPCODE2(insn) ((insn)->opcode.bytes[1])
52#define OPCODE3(insn) ((insn)->opcode.bytes[2])
53#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
54
55#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
56 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
57 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
58 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
59 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
60 << (row % 32))
61
62/*
63 * Good-instruction tables for 32-bit apps. This is non-const and volatile
64 * to keep gcc from statically optimizing it out, as variable_test_bit makes
65 * some versions of gcc to think only *(unsigned long*) is used.
66 */
67static volatile u32 good_insns_32[256 / 32] = {
68 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
69 /* ---------------------------------------------- */
70 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
71 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
72 W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
73 W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
74 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
75 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
76 W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
77 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
78 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
79 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
80 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
81 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
82 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
83 W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
84 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
85 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
86 /* ---------------------------------------------- */
87 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
88};
89
90/* Using this for both 64-bit and 32-bit apps */
91static volatile u32 good_2byte_insns[256 / 32] = {
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 /* ---------------------------------------------- */
94 W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
95 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
96 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
97 W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
98 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
99 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
100 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
101 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
102 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
103 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
104 W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
105 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
106 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
107 W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
108 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
109 W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
110 /* ---------------------------------------------- */
111 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
112};
113
114#ifdef CONFIG_X86_64
115/* Good-instruction tables for 64-bit apps */
116static volatile u32 good_insns_64[256 / 32] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ---------------------------------------------- */
119 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
120 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
121 W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
122 W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
123 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
124 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
125 W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
126 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
127 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
128 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
129 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
130 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
131 W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
132 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
133 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
134 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
135 /* ---------------------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137};
138#endif
139#undef W
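Each pair of W() rows above packs 32 one-bit "may this opcode be single-stepped out of line?" flags into a single u32, so a lookup is just an array index on the opcode's top three bits and a bit test on its low five. A standalone sketch reusing two rows of the 32-bit table:

    #include <stdio.h>
    #include <stdint.h>

    #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
            (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
              (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
              (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
              (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
             << (row % 32))

    static const uint32_t demo_insns[256 / 32] = {
            /* rows 0x00 and 0x10 copied from good_insns_32 above */
            W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) |
            W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) ,
    };

    static int opcode_ok(uint8_t op)
    {
            return (demo_insns[op / 32] >> (op % 32)) & 1;   /* like test_bit() */
    }

    int main(void)
    {
            printf("0x00 (add):    %d\n", opcode_ok(0x00));  /* 1: allowed       */
            printf("0x0f (escape): %d\n", opcode_ok(0x0f));  /* 0: 2-byte table  */
            printf("0x17 (pop ss): %d\n", opcode_ok(0x17));  /* 0: rejected      */
            return 0;
    }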
140
141/*
142 * opcodes we'll probably never support:
143 *
144 * 6c-6d, e4-e5, ec-ed - in
145 * 6e-6f, e6-e7, ee-ef - out
146 * cc, cd - int3, int
147 * cf - iret
148 * d6 - illegal instruction
149 * f1 - int1/icebp
150 * f4 - hlt
151 * fa, fb - cli, sti
152 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
153 *
154 * invalid opcodes in 64-bit mode:
155 *
156 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
157 * 63 - we support this opcode in x86_64 but not in i386.
158 *
159 * opcodes we may need to refine support for:
160 *
161 * 0f - 2-byte instructions: For many of these instructions, the validity
162 * depends on the prefix and/or the reg field. On such instructions, we
163 * just consider the opcode combination valid if it corresponds to any
164 * valid instruction.
165 *
166 * 8f - Group 1 - only reg = 0 is OK
167 * c6-c7 - Group 11 - only reg = 0 is OK
168 * d9-df - fpu insns with some illegal encodings
169 * f2, f3 - repnz, repz prefixes. These are also the first byte for
170 * certain floating-point instructions, such as addsd.
171 *
172 * fe - Group 4 - only reg = 0 or 1 is OK
173 * ff - Group 5 - only reg = 0-6 is OK
174 *
175 * others -- Do we need to support these?
176 *
177 * 0f - (floating-point?) prefetch instructions
178 * 07, 17, 1f - pop es, pop ss, pop ds
179 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
180 * but 64 and 65 (fs: and gs:) seem to be used, so we support them
181 * 67 - addr16 prefix
182 * ce - into
183 * f0 - lock prefix
184 */
185
186/*
187 * TODO:
188 * - Where necessary, examine the modrm byte and allow only valid instructions
189 * in the different Groups and fpu instructions.
190 */
191
192static bool is_prefix_bad(struct insn *insn)
193{
194 int i;
195
196 for (i = 0; i < insn->prefixes.nbytes; i++) {
197 switch (insn->prefixes.bytes[i]) {
198 case 0x26: /* INAT_PFX_ES */
199 case 0x2E: /* INAT_PFX_CS */
200 case 0x36: /* INAT_PFX_DS */
201 case 0x3E: /* INAT_PFX_SS */
202 case 0xF0: /* INAT_PFX_LOCK */
203 return true;
204 }
205 }
206 return false;
207}
208
209static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
210{
211 insn_init(insn, auprobe->insn, false);
212
213 /* Skip good instruction prefixes; reject "bad" ones. */
214 insn_get_opcode(insn);
215 if (is_prefix_bad(insn))
216 return -ENOTSUPP;
217
218 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
219 return 0;
220
221 if (insn->opcode.nbytes == 2) {
222 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
223 return 0;
224 }
225
226 return -ENOTSUPP;
227}
228
229/*
230 * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
231 * annotate arch_uprobe->fixups accordingly. To start with,
232 * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
233 */
234static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
235{
236 bool fix_ip = true, fix_call = false; /* defaults */
237 int reg;
238
239 insn_get_opcode(insn); /* should be a nop */
240
241 switch (OPCODE1(insn)) {
242 case 0xc3: /* ret/lret */
243 case 0xcb:
244 case 0xc2:
245 case 0xca:
246 /* ip is correct */
247 fix_ip = false;
248 break;
249 case 0xe8: /* call relative - Fix return addr */
250 fix_call = true;
251 break;
252 case 0x9a: /* call absolute - Fix return addr, not ip */
253 fix_call = true;
254 fix_ip = false;
255 break;
256 case 0xff:
257 insn_get_modrm(insn);
258 reg = MODRM_REG(insn);
259 if (reg == 2 || reg == 3) {
260 /* call or lcall, indirect */
261 /* Fix return addr; ip is correct. */
262 fix_call = true;
263 fix_ip = false;
264 } else if (reg == 4 || reg == 5) {
265 /* jmp or ljmp, indirect */
266 /* ip is correct. */
267 fix_ip = false;
268 }
269 break;
270 case 0xea: /* jmp absolute -- ip is correct */
271 fix_ip = false;
272 break;
273 default:
274 break;
275 }
276 if (fix_ip)
277 auprobe->fixups |= UPROBE_FIX_IP;
278 if (fix_call)
279 auprobe->fixups |= UPROBE_FIX_CALL;
280}
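Two concrete cases for prepare_fixups(), with the opcode bytes spelled out (encodings checked by hand, shown as a comment since only the bytes matter):

    /*
     *   push %rax   = 50     -> default path: UPROBE_FIX_IP only; post_xol()
     *                           rewinds ip to just after the original insn.
     *   call *%rax  = ff d0  -> modrm reg field is 2, so UPROBE_FIX_CALL only:
     *                           ip already points where the call landed, but
     *                           the pushed return address must be corrected.
     */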
281
282#ifdef CONFIG_X86_64
283/*
284 * If arch_uprobe->insn doesn't use rip-relative addressing, return
285 * immediately. Otherwise, rewrite the instruction so that it accesses
286 * its memory operand indirectly through a scratch register. Set
287 * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
288 * accordingly. (The contents of the scratch register will be saved
289 * before we single-step the modified instruction, and restored
290 * afterward.)
291 *
292 * We do this because a rip-relative instruction can access only a
293 * relatively small area (+/- 2 GB from the instruction), and the XOL
294 * area typically lies beyond that area. At least for instructions
295 * that store to memory, we can't execute the original instruction
296 * and "fix things up" later, because the misdirected store could be
297 * disastrous.
298 *
299 * Some useful facts about rip-relative instructions:
300 *
301 * - There's always a modrm byte.
302 * - There's never a SIB byte.
303 * - The displacement is always 4 bytes.
304 */
305static void
306handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
307{
308 u8 *cursor;
309 u8 reg;
310
311 if (mm->context.ia32_compat)
312 return;
313
314 auprobe->rip_rela_target_address = 0x0;
315 if (!insn_rip_relative(insn))
316 return;
317
318 /*
319 * insn_rip_relative() would have decoded rex_prefix, modrm.
320 * Clear REX.b bit (extension of MODRM.rm field):
321 * we want to encode rax/rcx, not r8/r9.
322 */
323 if (insn->rex_prefix.nbytes) {
324 cursor = auprobe->insn + insn_offset_rex_prefix(insn);
325 *cursor &= 0xfe; /* Clearing REX.B bit */
326 }
327
328 /*
329 * Point cursor at the modrm byte. The next 4 bytes are the
330 * displacement. Beyond the displacement, for some instructions,
331 * is the immediate operand.
332 */
333 cursor = auprobe->insn + insn_offset_modrm(insn);
334 insn_get_length(insn);
335
336 /*
337 * Convert from rip-relative addressing to indirect addressing
338 * via a scratch register. Change the r/m field from 0x5 (%rip)
339 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
340 */
341 reg = MODRM_REG(insn);
342 if (reg == 0) {
343 /*
344 * The register operand (if any) is either the A register
345 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
346 * REX prefix) %r8. In any case, we know the C register
347 * is NOT the register operand, so we use %rcx (register
348 * #1) for the scratch register.
349 */
350 auprobe->fixups = UPROBE_FIX_RIP_CX;
351 /* Change modrm from 00 000 101 to 00 000 001. */
352 *cursor = 0x1;
353 } else {
354 /* Use %rax (register #0) for the scratch register. */
355 auprobe->fixups = UPROBE_FIX_RIP_AX;
356 /* Change modrm from 00 xxx 101 to 00 xxx 000 */
357 *cursor = (reg << 3);
358 }
359
360 /* Target address = address of next instruction + (signed) offset */
361 auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
362
363 /* Displacement field is gone; slide immediate field (if any) over. */
364 if (insn->immediate.nbytes) {
365 cursor++;
366 memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
367 }
368 return;
369}
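A worked example of the rewrite performed by handle_riprel_insn(), as a comment block (the displacement value is arbitrary):

    /*
     *   probed insn:  mov %eax, 0x1000(%rip)  =  89 05 00 10 00 00
     *                 modrm 0x05: mod=00, reg=000 (%eax), rm=101 (rip)
     *
     * reg == 0, so %rcx becomes the scratch register (UPROBE_FIX_RIP_CX),
     * the modrm r/m field is rewritten to 001 and the displacement dropped:
     *
     *   XOL copy:     mov %eax, (%rcx)        =  89 01
     *
     * rip_rela_target_address = insn length (6) + displacement (0x1000), so
     * pre_xol_rip_insn() loads %rcx with vaddr + 0x1006 before single-stepping,
     * which is exactly the address the original rip-relative access would hit.
     */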
370
371static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
372{
373 insn_init(insn, auprobe->insn, true);
374
375 /* Skip good instruction prefixes; reject "bad" ones. */
376 insn_get_opcode(insn);
377 if (is_prefix_bad(insn))
378 return -ENOTSUPP;
379
380 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
381 return 0;
382
383 if (insn->opcode.nbytes == 2) {
384 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
385 return 0;
386 }
387 return -ENOTSUPP;
388}
389
390static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
391{
392 if (mm->context.ia32_compat)
393 return validate_insn_32bits(auprobe, insn);
394 return validate_insn_64bits(auprobe, insn);
395}
396#else /* 32-bit: */
397static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
398{
399 /* No RIP-relative addressing on 32-bit */
400}
401
402static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
403{
404 return validate_insn_32bits(auprobe, insn);
405}
406#endif /* CONFIG_X86_64 */
407
408/**
409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
410 * @auprobe: the probepoint information.
411 * @mm: the probed address space.
412 * Return 0 on success or a -ve number on error.
413 */
414int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
415{
416 int ret;
417 struct insn insn;
418
419 auprobe->fixups = 0;
420 ret = validate_insn_bits(auprobe, mm, &insn);
421 if (ret != 0)
422 return ret;
423
424 handle_riprel_insn(auprobe, mm, &insn);
425 prepare_fixups(auprobe, &insn);
426
427 return 0;
428}
429
430#ifdef CONFIG_X86_64
431/*
432 * If we're emulating a rip-relative instruction, save the contents
433 * of the scratch register and store the target address in that register.
434 */
435static void
436pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
437 struct arch_uprobe_task *autask)
438{
439 if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
440 autask->saved_scratch_register = regs->ax;
441 regs->ax = current->utask->vaddr;
442 regs->ax += auprobe->rip_rela_target_address;
443 } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
444 autask->saved_scratch_register = regs->cx;
445 regs->cx = current->utask->vaddr;
446 regs->cx += auprobe->rip_rela_target_address;
447 }
448}
449#else
450static void
451pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
452 struct arch_uprobe_task *autask)
453{
454 /* No RIP-relative addressing on 32-bit */
455}
456#endif
457
458/*
459 * arch_uprobe_pre_xol - prepare to execute out of line.
460 * @auprobe: the probepoint information.
461 * @regs: reflects the saved user state of current task.
462 */
463int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
464{
465 struct arch_uprobe_task *autask;
466
467 autask = &current->utask->autask;
468 autask->saved_trap_nr = current->thread.trap_nr;
469 current->thread.trap_nr = UPROBE_TRAP_NR;
470 regs->ip = current->utask->xol_vaddr;
471 pre_xol_rip_insn(auprobe, regs, autask);
472
473 return 0;
474}
475
476/*
477 * This function is called by arch_uprobe_post_xol() to adjust the return
478 * address pushed by a call instruction executed out of line.
479 */
480static int adjust_ret_addr(unsigned long sp, long correction)
481{
482 int rasize, ncopied;
483 long ra = 0;
484
485 if (is_ia32_task())
486 rasize = 4;
487 else
488 rasize = 8;
489
490 ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
491 if (unlikely(ncopied))
492 return -EFAULT;
493
494 ra += correction;
495 ncopied = copy_to_user((void __user *)sp, &ra, rasize);
496 if (unlikely(ncopied))
497 return -EFAULT;
498
499 return 0;
500}
501
502#ifdef CONFIG_X86_64
503static bool is_riprel_insn(struct arch_uprobe *auprobe)
504{
505 return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
506}
507
508static void
509handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
510{
511 if (is_riprel_insn(auprobe)) {
512 struct arch_uprobe_task *autask;
513
514 autask = &current->utask->autask;
515 if (auprobe->fixups & UPROBE_FIX_RIP_AX)
516 regs->ax = autask->saved_scratch_register;
517 else
518 regs->cx = autask->saved_scratch_register;
519
520 /*
521 * The original instruction includes a displacement, and so
522 * is 4 bytes longer than what we've just single-stepped.
523 * Fall through to handle stuff like "jmpq *...(%rip)" and
524 * "callq *...(%rip)".
525 */
526 if (correction)
527 *correction += 4;
528 }
529}
530#else
531static void
532handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
533{
534 /* No RIP-relative addressing on 32-bit */
535}
536#endif
537
538/*
539 * If the xol insn itself traps and generates a signal (say,
540 * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
541 * instruction jumps back to its own address. It is assumed that anything
542 * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
543 *
544 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
545 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
546 * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
547 */
548bool arch_uprobe_xol_was_trapped(struct task_struct *t)
549{
550 if (t->thread.trap_nr != UPROBE_TRAP_NR)
551 return true;
552
553 return false;
554}
555
556/*
557 * Called after single-stepping. To avoid the SMP problems that can
558 * occur when we temporarily put back the original opcode to
559 * single-step, we single-stepped a copy of the instruction.
560 *
561 * This function prepares to resume execution after the single-step.
562 * We have to fix things up as follows:
563 *
564 * Typically, the new ip is relative to the copied instruction. We need
565 * to make it relative to the original instruction (FIX_IP). Exceptions
566 * are return instructions and absolute or indirect jump or call instructions.
567 *
568 * If the single-stepped instruction was a call, the return address that
569 * is atop the stack is the address following the copied instruction. We
570 * need to make it the address following the original instruction (FIX_CALL).
571 *
572 * If the original instruction was a rip-relative instruction such as
573 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
574 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
575 * We need to restore the contents of the scratch register and adjust
576 * the ip, keeping in mind that the instruction we executed is 4 bytes
577 * shorter than the original instruction (since we squeezed out the offset
578 * field). (FIX_RIP_AX or FIX_RIP_CX)
579 */
580int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
581{
582 struct uprobe_task *utask;
583 long correction;
584 int result = 0;
585
586 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
587
588 utask = current->utask;
589 current->thread.trap_nr = utask->autask.saved_trap_nr;
590 correction = (long)(utask->vaddr - utask->xol_vaddr);
591 handle_riprel_post_xol(auprobe, regs, &correction);
592 if (auprobe->fixups & UPROBE_FIX_IP)
593 regs->ip += correction;
594
595 if (auprobe->fixups & UPROBE_FIX_CALL)
596 result = adjust_ret_addr(regs->sp, correction);
597
598 return result;
599}
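A numeric sketch of the FIX_IP arithmetic above for a rip-relative instruction whose XOL copy is 4 bytes shorter; all addresses are invented:

    #include <stdio.h>

    int main(void)
    {
            unsigned long vaddr     = 0x400100;   /* probed address             */
            unsigned long xol_vaddr = 0x7f0000;   /* slot the copy ran from     */
            unsigned long orig_len  = 6;          /* original rip-relative insn */
            unsigned long xol_len   = 2;          /* copy is 4 bytes shorter    */

            long correction = (long)(vaddr - xol_vaddr);
            unsigned long ip = xol_vaddr + xol_len;       /* ip after the step  */

            correction += 4;                      /* handle_riprel_post_xol()   */
            ip += correction;                     /* UPROBE_FIX_IP              */

            printf("resumed ip = %#lx (expected %#lx)\n", ip, vaddr + orig_len);
            return 0;
    }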
600
601/* callback routine for handling exceptions. */
602int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
603{
604 struct die_args *args = data;
605 struct pt_regs *regs = args->regs;
606 int ret = NOTIFY_DONE;
607
608 /* We are only interested in userspace traps */
609 if (regs && !user_mode_vm(regs))
610 return NOTIFY_DONE;
611
612 switch (val) {
613 case DIE_INT3:
614 if (uprobe_pre_sstep_notifier(regs))
615 ret = NOTIFY_STOP;
616
617 break;
618
619 case DIE_DEBUG:
620 if (uprobe_post_sstep_notifier(regs))
621 ret = NOTIFY_STOP;
622
623 default:
624 break;
625 }
626
627 return ret;
628}
629
630/*
631 * This function gets called when XOL instruction either gets trapped or
632 * the thread has a fatal signal, so reset the instruction pointer to its
633 * probed address.
634 */
635void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
636{
637 struct uprobe_task *utask = current->utask;
638
639 current->thread.trap_nr = utask->autask.saved_trap_nr;
640 handle_riprel_post_xol(auprobe, regs, NULL);
641 instruction_pointer_set(regs, utask->vaddr);
642}
643
644/*
645 * Skip these instructions as per the currently known x86 ISA.
646 * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
647 */
648bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
649{
650 int i;
651
652 for (i = 0; i < MAX_UINSN_BYTES; i++) {
653 if ((auprobe->insn[i] == 0x66))
654 continue;
655
656 if (auprobe->insn[i] == 0x90)
657 return true;
658
659 if (i == (MAX_UINSN_BYTES - 1))
660 break;
661
662 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f))
663 return true;
664
665 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19))
666 return true;
667
668 if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0))
669 return true;
670
671 break;
672 }
673 return false;
674}
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd48..8eeb55a551b 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18#include <linux/smp.h>
18 19
19#include <asm/apic.h> 20#include <asm/apic.h>
20#include <asm/pci-direct.h> 21#include <asm/pci-direct.h>
@@ -22,6 +23,8 @@
22#include <asm/paravirt.h> 23#include <asm/paravirt.h>
23#include <asm/setup.h> 24#include <asm/setup.h>
24 25
26#define TOPOLOGY_REGISTER_OFFSET 0x10
27
25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT 28#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
26/* 29/*
27 * Interrupt control on vSMPowered systems: 30 * Interrupt control on vSMPowered systems:
@@ -149,12 +152,49 @@ int is_vsmp_box(void)
149 return 0; 152 return 0;
150} 153}
151#endif 154#endif
155
156static void __init vsmp_cap_cpus(void)
157{
158#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
159 void __iomem *address;
160 unsigned int cfg, topology, node_shift, maxcpus;
161
162 /*
163 * CONFIG_X86_VSMP is not configured, so limit the number of CPUs to the
164 * ones present in the first board, unless explicitly overridden by
165 * setup_max_cpus
166 */
167 if (setup_max_cpus != NR_CPUS)
168 return;
169
170 /* Read the vSMP Foundation topology register */
171 cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
172 address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
173 if (WARN_ON(!address))
174 return;
175
176 topology = readl(address);
177 node_shift = (topology >> 16) & 0x7;
178 if (!node_shift)
179 /* The value 0 should be decoded as 8 */
180 node_shift = 8;
181 maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
182
183 pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
184 maxcpus);
185 setup_max_cpus = maxcpus;
186 early_iounmap(address, 4);
187#endif
188}
189
152void __init vsmp_init(void) 190void __init vsmp_init(void)
153{ 191{
154 detect_vsmp_box(); 192 detect_vsmp_box();
155 if (!is_vsmp_box()) 193 if (!is_vsmp_box())
156 return; 194 return;
157 195
196 vsmp_cap_cpus();
197
158 set_vsmp_pv_ops(); 198 set_vsmp_pv_ops();
159 return; 199 return;
160} 200}
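The new vsmp_cap_cpus() derives the CPU cap from the vSMP Foundation topology register: bits 18:16 hold node_shift (with 0 decoding as 8) and the low node_shift bits hold the first board's CPU count minus one. A small decoding sketch with an invented register value:

    #include <stdio.h>

    int main(void)
    {
            unsigned int topology = 0x0003000b;          /* hypothetical readl() */
            unsigned int node_shift = (topology >> 16) & 0x7;

            if (!node_shift)
                    node_shift = 8;                      /* the value 0 means 8 */

            unsigned int maxcpus = (topology & ((1u << node_shift) - 1)) + 1;

            printf("node_shift=%u -> cap setup_max_cpus at %u\n", node_shift, maxcpus);
            return 0;
    }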
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9cf71d0b2d3..35c5e543f55 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
18#include <asm/e820.h> 18#include <asm/e820.h>
19#include <asm/time.h> 19#include <asm/time.h>
20#include <asm/irq.h> 20#include <asm/irq.h>
21#include <asm/io_apic.h>
21#include <asm/pat.h> 22#include <asm/pat.h>
22#include <asm/tsc.h> 23#include <asm/tsc.h>
23#include <asm/iommu.h> 24#include <asm/iommu.h>
@@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = {
119 .teardown_msi_irqs = default_teardown_msi_irqs, 120 .teardown_msi_irqs = default_teardown_msi_irqs,
120 .restore_msi_irqs = default_restore_msi_irqs, 121 .restore_msi_irqs = default_restore_msi_irqs,
121}; 122};
123
124struct x86_io_apic_ops x86_io_apic_ops = {
125 .init = native_io_apic_init_mappings,
126 .read = native_io_apic_read,
127 .write = native_io_apic_write,
128 .modify = native_io_apic_modify,
129};
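The new x86_io_apic_ops table gives platforms a single place to override the native IO-APIC accessors. A user-space analog of that indirection; the pv_read() override here is purely illustrative and not part of this commit:

    #include <stdio.h>

    struct io_apic_ops {
            unsigned int (*read)(unsigned int apic, unsigned int reg);
            void (*write)(unsigned int apic, unsigned int reg, unsigned int val);
    };

    static unsigned int native_read(unsigned int apic, unsigned int reg)
    {
            printf("native read  apic=%u reg=%#x\n", apic, reg);
            return 0;
    }

    static void native_write(unsigned int apic, unsigned int reg, unsigned int val)
    {
            printf("native write apic=%u reg=%#x val=%#x\n", apic, reg, val);
    }

    static unsigned int pv_read(unsigned int apic, unsigned int reg)
    {
            printf("paravirt read apic=%u reg=%#x\n", apic, reg);
            return 0;
    }

    static struct io_apic_ops io_apic_ops = {
            .read  = native_read,
            .write = native_write,
    };

    int main(void)
    {
            io_apic_ops.read(0, 0x10);          /* default: native accessor */
            io_apic_ops.read = pv_read;         /* platform override        */
            io_apic_ops.read(0, 0x10);
            return 0;
    }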
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index e62728e30b0..bd18149b2b0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk)
48 if (!fx) 48 if (!fx)
49 return; 49 return;
50 50
51 BUG_ON(__thread_has_fpu(tsk));
52
53 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 51 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
54 52
55 /* 53 /*