Diffstat (limited to 'arch/x86/kernel')
73 files changed, 2697 insertions, 1826 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 08484332f32..8215e5652d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
@@ -47,7 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += cpu/
 obj-y += acpi/
 obj-y += reboot.o
-obj-$(CONFIG_MCA) += mca_32.o
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
 obj-$(CONFIG_PCI) += early-quirks.o
@@ -99,6 +98,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
 obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a415b1f4436..8afb6931981 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
@@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	int i;
 	struct mpc_intsrc mp_irq;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
 	/*
 	 * Fabricate the legacy ISA bus (bus #31).
 	 */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index edc24480469..39a222e094a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
+#include <asm/irq_remapping.h>
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
@@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)
 				acked);
 			break;
 		}
-		if (cpu_has_tsc) {
-			rdtscll(ntsc);
-			max_loops = (cpu_khz << 10) - (ntsc - tsc);
-		} else
-			max_loops--;
+		if (queued) {
+			if (cpu_has_tsc) {
+				rdtscll(ntsc);
+				max_loops = (cpu_khz << 10) - (ntsc - tsc);
+			} else
+				max_loops--;
+		}
 	} while (queued && max_loops > 0);
 	WARN_ON(max_loops <= 0);
 
@@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void)
 	 * Now that local APIC setup is completed for BP, configure the fault
 	 * handling for interrupt remapping.
 	 */
-	if (intr_remapping_enabled)
-		enable_drhd_fault_handling();
+	if (irq_remapping_enabled)
+		irq_remap_enable_fault_handling();
 
 }
 
@@ -1517,7 +1520,7 @@ void enable_x2apic(void)
 int __init enable_IR(void)
 {
 #ifdef CONFIG_IRQ_REMAP
-	if (!intr_remapping_supported()) {
+	if (!irq_remapping_supported()) {
 		pr_debug("intr-remapping not supported\n");
 		return -1;
 	}
@@ -1528,7 +1531,7 @@ int __init enable_IR(void)
 		return -1;
 	}
 
-	return enable_intr_remapping();
+	return irq_remapping_enable();
 #endif
 	return -1;
 }
@@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void)
 {
 	unsigned long flags;
 	int ret, x2apic_enabled = 0;
-	int dmar_table_init_ret;
+	int hardware_init_ret;
+
+	/* Make sure irq_remap_ops are initialized */
+	setup_irq_remapping_ops();
 
-	dmar_table_init_ret = dmar_table_init();
-	if (dmar_table_init_ret && !x2apic_supported())
+	hardware_init_ret = irq_remapping_prepare();
+	if (hardware_init_ret && !x2apic_supported())
 		return;
 
 	ret = save_ioapic_entries();
@@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void)
 	if (x2apic_preenabled && nox2apic)
 		disable_x2apic();
 
-	if (dmar_table_init_ret)
+	if (hardware_init_ret)
 		ret = -1;
 	else
 		ret = enable_IR();
@@ -2176,8 +2182,8 @@ static int lapic_suspend(void)
 	local_irq_save(flags);
 	disable_local_APIC();
 
-	if (intr_remapping_enabled)
-		disable_intr_remapping();
+	if (irq_remapping_enabled)
+		irq_remapping_disable();
 
 	local_irq_restore(flags);
 	return 0;
@@ -2193,7 +2199,7 @@ static void lapic_resume(void)
 		return;
 
 	local_irq_save(flags);
-	if (intr_remapping_enabled) {
+	if (irq_remapping_enabled) {
 		/*
 		 * IO-APIC and PIC have their own resume routines.
 		 * We just mask them here to make sure the interrupt
@@ -2245,8 +2251,8 @@ static void lapic_resume(void)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-	if (intr_remapping_enabled)
-		reenable_intr_remapping(x2apic_mode);
+	if (irq_remapping_enabled)
+		irq_remapping_reenable(x2apic_mode);
 
 	local_irq_restore(flags);
 }
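The apic.c hunks above replace the Intel-DMAR-specific calls (dmar_table_init(), enable_intr_remapping(), enable_drhd_fault_handling()) with vendor-neutral irq_remapping_*() entry points, and setup_irq_remapping_ops() selects a backend before irq_remapping_prepare() touches the hardware. A minimal, purely illustrative sketch of the ops-table indirection this implies; the struct and function names below are hypothetical, not copied from the remapping driver:

	/* Illustrative only: ops-table indirection behind irq_remapping_*() */
	struct irq_remap_ops {
		int (*prepare)(void);	/* probe hardware tables, e.g. DMAR */
		int (*enable)(void);	/* switch interrupt remapping on */
	};

	static int intel_prepare(void) { return 0; }	/* stand-in backend */
	static int intel_enable(void)  { return 0; }

	static struct irq_remap_ops intel_ops = {
		.prepare = intel_prepare,
		.enable  = intel_enable,
	};

	static struct irq_remap_ops *remap_ops;

	static void setup_irq_remapping_ops_sketch(void)
	{
		remap_ops = &intel_ops;	/* a real kernel would pick a backend here */
	}

	static int irq_remapping_prepare_sketch(void)
	{
		/* callers only see a generic entry point, never the backend */
		return (remap_ops && remap_ops->prepare) ? remap_ops->prepare() : -1;
	}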
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 359b6899a36..0e881c46e8c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -227,6 +227,7 @@ static struct apic apic_flat = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
@@ -386,6 +387,7 @@ static struct apic apic_physflat = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 634ae6cdd5c..a6e4c6e06c0 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -181,6 +181,7 @@ struct apic apic_noop = {
 
 	.read = noop_apic_read,
 	.write = noop_apic_write,
+	.eoi_write = noop_apic_write,
 	.icr_read = noop_apic_icr_read,
 	.icr_write = noop_apic_icr_write,
 	.wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 23e75422e01..6ec6d5d297c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0cdec7065af..31fbdbfbf96 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e42d1d3b913..db4ab1be3c7 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
@@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80..ac96561d1a9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,23 +68,21 @@
 #define for_each_irq_pin(entry, head) \
 	for (entry = head; entry; entry = entry->next)
 
-static void __init __ioapic_init_mappings(void);
-
-static unsigned int __io_apic_read (unsigned int apic, unsigned int reg);
-static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
-
-static struct io_apic_ops io_apic_ops = {
-	.init = __ioapic_init_mappings,
-	.read = __io_apic_read,
-	.write = __io_apic_write,
-	.modify = __io_apic_modify,
-};
-
-void __init set_io_apic_ops(const struct io_apic_ops *ops)
-{
-	io_apic_ops = *ops;
-}
+#ifdef CONFIG_IRQ_REMAP
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+	return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+	return false;
+}
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+}
+#endif
 
 /*
  * Is the SiS APIC rmw bug present ?
@@ -142,7 +140,7 @@ int mp_irq_entries;
 /* GSI interrupts */
 static int nr_irqs_gsi = NR_IRQS_LEGACY;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
 
@@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 	irq_free_desc(at);
 }
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-	return io_apic_ops.read(apic, reg);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	io_apic_ops.write(apic, reg, value);
-}
-
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	io_apic_ops.modify(apic, reg, value);
-}
-
 
 struct io_apic {
 	unsigned int index;
@@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 	writel(vector, &io_apic->eoi);
 }
 
-static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	return readl(&io_apic->data);
 }
 
-static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
  *
  * Older SiS APIC requires we rewrite the index register
  */
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
-	struct irq_pin_list *entry;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-		int pin;
-
-		pin = entry->pin;
-		reg = io_apic_read(entry->apic, 0x10 + pin*2);
-		/* Is the remote IRR bit set? */
-		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
-			raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-			return true;
-		}
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	return false;
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -875,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	return -1;
 }
 
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 /*
  * EISA Edge/Level control register, ELCR
 */
@@ -912,12 +872,6 @@ static int EISA_ELCR(unsigned int irq)
 #define default_PCI_trigger(idx) (1)
 #define default_PCI_polarity(idx) (1)
 
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
 static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
@@ -975,7 +929,7 @@ static int irq_trigger(int idx)
 		trigger = default_ISA_trigger(idx);
 	else
 		trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 	switch (mp_bus_id_to_type[bus]) {
 	case MP_BUS_ISA: /* ISA pin */
 	{
@@ -992,11 +946,6 @@ static int irq_trigger(int idx)
 		/* set before the switch */
 		break;
 	}
-	case MP_BUS_MCA: /* MCA pin */
-	{
-		trigger = default_MCA_trigger(idx);
-		break;
-	}
 	default:
 	{
 		printk(KERN_WARNING "broken BIOS!!\n");
@@ -1361,77 +1310,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
 				      fasteoi ? "fasteoi" : "edge");
 }
 
-
-static int setup_ir_ioapic_entry(int irq,
-			      struct IR_IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	int index;
-	struct irte irte;
-	int ioapic_id = mpc_ioapic_id(attr->ioapic);
-	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
-	if (!iommu) {
-		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
-		return -ENODEV;
-	}
-
-	index = alloc_irte(iommu, irq, 1);
-	if (index < 0) {
-		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
-		return -ENOMEM;
-	}
-
-	prepare_irte(&irte, vector, destination);
-
-	/* Set source-id of interrupt request */
-	set_ioapic_sid(&irte, ioapic_id);
-
-	modify_irte(irq, &irte);
-
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
-		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
-		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
-		"Avail:%X Vector:%02X Dest:%08X "
-		"SID:%04X SQ:%X SVT:%X)\n",
-		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
-		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
-		irte.avail, irte.vector, irte.dest_id,
-		irte.sid, irte.sq, irte.svt);
-
-	memset(entry, 0, sizeof(*entry));
-
-	entry->index2 = (index >> 15) & 0x1;
-	entry->zero = 0;
-	entry->format = 1;
-	entry->index = (index & 0x7fff);
-	/*
-	 * IO-APIC RTE will be configured with virtual vector.
-	 * irq handler will do the explicit EOI to the io-apic.
-	 */
-	entry->vector = attr->ioapic_pin;
-	entry->mask = 0;	/* enable IRQ */
-	entry->trigger = attr->trigger;
-	entry->polarity = attr->polarity;
-
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
 static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
 			      unsigned int destination, int vector,
 			      struct io_apic_irq_attr *attr)
 {
-	if (intr_remapping_enabled)
-		return setup_ir_ioapic_entry(irq,
-			 (struct IR_IO_APIC_route_entry *)entry,
-			 destination, vector, attr);
+	if (irq_remapping_enabled)
+		return setup_ioapic_remapped_entry(irq, entry, destination,
+						   vector, attr);
 
 	memset(entry, 0, sizeof(*entry));
 
@@ -1588,7 +1473,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
 {
 	struct IO_APIC_route_entry entry;
 
-	if (intr_remapping_enabled)
+	if (irq_remapping_enabled)
 		return;
 
 	memset(&entry, 0, sizeof(entry));
@@ -1674,7 +1559,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
-	if (intr_remapping_enabled) {
+	if (irq_remapping_enabled) {
 		printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
 			" Pol Stat Indx2 Zero Vect:\n");
 	} else {
@@ -1683,7 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 	}
 
 	for (i = 0; i <= reg_01.bits.entries; i++) {
-		if (intr_remapping_enabled) {
+		if (irq_remapping_enabled) {
 			struct IO_APIC_route_entry entry;
 			struct IR_IO_APIC_route_entry *ir_entry;
 
@@ -2050,7 +1935,7 @@ void disable_IO_APIC(void)
 	 * IOAPIC RTE as well as interrupt-remapping table entry).
 	 * As this gets called during crash dump, keep this simple for now.
 	 */
-	if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+	if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
@@ -2074,7 +1959,7 @@ void disable_IO_APIC(void)
 	 * Use virtual wire A mode when interrupt remapping is enabled.
 	 */
 	if (cpu_has_apic || apic_from_smp_config())
-		disconnect_bsp_APIC(!intr_remapping_enabled &&
+		disconnect_bsp_APIC(!irq_remapping_enabled &&
 				ioapic_i8259.pin != -1);
 }
 
@@ -2390,71 +2275,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	return ret;
 }
 
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		       bool force)
-{
-	struct irq_cfg *cfg = data->chip_data;
-	unsigned int dest, irq = data->irq;
-	struct irte irte;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
-
-	if (get_irte(irq, &irte))
-		return -EBUSY;
-
-	if (assign_irq_vector(irq, cfg, mask))
-		return -EBUSY;
-
-	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
-	irte.vector = cfg->vector;
-	irte.dest_id = IRTE_DEST(dest);
-
-	/*
-	 * Atomically updates the IRTE with the new destination, vector
-	 * and flushes the interrupt entry cache.
-	 */
-	modify_irte(irq, &irte);
-
-	/*
-	 * After this point, all the interrupts will start arriving
-	 * at the new destination. So, time to cleanup the previous
-	 * vector allocation.
-	 */
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	cpumask_copy(data->affinity, mask);
-	return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		       bool force)
-{
-	return 0;
-}
-#endif
-
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
@@ -2552,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data)
 atomic_t irq_mis_count;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
+		unsigned int reg;
+		int pin;
+
+		pin = entry->pin;
+		reg = io_apic_read(entry->apic, 0x10 + pin*2);
+		/* Is the remote IRR bit set? */
+		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+			raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+			return true;
+		}
+	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return false;
+}
+
 static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
 {
 	/* If we are moving the irq we need to mask it */
@@ -2699,7 +2542,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
 	chip->irq_eoi = ir_ack_apic_level;
 
 #ifdef CONFIG_SMP
-	chip->irq_set_affinity = ir_ioapic_set_affinity;
+	chip->irq_set_affinity = set_remapped_irq_affinity;
 #endif
 }
 #endif /* CONFIG_IRQ_REMAP */
@@ -2912,7 +2755,7 @@ static inline void __init check_timer(void)
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
-		if (intr_remapping_enabled)
+		if (irq_remapping_enabled)
 			panic("BIOS bug: timer not connected to IO-APIC");
 		pin1 = pin2;
 		apic1 = apic2;
@@ -2945,7 +2788,7 @@ static inline void __init check_timer(void)
 			clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
-		if (intr_remapping_enabled)
+		if (irq_remapping_enabled)
 			panic("timer doesn't work through Interrupt-remapped IO-APIC");
 		local_irq_disable();
 		clear_IO_APIC_pin(apic1, pin1);
@@ -3169,7 +3012,7 @@ void destroy_irq(unsigned int irq)
 	irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
 
 	if (irq_remapped(cfg))
-		free_irte(irq);
+		free_remapped_irq(irq);
 	raw_spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq, cfg);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3198,54 +3041,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
 	if (irq_remapped(cfg)) {
-		struct irte irte;
-		int ir_index;
-		u16 sub_handle;
-
-		ir_index = map_irq_to_irte_handle(irq, &sub_handle);
-		BUG_ON(ir_index == -1);
-
-		prepare_irte(&irte, cfg->vector, dest);
-
-		/* Set source-id of interrupt request */
-		if (pdev)
-			set_msi_sid(&irte, pdev);
-		else
-			set_hpet_sid(&irte, hpet_id);
-
-		modify_irte(irq, &irte);
+		compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
+		return err;
+	}
 
+	if (x2apic_enabled())
+		msg->address_hi = MSI_ADDR_BASE_HI |
+				  MSI_ADDR_EXT_DEST_ID(dest);
+	else
 		msg->address_hi = MSI_ADDR_BASE_HI;
-		msg->data = sub_handle;
-		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
-				  MSI_ADDR_IR_SHV |
-				  MSI_ADDR_IR_INDEX1(ir_index) |
-				  MSI_ADDR_IR_INDEX2(ir_index);
-	} else {
-		if (x2apic_enabled())
-			msg->address_hi = MSI_ADDR_BASE_HI |
-					  MSI_ADDR_EXT_DEST_ID(dest);
-		else
-			msg->address_hi = MSI_ADDR_BASE_HI;
 
-		msg->address_lo =
-			MSI_ADDR_BASE_LO |
-			((apic->irq_dest_mode == 0) ?
-				MSI_ADDR_DEST_MODE_PHYSICAL:
-				MSI_ADDR_DEST_MODE_LOGICAL) |
-			((apic->irq_delivery_mode != dest_LowestPrio) ?
-				MSI_ADDR_REDIRECTION_CPU:
-				MSI_ADDR_REDIRECTION_LOWPRI) |
-			MSI_ADDR_DEST_ID(dest);
+	msg->address_lo =
+		MSI_ADDR_BASE_LO |
+		((apic->irq_dest_mode == 0) ?
+			MSI_ADDR_DEST_MODE_PHYSICAL:
+			MSI_ADDR_DEST_MODE_LOGICAL) |
+		((apic->irq_delivery_mode != dest_LowestPrio) ?
+			MSI_ADDR_REDIRECTION_CPU:
+			MSI_ADDR_REDIRECTION_LOWPRI) |
+		MSI_ADDR_DEST_ID(dest);
+
+	msg->data =
+		MSI_DATA_TRIGGER_EDGE |
+		MSI_DATA_LEVEL_ASSERT |
+		((apic->irq_delivery_mode != dest_LowestPrio) ?
+			MSI_DATA_DELIVERY_FIXED:
+			MSI_DATA_DELIVERY_LOWPRI) |
+		MSI_DATA_VECTOR(cfg->vector);
 
-		msg->data =
-			MSI_DATA_TRIGGER_EDGE |
-			MSI_DATA_LEVEL_ASSERT |
-			((apic->irq_delivery_mode != dest_LowestPrio) ?
-				MSI_DATA_DELIVERY_FIXED:
-				MSI_DATA_DELIVERY_LOWPRI) |
-			MSI_DATA_VECTOR(cfg->vector);
-	}
 	return err;
 }
 
@@ -3288,33 +3111,6 @@ static struct irq_chip msi_chip = {
 	.irq_retrigger = ioapic_retrigger_irq,
 };
 
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
-	struct intel_iommu *iommu;
-	int index;
-
-	iommu = map_dev_to_ir(dev);
-	if (!iommu) {
-		printk(KERN_ERR
-		       "Unable to map PCI %s to iommu\n", pci_name(dev));
-		return -ENOENT;
-	}
-
-	index = alloc_irte(iommu, irq, nvec);
-	if (index < 0) {
-		printk(KERN_ERR
-		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
-		       pci_name(dev));
-		return -ENOSPC;
-	}
-	return index;
-}
-
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	struct irq_chip *chip = &msi_chip;
@@ -3345,7 +3141,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int node, ret, sub_handle, index = 0;
 	unsigned int irq, irq_want;
 	struct msi_desc *msidesc;
-	struct intel_iommu *iommu = NULL;
 
 	/* x86 doesn't support multiple MSI yet */
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3359,7 +3154,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
-		if (!intr_remapping_enabled)
+		if (!irq_remapping_enabled)
 			goto no_ir;
 
 		if (!sub_handle) {
@@ -3367,23 +3162,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			 * allocate the consecutive block of IRTE's
 			 * for 'nvec'
 			 */
-			index = msi_alloc_irte(dev, irq, nvec);
+			index = msi_alloc_remapped_irq(dev, irq, nvec);
 			if (index < 0) {
 				ret = index;
 				goto error;
 			}
 		} else {
-			iommu = map_dev_to_ir(dev);
-			if (!iommu) {
-				ret = -ENOENT;
+			ret = msi_setup_remapped_irq(dev, irq, index,
+						     sub_handle);
+			if (ret < 0)
 				goto error;
-			}
-			/*
-			 * setup the mapping between the irq and the IRTE
-			 * base index, the sub_handle pointing to the
-			 * appropriate interrupt remap table entry.
-			 */
-			set_irte_irq(irq, iommu, index, sub_handle);
 		}
 no_ir:
 	ret = setup_msi_irq(dev, msidesc, irq);
@@ -3501,15 +3289,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	struct msi_msg msg;
 	int ret;
 
-	if (intr_remapping_enabled) {
-		struct intel_iommu *iommu = map_hpet_to_ir(id);
-		int index;
-
-		if (!iommu)
-			return -1;
-
-		index = alloc_irte(iommu, irq, 1);
-		if (index < 0)
+	if (irq_remapping_enabled) {
+		if (!setup_hpet_msi_remapped(irq, id))
 			return -1;
 	}
 
@@ -3888,8 +3669,8 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();
 
-		if (intr_remapping_enabled)
-			ir_ioapic_set_affinity(idata, mask, false);
+		if (irq_remapping_enabled)
+			set_remapped_irq_affinity(idata, mask, false);
 		else
 			ioapic_set_affinity(idata, mask, false);
 	}
@@ -3931,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 	return res;
 }
 
-void __init ioapic_and_gsi_init(void)
-{
-	io_apic_ops.init();
-}
-
-static void __init __ioapic_init_mappings(void)
+void __init native_io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
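Two themes run through the io_apic.c patch: MCA bus handling is deleted outright, and everything IRTE-specific (setup_ir_ioapic_entry(), ir_ioapic_set_affinity(), msi_alloc_irte()) moves behind generic remapping helpers, while the raw accessors are exported as native_io_apic_{read,write,modify}() so another layer can substitute its own. The relocated io_apic_level_ack_pending() relies on the standard IO-APIC programming model: pin N's 64-bit redirection entry sits at indirect-register indices 0x10 + 2*N (low word) and 0x11 + 2*N (high word), with Remote IRR at bit 14 of the low word. A small standalone sketch of that arithmetic (the helper name is mine, the layout is the documented IO-APIC one):

	/* Sketch: redirection-table offsets used by io_apic_read(apic, 0x10 + pin*2) */
	#include <stdint.h>
	#include <stdio.h>

	#define IO_APIC_REDIR_REMOTE_IRR	(1u << 14)	/* level IRQ still in service */

	static uint32_t redir_low_index(unsigned int pin)
	{
		return 0x10 + pin * 2;	/* low 32 bits of pin's route entry */
	}

	int main(void)
	{
		uint32_t reg = 0x0000c065;	/* example RTE low word */
		printf("pin 4 low word at index 0x%02x\n", redir_low_index(4));
		printf("remote IRR %s\n",
		       (reg & IO_APIC_REDIR_REMOTE_IRR) ? "set" : "clear");
		return 0;
	}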
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 00d2422ca7c..f00a68cca37 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ff2c1b9aac4..1b291da09e6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -142,6 +142,7 @@ static struct apic apic_default = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index fea000b27f0..659897c0075 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -546,6 +546,7 @@ static struct apic apic_summit = {
 
 	.read = native_apic_mem_read,
 	.write = native_apic_mem_write,
+	.eoi_write = native_apic_mem_write,
 	.icr_read = native_apic_icr_read,
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 48f3103b3c9..ff35cff0e1a 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
 
 	.read = native_apic_msr_read,
 	.write = native_apic_msr_write,
+	.eoi_write = native_apic_msr_eoi_write,
 	.icr_read = native_x2apic_icr_read,
 	.icr_write = native_x2apic_icr_write,
 	.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315f422..c17e982db27 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = {
 
 	.read = native_apic_msr_read,
 	.write = native_apic_msr_write,
+	.eoi_write = native_apic_msr_eoi_write,
 	.icr_read = native_x2apic_icr_read,
 	.icr_write = native_x2apic_icr_write,
 	.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 87bfa69e216..c6d03f7a440 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 
 	.read = native_apic_msr_read,
 	.write = native_apic_msr_write,
+	.eoi_write = native_apic_msr_eoi_write,
 	.icr_read = native_x2apic_icr_read,
 	.icr_write = native_x2apic_icr_write,
 	.wait_icr_idle = native_x2apic_wait_icr_idle,
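Every APIC driver in the series above gains a .eoi_write callback: memory-mapped APICs reuse their ordinary register write, while the x2APIC drivers use the dedicated native_apic_msr_eoi_write(). Splitting EOI out of the generic .write makes the hottest APIC write independently replaceable. A hedged sketch of what the ack path then looks like; the struct below is a cut-down illustration, not the kernel's full struct apic:

	/* Sketch: a dedicated EOI hook beside the general register write. */
	#include <stdint.h>

	#define APIC_EOI	0xB0	/* EOI register offset */
	#define APIC_EOI_ACK	0x0	/* the only value ever written to it */

	struct apic_sketch {
		void (*write)(uint32_t reg, uint32_t v);	/* general write */
		void (*eoi_write)(uint32_t reg, uint32_t v);	/* EOI fast path */
	};

	static void ack_irq(struct apic_sketch *apic)
	{
		/* the ack path can be patched without touching .write */
		apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
	}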
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 459e78cbf61..07b0c0db466 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2401,7 +2401,7 @@ static void __exit apm_exit(void)
 		 * (pm_idle), Wait for all processors to update cached/local
 		 * copies of pm_idle before proceeding.
 		 */
-		cpu_idle_wait();
+		kick_all_cpus_sync();
 	}
 	if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
 	    && (apm_info.connection_version > 0x0100)) {
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8dd..e2dbcb7dabd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
 
 static __init int set_corruption_check(char *arg)
 {
-	char *end;
+	ssize_t ret;
+	unsigned long val;
 
-	memory_corruption_check = simple_strtol(arg, &end, 10);
+	ret = kstrtoul(arg, 10, &val);
+	if (ret)
+		return ret;
 
-	return (*end == 0) ? 0 : -EINVAL;
+	memory_corruption_check = val;
+	return 0;
 }
 early_param("memory_corruption_check", set_corruption_check);
 
 static __init int set_corruption_check_period(char *arg)
 {
-	char *end;
+	ssize_t ret;
+	unsigned long val;
 
-	corruption_check_period = simple_strtoul(arg, &end, 10);
+	ret = kstrtoul(arg, 10, &val);
+	if (ret)
+		return ret;
 
-	return (*end == 0) ? 0 : -EINVAL;
+	corruption_check_period = val;
+	return 0;
 }
 early_param("memory_corruption_check_period", set_corruption_check_period);
 
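The check.c conversion swaps simple_strtol()/simple_strtoul(), which leave end-pointer checking to the caller, for kstrtoul(), which rejects trailing junk and overflow and returns 0 or a negative errno. A userspace approximation of the same contract, for readers without the kernel's string helpers at hand (parse_ulong is my name for it):

	/* Sketch of the kstrtoul() contract: full-string base-10 parse, 0 or -errno. */
	#include <errno.h>
	#include <stdlib.h>

	static int parse_ulong(const char *s, unsigned long *res)
	{
		char *end;

		errno = 0;
		*res = strtoul(s, &end, 10);
		if (errno == ERANGE)
			return -ERANGE;		/* overflow */
		if (end == s || *end != '\0')
			return -EINVAL;		/* empty string or trailing junk */
		return 0;
	}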
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cf79302198a..82f29e70d05 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && percpu_read(numa_node) == 0 &&
+	if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
 	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
 		set_numa_node(early_cpu_to_node(cpu));
 #endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b8f3653dddb..9a7c90d80bc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 			new_l2 = this_leaf.size/1024;
 			num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 			index_msb = get_count_order(num_threads_sharing);
-			l2_id = c->apicid >> index_msb;
+			l2_id = c->apicid & ~((1 << index_msb) - 1);
 			break;
 		case 3:
 			new_l3 = this_leaf.size/1024;
 			num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 			index_msb = get_count_order(
 					num_threads_sharing);
-			l3_id = c->apicid >> index_msb;
+			l3_id = c->apicid & ~((1 << index_msb) - 1);
 			break;
 		default:
 			break;
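The cacheinfo change replaces a right shift with a mask when deriving a cache ID from the APIC ID. Shifting by index_msb collapses the ID into a compact index, while masking the low index_msb bits off keeps the ID in the same number space as the APIC IDs themselves: all threads sharing the cache get the APIC ID of the first sharer. A worked example for a cache shared by four threads (so get_count_order(4) == 2):

	/* Worked example: shift vs. mask for a 4-thread-shared cache. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int index_msb = 2;	/* get_count_order(4) */
		unsigned int apicid;

		for (apicid = 4; apicid <= 7; apicid++)
			printf("apicid %u: shift -> %u, mask -> %u\n",
			       apicid,
			       apicid >> index_msb,			/* old: 1 */
			       apicid & ~((1u << index_msb) - 1));	/* new: 4 */
		return 0;
	}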
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 5502b289341..36565373af8 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -23,7 +23,7 @@
  * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
  *
  * Arrays used to match for this should also be declared using
- * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
  *
  * This always matches against the boot cpu, assuming models and features are
  * consistent over all CPUs.
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e..cd8b166a173 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
 	struct mce m;
 
 	/* Only corrected MC is reported */
-	if (!corrected)
+	if (!corrected || !(mem_err->validation_bits &
+			    CPER_MEM_VALID_PHYSICAL_ADDRESS))
 		return;
 
 	mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 0c82091b165..413c2ced887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
 	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 	USER
 	),
+	MCESEV(
+		KEEP, "HT thread notices Action required: instruction fetch error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+		MCGMASK(MCG_STATUS_EIPV, 0)
+	),
+	MCESEV(
+		AR, "Action required: instruction fetch error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+		USER
+	),
 #endif
 	MCESEV(
 		PANIC, "Action required: unknown MCACOD",
@@ -165,15 +175,19 @@ static struct severity {
 };
 
 /*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user from one
+ * taken in the kernel.
  */
 static int error_context(struct mce *m)
 {
-	if (m->mcgstatus & MCG_STATUS_EIPV)
-		return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
-	/* Unknown, assume kernel */
-	return IN_KERNEL;
+	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
 
 int mce_severity(struct mce *m, int tolerant, char **msg)
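After this change error_context() no longer consults MCG_STATUS_EIPV at all. It can get away with that because mce_gather_info() (see the mce.c hunk below) only copies cs/ip into the record when RIPV or EIPV says the stack state is trustworthy, so m->cs == 0 already encodes "unknown, assume kernel", and VM86 mode is faked up to ring 3 there. A tiny standalone restatement of the privilege test:

	/* Sketch: the new context test relies only on the saved CS ring bits. */
	enum context { IN_KERNEL, IN_USER };

	static enum context error_context_sketch(unsigned long long cs)
	{
		/* cs == 0 (no trustworthy stack state) falls through to IN_KERNEL */
		return ((cs & 3) == 3) ? IN_USER : IN_KERNEL;
	}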
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087..b772dd6ad45 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) | |||
437 | if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { | 437 | if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { |
438 | m->ip = regs->ip; | 438 | m->ip = regs->ip; |
439 | m->cs = regs->cs; | 439 | m->cs = regs->cs; |
440 | |||
441 | /* | ||
442 | * When in VM86 mode make the cs look like ring 3 | ||
443 | * always. This is a lie, but it's better than passing | ||
444 | * the additional vm86 bit around everywhere. | ||
445 | */ | ||
446 | if (v8086_mode(regs)) | ||
447 | m->cs |= 3; | ||
440 | } | 448 | } |
441 | /* Use accurate RIP reporting if available. */ | 449 | /* Use accurate RIP reporting if available. */ |
442 | if (rip_msr) | 450 | if (rip_msr) |
@@ -583,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
583 | struct mce m; | 591 | struct mce m; |
584 | int i; | 592 | int i; |
585 | 593 | ||
586 | percpu_inc(mce_poll_count); | 594 | this_cpu_inc(mce_poll_count); |
587 | 595 | ||
588 | mce_gather_info(&m, NULL); | 596 | mce_gather_info(&m, NULL); |
589 | 597 | ||
@@ -641,16 +649,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); | |||
641 | * Do a quick check if any of the events requires a panic. | 649 | * Do a quick check if any of the events requires a panic. |
642 | * This decides if we keep the events around or clear them. | 650 | * This decides if we keep the events around or clear them. |
643 | */ | 651 | */ |
644 | static int mce_no_way_out(struct mce *m, char **msg) | 652 | static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) |
645 | { | 653 | { |
646 | int i; | 654 | int i, ret = 0; |
647 | 655 | ||
648 | for (i = 0; i < banks; i++) { | 656 | for (i = 0; i < banks; i++) { |
649 | m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); | 657 | m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); |
658 | if (m->status & MCI_STATUS_VAL) | ||
659 | __set_bit(i, validp); | ||
650 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) | 660 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) |
651 | return 1; | 661 | ret = 1; |
652 | } | 662 | } |
653 | return 0; | 663 | return ret; |
654 | } | 664 | } |
655 | 665 | ||
656 | /* | 666 | /* |
@@ -945,9 +955,10 @@ struct mce_info { | |||
945 | atomic_t inuse; | 955 | atomic_t inuse; |
946 | struct task_struct *t; | 956 | struct task_struct *t; |
947 | __u64 paddr; | 957 | __u64 paddr; |
958 | int restartable; | ||
948 | } mce_info[MCE_INFO_MAX]; | 959 | } mce_info[MCE_INFO_MAX]; |
949 | 960 | ||
950 | static void mce_save_info(__u64 addr) | 961 | static void mce_save_info(__u64 addr, int c) |
951 | { | 962 | { |
952 | struct mce_info *mi; | 963 | struct mce_info *mi; |
953 | 964 | ||
@@ -955,6 +966,7 @@ static void mce_save_info(__u64 addr) | |||
955 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { | 966 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { |
956 | mi->t = current; | 967 | mi->t = current; |
957 | mi->paddr = addr; | 968 | mi->paddr = addr; |
969 | mi->restartable = c; | ||
958 | return; | 970 | return; |
959 | } | 971 | } |
960 | } | 972 | } |
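
mce_save_info() runs in machine-check context where locks are off the table, so a slot is claimed with one atomic compare-and-exchange on its inuse flag; the new restartable field simply rides along in the claimed slot. The core pattern:

    /* Sketch: lock-free slot claim, usable from MCE context. */
    struct mce_info *mi;

    for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
            /* the 0 -> 1 transition succeeds for exactly one claimant */
            if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
                    mi->t = current;
                    mi->paddr = addr;
                    mi->restartable = c;    /* MCG_STATUS_RIPV from the caller */
                    break;
            }
    }
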
@@ -1011,11 +1023,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1011 | */ | 1023 | */ |
1012 | int kill_it = 0; | 1024 | int kill_it = 0; |
1013 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | 1025 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); |
1026 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); | ||
1014 | char *msg = "Unknown"; | 1027 | char *msg = "Unknown"; |
1015 | 1028 | ||
1016 | atomic_inc(&mce_entry); | 1029 | atomic_inc(&mce_entry); |
1017 | 1030 | ||
1018 | percpu_inc(mce_exception_count); | 1031 | this_cpu_inc(mce_exception_count); |
1019 | 1032 | ||
1020 | if (!banks) | 1033 | if (!banks) |
1021 | goto out; | 1034 | goto out; |
@@ -1025,7 +1038,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1025 | final = &__get_cpu_var(mces_seen); | 1038 | final = &__get_cpu_var(mces_seen); |
1026 | *final = m; | 1039 | *final = m; |
1027 | 1040 | ||
1028 | no_way_out = mce_no_way_out(&m, &msg); | 1041 | memset(valid_banks, 0, sizeof(valid_banks)); |
1042 | no_way_out = mce_no_way_out(&m, &msg, valid_banks); | ||
1029 | 1043 | ||
1030 | barrier(); | 1044 | barrier(); |
1031 | 1045 | ||
@@ -1045,6 +1059,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1045 | order = mce_start(&no_way_out); | 1059 | order = mce_start(&no_way_out); |
1046 | for (i = 0; i < banks; i++) { | 1060 | for (i = 0; i < banks; i++) { |
1047 | __clear_bit(i, toclear); | 1061 | __clear_bit(i, toclear); |
1062 | if (!test_bit(i, valid_banks)) | ||
1063 | continue; | ||
1048 | if (!mce_banks[i].ctl) | 1064 | if (!mce_banks[i].ctl) |
1049 | continue; | 1065 | continue; |
1050 | 1066 | ||
@@ -1130,7 +1146,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1130 | mce_panic("Fatal machine check on current CPU", &m, msg); | 1146 | mce_panic("Fatal machine check on current CPU", &m, msg); |
1131 | if (worst == MCE_AR_SEVERITY) { | 1147 | if (worst == MCE_AR_SEVERITY) { |
1132 | /* schedule action before return to userland */ | 1148 | /* schedule action before return to userland */ |
1133 | mce_save_info(m.addr); | 1149 | mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); |
1134 | set_thread_flag(TIF_MCE_NOTIFY); | 1150 | set_thread_flag(TIF_MCE_NOTIFY); |
1135 | } else if (kill_it) { | 1151 | } else if (kill_it) { |
1136 | force_sig(SIGBUS, current); | 1152 | force_sig(SIGBUS, current); |
@@ -1179,7 +1195,13 @@ void mce_notify_process(void) | |||
1179 | 1195 | ||
1180 | pr_err("Uncorrected hardware memory error in user-access at %llx", | 1196 | pr_err("Uncorrected hardware memory error in user-access at %llx", |
1181 | mi->paddr); | 1197 | mi->paddr); |
1182 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { | 1198 | /* |
1199 | * We must call memory_failure() here even if the current process is | ||
1200 | * doomed. We still need to mark the page as poisoned and alert any | ||
1201 | * other users of the page. | ||
1202 | */ | ||
1203 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || | ||
1204 | mi->restartable == 0) { | ||
1183 | pr_err("Memory error not recovered"); | 1205 | pr_err("Memory error not recovered"); |
1184 | force_sig(SIGBUS, current); | 1206 | force_sig(SIGBUS, current); |
1185 | } | 1207 | } |
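
Together with the do_machine_check() hunk above, the saved RIPV bit (return IP valid) now feeds the kill decision: memory_failure() is always called first so the page gets poisoned for every user, and only then is the current task's fate decided. Condensed, assuming pfn is derived from mi->paddr as in the surrounding code:

    /* Sketch: poison the page first, then decide the task's fate. */
    int failed = memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0;

    if (failed || !mi->restartable) {
            /* page not recovered, or no valid IP to restart at */
            pr_err("Memory error not recovered");
            force_sig(SIGBUS, current);
    }
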
@@ -1423,6 +1445,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
1423 | */ | 1445 | */ |
1424 | if (c->x86 == 6 && banks > 0) | 1446 | if (c->x86 == 6 && banks > 0) |
1425 | mce_banks[0].ctl = 0; | 1447 | mce_banks[0].ctl = 0; |
1448 | |||
1449 | /* | ||
1450 | * Turn off MC4_MISC thresholding banks on those models since | ||
1451 | * they're not supported there. | ||
1452 | */ | ||
1453 | if (c->x86 == 0x15 && | ||
1454 | (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { | ||
1455 | int i; | ||
1456 | u64 val, hwcr; | ||
1457 | bool need_toggle; | ||
1458 | u32 msrs[] = { | ||
1459 | 0x00000413, /* MC4_MISC0 */ | ||
1460 | 0xc0000408, /* MC4_MISC1 */ | ||
1461 | }; | ||
1462 | |||
1463 | rdmsrl(MSR_K7_HWCR, hwcr); | ||
1464 | |||
1465 | /* McStatusWrEn has to be set */ | ||
1466 | need_toggle = !(hwcr & BIT(18)); | ||
1467 | |||
1468 | if (need_toggle) | ||
1469 | wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); | ||
1470 | |||
1471 | for (i = 0; i < ARRAY_SIZE(msrs); i++) { | ||
1472 | rdmsrl(msrs[i], val); | ||
1473 | |||
1474 | /* CntP bit set? */ | ||
1475 | if (val & BIT(62)) { | ||
1476 | val &= ~BIT(62); | ||
1477 | wrmsrl(msrs[i], val); | ||
1478 | } | ||
1479 | } | ||
1480 | |||
1481 | /* restore old settings */ | ||
1482 | if (need_toggle) | ||
1483 | wrmsrl(MSR_K7_HWCR, hwcr); | ||
1484 | } | ||
1426 | } | 1485 | } |
1427 | 1486 | ||
1428 | if (c->x86_vendor == X86_VENDOR_INTEL) { | 1487 | if (c->x86_vendor == X86_VENDOR_INTEL) { |
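
The quirk clears CntP (bit 62) in the two MC4_MISC MSRs, and on these parts writes to MCA registers are only accepted while McStatusWrEn (HWCR bit 18) is set, hence the save/set/restore dance around the loop. Factored out as a reusable pattern (the helper name is made up for illustration):

    /* Sketch: write an MCA MSR under McStatusWrEn, restoring HWCR after. */
    static void mca_wrmsrl(u32 msr, u64 val)
    {
            u64 hwcr;
            bool toggle;

            rdmsrl(MSR_K7_HWCR, hwcr);
            toggle = !(hwcr & BIT(18));     /* McStatusWrEn not yet set? */

            if (toggle)
                    wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

            wrmsrl(msr, val);

            if (toggle)                     /* put HWCR back as found */
                    wrmsrl(MSR_K7_HWCR, hwcr);
    }
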
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 99b57179f91..f4873a64f46 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -51,6 +51,7 @@ struct threshold_block { | |||
51 | unsigned int cpu; | 51 | unsigned int cpu; |
52 | u32 address; | 52 | u32 address; |
53 | u16 interrupt_enable; | 53 | u16 interrupt_enable; |
54 | bool interrupt_capable; | ||
54 | u16 threshold_limit; | 55 | u16 threshold_limit; |
55 | struct kobject kobj; | 56 | struct kobject kobj; |
56 | struct list_head miscj; | 57 | struct list_head miscj; |
@@ -83,6 +84,21 @@ struct thresh_restart { | |||
83 | u16 old_limit; | 84 | u16 old_limit; |
84 | }; | 85 | }; |
85 | 86 | ||
87 | static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) | ||
88 | { | ||
89 | /* | ||
90 | * bank 4 supports APIC LVT interrupts implicitly since forever. | ||
91 | */ | ||
92 | if (bank == 4) | ||
93 | return true; | ||
94 | |||
95 | /* | ||
96 | * IntP: interrupt present; if this bit is set, the thresholding | ||
97 | * bank can generate APIC LVT interrupts | ||
98 | */ | ||
99 | return msr_high_bits & BIT(28); | ||
100 | } | ||
101 | |||
86 | static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) | 102 | static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) |
87 | { | 103 | { |
88 | int msr = (hi & MASK_LVTOFF_HI) >> 20; | 104 | int msr = (hi & MASK_LVTOFF_HI) >> 20; |
@@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) | |||
104 | return 1; | 120 | return 1; |
105 | }; | 121 | }; |
106 | 122 | ||
107 | /* must be called with correct cpu affinity */ | 123 | /* |
108 | /* Called via smp_call_function_single() */ | 124 | * Called via smp_call_function_single(), must be called with correct |
125 | * cpu affinity. | ||
126 | */ | ||
109 | static void threshold_restart_bank(void *_tr) | 127 | static void threshold_restart_bank(void *_tr) |
110 | { | 128 | { |
111 | struct thresh_restart *tr = _tr; | 129 | struct thresh_restart *tr = _tr; |
@@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr) | |||
128 | (new_count & THRESHOLD_MAX); | 146 | (new_count & THRESHOLD_MAX); |
129 | } | 147 | } |
130 | 148 | ||
149 | /* clear IntType */ | ||
150 | hi &= ~MASK_INT_TYPE_HI; | ||
151 | |||
152 | if (!tr->b->interrupt_capable) | ||
153 | goto done; | ||
154 | |||
131 | if (tr->set_lvt_off) { | 155 | if (tr->set_lvt_off) { |
132 | if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { | 156 | if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { |
133 | /* set new lvt offset */ | 157 | /* set new lvt offset */ |
@@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr) | |||
136 | } | 160 | } |
137 | } | 161 | } |
138 | 162 | ||
139 | tr->b->interrupt_enable ? | 163 | if (tr->b->interrupt_enable) |
140 | (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : | 164 | hi |= INT_TYPE_APIC; |
141 | (hi &= ~MASK_INT_TYPE_HI); | 165 | |
166 | done: | ||
142 | 167 | ||
143 | hi |= MASK_COUNT_EN_HI; | 168 | hi |= MASK_COUNT_EN_HI; |
144 | wrmsr(tr->b->address, lo, hi); | 169 | wrmsr(tr->b->address, lo, hi); |
@@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
202 | if (shared_bank[bank] && c->cpu_core_id) | 227 | if (shared_bank[bank] && c->cpu_core_id) |
203 | break; | 228 | break; |
204 | 229 | ||
205 | offset = setup_APIC_mce(offset, | ||
206 | (high & MASK_LVTOFF_HI) >> 20); | ||
207 | |||
208 | memset(&b, 0, sizeof(b)); | 230 | memset(&b, 0, sizeof(b)); |
209 | b.cpu = cpu; | 231 | b.cpu = cpu; |
210 | b.bank = bank; | 232 | b.bank = bank; |
211 | b.block = block; | 233 | b.block = block; |
212 | b.address = address; | 234 | b.address = address; |
235 | b.interrupt_capable = lvt_interrupt_supported(bank, high); | ||
236 | |||
237 | if (b.interrupt_capable) { | ||
238 | int new = (high & MASK_LVTOFF_HI) >> 20; | ||
239 | offset = setup_APIC_mce(offset, new); | ||
240 | } | ||
213 | 241 | ||
214 | mce_threshold_block_init(&b, offset); | 242 | mce_threshold_block_init(&b, offset); |
215 | mce_threshold_vector = amd_threshold_interrupt; | 243 | mce_threshold_vector = amd_threshold_interrupt; |
@@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) | |||
309 | struct thresh_restart tr; | 337 | struct thresh_restart tr; |
310 | unsigned long new; | 338 | unsigned long new; |
311 | 339 | ||
340 | if (!b->interrupt_capable) | ||
341 | return -EINVAL; | ||
342 | |||
312 | if (strict_strtoul(buf, 0, &new) < 0) | 343 | if (strict_strtoul(buf, 0, &new) < 0) |
313 | return -EINVAL; | 344 | return -EINVAL; |
314 | 345 | ||
@@ -390,10 +421,10 @@ RW_ATTR(threshold_limit); | |||
390 | RW_ATTR(error_count); | 421 | RW_ATTR(error_count); |
391 | 422 | ||
392 | static struct attribute *default_attrs[] = { | 423 | static struct attribute *default_attrs[] = { |
393 | &interrupt_enable.attr, | ||
394 | &threshold_limit.attr, | 424 | &threshold_limit.attr, |
395 | &error_count.attr, | 425 | &error_count.attr, |
396 | NULL | 426 | NULL, /* possibly interrupt_enable if supported, see below */ |
427 | NULL, | ||
397 | }; | 428 | }; |
398 | 429 | ||
399 | #define to_block(k) container_of(k, struct threshold_block, kobj) | 430 | #define to_block(k) container_of(k, struct threshold_block, kobj) |
@@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | |||
467 | b->cpu = cpu; | 498 | b->cpu = cpu; |
468 | b->address = address; | 499 | b->address = address; |
469 | b->interrupt_enable = 0; | 500 | b->interrupt_enable = 0; |
501 | b->interrupt_capable = lvt_interrupt_supported(bank, high); | ||
470 | b->threshold_limit = THRESHOLD_MAX; | 502 | b->threshold_limit = THRESHOLD_MAX; |
471 | 503 | ||
504 | if (b->interrupt_capable) | ||
505 | threshold_ktype.default_attrs[2] = &interrupt_enable.attr; | ||
506 | else | ||
507 | threshold_ktype.default_attrs[2] = NULL; | ||
508 | |||
472 | INIT_LIST_HEAD(&b->miscj); | 509 | INIT_LIST_HEAD(&b->miscj); |
473 | 510 | ||
474 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) { | 511 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) { |
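
The sysfs side of interrupt_capable works in two places: default_attrs[] reserves a spare NULL slot that allocate_threshold_blocks() fills with &interrupt_enable.attr only for capable blocks, and store_interrupt_enable() rejects writes with -EINVAL otherwise. The slot trick in isolation:

    /* Sketch: optional sysfs attribute via a reserved array slot. */
    static struct attribute *default_attrs[] = {
            &threshold_limit.attr,
            &error_count.attr,
            NULL,           /* slot [2]: &interrupt_enable.attr if capable */
            NULL,           /* list terminator, must stay NULL */
    };

    /* at block-creation time: */
    threshold_ktype.default_attrs[2] =
            b->interrupt_capable ? &interrupt_enable.attr : NULL;

Since the array is shared by all blocks, this pattern assumes block creation is serialized; a per-block attribute group would avoid the shared mutable slot.
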
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e03407e1..e049d6da018 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) | |||
484 | 484 | ||
485 | /* mark unused */ | 485 | /* mark unused */ |
486 | event->hw.extra_reg.idx = EXTRA_REG_NONE; | 486 | event->hw.extra_reg.idx = EXTRA_REG_NONE; |
487 | |||
488 | /* mark not used */ | ||
489 | event->hw.extra_reg.idx = EXTRA_REG_NONE; | ||
490 | event->hw.branch_reg.idx = EXTRA_REG_NONE; | 487 | event->hw.branch_reg.idx = EXTRA_REG_NONE; |
491 | 488 | ||
492 | return x86_pmu.hw_config(event); | 489 | return x86_pmu.hw_config(event); |
@@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1186 | int idx, handled = 0; | 1183 | int idx, handled = 0; |
1187 | u64 val; | 1184 | u64 val; |
1188 | 1185 | ||
1189 | perf_sample_data_init(&data, 0); | ||
1190 | |||
1191 | cpuc = &__get_cpu_var(cpu_hw_events); | 1186 | cpuc = &__get_cpu_var(cpu_hw_events); |
1192 | 1187 | ||
1193 | /* | 1188 | /* |
@@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1222 | * event overflow | 1217 | * event overflow |
1223 | */ | 1218 | */ |
1224 | handled++; | 1219 | handled++; |
1225 | data.period = event->hw.last_period; | 1220 | perf_sample_data_init(&data, 0, event->hw.last_period); |
1226 | 1221 | ||
1227 | if (!x86_perf_event_set_period(event)) | 1222 | if (!x86_perf_event_set_period(event)) |
1228 | continue; | 1223 | continue; |
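
This hunk and the intel, intel_ds and p4 hunks below are one mechanical change: perf_sample_data_init() grew addr and period parameters, so the separate data.period assignment folds into the initializer. Before and after:

    struct perf_sample_data data;

    /* old API: init, then patch the period by hand:
     *      perf_sample_data_init(&data, 0);
     *      data.period = event->hw.last_period;
     */

    /* new API: one call; addr = 0, period set at init time */
    perf_sample_data_init(&data, 0, event->hw.last_period);
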
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0..11a4eb9131d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) | |||
134 | 134 | ||
135 | static int amd_pmu_hw_config(struct perf_event *event) | 135 | static int amd_pmu_hw_config(struct perf_event *event) |
136 | { | 136 | { |
137 | int ret = x86_pmu_hw_config(event); | 137 | int ret; |
138 | 138 | ||
139 | /* pass precise event sampling to ibs: */ | ||
140 | if (event->attr.precise_ip && get_ibs_caps()) | ||
141 | return -ENOENT; | ||
142 | |||
143 | ret = x86_pmu_hw_config(event); | ||
139 | if (ret) | 144 | if (ret) |
140 | return ret; | 145 | return ret; |
141 | 146 | ||
@@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, | |||
205 | * when we come here | 210 | * when we come here |
206 | */ | 211 | */ |
207 | for (i = 0; i < x86_pmu.num_counters; i++) { | 212 | for (i = 0; i < x86_pmu.num_counters; i++) { |
208 | if (nb->owners[i] == event) { | 213 | if (cmpxchg(nb->owners + i, event, NULL) == event) |
209 | cmpxchg(nb->owners+i, event, NULL); | ||
210 | break; | 214 | break; |
211 | } | ||
212 | } | 215 | } |
213 | } | 216 | } |
214 | 217 | ||
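
The simplification leans on cmpxchg() returning the previous value: the compare already reports whether this event owned slot i, so the separate equality test was redundant and test-then-swap collapses into one atomic step:

    /* Sketch: release counter ownership in a single atomic operation. */
    for (i = 0; i < x86_pmu.num_counters; i++) {
            /* NULL is swapped in only if we were the owner */
            if (cmpxchg(nb->owners + i, event, NULL) == event)
                    break;
    }
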
@@ -493,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = { | |||
493 | * 0x023 DE PERF_CTL[2:0] | 496 | * 0x023 DE PERF_CTL[2:0] |
494 | * 0x02D LS PERF_CTL[3] | 497 | * 0x02D LS PERF_CTL[3] |
495 | * 0x02E LS PERF_CTL[3,0] | 498 | * 0x02E LS PERF_CTL[3,0] |
499 | * 0x031 LS PERF_CTL[2:0] (**) | ||
496 | * 0x043 CU PERF_CTL[2:0] | 500 | * 0x043 CU PERF_CTL[2:0] |
497 | * 0x045 CU PERF_CTL[2:0] | 501 | * 0x045 CU PERF_CTL[2:0] |
498 | * 0x046 CU PERF_CTL[2:0] | 502 | * 0x046 CU PERF_CTL[2:0] |
@@ -506,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = { | |||
506 | * 0x0DD LS PERF_CTL[5:0] | 510 | * 0x0DD LS PERF_CTL[5:0] |
507 | * 0x0DE LS PERF_CTL[5:0] | 511 | * 0x0DE LS PERF_CTL[5:0] |
508 | * 0x0DF LS PERF_CTL[5:0] | 512 | * 0x0DF LS PERF_CTL[5:0] |
513 | * 0x1C0 EX PERF_CTL[5:3] | ||
509 | * 0x1D6 EX PERF_CTL[5:0] | 514 | * 0x1D6 EX PERF_CTL[5:0] |
510 | * 0x1D8 EX PERF_CTL[5:0] | 515 | * 0x1D8 EX PERF_CTL[5:0] |
511 | * | 516 | * |
512 | * (*) depending on the umask all FPU counters may be used | 517 | * (*) depending on the umask all FPU counters may be used |
518 | * (**) only one unitmask enabled at a time | ||
513 | */ | 519 | */ |
514 | 520 | ||
515 | static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); | 521 | static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); |
@@ -559,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev | |||
559 | return &amd_f15_PMC3; | 565 | return &amd_f15_PMC3; |
560 | case 0x02E: | 566 | case 0x02E: |
561 | return &amd_f15_PMC30; | 567 | return &amd_f15_PMC30; |
568 | case 0x031: | ||
569 | if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) | ||
570 | return &amd_f15_PMC20; | ||
571 | return &emptyconstraint; | ||
572 | case 0x1C0: | ||
573 | return &amd_f15_PMC53; | ||
562 | default: | 574 | default: |
563 | return &amd_f15_PMC50; | 575 | return &amd_f15_PMC50; |
564 | } | 576 | } |
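
Event 0x031 is the odd one out in the updated table: it schedules on PERF_CTL[2:0] only while at most one unitmask bit is set and is rejected outright otherwise. The check is just a popcount over the umask field:

    /* Sketch: the "only one unitmask at a time" test for event 0x031. */
    static bool single_umask(u64 config)
    {
            return hweight_long(config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1;
    }
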
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14..da9bcdcd985 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/perf_event.h> | 9 | #include <linux/perf_event.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/pci.h> | 11 | #include <linux/pci.h> |
12 | #include <linux/ptrace.h> | ||
12 | 13 | ||
13 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
14 | 15 | ||
@@ -16,36 +17,591 @@ static u32 ibs_caps; | |||
16 | 17 | ||
17 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) | 18 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) |
18 | 19 | ||
19 | static struct pmu perf_ibs; | 20 | #include <linux/kprobes.h> |
21 | #include <linux/hardirq.h> | ||
22 | |||
23 | #include <asm/nmi.h> | ||
24 | |||
25 | #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) | ||
26 | #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT | ||
27 | |||
28 | enum ibs_states { | ||
29 | IBS_ENABLED = 0, | ||
30 | IBS_STARTED = 1, | ||
31 | IBS_STOPPING = 2, | ||
32 | |||
33 | IBS_MAX_STATES, | ||
34 | }; | ||
35 | |||
36 | struct cpu_perf_ibs { | ||
37 | struct perf_event *event; | ||
38 | unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; | ||
39 | }; | ||
40 | |||
41 | struct perf_ibs { | ||
42 | struct pmu pmu; | ||
43 | unsigned int msr; | ||
44 | u64 config_mask; | ||
45 | u64 cnt_mask; | ||
46 | u64 enable_mask; | ||
47 | u64 valid_mask; | ||
48 | u64 max_period; | ||
49 | unsigned long offset_mask[1]; | ||
50 | int offset_max; | ||
51 | struct cpu_perf_ibs __percpu *pcpu; | ||
52 | u64 (*get_count)(u64 config); | ||
53 | }; | ||
54 | |||
55 | struct perf_ibs_data { | ||
56 | u32 size; | ||
57 | union { | ||
58 | u32 data[0]; /* data buffer starts here */ | ||
59 | u32 caps; | ||
60 | }; | ||
61 | u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; | ||
62 | }; | ||
63 | |||
64 | static int | ||
65 | perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) | ||
66 | { | ||
67 | s64 left = local64_read(&hwc->period_left); | ||
68 | s64 period = hwc->sample_period; | ||
69 | int overflow = 0; | ||
70 | |||
71 | /* | ||
72 | * If we are way outside a reasonable range then just skip forward: | ||
73 | */ | ||
74 | if (unlikely(left <= -period)) { | ||
75 | left = period; | ||
76 | local64_set(&hwc->period_left, left); | ||
77 | hwc->last_period = period; | ||
78 | overflow = 1; | ||
79 | } | ||
80 | |||
81 | if (unlikely(left < (s64)min)) { | ||
82 | left += period; | ||
83 | local64_set(&hwc->period_left, left); | ||
84 | hwc->last_period = period; | ||
85 | overflow = 1; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * If the hw period that triggers the sw overflow is too short | ||
90 | * we might hit the irq handler. This biases the results. | ||
91 | * Thus we shorten the next-to-last period and set the last | ||
92 | * period to the max period. | ||
93 | */ | ||
94 | if (left > max) { | ||
95 | left -= max; | ||
96 | if (left > max) | ||
97 | left = max; | ||
98 | else if (left < min) | ||
99 | left = min; | ||
100 | } | ||
101 | |||
102 | *hw_period = (u64)left; | ||
103 | |||
104 | return overflow; | ||
105 | } | ||
106 | |||
107 | static int | ||
108 | perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) | ||
109 | { | ||
110 | struct hw_perf_event *hwc = &event->hw; | ||
111 | int shift = 64 - width; | ||
112 | u64 prev_raw_count; | ||
113 | u64 delta; | ||
114 | |||
115 | /* | ||
116 | * Careful: an NMI might modify the previous event value. | ||
117 | * | ||
118 | * Our tactic to handle this is to first atomically read and | ||
119 | * exchange a new raw count - then add that new-prev delta | ||
120 | * count to the generic event atomically: | ||
121 | */ | ||
122 | prev_raw_count = local64_read(&hwc->prev_count); | ||
123 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, | ||
124 | new_raw_count) != prev_raw_count) | ||
125 | return 0; | ||
126 | |||
127 | /* | ||
128 | * Now we have the new raw value and have updated the prev | ||
129 | * timestamp already. We can now calculate the elapsed delta | ||
130 | * (event-)time and add that to the generic event. | ||
131 | * | ||
132 | * Careful, not all hw sign-extends above the physical width | ||
133 | * of the count. | ||
134 | */ | ||
135 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | ||
136 | delta >>= shift; | ||
137 | |||
138 | local64_add(delta, &event->count); | ||
139 | local64_sub(delta, &hwc->period_left); | ||
140 | |||
141 | return 1; | ||
142 | } | ||
143 | |||
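
The shift pair in perf_event_try_update() is the standard way to widen a counter narrower than 64 bits before taking a difference: both samples are moved into the top bits, subtracted so wraparound cancels modulo the counter width, then shifted back down. In isolation:

    /* Sketch: delta of two width-bit counter samples, wrap-safe. */
    static u64 counter_delta(u64 prev, u64 now, int width)
    {
            int shift = 64 - width;

            /* (now - prev) modulo 2^width, computed in the top bits */
            return ((now << shift) - (prev << shift)) >> shift;
    }
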
144 | static struct perf_ibs perf_ibs_fetch; | ||
145 | static struct perf_ibs perf_ibs_op; | ||
146 | |||
147 | static struct perf_ibs *get_ibs_pmu(int type) | ||
148 | { | ||
149 | if (perf_ibs_fetch.pmu.type == type) | ||
150 | return &perf_ibs_fetch; | ||
151 | if (perf_ibs_op.pmu.type == type) | ||
152 | return &perf_ibs_op; | ||
153 | return NULL; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Use IBS for precise event sampling: | ||
158 | * | ||
159 | * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count | ||
160 | * perf record -a -e r076:p ... # same as -e cpu-cycles:p | ||
161 | * perf record -a -e r0C1:p ... # use ibs op counting micro-ops | ||
162 | * | ||
163 | * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, | ||
164 | * MSRC001_1033) is used to select either cycle or micro-ops counting | ||
165 | * mode. | ||
166 | * | ||
167 | * The rip of IBS samples has skid 0. Thus, IBS supports precise | ||
168 | * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the | ||
169 | * rip is invalid when IBS was not able to record the rip correctly. | ||
170 | * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. | ||
171 | * | ||
172 | */ | ||
173 | static int perf_ibs_precise_event(struct perf_event *event, u64 *config) | ||
174 | { | ||
175 | switch (event->attr.precise_ip) { | ||
176 | case 0: | ||
177 | return -ENOENT; | ||
178 | case 1: | ||
179 | case 2: | ||
180 | break; | ||
181 | default: | ||
182 | return -EOPNOTSUPP; | ||
183 | } | ||
184 | |||
185 | switch (event->attr.type) { | ||
186 | case PERF_TYPE_HARDWARE: | ||
187 | switch (event->attr.config) { | ||
188 | case PERF_COUNT_HW_CPU_CYCLES: | ||
189 | *config = 0; | ||
190 | return 0; | ||
191 | } | ||
192 | break; | ||
193 | case PERF_TYPE_RAW: | ||
194 | switch (event->attr.config) { | ||
195 | case 0x0076: | ||
196 | *config = 0; | ||
197 | return 0; | ||
198 | case 0x00C1: | ||
199 | *config = IBS_OP_CNT_CTL; | ||
200 | return 0; | ||
201 | } | ||
202 | break; | ||
203 | default: | ||
204 | return -ENOENT; | ||
205 | } | ||
206 | |||
207 | return -EOPNOTSUPP; | ||
208 | } | ||
20 | 209 | ||
21 | static int perf_ibs_init(struct perf_event *event) | 210 | static int perf_ibs_init(struct perf_event *event) |
22 | { | 211 | { |
23 | if (perf_ibs.type != event->attr.type) | 212 | struct hw_perf_event *hwc = &event->hw; |
213 | struct perf_ibs *perf_ibs; | ||
214 | u64 max_cnt, config; | ||
215 | int ret; | ||
216 | |||
217 | perf_ibs = get_ibs_pmu(event->attr.type); | ||
218 | if (perf_ibs) { | ||
219 | config = event->attr.config; | ||
220 | } else { | ||
221 | perf_ibs = &perf_ibs_op; | ||
222 | ret = perf_ibs_precise_event(event, &config); | ||
223 | if (ret) | ||
224 | return ret; | ||
225 | } | ||
226 | |||
227 | if (event->pmu != &perf_ibs->pmu) | ||
24 | return -ENOENT; | 228 | return -ENOENT; |
229 | |||
230 | if (config & ~perf_ibs->config_mask) | ||
231 | return -EINVAL; | ||
232 | |||
233 | if (hwc->sample_period) { | ||
234 | if (config & perf_ibs->cnt_mask) | ||
235 | /* raw max_cnt may not be set */ | ||
236 | return -EINVAL; | ||
237 | if (!event->attr.sample_freq && hwc->sample_period & 0x0f) | ||
238 | /* | ||
239 | * lower 4 bits cannot be set in ibs max cnt, | ||
240 | * but we allow it in case we adjust the | ||
241 | * sample period to set a frequency. | ||
242 | */ | ||
243 | return -EINVAL; | ||
244 | hwc->sample_period &= ~0x0FULL; | ||
245 | if (!hwc->sample_period) | ||
246 | hwc->sample_period = 0x10; | ||
247 | } else { | ||
248 | max_cnt = config & perf_ibs->cnt_mask; | ||
249 | config &= ~perf_ibs->cnt_mask; | ||
250 | event->attr.sample_period = max_cnt << 4; | ||
251 | hwc->sample_period = event->attr.sample_period; | ||
252 | } | ||
253 | |||
254 | if (!hwc->sample_period) | ||
255 | return -EINVAL; | ||
256 | |||
257 | /* | ||
258 | * If we modify hwc->sample_period, we also need to update | ||
259 | * hwc->last_period and hwc->period_left. | ||
260 | */ | ||
261 | hwc->last_period = hwc->sample_period; | ||
262 | local64_set(&hwc->period_left, hwc->sample_period); | ||
263 | |||
264 | hwc->config_base = perf_ibs->msr; | ||
265 | hwc->config = config; | ||
266 | |||
25 | return 0; | 267 | return 0; |
26 | } | 268 | } |
27 | 269 | ||
270 | static int perf_ibs_set_period(struct perf_ibs *perf_ibs, | ||
271 | struct hw_perf_event *hwc, u64 *period) | ||
272 | { | ||
273 | int overflow; | ||
274 | |||
275 | /* ignore lower 4 bits in min count: */ | ||
276 | overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); | ||
277 | local64_set(&hwc->prev_count, 0); | ||
278 | |||
279 | return overflow; | ||
280 | } | ||
281 | |||
282 | static u64 get_ibs_fetch_count(u64 config) | ||
283 | { | ||
284 | return (config & IBS_FETCH_CNT) >> 12; | ||
285 | } | ||
286 | |||
287 | static u64 get_ibs_op_count(u64 config) | ||
288 | { | ||
289 | u64 count = 0; | ||
290 | |||
291 | if (config & IBS_OP_VAL) | ||
292 | count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ | ||
293 | |||
294 | if (ibs_caps & IBS_CAPS_RDWROPCNT) | ||
295 | count += (config & IBS_OP_CUR_CNT) >> 32; | ||
296 | |||
297 | return count; | ||
298 | } | ||
299 | |||
300 | static void | ||
301 | perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, | ||
302 | u64 *config) | ||
303 | { | ||
304 | u64 count = perf_ibs->get_count(*config); | ||
305 | |||
306 | /* | ||
307 | * Set width to 64 since we do not overflow on max width but | ||
308 | * instead on max count. In perf_ibs_set_period() we clear | ||
309 | * prev count manually on overflow. | ||
310 | */ | ||
311 | while (!perf_event_try_update(event, count, 64)) { | ||
312 | rdmsrl(event->hw.config_base, *config); | ||
313 | count = perf_ibs->get_count(*config); | ||
314 | } | ||
315 | } | ||
316 | |||
317 | static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, | ||
318 | struct hw_perf_event *hwc, u64 config) | ||
319 | { | ||
320 | wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Erratum #420 Instruction-Based Sampling Engine May Generate | ||
325 | * Interrupt that Cannot Be Cleared: | ||
326 | * | ||
327 | * Must clear counter mask first, then clear the enable bit. See | ||
328 | * Revision Guide for AMD Family 10h Processors, Publication #41322. | ||
329 | */ | ||
330 | static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, | ||
331 | struct hw_perf_event *hwc, u64 config) | ||
332 | { | ||
333 | config &= ~perf_ibs->cnt_mask; | ||
334 | wrmsrl(hwc->config_base, config); | ||
335 | config &= ~perf_ibs->enable_mask; | ||
336 | wrmsrl(hwc->config_base, config); | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * We cannot restore the ibs pmu state, so we always needs to update | ||
341 | * the event while stopping it and then reset the state when starting | ||
342 | * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in | ||
343 | * perf_ibs_start()/perf_ibs_stop() and instead always do it. | ||
344 | */ | ||
345 | static void perf_ibs_start(struct perf_event *event, int flags) | ||
346 | { | ||
347 | struct hw_perf_event *hwc = &event->hw; | ||
348 | struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); | ||
349 | struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); | ||
350 | u64 period; | ||
351 | |||
352 | if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) | ||
353 | return; | ||
354 | |||
355 | WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); | ||
356 | hwc->state = 0; | ||
357 | |||
358 | perf_ibs_set_period(perf_ibs, hwc, &period); | ||
359 | set_bit(IBS_STARTED, pcpu->state); | ||
360 | perf_ibs_enable_event(perf_ibs, hwc, period >> 4); | ||
361 | |||
362 | perf_event_update_userpage(event); | ||
363 | } | ||
364 | |||
365 | static void perf_ibs_stop(struct perf_event *event, int flags) | ||
366 | { | ||
367 | struct hw_perf_event *hwc = &event->hw; | ||
368 | struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); | ||
369 | struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); | ||
370 | u64 config; | ||
371 | int stopping; | ||
372 | |||
373 | stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); | ||
374 | |||
375 | if (!stopping && (hwc->state & PERF_HES_UPTODATE)) | ||
376 | return; | ||
377 | |||
378 | rdmsrl(hwc->config_base, config); | ||
379 | |||
380 | if (stopping) { | ||
381 | set_bit(IBS_STOPPING, pcpu->state); | ||
382 | perf_ibs_disable_event(perf_ibs, hwc, config); | ||
383 | WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); | ||
384 | hwc->state |= PERF_HES_STOPPED; | ||
385 | } | ||
386 | |||
387 | if (hwc->state & PERF_HES_UPTODATE) | ||
388 | return; | ||
389 | |||
390 | /* | ||
391 | * Clear valid bit to not count rollovers on update; rollovers | ||
392 | * are only accounted in the irq handler. | ||
393 | */ | ||
394 | config &= ~perf_ibs->valid_mask; | ||
395 | |||
396 | perf_ibs_event_update(perf_ibs, event, &config); | ||
397 | hwc->state |= PERF_HES_UPTODATE; | ||
398 | } | ||
399 | |||
28 | static int perf_ibs_add(struct perf_event *event, int flags) | 400 | static int perf_ibs_add(struct perf_event *event, int flags) |
29 | { | 401 | { |
402 | struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); | ||
403 | struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); | ||
404 | |||
405 | if (test_and_set_bit(IBS_ENABLED, pcpu->state)) | ||
406 | return -ENOSPC; | ||
407 | |||
408 | event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; | ||
409 | |||
410 | pcpu->event = event; | ||
411 | |||
412 | if (flags & PERF_EF_START) | ||
413 | perf_ibs_start(event, PERF_EF_RELOAD); | ||
414 | |||
30 | return 0; | 415 | return 0; |
31 | } | 416 | } |
32 | 417 | ||
33 | static void perf_ibs_del(struct perf_event *event, int flags) | 418 | static void perf_ibs_del(struct perf_event *event, int flags) |
34 | { | 419 | { |
420 | struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); | ||
421 | struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); | ||
422 | |||
423 | if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) | ||
424 | return; | ||
425 | |||
426 | perf_ibs_stop(event, PERF_EF_UPDATE); | ||
427 | |||
428 | pcpu->event = NULL; | ||
429 | |||
430 | perf_event_update_userpage(event); | ||
35 | } | 431 | } |
36 | 432 | ||
37 | static struct pmu perf_ibs = { | 433 | static void perf_ibs_read(struct perf_event *event) { } |
38 | .event_init= perf_ibs_init, | 434 | |
39 | .add= perf_ibs_add, | 435 | static struct perf_ibs perf_ibs_fetch = { |
40 | .del= perf_ibs_del, | 436 | .pmu = { |
437 | .task_ctx_nr = perf_invalid_context, | ||
438 | |||
439 | .event_init = perf_ibs_init, | ||
440 | .add = perf_ibs_add, | ||
441 | .del = perf_ibs_del, | ||
442 | .start = perf_ibs_start, | ||
443 | .stop = perf_ibs_stop, | ||
444 | .read = perf_ibs_read, | ||
445 | }, | ||
446 | .msr = MSR_AMD64_IBSFETCHCTL, | ||
447 | .config_mask = IBS_FETCH_CONFIG_MASK, | ||
448 | .cnt_mask = IBS_FETCH_MAX_CNT, | ||
449 | .enable_mask = IBS_FETCH_ENABLE, | ||
450 | .valid_mask = IBS_FETCH_VAL, | ||
451 | .max_period = IBS_FETCH_MAX_CNT << 4, | ||
452 | .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, | ||
453 | .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, | ||
454 | |||
455 | .get_count = get_ibs_fetch_count, | ||
41 | }; | 456 | }; |
42 | 457 | ||
458 | static struct perf_ibs perf_ibs_op = { | ||
459 | .pmu = { | ||
460 | .task_ctx_nr = perf_invalid_context, | ||
461 | |||
462 | .event_init = perf_ibs_init, | ||
463 | .add = perf_ibs_add, | ||
464 | .del = perf_ibs_del, | ||
465 | .start = perf_ibs_start, | ||
466 | .stop = perf_ibs_stop, | ||
467 | .read = perf_ibs_read, | ||
468 | }, | ||
469 | .msr = MSR_AMD64_IBSOPCTL, | ||
470 | .config_mask = IBS_OP_CONFIG_MASK, | ||
471 | .cnt_mask = IBS_OP_MAX_CNT, | ||
472 | .enable_mask = IBS_OP_ENABLE, | ||
473 | .valid_mask = IBS_OP_VAL, | ||
474 | .max_period = IBS_OP_MAX_CNT << 4, | ||
475 | .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, | ||
476 | .offset_max = MSR_AMD64_IBSOP_REG_COUNT, | ||
477 | |||
478 | .get_count = get_ibs_op_count, | ||
479 | }; | ||
480 | |||
481 | static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) | ||
482 | { | ||
483 | struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); | ||
484 | struct perf_event *event = pcpu->event; | ||
485 | struct hw_perf_event *hwc = &event->hw; | ||
486 | struct perf_sample_data data; | ||
487 | struct perf_raw_record raw; | ||
488 | struct pt_regs regs; | ||
489 | struct perf_ibs_data ibs_data; | ||
490 | int offset, size, check_rip, offset_max, throttle = 0; | ||
491 | unsigned int msr; | ||
492 | u64 *buf, *config, period; | ||
493 | |||
494 | if (!test_bit(IBS_STARTED, pcpu->state)) { | ||
495 | /* | ||
496 | * Catch spurious interrupts after stopping IBS: After | ||
497 | * disabling IBS there could be still incomming NMIs | ||
498 | * with samples that even have the valid bit cleared. | ||
499 | * Mark all this NMIs as handled. | ||
500 | */ | ||
501 | return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; | ||
502 | } | ||
503 | |||
504 | msr = hwc->config_base; | ||
505 | buf = ibs_data.regs; | ||
506 | rdmsrl(msr, *buf); | ||
507 | if (!(*buf++ & perf_ibs->valid_mask)) | ||
508 | return 0; | ||
509 | |||
510 | config = &ibs_data.regs[0]; | ||
511 | perf_ibs_event_update(perf_ibs, event, config); | ||
512 | perf_sample_data_init(&data, 0, hwc->last_period); | ||
513 | if (!perf_ibs_set_period(perf_ibs, hwc, &period)) | ||
514 | goto out; /* no sw counter overflow */ | ||
515 | |||
516 | ibs_data.caps = ibs_caps; | ||
517 | size = 1; | ||
518 | offset = 1; | ||
519 | check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); | ||
520 | if (event->attr.sample_type & PERF_SAMPLE_RAW) | ||
521 | offset_max = perf_ibs->offset_max; | ||
522 | else if (check_rip) | ||
523 | offset_max = 2; | ||
524 | else | ||
525 | offset_max = 1; | ||
526 | do { | ||
527 | rdmsrl(msr + offset, *buf++); | ||
528 | size++; | ||
529 | offset = find_next_bit(perf_ibs->offset_mask, | ||
530 | perf_ibs->offset_max, | ||
531 | offset + 1); | ||
532 | } while (offset < offset_max); | ||
533 | ibs_data.size = sizeof(u64) * size; | ||
534 | |||
535 | regs = *iregs; | ||
536 | if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { | ||
537 | regs.flags &= ~PERF_EFLAGS_EXACT; | ||
538 | } else { | ||
539 | instruction_pointer_set(®s, ibs_data.regs[1]); | ||
540 | regs.flags |= PERF_EFLAGS_EXACT; | ||
541 | } | ||
542 | |||
543 | if (event->attr.sample_type & PERF_SAMPLE_RAW) { | ||
544 | raw.size = sizeof(u32) + ibs_data.size; | ||
545 | raw.data = ibs_data.data; | ||
546 | data.raw = &raw; | ||
547 | } | ||
548 | |||
549 | throttle = perf_event_overflow(event, &data, ®s); | ||
550 | out: | ||
551 | if (throttle) | ||
552 | perf_ibs_disable_event(perf_ibs, hwc, *config); | ||
553 | else | ||
554 | perf_ibs_enable_event(perf_ibs, hwc, period >> 4); | ||
555 | |||
556 | perf_event_update_userpage(event); | ||
557 | |||
558 | return 1; | ||
559 | } | ||
560 | |||
561 | static int __kprobes | ||
562 | perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) | ||
563 | { | ||
564 | int handled = 0; | ||
565 | |||
566 | handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); | ||
567 | handled += perf_ibs_handle_irq(&perf_ibs_op, regs); | ||
568 | |||
569 | if (handled) | ||
570 | inc_irq_stat(apic_perf_irqs); | ||
571 | |||
572 | return handled; | ||
573 | } | ||
574 | |||
575 | static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) | ||
576 | { | ||
577 | struct cpu_perf_ibs __percpu *pcpu; | ||
578 | int ret; | ||
579 | |||
580 | pcpu = alloc_percpu(struct cpu_perf_ibs); | ||
581 | if (!pcpu) | ||
582 | return -ENOMEM; | ||
583 | |||
584 | perf_ibs->pcpu = pcpu; | ||
585 | |||
586 | ret = perf_pmu_register(&perf_ibs->pmu, name, -1); | ||
587 | if (ret) { | ||
588 | perf_ibs->pcpu = NULL; | ||
589 | free_percpu(pcpu); | ||
590 | } | ||
591 | |||
592 | return ret; | ||
593 | } | ||
594 | |||
43 | static __init int perf_event_ibs_init(void) | 595 | static __init int perf_event_ibs_init(void) |
44 | { | 596 | { |
45 | if (!ibs_caps) | 597 | if (!ibs_caps) |
46 | return -ENODEV; /* ibs not supported by the cpu */ | 598 | return -ENODEV; /* ibs not supported by the cpu */ |
47 | 599 | ||
48 | perf_pmu_register(&perf_ibs, "ibs", -1); | 600 | perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); |
601 | if (ibs_caps & IBS_CAPS_OPCNT) | ||
602 | perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; | ||
603 | perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); | ||
604 | register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); | ||
49 | printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); | 605 | printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); |
50 | 606 | ||
51 | return 0; | 607 | return 0; |
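
Worth noting about perf_event_ibs_init(): the perf_ibs_pmu_init() return values are ignored, so a failed registration still reaches the "detected" printk. A stricter variant, purely as a sketch (the _checked name is invented, and register_nmi_handler() is assumed to return an error code):

    /* Sketch: the same init sequence, but propagating failures. */
    static __init int perf_event_ibs_init_checked(void)
    {
            int ret;

            if (!ibs_caps)
                    return -ENODEV;         /* ibs not supported by the cpu */

            ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
            if (ret)
                    return ret;

            if (ibs_caps & IBS_CAPS_OPCNT)
                    perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;

            ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
            if (ret)
                    return ret;

            return register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler,
                                        0, "perf_ibs");
    }
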
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2fef10..166546ec6ae 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
1027 | u64 status; | 1027 | u64 status; |
1028 | int handled; | 1028 | int handled; |
1029 | 1029 | ||
1030 | perf_sample_data_init(&data, 0); | ||
1031 | |||
1032 | cpuc = &__get_cpu_var(cpu_hw_events); | 1030 | cpuc = &__get_cpu_var(cpu_hw_events); |
1033 | 1031 | ||
1034 | /* | 1032 | /* |
@@ -1082,7 +1080,7 @@ again: | |||
1082 | if (!intel_pmu_save_and_restart(event)) | 1080 | if (!intel_pmu_save_and_restart(event)) |
1083 | continue; | 1081 | continue; |
1084 | 1082 | ||
1085 | data.period = event->hw.last_period; | 1083 | perf_sample_data_init(&data, 0, event->hw.last_period); |
1086 | 1084 | ||
1087 | if (has_branch_stack(event)) | 1085 | if (has_branch_stack(event)) |
1088 | data.br_stack = &cpuc->lbr_stack; | 1086 | data.br_stack = &cpuc->lbr_stack; |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7d..5a3edc27f6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) | |||
316 | 316 | ||
317 | ds->bts_index = ds->bts_buffer_base; | 317 | ds->bts_index = ds->bts_buffer_base; |
318 | 318 | ||
319 | perf_sample_data_init(&data, 0); | 319 | perf_sample_data_init(&data, 0, event->hw.last_period); |
320 | data.period = event->hw.last_period; | ||
321 | regs.ip = 0; | 320 | regs.ip = 0; |
322 | 321 | ||
323 | /* | 322 | /* |
@@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, | |||
564 | if (!intel_pmu_save_and_restart(event)) | 563 | if (!intel_pmu_save_and_restart(event)) |
565 | return; | 564 | return; |
566 | 565 | ||
567 | perf_sample_data_init(&data, 0); | 566 | perf_sample_data_init(&data, 0, event->hw.last_period); |
568 | data.period = event->hw.last_period; | ||
569 | 567 | ||
570 | /* | 568 | /* |
571 | * We use the interrupt regs as a base because the PEBS record | 569 | * We use the interrupt regs as a base because the PEBS record |
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacfd710..47124a73dd7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
1005 | int idx, handled = 0; | 1005 | int idx, handled = 0; |
1006 | u64 val; | 1006 | u64 val; |
1007 | 1007 | ||
1008 | perf_sample_data_init(&data, 0); | ||
1009 | |||
1010 | cpuc = &__get_cpu_var(cpu_hw_events); | 1008 | cpuc = &__get_cpu_var(cpu_hw_events); |
1011 | 1009 | ||
1012 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1010 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
1034 | handled += overflow; | 1032 | handled += overflow; |
1035 | 1033 | ||
1036 | /* event overflow for sure */ | 1034 | /* event overflow for sure */ |
1037 | data.period = event->hw.last_period; | 1035 | perf_sample_data_init(&data, 0, hwc->last_period); |
1038 | 1036 | ||
1039 | if (!x86_perf_event_set_period(event)) | 1037 | if (!x86_perf_event_set_period(event)) |
1040 | continue; | 1038 | continue; |
1039 | |||
1040 | |||
1041 | if (perf_event_overflow(event, &data, regs)) | 1041 | if (perf_event_overflow(event, &data, regs)) |
1042 | x86_pmu_stop(event, 0); | 1042 | x86_pmu_stop(event, 0); |
1043 | } | 1043 | } |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1b81839b6c8..571246d81ed 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
271 | current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) | 271 | current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) |
272 | return 1; | 272 | return 1; |
273 | 273 | ||
274 | show_registers(regs); | 274 | show_regs(regs); |
275 | #ifdef CONFIG_X86_32 | 275 | #ifdef CONFIG_X86_32 |
276 | if (user_mode_vm(regs)) { | 276 | if (user_mode_vm(regs)) { |
277 | sp = regs->sp; | 277 | sp = regs->sp; |
@@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err) | |||
311 | 311 | ||
312 | static int __init kstack_setup(char *s) | 312 | static int __init kstack_setup(char *s) |
313 | { | 313 | { |
314 | ssize_t ret; | ||
315 | unsigned long val; | ||
316 | |||
314 | if (!s) | 317 | if (!s) |
315 | return -EINVAL; | 318 | return -EINVAL; |
316 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | 319 | |
320 | ret = kstrtoul(s, 0, &val); | ||
321 | if (ret) | ||
322 | return ret; | ||
323 | kstack_depth_to_print = val; | ||
317 | return 0; | 324 | return 0; |
318 | } | 325 | } |
319 | early_param("kstack", kstack_setup); | 326 | early_param("kstack", kstack_setup); |
320 | 327 | ||
321 | static int __init code_bytes_setup(char *s) | 328 | static int __init code_bytes_setup(char *s) |
322 | { | 329 | { |
323 | code_bytes = simple_strtoul(s, NULL, 0); | 330 | ssize_t ret; |
331 | unsigned long val; | ||
332 | |||
333 | if (!s) | ||
334 | return -EINVAL; | ||
335 | |||
336 | ret = kstrtoul(s, 0, &val); | ||
337 | if (ret) | ||
338 | return ret; | ||
339 | |||
340 | code_bytes = val; | ||
324 | if (code_bytes > 8192) | 341 | if (code_bytes > 8192) |
325 | code_bytes = 8192; | 342 | code_bytes = 8192; |
326 | 343 | ||
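
Both setup handlers move from simple_strtoul(), which silently yields 0 on garbage, to kstrtoul(), which rejects malformed or overflowing input with an error code. The pattern generalizes (helper name is illustrative):

    /* Sketch: checked early-param parsing with kstrtoul(). */
    static int __init parse_ulong_param(char *s, unsigned long *out)
    {
            unsigned long val;
            int ret;

            if (!s)
                    return -EINVAL;

            ret = kstrtoul(s, 0, &val);     /* base 0: 0x.., 0.., decimal */
            if (ret)
                    return ret;             /* -EINVAL or -ERANGE */

            *out = val;
            return 0;
    }
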
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 88ec9129271..e0b1d783daa 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
82 | } | 82 | } |
83 | 83 | ||
84 | 84 | ||
85 | void show_registers(struct pt_regs *regs) | 85 | void show_regs(struct pt_regs *regs) |
86 | { | 86 | { |
87 | int i; | 87 | int i; |
88 | 88 | ||
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 17107bd6e1f..791b76122aa 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
245 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | 245 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); |
246 | } | 246 | } |
247 | 247 | ||
248 | void show_registers(struct pt_regs *regs) | 248 | void show_regs(struct pt_regs *regs) |
249 | { | 249 | { |
250 | int i; | 250 | int i; |
251 | unsigned long sp; | 251 | unsigned long sp; |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 62d61e9976e..41857970517 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, | |||
113 | int x = e820x->nr_map; | 113 | int x = e820x->nr_map; |
114 | 114 | ||
115 | if (x >= ARRAY_SIZE(e820x->map)) { | 115 | if (x >= ARRAY_SIZE(e820x->map)) { |
116 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | 116 | printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", |
117 | (unsigned long long) start, | ||
118 | (unsigned long long) (start + size - 1)); | ||
117 | return; | 119 | return; |
118 | } | 120 | } |
119 | 121 | ||
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type) | |||
133 | switch (type) { | 135 | switch (type) { |
134 | case E820_RAM: | 136 | case E820_RAM: |
135 | case E820_RESERVED_KERN: | 137 | case E820_RESERVED_KERN: |
136 | printk(KERN_CONT "(usable)"); | 138 | printk(KERN_CONT "usable"); |
137 | break; | 139 | break; |
138 | case E820_RESERVED: | 140 | case E820_RESERVED: |
139 | printk(KERN_CONT "(reserved)"); | 141 | printk(KERN_CONT "reserved"); |
140 | break; | 142 | break; |
141 | case E820_ACPI: | 143 | case E820_ACPI: |
142 | printk(KERN_CONT "(ACPI data)"); | 144 | printk(KERN_CONT "ACPI data"); |
143 | break; | 145 | break; |
144 | case E820_NVS: | 146 | case E820_NVS: |
145 | printk(KERN_CONT "(ACPI NVS)"); | 147 | printk(KERN_CONT "ACPI NVS"); |
146 | break; | 148 | break; |
147 | case E820_UNUSABLE: | 149 | case E820_UNUSABLE: |
148 | printk(KERN_CONT "(unusable)"); | 150 | printk(KERN_CONT "unusable"); |
149 | break; | 151 | break; |
150 | default: | 152 | default: |
151 | printk(KERN_CONT "type %u", type); | 153 | printk(KERN_CONT "type %u", type); |
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who) | |||
158 | int i; | 160 | int i; |
159 | 161 | ||
160 | for (i = 0; i < e820.nr_map; i++) { | 162 | for (i = 0; i < e820.nr_map; i++) { |
161 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | 163 | printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, |
162 | (unsigned long long) e820.map[i].addr, | 164 | (unsigned long long) e820.map[i].addr, |
163 | (unsigned long long) | 165 | (unsigned long long) |
164 | (e820.map[i].addr + e820.map[i].size)); | 166 | (e820.map[i].addr + e820.map[i].size - 1)); |
165 | e820_print_type(e820.map[i].type); | 167 | e820_print_type(e820.map[i].type); |
166 | printk(KERN_CONT "\n"); | 168 | printk(KERN_CONT "\n"); |
167 | } | 169 | } |
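
Every e820 printk change in this file follows one convention switch: ranges are printed inclusively as [mem start-end] with end = start + size - 1, replacing the old half-open start - start+size form. The conversion at a glance:

    /* Sketch: half-open [addr, addr + size) printed as an inclusive range. */
    printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx]\n", who,
           (unsigned long long)addr,
           (unsigned long long)(addr + size - 1)); /* note the "- 1" */
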
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, | |||
428 | size = ULLONG_MAX - start; | 430 | size = ULLONG_MAX - start; |
429 | 431 | ||
430 | end = start + size; | 432 | end = start + size; |
431 | printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", | 433 | printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", |
432 | (unsigned long long) start, | 434 | (unsigned long long) start, (unsigned long long) (end - 1)); |
433 | (unsigned long long) end); | ||
434 | e820_print_type(old_type); | 435 | e820_print_type(old_type); |
435 | printk(KERN_CONT " ==> "); | 436 | printk(KERN_CONT " ==> "); |
436 | e820_print_type(new_type); | 437 | e820_print_type(new_type); |
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, | |||
509 | size = ULLONG_MAX - start; | 510 | size = ULLONG_MAX - start; |
510 | 511 | ||
511 | end = start + size; | 512 | end = start + size; |
512 | printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", | 513 | printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", |
513 | (unsigned long long) start, | 514 | (unsigned long long) start, (unsigned long long) (end - 1)); |
514 | (unsigned long long) end); | ||
515 | if (checktype) | 515 | if (checktype) |
516 | e820_print_type(old_type); | 516 | e820_print_type(old_type); |
517 | printk(KERN_CONT "\n"); | 517 | printk(KERN_CONT "\n"); |
@@ -567,7 +567,7 @@ void __init update_e820(void) | |||
567 | if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) | 567 | if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) |
568 | return; | 568 | return; |
569 | e820.nr_map = nr_map; | 569 | e820.nr_map = nr_map; |
570 | printk(KERN_INFO "modified physical RAM map:\n"); | 570 | printk(KERN_INFO "e820: modified physical RAM map:\n"); |
571 | e820_print_map("modified"); | 571 | e820_print_map("modified"); |
572 | } | 572 | } |
573 | static void __init update_e820_saved(void) | 573 | static void __init update_e820_saved(void) |
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void) | |||
637 | if (!found) { | 637 | if (!found) { |
638 | gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; | 638 | gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; |
639 | printk(KERN_ERR | 639 | printk(KERN_ERR |
640 | "PCI: Warning: Cannot find a gap in the 32bit address range\n" | 640 | "e820: cannot find a gap in the 32bit address range\n" |
641 | "PCI: Unassigned devices with 32bit resource registers may break!\n"); | 641 | "e820: PCI devices with unassigned 32bit BARs may break!\n"); |
642 | } | 642 | } |
643 | #endif | 643 | #endif |
644 | 644 | ||
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void) | |||
648 | pci_mem_start = gapstart; | 648 | pci_mem_start = gapstart; |
649 | 649 | ||
650 | printk(KERN_INFO | 650 | printk(KERN_INFO |
651 | "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | 651 | "e820: [mem %#010lx-%#010lx] available for PCI devices\n", |
652 | pci_mem_start, gapstart, gapsize); | 652 | gapstart, gapstart + gapsize - 1); |
653 | } | 653 | } |
654 | 654 | ||
655 | /** | 655 | /** |
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata) | |||
667 | extmap = (struct e820entry *)(sdata->data); | 667 | extmap = (struct e820entry *)(sdata->data); |
668 | __append_e820_map(extmap, entries); | 668 | __append_e820_map(extmap, entries); |
669 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 669 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
670 | printk(KERN_INFO "extended physical RAM map:\n"); | 670 | printk(KERN_INFO "e820: extended physical RAM map:\n"); |
671 | e820_print_map("extended"); | 671 | e820_print_map("extended"); |
672 | } | 672 | } |
673 | 673 | ||
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) | |||
734 | addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 734 | addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
735 | if (addr) { | 735 | if (addr) { |
736 | e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); | 736 | e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); |
737 | printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); | 737 | printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); |
738 | update_e820_saved(); | 738 | update_e820_saved(); |
739 | } | 739 | } |
740 | 740 | ||
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) | |||
784 | if (last_pfn > max_arch_pfn) | 784 | if (last_pfn > max_arch_pfn) |
785 | last_pfn = max_arch_pfn; | 785 | last_pfn = max_arch_pfn; |
786 | 786 | ||
787 | printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", | 787 | printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", |
788 | last_pfn, max_arch_pfn); | 788 | last_pfn, max_arch_pfn); |
789 | return last_pfn; | 789 | return last_pfn; |
790 | } | 790 | } |
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void) | |||
888 | early_panic("Invalid user supplied memory map"); | 888 | early_panic("Invalid user supplied memory map"); |
889 | e820.nr_map = nr; | 889 | e820.nr_map = nr; |
890 | 890 | ||
891 | printk(KERN_INFO "user-defined physical RAM map:\n"); | 891 | printk(KERN_INFO "e820: user-defined physical RAM map:\n"); |
892 | e820_print_map("user"); | 892 | e820_print_map("user"); |
893 | } | 893 | } |
894 | } | 894 | } |
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void) | |||
996 | end = MAX_RESOURCE_SIZE; | 996 | end = MAX_RESOURCE_SIZE; |
997 | if (start >= end) | 997 | if (start >= end) |
998 | continue; | 998 | continue; |
999 | printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", | 999 | printk(KERN_DEBUG |
1000 | start, end); | 1000 | "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", |
1001 | start, end); | ||
1001 | reserve_region_with_split(&iomem_resource, start, end, | 1002 | reserve_region_with_split(&iomem_resource, start, end, |
1002 | "RAM buffer"); | 1003 | "RAM buffer"); |
1003 | } | 1004 | } |
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void) | |||
1047 | 1048 | ||
1048 | who = x86_init.resources.memory_setup(); | 1049 | who = x86_init.resources.memory_setup(); |
1049 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); | 1050 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); |
1050 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | 1051 | printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); |
1051 | e820_print_map(who); | 1052 | e820_print_map(who); |
1052 | } | 1053 | } |
1053 | 1054 | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7b784f4ef1e..01ccf9b7147 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <asm/irq_vectors.h> | 56 | #include <asm/irq_vectors.h> |
57 | #include <asm/cpufeature.h> | 57 | #include <asm/cpufeature.h> |
58 | #include <asm/alternative-asm.h> | 58 | #include <asm/alternative-asm.h> |
59 | #include <asm/asm.h> | ||
59 | 60 | ||
60 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | 61 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
61 | #include <linux/elf-em.h> | 62 | #include <linux/elf-em.h> |
@@ -151,10 +152,8 @@ | |||
151 | .pushsection .fixup, "ax" | 152 | .pushsection .fixup, "ax" |
152 | 99: movl $0, (%esp) | 153 | 99: movl $0, (%esp) |
153 | jmp 98b | 154 | jmp 98b |
154 | .section __ex_table, "a" | ||
155 | .align 4 | ||
156 | .long 98b, 99b | ||
157 | .popsection | 155 | .popsection |
156 | _ASM_EXTABLE(98b,99b) | ||
158 | .endm | 157 | .endm |
159 | 158 | ||
160 | .macro PTGS_TO_GS | 159 | .macro PTGS_TO_GS |
@@ -164,10 +163,8 @@ | |||
164 | .pushsection .fixup, "ax" | 163 | .pushsection .fixup, "ax" |
165 | 99: movl $0, PT_GS(%esp) | 164 | 99: movl $0, PT_GS(%esp) |
166 | jmp 98b | 165 | jmp 98b |
167 | .section __ex_table, "a" | ||
168 | .align 4 | ||
169 | .long 98b, 99b | ||
170 | .popsection | 166 | .popsection |
167 | _ASM_EXTABLE(98b,99b) | ||
171 | .endm | 168 | .endm |
172 | 169 | ||
173 | .macro GS_TO_REG reg | 170 | .macro GS_TO_REG reg |
@@ -249,12 +246,10 @@ | |||
249 | jmp 2b | 246 | jmp 2b |
250 | 6: movl $0, (%esp) | 247 | 6: movl $0, (%esp) |
251 | jmp 3b | 248 | jmp 3b |
252 | .section __ex_table, "a" | ||
253 | .align 4 | ||
254 | .long 1b, 4b | ||
255 | .long 2b, 5b | ||
256 | .long 3b, 6b | ||
257 | .popsection | 249 | .popsection |
250 | _ASM_EXTABLE(1b,4b) | ||
251 | _ASM_EXTABLE(2b,5b) | ||
252 | _ASM_EXTABLE(3b,6b) | ||
258 | POP_GS_EX | 253 | POP_GS_EX |
259 | .endm | 254 | .endm |
260 | 255 | ||
@@ -415,10 +410,7 @@ sysenter_past_esp: | |||
415 | jae syscall_fault | 410 | jae syscall_fault |
416 | 1: movl (%ebp),%ebp | 411 | 1: movl (%ebp),%ebp |
417 | movl %ebp,PT_EBP(%esp) | 412 | movl %ebp,PT_EBP(%esp) |
418 | .section __ex_table,"a" | 413 | _ASM_EXTABLE(1b,syscall_fault) |
419 | .align 4 | ||
420 | .long 1b,syscall_fault | ||
421 | .previous | ||
422 | 414 | ||
423 | GET_THREAD_INFO(%ebp) | 415 | GET_THREAD_INFO(%ebp) |
424 | 416 | ||
@@ -485,10 +477,8 @@ sysexit_audit: | |||
485 | .pushsection .fixup,"ax" | 477 | .pushsection .fixup,"ax" |
486 | 2: movl $0,PT_FS(%esp) | 478 | 2: movl $0,PT_FS(%esp) |
487 | jmp 1b | 479 | jmp 1b |
488 | .section __ex_table,"a" | ||
489 | .align 4 | ||
490 | .long 1b,2b | ||
491 | .popsection | 480 | .popsection |
481 | _ASM_EXTABLE(1b,2b) | ||
492 | PTGS_TO_GS_EX | 482 | PTGS_TO_GS_EX |
493 | ENDPROC(ia32_sysenter_target) | 483 | ENDPROC(ia32_sysenter_target) |
494 | 484 | ||
@@ -543,10 +533,7 @@ ENTRY(iret_exc) | |||
543 | pushl $do_iret_error | 533 | pushl $do_iret_error |
544 | jmp error_code | 534 | jmp error_code |
545 | .previous | 535 | .previous |
546 | .section __ex_table,"a" | 536 | _ASM_EXTABLE(irq_return,iret_exc) |
547 | .align 4 | ||
548 | .long irq_return,iret_exc | ||
549 | .previous | ||
550 | 537 | ||
551 | CFI_RESTORE_STATE | 538 | CFI_RESTORE_STATE |
552 | ldt_ss: | 539 | ldt_ss: |
@@ -901,10 +888,7 @@ END(device_not_available) | |||
901 | #ifdef CONFIG_PARAVIRT | 888 | #ifdef CONFIG_PARAVIRT |
902 | ENTRY(native_iret) | 889 | ENTRY(native_iret) |
903 | iret | 890 | iret |
904 | .section __ex_table,"a" | 891 | _ASM_EXTABLE(native_iret, iret_exc) |
905 | .align 4 | ||
906 | .long native_iret, iret_exc | ||
907 | .previous | ||
908 | END(native_iret) | 892 | END(native_iret) |
909 | 893 | ||
910 | ENTRY(native_irq_enable_sysexit) | 894 | ENTRY(native_irq_enable_sysexit) |
@@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback) | |||
1093 | movl %eax,16(%esp) | 1077 | movl %eax,16(%esp) |
1094 | jmp 4b | 1078 | jmp 4b |
1095 | .previous | 1079 | .previous |
1096 | .section __ex_table,"a" | 1080 | _ASM_EXTABLE(1b,6b) |
1097 | .align 4 | 1081 | _ASM_EXTABLE(2b,7b) |
1098 | .long 1b,6b | 1082 | _ASM_EXTABLE(3b,8b) |
1099 | .long 2b,7b | 1083 | _ASM_EXTABLE(4b,9b) |
1100 | .long 3b,8b | ||
1101 | .long 4b,9b | ||
1102 | .previous | ||
1103 | ENDPROC(xen_failsafe_callback) | 1084 | ENDPROC(xen_failsafe_callback) |
1104 | 1085 | ||
1105 | BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, | 1086 | BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cdc79b5cfcd..320852d0202 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <asm/paravirt.h> | 55 | #include <asm/paravirt.h> |
56 | #include <asm/ftrace.h> | 56 | #include <asm/ftrace.h> |
57 | #include <asm/percpu.h> | 57 | #include <asm/percpu.h> |
58 | #include <asm/asm.h> | ||
58 | #include <linux/err.h> | 59 | #include <linux/err.h> |
59 | 60 | ||
60 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | 61 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
@@ -900,18 +901,12 @@ restore_args: | |||
900 | 901 | ||
901 | irq_return: | 902 | irq_return: |
902 | INTERRUPT_RETURN | 903 | INTERRUPT_RETURN |
903 | 904 | _ASM_EXTABLE(irq_return, bad_iret) | |
904 | .section __ex_table, "a" | ||
905 | .quad irq_return, bad_iret | ||
906 | .previous | ||
907 | 905 | ||
908 | #ifdef CONFIG_PARAVIRT | 906 | #ifdef CONFIG_PARAVIRT |
909 | ENTRY(native_iret) | 907 | ENTRY(native_iret) |
910 | iretq | 908 | iretq |
911 | 909 | _ASM_EXTABLE(native_iret, bad_iret) | |
912 | .section __ex_table,"a" | ||
913 | .quad native_iret, bad_iret | ||
914 | .previous | ||
915 | #endif | 910 | #endif |
916 | 911 | ||
917 | .section .fixup,"ax" | 912 | .section .fixup,"ax" |
@@ -1181,10 +1176,7 @@ gs_change: | |||
1181 | CFI_ENDPROC | 1176 | CFI_ENDPROC |
1182 | END(native_load_gs_index) | 1177 | END(native_load_gs_index) |
1183 | 1178 | ||
1184 | .section __ex_table,"a" | 1179 | _ASM_EXTABLE(gs_change,bad_gs) |
1185 | .align 8 | ||
1186 | .quad gs_change,bad_gs | ||
1187 | .previous | ||
1188 | .section .fixup,"ax" | 1180 | .section .fixup,"ax" |
1189 | /* running with kernelgs */ | 1181 | /* running with kernelgs */ |
1190 | bad_gs: | 1182 | bad_gs: |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272f..32ff36596ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -24,40 +24,21 @@ | |||
24 | #include <trace/syscall.h> | 24 | #include <trace/syscall.h> |
25 | 25 | ||
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/kprobes.h> | ||
27 | #include <asm/ftrace.h> | 28 | #include <asm/ftrace.h> |
28 | #include <asm/nops.h> | 29 | #include <asm/nops.h> |
29 | #include <asm/nmi.h> | ||
30 | |||
31 | 30 | ||
32 | #ifdef CONFIG_DYNAMIC_FTRACE | 31 | #ifdef CONFIG_DYNAMIC_FTRACE |
33 | 32 | ||
34 | /* | ||
35 | * modifying_code is set to notify NMIs that they need to use | ||
36 | * memory barriers when entering or exiting. But we don't want | ||
37 | * to burden NMIs with unnecessary memory barriers when code | ||
38 | * modification is not being done (which is most of the time). | ||
39 | * | ||
40 | * A mutex is already held when ftrace_arch_code_modify_prepare | ||
41 | * and post_process are called. No locks need to be taken here. | ||
42 | * | ||
43 | * Stop machine will make sure currently running NMIs are done | ||
44 | * and new NMIs will see the updated variable before we need | ||
45 | * to worry about NMIs doing memory barriers. | ||
46 | */ | ||
47 | static int modifying_code __read_mostly; | ||
48 | static DEFINE_PER_CPU(int, save_modifying_code); | ||
49 | |||
50 | int ftrace_arch_code_modify_prepare(void) | 33 | int ftrace_arch_code_modify_prepare(void) |
51 | { | 34 | { |
52 | set_kernel_text_rw(); | 35 | set_kernel_text_rw(); |
53 | set_all_modules_text_rw(); | 36 | set_all_modules_text_rw(); |
54 | modifying_code = 1; | ||
55 | return 0; | 37 | return 0; |
56 | } | 38 | } |
57 | 39 | ||
58 | int ftrace_arch_code_modify_post_process(void) | 40 | int ftrace_arch_code_modify_post_process(void) |
59 | { | 41 | { |
60 | modifying_code = 0; | ||
61 | set_all_modules_text_ro(); | 42 | set_all_modules_text_ro(); |
62 | set_kernel_text_ro(); | 43 | set_kernel_text_ro(); |
63 | return 0; | 44 | return 0; |
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) | |||
90 | return calc.code; | 71 | return calc.code; |
91 | } | 72 | } |
92 | 73 | ||
93 | /* | ||
94 | * Modifying code must take extra care. On an SMP machine, if | ||
95 | * the code being modified is also being executed on another CPU | ||
96 | * that CPU will have undefined results and possibly take a GPF. | ||
97 | * We use kstop_machine to stop other CPUs from executing code. | ||
98 | * But this does not stop NMIs from happening. We still need | ||
99 | * to protect against that. We separate out the modification of | ||
100 | * the code to take care of this. | ||
101 | * | ||
102 | * Two buffers are added: An IP buffer and a "code" buffer. | ||
103 | * | ||
104 | * 1) Put the instruction pointer into the IP buffer | ||
105 | * and the new code into the "code" buffer. | ||
106 | * 2) Wait for any running NMIs to finish and set a flag that says | ||
107 | * we are modifying code, it is done in an atomic operation. | ||
108 | * 3) Write the code | ||
109 | * 4) clear the flag. | ||
110 | * 5) Wait for any running NMIs to finish. | ||
111 | * | ||
112 | * If an NMI is executed, the first thing it does is to call | ||
113 | * "ftrace_nmi_enter". This will check if the flag is set to write | ||
114 | * and if it is, it will write what is in the IP and "code" buffers. | ||
115 | * | ||
116 | * The trick is, it does not matter if everyone is writing the same | ||
117 | * content to the code location. Also, if a CPU is executing code | ||
118 | * it is OK to write to that code location if the contents being written | ||
119 | * are the same as what exists. | ||
120 | */ | ||
121 | |||
122 | #define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ | ||
123 | static atomic_t nmi_running = ATOMIC_INIT(0); | ||
124 | static int mod_code_status; /* holds return value of text write */ | ||
125 | static void *mod_code_ip; /* holds the IP to write to */ | ||
126 | static const void *mod_code_newcode; /* holds the text to write to the IP */ | ||
127 | |||
128 | static unsigned nmi_wait_count; | ||
129 | static atomic_t nmi_update_count = ATOMIC_INIT(0); | ||
130 | |||
131 | int ftrace_arch_read_dyn_info(char *buf, int size) | ||
132 | { | ||
133 | int r; | ||
134 | |||
135 | r = snprintf(buf, size, "%u %u", | ||
136 | nmi_wait_count, | ||
137 | atomic_read(&nmi_update_count)); | ||
138 | return r; | ||
139 | } | ||
140 | |||
141 | static void clear_mod_flag(void) | ||
142 | { | ||
143 | int old = atomic_read(&nmi_running); | ||
144 | |||
145 | for (;;) { | ||
146 | int new = old & ~MOD_CODE_WRITE_FLAG; | ||
147 | |||
148 | if (old == new) | ||
149 | break; | ||
150 | |||
151 | old = atomic_cmpxchg(&nmi_running, old, new); | ||
152 | } | ||
153 | } | ||
154 | |||
155 | static void ftrace_mod_code(void) | ||
156 | { | ||
157 | /* | ||
158 | * Yes, more than one CPU process can be writing to mod_code_status. | ||
159 | * (and the code itself) | ||
160 | * But if one were to fail, then they all should, and if one were | ||
161 | * to succeed, then they all should. | ||
162 | */ | ||
163 | mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, | ||
164 | MCOUNT_INSN_SIZE); | ||
165 | |||
166 | /* if we fail, then kill any new writers */ | ||
167 | if (mod_code_status) | ||
168 | clear_mod_flag(); | ||
169 | } | ||
170 | |||
171 | void ftrace_nmi_enter(void) | ||
172 | { | ||
173 | __this_cpu_write(save_modifying_code, modifying_code); | ||
174 | |||
175 | if (!__this_cpu_read(save_modifying_code)) | ||
176 | return; | ||
177 | |||
178 | if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { | ||
179 | smp_rmb(); | ||
180 | ftrace_mod_code(); | ||
181 | atomic_inc(&nmi_update_count); | ||
182 | } | ||
183 | /* Must have previous changes seen before executions */ | ||
184 | smp_mb(); | ||
185 | } | ||
186 | |||
187 | void ftrace_nmi_exit(void) | ||
188 | { | ||
189 | if (!__this_cpu_read(save_modifying_code)) | ||
190 | return; | ||
191 | |||
192 | /* Finish all executions before clearing nmi_running */ | ||
193 | smp_mb(); | ||
194 | atomic_dec(&nmi_running); | ||
195 | } | ||
196 | |||
197 | static void wait_for_nmi_and_set_mod_flag(void) | ||
198 | { | ||
199 | if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) | ||
200 | return; | ||
201 | |||
202 | do { | ||
203 | cpu_relax(); | ||
204 | } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); | ||
205 | |||
206 | nmi_wait_count++; | ||
207 | } | ||
208 | |||
209 | static void wait_for_nmi(void) | ||
210 | { | ||
211 | if (!atomic_read(&nmi_running)) | ||
212 | return; | ||
213 | |||
214 | do { | ||
215 | cpu_relax(); | ||
216 | } while (atomic_read(&nmi_running)); | ||
217 | |||
218 | nmi_wait_count++; | ||
219 | } | ||
220 | |||
221 | static inline int | 74 | static inline int |
222 | within(unsigned long addr, unsigned long start, unsigned long end) | 75 | within(unsigned long addr, unsigned long start, unsigned long end) |
223 | { | 76 | { |
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) | |||
238 | if (within(ip, (unsigned long)_text, (unsigned long)_etext)) | 91 | if (within(ip, (unsigned long)_text, (unsigned long)_etext)) |
239 | ip = (unsigned long)__va(__pa(ip)); | 92 | ip = (unsigned long)__va(__pa(ip)); |
240 | 93 | ||
241 | mod_code_ip = (void *)ip; | 94 | return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); |
242 | mod_code_newcode = new_code; | ||
243 | |||
244 | /* The buffers need to be visible before we let NMIs write them */ | ||
245 | smp_mb(); | ||
246 | |||
247 | wait_for_nmi_and_set_mod_flag(); | ||
248 | |||
249 | /* Make sure all running NMIs have finished before we write the code */ | ||
250 | smp_mb(); | ||
251 | |||
252 | ftrace_mod_code(); | ||
253 | |||
254 | /* Make sure the write happens before clearing the bit */ | ||
255 | smp_mb(); | ||
256 | |||
257 | clear_mod_flag(); | ||
258 | wait_for_nmi(); | ||
259 | |||
260 | return mod_code_status; | ||
261 | } | 95 | } |
262 | 96 | ||
263 | static const unsigned char *ftrace_nop_replace(void) | 97 | static const unsigned char *ftrace_nop_replace(void) |
@@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func) | |||
334 | return ret; | 168 | return ret; |
335 | } | 169 | } |
336 | 170 | ||
171 | int modifying_ftrace_code __read_mostly; | ||
172 | |||
173 | /* | ||
174 | * A breakpoint was added to the code address we are about to | ||
175 | * modify, and this is the handler that will just skip over it. | ||
176 | * We are either changing a nop into a trace call, or a trace | ||
177 | * call to a nop. While the change is taking place, we treat | ||
178 | * it just like it was a nop. | ||
179 | */ | ||
180 | int ftrace_int3_handler(struct pt_regs *regs) | ||
181 | { | ||
182 | if (WARN_ON_ONCE(!regs)) | ||
183 | return 0; | ||
184 | |||
185 | if (!ftrace_location(regs->ip - 1)) | ||
186 | return 0; | ||
187 | |||
188 | regs->ip += MCOUNT_INSN_SIZE - 1; | ||
189 | |||
190 | return 1; | ||
191 | } | ||
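ftrace_int3_handler() leans on two facts: int3 is a single-byte opcode planted over the first byte of a five-byte mcount call site (MCOUNT_INSN_SIZE is 5 on x86), and on a #BP trap regs->ip points one past the int3. A worked example, with the address purely illustrative:

	/* Suppose the patched call site sits at 0x1000 (illustrative).   */
	/* bytes at 0x1000:  cc xx xx xx xx  (int3 over the 5-byte slot)  */
	/* on trap:          regs->ip == 0x1001                           */
	/* site lookup:      ftrace_location(regs->ip - 1) -> 0x1000      */
	/* skip the slot:    regs->ip += 5 - 1  ->  0x1005                */

Any CPU that stumbles into a half-modified site thus resumes at the next instruction, exactly as if the site were a nop.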
192 | |||
193 | static int ftrace_write(unsigned long ip, const char *val, int size) | ||
194 | { | ||
195 | /* | ||
196 | * On x86_64, kernel text mappings are mapped read-only with | ||
197 | * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead | ||
198 | * of the kernel text mapping to modify the kernel text. | ||
199 | * | ||
200 | * For 32bit kernels, these mappings are same and we can use | ||
201 | * kernel identity mapping to modify code. | ||
202 | */ | ||
203 | if (within(ip, (unsigned long)_text, (unsigned long)_etext)) | ||
204 | ip = (unsigned long)__va(__pa(ip)); | ||
205 | |||
206 | return probe_kernel_write((void *)ip, val, size); | ||
207 | } | ||
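A concrete illustration of the aliasing the comment describes (the addresses are assumptions for a typical x86_64 layout of this era, not guarantees):

	unsigned long ip    = 0xffffffff81002af0UL;          /* kernel text mapping, RO  */
	unsigned long alias = (unsigned long)__va(__pa(ip)); /* direct mapping, writable */
	/* e.g. alias == 0xffff880001002af0UL; both name the same physical bytes */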
208 | |||
209 | static int add_break(unsigned long ip, const char *old) | ||
210 | { | ||
211 | unsigned char replaced[MCOUNT_INSN_SIZE]; | ||
212 | unsigned char brk = BREAKPOINT_INSTRUCTION; | ||
213 | |||
214 | if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) | ||
215 | return -EFAULT; | ||
216 | |||
217 | /* Make sure it is what we expect it to be */ | ||
218 | if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) | ||
219 | return -EINVAL; | ||
220 | |||
221 | if (ftrace_write(ip, &brk, 1)) | ||
222 | return -EPERM; | ||
223 | |||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) | ||
228 | { | ||
229 | unsigned const char *old; | ||
230 | unsigned long ip = rec->ip; | ||
231 | |||
232 | old = ftrace_call_replace(ip, addr); | ||
233 | |||
234 | return add_break(rec->ip, old); | ||
235 | } | ||
236 | |||
237 | |||
238 | static int add_brk_on_nop(struct dyn_ftrace *rec) | ||
239 | { | ||
240 | unsigned const char *old; | ||
241 | |||
242 | old = ftrace_nop_replace(); | ||
243 | |||
244 | return add_break(rec->ip, old); | ||
245 | } | ||
246 | |||
247 | static int add_breakpoints(struct dyn_ftrace *rec, int enable) | ||
248 | { | ||
249 | unsigned long ftrace_addr; | ||
250 | int ret; | ||
251 | |||
252 | ret = ftrace_test_record(rec, enable); | ||
253 | |||
254 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
255 | |||
256 | switch (ret) { | ||
257 | case FTRACE_UPDATE_IGNORE: | ||
258 | return 0; | ||
259 | |||
260 | case FTRACE_UPDATE_MAKE_CALL: | ||
261 | /* converting nop to call */ | ||
262 | return add_brk_on_nop(rec); | ||
263 | |||
264 | case FTRACE_UPDATE_MAKE_NOP: | ||
265 | /* converting a call to a nop */ | ||
266 | return add_brk_on_call(rec, ftrace_addr); | ||
267 | } | ||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * On error, we need to remove breakpoints. This needs to | ||
273 | * be done carefully. If the address does not currently have a | ||
274 | * breakpoint, we know we are done. Otherwise, we look at the | ||
275 | * remaining 4 bytes of the instruction. If it matches a nop | ||
276 | * we replace the breakpoint with the nop. Otherwise we replace | ||
277 | * it with the call instruction. | ||
278 | */ | ||
279 | static int remove_breakpoint(struct dyn_ftrace *rec) | ||
280 | { | ||
281 | unsigned char ins[MCOUNT_INSN_SIZE]; | ||
282 | unsigned char brk = BREAKPOINT_INSTRUCTION; | ||
283 | const unsigned char *nop; | ||
284 | unsigned long ftrace_addr; | ||
285 | unsigned long ip = rec->ip; | ||
286 | |||
287 | /* If we fail the read, just give up */ | ||
288 | if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) | ||
289 | return -EFAULT; | ||
290 | |||
291 | /* If this does not have a breakpoint, we are done */ | ||
292 | if (ins[0] != brk) | ||
293 | return -1; | ||
294 | |||
295 | nop = ftrace_nop_replace(); | ||
296 | |||
297 | /* | ||
298 | * If the last 4 bytes of the instruction do not match | ||
299 | * a nop, then we assume that this is a call to ftrace_addr. | ||
300 | */ | ||
301 | if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { | ||
302 | /* | ||
303 | * For extra paranoia, we check that the breakpoint is on | ||
304 | * a call that would actually jump to the ftrace_addr. | ||
305 | * If not, don't touch the breakpoint; we would just create | ||
306 | * a disaster. | ||
307 | */ | ||
308 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
309 | nop = ftrace_call_replace(ip, ftrace_addr); | ||
310 | |||
311 | if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) | ||
312 | return -EINVAL; | ||
313 | } | ||
314 | |||
315 | return probe_kernel_write((void *)ip, &nop[0], 1); | ||
316 | } | ||
317 | |||
318 | static int add_update_code(unsigned long ip, unsigned const char *new) | ||
319 | { | ||
320 | /* skip breakpoint */ | ||
321 | ip++; | ||
322 | new++; | ||
323 | if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) | ||
324 | return -EPERM; | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) | ||
329 | { | ||
330 | unsigned long ip = rec->ip; | ||
331 | unsigned const char *new; | ||
332 | |||
333 | new = ftrace_call_replace(ip, addr); | ||
334 | return add_update_code(ip, new); | ||
335 | } | ||
336 | |||
337 | static int add_update_nop(struct dyn_ftrace *rec) | ||
338 | { | ||
339 | unsigned long ip = rec->ip; | ||
340 | unsigned const char *new; | ||
341 | |||
342 | new = ftrace_nop_replace(); | ||
343 | return add_update_code(ip, new); | ||
344 | } | ||
345 | |||
346 | static int add_update(struct dyn_ftrace *rec, int enable) | ||
347 | { | ||
348 | unsigned long ftrace_addr; | ||
349 | int ret; | ||
350 | |||
351 | ret = ftrace_test_record(rec, enable); | ||
352 | |||
353 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
354 | |||
355 | switch (ret) { | ||
356 | case FTRACE_UPDATE_IGNORE: | ||
357 | return 0; | ||
358 | |||
359 | case FTRACE_UPDATE_MAKE_CALL: | ||
360 | /* converting nop to call */ | ||
361 | return add_update_call(rec, ftrace_addr); | ||
362 | |||
363 | case FTRACE_UPDATE_MAKE_NOP: | ||
364 | /* converting a call to a nop */ | ||
365 | return add_update_nop(rec); | ||
366 | } | ||
367 | |||
368 | return 0; | ||
369 | } | ||
370 | |||
371 | static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) | ||
372 | { | ||
373 | unsigned long ip = rec->ip; | ||
374 | unsigned const char *new; | ||
375 | |||
376 | new = ftrace_call_replace(ip, addr); | ||
377 | |||
378 | if (ftrace_write(ip, new, 1)) | ||
379 | return -EPERM; | ||
380 | |||
381 | return 0; | ||
382 | } | ||
383 | |||
384 | static int finish_update_nop(struct dyn_ftrace *rec) | ||
385 | { | ||
386 | unsigned long ip = rec->ip; | ||
387 | unsigned const char *new; | ||
388 | |||
389 | new = ftrace_nop_replace(); | ||
390 | |||
391 | if (ftrace_write(ip, new, 1)) | ||
392 | return -EPERM; | ||
393 | return 0; | ||
394 | } | ||
395 | |||
396 | static int finish_update(struct dyn_ftrace *rec, int enable) | ||
397 | { | ||
398 | unsigned long ftrace_addr; | ||
399 | int ret; | ||
400 | |||
401 | ret = ftrace_update_record(rec, enable); | ||
402 | |||
403 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
404 | |||
405 | switch (ret) { | ||
406 | case FTRACE_UPDATE_IGNORE: | ||
407 | return 0; | ||
408 | |||
409 | case FTRACE_UPDATE_MAKE_CALL: | ||
410 | /* converting nop to call */ | ||
411 | return finish_update_call(rec, ftrace_addr); | ||
412 | |||
413 | case FTRACE_UPDATE_MAKE_NOP: | ||
414 | /* converting a call to a nop */ | ||
415 | return finish_update_nop(rec); | ||
416 | } | ||
417 | |||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | static void do_sync_core(void *data) | ||
422 | { | ||
423 | sync_core(); | ||
424 | } | ||
425 | |||
426 | static void run_sync(void) | ||
427 | { | ||
428 | int enable_irqs = irqs_disabled(); | ||
429 | |||
430 | /* We may be called with interrupts disabled (on bootup). */ | ||
431 | if (enable_irqs) | ||
432 | local_irq_enable(); | ||
433 | on_each_cpu(do_sync_core, NULL, 1); | ||
434 | if (enable_irqs) | ||
435 | local_irq_disable(); | ||
436 | } | ||
437 | |||
438 | void ftrace_replace_code(int enable) | ||
439 | { | ||
440 | struct ftrace_rec_iter *iter; | ||
441 | struct dyn_ftrace *rec; | ||
442 | const char *report = "adding breakpoints"; | ||
443 | int count = 0; | ||
444 | int ret; | ||
445 | |||
446 | for_ftrace_rec_iter(iter) { | ||
447 | rec = ftrace_rec_iter_record(iter); | ||
448 | |||
449 | ret = add_breakpoints(rec, enable); | ||
450 | if (ret) | ||
451 | goto remove_breakpoints; | ||
452 | count++; | ||
453 | } | ||
454 | |||
455 | run_sync(); | ||
456 | |||
457 | report = "updating code"; | ||
458 | |||
459 | for_ftrace_rec_iter(iter) { | ||
460 | rec = ftrace_rec_iter_record(iter); | ||
461 | |||
462 | ret = add_update(rec, enable); | ||
463 | if (ret) | ||
464 | goto remove_breakpoints; | ||
465 | } | ||
466 | |||
467 | run_sync(); | ||
468 | |||
469 | report = "removing breakpoints"; | ||
470 | |||
471 | for_ftrace_rec_iter(iter) { | ||
472 | rec = ftrace_rec_iter_record(iter); | ||
473 | |||
474 | ret = finish_update(rec, enable); | ||
475 | if (ret) | ||
476 | goto remove_breakpoints; | ||
477 | } | ||
478 | |||
479 | run_sync(); | ||
480 | |||
481 | return; | ||
482 | |||
483 | remove_breakpoints: | ||
484 | ftrace_bug(ret, rec ? rec->ip : 0); | ||
485 | printk(KERN_WARNING "Failed on %s (%d):\n", report, count); | ||
486 | for_ftrace_rec_iter(iter) { | ||
487 | rec = ftrace_rec_iter_record(iter); | ||
488 | remove_breakpoint(rec); | ||
489 | } | ||
490 | } | ||
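At the byte level, one record going from nop to call moves through the three passes like this (the opcode bytes are illustrative; the actual 5-byte nop and the call displacement vary per site):

	/* pass 1, add_breakpoints: 0f 1f 44 00 00 -> cc 1f 44 00 00     */
	/* run_sync(): every CPU serializes and sees the int3            */
	/* pass 2, add_update:      cc 1f 44 00 00 -> cc d1 fe ff ff     */
	/*                          (tail 4 bytes of "call ftrace_addr") */
	/* run_sync()                                                    */
	/* pass 3, finish_update:   cc d1 fe ff ff -> e8 d1 fe ff ff     */
	/* run_sync(): the live 5-byte call is now complete              */

At every intermediate point the site is either a valid old instruction, a valid new instruction, or an int3 that ftrace_int3_handler() turns into a skip.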
491 | |||
492 | void arch_ftrace_update_code(int command) | ||
493 | { | ||
494 | modifying_ftrace_code++; | ||
495 | |||
496 | ftrace_modify_all_code(command); | ||
497 | |||
498 | modifying_ftrace_code--; | ||
499 | } | ||
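The modifying_ftrace_code counter is what gates the fast path in the breakpoint trap handler; the matching traps.c hook is not part of this hunk, but it looks roughly like this (a sketch, not the verbatim change):

	/* early in do_int3(), before kprobes/notify_die (sketch): */
	#ifdef CONFIG_DYNAMIC_FTRACE
		/* ftrace must be first; anything else may recurse fatally */
		if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
			return;
	#endif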
500 | |||
337 | int __init ftrace_dyn_arch_init(void *data) | 501 | int __init ftrace_dyn_arch_init(void *data) |
338 | { | 502 | { |
339 | /* The return code is returned via data */ | 503 | /* The return code is returned via data */ |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index a3c2b4ffebc..d42ab17b739 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <asm/msr-index.h> | 21 | #include <asm/msr-index.h> |
22 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
23 | #include <asm/percpu.h> | 23 | #include <asm/percpu.h> |
24 | #include <asm/nops.h> | ||
24 | 25 | ||
25 | /* Physical address */ | 26 | /* Physical address */ |
26 | #define pa(X) ((X) - __PAGE_OFFSET) | 27 | #define pa(X) ((X) - __PAGE_OFFSET) |
@@ -360,28 +361,23 @@ default_entry: | |||
360 | pushl $0 | 361 | pushl $0 |
361 | popfl | 362 | popfl |
362 | 363 | ||
363 | #ifdef CONFIG_SMP | ||
364 | cmpb $0, ready | ||
365 | jnz checkCPUtype | ||
366 | #endif /* CONFIG_SMP */ | ||
367 | |||
368 | /* | 364 | /* |
369 | * start system 32-bit setup. We need to re-do some of the things done | 365 | * start system 32-bit setup. We need to re-do some of the things done |
370 | * in 16-bit mode for the "real" operations. | 366 | * in 16-bit mode for the "real" operations. |
371 | */ | 367 | */ |
372 | call setup_idt | 368 | movl setup_once_ref,%eax |
373 | 369 | andl %eax,%eax | |
374 | checkCPUtype: | 370 | jz 1f # Did we do this already? |
375 | 371 | call *%eax | |
376 | movl $-1,X86_CPUID # -1 for no CPUID initially | 372 | 1: |
377 | 373 | ||
378 | /* check if it is 486 or 386. */ | 374 | /* check if it is 486 or 386. */ |
379 | /* | 375 | /* |
380 | * XXX - this does a lot of unnecessary setup. Alignment checks don't | 376 | * XXX - this does a lot of unnecessary setup. Alignment checks don't |
381 | * apply at our cpl of 0 and the stack ought to be aligned already, and | 377 | * apply at our cpl of 0 and the stack ought to be aligned already, and |
382 | * we don't need to preserve eflags. | 378 | * we don't need to preserve eflags. |
383 | */ | 379 | */ |
384 | 380 | movl $-1,X86_CPUID # -1 for no CPUID initially | |
385 | movb $3,X86 # at least 386 | 381 | movb $3,X86 # at least 386 |
386 | pushfl # push EFLAGS | 382 | pushfl # push EFLAGS |
387 | popl %eax # get EFLAGS | 383 | popl %eax # get EFLAGS |
@@ -447,21 +443,6 @@ is386: movl $2,%ecx # set MP | |||
447 | movl $(__KERNEL_PERCPU), %eax | 443 | movl $(__KERNEL_PERCPU), %eax |
448 | movl %eax,%fs # set this cpu's percpu | 444 | movl %eax,%fs # set this cpu's percpu |
449 | 445 | ||
450 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
451 | /* | ||
452 | * The linker can't handle this by relocation. Manually set | ||
453 | * base address in stack canary segment descriptor. | ||
454 | */ | ||
455 | cmpb $0,ready | ||
456 | jne 1f | ||
457 | movl $gdt_page,%eax | ||
458 | movl $stack_canary,%ecx | ||
459 | movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) | ||
460 | shrl $16, %ecx | ||
461 | movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) | ||
462 | movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) | ||
463 | 1: | ||
464 | #endif | ||
465 | movl $(__KERNEL_STACK_CANARY),%eax | 446 | movl $(__KERNEL_STACK_CANARY),%eax |
466 | movl %eax,%gs | 447 | movl %eax,%gs |
467 | 448 | ||
@@ -470,7 +451,6 @@ is386: movl $2,%ecx # set MP | |||
470 | 451 | ||
471 | cld # gcc2 wants the direction flag cleared at all times | 452 | cld # gcc2 wants the direction flag cleared at all times |
472 | pushl $0 # fake return address for unwinder | 453 | pushl $0 # fake return address for unwinder |
473 | movb $1, ready | ||
474 | jmp *(initial_code) | 454 | jmp *(initial_code) |
475 | 455 | ||
476 | /* | 456 | /* |
@@ -492,81 +472,122 @@ check_x87: | |||
492 | .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ | 472 | .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ |
493 | ret | 473 | ret |
494 | 474 | ||
475 | |||
476 | #include "verify_cpu.S" | ||
477 | |||
495 | /* | 478 | /* |
496 | * setup_idt | 479 | * setup_once |
497 | * | 480 | * |
498 | * sets up an idt with 256 entries pointing to | 481 | * The setup work we only want to run on the BSP. |
499 | * ignore_int, interrupt gates. It doesn't actually load | ||
500 | * idt - that can be done only after paging has been enabled | ||
501 | * and the kernel moved to PAGE_OFFSET. Interrupts | ||
502 | * are enabled elsewhere, when we can be relatively | ||
503 | * sure everything is ok. | ||
504 | * | 482 | * |
505 | * Warning: %esi is live across this function. | 483 | * Warning: %esi is live across this function. |
506 | */ | 484 | */ |
507 | setup_idt: | 485 | __INIT |
508 | lea ignore_int,%edx | 486 | setup_once: |
509 | movl $(__KERNEL_CS << 16),%eax | 487 | /* |
510 | movw %dx,%ax /* selector = 0x0010 = cs */ | 488 | * Set up an idt with 256 entries pointing to ignore_int, |
511 | movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ | 489 | * interrupt gates. It doesn't actually load idt - that needs |
490 | * to be done on each CPU. Interrupts are enabled elsewhere, | ||
491 | * when we can be relatively sure everything is ok. | ||
492 | */ | ||
512 | 493 | ||
513 | lea idt_table,%edi | 494 | movl $idt_table,%edi |
514 | mov $256,%ecx | 495 | movl $early_idt_handlers,%eax |
515 | rp_sidt: | 496 | movl $NUM_EXCEPTION_VECTORS,%ecx |
497 | 1: | ||
516 | movl %eax,(%edi) | 498 | movl %eax,(%edi) |
517 | movl %edx,4(%edi) | 499 | movl %eax,4(%edi) |
500 | /* interrupt gate, dpl=0, present */ | ||
501 | movl $(0x8E000000 + __KERNEL_CS),2(%edi) | ||
502 | addl $9,%eax | ||
518 | addl $8,%edi | 503 | addl $8,%edi |
519 | dec %ecx | 504 | loop 1b |
520 | jne rp_sidt | ||
521 | 505 | ||
522 | .macro set_early_handler handler,trapno | 506 | movl $256 - NUM_EXCEPTION_VECTORS,%ecx |
523 | lea \handler,%edx | 507 | movl $ignore_int,%edx |
524 | movl $(__KERNEL_CS << 16),%eax | 508 | movl $(__KERNEL_CS << 16),%eax |
525 | movw %dx,%ax | 509 | movw %dx,%ax /* selector = 0x0010 = cs */ |
526 | movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ | 510 | movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ |
527 | lea idt_table,%edi | 511 | 2: |
528 | movl %eax,8*\trapno(%edi) | 512 | movl %eax,(%edi) |
529 | movl %edx,8*\trapno+4(%edi) | 513 | movl %edx,4(%edi) |
530 | .endm | 514 | addl $8,%edi |
515 | loop 2b | ||
531 | 516 | ||
532 | set_early_handler handler=early_divide_err,trapno=0 | 517 | #ifdef CONFIG_CC_STACKPROTECTOR |
533 | set_early_handler handler=early_illegal_opcode,trapno=6 | 518 | /* |
534 | set_early_handler handler=early_protection_fault,trapno=13 | 519 | * Configure the stack canary. The linker can't handle this by |
535 | set_early_handler handler=early_page_fault,trapno=14 | 520 | * relocation. Manually set base address in stack canary |
521 | * segment descriptor. | ||
522 | */ | ||
523 | movl $gdt_page,%eax | ||
524 | movl $stack_canary,%ecx | ||
525 | movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) | ||
526 | shrl $16, %ecx | ||
527 | movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) | ||
528 | movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) | ||
529 | #endif | ||
536 | 530 | ||
531 | andl $0,setup_once_ref /* Once is enough, thanks */ | ||
537 | ret | 532 | ret |
538 | 533 | ||
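Together with the dispatch at the top of default_entry ("movl setup_once_ref,%eax; ... jz 1f; call *%eax"), the final andl above implements a call-once pointer. In C terms the pattern is roughly the following sketch; like the assembly, it is safe only because the BSP runs to completion before any AP enters the path:

	static void (*setup_once_ref)(void) = setup_once;

	/* every CPU passes through here on the way up */
	if (setup_once_ref)		/* only the first CPU sees it non-NULL */
		setup_once_ref();

	/* ...and setup_once() ends with: */
	setup_once_ref = NULL;		/* "Once is enough, thanks" */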
539 | early_divide_err: | 534 | ENTRY(early_idt_handlers) |
540 | xor %edx,%edx | 535 | # 36(%esp) %eflags |
541 | pushl $0 /* fake errcode */ | 536 | # 32(%esp) %cs |
542 | jmp early_fault | 537 | # 28(%esp) %eip |
538 | # 24(%esp) error code | ||
539 | i = 0 | ||
540 | .rept NUM_EXCEPTION_VECTORS | ||
541 | .if (EXCEPTION_ERRCODE_MASK >> i) & 1 | ||
542 | ASM_NOP2 | ||
543 | .else | ||
544 | pushl $0 # Dummy error code, to make stack frame uniform | ||
545 | .endif | ||
546 | pushl $i # 20(%esp) Vector number | ||
547 | jmp early_idt_handler | ||
548 | i = i + 1 | ||
549 | .endr | ||
550 | ENDPROC(early_idt_handlers) | ||
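Each generated stub is exactly nine bytes, which is what the "addl $9,%eax" in setup_once steps by: 2 bytes for ASM_NOP2 or "pushl $0" (push imm8), 2 bytes for "pushl $i", and 5 bytes for the near jmp. As an invariant:

	/* stub layout in early_idt_handlers, 9 bytes per vector:        */
	/*   ASM_NOP2 or pushl $0      2 bytes (uniform error-code slot) */
	/*   pushl $i                  2 bytes (vector number, imm8)     */
	/*   jmp early_idt_handler     5 bytes (e9 rel32)                */
	/* entry for vector i is at early_idt_handlers + 9 * i           */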
551 | |||
552 | /* This is global to keep gas from relaxing the jumps */ | ||
553 | ENTRY(early_idt_handler) | ||
554 | cld | ||
555 | cmpl $2,%ss:early_recursion_flag | ||
556 | je hlt_loop | ||
557 | incl %ss:early_recursion_flag | ||
543 | 558 | ||
544 | early_illegal_opcode: | 559 | push %eax # 16(%esp) |
545 | movl $6,%edx | 560 | push %ecx # 12(%esp) |
546 | pushl $0 /* fake errcode */ | 561 | push %edx # 8(%esp) |
547 | jmp early_fault | 562 | push %ds # 4(%esp) |
563 | push %es # 0(%esp) | ||
564 | movl $(__KERNEL_DS),%eax | ||
565 | movl %eax,%ds | ||
566 | movl %eax,%es | ||
548 | 567 | ||
549 | early_protection_fault: | 568 | cmpl $(__KERNEL_CS),32(%esp) |
550 | movl $13,%edx | 569 | jne 10f |
551 | jmp early_fault | ||
552 | 570 | ||
553 | early_page_fault: | 571 | leal 28(%esp),%eax # Pointer to %eip |
554 | movl $14,%edx | 572 | call early_fixup_exception |
555 | jmp early_fault | 573 | andl %eax,%eax |
574 | jnz ex_entry /* found an exception entry */ | ||
556 | 575 | ||
557 | early_fault: | 576 | 10: |
558 | cld | ||
559 | #ifdef CONFIG_PRINTK | 577 | #ifdef CONFIG_PRINTK |
560 | pusha | 578 | xorl %eax,%eax |
561 | movl $(__KERNEL_DS),%eax | 579 | movw %ax,2(%esp) /* clean up the segment values on some cpus */ |
562 | movl %eax,%ds | 580 | movw %ax,6(%esp) |
563 | movl %eax,%es | 581 | movw %ax,34(%esp) |
564 | cmpl $2,early_recursion_flag | 582 | leal 40(%esp),%eax |
565 | je hlt_loop | 583 | pushl %eax /* %esp before the exception */ |
566 | incl early_recursion_flag | 584 | pushl %ebx |
585 | pushl %ebp | ||
586 | pushl %esi | ||
587 | pushl %edi | ||
567 | movl %cr2,%eax | 588 | movl %cr2,%eax |
568 | pushl %eax | 589 | pushl %eax |
569 | pushl %edx /* trapno */ | 590 | pushl (20+6*4)(%esp) /* trapno */ |
570 | pushl $fault_msg | 591 | pushl $fault_msg |
571 | call printk | 592 | call printk |
572 | #endif | 593 | #endif |
@@ -575,6 +596,17 @@ hlt_loop: | |||
575 | hlt | 596 | hlt |
576 | jmp hlt_loop | 597 | jmp hlt_loop |
577 | 598 | ||
599 | ex_entry: | ||
600 | pop %es | ||
601 | pop %ds | ||
602 | pop %edx | ||
603 | pop %ecx | ||
604 | pop %eax | ||
605 | addl $8,%esp /* drop vector number and error code */ | ||
606 | decl %ss:early_recursion_flag | ||
607 | iret | ||
608 | ENDPROC(early_idt_handler) | ||
609 | |||
578 | /* This is the default interrupt "handler" :-) */ | 610 | /* This is the default interrupt "handler" :-) */ |
579 | ALIGN | 611 | ALIGN |
580 | ignore_int: | 612 | ignore_int: |
@@ -608,13 +640,18 @@ ignore_int: | |||
608 | popl %eax | 640 | popl %eax |
609 | #endif | 641 | #endif |
610 | iret | 642 | iret |
643 | ENDPROC(ignore_int) | ||
644 | __INITDATA | ||
645 | .align 4 | ||
646 | early_recursion_flag: | ||
647 | .long 0 | ||
611 | 648 | ||
612 | #include "verify_cpu.S" | 649 | __REFDATA |
613 | 650 | .align 4 | |
614 | __REFDATA | ||
615 | .align 4 | ||
616 | ENTRY(initial_code) | 651 | ENTRY(initial_code) |
617 | .long i386_start_kernel | 652 | .long i386_start_kernel |
653 | ENTRY(setup_once_ref) | ||
654 | .long setup_once | ||
618 | 655 | ||
619 | /* | 656 | /* |
620 | * BSS section | 657 | * BSS section |
@@ -667,22 +704,19 @@ ENTRY(initial_page_table) | |||
667 | ENTRY(stack_start) | 704 | ENTRY(stack_start) |
668 | .long init_thread_union+THREAD_SIZE | 705 | .long init_thread_union+THREAD_SIZE |
669 | 706 | ||
670 | early_recursion_flag: | 707 | __INITRODATA |
671 | .long 0 | ||
672 | |||
673 | ready: .byte 0 | ||
674 | |||
675 | int_msg: | 708 | int_msg: |
676 | .asciz "Unknown interrupt or fault at: %p %p %p\n" | 709 | .asciz "Unknown interrupt or fault at: %p %p %p\n" |
677 | 710 | ||
678 | fault_msg: | 711 | fault_msg: |
679 | /* fault info: */ | 712 | /* fault info: */ |
680 | .ascii "BUG: Int %d: CR2 %p\n" | 713 | .ascii "BUG: Int %d: CR2 %p\n" |
681 | /* pusha regs: */ | 714 | /* regs pushed in early_idt_handler: */ |
682 | .ascii " EDI %p ESI %p EBP %p ESP %p\n" | 715 | .ascii " EDI %p ESI %p EBP %p EBX %p\n" |
683 | .ascii " EBX %p EDX %p ECX %p EAX %p\n" | 716 | .ascii " ESP %p ES %p DS %p\n" |
717 | .ascii " EDX %p ECX %p EAX %p\n" | ||
684 | /* fault frame: */ | 718 | /* fault frame: */ |
685 | .ascii " err %p EIP %p CS %p flg %p\n" | 719 | .ascii " vec %p err %p EIP %p CS %p flg %p\n" |
686 | .ascii "Stack: %p %p %p %p %p %p %p %p\n" | 720 | .ascii "Stack: %p %p %p %p %p %p %p %p\n" |
687 | .ascii " %p %p %p %p %p %p %p %p\n" | 721 | .ascii " %p %p %p %p %p %p %p %p\n" |
688 | .asciz " %p %p %p %p %p %p %p %p\n" | 722 | .asciz " %p %p %p %p %p %p %p %p\n" |
@@ -696,6 +730,7 @@ fault_msg: | |||
696 | * segment size, and 32-bit linear address value: | 730 | * segment size, and 32-bit linear address value: |
697 | */ | 731 | */ |
698 | 732 | ||
733 | .data | ||
699 | .globl boot_gdt_descr | 734 | .globl boot_gdt_descr |
700 | .globl idt_descr | 735 | .globl idt_descr |
701 | 736 | ||
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d70bc2eb202..94bf9cc2c7e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -19,12 +19,15 @@ | |||
19 | #include <asm/cache.h> | 19 | #include <asm/cache.h> |
20 | #include <asm/processor-flags.h> | 20 | #include <asm/processor-flags.h> |
21 | #include <asm/percpu.h> | 21 | #include <asm/percpu.h> |
22 | #include <asm/nops.h> | ||
22 | 23 | ||
23 | #ifdef CONFIG_PARAVIRT | 24 | #ifdef CONFIG_PARAVIRT |
24 | #include <asm/asm-offsets.h> | 25 | #include <asm/asm-offsets.h> |
25 | #include <asm/paravirt.h> | 26 | #include <asm/paravirt.h> |
27 | #define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg | ||
26 | #else | 28 | #else |
27 | #define GET_CR2_INTO_RCX movq %cr2, %rcx | 29 | #define GET_CR2_INTO(reg) movq %cr2, reg |
30 | #define INTERRUPT_RETURN iretq | ||
28 | #endif | 31 | #endif |
29 | 32 | ||
30 | /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE | 33 | /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE |
@@ -266,36 +269,56 @@ bad_address: | |||
266 | jmp bad_address | 269 | jmp bad_address |
267 | 270 | ||
268 | .section ".init.text","ax" | 271 | .section ".init.text","ax" |
269 | #ifdef CONFIG_EARLY_PRINTK | ||
270 | .globl early_idt_handlers | 272 | .globl early_idt_handlers |
271 | early_idt_handlers: | 273 | early_idt_handlers: |
274 | # 104(%rsp) %rflags | ||
275 | # 96(%rsp) %cs | ||
276 | # 88(%rsp) %rip | ||
277 | # 80(%rsp) error code | ||
272 | i = 0 | 278 | i = 0 |
273 | .rept NUM_EXCEPTION_VECTORS | 279 | .rept NUM_EXCEPTION_VECTORS |
274 | movl $i, %esi | 280 | .if (EXCEPTION_ERRCODE_MASK >> i) & 1 |
281 | ASM_NOP2 | ||
282 | .else | ||
283 | pushq $0 # Dummy error code, to make stack frame uniform | ||
284 | .endif | ||
285 | pushq $i # 72(%rsp) Vector number | ||
275 | jmp early_idt_handler | 286 | jmp early_idt_handler |
276 | i = i + 1 | 287 | i = i + 1 |
277 | .endr | 288 | .endr |
278 | #endif | ||
279 | 289 | ||
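The .if test replaces the old run-time check against the magic 0x27d00 deleted below. EXCEPTION_ERRCODE_MASK (defined alongside NUM_EXCEPTION_VECTORS in <asm/segment.h>) has a bit set for each vector whose hardware frame already carries an error code, so those stubs pad with ASM_NOP2 instead of pushing a dummy one. The constant works out to the same value the old code tested:

	/* vectors that push an error code: 8 (#DF), 10 (#TS), 11 (#NP),     */
	/* 12 (#SS), 13 (#GP), 14 (#PF), 17 (#AC)                            */
	/* (1<<8)|(1<<10)|(1<<11)|(1<<12)|(1<<13)|(1<<14)|(1<<17) == 0x27d00 */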
280 | ENTRY(early_idt_handler) | 290 | ENTRY(early_idt_handler) |
281 | #ifdef CONFIG_EARLY_PRINTK | 291 | cld |
292 | |||
282 | cmpl $2,early_recursion_flag(%rip) | 293 | cmpl $2,early_recursion_flag(%rip) |
283 | jz 1f | 294 | jz 1f |
284 | incl early_recursion_flag(%rip) | 295 | incl early_recursion_flag(%rip) |
285 | GET_CR2_INTO_RCX | 296 | |
286 | movq %rcx,%r9 | 297 | pushq %rax # 64(%rsp) |
287 | xorl %r8d,%r8d # zero for error code | 298 | pushq %rcx # 56(%rsp) |
288 | movl %esi,%ecx # get vector number | 299 | pushq %rdx # 48(%rsp) |
289 | # Test %ecx against mask of vectors that push error code. | 300 | pushq %rsi # 40(%rsp) |
290 | cmpl $31,%ecx | 301 | pushq %rdi # 32(%rsp) |
291 | ja 0f | 302 | pushq %r8 # 24(%rsp) |
292 | movl $1,%eax | 303 | pushq %r9 # 16(%rsp) |
293 | salq %cl,%rax | 304 | pushq %r10 # 8(%rsp) |
294 | testl $0x27d00,%eax | 305 | pushq %r11 # 0(%rsp) |
295 | je 0f | 306 | |
296 | popq %r8 # get error code | 307 | cmpl $__KERNEL_CS,96(%rsp) |
297 | 0: movq 0(%rsp),%rcx # get ip | 308 | jne 10f |
298 | movq 8(%rsp),%rdx # get cs | 309 | |
310 | leaq 88(%rsp),%rdi # Pointer to %rip | ||
311 | call early_fixup_exception | ||
312 | andl %eax,%eax | ||
313 | jnz 20f # Found an exception entry | ||
314 | |||
315 | 10: | ||
316 | #ifdef CONFIG_EARLY_PRINTK | ||
317 | GET_CR2_INTO(%r9) # can clobber any volatile register if pv | ||
318 | movl 80(%rsp),%r8d # error code | ||
319 | movl 72(%rsp),%esi # vector number | ||
320 | movl 96(%rsp),%edx # %cs | ||
321 | movq 88(%rsp),%rcx # %rip | ||
299 | xorl %eax,%eax | 322 | xorl %eax,%eax |
300 | leaq early_idt_msg(%rip),%rdi | 323 | leaq early_idt_msg(%rip),%rdi |
301 | call early_printk | 324 | call early_printk |
@@ -304,17 +327,32 @@ ENTRY(early_idt_handler) | |||
304 | call dump_stack | 327 | call dump_stack |
305 | #ifdef CONFIG_KALLSYMS | 328 | #ifdef CONFIG_KALLSYMS |
306 | leaq early_idt_ripmsg(%rip),%rdi | 329 | leaq early_idt_ripmsg(%rip),%rdi |
307 | movq 0(%rsp),%rsi # get rip again | 330 | movq 40(%rsp),%rsi # %rip again |
308 | call __print_symbol | 331 | call __print_symbol |
309 | #endif | 332 | #endif |
310 | #endif /* EARLY_PRINTK */ | 333 | #endif /* EARLY_PRINTK */ |
311 | 1: hlt | 334 | 1: hlt |
312 | jmp 1b | 335 | jmp 1b |
313 | 336 | ||
314 | #ifdef CONFIG_EARLY_PRINTK | 337 | 20: # Exception table entry found |
338 | popq %r11 | ||
339 | popq %r10 | ||
340 | popq %r9 | ||
341 | popq %r8 | ||
342 | popq %rdi | ||
343 | popq %rsi | ||
344 | popq %rdx | ||
345 | popq %rcx | ||
346 | popq %rax | ||
347 | addq $16,%rsp # drop vector number and error code | ||
348 | decl early_recursion_flag(%rip) | ||
349 | INTERRUPT_RETURN | ||
350 | |||
351 | .balign 4 | ||
315 | early_recursion_flag: | 352 | early_recursion_flag: |
316 | .long 0 | 353 | .long 0 |
317 | 354 | ||
355 | #ifdef CONFIG_EARLY_PRINTK | ||
318 | early_idt_msg: | 356 | early_idt_msg: |
319 | .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" | 357 | .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" |
320 | early_idt_ripmsg: | 358 | early_idt_ripmsg: |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad0de0c2714..9cc7b4392f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -94,13 +94,18 @@ static int hpet_verbose; | |||
94 | 94 | ||
95 | static int __init hpet_setup(char *str) | 95 | static int __init hpet_setup(char *str) |
96 | { | 96 | { |
97 | if (str) { | 97 | while (str) { |
98 | char *next = strchr(str, ','); | ||
99 | |||
100 | if (next) | ||
101 | *next++ = 0; | ||
98 | if (!strncmp("disable", str, 7)) | 102 | if (!strncmp("disable", str, 7)) |
99 | boot_hpet_disable = 1; | 103 | boot_hpet_disable = 1; |
100 | if (!strncmp("force", str, 5)) | 104 | if (!strncmp("force", str, 5)) |
101 | hpet_force_user = 1; | 105 | hpet_force_user = 1; |
102 | if (!strncmp("verbose", str, 7)) | 106 | if (!strncmp("verbose", str, 7)) |
103 | hpet_verbose = 1; | 107 | hpet_verbose = 1; |
108 | str = next; | ||
104 | } | 109 | } |
105 | return 1; | 110 | return 1; |
106 | } | 111 | } |
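With the comma-split loop, several hpet options can now be combined in one boot parameter, where the old single-token test honored only one. A hypothetical command-line example:

	hpet=force,verbose	# sets both hpet_force_user and hpet_verbose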
@@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode, | |||
319 | now = hpet_readl(HPET_COUNTER); | 324 | now = hpet_readl(HPET_COUNTER); |
320 | cmp = now + (unsigned int) delta; | 325 | cmp = now + (unsigned int) delta; |
321 | cfg = hpet_readl(HPET_Tn_CFG(timer)); | 326 | cfg = hpet_readl(HPET_Tn_CFG(timer)); |
322 | /* Make sure we use edge triggered interrupts */ | ||
323 | cfg &= ~HPET_TN_LEVEL; | ||
324 | cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | | 327 | cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | |
325 | HPET_TN_SETVAL | HPET_TN_32BIT; | 328 | HPET_TN_SETVAL | HPET_TN_32BIT; |
326 | hpet_writel(cfg, HPET_Tn_CFG(timer)); | 329 | hpet_writel(cfg, HPET_Tn_CFG(timer)); |
@@ -787,15 +790,16 @@ static int hpet_clocksource_register(void) | |||
787 | return 0; | 790 | return 0; |
788 | } | 791 | } |
789 | 792 | ||
793 | static u32 *hpet_boot_cfg; | ||
794 | |||
790 | /** | 795 | /** |
791 | * hpet_enable - Try to setup the HPET timer. Returns 1 on success. | 796 | * hpet_enable - Try to setup the HPET timer. Returns 1 on success. |
792 | */ | 797 | */ |
793 | int __init hpet_enable(void) | 798 | int __init hpet_enable(void) |
794 | { | 799 | { |
795 | unsigned long hpet_period; | 800 | u32 hpet_period, cfg, id; |
796 | unsigned int id; | ||
797 | u64 freq; | 801 | u64 freq; |
798 | int i; | 802 | unsigned int i, last; |
799 | 803 | ||
800 | if (!is_hpet_capable()) | 804 | if (!is_hpet_capable()) |
801 | return 0; | 805 | return 0; |
@@ -847,15 +851,45 @@ int __init hpet_enable(void) | |||
847 | id = hpet_readl(HPET_ID); | 851 | id = hpet_readl(HPET_ID); |
848 | hpet_print_config(); | 852 | hpet_print_config(); |
849 | 853 | ||
854 | last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; | ||
855 | |||
850 | #ifdef CONFIG_HPET_EMULATE_RTC | 856 | #ifdef CONFIG_HPET_EMULATE_RTC |
851 | /* | 857 | /* |
852 | * The legacy routing mode needs at least two channels, tick timer | 858 | * The legacy routing mode needs at least two channels, tick timer |
853 | * and the rtc emulation channel. | 859 | * and the rtc emulation channel. |
854 | */ | 860 | */ |
855 | if (!(id & HPET_ID_NUMBER)) | 861 | if (!last) |
856 | goto out_nohpet; | 862 | goto out_nohpet; |
857 | #endif | 863 | #endif |
858 | 864 | ||
865 | cfg = hpet_readl(HPET_CFG); | ||
866 | hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), | ||
867 | GFP_KERNEL); | ||
868 | if (hpet_boot_cfg) | ||
869 | *hpet_boot_cfg = cfg; | ||
870 | else | ||
871 | pr_warn("HPET initial state will not be saved\n"); | ||
872 | cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); | ||
873 | hpet_writel(cfg, HPET_CFG); | ||
874 | if (cfg) | ||
875 | pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", | ||
876 | cfg); | ||
877 | |||
878 | for (i = 0; i <= last; ++i) { | ||
879 | cfg = hpet_readl(HPET_Tn_CFG(i)); | ||
880 | if (hpet_boot_cfg) | ||
881 | hpet_boot_cfg[i + 1] = cfg; | ||
882 | cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); | ||
883 | hpet_writel(cfg, HPET_Tn_CFG(i)); | ||
884 | cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP | ||
885 | | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | ||
886 | | HPET_TN_FSB | HPET_TN_FSB_CAP); | ||
887 | if (cfg) | ||
888 | pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", | ||
889 | cfg, i); | ||
890 | } | ||
891 | hpet_print_config(); | ||
892 | |||
859 | if (hpet_clocksource_register()) | 893 | if (hpet_clocksource_register()) |
860 | goto out_nohpet; | 894 | goto out_nohpet; |
861 | 895 | ||
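The saved state uses a flat layout that hpet_disable() below unwinds: slot 0 holds the boot-time global config register and slot i+1 holds timer i's config. As a sketch of the invariant (names from this patch):

	/* hpet_boot_cfg: (last + 2) u32 slots                              */
	/*   hpet_boot_cfg[0]     == HPET_CFG at boot                       */
	/*   hpet_boot_cfg[i + 1] == HPET_Tn_CFG(i) at boot, 0 <= i <= last */
	/* hpet_disable(): write disabled global cfg, restore each timer,   */
	/* then rewrite HPET_CFG only if it was enabled at boot              */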
@@ -923,14 +957,28 @@ fs_initcall(hpet_late_init); | |||
923 | void hpet_disable(void) | 957 | void hpet_disable(void) |
924 | { | 958 | { |
925 | if (is_hpet_capable() && hpet_virt_address) { | 959 | if (is_hpet_capable() && hpet_virt_address) { |
926 | unsigned int cfg = hpet_readl(HPET_CFG); | 960 | unsigned int cfg = hpet_readl(HPET_CFG), id, last; |
927 | 961 | ||
928 | if (hpet_legacy_int_enabled) { | 962 | if (hpet_boot_cfg) |
963 | cfg = *hpet_boot_cfg; | ||
964 | else if (hpet_legacy_int_enabled) { | ||
929 | cfg &= ~HPET_CFG_LEGACY; | 965 | cfg &= ~HPET_CFG_LEGACY; |
930 | hpet_legacy_int_enabled = 0; | 966 | hpet_legacy_int_enabled = 0; |
931 | } | 967 | } |
932 | cfg &= ~HPET_CFG_ENABLE; | 968 | cfg &= ~HPET_CFG_ENABLE; |
933 | hpet_writel(cfg, HPET_CFG); | 969 | hpet_writel(cfg, HPET_CFG); |
970 | |||
971 | if (!hpet_boot_cfg) | ||
972 | return; | ||
973 | |||
974 | id = hpet_readl(HPET_ID); | ||
975 | last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); | ||
976 | |||
977 | for (id = 0; id <= last; ++id) | ||
978 | hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); | ||
979 | |||
980 | if (*hpet_boot_cfg & HPET_CFG_ENABLE) | ||
981 | hpet_writel(*hpet_boot_cfg, HPET_CFG); | ||
934 | } | 982 | } |
935 | } | 983 | } |
936 | 984 | ||
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2d6e6498c17..f250431fb50 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -88,7 +88,7 @@ void kernel_fpu_begin(void) | |||
88 | __thread_clear_has_fpu(me); | 88 | __thread_clear_has_fpu(me); |
89 | /* We do 'stts()' in kernel_fpu_end() */ | 89 | /* We do 'stts()' in kernel_fpu_end() */ |
90 | } else { | 90 | } else { |
91 | percpu_write(fpu_owner_task, NULL); | 91 | this_cpu_write(fpu_owner_task, NULL); |
92 | clts(); | 92 | clts(); |
93 | } | 93 | } |
94 | } | 94 | } |
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c deleted file mode 100644 index 43e9ccf4494..00000000000 --- a/arch/x86/kernel/init_task.c +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
14 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
15 | |||
16 | /* | ||
17 | * Initial thread structure. | ||
18 | * | ||
19 | * We need to make sure that this is THREAD_SIZE aligned due to the | ||
20 | * way process stacks are handled. This is done by having a special | ||
21 | * "init_task" linker map entry.. | ||
22 | */ | ||
23 | union thread_union init_thread_union __init_task_data = | ||
24 | { INIT_THREAD_INFO(init_task) }; | ||
25 | |||
26 | /* | ||
27 | * Initial task structure. | ||
28 | * | ||
29 | * All other task structs will be allocated on slabs in fork.c | ||
30 | */ | ||
31 | struct task_struct init_task = INIT_TASK(init_task); | ||
32 | EXPORT_SYMBOL(init_task); | ||
33 | |||
34 | /* | ||
35 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
36 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
37 | * so they are allowed to end up in the .data..cacheline_aligned | ||
38 | * section. Since TSS's are completely CPU-local, we want them | ||
39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
40 | */ | ||
41 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | ||
42 | |||
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 58b7f27cb3e..344faf8d0d6 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu) | |||
127 | return; | 127 | return; |
128 | 128 | ||
129 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), | 129 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), |
130 | THREAD_FLAGS, | 130 | THREADINFO_GFP, |
131 | THREAD_ORDER)); | 131 | THREAD_SIZE_ORDER)); |
132 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); | 132 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); |
133 | irqctx->tinfo.cpu = cpu; | 133 | irqctx->tinfo.cpu = cpu; |
134 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | 134 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; |
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu) | |||
137 | per_cpu(hardirq_ctx, cpu) = irqctx; | 137 | per_cpu(hardirq_ctx, cpu) = irqctx; |
138 | 138 | ||
139 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), | 139 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), |
140 | THREAD_FLAGS, | 140 | THREADINFO_GFP, |
141 | THREAD_ORDER)); | 141 | THREAD_SIZE_ORDER)); |
142 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); | 142 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); |
143 | irqctx->tinfo.cpu = cpu; | 143 | irqctx->tinfo.cpu = cpu; |
144 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 144 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e213fc8408d..e2f751efb7b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
1037 | "current sp %p does not match saved sp %p\n", | 1037 | "current sp %p does not match saved sp %p\n", |
1038 | stack_addr(regs), kcb->jprobe_saved_sp); | 1038 | stack_addr(regs), kcb->jprobe_saved_sp); |
1039 | printk(KERN_ERR "Saved registers for jprobe %p\n", jp); | 1039 | printk(KERN_ERR "Saved registers for jprobe %p\n", jp); |
1040 | show_registers(saved_regs); | 1040 | show_regs(saved_regs); |
1041 | printk(KERN_ERR "Current registers\n"); | 1041 | printk(KERN_ERR "Current registers\n"); |
1042 | show_registers(regs); | 1042 | show_regs(regs); |
1043 | BUG(); | 1043 | BUG(); |
1044 | } | 1044 | } |
1045 | *regs = kcb->jprobe_saved_regs; | 1045 | *regs = kcb->jprobe_saved_regs; |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b8ba6e4a27e..e554e5ad2fe 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -79,7 +79,6 @@ struct kvm_task_sleep_node { | |||
79 | u32 token; | 79 | u32 token; |
80 | int cpu; | 80 | int cpu; |
81 | bool halted; | 81 | bool halted; |
82 | struct mm_struct *mm; | ||
83 | }; | 82 | }; |
84 | 83 | ||
85 | static struct kvm_task_sleep_head { | 84 | static struct kvm_task_sleep_head { |
@@ -126,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token) | |||
126 | 125 | ||
127 | n.token = token; | 126 | n.token = token; |
128 | n.cpu = smp_processor_id(); | 127 | n.cpu = smp_processor_id(); |
129 | n.mm = current->active_mm; | ||
130 | n.halted = idle || preempt_count() > 1; | 128 | n.halted = idle || preempt_count() > 1; |
131 | atomic_inc(&n.mm->mm_count); | ||
132 | init_waitqueue_head(&n.wq); | 129 | init_waitqueue_head(&n.wq); |
133 | hlist_add_head(&n.link, &b->list); | 130 | hlist_add_head(&n.link, &b->list); |
134 | spin_unlock(&b->lock); | 131 | spin_unlock(&b->lock); |
@@ -161,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); | |||
161 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) | 158 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) |
162 | { | 159 | { |
163 | hlist_del_init(&n->link); | 160 | hlist_del_init(&n->link); |
164 | if (!n->mm) | ||
165 | return; | ||
166 | mmdrop(n->mm); | ||
167 | if (n->halted) | 161 | if (n->halted) |
168 | smp_send_reschedule(n->cpu); | 162 | smp_send_reschedule(n->cpu); |
169 | else if (waitqueue_active(&n->wq)) | 163 | else if (waitqueue_active(&n->wq)) |
@@ -207,7 +201,7 @@ again: | |||
207 | * async PF was not yet handled. | 201 | * async PF was not yet handled. |
208 | * Add dummy entry for the token. | 202 | * Add dummy entry for the token. |
209 | */ | 203 | */ |
210 | n = kmalloc(sizeof(*n), GFP_ATOMIC); | 204 | n = kzalloc(sizeof(*n), GFP_ATOMIC); |
211 | if (!n) { | 205 | if (!n) { |
212 | /* | 206 | /* |
213 | * Allocation failed! Busy wait while other cpu | 207 | * Allocation failed! Busy wait while other cpu |
@@ -219,7 +213,6 @@ again: | |||
219 | } | 213 | } |
220 | n->token = token; | 214 | n->token = token; |
221 | n->cpu = smp_processor_id(); | 215 | n->cpu = smp_processor_id(); |
222 | n->mm = NULL; | ||
223 | init_waitqueue_head(&n->wq); | 216 | init_waitqueue_head(&n->wq); |
224 | hlist_add_head(&n->link, &b->list); | 217 | hlist_add_head(&n->link, &b->list); |
225 | } else | 218 | } else |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f8492da65bf..086eb58c6e8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/msr.h> | 22 | #include <asm/msr.h> |
23 | #include <asm/apic.h> | 23 | #include <asm/apic.h> |
24 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
25 | #include <linux/hardirq.h> | ||
25 | 26 | ||
26 | #include <asm/x86_init.h> | 27 | #include <asm/x86_init.h> |
27 | #include <asm/reboot.h> | 28 | #include <asm/reboot.h> |
@@ -114,6 +115,25 @@ static void kvm_get_preset_lpj(void) | |||
114 | preset_lpj = lpj; | 115 | preset_lpj = lpj; |
115 | } | 116 | } |
116 | 117 | ||
118 | bool kvm_check_and_clear_guest_paused(void) | ||
119 | { | ||
120 | bool ret = false; | ||
121 | struct pvclock_vcpu_time_info *src; | ||
122 | |||
123 | /* | ||
124 | * per_cpu() is safe here because this function is only called from | ||
125 | * timer functions where preemption is already disabled. | ||
126 | */ | ||
127 | WARN_ON(!in_atomic()); | ||
128 | src = &__get_cpu_var(hv_clock); | ||
129 | if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { | ||
130 | __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); | ||
131 | ret = true; | ||
132 | } | ||
133 | |||
134 | return ret; | ||
135 | } | ||
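The intended consumer is code that watches for stalls and wants to distinguish a genuinely wedged guest from one the host simply paused; a hypothetical caller sketch:

	/* e.g. in a soft-lockup/clock sanity check (sketch): */
	if (kvm_check_and_clear_guest_paused())
		return;	/* the host had us paused: not a real stall */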
136 | |||
117 | static struct clocksource kvm_clock = { | 137 | static struct clocksource kvm_clock = { |
118 | .name = "kvm-clock", | 138 | .name = "kvm-clock", |
119 | .read = kvm_clock_get_cycles, | 139 | .read = kvm_clock_get_cycles, |
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c deleted file mode 100644 index 7eb1e2b9782..00000000000 --- a/arch/x86/kernel/mca_32.c +++ /dev/null | |||
@@ -1,476 +0,0 @@ | |||
1 | /* | ||
2 | * Written by Martin Kolinek, February 1996 | ||
3 | * | ||
4 | * Changes: | ||
5 | * | ||
6 | * Chris Beauregard July 28th, 1996 | ||
7 | * - Fixed up integrated SCSI detection | ||
8 | * | ||
9 | * Chris Beauregard August 3rd, 1996 | ||
10 | * - Made mca_info local | ||
11 | * - Made integrated registers accessible through standard function calls | ||
12 | * - Added name field | ||
13 | * - More sanity checking | ||
14 | * | ||
15 | * Chris Beauregard August 9th, 1996 | ||
16 | * - Rewrote /proc/mca | ||
17 | * | ||
18 | * Chris Beauregard January 7th, 1997 | ||
19 | * - Added basic NMI-processing | ||
20 | * - Added more information to mca_info structure | ||
21 | * | ||
22 | * David Weinehall October 12th, 1998 | ||
23 | * - Made a lot of cleaning up in the source | ||
24 | * - Added use of save_flags / restore_flags | ||
25 | * - Added the 'driver_loaded' flag in MCA_adapter | ||
26 | * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter | ||
27 | * | ||
28 | * David Weinehall March 24th, 1999 | ||
29 | * - Fixed the output of 'Driver Installed' in /proc/mca/pos | ||
30 | * - Made the Integrated Video & SCSI show up even if they have id 0000 | ||
31 | * | ||
32 | * Alexander Viro November 9th, 1999 | ||
33 | * - Switched to regular procfs methods | ||
34 | * | ||
35 | * Alfred Arnold & David Weinehall August 23rd, 2000 | ||
36 | * - Added support for Planar POS-registers | ||
37 | */ | ||
38 | |||
39 | #include <linux/module.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/errno.h> | ||
42 | #include <linux/kernel.h> | ||
43 | #include <linux/mca.h> | ||
44 | #include <linux/kprobes.h> | ||
45 | #include <linux/slab.h> | ||
46 | #include <asm/io.h> | ||
47 | #include <linux/proc_fs.h> | ||
48 | #include <linux/mman.h> | ||
49 | #include <linux/mm.h> | ||
50 | #include <linux/pagemap.h> | ||
51 | #include <linux/ioport.h> | ||
52 | #include <asm/uaccess.h> | ||
53 | #include <linux/init.h> | ||
54 | |||
55 | static unsigned char which_scsi; | ||
56 | |||
57 | int MCA_bus; | ||
58 | EXPORT_SYMBOL(MCA_bus); | ||
59 | |||
60 | /* | ||
61 | * Motherboard register spinlock. Untested on SMP at the moment, but | ||
62 | * are there any MCA SMP boxes? | ||
63 | * | ||
64 | * Yes - Alan | ||
65 | */ | ||
66 | static DEFINE_SPINLOCK(mca_lock); | ||
67 | |||
68 | /* Build the status info for the adapter */ | ||
69 | |||
70 | static void mca_configure_adapter_status(struct mca_device *mca_dev) | ||
71 | { | ||
72 | mca_dev->status = MCA_ADAPTER_NONE; | ||
73 | |||
74 | mca_dev->pos_id = mca_dev->pos[0] | ||
75 | + (mca_dev->pos[1] << 8); | ||
76 | |||
77 | if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { | ||
78 | |||
79 | /* | ||
80 | * id = 0x0000 usually indicates hardware failure, | ||
81 | * however, ZP Gu (zpg@castle.net) reports that his 9556 | ||
82 | * has 0x0000 as id and everything still works. There | ||
83 | * also seems to be an adapter with id = 0x0000; the | ||
84 | * NCR Parallel Bus Memory Card. Until this is confirmed, | ||
85 | * however, this code will stay. | ||
86 | */ | ||
87 | |||
88 | mca_dev->status = MCA_ADAPTER_ERROR; | ||
89 | |||
90 | return; | ||
91 | } else if (mca_dev->pos_id != 0xffff) { | ||
92 | |||
93 | /* | ||
94 | * 0xffff usually indicates that there's no adapter, | ||
95 | * however, some integrated adapters may have 0xffff as | ||
96 | * their id and still be valid. Examples are on-board | ||
97 | * VGA of the 55sx, the integrated SCSI of the 56 & 57, | ||
98 | * and possibly also the 95 ULTIMEDIA. | ||
99 | */ | ||
100 | |||
101 | mca_dev->status = MCA_ADAPTER_NORMAL; | ||
102 | } | ||
103 | |||
104 | if ((mca_dev->pos_id == 0xffff || | ||
105 | mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { | ||
106 | int j; | ||
107 | |||
108 | for (j = 2; j < 8; j++) { | ||
109 | if (mca_dev->pos[j] != 0xff) { | ||
110 | mca_dev->status = MCA_ADAPTER_NORMAL; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | } | ||
115 | |||
116 | if (!(mca_dev->pos[2] & MCA_ENABLED)) { | ||
117 | |||
118 | /* enabled bit is in POS 2 */ | ||
119 | |||
120 | mca_dev->status = MCA_ADAPTER_DISABLED; | ||
121 | } | ||
122 | } /* mca_configure_adapter_status */ | ||
123 | |||
124 | /*--------------------------------------------------------------------*/ | ||
125 | |||
126 | static struct resource mca_standard_resources[] = { | ||
127 | { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, | ||
128 | { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, | ||
129 | { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, | ||
130 | { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, | ||
131 | { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, | ||
132 | { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, | ||
133 | { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } | ||
134 | }; | ||
135 | |||
136 | #define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources) | ||
137 | |||
138 | /* | ||
139 | * mca_read_and_store_pos - read the POS registers into a memory buffer | ||
140 | * @pos: a char pointer to 8 bytes, contains the POS register value on | ||
141 | * successful return | ||
142 | * | ||
143 | * Returns 1 if a card actually exists (i.e. the pos isn't | ||
144 | * all 0xff) or 0 otherwise | ||
145 | */ | ||
146 | static int mca_read_and_store_pos(unsigned char *pos) | ||
147 | { | ||
148 | int j; | ||
149 | int found = 0; | ||
150 | |||
151 | for (j = 0; j < 8; j++) { | ||
152 | pos[j] = inb_p(MCA_POS_REG(j)); | ||
153 | if (pos[j] != 0xff) { | ||
154 | /* 0xff all across means no device. 0x00 means | ||
155 | * something's broken, but a device is | ||
156 | * probably there. However, if you get 0x00 | ||
157 | * from a motherboard register it won't matter | ||
158 | * what we find. For the record, on the | ||
159 | * 57SLC, the integrated SCSI adapter has | ||
160 | * 0xffff for the adapter ID, but nonzero for | ||
161 | * other registers. */ | ||
162 | |||
163 | found = 1; | ||
164 | } | ||
165 | } | ||
166 | return found; | ||
167 | } | ||
168 | |||
169 | static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) | ||
170 | { | ||
171 | unsigned char byte; | ||
172 | unsigned long flags; | ||
173 | |||
174 | if (reg < 0 || reg >= 8) | ||
175 | return 0; | ||
176 | |||
177 | spin_lock_irqsave(&mca_lock, flags); | ||
178 | if (mca_dev->pos_register) { | ||
179 | /* Disable adapter setup, enable motherboard setup */ | ||
180 | |||
181 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
182 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
183 | |||
184 | byte = inb_p(MCA_POS_REG(reg)); | ||
185 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
186 | } else { | ||
187 | |||
188 | /* Make sure motherboard setup is off */ | ||
189 | |||
190 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
191 | |||
192 | /* Read the appropriate register */ | ||
193 | |||
194 | outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); | ||
195 | byte = inb_p(MCA_POS_REG(reg)); | ||
196 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
197 | } | ||
198 | spin_unlock_irqrestore(&mca_lock, flags); | ||
199 | |||
200 | mca_dev->pos[reg] = byte; | ||
201 | |||
202 | return byte; | ||
203 | } | ||
204 | |||
205 | static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, | ||
206 | unsigned char byte) | ||
207 | { | ||
208 | unsigned long flags; | ||
209 | |||
210 | if (reg < 0 || reg >= 8) | ||
211 | return; | ||
212 | |||
213 | spin_lock_irqsave(&mca_lock, flags); | ||
214 | |||
215 | /* Make sure motherboard setup is off */ | ||
216 | |||
217 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
218 | |||
219 | /* Read in the appropriate register */ | ||
220 | |||
221 | outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); | ||
222 | outb_p(byte, MCA_POS_REG(reg)); | ||
223 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
224 | |||
225 | spin_unlock_irqrestore(&mca_lock, flags); | ||
226 | |||
227 | /* Update the global register list, while we have the byte */ | ||
228 | |||
229 | mca_dev->pos[reg] = byte; | ||
230 | |||
231 | } | ||
232 | |||
233 | /* for the primary MCA bus, we have identity transforms */ | ||
234 | static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq) | ||
235 | { | ||
236 | return irq; | ||
237 | } | ||
238 | |||
239 | static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port) | ||
240 | { | ||
241 | return port; | ||
242 | } | ||
243 | |||
244 | static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem) | ||
245 | { | ||
246 | return mem; | ||
247 | } | ||
248 | |||
249 | |||
250 | static int __init mca_init(void) | ||
251 | { | ||
252 | unsigned int i, j; | ||
253 | struct mca_device *mca_dev; | ||
254 | unsigned char pos[8]; | ||
255 | short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; | ||
256 | struct mca_bus *bus; | ||
257 | |||
258 | /* | ||
259 | * WARNING: Be careful when making changes here. Putting an adapter | ||
260 | * and the motherboard simultaneously into setup mode may result in | ||
261 | * damage to chips (according to The Indispensable PC Hardware Book | ||
262 | * by Hans-Peter Messmer). Also, we disable system interrupts (so | ||
263 | * that we are not disturbed in the middle of this). | ||
264 | */ | ||
265 | |||
266 | /* Make sure the MCA bus is present */ | ||
267 | |||
268 | if (mca_system_init()) { | ||
269 | printk(KERN_ERR "MCA bus system initialisation failed\n"); | ||
270 | return -ENODEV; | ||
271 | } | ||
272 | |||
273 | if (!MCA_bus) | ||
274 | return -ENODEV; | ||
275 | |||
276 | printk(KERN_INFO "Micro Channel bus detected.\n"); | ||
277 | |||
278 | /* All MCA systems have at least a primary bus */ | ||
279 | bus = mca_attach_bus(MCA_PRIMARY_BUS); | ||
280 | if (!bus) | ||
281 | goto out_nomem; | ||
282 | bus->default_dma_mask = 0xffffffffLL; | ||
283 | bus->f.mca_write_pos = mca_pc_write_pos; | ||
284 | bus->f.mca_read_pos = mca_pc_read_pos; | ||
285 | bus->f.mca_transform_irq = mca_dummy_transform_irq; | ||
286 | bus->f.mca_transform_ioport = mca_dummy_transform_ioport; | ||
287 | bus->f.mca_transform_memory = mca_dummy_transform_memory; | ||
288 | |||
289 | /* get the motherboard device */ | ||
290 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); | ||
291 | if (unlikely(!mca_dev)) | ||
292 | goto out_nomem; | ||
293 | |||
294 | /* | ||
295 | * We do not expect many MCA interrupts during initialization, | ||
296 | * but let us be safe: | ||
297 | */ | ||
298 | spin_lock_irq(&mca_lock); | ||
299 | |||
300 | /* Make sure adapter setup is off */ | ||
301 | |||
302 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
303 | |||
304 | /* Read motherboard POS registers */ | ||
305 | |||
306 | mca_dev->pos_register = 0x7f; | ||
307 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
308 | mca_dev->name[0] = 0; | ||
309 | mca_read_and_store_pos(mca_dev->pos); | ||
310 | mca_configure_adapter_status(mca_dev); | ||
311 | /* fake POS and slot for a motherboard */ | ||
312 | mca_dev->pos_id = MCA_MOTHERBOARD_POS; | ||
313 | mca_dev->slot = MCA_MOTHERBOARD; | ||
314 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
315 | |||
316 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
317 | if (unlikely(!mca_dev)) | ||
318 | goto out_unlock_nomem; | ||
319 | |||
320 | /* Put motherboard into video setup mode, read integrated video | ||
321 | * POS registers, and turn motherboard setup off. | ||
322 | */ | ||
323 | |||
324 | mca_dev->pos_register = 0xdf; | ||
325 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
326 | mca_dev->name[0] = 0; | ||
327 | mca_read_and_store_pos(mca_dev->pos); | ||
328 | mca_configure_adapter_status(mca_dev); | ||
329 | /* fake POS and slot for the integrated video */ | ||
330 | mca_dev->pos_id = MCA_INTEGVIDEO_POS; | ||
331 | mca_dev->slot = MCA_INTEGVIDEO; | ||
332 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
333 | |||
334 | /* | ||
335 | * Put motherboard into scsi setup mode, read integrated scsi | ||
336 | * POS registers, and turn motherboard setup off. | ||
337 | * | ||
338 | * It seems there are two possible SCSI registers. Martin says that | ||
339 | * for the 56 and 57, 0xf7 is the one, but it fails on the 76. | ||
340 | * Alfredo (apena@vnet.ibm.com) says | ||
341 | * 0xfd works on his machine. We'll try both of them. I figure it's | ||
342 | * a good bet that only one could be valid at a time. This could | ||
343 | * screw up though if one is used for something else on the other | ||
344 | * machine. | ||
345 | */ | ||
346 | |||
347 | for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { | ||
348 | outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); | ||
349 | if (mca_read_and_store_pos(pos)) | ||
350 | break; | ||
351 | } | ||
352 | if (which_scsi) { | ||
353 | /* found a scsi card */ | ||
354 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
355 | if (unlikely(!mca_dev)) | ||
356 | goto out_unlock_nomem; | ||
357 | |||
358 | for (j = 0; j < 8; j++) | ||
359 | mca_dev->pos[j] = pos[j]; | ||
360 | |||
361 | mca_configure_adapter_status(mca_dev); | ||
362 | /* fake POS and slot for integrated SCSI controller */ | ||
363 | mca_dev->pos_id = MCA_INTEGSCSI_POS; | ||
364 | mca_dev->slot = MCA_INTEGSCSI; | ||
365 | mca_dev->pos_register = which_scsi; | ||
366 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
367 | } | ||
368 | |||
369 | /* Turn off motherboard setup */ | ||
370 | |||
371 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
372 | |||
373 | /* | ||
374 | * Now loop over MCA slots: put each adapter into setup mode, and | ||
375 | * read its POS registers. Then put adapter setup off. | ||
376 | */ | ||
377 | |||
378 | for (i = 0; i < MCA_MAX_SLOT_NR; i++) { | ||
379 | outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); | ||
380 | if (!mca_read_and_store_pos(pos)) | ||
381 | continue; | ||
382 | |||
383 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
384 | if (unlikely(!mca_dev)) | ||
385 | goto out_unlock_nomem; | ||
386 | |||
387 | for (j = 0; j < 8; j++) | ||
388 | mca_dev->pos[j] = pos[j]; | ||
389 | |||
390 | mca_dev->driver_loaded = 0; | ||
391 | mca_dev->slot = i; | ||
392 | mca_dev->pos_register = 0; | ||
393 | mca_configure_adapter_status(mca_dev); | ||
394 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
395 | } | ||
396 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
397 | |||
398 | /* Enable interrupts and return memory start */ | ||
399 | spin_unlock_irq(&mca_lock); | ||
400 | |||
401 | for (i = 0; i < MCA_STANDARD_RESOURCES; i++) | ||
402 | request_resource(&ioport_resource, mca_standard_resources + i); | ||
403 | |||
404 | mca_do_proc_init(); | ||
405 | |||
406 | return 0; | ||
407 | |||
408 | out_unlock_nomem: | ||
409 | spin_unlock_irq(&mca_lock); | ||
410 | out_nomem: | ||
411 | printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); | ||
412 | return -ENOMEM; | ||
413 | } | ||
414 | |||
415 | subsys_initcall(mca_init); | ||
416 | |||
417 | /*--------------------------------------------------------------------*/ | ||
418 | |||
419 | static __kprobes void | ||
420 | mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) | ||
421 | { | ||
422 | int slot = mca_dev->slot; | ||
423 | |||
424 | if (slot == MCA_INTEGSCSI) { | ||
425 | printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", | ||
426 | mca_dev->name); | ||
427 | } else if (slot == MCA_INTEGVIDEO) { | ||
428 | printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", | ||
429 | mca_dev->name); | ||
430 | } else if (slot == MCA_MOTHERBOARD) { | ||
431 | printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", | ||
432 | mca_dev->name); | ||
433 | } | ||
434 | |||
435 | /* More info available in POS 6 and 7? */ | ||
436 | |||
437 | if (check_flag) { | ||
438 | unsigned char pos6, pos7; | ||
439 | |||
440 | pos6 = mca_device_read_pos(mca_dev, 6); | ||
441 | pos7 = mca_device_read_pos(mca_dev, 7); | ||
442 | |||
443 | printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); | ||
444 | } | ||
445 | |||
446 | } /* mca_handle_nmi_device */ | ||
447 | |||
448 | /*--------------------------------------------------------------------*/ | ||
449 | |||
450 | static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) | ||
451 | { | ||
452 | struct mca_device *mca_dev = to_mca_device(dev); | ||
453 | unsigned char pos5; | ||
454 | |||
455 | pos5 = mca_device_read_pos(mca_dev, 5); | ||
456 | |||
457 | if (!(pos5 & 0x80)) { | ||
458 | /* | ||
459 | * Bit 7 of POS 5 is reset when this adapter has a hardware | ||
460 | * error. Bit 6 is reset if there's error information | ||
461 | * available in POS 6 and 7. | ||
462 | */ | ||
463 | mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); | ||
464 | return 1; | ||
465 | } | ||
466 | return 0; | ||
467 | } | ||
468 | |||
469 | void __kprobes mca_handle_nmi(void) | ||
470 | { | ||
471 | /* | ||
472 | * First try - scan the various adapters and see if a specific | ||
473 | * adapter was responsible for the error. | ||
474 | */ | ||
475 | bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); | ||
476 | } | ||
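The deleted mca_pc_read_pos()/mca_pc_write_pos() above follow a strict sequence under mca_lock: force motherboard setup off (write 0xff to the motherboard setup register), select exactly one adapter slot (0x8 | slot), transfer the POS byte, then deselect — never putting an adapter and the motherboard into setup mode at the same time, per the hardware-damage warning in mca_init(). A hedged sketch of that sequence with stubbed port I/O; the fake io_space array and register numbers are ours, not real ports:

    #include <stdio.h>

    /* Illustrative register numbers, not the real I/O ports. */
    #define ADAPTER_SETUP_REG      0
    #define MOTHERBOARD_SETUP_REG  1
    #define POS_REG(n)             (0x100 + (n))

    static unsigned char io_space[0x200];                 /* fake I/O space */
    static void outb_p(unsigned char v, int port) { io_space[port] = v; }
    static unsigned char inb_p(int port)          { return io_space[port]; }

    /* Read POS register 'reg' of adapter 'slot', mirroring the deleted code:
     * motherboard setup forced off, exactly one adapter selected at a time. */
    static unsigned char pos_read(int slot, int reg)
    {
        unsigned char byte;

        outb_p(0xff, MOTHERBOARD_SETUP_REG);            /* motherboard setup off */
        outb_p(0x8 | (slot & 0xf), ADAPTER_SETUP_REG);  /* select slot */
        byte = inb_p(POS_REG(reg));
        outb_p(0, ADAPTER_SETUP_REG);                   /* deselect before returning */
        return byte;
    }

    int main(void)
    {
        io_space[POS_REG(2)] = 0x81;          /* pretend a card answered */
        printf("POS2 = 0x%02x\n", pos_read(3, 2));
        return 0;
    }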
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index c9bda6d6035..fbdfc691718 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev, | |||
299 | { | 299 | { |
300 | unsigned long val; | 300 | unsigned long val; |
301 | int cpu = dev->id; | 301 | int cpu = dev->id; |
302 | int ret = 0; | 302 | ssize_t ret = 0; |
303 | char *end; | ||
304 | 303 | ||
305 | val = simple_strtoul(buf, &end, 0); | 304 | ret = kstrtoul(buf, 0, &val); |
306 | if (end == buf) | 305 | if (ret) |
307 | return -EINVAL; | 306 | return ret; |
308 | 307 | ||
309 | if (val == 1) { | 308 | if (val == 1) { |
310 | get_online_cpus(); | 309 | get_online_cpus(); |
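The reload_store() conversion swaps simple_strtoul(), which silently stops at the first non-digit, for kstrtoul(), which fails the whole parse with -EINVAL on trailing garbage (tolerating one trailing newline) and -ERANGE on overflow — the stricter contract a sysfs store wants. A user-space approximation of that contract; the helper name my_kstrtoul is ours:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Approximate kstrtoul(): 0 on a clean full parse, negative errno otherwise.
     * One trailing newline is tolerated, as the kernel helper allows for sysfs. */
    static int my_kstrtoul(const char *s, int base, unsigned long *res)
    {
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (end == s)
            return -EINVAL;                /* nothing parsed */
        if (*end == '\n')
            end++;
        if (*end != '\0')
            return -EINVAL;                /* trailing garbage */
        if (errno == ERANGE)
            return -ERANGE;                /* overflow */
        *res = val;
        return 0;
    }

    int main(void)
    {
        unsigned long v;
        printf("%d\n", my_kstrtoul("1\n", 0, &v));   /* 0, v == 1 */
        printf("%d\n", my_kstrtoul("1x", 0, &v));    /* -EINVAL */
        return 0;
    }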
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3ca42d0e43a..0327e2b3c40 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) | |||
147 | 147 | ||
148 | memset(csig, 0, sizeof(*csig)); | 148 | memset(csig, 0, sizeof(*csig)); |
149 | 149 | ||
150 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
151 | cpu_has(c, X86_FEATURE_IA64)) { | ||
152 | pr_err("CPU%d not a capable Intel processor\n", cpu_num); | ||
153 | return -1; | ||
154 | } | ||
155 | |||
156 | csig->sig = cpuid_eax(0x00000001); | 150 | csig->sig = cpuid_eax(0x00000001); |
157 | 151 | ||
158 | if ((c->x86_model >= 5) || (c->x86 > 6)) { | 152 | if ((c->x86_model >= 5) || (c->x86 > 6)) { |
@@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = { | |||
463 | 457 | ||
464 | struct microcode_ops * __init init_intel_microcode(void) | 458 | struct microcode_ops * __init init_intel_microcode(void) |
465 | { | 459 | { |
460 | struct cpuinfo_x86 *c = &cpu_data(0); | ||
461 | |||
462 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
463 | cpu_has(c, X86_FEATURE_IA64)) { | ||
464 | pr_err("Intel CPU family 0x%x not supported\n", c->x86); | ||
465 | return NULL; | ||
466 | } | ||
467 | |||
466 | return µcode_intel_ops; | 468 | return µcode_intel_ops; |
467 | } | 469 | } |
468 | 470 | ||
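Moving the vendor/family test out of collect_cpu_info(), which runs per CPU, into init_intel_microcode() means an unsupported processor now fails exactly once, at driver registration, by handing back NULL instead of an ops table. A sketch of that gate-at-init pattern; structure and variable names are illustrative:

    #include <stdio.h>

    struct microcode_ops { int (*request_fw)(int cpu); };

    static int intel_request_fw(int cpu) { (void)cpu; return 0; }

    static struct microcode_ops intel_ops = { .request_fw = intel_request_fw };

    /* Stand-ins for the boot CPU's identification. */
    static int cpu_vendor_is_intel = 1;
    static int cpu_family = 6;

    /* Probe once; callers treat NULL as "driver not applicable". */
    static struct microcode_ops *init_intel_microcode(void)
    {
        if (!cpu_vendor_is_intel || cpu_family < 6) {
            fprintf(stderr, "Intel CPU family 0x%x not supported\n", cpu_family);
            return NULL;
        }
        return &intel_ops;
    }

    int main(void)
    {
        struct microcode_ops *ops = init_intel_microcode();
        printf(ops ? "driver registered\n" : "driver skipped\n");
        return 0;
    }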
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index f44d3115735..d2b56489d70 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -96,7 +96,7 @@ static void __init MP_bus_info(struct mpc_bus *m) | |||
96 | 96 | ||
97 | set_bit(m->busid, mp_bus_not_pci); | 97 | set_bit(m->busid, mp_bus_not_pci); |
98 | if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { | 98 | if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { |
99 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | 99 | #ifdef CONFIG_EISA |
100 | mp_bus_id_to_type[m->busid] = MP_BUS_ISA; | 100 | mp_bus_id_to_type[m->busid] = MP_BUS_ISA; |
101 | #endif | 101 | #endif |
102 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { | 102 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { |
@@ -104,12 +104,10 @@ static void __init MP_bus_info(struct mpc_bus *m) | |||
104 | x86_init.mpparse.mpc_oem_pci_bus(m); | 104 | x86_init.mpparse.mpc_oem_pci_bus(m); |
105 | 105 | ||
106 | clear_bit(m->busid, mp_bus_not_pci); | 106 | clear_bit(m->busid, mp_bus_not_pci); |
107 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | 107 | #ifdef CONFIG_EISA |
108 | mp_bus_id_to_type[m->busid] = MP_BUS_PCI; | 108 | mp_bus_id_to_type[m->busid] = MP_BUS_PCI; |
109 | } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { | 109 | } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { |
110 | mp_bus_id_to_type[m->busid] = MP_BUS_EISA; | 110 | mp_bus_id_to_type[m->busid] = MP_BUS_EISA; |
111 | } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { | ||
112 | mp_bus_id_to_type[m->busid] = MP_BUS_MCA; | ||
113 | #endif | 111 | #endif |
114 | } else | 112 | } else |
115 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); | 113 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); |
@@ -367,9 +365,6 @@ static void __init construct_ioapic_table(int mpc_default_type) | |||
367 | case 3: | 365 | case 3: |
368 | memcpy(bus.bustype, "EISA ", 6); | 366 | memcpy(bus.bustype, "EISA ", 6); |
369 | break; | 367 | break; |
370 | case 4: | ||
371 | case 7: | ||
372 | memcpy(bus.bustype, "MCA ", 6); | ||
373 | } | 368 | } |
374 | MP_bus_info(&bus); | 369 | MP_bus_info(&bus); |
375 | if (mpc_default_type > 4) { | 370 | if (mpc_default_type > 4) { |
@@ -572,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) | |||
572 | struct mpf_intel *mpf; | 567 | struct mpf_intel *mpf; |
573 | unsigned long mem; | 568 | unsigned long mem; |
574 | 569 | ||
575 | apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", | 570 | apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", |
576 | bp, length); | 571 | base, base + length - 1); |
577 | BUILD_BUG_ON(sizeof(*mpf) != 16); | 572 | BUILD_BUG_ON(sizeof(*mpf) != 16); |
578 | 573 | ||
579 | while (length > 0) { | 574 | while (length > 0) { |
@@ -588,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) | |||
588 | #endif | 583 | #endif |
589 | mpf_found = mpf; | 584 | mpf_found = mpf; |
590 | 585 | ||
591 | printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", | 586 | printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", |
592 | mpf, (u64)virt_to_phys(mpf)); | 587 | (unsigned long long) virt_to_phys(mpf), |
588 | (unsigned long long) virt_to_phys(mpf) + | ||
589 | sizeof(*mpf) - 1, mpf); | ||
593 | 590 | ||
594 | mem = virt_to_phys(mpf); | 591 | mem = virt_to_phys(mpf); |
595 | memblock_reserve(mem, sizeof(*mpf)); | 592 | memblock_reserve(mem, sizeof(*mpf)); |
@@ -622,7 +619,7 @@ void __init default_find_smp_config(void) | |||
622 | return; | 619 | return; |
623 | /* | 620 | /* |
624 | * If it is an SMP machine we should know now, unless the | 621 | * If it is an SMP machine we should know now, unless the |
625 | * configuration is in an EISA/MCA bus machine with an | 622 | * configuration is in an EISA bus machine with an |
626 | * extended bios data area. | 623 | * extended bios data area. |
627 | * | 624 | * |
628 | * there is a real-mode segmented pointer pointing to the | 625 | * there is a real-mode segmented pointer pointing to the |
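Both mpparse messages now print physical ranges in the kernel's conventional "[mem %#010llx-%#010llx]" style — an inclusive start/end pair — rather than a virtual pointer plus a byte count. A two-line illustration of the formatting:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long base = 0x9fc00, length = 0x400;

        /* Inclusive range: the end is base + length - 1. */
        printf("Scan for SMP in [mem %#010llx-%#010llx]\n",
               base, base + length - 1);
        return 0;
    }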
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf31916..90875279ef3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c | |||
@@ -19,8 +19,6 @@ | |||
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/export.h> | 20 | #include <linux/export.h> |
21 | 21 | ||
22 | #include <linux/mca.h> | ||
23 | |||
24 | #if defined(CONFIG_EDAC) | 22 | #if defined(CONFIG_EDAC) |
25 | #include <linux/edac.h> | 23 | #include <linux/edac.h> |
26 | #endif | 24 | #endif |
@@ -31,14 +29,6 @@ | |||
31 | #include <asm/nmi.h> | 29 | #include <asm/nmi.h> |
32 | #include <asm/x86_init.h> | 30 | #include <asm/x86_init.h> |
33 | 31 | ||
34 | #define NMI_MAX_NAMELEN 16 | ||
35 | struct nmiaction { | ||
36 | struct list_head list; | ||
37 | nmi_handler_t handler; | ||
38 | unsigned int flags; | ||
39 | char *name; | ||
40 | }; | ||
41 | |||
42 | struct nmi_desc { | 32 | struct nmi_desc { |
43 | spinlock_t lock; | 33 | spinlock_t lock; |
44 | struct list_head head; | 34 | struct list_head head; |
@@ -54,6 +44,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] = | |||
54 | .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), | 44 | .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), |
55 | .head = LIST_HEAD_INIT(nmi_desc[1].head), | 45 | .head = LIST_HEAD_INIT(nmi_desc[1].head), |
56 | }, | 46 | }, |
47 | { | ||
48 | .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), | ||
49 | .head = LIST_HEAD_INIT(nmi_desc[2].head), | ||
50 | }, | ||
51 | { | ||
52 | .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), | ||
53 | .head = LIST_HEAD_INIT(nmi_desc[3].head), | ||
54 | }, | ||
57 | 55 | ||
58 | }; | 56 | }; |
59 | 57 | ||
@@ -84,7 +82,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); | |||
84 | 82 | ||
85 | #define nmi_to_desc(type) (&nmi_desc[type]) | 83 | #define nmi_to_desc(type) (&nmi_desc[type]) |
86 | 84 | ||
87 | static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) | 85 | static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) |
88 | { | 86 | { |
89 | struct nmi_desc *desc = nmi_to_desc(type); | 87 | struct nmi_desc *desc = nmi_to_desc(type); |
90 | struct nmiaction *a; | 88 | struct nmiaction *a; |
@@ -107,11 +105,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, | |||
107 | return handled; | 105 | return handled; |
108 | } | 106 | } |
109 | 107 | ||
110 | static int __setup_nmi(unsigned int type, struct nmiaction *action) | 108 | int __register_nmi_handler(unsigned int type, struct nmiaction *action) |
111 | { | 109 | { |
112 | struct nmi_desc *desc = nmi_to_desc(type); | 110 | struct nmi_desc *desc = nmi_to_desc(type); |
113 | unsigned long flags; | 111 | unsigned long flags; |
114 | 112 | ||
113 | if (!action->handler) | ||
114 | return -EINVAL; | ||
115 | |||
115 | spin_lock_irqsave(&desc->lock, flags); | 116 | spin_lock_irqsave(&desc->lock, flags); |
116 | 117 | ||
117 | /* | 118 | /* |
@@ -120,6 +121,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) | |||
120 | * to manage expectations | 121 | * to manage expectations |
121 | */ | 122 | */ |
122 | WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); | 123 | WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); |
124 | WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); | ||
125 | WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); | ||
123 | 126 | ||
124 | /* | 127 | /* |
125 | * some handlers need to be executed first otherwise a fake | 128 | * some handlers need to be executed first otherwise a fake |
@@ -133,8 +136,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) | |||
133 | spin_unlock_irqrestore(&desc->lock, flags); | 136 | spin_unlock_irqrestore(&desc->lock, flags); |
134 | return 0; | 137 | return 0; |
135 | } | 138 | } |
139 | EXPORT_SYMBOL(__register_nmi_handler); | ||
136 | 140 | ||
137 | static struct nmiaction *__free_nmi(unsigned int type, const char *name) | 141 | void unregister_nmi_handler(unsigned int type, const char *name) |
138 | { | 142 | { |
139 | struct nmi_desc *desc = nmi_to_desc(type); | 143 | struct nmi_desc *desc = nmi_to_desc(type); |
140 | struct nmiaction *n; | 144 | struct nmiaction *n; |
@@ -157,61 +161,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name) | |||
157 | 161 | ||
158 | spin_unlock_irqrestore(&desc->lock, flags); | 162 | spin_unlock_irqrestore(&desc->lock, flags); |
159 | synchronize_rcu(); | 163 | synchronize_rcu(); |
160 | return (n); | ||
161 | } | 164 | } |
162 | |||
163 | int register_nmi_handler(unsigned int type, nmi_handler_t handler, | ||
164 | unsigned long nmiflags, const char *devname) | ||
165 | { | ||
166 | struct nmiaction *action; | ||
167 | int retval = -ENOMEM; | ||
168 | |||
169 | if (!handler) | ||
170 | return -EINVAL; | ||
171 | |||
172 | action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); | ||
173 | if (!action) | ||
174 | goto fail_action; | ||
175 | |||
176 | action->handler = handler; | ||
177 | action->flags = nmiflags; | ||
178 | action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); | ||
179 | if (!action->name) | ||
180 | goto fail_action_name; | ||
181 | |||
182 | retval = __setup_nmi(type, action); | ||
183 | |||
184 | if (retval) | ||
185 | goto fail_setup_nmi; | ||
186 | |||
187 | return retval; | ||
188 | |||
189 | fail_setup_nmi: | ||
190 | kfree(action->name); | ||
191 | fail_action_name: | ||
192 | kfree(action); | ||
193 | fail_action: | ||
194 | |||
195 | return retval; | ||
196 | } | ||
197 | EXPORT_SYMBOL_GPL(register_nmi_handler); | ||
198 | |||
199 | void unregister_nmi_handler(unsigned int type, const char *name) | ||
200 | { | ||
201 | struct nmiaction *a; | ||
202 | |||
203 | a = __free_nmi(type, name); | ||
204 | if (a) { | ||
205 | kfree(a->name); | ||
206 | kfree(a); | ||
207 | } | ||
208 | } | ||
209 | |||
210 | EXPORT_SYMBOL_GPL(unregister_nmi_handler); | 165 | EXPORT_SYMBOL_GPL(unregister_nmi_handler); |
211 | 166 | ||
212 | static notrace __kprobes void | 167 | static __kprobes void |
213 | pci_serr_error(unsigned char reason, struct pt_regs *regs) | 168 | pci_serr_error(unsigned char reason, struct pt_regs *regs) |
214 | { | 169 | { |
170 | /* check to see if anyone registered against these types of errors */ | ||
171 | if (nmi_handle(NMI_SERR, regs, false)) | ||
172 | return; | ||
173 | |||
215 | pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", | 174 | pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", |
216 | reason, smp_processor_id()); | 175 | reason, smp_processor_id()); |
217 | 176 | ||
@@ -236,15 +195,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) | |||
236 | outb(reason, NMI_REASON_PORT); | 195 | outb(reason, NMI_REASON_PORT); |
237 | } | 196 | } |
238 | 197 | ||
239 | static notrace __kprobes void | 198 | static __kprobes void |
240 | io_check_error(unsigned char reason, struct pt_regs *regs) | 199 | io_check_error(unsigned char reason, struct pt_regs *regs) |
241 | { | 200 | { |
242 | unsigned long i; | 201 | unsigned long i; |
243 | 202 | ||
203 | /* check to see if anyone registered against these types of errors */ | ||
204 | if (nmi_handle(NMI_IO_CHECK, regs, false)) | ||
205 | return; | ||
206 | |||
244 | pr_emerg( | 207 | pr_emerg( |
245 | "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", | 208 | "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", |
246 | reason, smp_processor_id()); | 209 | reason, smp_processor_id()); |
247 | show_registers(regs); | 210 | show_regs(regs); |
248 | 211 | ||
249 | if (panic_on_io_nmi) | 212 | if (panic_on_io_nmi) |
250 | panic("NMI IOCK error: Not continuing"); | 213 | panic("NMI IOCK error: Not continuing"); |
@@ -263,7 +226,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) | |||
263 | outb(reason, NMI_REASON_PORT); | 226 | outb(reason, NMI_REASON_PORT); |
264 | } | 227 | } |
265 | 228 | ||
266 | static notrace __kprobes void | 229 | static __kprobes void |
267 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | 230 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) |
268 | { | 231 | { |
269 | int handled; | 232 | int handled; |
@@ -282,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
282 | 245 | ||
283 | __this_cpu_add(nmi_stats.unknown, 1); | 246 | __this_cpu_add(nmi_stats.unknown, 1); |
284 | 247 | ||
285 | #ifdef CONFIG_MCA | ||
286 | /* | ||
287 | * Might actually be able to figure out what the guilty party | ||
288 | * is: | ||
289 | */ | ||
290 | if (MCA_bus) { | ||
291 | mca_handle_nmi(); | ||
292 | return; | ||
293 | } | ||
294 | #endif | ||
295 | pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", | 248 | pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", |
296 | reason, smp_processor_id()); | 249 | reason, smp_processor_id()); |
297 | 250 | ||
@@ -305,7 +258,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
305 | static DEFINE_PER_CPU(bool, swallow_nmi); | 258 | static DEFINE_PER_CPU(bool, swallow_nmi); |
306 | static DEFINE_PER_CPU(unsigned long, last_nmi_rip); | 259 | static DEFINE_PER_CPU(unsigned long, last_nmi_rip); |
307 | 260 | ||
308 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 261 | static __kprobes void default_do_nmi(struct pt_regs *regs) |
309 | { | 262 | { |
310 | unsigned char reason = 0; | 263 | unsigned char reason = 0; |
311 | int handled; | 264 | int handled; |
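With __register_nmi_handler() taking a caller-supplied struct nmiaction (its definition moved out to a header), the kzalloc()/kstrndup() path disappears: callers embed the action in static storage, so registration can no longer fail with -ENOMEM, only -EINVAL for a missing handler. A user-space model of that intrusive, allocation-free registration — list handling is reduced to a singly linked head, and the names only mirror the kernel's:

    #include <stdio.h>

    typedef int (*nmi_handler_t)(unsigned int type, void *regs);

    struct nmiaction {
        struct nmiaction *next;      /* intrusive link, no allocation needed */
        nmi_handler_t handler;
        const char *name;
    };

    static struct nmiaction *nmi_list;

    static int __register_nmi_handler(struct nmiaction *action)
    {
        if (!action->handler)
            return -22;              /* -EINVAL: the only failure left */
        action->next = nmi_list;     /* caller owns the storage */
        nmi_list = action;
        return 0;
    }

    static int my_handler(unsigned int type, void *regs)
    {
        (void)type; (void)regs;
        return 1;                    /* claim the NMI */
    }

    /* Storage lives with the caller, typically as a static. */
    static struct nmiaction my_action = {
        .handler = my_handler,
        .name = "mydev",
    };

    int main(void)
    {
        printf("register: %d\n", __register_nmi_handler(&my_action));
        return 0;
    }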
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510f..e31bf8d5c4d 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/cpumask.h> | 13 | #include <linux/cpumask.h> |
14 | #include <linux/delay.h> | 14 | #include <linux/delay.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/percpu.h> | ||
16 | 17 | ||
17 | #include <asm/apic.h> | 18 | #include <asm/apic.h> |
18 | #include <asm/nmi.h> | 19 | #include <asm/nmi.h> |
@@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected) | |||
117 | unexpected_testcase_failures++; | 118 | unexpected_testcase_failures++; |
118 | 119 | ||
119 | if (nmi_fail == FAILURE) | 120 | if (nmi_fail == FAILURE) |
120 | printk("FAILED |"); | 121 | printk(KERN_CONT "FAILED |"); |
121 | else if (nmi_fail == TIMEOUT) | 122 | else if (nmi_fail == TIMEOUT) |
122 | printk("TIMEOUT|"); | 123 | printk(KERN_CONT "TIMEOUT|"); |
123 | else | 124 | else |
124 | printk("ERROR |"); | 125 | printk(KERN_CONT "ERROR |"); |
125 | dump_stack(); | 126 | dump_stack(); |
126 | } else { | 127 | } else { |
127 | testcase_successes++; | 128 | testcase_successes++; |
128 | printk(" ok |"); | 129 | printk(KERN_CONT " ok |"); |
129 | } | 130 | } |
130 | testcase_total++; | 131 | testcase_total++; |
131 | 132 | ||
@@ -150,10 +151,10 @@ void __init nmi_selftest(void) | |||
150 | 151 | ||
151 | print_testname("remote IPI"); | 152 | print_testname("remote IPI"); |
152 | dotest(remote_ipi, SUCCESS); | 153 | dotest(remote_ipi, SUCCESS); |
153 | printk("\n"); | 154 | printk(KERN_CONT "\n"); |
154 | print_testname("local IPI"); | 155 | print_testname("local IPI"); |
155 | dotest(local_ipi, SUCCESS); | 156 | dotest(local_ipi, SUCCESS); |
156 | printk("\n"); | 157 | printk(KERN_CONT "\n"); |
157 | 158 | ||
158 | cleanup_nmi_testsuite(); | 159 | cleanup_nmi_testsuite(); |
159 | 160 | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ab137605e69..9ce885996fd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA | |||
241 | 241 | ||
242 | static inline void enter_lazy(enum paravirt_lazy_mode mode) | 242 | static inline void enter_lazy(enum paravirt_lazy_mode mode) |
243 | { | 243 | { |
244 | BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); | 244 | BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); |
245 | 245 | ||
246 | percpu_write(paravirt_lazy_mode, mode); | 246 | this_cpu_write(paravirt_lazy_mode, mode); |
247 | } | 247 | } |
248 | 248 | ||
249 | static void leave_lazy(enum paravirt_lazy_mode mode) | 249 | static void leave_lazy(enum paravirt_lazy_mode mode) |
250 | { | 250 | { |
251 | BUG_ON(percpu_read(paravirt_lazy_mode) != mode); | 251 | BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); |
252 | 252 | ||
253 | percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); | 253 | this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); |
254 | } | 254 | } |
255 | 255 | ||
256 | void paravirt_enter_lazy_mmu(void) | 256 | void paravirt_enter_lazy_mmu(void) |
@@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev) | |||
267 | { | 267 | { |
268 | BUG_ON(preemptible()); | 268 | BUG_ON(preemptible()); |
269 | 269 | ||
270 | if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { | 270 | if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { |
271 | arch_leave_lazy_mmu_mode(); | 271 | arch_leave_lazy_mmu_mode(); |
272 | set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); | 272 | set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); |
273 | } | 273 | } |
@@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) | |||
289 | if (in_interrupt()) | 289 | if (in_interrupt()) |
290 | return PARAVIRT_LAZY_NONE; | 290 | return PARAVIRT_LAZY_NONE; |
291 | 291 | ||
292 | return percpu_read(paravirt_lazy_mode); | 292 | return this_cpu_read(paravirt_lazy_mode); |
293 | } | 293 | } |
294 | 294 | ||
295 | void arch_flush_lazy_mmu_mode(void) | 295 | void arch_flush_lazy_mmu_mode(void) |
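this_cpu_read()/this_cpu_write() replace the older percpu_read()/percpu_write() names; on x86 they compile to a single %gs-relative access, so the lazy-mode variable can be read and written without an explicit preempt_disable() bracket. A thread-local user-space analogue of the lazy-mode bookkeeping; enum and variable names are illustrative:

    #include <assert.h>
    #include <stdio.h>

    enum lazy_mode { LAZY_NONE, LAZY_MMU, LAZY_CPU };

    /* __thread models a per-CPU variable: one instance per execution context. */
    static __thread enum lazy_mode paravirt_lazy_mode = LAZY_NONE;

    static void enter_lazy(enum lazy_mode mode)
    {
        assert(paravirt_lazy_mode == LAZY_NONE);   /* mirrors the BUG_ON() */
        paravirt_lazy_mode = mode;
    }

    static void leave_lazy(enum lazy_mode mode)
    {
        assert(paravirt_lazy_mode == mode);
        paravirt_lazy_mode = LAZY_NONE;
    }

    int main(void)
    {
        enter_lazy(LAZY_MMU);
        leave_lazy(LAZY_MMU);
        printf("mode = %d\n", paravirt_lazy_mode);  /* 0: LAZY_NONE */
        return 0;
    }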
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index d0b2fb9ccbb..b72838bae64 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -1480,8 +1480,9 @@ cleanup: | |||
1480 | static int __init calgary_parse_options(char *p) | 1480 | static int __init calgary_parse_options(char *p) |
1481 | { | 1481 | { |
1482 | unsigned int bridge; | 1482 | unsigned int bridge; |
1483 | unsigned long val; | ||
1483 | size_t len; | 1484 | size_t len; |
1484 | char* endp; | 1485 | ssize_t ret; |
1485 | 1486 | ||
1486 | while (*p) { | 1487 | while (*p) { |
1487 | if (!strncmp(p, "64k", 3)) | 1488 | if (!strncmp(p, "64k", 3)) |
@@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p) | |||
1512 | ++p; | 1513 | ++p; |
1513 | if (*p == '\0') | 1514 | if (*p == '\0') |
1514 | break; | 1515 | break; |
1515 | bridge = simple_strtoul(p, &endp, 0); | 1516 | ret = kstrtoul(p, 0, &val); |
1516 | if (p == endp) | 1517 | if (ret) |
1517 | break; | 1518 | break; |
1518 | 1519 | ||
1520 | bridge = val; | ||
1519 | if (bridge < MAX_PHB_BUS_NUM) { | 1521 | if (bridge < MAX_PHB_BUS_NUM) { |
1520 | printk(KERN_INFO "Calgary: disabling " | 1522 | printk(KERN_INFO "Calgary: disabling " |
1521 | "translation for PHB %#x\n", bridge); | 1523 | "translation for PHB %#x\n", bridge); |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3003250ac51..62c9457ccd2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, | |||
100 | struct dma_attrs *attrs) | 100 | struct dma_attrs *attrs) |
101 | { | 101 | { |
102 | unsigned long dma_mask; | 102 | unsigned long dma_mask; |
103 | struct page *page; | 103 | struct page *page = NULL; |
104 | unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
104 | dma_addr_t addr; | 105 | dma_addr_t addr; |
105 | 106 | ||
106 | dma_mask = dma_alloc_coherent_mask(dev, flag); | 107 | dma_mask = dma_alloc_coherent_mask(dev, flag); |
107 | 108 | ||
108 | flag |= __GFP_ZERO; | 109 | flag |= __GFP_ZERO; |
109 | again: | 110 | again: |
110 | page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); | 111 | if (!(flag & GFP_ATOMIC)) |
112 | page = dma_alloc_from_contiguous(dev, count, get_order(size)); | ||
113 | if (!page) | ||
114 | page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); | ||
111 | if (!page) | 115 | if (!page) |
112 | return NULL; | 116 | return NULL; |
113 | 117 | ||
@@ -127,6 +131,16 @@ again: | |||
127 | return page_address(page); | 131 | return page_address(page); |
128 | } | 132 | } |
129 | 133 | ||
134 | void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, | ||
135 | dma_addr_t dma_addr, struct dma_attrs *attrs) | ||
136 | { | ||
137 | unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
138 | struct page *page = virt_to_page(vaddr); | ||
139 | |||
140 | if (!dma_release_from_contiguous(dev, page, count)) | ||
141 | free_pages((unsigned long)vaddr, get_order(size)); | ||
142 | } | ||
143 | |||
130 | /* | 144 | /* |
131 | * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel | 145 | * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel |
132 | * parameter documentation. | 146 | * parameter documentation. |
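dma_generic_alloc_coherent() now asks the contiguous memory allocator (CMA) first for allocations that may block, falling back to alloc_pages_node(), and the new dma_generic_free_coherent() mirrors that: only pages that did not come from CMA are returned with free_pages(). A sketch of that try-then-fallback shape, with cma_alloc_pages()/cma_release_pages() as stand-ins for dma_alloc_from_contiguous()/dma_release_from_contiguous():

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-ins for the CMA hooks; a real pool would live elsewhere. */
    static void *cma_alloc_pages(size_t size)  { (void)size; return NULL; }
    static bool  cma_release_pages(void *p)    { (void)p; return false; }

    static void *generic_alloc(size_t size, bool can_block)
    {
        void *page = NULL;

        if (can_block)                     /* CMA may sleep: skip it for atomic */
            page = cma_alloc_pages(size);
        if (!page)
            page = calloc(1, size);        /* fall back to the page allocator */
        return page;
    }

    static void generic_free(void *page)
    {
        if (!cma_release_pages(page))      /* not CMA memory: free it ourselves */
            free(page);
    }

    int main(void)
    {
        void *p = generic_alloc(4096, true);
        printf("alloc %s\n", p ? "ok" : "failed");
        generic_free(p);
        return 0;
    }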
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index f96050685b4..871be4a84c7 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c | |||
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, | |||
74 | return nents; | 74 | return nents; |
75 | } | 75 | } |
76 | 76 | ||
77 | static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, | ||
78 | dma_addr_t dma_addr, struct dma_attrs *attrs) | ||
79 | { | ||
80 | free_pages((unsigned long)vaddr, get_order(size)); | ||
81 | } | ||
82 | |||
83 | static void nommu_sync_single_for_device(struct device *dev, | 77 | static void nommu_sync_single_for_device(struct device *dev, |
84 | dma_addr_t addr, size_t size, | 78 | dma_addr_t addr, size_t size, |
85 | enum dma_data_direction dir) | 79 | enum dma_data_direction dir) |
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev, | |||
97 | 91 | ||
98 | struct dma_map_ops nommu_dma_ops = { | 92 | struct dma_map_ops nommu_dma_ops = { |
99 | .alloc = dma_generic_alloc_coherent, | 93 | .alloc = dma_generic_alloc_coherent, |
100 | .free = nommu_free_coherent, | 94 | .free = dma_generic_free_coherent, |
101 | .map_sg = nommu_map_sg, | 95 | .map_sg = nommu_map_sg, |
102 | .map_page = nommu_map_page, | 96 | .map_page = nommu_map_page, |
103 | .sync_single_for_device = nommu_sync_single_for_device, | 97 | .sync_single_for_device = nommu_sync_single_for_device, |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8..735279e54e5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -27,6 +27,15 @@ | |||
27 | #include <asm/debugreg.h> | 27 | #include <asm/debugreg.h> |
28 | #include <asm/nmi.h> | 28 | #include <asm/nmi.h> |
29 | 29 | ||
30 | /* | ||
31 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
32 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
33 | * so they are allowed to end up in the .data..cacheline_aligned | ||
34 | * section. Since TSS's are completely CPU-local, we want them | ||
35 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
36 | */ | ||
37 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | ||
38 | |||
30 | #ifdef CONFIG_X86_64 | 39 | #ifdef CONFIG_X86_64 |
31 | static DEFINE_PER_CPU(unsigned char, is_idle); | 40 | static DEFINE_PER_CPU(unsigned char, is_idle); |
32 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | 41 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); |
@@ -47,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); | |||
47 | struct kmem_cache *task_xstate_cachep; | 56 | struct kmem_cache *task_xstate_cachep; |
48 | EXPORT_SYMBOL_GPL(task_xstate_cachep); | 57 | EXPORT_SYMBOL_GPL(task_xstate_cachep); |
49 | 58 | ||
59 | /* | ||
60 | * this gets called so that we can store lazy state into memory and copy the | ||
61 | * current task into the new thread. | ||
62 | */ | ||
50 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 63 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
51 | { | 64 | { |
52 | int ret; | 65 | int ret; |
53 | 66 | ||
67 | unlazy_fpu(src); | ||
68 | |||
54 | *dst = *src; | 69 | *dst = *src; |
55 | if (fpu_allocated(&src->thread.fpu)) { | 70 | if (fpu_allocated(&src->thread.fpu)) { |
56 | memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); | 71 | memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); |
@@ -67,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk) | |||
67 | fpu_free(&tsk->thread.fpu); | 82 | fpu_free(&tsk->thread.fpu); |
68 | } | 83 | } |
69 | 84 | ||
70 | void free_thread_info(struct thread_info *ti) | 85 | void arch_release_task_struct(struct task_struct *tsk) |
71 | { | 86 | { |
72 | free_thread_xstate(ti->task); | 87 | free_thread_xstate(tsk); |
73 | free_pages((unsigned long)ti, THREAD_ORDER); | ||
74 | } | 88 | } |
75 | 89 | ||
76 | void arch_task_cache_init(void) | 90 | void arch_task_cache_init(void) |
@@ -81,6 +95,16 @@ void arch_task_cache_init(void) | |||
81 | SLAB_PANIC | SLAB_NOTRACK, NULL); | 95 | SLAB_PANIC | SLAB_NOTRACK, NULL); |
82 | } | 96 | } |
83 | 97 | ||
98 | static inline void drop_fpu(struct task_struct *tsk) | ||
99 | { | ||
100 | /* | ||
101 | * Forget coprocessor state.. | ||
102 | */ | ||
103 | tsk->fpu_counter = 0; | ||
104 | clear_fpu(tsk); | ||
105 | clear_used_math(); | ||
106 | } | ||
107 | |||
84 | /* | 108 | /* |
85 | * Free current thread data structures etc.. | 109 | * Free current thread data structures etc.. |
86 | */ | 110 | */ |
@@ -103,12 +127,8 @@ void exit_thread(void) | |||
103 | put_cpu(); | 127 | put_cpu(); |
104 | kfree(bp); | 128 | kfree(bp); |
105 | } | 129 | } |
106 | } | ||
107 | 130 | ||
108 | void show_regs(struct pt_regs *regs) | 131 | drop_fpu(me); |
109 | { | ||
110 | show_registers(regs); | ||
111 | show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); | ||
112 | } | 132 | } |
113 | 133 | ||
114 | void show_regs_common(void) | 134 | void show_regs_common(void) |
@@ -143,12 +163,7 @@ void flush_thread(void) | |||
143 | 163 | ||
144 | flush_ptrace_hw_breakpoint(tsk); | 164 | flush_ptrace_hw_breakpoint(tsk); |
145 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | 165 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); |
146 | /* | 166 | drop_fpu(tsk); |
147 | * Forget coprocessor state.. | ||
148 | */ | ||
149 | tsk->fpu_counter = 0; | ||
150 | clear_fpu(tsk); | ||
151 | clear_used_math(); | ||
152 | } | 167 | } |
153 | 168 | ||
154 | static void hard_disable_TSC(void) | 169 | static void hard_disable_TSC(void) |
@@ -377,7 +392,7 @@ static inline void play_dead(void) | |||
377 | #ifdef CONFIG_X86_64 | 392 | #ifdef CONFIG_X86_64 |
378 | void enter_idle(void) | 393 | void enter_idle(void) |
379 | { | 394 | { |
380 | percpu_write(is_idle, 1); | 395 | this_cpu_write(is_idle, 1); |
381 | atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); | 396 | atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); |
382 | } | 397 | } |
383 | 398 | ||
@@ -516,26 +531,6 @@ void stop_this_cpu(void *dummy) | |||
516 | } | 531 | } |
517 | } | 532 | } |
518 | 533 | ||
519 | static void do_nothing(void *unused) | ||
520 | { | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * cpu_idle_wait - Used to ensure that all the CPUs discard old value of | ||
525 | * pm_idle and update to new pm_idle value. Required while changing pm_idle | ||
526 | * handler on SMP systems. | ||
527 | * | ||
528 | * Caller must have changed pm_idle to the new value before the call. Old | ||
529 | * pm_idle value will not be used by any CPU after the return of this function. | ||
530 | */ | ||
531 | void cpu_idle_wait(void) | ||
532 | { | ||
533 | smp_mb(); | ||
534 | /* kick all the CPUs so that they exit out of pm_idle */ | ||
535 | smp_call_function(do_nothing, NULL, 1); | ||
536 | } | ||
537 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
538 | |||
539 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ | 534 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ |
540 | static void mwait_idle(void) | 535 | static void mwait_idle(void) |
541 | { | 536 | { |
@@ -594,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c) | |||
594 | { | 589 | { |
595 | u32 eax, ebx, ecx, edx; | 590 | u32 eax, ebx, ecx, edx; |
596 | 591 | ||
592 | /* Use mwait if idle=mwait boot option is given */ | ||
597 | if (boot_option_idle_override == IDLE_FORCE_MWAIT) | 593 | if (boot_option_idle_override == IDLE_FORCE_MWAIT) |
598 | return 1; | 594 | return 1; |
599 | 595 | ||
596 | /* | ||
597 | * Any idle= boot option other than idle=mwait means that we must not | ||
598 | * use mwait. Eg: idle=halt or idle=poll or idle=nomwait | ||
599 | */ | ||
600 | if (boot_option_idle_override != IDLE_NO_OVERRIDE) | ||
601 | return 0; | ||
602 | |||
600 | if (c->cpuid_level < MWAIT_INFO) | 603 | if (c->cpuid_level < MWAIT_INFO) |
601 | return 0; | 604 | return 0; |
602 | 605 | ||
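mwait_usable() now honours every idle= override: idle=mwait forces MWAIT on, any other explicit override (idle=halt, idle=poll, idle=nomwait) forces it off, and only the default case falls through to the CPUID capability checks. The decision ladder, isolated as a sketch — the enum names are illustrative and the CPUID probe is stubbed:

    #include <stdio.h>

    enum idle_override { IDLE_NO_OVERRIDE, IDLE_FORCE_MWAIT, IDLE_HALT, IDLE_POLL };

    static int cpu_supports_mwait = 1;          /* stand-in for the CPUID checks */

    static int mwait_usable(enum idle_override override)
    {
        if (override == IDLE_FORCE_MWAIT)       /* idle=mwait: always use it */
            return 1;
        if (override != IDLE_NO_OVERRIDE)       /* idle=halt/poll/nomwait: never */
            return 0;
        return cpu_supports_mwait;              /* otherwise ask the hardware */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               mwait_usable(IDLE_FORCE_MWAIT),  /* 1 */
               mwait_usable(IDLE_HALT),         /* 0 */
               mwait_usable(IDLE_NO_OVERRIDE)); /* 1 with this stub */
        return 0;
    }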
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e2..516fa186121 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task) | |||
126 | release_vm86_irqs(dead_task); | 126 | release_vm86_irqs(dead_task); |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | ||
130 | * This gets called before we allocate a new thread and copy | ||
131 | * the current task into it. | ||
132 | */ | ||
133 | void prepare_to_copy(struct task_struct *tsk) | ||
134 | { | ||
135 | unlazy_fpu(tsk); | ||
136 | } | ||
137 | |||
138 | int copy_thread(unsigned long clone_flags, unsigned long sp, | 129 | int copy_thread(unsigned long clone_flags, unsigned long sp, |
139 | unsigned long unused, | 130 | unsigned long unused, |
140 | struct task_struct *p, struct pt_regs *regs) | 131 | struct task_struct *p, struct pt_regs *regs) |
@@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
302 | 293 | ||
303 | switch_fpu_finish(next_p, fpu); | 294 | switch_fpu_finish(next_p, fpu); |
304 | 295 | ||
305 | percpu_write(current_task, next_p); | 296 | this_cpu_write(current_task, next_p); |
306 | 297 | ||
307 | return prev_p; | 298 | return prev_p; |
308 | } | 299 | } |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 733ca39f367..61cdf7fdf09 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) | |||
145 | return get_desc_base(&t->thread.tls_array[tls]); | 145 | return get_desc_base(&t->thread.tls_array[tls]); |
146 | } | 146 | } |
147 | 147 | ||
148 | /* | ||
149 | * This gets called before we allocate a new thread and copy | ||
150 | * the current task into it. | ||
151 | */ | ||
152 | void prepare_to_copy(struct task_struct *tsk) | ||
153 | { | ||
154 | unlazy_fpu(tsk); | ||
155 | } | ||
156 | |||
157 | int copy_thread(unsigned long clone_flags, unsigned long sp, | 148 | int copy_thread(unsigned long clone_flags, unsigned long sp, |
158 | unsigned long unused, | 149 | unsigned long unused, |
159 | struct task_struct *p, struct pt_regs *regs) | 150 | struct task_struct *p, struct pt_regs *regs) |
@@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | |||
237 | current->thread.usersp = new_sp; | 228 | current->thread.usersp = new_sp; |
238 | regs->ip = new_ip; | 229 | regs->ip = new_ip; |
239 | regs->sp = new_sp; | 230 | regs->sp = new_sp; |
240 | percpu_write(old_rsp, new_sp); | 231 | this_cpu_write(old_rsp, new_sp); |
241 | regs->cs = _cs; | 232 | regs->cs = _cs; |
242 | regs->ss = _ss; | 233 | regs->ss = _ss; |
243 | regs->flags = X86_EFLAGS_IF; | 234 | regs->flags = X86_EFLAGS_IF; |
@@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
359 | /* | 350 | /* |
360 | * Switch the PDA and FPU contexts. | 351 | * Switch the PDA and FPU contexts. |
361 | */ | 352 | */ |
362 | prev->usersp = percpu_read(old_rsp); | 353 | prev->usersp = this_cpu_read(old_rsp); |
363 | percpu_write(old_rsp, next->usersp); | 354 | this_cpu_write(old_rsp, next->usersp); |
364 | percpu_write(current_task, next_p); | 355 | this_cpu_write(current_task, next_p); |
365 | 356 | ||
366 | percpu_write(kernel_stack, | 357 | this_cpu_write(kernel_stack, |
367 | (unsigned long)task_stack_page(next_p) + | 358 | (unsigned long)task_stack_page(next_p) + |
368 | THREAD_SIZE - KERNEL_STACK_OFFSET); | 359 | THREAD_SIZE - KERNEL_STACK_OFFSET); |
369 | 360 | ||
@@ -423,6 +414,7 @@ void set_personality_ia32(bool x32) | |||
423 | current_thread_info()->status |= TS_COMPAT; | 414 | current_thread_info()->status |= TS_COMPAT; |
424 | } | 415 | } |
425 | } | 416 | } |
417 | EXPORT_SYMBOL_GPL(set_personality_ia32); | ||
426 | 418 | ||
427 | unsigned long get_wchan(struct task_struct *p) | 419 | unsigned long get_wchan(struct task_struct *p) |
428 | { | 420 | { |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 685845cf16e..13b1990c7c5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs) | |||
1480 | regs->flags |= X86_EFLAGS_TF; | 1480 | regs->flags |= X86_EFLAGS_TF; |
1481 | 1481 | ||
1482 | /* do the secure computing check first */ | 1482 | /* do the secure computing check first */ |
1483 | secure_computing(regs->orig_ax); | 1483 | if (secure_computing(regs->orig_ax)) { |
1484 | /* seccomp failures shouldn't expose any additional code. */ | ||
1485 | ret = -1L; | ||
1486 | goto out; | ||
1487 | } | ||
1484 | 1488 | ||
1485 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) | 1489 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) |
1486 | ret = -1L; | 1490 | ret = -1L; |
@@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs) | |||
1505 | regs->dx, regs->r10); | 1509 | regs->dx, regs->r10); |
1506 | #endif | 1510 | #endif |
1507 | 1511 | ||
1512 | out: | ||
1508 | return ret ?: regs->orig_ax; | 1513 | return ret ?: regs->orig_ax; |
1509 | } | 1514 | } |
1510 | 1515 | ||
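syscall_trace_enter() previously called secure_computing() and ignored its verdict; the change makes a non-zero return short-circuit to the exit label with -1, so a syscall rejected by seccomp exposes nothing further to tracers or audit. A sketch of that gate-then-dispatch shape; the filter and dispatch functions are illustrative stand-ins:

    #include <stdio.h>

    /* Stand-in for secure_computing(): non-zero means "deny this syscall". */
    static int seccomp_check(long nr) { return nr == 999; }

    static long dispatch(long nr) { (void)nr; return 0; /* syscall body */ }

    static long trace_enter(long nr)
    {
        if (seccomp_check(nr))
            return -1L;          /* denied: expose nothing further */
        /* ... tracer notifications would run here ... */
        return dispatch(nr);
    }

    int main(void)
    {
        printf("%ld %ld\n", trace_enter(1), trace_enter(999)); /* 0 -1 */
        return 0;
    }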
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 658f856f09a..79c45af8160 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -40,7 +40,8 @@ static int reboot_mode; | |||
40 | enum reboot_type reboot_type = BOOT_ACPI; | 40 | enum reboot_type reboot_type = BOOT_ACPI; |
41 | int reboot_force; | 41 | int reboot_force; |
42 | 42 | ||
43 | /* This variable is used privately to keep track of whether or not | 43 | /* |
44 | * This variable is used privately to keep track of whether or not | ||
44 | * reboot_type is still set to its default value (i.e., reboot= hasn't | 45 | * reboot_type is still set to its default value (i.e., reboot= hasn't |
45 | * been set on the command line). This is needed so that we can | 46 | * been set on the command line). This is needed so that we can |
46 | * suppress DMI scanning for reboot quirks. Without it, it's | 47 | * suppress DMI scanning for reboot quirks. Without it, it's |
@@ -52,7 +53,8 @@ static int reboot_default = 1; | |||
52 | static int reboot_cpu = -1; | 53 | static int reboot_cpu = -1; |
53 | #endif | 54 | #endif |
54 | 55 | ||
55 | /* This is set if we need to go through the 'emergency' path. | 56 | /* |
57 | * This is set if we need to go through the 'emergency' path. | ||
56 | * When machine_emergency_restart() is called, we may be on | 58 | * When machine_emergency_restart() is called, we may be on |
57 | * an inconsistent state and won't be able to do a clean cleanup | 59 | * an inconsistent state and won't be able to do a clean cleanup |
58 | */ | 60 | */ |
@@ -61,22 +63,24 @@ static int reboot_emergency; | |||
61 | /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ | 63 | /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ |
62 | bool port_cf9_safe = false; | 64 | bool port_cf9_safe = false; |
63 | 65 | ||
64 | /* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] | 66 | /* |
65 | warm Don't set the cold reboot flag | 67 | * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] |
66 | cold Set the cold reboot flag | 68 | * warm Don't set the cold reboot flag |
67 | bios Reboot by jumping through the BIOS (only for X86_32) | 69 | * cold Set the cold reboot flag |
68 | smp Reboot by executing reset on BSP or other CPU (only for X86_32) | 70 | * bios Reboot by jumping through the BIOS (only for X86_32) |
69 | triple Force a triple fault (init) | 71 | * smp Reboot by executing reset on BSP or other CPU (only for X86_32) |
70 | kbd Use the keyboard controller. cold reset (default) | 72 | * triple Force a triple fault (init) |
71 | acpi Use the RESET_REG in the FADT | 73 | * kbd Use the keyboard controller. cold reset (default) |
72 | efi Use efi reset_system runtime service | 74 | * acpi Use the RESET_REG in the FADT |
73 | pci Use the so-called "PCI reset register", CF9 | 75 | * efi Use efi reset_system runtime service |
74 | force Avoid anything that could hang. | 76 | * pci Use the so-called "PCI reset register", CF9 |
77 | * force Avoid anything that could hang. | ||
75 | */ | 78 | */ |
76 | static int __init reboot_setup(char *str) | 79 | static int __init reboot_setup(char *str) |
77 | { | 80 | { |
78 | for (;;) { | 81 | for (;;) { |
79 | /* Having anything passed on the command line via | 82 | /* |
83 | * Having anything passed on the command line via | ||
80 | * reboot= will cause us to disable DMI checking | 84 | * reboot= will cause us to disable DMI checking |
81 | * below. | 85 | * below. |
82 | */ | 86 | */ |
@@ -99,9 +103,11 @@ static int __init reboot_setup(char *str) | |||
99 | if (isdigit(*(str+2))) | 103 | if (isdigit(*(str+2))) |
100 | reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); | 104 | reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); |
101 | } | 105 | } |
102 | /* we will leave sorting out the final value | 106 | /* |
103 | when we are ready to reboot, since we might not | 107 | * We will leave sorting out the final value |
104 | have detected BSP APIC ID or smp_num_cpu */ | 108 | * when we are ready to reboot, since we might not |
109 | * have detected BSP APIC ID or smp_num_cpu | ||
110 | */ | ||
105 | break; | 111 | break; |
106 | #endif /* CONFIG_SMP */ | 112 | #endif /* CONFIG_SMP */ |
107 | 113 | ||
@@ -151,6 +157,62 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) | |||
151 | return 0; | 157 | return 0; |
152 | } | 158 | } |
153 | 159 | ||
160 | void machine_real_restart(unsigned int type) | ||
161 | { | ||
162 | void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) | ||
163 | real_mode_header->machine_real_restart_asm; | ||
164 | |||
165 | local_irq_disable(); | ||
166 | |||
167 | /* | ||
168 | * Write zero to CMOS register number 0x0f, which the BIOS POST | ||
169 | * routine will recognize as telling it to do a proper reboot. (Well | ||
170 | * that's what this book in front of me says -- it may only apply to | ||
171 | * the Phoenix BIOS though, it's not clear). At the same time, | ||
172 | * disable NMIs by setting the top bit in the CMOS address register, | ||
173 | * as we're about to do peculiar things to the CPU. I'm not sure if | ||
174 | * `outb_p' is needed instead of just `outb'. Use it to be on the | ||
175 | * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) | ||
176 | */ | ||
177 | spin_lock(&rtc_lock); | ||
178 | CMOS_WRITE(0x00, 0x8f); | ||
179 | spin_unlock(&rtc_lock); | ||
180 | |||
181 | /* | ||
182 | * Switch back to the initial page table. | ||
183 | */ | ||
184 | load_cr3(initial_page_table); | ||
185 | |||
186 | /* | ||
187 | * Write 0x1234 to absolute memory location 0x472. The BIOS reads | ||
188 | * this on booting to tell it to "Bypass memory test (also warm | ||
189 | * boot)". This seems like a fairly standard thing that gets set by | ||
190 | * REBOOT.COM programs, and the previous reset routine did this | ||
191 | * too. */ | ||
192 | *((unsigned short *)0x472) = reboot_mode; | ||
193 | |||
194 | /* Jump to the identity-mapped low memory code */ | ||
195 | restart_lowmem(type); | ||
196 | } | ||
197 | #ifdef CONFIG_APM_MODULE | ||
198 | EXPORT_SYMBOL(machine_real_restart); | ||
199 | #endif | ||
200 | |||
201 | #endif /* CONFIG_X86_32 */ | ||
202 | |||
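A note on the CMOS sequence used by machine_real_restart() above: port 0x70 is the CMOS index register, whose top bit doubles as the NMI mask, and port 0x71 carries the data, which is why writing index 0x8f both selects register 0x0f and blocks NMIs. A freestanding sketch of the same two-step write, assuming standard PC/AT RTC ports (outb() is hand-rolled here; in the kernel it comes from <asm/io.h>, and running this needs I/O privilege):

    static inline void outb(unsigned char val, unsigned short port)
    {
            asm volatile("outb %0, %1" : : "a" (val), "Nd" (port));
    }

    static void cmos_write_shutdown_byte(void)
    {
            outb(0x8f, 0x70);       /* index 0x0f, bit 7 set: NMIs masked  */
            outb(0x00, 0x71);       /* data 0x00: BIOS does a proper reboot */
    }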
203 | /* | ||
204 | * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot |||
205 | */ | ||
206 | static int __init set_pci_reboot(const struct dmi_system_id *d) | ||
207 | { | ||
208 | if (reboot_type != BOOT_CF9) { | ||
209 | reboot_type = BOOT_CF9; | ||
210 | printk(KERN_INFO "%s series board detected. " | ||
211 | "Selecting PCI-method for reboots.\n", d->ident); | ||
212 | } | ||
213 | return 0; | ||
214 | } | ||
215 | |||
154 | static int __init set_kbd_reboot(const struct dmi_system_id *d) | 216 | static int __init set_kbd_reboot(const struct dmi_system_id *d) |
155 | { | 217 | { |
156 | if (reboot_type != BOOT_KBD) { | 218 | if (reboot_type != BOOT_KBD) { |
@@ -160,7 +222,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) | |||
160 | return 0; | 222 | return 0; |
161 | } | 223 | } |
162 | 224 | ||
225 | /* | ||
226 | * This is a single dmi_table handling all reboot quirks. Note that | ||
227 | * REBOOT_BIOS is only available for 32bit | ||
228 | */ | ||
163 | static struct dmi_system_id __initdata reboot_dmi_table[] = { | 229 | static struct dmi_system_id __initdata reboot_dmi_table[] = { |
230 | #ifdef CONFIG_X86_32 | ||
164 | { /* Handle problems with rebooting on Dell E520's */ | 231 | { /* Handle problems with rebooting on Dell E520's */ |
165 | .callback = set_bios_reboot, | 232 | .callback = set_bios_reboot, |
166 | .ident = "Dell E520", | 233 | .ident = "Dell E520", |
@@ -185,7 +252,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
185 | DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), | 252 | DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), |
186 | }, | 253 | }, |
187 | }, | 254 | }, |
188 | { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ | 255 | { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ |
189 | .callback = set_bios_reboot, | 256 | .callback = set_bios_reboot, |
190 | .ident = "Dell OptiPlex 745", | 257 | .ident = "Dell OptiPlex 745", |
191 | .matches = { | 258 | .matches = { |
@@ -193,7 +260,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
193 | DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), | 260 | DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), |
194 | }, | 261 | }, |
195 | }, | 262 | }, |
196 | { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ | 263 | { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ |
197 | .callback = set_bios_reboot, | 264 | .callback = set_bios_reboot, |
198 | .ident = "Dell OptiPlex 745", | 265 | .ident = "Dell OptiPlex 745", |
199 | .matches = { | 266 | .matches = { |
@@ -202,7 +269,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
202 | DMI_MATCH(DMI_BOARD_NAME, "0MM599"), | 269 | DMI_MATCH(DMI_BOARD_NAME, "0MM599"), |
203 | }, | 270 | }, |
204 | }, | 271 | }, |
205 | { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ | 272 | { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ |
206 | .callback = set_bios_reboot, | 273 | .callback = set_bios_reboot, |
207 | .ident = "Dell OptiPlex 745", | 274 | .ident = "Dell OptiPlex 745", |
208 | .matches = { | 275 | .matches = { |
@@ -211,7 +278,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
211 | DMI_MATCH(DMI_BOARD_NAME, "0KW626"), | 278 | DMI_MATCH(DMI_BOARD_NAME, "0KW626"), |
212 | }, | 279 | }, |
213 | }, | 280 | }, |
214 | { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ | 281 | { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ |
215 | .callback = set_bios_reboot, | 282 | .callback = set_bios_reboot, |
216 | .ident = "Dell OptiPlex 330", | 283 | .ident = "Dell OptiPlex 330", |
217 | .matches = { | 284 | .matches = { |
@@ -220,7 +287,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
220 | DMI_MATCH(DMI_BOARD_NAME, "0KP561"), | 287 | DMI_MATCH(DMI_BOARD_NAME, "0KP561"), |
221 | }, | 288 | }, |
222 | }, | 289 | }, |
223 | { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ | 290 | { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ |
224 | .callback = set_bios_reboot, | 291 | .callback = set_bios_reboot, |
225 | .ident = "Dell OptiPlex 360", | 292 | .ident = "Dell OptiPlex 360", |
226 | .matches = { | 293 | .matches = { |
@@ -229,7 +296,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
229 | DMI_MATCH(DMI_BOARD_NAME, "0T656F"), | 296 | DMI_MATCH(DMI_BOARD_NAME, "0T656F"), |
230 | }, | 297 | }, |
231 | }, | 298 | }, |
232 | { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ | 299 | { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ |
233 | .callback = set_bios_reboot, | 300 | .callback = set_bios_reboot, |
234 | .ident = "Dell OptiPlex 760", | 301 | .ident = "Dell OptiPlex 760", |
235 | .matches = { | 302 | .matches = { |
@@ -302,7 +369,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
302 | DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), | 369 | DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), |
303 | }, | 370 | }, |
304 | }, | 371 | }, |
305 | { /* Handle problems with rebooting on ASUS P4S800 */ | 372 | { /* Handle problems with rebooting on ASUS P4S800 */ |
306 | .callback = set_bios_reboot, | 373 | .callback = set_bios_reboot, |
307 | .ident = "ASUS P4S800", | 374 | .ident = "ASUS P4S800", |
308 | .matches = { | 375 | .matches = { |
@@ -310,7 +377,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
310 | DMI_MATCH(DMI_BOARD_NAME, "P4S800"), | 377 | DMI_MATCH(DMI_BOARD_NAME, "P4S800"), |
311 | }, | 378 | }, |
312 | }, | 379 | }, |
313 | { /* Handle reboot issue on Acer Aspire one */ | 380 | #endif /* CONFIG_X86_32 */ |
381 | |||
382 | { /* Handle reboot issue on Acer Aspire one */ | ||
314 | .callback = set_kbd_reboot, | 383 | .callback = set_kbd_reboot, |
315 | .ident = "Acer Aspire One A110", | 384 | .ident = "Acer Aspire One A110", |
316 | .matches = { | 385 | .matches = { |
@@ -318,76 +387,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
318 | DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), | 387 | DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), |
319 | }, | 388 | }, |
320 | }, | 389 | }, |
321 | { } | ||
322 | }; | ||
323 | |||
324 | static int __init reboot_init(void) | ||
325 | { | ||
326 | /* Only do the DMI check if reboot_type hasn't been overridden | ||
327 | * on the command line | ||
328 | */ | ||
329 | if (reboot_default) { | ||
330 | dmi_check_system(reboot_dmi_table); | ||
331 | } | ||
332 | return 0; | ||
333 | } | ||
334 | core_initcall(reboot_init); | ||
335 | |||
336 | void machine_real_restart(unsigned int type) | ||
337 | { | ||
338 | void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) | ||
339 | real_mode_header->machine_real_restart_asm; | ||
340 | |||
341 | local_irq_disable(); | ||
342 | |||
343 | /* Write zero to CMOS register number 0x0f, which the BIOS POST | ||
344 | routine will recognize as telling it to do a proper reboot. (Well | ||
345 | that's what this book in front of me says -- it may only apply to | ||
346 | the Phoenix BIOS though, it's not clear). At the same time, | ||
347 | disable NMIs by setting the top bit in the CMOS address register, | ||
348 | as we're about to do peculiar things to the CPU. I'm not sure if | ||
349 | `outb_p' is needed instead of just `outb'. Use it to be on the | ||
350 | safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) | ||
351 | */ | ||
352 | spin_lock(&rtc_lock); | ||
353 | CMOS_WRITE(0x00, 0x8f); | ||
354 | spin_unlock(&rtc_lock); | ||
355 | |||
356 | /* | ||
357 | * Switch back to the initial page table. | ||
358 | */ | ||
359 | load_cr3(initial_page_table); | ||
360 | |||
361 | /* Write 0x1234 to absolute memory location 0x472. The BIOS reads | ||
362 | this on booting to tell it to "Bypass memory test (also warm | ||
363 | boot)". This seems like a fairly standard thing that gets set by | ||
364 | REBOOT.COM programs, and the previous reset routine did this | ||
365 | too. */ | ||
366 | *((unsigned short *)0x472) = reboot_mode; | ||
367 | |||
368 | /* Jump to the identity-mapped low memory code */ | ||
369 | restart_lowmem(type); | ||
370 | } | ||
371 | #ifdef CONFIG_APM_MODULE | ||
372 | EXPORT_SYMBOL(machine_real_restart); | ||
373 | #endif | ||
374 | |||
375 | #endif /* CONFIG_X86_32 */ | ||
376 | |||
377 | /* | ||
378 | * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot |||
379 | */ | ||
380 | static int __init set_pci_reboot(const struct dmi_system_id *d) | ||
381 | { | ||
382 | if (reboot_type != BOOT_CF9) { | ||
383 | reboot_type = BOOT_CF9; | ||
384 | printk(KERN_INFO "%s series board detected. " | ||
385 | "Selecting PCI-method for reboots.\n", d->ident); | ||
386 | } | ||
387 | return 0; | ||
388 | } | ||
389 | |||
390 | static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { | ||
391 | { /* Handle problems with rebooting on Apple MacBook5 */ | 390 | { /* Handle problems with rebooting on Apple MacBook5 */ |
392 | .callback = set_pci_reboot, | 391 | .callback = set_pci_reboot, |
393 | .ident = "Apple MacBook5", | 392 | .ident = "Apple MacBook5", |
@@ -455,17 +454,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { | |||
455 | { } | 454 | { } |
456 | }; | 455 | }; |
457 | 456 | ||
458 | static int __init pci_reboot_init(void) | 457 | static int __init reboot_init(void) |
459 | { | 458 | { |
460 | /* Only do the DMI check if reboot_type hasn't been overridden | 459 | /* |
460 | * Only do the DMI check if reboot_type hasn't been overridden | ||
461 | * on the command line | 461 | * on the command line |
462 | */ | 462 | */ |
463 | if (reboot_default) { | 463 | if (reboot_default) |
464 | dmi_check_system(pci_reboot_dmi_table); | 464 | dmi_check_system(reboot_dmi_table); |
465 | } | ||
466 | return 0; | 465 | return 0; |
467 | } | 466 | } |
468 | core_initcall(pci_reboot_init); | 467 | core_initcall(reboot_init); |
469 | 468 | ||
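With the two tables merged, adding a quirk is one more dmi_system_id entry in reboot_dmi_table[], wrapped in #ifdef CONFIG_X86_32 only if its callback is. A hypothetical entry (vendor and product strings made up) would mirror the ones above:

    { /* Handle problems with rebooting on a hypothetical Example Box 100 */
            .callback = set_pci_reboot,
            .ident = "Example Box 100",
            .matches = {
                    DMI_MATCH(DMI_SYS_VENDOR, "Example Inc."),
                    DMI_MATCH(DMI_PRODUCT_NAME, "Box 100"),
            },
    },

dmi_check_system(reboot_dmi_table) then runs the callback of every matching entry at core_initcall time, unless a reboot= option on the command line has already cleared reboot_default.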
470 | static inline void kb_wait(void) | 469 | static inline void kb_wait(void) |
471 | { | 470 | { |
@@ -483,14 +482,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs) | |||
483 | cpu_emergency_vmxoff(); | 482 | cpu_emergency_vmxoff(); |
484 | } | 483 | } |
485 | 484 | ||
486 | /* Use NMIs as IPIs to tell all CPUs to disable virtualization | 485 | /* Use NMIs as IPIs to tell all CPUs to disable virtualization */ |
487 | */ | ||
488 | static void emergency_vmx_disable_all(void) | 486 | static void emergency_vmx_disable_all(void) |
489 | { | 487 | { |
490 | /* Just make sure we won't change CPUs while doing this */ | 488 | /* Just make sure we won't change CPUs while doing this */ |
491 | local_irq_disable(); | 489 | local_irq_disable(); |
492 | 490 | ||
493 | /* We need to disable VMX on all CPUs before rebooting, otherwise | 491 | /* |
492 | * We need to disable VMX on all CPUs before rebooting, otherwise | ||
494 | * we risk hanging up the machine, because CPUs ignore INIT | 493 | * we risk hanging up the machine, because CPUs ignore INIT |
495 | * signals when VMX is enabled. | 494 | * signals when VMX is enabled. |
496 | * | 495 | * |
@@ -509,8 +508,7 @@ static void emergency_vmx_disable_all(void) | |||
509 | * is still enabling VMX. | 508 | * is still enabling VMX. |
510 | */ | 509 | */ |
511 | if (cpu_has_vmx() && cpu_vmx_enabled()) { | 510 | if (cpu_has_vmx() && cpu_vmx_enabled()) { |
512 | /* Disable VMX on this CPU. | 511 | /* Disable VMX on this CPU. */ |
513 | */ | ||
514 | cpu_vmxoff(); | 512 | cpu_vmxoff(); |
515 | 513 | ||
516 | /* Halt and disable VMX on the other CPUs */ | 514 | /* Halt and disable VMX on the other CPUs */ |
@@ -555,12 +553,12 @@ static void native_machine_emergency_restart(void) | |||
555 | /* Could also try the reset bit in the Hammer NB */ | 553 | /* Could also try the reset bit in the Hammer NB */ |
556 | switch (reboot_type) { | 554 | switch (reboot_type) { |
557 | case BOOT_KBD: | 555 | case BOOT_KBD: |
558 | mach_reboot_fixups(); /* for board specific fixups */ | 556 | mach_reboot_fixups(); /* For board specific fixups */ |
559 | 557 | ||
560 | for (i = 0; i < 10; i++) { | 558 | for (i = 0; i < 10; i++) { |
561 | kb_wait(); | 559 | kb_wait(); |
562 | udelay(50); | 560 | udelay(50); |
563 | outb(0xfe, 0x64); /* pulse reset low */ | 561 | outb(0xfe, 0x64); /* Pulse reset low */ |
564 | udelay(50); | 562 | udelay(50); |
565 | } | 563 | } |
566 | if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { | 564 | if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { |
@@ -602,7 +600,7 @@ static void native_machine_emergency_restart(void) | |||
602 | 600 | ||
603 | case BOOT_CF9: | 601 | case BOOT_CF9: |
604 | port_cf9_safe = true; | 602 | port_cf9_safe = true; |
605 | /* fall through */ | 603 | /* Fall through */ |
606 | 604 | ||
607 | case BOOT_CF9_COND: | 605 | case BOOT_CF9_COND: |
608 | if (port_cf9_safe) { | 606 | if (port_cf9_safe) { |
@@ -640,7 +638,8 @@ void native_machine_shutdown(void) | |||
640 | /* Make certain I only run on the appropriate processor */ | 638 | /* Make certain I only run on the appropriate processor */ |
641 | set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); | 639 | set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); |
642 | 640 | ||
643 | /* O.K Now that I'm on the appropriate processor, | 641 | /* |
642 | * OK, now that I'm on the appropriate processor, |||
644 | * stop all of the others. | 643 | * stop all of the others. |
645 | */ | 644 | */ |
646 | stop_other_cpus(); | 645 | stop_other_cpus(); |
@@ -678,12 +677,11 @@ static void native_machine_restart(char *__unused) | |||
678 | 677 | ||
679 | static void native_machine_halt(void) | 678 | static void native_machine_halt(void) |
680 | { | 679 | { |
681 | /* stop other cpus and apics */ | 680 | /* Stop other cpus and apics */ |
682 | machine_shutdown(); | 681 | machine_shutdown(); |
683 | 682 | ||
684 | tboot_shutdown(TB_SHUTDOWN_HALT); | 683 | tboot_shutdown(TB_SHUTDOWN_HALT); |
685 | 684 | ||
686 | /* stop this cpu */ | ||
687 | stop_this_cpu(NULL); | 685 | stop_this_cpu(NULL); |
688 | } | 686 | } |
689 | 687 | ||
@@ -694,7 +692,7 @@ static void native_machine_power_off(void) | |||
694 | machine_shutdown(); | 692 | machine_shutdown(); |
695 | pm_power_off(); | 693 | pm_power_off(); |
696 | } | 694 | } |
697 | /* a fallback in case there is no PM info available */ | 695 | /* A fallback in case there is no PM info available */ |
698 | tboot_shutdown(TB_SHUTDOWN_HALT); | 696 | tboot_shutdown(TB_SHUTDOWN_HALT); |
699 | } | 697 | } |
700 | 698 | ||
@@ -756,7 +754,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) | |||
756 | 754 | ||
757 | cpu = raw_smp_processor_id(); | 755 | cpu = raw_smp_processor_id(); |
758 | 756 | ||
759 | /* Don't do anything if this handler is invoked on crashing cpu. | 757 | /* |
758 | * Don't do anything if this handler is invoked on crashing cpu. | ||
760 | * Otherwise, system will completely hang. Crashing cpu can get | 759 | * Otherwise, system will completely hang. Crashing cpu can get |
761 | * an NMI if system was initially booted with nmi_watchdog parameter. | 760 | * an NMI if system was initially booted with nmi_watchdog parameter. |
762 | */ | 761 | */ |
@@ -780,7 +779,8 @@ static void smp_send_nmi_allbutself(void) | |||
780 | apic->send_IPI_allbutself(NMI_VECTOR); | 779 | apic->send_IPI_allbutself(NMI_VECTOR); |
781 | } | 780 | } |
782 | 781 | ||
783 | /* Halt all other CPUs, calling the specified function on each of them | 782 | /* |
783 | * Halt all other CPUs, calling the specified function on each of them | ||
784 | * | 784 | * |
785 | * This function can be used to halt all other CPUs on crash | 785 | * This function can be used to halt all other CPUs on crash |
786 | * or emergency reboot time. The function passed as parameter | 786 | * or emergency reboot time. The function passed as parameter |
@@ -791,7 +791,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) | |||
791 | unsigned long msecs; | 791 | unsigned long msecs; |
792 | local_irq_disable(); | 792 | local_irq_disable(); |
793 | 793 | ||
794 | /* Make a note of crashing cpu. Will be used in NMI callback.*/ | 794 | /* Make a note of crashing cpu. Will be used in NMI callback. */ |
795 | crashing_cpu = safe_smp_processor_id(); | 795 | crashing_cpu = safe_smp_processor_id(); |
796 | 796 | ||
797 | shootdown_callback = callback; | 797 | shootdown_callback = callback; |
@@ -800,8 +800,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) | |||
800 | /* Would it be better to replace the trap vector here? */ | 800 | /* Would it be better to replace the trap vector here? */ |
801 | if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, | 801 | if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, |
802 | NMI_FLAG_FIRST, "crash")) | 802 | NMI_FLAG_FIRST, "crash")) |
803 | return; /* return what? */ | 803 | return; /* Return what? */ |
804 | /* Ensure the new callback function is set before sending | 804 | /* |
805 | * Ensure the new callback function is set before sending | ||
805 | * out the NMI | 806 | * out the NMI |
806 | */ | 807 | */ |
807 | wmb(); | 808 | wmb(); |
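The wmb() here closes a publish-then-notify protocol: the stores to crashing_cpu and shootdown_callback must be visible to the other CPUs before the NMI that makes them read those variables goes out. Condensed, the ordering is (a sketch of the sequence, not a verbatim quote of the function):

    shootdown_callback = callback;      /* 1. publish the handler       */
    wmb();                              /* 2. order the store ...       */
    smp_send_nmi_allbutself();          /* 3. ... before the NMI lands  */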
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index efcf305210a..16be6dc14db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> |
35 | #include <linux/seq_file.h> | 35 | #include <linux/seq_file.h> |
36 | #include <linux/console.h> | 36 | #include <linux/console.h> |
37 | #include <linux/mca.h> | ||
38 | #include <linux/root_dev.h> | 37 | #include <linux/root_dev.h> |
39 | #include <linux/highmem.h> | 38 | #include <linux/highmem.h> |
40 | #include <linux/module.h> | 39 | #include <linux/module.h> |
@@ -50,6 +49,7 @@ | |||
50 | #include <asm/pci-direct.h> | 49 | #include <asm/pci-direct.h> |
51 | #include <linux/init_ohci1394_dma.h> | 50 | #include <linux/init_ohci1394_dma.h> |
52 | #include <linux/kvm_para.h> | 51 | #include <linux/kvm_para.h> |
52 | #include <linux/dma-contiguous.h> | ||
53 | 53 | ||
54 | #include <linux/errno.h> | 54 | #include <linux/errno.h> |
55 | #include <linux/kernel.h> | 55 | #include <linux/kernel.h> |
@@ -179,12 +179,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; | |||
179 | /* common cpu data for all cpus */ | 179 | /* common cpu data for all cpus */ |
180 | struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; | 180 | struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; |
181 | EXPORT_SYMBOL(boot_cpu_data); | 181 | EXPORT_SYMBOL(boot_cpu_data); |
182 | static void set_mca_bus(int x) | ||
183 | { | ||
184 | #ifdef CONFIG_MCA | ||
185 | MCA_bus = x; | ||
186 | #endif | ||
187 | } | ||
188 | 182 | ||
189 | unsigned int def_to_bigsmp; | 183 | unsigned int def_to_bigsmp; |
190 | 184 | ||
@@ -340,8 +334,8 @@ static void __init relocate_initrd(void) | |||
340 | memblock_reserve(ramdisk_here, area_size); | 334 | memblock_reserve(ramdisk_here, area_size); |
341 | initrd_start = ramdisk_here + PAGE_OFFSET; | 335 | initrd_start = ramdisk_here + PAGE_OFFSET; |
342 | initrd_end = initrd_start + ramdisk_size; | 336 | initrd_end = initrd_start + ramdisk_size; |
343 | printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", | 337 | printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", |
344 | ramdisk_here, ramdisk_here + ramdisk_size); | 338 | ramdisk_here, ramdisk_here + ramdisk_size - 1); |
345 | 339 | ||
346 | q = (char *)initrd_start; | 340 | q = (char *)initrd_start; |
347 | 341 | ||
@@ -372,8 +366,8 @@ static void __init relocate_initrd(void) | |||
372 | /* high pages is not converted by early_res_to_bootmem */ | 366 | /* high pages is not converted by early_res_to_bootmem */ |
373 | ramdisk_image = boot_params.hdr.ramdisk_image; | 367 | ramdisk_image = boot_params.hdr.ramdisk_image; |
374 | ramdisk_size = boot_params.hdr.ramdisk_size; | 368 | ramdisk_size = boot_params.hdr.ramdisk_size; |
375 | printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" | 369 | printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" |
376 | " %08llx - %08llx\n", | 370 | " [mem %#010llx-%#010llx]\n", |
377 | ramdisk_image, ramdisk_image + ramdisk_size - 1, | 371 | ramdisk_image, ramdisk_image + ramdisk_size - 1, |
378 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | 372 | ramdisk_here, ramdisk_here + ramdisk_size - 1); |
379 | } | 373 | } |
@@ -393,14 +387,13 @@ static void __init reserve_initrd(void) | |||
393 | initrd_start = 0; | 387 | initrd_start = 0; |
394 | 388 | ||
395 | if (ramdisk_size >= (end_of_lowmem>>1)) { | 389 | if (ramdisk_size >= (end_of_lowmem>>1)) { |
396 | memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); | 390 | panic("initrd too large to handle, " |
397 | printk(KERN_ERR "initrd too large to handle, " | 391 | "disabling initrd (%lld needed, %lld available)\n", |
398 | "disabling initrd\n"); | 392 | ramdisk_size, end_of_lowmem>>1); |
399 | return; | ||
400 | } | 393 | } |
401 | 394 | ||
402 | printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, | 395 | printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, |
403 | ramdisk_end); | 396 | ramdisk_end - 1); |
404 | 397 | ||
405 | 398 | ||
406 | if (ramdisk_end <= end_of_lowmem) { | 399 | if (ramdisk_end <= end_of_lowmem) { |
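The printk changes in this file move to the inclusive "[mem start-end]" convention: the end value is the address of the last byte, hence the recurring "- 1". A quick userspace check of the format string (addresses made up):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long start = 0x1000000ULL, size = 0x800000ULL;

            /* %#010llx = "0x" plus zero-padded hex, 10 chars wide in total */
            printf("RAMDISK: [mem %#010llx-%#010llx]\n", start, start + size - 1);
            return 0;       /* prints: RAMDISK: [mem 0x01000000-0x017fffff] */
    }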
@@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p) | |||
717 | apm_info.bios = boot_params.apm_bios_info; | 710 | apm_info.bios = boot_params.apm_bios_info; |
718 | ist_info = boot_params.ist_info; | 711 | ist_info = boot_params.ist_info; |
719 | if (boot_params.sys_desc_table.length != 0) { | 712 | if (boot_params.sys_desc_table.length != 0) { |
720 | set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); | ||
721 | machine_id = boot_params.sys_desc_table.table[0]; | 713 | machine_id = boot_params.sys_desc_table.table[0]; |
722 | machine_submodel_id = boot_params.sys_desc_table.table[1]; | 714 | machine_submodel_id = boot_params.sys_desc_table.table[1]; |
723 | BIOS_revision = boot_params.sys_desc_table.table[2]; | 715 | BIOS_revision = boot_params.sys_desc_table.table[2]; |
@@ -914,8 +906,8 @@ void __init setup_arch(char **cmdline_p) | |||
914 | setup_bios_corruption_check(); | 906 | setup_bios_corruption_check(); |
915 | #endif | 907 | #endif |
916 | 908 | ||
917 | printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", | 909 | printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", |
918 | max_pfn_mapped<<PAGE_SHIFT); | 910 | (max_pfn_mapped<<PAGE_SHIFT) - 1); |
919 | 911 | ||
920 | setup_real_mode(); | 912 | setup_real_mode(); |
921 | 913 | ||
@@ -934,6 +926,7 @@ void __init setup_arch(char **cmdline_p) | |||
934 | } | 926 | } |
935 | #endif | 927 | #endif |
936 | memblock.current_limit = get_max_mapped(); | 928 | memblock.current_limit = get_max_mapped(); |
929 | dma_contiguous_reserve(0); | ||
937 | 930 | ||
938 | /* | 931 | /* |
939 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. | 932 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. |
@@ -1014,7 +1007,8 @@ void __init setup_arch(char **cmdline_p) | |||
1014 | init_cpu_to_node(); | 1007 | init_cpu_to_node(); |
1015 | 1008 | ||
1016 | init_apic_mappings(); | 1009 | init_apic_mappings(); |
1017 | ioapic_and_gsi_init(); | 1010 | if (x86_io_apic_ops.init) |
1011 | x86_io_apic_ops.init(); | ||
1018 | 1012 | ||
1019 | kvm_guest_init(); | 1013 | kvm_guest_init(); |
1020 | 1014 | ||
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 71f4727da37..5a98aa27218 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void) | |||
185 | #endif | 185 | #endif |
186 | rc = -EINVAL; | 186 | rc = -EINVAL; |
187 | if (pcpu_chosen_fc != PCPU_FC_PAGE) { | 187 | if (pcpu_chosen_fc != PCPU_FC_PAGE) { |
188 | const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; | ||
189 | const size_t dyn_size = PERCPU_MODULE_RESERVE + | 188 | const size_t dyn_size = PERCPU_MODULE_RESERVE + |
190 | PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; | 189 | PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; |
190 | size_t atom_size; | ||
191 | 191 | ||
192 | /* | ||
193 | * On 64bit, use PMD_SIZE for atom_size so that embedded | ||
194 | * percpu areas are aligned to PMD. This, in the future, | ||
195 | * can also allow using PMD mappings in vmalloc area. Use | ||
196 | * PAGE_SIZE on 32bit as vmalloc space is highly contended | ||
197 | * and large vmalloc area allocs can easily fail. | ||
198 | */ | ||
199 | #ifdef CONFIG_X86_64 | ||
200 | atom_size = PMD_SIZE; | ||
201 | #else | ||
202 | atom_size = PAGE_SIZE; | ||
203 | #endif | ||
192 | rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, | 204 | rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, |
193 | dyn_size, atom_size, | 205 | dyn_size, atom_size, |
194 | pcpu_cpu_distance, | 206 | pcpu_cpu_distance, |
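The comment's trade-off is easy to quantify: with 4 KiB pages an x86-64 PMD maps 512 page-table entries, so PMD-aligned atoms cost 2 MiB of contiguous space apiece, versus 4 KiB on 32-bit where vmalloc space is scarce. A userspace sketch of the arithmetic (assumes 4 KiB pages):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 1UL << 12;        /* 4 KiB            */
            unsigned long pmd_size  = page_size << 9;   /* 512 PTEs: 2 MiB  */

            printf("PAGE_SIZE = %lu KiB, PMD_SIZE = %lu MiB\n",
                   page_size >> 10, pmd_size >> 20);
            return 0;
    }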
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 115eac43148..965dfda0fd5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/personality.h> | 18 | #include <linux/personality.h> |
19 | #include <linux/uaccess.h> | 19 | #include <linux/uaccess.h> |
20 | #include <linux/user-return-notifier.h> | 20 | #include <linux/user-return-notifier.h> |
21 | #include <linux/uprobes.h> | ||
21 | 22 | ||
22 | #include <asm/processor.h> | 23 | #include <asm/processor.h> |
23 | #include <asm/ucontext.h> | 24 | #include <asm/ucontext.h> |
@@ -478,18 +479,8 @@ asmlinkage int | |||
478 | sys_sigsuspend(int history0, int history1, old_sigset_t mask) | 479 | sys_sigsuspend(int history0, int history1, old_sigset_t mask) |
479 | { | 480 | { |
480 | sigset_t blocked; | 481 | sigset_t blocked; |
481 | |||
482 | current->saved_sigmask = current->blocked; | ||
483 | |||
484 | mask &= _BLOCKABLE; | ||
485 | siginitset(&blocked, mask); | 482 | siginitset(&blocked, mask); |
486 | set_current_blocked(&blocked); | 483 | return sigsuspend(&blocked); |
487 | |||
488 | current->state = TASK_INTERRUPTIBLE; | ||
489 | schedule(); | ||
490 | |||
491 | set_restore_sigmask(); | ||
492 | return -ERESTARTNOHAND; | ||
493 | } | 484 | } |
494 | 485 | ||
495 | asmlinkage int | 486 | asmlinkage int |
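The rewritten sys_sigsuspend() now defers to the generic sigsuspend() helper, which performs the same set-mask, sleep, restore-on-signal dance the deleted lines open-coded. Userspace semantics are unchanged; the classic calling pattern still looks like this (an illustrative program, not part of the patch):

    #include <signal.h>
    #include <stdio.h>

    static volatile sig_atomic_t got_usr1;

    static void on_usr1(int sig) { got_usr1 = 1; }

    int main(void)
    {
            sigset_t block, old;

            signal(SIGUSR1, on_usr1);

            sigemptyset(&block);
            sigaddset(&block, SIGUSR1);
            sigprocmask(SIG_BLOCK, &block, &old);   /* close the race window */

            while (!got_usr1)
                    sigsuspend(&old);   /* atomically unblock and wait */

            puts("SIGUSR1 received");
            return 0;
    }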
@@ -824,6 +815,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
824 | mce_notify_process(); | 815 | mce_notify_process(); |
825 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | 816 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ |
826 | 817 | ||
818 | if (thread_info_flags & _TIF_UPROBE) { | ||
819 | clear_thread_flag(TIF_UPROBE); | ||
820 | uprobe_notify_resume(regs); | ||
821 | } | ||
822 | |||
827 | /* deal with pending signal delivery */ | 823 | /* deal with pending signal delivery */ |
828 | if (thread_info_flags & _TIF_SIGPENDING) | 824 | if (thread_info_flags & _TIF_SIGPENDING) |
829 | do_signal(regs); | 825 | do_signal(regs); |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 66c74f481ca..48d2b7ded42 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -109,6 +109,9 @@ | |||
109 | * about nothing of note with C stepping upwards. | 109 | * about nothing of note with C stepping upwards. |
110 | */ | 110 | */ |
111 | 111 | ||
112 | static atomic_t stopping_cpu = ATOMIC_INIT(-1); | ||
113 | static bool smp_no_nmi_ipi = false; | ||
114 | |||
112 | /* | 115 | /* |
113 | * this function sends a 'reschedule' IPI to another CPU. | 116 | * this function sends a 'reschedule' IPI to another CPU. |
114 | * it goes straight through and wastes no time serializing | 117 | * it goes straight through and wastes no time serializing |
@@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask) | |||
149 | free_cpumask_var(allbutself); | 152 | free_cpumask_var(allbutself); |
150 | } | 153 | } |
151 | 154 | ||
152 | static atomic_t stopping_cpu = ATOMIC_INIT(-1); | ||
153 | |||
154 | static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) | 155 | static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) |
155 | { | 156 | { |
156 | /* We are registered on stopping cpu too, avoid spurious NMI */ | 157 | /* We are registered on stopping cpu too, avoid spurious NMI */ |
@@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) | |||
162 | return NMI_HANDLED; | 163 | return NMI_HANDLED; |
163 | } | 164 | } |
164 | 165 | ||
165 | static void native_nmi_stop_other_cpus(int wait) | 166 | /* |
167 | * this function calls the 'stop' function on all other CPUs in the system. | ||
168 | */ | ||
169 | |||
170 | asmlinkage void smp_reboot_interrupt(void) | ||
171 | { | ||
172 | ack_APIC_irq(); | ||
173 | irq_enter(); | ||
174 | stop_this_cpu(NULL); | ||
175 | irq_exit(); | ||
176 | } | ||
177 | |||
178 | static void native_stop_other_cpus(int wait) | ||
166 | { | 179 | { |
167 | unsigned long flags; | 180 | unsigned long flags; |
168 | unsigned long timeout; | 181 | unsigned long timeout; |
@@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait) | |||
174 | * Use an own vector here because smp_call_function | 187 | * Use an own vector here because smp_call_function |
175 | * does lots of things not suitable in a panic situation. | 188 | * does lots of things not suitable in a panic situation. |
176 | */ | 189 | */ |
190 | |||
191 | /* | ||
192 | * We start by using the REBOOT_VECTOR irq. | ||
193 | * The irq is treated as a sync point to allow critical | ||
194 | * regions of code on other cpus to release their spin locks | ||
195 | * and re-enable irqs. Jumping straight to an NMI might | ||
196 | * accidentally cause deadlocks with further shutdown/panic | ||
197 | * code. By syncing, we give the cpus up to one second to | ||
198 | * finish their work before we force them off with the NMI. | ||
199 | */ | ||
177 | if (num_online_cpus() > 1) { | 200 | if (num_online_cpus() > 1) { |
178 | /* did someone beat us here? */ | 201 | /* did someone beat us here? */ |
179 | if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) | 202 | if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) |
180 | return; | 203 | return; |
181 | 204 | ||
182 | if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, | 205 | /* sync above data before sending IRQ */ |
183 | NMI_FLAG_FIRST, "smp_stop")) | ||
184 | /* Note: we ignore failures here */ | ||
185 | return; | ||
186 | |||
187 | /* sync above data before sending NMI */ | ||
188 | wmb(); | 206 | wmb(); |
189 | 207 | ||
190 | apic->send_IPI_allbutself(NMI_VECTOR); | 208 | apic->send_IPI_allbutself(REBOOT_VECTOR); |
191 | 209 | ||
192 | /* | 210 | /* |
193 | * Don't wait longer than a second if the caller | 211 | * Don't wait longer than a second if the caller |
@@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait) | |||
197 | while (num_online_cpus() > 1 && (wait || timeout--)) | 215 | while (num_online_cpus() > 1 && (wait || timeout--)) |
198 | udelay(1); | 216 | udelay(1); |
199 | } | 217 | } |
218 | |||
219 | /* if the REBOOT_VECTOR didn't work, try with the NMI */ | ||
220 | if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { | ||
221 | if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, | ||
222 | NMI_FLAG_FIRST, "smp_stop")) | ||
223 | /* Note: we ignore failures here */ | ||
224 | /* Hope the REBOOT_IRQ is good enough */ | ||
225 | goto finish; | ||
200 | 226 | ||
201 | local_irq_save(flags); | 227 | /* sync above data before sending IRQ */ |
202 | disable_local_APIC(); | 228 | wmb(); |
203 | local_irq_restore(flags); | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * this function calls the 'stop' function on all other CPUs in the system. | ||
208 | */ | ||
209 | |||
210 | asmlinkage void smp_reboot_interrupt(void) | ||
211 | { | ||
212 | ack_APIC_irq(); | ||
213 | irq_enter(); | ||
214 | stop_this_cpu(NULL); | ||
215 | irq_exit(); | ||
216 | } | ||
217 | |||
218 | static void native_irq_stop_other_cpus(int wait) | ||
219 | { | ||
220 | unsigned long flags; | ||
221 | unsigned long timeout; | ||
222 | 229 | ||
223 | if (reboot_force) | 230 | pr_emerg("Shutting down cpus with NMI\n"); |
224 | return; | ||
225 | 231 | ||
226 | /* | 232 | apic->send_IPI_allbutself(NMI_VECTOR); |
227 | * Use an own vector here because smp_call_function | ||
228 | * does lots of things not suitable in a panic situation. | ||
229 | * On most systems we could also use an NMI here, | ||
230 | * but there are a few systems around where NMI | ||
231 | * is problematic so stay with a non-NMI for now |||
232 | * (this implies we cannot stop CPUs spinning with irq off | ||
233 | * currently) | ||
234 | */ | ||
235 | if (num_online_cpus() > 1) { | ||
236 | apic->send_IPI_allbutself(REBOOT_VECTOR); | ||
237 | 233 | ||
238 | /* | 234 | /* |
239 | * Don't wait longer than a second if the caller | 235 | * Don't wait longer than 10 ms if the caller |
240 | * didn't ask us to wait. | 236 | * didn't ask us to wait. |
241 | */ | 237 | */ |
242 | timeout = USEC_PER_SEC; | 238 | timeout = USEC_PER_MSEC * 10; |
243 | while (num_online_cpus() > 1 && (wait || timeout--)) | 239 | while (num_online_cpus() > 1 && (wait || timeout--)) |
244 | udelay(1); | 240 | udelay(1); |
245 | } | 241 | } |
246 | 242 | ||
243 | finish: | ||
247 | local_irq_save(flags); | 244 | local_irq_save(flags); |
248 | disable_local_APIC(); | 245 | disable_local_APIC(); |
249 | local_irq_restore(flags); | 246 | local_irq_restore(flags); |
250 | } | 247 | } |
251 | 248 | ||
252 | static void native_smp_disable_nmi_ipi(void) | ||
253 | { | ||
254 | smp_ops.stop_other_cpus = native_irq_stop_other_cpus; | ||
255 | } | ||
256 | |||
257 | /* | 249 | /* |
258 | * Reschedule call back. | 250 | * Reschedule call back. |
259 | */ | 251 | */ |
@@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) | |||
287 | 279 | ||
288 | static int __init nonmi_ipi_setup(char *str) | 280 | static int __init nonmi_ipi_setup(char *str) |
289 | { | 281 | { |
290 | native_smp_disable_nmi_ipi(); | 282 | smp_no_nmi_ipi = true; |
291 | return 1; | 283 | return 1; |
292 | } | 284 | } |
293 | 285 | ||
294 | __setup("nonmi_ipi", nonmi_ipi_setup); | 286 | __setup("nonmi_ipi", nonmi_ipi_setup); |
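After this consolidation there is one stop path with two stages: a REBOOT_VECTOR IRQ as a sync point, then an NMI hammer unless nonmi_ipi was given on the command line. Condensed pseudocode of the control flow (wait_up_to() and cpus_still_online() are illustrative helpers, not kernel APIs):

    apic->send_IPI_allbutself(REBOOT_VECTOR);
    wait_up_to(USEC_PER_SEC);                   /* polite: up to 1 s  */

    if (cpus_still_online() && !smp_no_nmi_ipi) {
            apic->send_IPI_allbutself(NMI_VECTOR);
            wait_up_to(10 * USEC_PER_MSEC);     /* forceful: 10 ms    */
    }
    disable_local_APIC();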
@@ -298,7 +290,7 @@ struct smp_ops smp_ops = { | |||
298 | .smp_prepare_cpus = native_smp_prepare_cpus, | 290 | .smp_prepare_cpus = native_smp_prepare_cpus, |
299 | .smp_cpus_done = native_smp_cpus_done, | 291 | .smp_cpus_done = native_smp_cpus_done, |
300 | 292 | ||
301 | .stop_other_cpus = native_nmi_stop_other_cpus, | 293 | .stop_other_cpus = native_stop_other_cpus, |
302 | .smp_send_reschedule = native_smp_send_reschedule, | 294 | .smp_send_reschedule = native_smp_send_reschedule, |
303 | 295 | ||
304 | .cpu_up = native_cpu_up, | 296 | .cpu_up = native_cpu_up, |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 757c4b1d0a0..f56f96da77f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -78,20 +78,8 @@ | |||
78 | /* State of each CPU */ | 78 | /* State of each CPU */ |
79 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | 79 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; |
80 | 80 | ||
81 | /* Store all idle threads, this can be reused instead of creating | ||
82 | * a new thread. Also avoids complicated thread destroy functionality | ||
83 | * for idle threads. | ||
84 | */ | ||
85 | #ifdef CONFIG_HOTPLUG_CPU | 81 | #ifdef CONFIG_HOTPLUG_CPU |
86 | /* | 82 | /* |
87 | * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is | ||
88 | * removed after init for !CONFIG_HOTPLUG_CPU. | ||
89 | */ | ||
90 | static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); | ||
91 | #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) | ||
92 | #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) | ||
93 | |||
94 | /* | ||
95 | * We need this for trampoline_base protection from concurrent accesses when | 83 | * We need this for trampoline_base protection from concurrent accesses when |
96 | * off- and onlining cores wildly. | 84 | * off- and onlining cores wildly. |
97 | */ | 85 | */ |
@@ -99,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); | |||
99 | 87 | ||
100 | void cpu_hotplug_driver_lock(void) | 88 | void cpu_hotplug_driver_lock(void) |
101 | { | 89 | { |
102 | mutex_lock(&x86_cpu_hotplug_driver_mutex); | 90 | mutex_lock(&x86_cpu_hotplug_driver_mutex); |
103 | } | 91 | } |
104 | 92 | ||
105 | void cpu_hotplug_driver_unlock(void) | 93 | void cpu_hotplug_driver_unlock(void) |
106 | { | 94 | { |
107 | mutex_unlock(&x86_cpu_hotplug_driver_mutex); | 95 | mutex_unlock(&x86_cpu_hotplug_driver_mutex); |
108 | } | 96 | } |
109 | 97 | ||
110 | ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } | 98 | ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } |
111 | ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } | 99 | ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } |
112 | #else | ||
113 | static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | ||
114 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | ||
115 | #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) | ||
116 | #endif | 100 | #endif |
117 | 101 | ||
118 | /* Number of siblings per CPU package */ | 102 | /* Number of siblings per CPU package */ |
@@ -317,59 +301,90 @@ void __cpuinit smp_store_cpu_info(int id) | |||
317 | identify_secondary_cpu(c); | 301 | identify_secondary_cpu(c); |
318 | } | 302 | } |
319 | 303 | ||
320 | static void __cpuinit link_thread_siblings(int cpu1, int cpu2) | 304 | static bool __cpuinit |
305 | topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) | ||
321 | { | 306 | { |
322 | cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); | 307 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
323 | cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); | 308 | |
324 | cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); | 309 | return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), |
325 | cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); | 310 | "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " |
326 | cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); | 311 | "[node: %d != %d]. Ignoring dependency.\n", |
327 | cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); | 312 | cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); |
328 | } | 313 | } |
329 | 314 | ||
315 | #define link_mask(_m, c1, c2) \ | ||
316 | do { \ | ||
317 | cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ | ||
318 | cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ | ||
319 | } while (0) | ||
320 | |||
321 | static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | ||
322 | { | ||
323 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { | ||
324 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; | ||
325 | |||
326 | if (c->phys_proc_id == o->phys_proc_id && | ||
327 | per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && | ||
328 | c->compute_unit_id == o->compute_unit_id) | ||
329 | return topology_sane(c, o, "smt"); | ||
330 | |||
331 | } else if (c->phys_proc_id == o->phys_proc_id && | ||
332 | c->cpu_core_id == o->cpu_core_id) { | ||
333 | return topology_sane(c, o, "smt"); | ||
334 | } | ||
335 | |||
336 | return false; | ||
337 | } | ||
338 | |||
339 | static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | ||
340 | { | ||
341 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; | ||
342 | |||
343 | if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && | ||
344 | per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) | ||
345 | return topology_sane(c, o, "llc"); | ||
346 | |||
347 | return false; | ||
348 | } | ||
349 | |||
350 | static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | ||
351 | { | ||
352 | if (c->phys_proc_id == o->phys_proc_id) | ||
353 | return topology_sane(c, o, "mc"); | ||
354 | |||
355 | return false; | ||
356 | } | ||
330 | 357 | ||
331 | void __cpuinit set_cpu_sibling_map(int cpu) | 358 | void __cpuinit set_cpu_sibling_map(int cpu) |
332 | { | 359 | { |
333 | int i; | 360 | bool has_mc = boot_cpu_data.x86_max_cores > 1; |
361 | bool has_smt = smp_num_siblings > 1; | ||
334 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 362 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
363 | struct cpuinfo_x86 *o; | ||
364 | int i; | ||
335 | 365 | ||
336 | cpumask_set_cpu(cpu, cpu_sibling_setup_mask); | 366 | cpumask_set_cpu(cpu, cpu_sibling_setup_mask); |
337 | 367 | ||
338 | if (smp_num_siblings > 1) { | 368 | if (!has_smt && !has_mc) { |
339 | for_each_cpu(i, cpu_sibling_setup_mask) { | ||
340 | struct cpuinfo_x86 *o = &cpu_data(i); | ||
341 | |||
342 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { | ||
343 | if (c->phys_proc_id == o->phys_proc_id && | ||
344 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && | ||
345 | c->compute_unit_id == o->compute_unit_id) | ||
346 | link_thread_siblings(cpu, i); | ||
347 | } else if (c->phys_proc_id == o->phys_proc_id && | ||
348 | c->cpu_core_id == o->cpu_core_id) { | ||
349 | link_thread_siblings(cpu, i); | ||
350 | } | ||
351 | } | ||
352 | } else { | ||
353 | cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); | 369 | cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); |
354 | } | 370 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); |
355 | 371 | cpumask_set_cpu(cpu, cpu_core_mask(cpu)); | |
356 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); | ||
357 | |||
358 | if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { | ||
359 | cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); | ||
360 | c->booted_cores = 1; | 372 | c->booted_cores = 1; |
361 | return; | 373 | return; |
362 | } | 374 | } |
363 | 375 | ||
364 | for_each_cpu(i, cpu_sibling_setup_mask) { | 376 | for_each_cpu(i, cpu_sibling_setup_mask) { |
365 | if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && | 377 | o = &cpu_data(i); |
366 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { | 378 | |
367 | cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); | 379 | if ((i == cpu) || (has_smt && match_smt(c, o))) |
368 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); | 380 | link_mask(sibling, cpu, i); |
369 | } | 381 | |
370 | if (c->phys_proc_id == cpu_data(i).phys_proc_id) { | 382 | if ((i == cpu) || (has_mc && match_llc(c, o))) |
371 | cpumask_set_cpu(i, cpu_core_mask(cpu)); | 383 | link_mask(llc_shared, cpu, i); |
372 | cpumask_set_cpu(cpu, cpu_core_mask(i)); | 384 | |
385 | if ((i == cpu) || (has_mc && match_mc(c, o))) { | ||
386 | link_mask(core, cpu, i); | ||
387 | |||
373 | /* | 388 | /* |
374 | * Does this new cpu bringup a new core? | 389 | * Does this new cpu bringup a new core? |
375 | */ | 390 | */ |
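The link_mask() macro introduced above pastes its first argument into the cpu_*_mask() accessor name and always sets both directions, so the topology masks stay symmetric by construction. For example, link_mask(llc_shared, cpu, i) expands to:

    cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
    cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));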
@@ -400,8 +415,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) | |||
400 | * For perf, we return last level cache shared map. | 415 | * For perf, we return last level cache shared map. |
401 | * And for power savings, we return cpu_core_map | 416 | * And for power savings, we return cpu_core_map |
402 | */ | 417 | */ |
403 | if ((sched_mc_power_savings || sched_smt_power_savings) && | 418 | if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) |
404 | !(cpu_has(c, X86_FEATURE_AMD_DCM))) | ||
405 | return cpu_core_mask(cpu); | 419 | return cpu_core_mask(cpu); |
406 | else | 420 | else |
407 | return cpu_llc_shared_mask(cpu); | 421 | return cpu_llc_shared_mask(cpu); |
@@ -620,22 +634,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) | |||
620 | return (send_status | accept_status); | 634 | return (send_status | accept_status); |
621 | } | 635 | } |
622 | 636 | ||
623 | struct create_idle { | ||
624 | struct work_struct work; | ||
625 | struct task_struct *idle; | ||
626 | struct completion done; | ||
627 | int cpu; | ||
628 | }; | ||
629 | |||
630 | static void __cpuinit do_fork_idle(struct work_struct *work) | ||
631 | { | ||
632 | struct create_idle *c_idle = | ||
633 | container_of(work, struct create_idle, work); | ||
634 | |||
635 | c_idle->idle = fork_idle(c_idle->cpu); | ||
636 | complete(&c_idle->done); | ||
637 | } | ||
638 | |||
639 | /* reduce the number of lines printed when booting a large cpu count system */ | 637 | /* reduce the number of lines printed when booting a large cpu count system */ |
640 | static void __cpuinit announce_cpu(int cpu, int apicid) | 638 | static void __cpuinit announce_cpu(int cpu, int apicid) |
641 | { | 639 | { |
@@ -662,7 +660,7 @@ static void __cpuinit announce_cpu(int cpu, int apicid) | |||
662 | * Returns zero if CPU booted OK, else error code from | 660 | * Returns zero if CPU booted OK, else error code from |
663 | * ->wakeup_secondary_cpu. | 661 | * ->wakeup_secondary_cpu. |
664 | */ | 662 | */ |
665 | static int __cpuinit do_boot_cpu(int apicid, int cpu) | 663 | static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) |
666 | { | 664 | { |
667 | volatile u32 *trampoline_status = | 665 | volatile u32 *trampoline_status = |
668 | (volatile u32 *) __va(real_mode_header->trampoline_status); | 666 | (volatile u32 *) __va(real_mode_header->trampoline_status); |
@@ -671,53 +669,26 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
671 | 669 | ||
672 | unsigned long boot_error = 0; | 670 | unsigned long boot_error = 0; |
673 | int timeout; | 671 | int timeout; |
674 | struct create_idle c_idle = { | ||
675 | .cpu = cpu, | ||
676 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), | ||
677 | }; | ||
678 | |||
679 | INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); | ||
680 | 672 | ||
681 | alternatives_smp_switch(1); | 673 | alternatives_smp_switch(1); |
682 | 674 | ||
683 | c_idle.idle = get_idle_for_cpu(cpu); | 675 | idle->thread.sp = (unsigned long) (((struct pt_regs *) |
684 | 676 | (THREAD_SIZE + task_stack_page(idle))) - 1); | |
685 | /* | 677 | per_cpu(current_task, cpu) = idle; |
686 | * We can't use kernel_thread since we must avoid to | ||
687 | * reschedule the child. | ||
688 | */ | ||
689 | if (c_idle.idle) { | ||
690 | c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) | ||
691 | (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); | ||
692 | init_idle(c_idle.idle, cpu); | ||
693 | goto do_rest; | ||
694 | } | ||
695 | |||
696 | schedule_work(&c_idle.work); | ||
697 | wait_for_completion(&c_idle.done); | ||
698 | 678 | ||
699 | if (IS_ERR(c_idle.idle)) { | ||
700 | printk("failed fork for CPU %d\n", cpu); | ||
701 | destroy_work_on_stack(&c_idle.work); | ||
702 | return PTR_ERR(c_idle.idle); | ||
703 | } | ||
704 | |||
705 | set_idle_for_cpu(cpu, c_idle.idle); | ||
706 | do_rest: | ||
707 | per_cpu(current_task, cpu) = c_idle.idle; | ||
708 | #ifdef CONFIG_X86_32 | 679 | #ifdef CONFIG_X86_32 |
709 | /* Stack for startup_32 can be just as for start_secondary onwards */ | 680 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
710 | irq_ctx_init(cpu); | 681 | irq_ctx_init(cpu); |
711 | #else | 682 | #else |
712 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | 683 | clear_tsk_thread_flag(idle, TIF_FORK); |
713 | initial_gs = per_cpu_offset(cpu); | 684 | initial_gs = per_cpu_offset(cpu); |
714 | per_cpu(kernel_stack, cpu) = | 685 | per_cpu(kernel_stack, cpu) = |
715 | (unsigned long)task_stack_page(c_idle.idle) - | 686 | (unsigned long)task_stack_page(idle) - |
716 | KERNEL_STACK_OFFSET + THREAD_SIZE; | 687 | KERNEL_STACK_OFFSET + THREAD_SIZE; |
717 | #endif | 688 | #endif |
718 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | 689 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); |
719 | initial_code = (unsigned long)start_secondary; | 690 | initial_code = (unsigned long)start_secondary; |
720 | stack_start = c_idle.idle->thread.sp; | 691 | stack_start = idle->thread.sp; |
721 | 692 | ||
722 | /* So we see what's up */ | 693 | /* So we see what's up */ |
723 | announce_cpu(cpu, apicid); | 694 | announce_cpu(cpu, apicid); |
@@ -815,12 +786,10 @@ do_rest: | |||
815 | */ | 786 | */ |
816 | smpboot_restore_warm_reset_vector(); | 787 | smpboot_restore_warm_reset_vector(); |
817 | } | 788 | } |
818 | |||
819 | destroy_work_on_stack(&c_idle.work); | ||
820 | return boot_error; | 789 | return boot_error; |
821 | } | 790 | } |
822 | 791 | ||
823 | int __cpuinit native_cpu_up(unsigned int cpu) | 792 | int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) |
824 | { | 793 | { |
825 | int apicid = apic->cpu_present_to_apicid(cpu); | 794 | int apicid = apic->cpu_present_to_apicid(cpu); |
826 | unsigned long flags; | 795 | unsigned long flags; |
@@ -853,7 +822,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
853 | 822 | ||
854 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 823 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; |
855 | 824 | ||
856 | err = do_boot_cpu(apicid, cpu); | 825 | err = do_boot_cpu(apicid, cpu, tidle); |
857 | if (err) { | 826 | if (err) { |
858 | pr_debug("do_boot_cpu failed %d\n", err); | 827 | pr_debug("do_boot_cpu failed %d\n", err); |
859 | return -EIO; | 828 | return -EIO; |
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index c29e235792a..b79133abda4 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <asm/cacheflush.h> | 13 | #include <asm/cacheflush.h> |
14 | #include <asm/sections.h> | 14 | #include <asm/sections.h> |
15 | #include <asm/asm.h> | ||
15 | 16 | ||
16 | int rodata_test(void) | 17 | int rodata_test(void) |
17 | { | 18 | { |
@@ -42,14 +43,7 @@ int rodata_test(void) | |||
42 | ".section .fixup,\"ax\"\n" | 43 | ".section .fixup,\"ax\"\n" |
43 | "2: jmp 1b\n" | 44 | "2: jmp 1b\n" |
44 | ".previous\n" | 45 | ".previous\n" |
45 | ".section __ex_table,\"a\"\n" | 46 | _ASM_EXTABLE(0b,2b) |
46 | " .align 16\n" | ||
47 | #ifdef CONFIG_X86_32 | ||
48 | " .long 0b,2b\n" | ||
49 | #else | ||
50 | " .quad 0b,2b\n" | ||
51 | #endif | ||
52 | ".previous" | ||
53 | : [rslt] "=r" (result) | 47 | : [rslt] "=r" (result) |
54 | : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) | 48 | : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) |
55 | ); | 49 | ); |
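_ASM_EXTABLE(from, to) emits a single exception-table entry with the right word size for the build, replacing the hand-rolled .long/.quad block deleted above. The general pattern it supports looks like this (a hedged sketch; safe_read() is an illustrative name, and -14 stands in for -EFAULT):

    #include <asm/asm.h>

    static int safe_read(unsigned long *addr, unsigned long *val)
    {
            int err = 0;

            asm volatile("1:     mov %[a], %[v]\n"       /* may fault       */
                         "2:\n"
                         ".section .fixup,\"ax\"\n"
                         "3:     mov %[fail], %[e]\n"    /* fault: set err  */
                         "       jmp 2b\n"
                         ".previous\n"
                         _ASM_EXTABLE(1b, 3b)            /* 1 faults -> 3   */
                         : [v] "=r" (*val), [e] "+r" (err)
                         : [a] "m" (*addr), [fail] "i" (-14));
            return err;
    }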
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index c6eba2b4267..24d3c91e981 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/i8253.h> | 14 | #include <linux/i8253.h> |
15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mca.h> | ||
18 | 17 | ||
19 | #include <asm/vsyscall.h> | 18 | #include <asm/vsyscall.h> |
20 | #include <asm/x86_init.h> | 19 | #include <asm/x86_init.h> |
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc); | |||
58 | static irqreturn_t timer_interrupt(int irq, void *dev_id) | 57 | static irqreturn_t timer_interrupt(int irq, void *dev_id) |
59 | { | 58 | { |
60 | global_clock_event->event_handler(global_clock_event); | 59 | global_clock_event->event_handler(global_clock_event); |
61 | |||
62 | /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ | ||
63 | if (MCA_bus) | ||
64 | outb_p(inb_p(0x61)| 0x80, 0x61); | ||
65 | |||
66 | return IRQ_HANDLED; | 60 | return IRQ_HANDLED; |
67 | } | 61 | } |
68 | 62 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f1602..ff08457a025 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -37,10 +37,6 @@ | |||
37 | #include <linux/eisa.h> | 37 | #include <linux/eisa.h> |
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | #ifdef CONFIG_MCA | ||
41 | #include <linux/mca.h> | ||
42 | #endif | ||
43 | |||
44 | #if defined(CONFIG_EDAC) | 40 | #if defined(CONFIG_EDAC) |
45 | #include <linux/edac.h> | 41 | #include <linux/edac.h> |
46 | #endif | 42 | #endif |
@@ -50,6 +46,7 @@ | |||
50 | #include <asm/processor.h> | 46 | #include <asm/processor.h> |
51 | #include <asm/debugreg.h> | 47 | #include <asm/debugreg.h> |
52 | #include <linux/atomic.h> | 48 | #include <linux/atomic.h> |
49 | #include <asm/ftrace.h> | ||
53 | #include <asm/traps.h> | 50 | #include <asm/traps.h> |
54 | #include <asm/desc.h> | 51 | #include <asm/desc.h> |
55 | #include <asm/i387.h> | 52 | #include <asm/i387.h> |
@@ -303,8 +300,13 @@ gp_in_kernel: | |||
303 | } | 300 | } |
304 | 301 | ||
305 | /* May run on IST stack. */ | 302 | /* May run on IST stack. */ |
306 | dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) | 303 | dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) |
307 | { | 304 | { |
305 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
306 | /* ftrace must be first, everything else may cause a recursive crash */ | ||
307 | if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) | ||
308 | return; | ||
309 | #endif | ||
308 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP | 310 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP |
309 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, | 311 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, |
310 | SIGTRAP) == NOTIFY_STOP) | 312 | SIGTRAP) == NOTIFY_STOP) |
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 00000000000..dc4e910a7d9 --- /dev/null +++ b/arch/x86/kernel/uprobes.c | |||
@@ -0,0 +1,674 @@ | |||
1 | /* | ||
2 | * User-space Probes (UProbes) for x86 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2008-2011 | ||
19 | * Authors: | ||
20 | * Srikar Dronamraju | ||
21 | * Jim Keniston | ||
22 | */ | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/sched.h> | ||
25 | #include <linux/ptrace.h> | ||
26 | #include <linux/uprobes.h> | ||
27 | #include <linux/uaccess.h> | ||
28 | |||
29 | #include <linux/kdebug.h> | ||
30 | #include <asm/processor.h> | ||
31 | #include <asm/insn.h> | ||
32 | |||
33 | /* Post-execution fixups. */ | ||
34 | |||
35 | /* No fixup needed */ | ||
36 | #define UPROBE_FIX_NONE 0x0 | ||
37 | |||
38 | /* Adjust IP back to vicinity of actual insn */ | ||
39 | #define UPROBE_FIX_IP 0x1 | ||
40 | |||
41 | /* Adjust the return address of a call insn */ | ||
42 | #define UPROBE_FIX_CALL 0x2 | ||
43 | |||
44 | #define UPROBE_FIX_RIP_AX 0x8000 | ||
45 | #define UPROBE_FIX_RIP_CX 0x4000 | ||
46 | |||
47 | #define UPROBE_TRAP_NR UINT_MAX | ||
48 | |||
49 | /* Adaptations for mhiramat x86 decoder v14. */ | ||
50 | #define OPCODE1(insn) ((insn)->opcode.bytes[0]) | ||
51 | #define OPCODE2(insn) ((insn)->opcode.bytes[1]) | ||
52 | #define OPCODE3(insn) ((insn)->opcode.bytes[2]) | ||
53 | #define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) | ||
54 | |||
55 | #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ | ||
56 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
57 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
58 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
59 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
60 | << (row % 32)) | ||
61 | |||
62 | /* | ||
63 | * Good-instruction tables for 32-bit apps. This is non-const and volatile | ||
64 | * to keep gcc from statically optimizing it out, as variable_test_bit can | ||
65 | * make some versions of gcc think only *(unsigned long*) is used. | ||
66 | */ | ||
67 | static volatile u32 good_insns_32[256 / 32] = { | ||
68 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
69 | /* ---------------------------------------------- */ | ||
70 | W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ | ||
71 | W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ | ||
72 | W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ | ||
73 | W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ | ||
74 | W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ | ||
75 | W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ | ||
76 | W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ | ||
77 | W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ | ||
78 | W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ | ||
79 | W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ | ||
80 | W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ | ||
81 | W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ | ||
82 | W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ | ||
83 | W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ | ||
84 | W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ | ||
85 | W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ | ||
86 | /* ---------------------------------------------- */ | ||
87 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
88 | }; | ||
89 | |||
90 | /* Using this for both 64-bit and 32-bit apps */ | ||
91 | static volatile u32 good_2byte_insns[256 / 32] = { | ||
92 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
93 | /* ---------------------------------------------- */ | ||
94 | W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ | ||
95 | W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ | ||
96 | W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ | ||
97 | W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ | ||
98 | W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ | ||
99 | W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ | ||
100 | W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ | ||
101 | W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ | ||
102 | W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ | ||
103 | W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ | ||
104 | W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ | ||
105 | W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ | ||
106 | W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ | ||
107 | W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ | ||
108 | W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ | ||
109 | W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ | ||
110 | /* ---------------------------------------------- */ | ||
111 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
112 | }; | ||
113 | |||
114 | #ifdef CONFIG_X86_64 | ||
115 | /* Good-instruction tables for 64-bit apps */ | ||
116 | static volatile u32 good_insns_64[256 / 32] = { | ||
117 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
118 | /* ---------------------------------------------- */ | ||
119 | W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ | ||
120 | W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ | ||
121 | W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ | ||
122 | W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ | ||
123 | W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ | ||
124 | W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ | ||
125 | W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ | ||
126 | W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ | ||
127 | W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ | ||
128 | W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ | ||
129 | W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ | ||
130 | W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ | ||
131 | W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ | ||
132 | W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ | ||
133 | W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ | ||
134 | W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ | ||
135 | /* ---------------------------------------------- */ | ||
136 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
137 | }; | ||
138 | #endif | ||
139 | #undef W | ||
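
Each W() row above packs sixteen 0/1 flags into one 32-bit word, so the four-word arrays form a 256-bit bitmap with one validity bit per opcode byte; validate_insn_*bits() below then consults it with test_bit(). A userspace sketch of the same lookup (the table contents here are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>

    /* Bit N set => one-byte opcode N is considered probeable. */
    static const uint32_t good_insns[256 / 32] = { 0xffffffff /* 0x00-0x1f */ };

    static int opcode_ok(uint8_t op)
    {
            /* same bit addressing test_bit() performs on the kernel tables */
            return (good_insns[op / 32] >> (op % 32)) & 1;
    }

    int main(void)
    {
            printf("0x05 -> %d, 0x40 -> %d\n", opcode_ok(0x05), opcode_ok(0x40));
            return 0;
    }
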
140 | |||
141 | /* | ||
142 | * opcodes we'll probably never support: | ||
143 | * | ||
144 | * 6c-6d, e4-e5, ec-ed - in | ||
145 | * 6e-6f, e6-e7, ee-ef - out | ||
146 | * cc, cd - int3, int | ||
147 | * cf - iret | ||
148 | * d6 - illegal instruction | ||
149 | * f1 - int1/icebp | ||
150 | * f4 - hlt | ||
151 | * fa, fb - cli, sti | ||
152 | * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 | ||
153 | * | ||
154 | * invalid opcodes in 64-bit mode: | ||
155 | * | ||
156 | * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 | ||
157 | * 63 - we support this opcode in x86_64 but not in i386. | ||
158 | * | ||
159 | * opcodes we may need to refine support for: | ||
160 | * | ||
161 | * 0f - 2-byte instructions: For many of these instructions, the validity | ||
162 | * depends on the prefix and/or the reg field. On such instructions, we | ||
163 | * just consider the opcode combination valid if it corresponds to any | ||
164 | * valid instruction. | ||
165 | * | ||
166 | * 8f - Group 1 - only reg = 0 is OK | ||
167 | * c6-c7 - Group 11 - only reg = 0 is OK | ||
168 | * d9-df - fpu insns with some illegal encodings | ||
169 | * f2, f3 - repnz, repz prefixes. These are also the first byte for | ||
170 | * certain floating-point instructions, such as addsd. | ||
171 | * | ||
172 | * fe - Group 4 - only reg = 0 or 1 is OK | ||
173 | * ff - Group 5 - only reg = 0-6 is OK | ||
174 | * | ||
175 | * others -- Do we need to support these? | ||
176 | * | ||
177 | * 0f - (floating-point?) prefetch instructions | ||
178 | * 07, 17, 1f - pop es, pop ss, pop ds | ||
179 | * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- | ||
180 | * but 64 and 65 (fs: and gs:) seem to be used, so we support them | ||
181 | * 67 - addr16 prefix | ||
182 | * ce - into | ||
183 | * f0 - lock prefix | ||
184 | */ | ||
185 | |||
186 | /* | ||
187 | * TODO: | ||
188 | * - Where necessary, examine the modrm byte and allow only valid instructions | ||
189 | * in the different Groups and fpu instructions. | ||
190 | */ | ||
191 | |||
192 | static bool is_prefix_bad(struct insn *insn) | ||
193 | { | ||
194 | int i; | ||
195 | |||
196 | for (i = 0; i < insn->prefixes.nbytes; i++) { | ||
197 | switch (insn->prefixes.bytes[i]) { | ||
198 | case 0x26: /* INAT_PFX_ES */ | ||
199 | case 0x2E: /* INAT_PFX_CS */ | ||
200 | case 0x36: /* INAT_PFX_DS */ | ||
201 | case 0x3E: /* INAT_PFX_SS */ | ||
202 | case 0xF0: /* INAT_PFX_LOCK */ | ||
203 | return true; | ||
204 | } | ||
205 | } | ||
206 | return false; | ||
207 | } | ||
208 | |||
209 | static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) | ||
210 | { | ||
211 | insn_init(insn, auprobe->insn, false); | ||
212 | |||
213 | /* Skip good instruction prefixes; reject "bad" ones. */ | ||
214 | insn_get_opcode(insn); | ||
215 | if (is_prefix_bad(insn)) | ||
216 | return -ENOTSUPP; | ||
217 | |||
218 | if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) | ||
219 | return 0; | ||
220 | |||
221 | if (insn->opcode.nbytes == 2) { | ||
222 | if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) | ||
223 | return 0; | ||
224 | } | ||
225 | |||
226 | return -ENOTSUPP; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Figure out which fixups arch_uprobe_post_xol() will need to perform, and | ||
231 | * annotate arch_uprobe->fixups accordingly. To start with, | ||
232 | * arch_uprobe->fixups is either zero or it reflects rip-related fixups. | ||
233 | */ | ||
234 | static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) | ||
235 | { | ||
236 | bool fix_ip = true, fix_call = false; /* defaults */ | ||
237 | int reg; | ||
238 | |||
239 | insn_get_opcode(insn); /* should be a nop */ | ||
240 | |||
241 | switch (OPCODE1(insn)) { | ||
242 | case 0xc3: /* ret/lret */ | ||
243 | case 0xcb: | ||
244 | case 0xc2: | ||
245 | case 0xca: | ||
246 | /* ip is correct */ | ||
247 | fix_ip = false; | ||
248 | break; | ||
249 | case 0xe8: /* call relative - Fix return addr */ | ||
250 | fix_call = true; | ||
251 | break; | ||
252 | case 0x9a: /* call absolute - Fix return addr, not ip */ | ||
253 | fix_call = true; | ||
254 | fix_ip = false; | ||
255 | break; | ||
256 | case 0xff: | ||
257 | insn_get_modrm(insn); | ||
258 | reg = MODRM_REG(insn); | ||
259 | if (reg == 2 || reg == 3) { | ||
260 | /* call or lcall, indirect */ | ||
261 | /* Fix return addr; ip is correct. */ | ||
262 | fix_call = true; | ||
263 | fix_ip = false; | ||
264 | } else if (reg == 4 || reg == 5) { | ||
265 | /* jmp or ljmp, indirect */ | ||
266 | /* ip is correct. */ | ||
267 | fix_ip = false; | ||
268 | } | ||
269 | break; | ||
270 | case 0xea: /* jmp absolute -- ip is correct */ | ||
271 | fix_ip = false; | ||
272 | break; | ||
273 | default: | ||
274 | break; | ||
275 | } | ||
276 | if (fix_ip) | ||
277 | auprobe->fixups |= UPROBE_FIX_IP; | ||
278 | if (fix_call) | ||
279 | auprobe->fixups |= UPROBE_FIX_CALL; | ||
280 | } | ||
281 | |||
282 | #ifdef CONFIG_X86_64 | ||
283 | /* | ||
284 | * If arch_uprobe->insn doesn't use rip-relative addressing, return | ||
285 | * immediately. Otherwise, rewrite the instruction so that it accesses | ||
286 | * its memory operand indirectly through a scratch register. Set | ||
287 | * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address | ||
288 | * accordingly. (The contents of the scratch register will be saved | ||
289 | * before we single-step the modified instruction, and restored | ||
290 | * afterward.) | ||
291 | * | ||
292 | * We do this because a rip-relative instruction can access only a | ||
293 | * relatively small area (+/- 2 GB from the instruction), and the XOL | ||
294 | * area typically lies beyond that area. At least for instructions | ||
295 | * that store to memory, we can't execute the original instruction | ||
296 | * and "fix things up" later, because the misdirected store could be | ||
297 | * disastrous. | ||
298 | * | ||
299 | * Some useful facts about rip-relative instructions: | ||
300 | * | ||
301 | * - There's always a modrm byte. | ||
302 | * - There's never a SIB byte. | ||
303 | * - The displacement is always 4 bytes. | ||
304 | */ | ||
305 | static void | ||
306 | handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) | ||
307 | { | ||
308 | u8 *cursor; | ||
309 | u8 reg; | ||
310 | |||
311 | if (mm->context.ia32_compat) | ||
312 | return; | ||
313 | |||
314 | auprobe->rip_rela_target_address = 0x0; | ||
315 | if (!insn_rip_relative(insn)) | ||
316 | return; | ||
317 | |||
318 | /* | ||
319 | * insn_rip_relative() would have decoded rex_prefix, modrm. | ||
320 | * Clear REX.b bit (extension of MODRM.rm field): | ||
321 | * we want to encode rax/rcx, not r8/r9. | ||
322 | */ | ||
323 | if (insn->rex_prefix.nbytes) { | ||
324 | cursor = auprobe->insn + insn_offset_rex_prefix(insn); | ||
325 | *cursor &= 0xfe; /* Clearing REX.B bit */ | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Point cursor at the modrm byte. The next 4 bytes are the | ||
330 | * displacement. Beyond the displacement, for some instructions, | ||
331 | * is the immediate operand. | ||
332 | */ | ||
333 | cursor = auprobe->insn + insn_offset_modrm(insn); | ||
334 | insn_get_length(insn); | ||
335 | |||
336 | /* | ||
337 | * Convert from rip-relative addressing to indirect addressing | ||
338 | * via a scratch register. Change the r/m field from 0x5 (%rip) | ||
339 | * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. | ||
340 | */ | ||
341 | reg = MODRM_REG(insn); | ||
342 | if (reg == 0) { | ||
343 | /* | ||
344 | * The register operand (if any) is either the A register | ||
345 | * (%rax, %eax, etc.) or (if the 0x4 bit is set in the | ||
346 | * REX prefix) %r8. In any case, we know the C register | ||
347 | * is NOT the register operand, so we use %rcx (register | ||
348 | * #1) for the scratch register. | ||
349 | */ | ||
350 | auprobe->fixups = UPROBE_FIX_RIP_CX; | ||
351 | /* Change modrm from 00 000 101 to 00 000 001. */ | ||
352 | *cursor = 0x1; | ||
353 | } else { | ||
354 | /* Use %rax (register #0) for the scratch register. */ | ||
355 | auprobe->fixups = UPROBE_FIX_RIP_AX; | ||
356 | /* Change modrm from 00 xxx 101 to 00 xxx 000 */ | ||
357 | *cursor = (reg << 3); | ||
358 | } | ||
359 | |||
360 | /* Offset from probed address to target: insn length + (signed) displacement */ | ||
361 | auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; | ||
362 | |||
363 | /* Displacement field is gone; slide immediate field (if any) over. */ | ||
364 | if (insn->immediate.nbytes) { | ||
365 | cursor++; | ||
366 | memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); | ||
367 | } | ||
368 | return; | ||
369 | } | ||
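
To make the rewrite concrete: "mov %edx,0x1234(%rip)" encodes as 89 15 34 12 00 00, where modrm 0x15 is mod 00, reg 010 (%edx), rm 101 (%rip). Since reg != 0, the code above picks %rax as the scratch register, sets modrm to reg << 3 (0x10, i.e. "mov %edx,(%rax)") and squeezes out the four displacement bytes. A toy version of just that byte surgery (the instruction bytes are chosen for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint8_t insn[6] = { 0x89, 0x15, 0x34, 0x12, 0x00, 0x00 };
            uint8_t reg = (insn[1] >> 3) & 0x7;     /* MODRM.reg = 2 (%edx) */

            insn[1] = reg << 3;     /* rm 101 (%rip) -> 000 (%rax) */
            /* displacement dropped; an immediate, if any, would slide left */
            printf("%02x %02x = mov %%edx,(%%rax); target = next ip + 0x1234\n",
                   insn[0], insn[1]);
            return 0;
    }
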
370 | |||
371 | static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) | ||
372 | { | ||
373 | insn_init(insn, auprobe->insn, true); | ||
374 | |||
375 | /* Skip good instruction prefixes; reject "bad" ones. */ | ||
376 | insn_get_opcode(insn); | ||
377 | if (is_prefix_bad(insn)) | ||
378 | return -ENOTSUPP; | ||
379 | |||
380 | if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) | ||
381 | return 0; | ||
382 | |||
383 | if (insn->opcode.nbytes == 2) { | ||
384 | if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) | ||
385 | return 0; | ||
386 | } | ||
387 | return -ENOTSUPP; | ||
388 | } | ||
389 | |||
390 | static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) | ||
391 | { | ||
392 | if (mm->context.ia32_compat) | ||
393 | return validate_insn_32bits(auprobe, insn); | ||
394 | return validate_insn_64bits(auprobe, insn); | ||
395 | } | ||
396 | #else /* 32-bit: */ | ||
397 | static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) | ||
398 | { | ||
399 | /* No RIP-relative addressing on 32-bit */ | ||
400 | } | ||
401 | |||
402 | static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) | ||
403 | { | ||
404 | return validate_insn_32bits(auprobe, insn); | ||
405 | } | ||
406 | #endif /* CONFIG_X86_64 */ | ||
407 | |||
408 | /** | ||
409 | * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. | ||
410 | * @mm: the probed address space. | ||
411 | * @auprobe: the probepoint information. | ||
412 | * Return 0 on success or a negative errno on error. | ||
413 | */ | ||
414 | int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) | ||
415 | { | ||
416 | int ret; | ||
417 | struct insn insn; | ||
418 | |||
419 | auprobe->fixups = 0; | ||
420 | ret = validate_insn_bits(auprobe, mm, &insn); | ||
421 | if (ret != 0) | ||
422 | return ret; | ||
423 | |||
424 | handle_riprel_insn(auprobe, mm, &insn); | ||
425 | prepare_fixups(auprobe, &insn); | ||
426 | |||
427 | return 0; | ||
428 | } | ||
429 | |||
430 | #ifdef CONFIG_X86_64 | ||
431 | /* | ||
432 | * If we're emulating a rip-relative instruction, save the contents | ||
433 | * of the scratch register and store the target address in that register. | ||
434 | */ | ||
435 | static void | ||
436 | pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, | ||
437 | struct arch_uprobe_task *autask) | ||
438 | { | ||
439 | if (auprobe->fixups & UPROBE_FIX_RIP_AX) { | ||
440 | autask->saved_scratch_register = regs->ax; | ||
441 | regs->ax = current->utask->vaddr; | ||
442 | regs->ax += auprobe->rip_rela_target_address; | ||
443 | } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { | ||
444 | autask->saved_scratch_register = regs->cx; | ||
445 | regs->cx = current->utask->vaddr; | ||
446 | regs->cx += auprobe->rip_rela_target_address; | ||
447 | } | ||
448 | } | ||
449 | #else | ||
450 | static void | ||
451 | pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, | ||
452 | struct arch_uprobe_task *autask) | ||
453 | { | ||
454 | /* No RIP-relative addressing on 32-bit */ | ||
455 | } | ||
456 | #endif | ||
457 | |||
458 | /* | ||
459 | * arch_uprobe_pre_xol - prepare to execute out of line. | ||
460 | * @auprobe: the probepoint information. | ||
461 | * @regs: reflects the saved user state of current task. | ||
462 | */ | ||
463 | int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | ||
464 | { | ||
465 | struct arch_uprobe_task *autask; | ||
466 | |||
467 | autask = ¤t->utask->autask; | ||
468 | autask->saved_trap_nr = current->thread.trap_nr; | ||
469 | current->thread.trap_nr = UPROBE_TRAP_NR; | ||
470 | regs->ip = current->utask->xol_vaddr; | ||
471 | pre_xol_rip_insn(auprobe, regs, autask); | ||
472 | |||
473 | return 0; | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * This function is called by arch_uprobe_post_xol() to adjust the return | ||
478 | * address pushed by a call instruction executed out of line. | ||
479 | */ | ||
480 | static int adjust_ret_addr(unsigned long sp, long correction) | ||
481 | { | ||
482 | int rasize, ncopied; | ||
483 | long ra = 0; | ||
484 | |||
485 | if (is_ia32_task()) | ||
486 | rasize = 4; | ||
487 | else | ||
488 | rasize = 8; | ||
489 | |||
490 | ncopied = copy_from_user(&ra, (void __user *)sp, rasize); | ||
491 | if (unlikely(ncopied)) | ||
492 | return -EFAULT; | ||
493 | |||
494 | ra += correction; | ||
495 | ncopied = copy_to_user((void __user *)sp, &ra, rasize); | ||
496 | if (unlikely(ncopied)) | ||
497 | return -EFAULT; | ||
498 | |||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | #ifdef CONFIG_X86_64 | ||
503 | static bool is_riprel_insn(struct arch_uprobe *auprobe) | ||
504 | { | ||
505 | return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); | ||
506 | } | ||
507 | |||
508 | static void | ||
509 | handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) | ||
510 | { | ||
511 | if (is_riprel_insn(auprobe)) { | ||
512 | struct arch_uprobe_task *autask; | ||
513 | |||
514 | autask = ¤t->utask->autask; | ||
515 | if (auprobe->fixups & UPROBE_FIX_RIP_AX) | ||
516 | regs->ax = autask->saved_scratch_register; | ||
517 | else | ||
518 | regs->cx = autask->saved_scratch_register; | ||
519 | |||
520 | /* | ||
521 | * The original instruction includes a displacement, and so | ||
522 | * is 4 bytes longer than what we've just single-stepped. | ||
523 | * Fall through to handle stuff like "jmpq *...(%rip)" and | ||
524 | * "callq *...(%rip)". | ||
525 | */ | ||
526 | if (correction) | ||
527 | *correction += 4; | ||
528 | } | ||
529 | } | ||
530 | #else | ||
531 | static void | ||
532 | handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) | ||
533 | { | ||
534 | /* No RIP-relative addressing on 32-bit */ | ||
535 | } | ||
536 | #endif | ||
537 | |||
538 | /* | ||
539 | * If the xol insn itself traps and generates a signal (say, | ||
540 | * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped | ||
541 | * instruction jumps back to its own address. It is assumed that anything | ||
542 | * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. | ||
543 | * | ||
544 | * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, | ||
545 | * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to | ||
546 | * UPROBE_TRAP_NR (UINT_MAX, i.e. (unsigned)-1) set by arch_uprobe_pre_xol(). | ||
547 | */ | ||
548 | bool arch_uprobe_xol_was_trapped(struct task_struct *t) | ||
549 | { | ||
550 | if (t->thread.trap_nr != UPROBE_TRAP_NR) | ||
551 | return true; | ||
552 | |||
553 | return false; | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * Called after single-stepping. To avoid the SMP problems that can | ||
558 | * occur when we temporarily put back the original opcode to | ||
559 | * single-step, we single-stepped a copy of the instruction. | ||
560 | * | ||
561 | * This function prepares to resume execution after the single-step. | ||
562 | * We have to fix things up as follows: | ||
563 | * | ||
564 | * Typically, the new ip is relative to the copied instruction. We need | ||
565 | * to make it relative to the original instruction (FIX_IP). Exceptions | ||
566 | * are return instructions and absolute or indirect jump or call instructions. | ||
567 | * | ||
568 | * If the single-stepped instruction was a call, the return address that | ||
569 | * is atop the stack is the address following the copied instruction. We | ||
570 | * need to make it the address following the original instruction (FIX_CALL). | ||
571 | * | ||
572 | * If the original instruction was a rip-relative instruction such as | ||
573 | * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent | ||
574 | * instruction using a scratch register -- e.g., "movl %edx,(%rax)". | ||
575 | * We need to restore the contents of the scratch register and adjust | ||
576 | * the ip, keeping in mind that the instruction we executed is 4 bytes | ||
577 | * shorter than the original instruction (since we squeezed out the offset | ||
578 | * field). (FIX_RIP_AX or FIX_RIP_CX) | ||
579 | */ | ||
580 | int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | ||
581 | { | ||
582 | struct uprobe_task *utask; | ||
583 | long correction; | ||
584 | int result = 0; | ||
585 | |||
586 | WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); | ||
587 | |||
588 | utask = current->utask; | ||
589 | current->thread.trap_nr = utask->autask.saved_trap_nr; | ||
590 | correction = (long)(utask->vaddr - utask->xol_vaddr); | ||
591 | handle_riprel_post_xol(auprobe, regs, &correction); | ||
592 | if (auprobe->fixups & UPROBE_FIX_IP) | ||
593 | regs->ip += correction; | ||
594 | |||
595 | if (auprobe->fixups & UPROBE_FIX_CALL) | ||
596 | result = adjust_ret_addr(regs->sp, correction); | ||
597 | |||
598 | return result; | ||
599 | } | ||
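
A worked instance of the correction arithmetic, under hypothetical addresses: if the probed 5-byte call lives at vaddr 0x400100 and its copy was stepped in an XOL slot at 0x7fff0000, the correction maps the post-step ip back into the original text, and adjust_ret_addr() applies the same delta to the return address the copied call pushed:

    #include <stdio.h>

    int main(void)
    {
            unsigned long vaddr     = 0x400100UL;    /* probed call insn */
            unsigned long xol_vaddr = 0x7fff0000UL;  /* XOL slot it ran from */
            unsigned long ip  = xol_vaddr + 5;       /* ip after the copy */
            unsigned long ret = xol_vaddr + 5;       /* return addr it pushed */
            long correction = (long)(vaddr - xol_vaddr);

            printf("fixed ip  = %#lx\n", ip + correction);   /* 0x400105 */
            printf("fixed ret = %#lx\n", ret + correction);  /* 0x400105 */
            return 0;
    }
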
600 | |||
601 | /* callback routine for handling exceptions. */ | ||
602 | int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) | ||
603 | { | ||
604 | struct die_args *args = data; | ||
605 | struct pt_regs *regs = args->regs; | ||
606 | int ret = NOTIFY_DONE; | ||
607 | |||
608 | /* We are only interested in userspace traps */ | ||
609 | if (regs && !user_mode_vm(regs)) | ||
610 | return NOTIFY_DONE; | ||
611 | |||
612 | switch (val) { | ||
613 | case DIE_INT3: | ||
614 | if (uprobe_pre_sstep_notifier(regs)) | ||
615 | ret = NOTIFY_STOP; | ||
616 | |||
617 | break; | ||
618 | |||
619 | case DIE_DEBUG: | ||
620 | if (uprobe_post_sstep_notifier(regs)) | ||
621 | ret = NOTIFY_STOP; | ||
622 | |||
623 | default: | ||
624 | break; | ||
625 | } | ||
626 | |||
627 | return ret; | ||
628 | } | ||
629 | |||
630 | /* | ||
631 | * This function gets called when the XOL instruction either traps or the | ||
632 | * thread receives a fatal signal, so reset the instruction pointer to the | ||
633 | * probed address. | ||
634 | */ | ||
635 | void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | ||
636 | { | ||
637 | struct uprobe_task *utask = current->utask; | ||
638 | |||
639 | current->thread.trap_nr = utask->autask.saved_trap_nr; | ||
640 | handle_riprel_post_xol(auprobe, regs, NULL); | ||
641 | instruction_pointer_set(regs, utask->vaddr); | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * Skip these instructions as per the currently known x86 ISA. | ||
646 | * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } | ||
647 | */ | ||
648 | bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) | ||
649 | { | ||
650 | int i; | ||
651 | |||
652 | for (i = 0; i < MAX_UINSN_BYTES; i++) { | ||
653 | if (auprobe->insn[i] == 0x66) | ||
654 | continue; | ||
655 | |||
656 | if (auprobe->insn[i] == 0x90) | ||
657 | return true; | ||
658 | |||
659 | if (i == (MAX_UINSN_BYTES - 1)) | ||
660 | break; | ||
661 | |||
662 | if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) | ||
663 | return true; | ||
664 | |||
665 | if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) | ||
666 | return true; | ||
667 | |||
668 | if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) | ||
669 | return true; | ||
670 | |||
671 | break; | ||
672 | } | ||
673 | return false; | ||
674 | } | ||
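
The pattern 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } covers the common NOP spellings: 90 (nop), 66 90 (xchg %ax,%ax), 0f 1f /r (long nopl, including the canonical 66 0f 1f 44 00 00), 0f 19 (reserved-NOP range) and 87 c0 (xchg %eax,%eax). A standalone check against a few such encodings (MAX_UINSN_BYTES assumed to be 16, as in the x86 uprobes header):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_UINSN_BYTES 16      /* assumption; see asm/uprobes.h */

    static bool skippable(const uint8_t *insn)
    {
            for (int i = 0; i < MAX_UINSN_BYTES; i++) {
                    if (insn[i] == 0x66)
                            continue;                       /* 0x66 prefixes */
                    if (insn[i] == 0x90)
                            return true;                    /* nop */
                    if (i == MAX_UINSN_BYTES - 1)
                            break;
                    if (insn[i] == 0x0f &&
                        (insn[i + 1] == 0x1f || insn[i + 1] == 0x19))
                            return true;                    /* nopl / rsvd-nop */
                    if (insn[i] == 0x87 && insn[i + 1] == 0xc0)
                            return true;                    /* xchg %eax,%eax */
                    break;
            }
            return false;
    }

    int main(void)
    {
            uint8_t nop6[MAX_UINSN_BYTES] = { 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00 };
            uint8_t mov[MAX_UINSN_BYTES]  = { 0x89, 0xd8 };

            printf("66 0f 1f 44 00 00 -> %d, 89 d8 -> %d\n",
                   skippable(nop6), skippable(mov));
            return 0;
    }
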
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a1d804bcd48..8eeb55a551b 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/pci_ids.h> | 16 | #include <linux/pci_ids.h> |
17 | #include <linux/pci_regs.h> | 17 | #include <linux/pci_regs.h> |
18 | #include <linux/smp.h> | ||
18 | 19 | ||
19 | #include <asm/apic.h> | 20 | #include <asm/apic.h> |
20 | #include <asm/pci-direct.h> | 21 | #include <asm/pci-direct.h> |
@@ -22,6 +23,8 @@ | |||
22 | #include <asm/paravirt.h> | 23 | #include <asm/paravirt.h> |
23 | #include <asm/setup.h> | 24 | #include <asm/setup.h> |
24 | 25 | ||
26 | #define TOPOLOGY_REGISTER_OFFSET 0x10 | ||
27 | |||
25 | #if defined CONFIG_PCI && defined CONFIG_PARAVIRT | 28 | #if defined CONFIG_PCI && defined CONFIG_PARAVIRT |
26 | /* | 29 | /* |
27 | * Interrupt control on vSMPowered systems: | 30 | * Interrupt control on vSMPowered systems: |
@@ -149,12 +152,49 @@ int is_vsmp_box(void) | |||
149 | return 0; | 152 | return 0; |
150 | } | 153 | } |
151 | #endif | 154 | #endif |
155 | |||
156 | static void __init vsmp_cap_cpus(void) | ||
157 | { | ||
158 | #if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) | ||
159 | void __iomem *address; | ||
160 | unsigned int cfg, topology, node_shift, maxcpus; | ||
161 | |||
162 | /* | ||
163 | * CONFIG_X86_VSMP is not configured, so limit the number of CPUs to the | ||
164 | * ones present in the first board, unless explicitly overridden by | ||
165 | * setup_max_cpus | ||
166 | */ | ||
167 | if (setup_max_cpus != NR_CPUS) | ||
168 | return; | ||
169 | |||
170 | /* Read the vSMP Foundation topology register */ | ||
171 | cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); | ||
172 | address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); | ||
173 | if (WARN_ON(!address)) | ||
174 | return; | ||
175 | |||
176 | topology = readl(address); | ||
177 | node_shift = (topology >> 16) & 0x7; | ||
178 | if (!node_shift) | ||
179 | /* The value 0 should be decoded as 8 */ | ||
180 | node_shift = 8; | ||
181 | maxcpus = (topology & ((1 << node_shift) - 1)) + 1; | ||
182 | |||
183 | pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", | ||
184 | maxcpus); | ||
185 | setup_max_cpus = maxcpus; | ||
186 | early_iounmap(address, 4); | ||
187 | #endif | ||
188 | } | ||
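
Decoding the topology register, with an assumed raw value for illustration: bits 18:16 hold the node shift (0 decodes as 8), and the low node_shift bits hold the highest CPU index on the first board:

    #include <stdio.h>

    int main(void)
    {
            unsigned int topology = 0x00030005;     /* assumed register value */
            unsigned int node_shift = (topology >> 16) & 0x7;

            if (!node_shift)
                    node_shift = 8;                 /* 0 decodes as 8 */

            unsigned int maxcpus = (topology & ((1u << node_shift) - 1)) + 1;
            printf("node_shift=%u maxcpus=%u\n", node_shift, maxcpus); /* 3, 6 */
            return 0;
    }
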
189 | |||
152 | void __init vsmp_init(void) | 190 | void __init vsmp_init(void) |
153 | { | 191 | { |
154 | detect_vsmp_box(); | 192 | detect_vsmp_box(); |
155 | if (!is_vsmp_box()) | 193 | if (!is_vsmp_box()) |
156 | return; | 194 | return; |
157 | 195 | ||
196 | vsmp_cap_cpus(); | ||
197 | |||
158 | set_vsmp_pv_ops(); | 198 | set_vsmp_pv_ops(); |
159 | return; | 199 | return; |
160 | } | 200 | } |
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9cf71d0b2d3..35c5e543f55 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
19 | #include <asm/time.h> | 19 | #include <asm/time.h> |
20 | #include <asm/irq.h> | 20 | #include <asm/irq.h> |
21 | #include <asm/io_apic.h> | ||
21 | #include <asm/pat.h> | 22 | #include <asm/pat.h> |
22 | #include <asm/tsc.h> | 23 | #include <asm/tsc.h> |
23 | #include <asm/iommu.h> | 24 | #include <asm/iommu.h> |
@@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = { | |||
119 | .teardown_msi_irqs = default_teardown_msi_irqs, | 120 | .teardown_msi_irqs = default_teardown_msi_irqs, |
120 | .restore_msi_irqs = default_restore_msi_irqs, | 121 | .restore_msi_irqs = default_restore_msi_irqs, |
121 | }; | 122 | }; |
123 | |||
124 | struct x86_io_apic_ops x86_io_apic_ops = { | ||
125 | .init = native_io_apic_init_mappings, | ||
126 | .read = native_io_apic_read, | ||
127 | .write = native_io_apic_write, | ||
128 | .modify = native_io_apic_modify, | ||
129 | }; | ||
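
x86_io_apic_ops mirrors the x86_msi pattern just above it: callers reach the IO-APIC through a table of function pointers initialized to the native handlers, so code such as interrupt remapping can substitute its own at boot. The pattern in miniature (names outside the diff are stand-ins):

    #include <stdio.h>

    struct io_apic_ops {
            unsigned int (*read)(unsigned int apic, unsigned int reg);
    };

    static unsigned int native_read(unsigned int apic, unsigned int reg)
    {
            printf("native read: apic=%u reg=%u\n", apic, reg);
            return 0;
    }

    /* boot code may overwrite .read before any caller uses it */
    static struct io_apic_ops io_apic_ops = { .read = native_read };

    int main(void)
    {
            io_apic_ops.read(0, 1);  /* callers never name native_read() */
            return 0;
    }
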
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e62728e30b0..bd18149b2b0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk) | |||
48 | if (!fx) | 48 | if (!fx) |
49 | return; | 49 | return; |
50 | 50 | ||
51 | BUG_ON(__thread_has_fpu(tsk)); | ||
52 | |||
53 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | 51 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; |
54 | 52 | ||
55 | /* | 53 | /* |