author     Ingo Molnar <mingo@kernel.org>	2012-04-14 07:18:27 -0400
committer  Ingo Molnar <mingo@kernel.org>	2012-04-14 07:19:04 -0400
commit     6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e (patch)
tree       021cc9f6b477146fcebe6f3be4752abfa2ba18a9 /arch/x86/kernel
parent     682968e0c425c60f0dde37977e5beb2b12ddc4cc (diff)
parent     a385ec4f11bdcf81af094c03e2444ee9b7fad2e5 (diff)
Merge branch 'perf/core' into perf/uprobes
Merge in latest upstream (and the latest perf development tree), to prepare for
tooling changes, and also to pick up v3.4 MM changes that the uprobes code
needs to take care of.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 7
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 1
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 11
-rw-r--r--  arch/x86/kernel/apic/apic.c | 13
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 1
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 7
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 199
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 6
-rw-r--r--  arch/x86/kernel/apm_32.c | 12
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 18
-rw-r--r--  arch/x86/kernel/cpu/match.c | 91
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 195
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 106
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 43
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 21
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 194
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 22
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 526
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 13
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 19
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/cpuid.c | 1
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 6
-rw-r--r--  arch/x86/kernel/devicetree.c | 101
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/entry_32.S | 17
-rw-r--r--  arch/x86/kernel/entry_64.S | 73
-rw-r--r--  arch/x86/kernel/i387.c | 83
-rw-r--r--  arch/x86/kernel/i8259.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 7
-rw-r--r--  arch/x86/kernel/irq_32.c | 11
-rw-r--r--  arch/x86/kernel/irqinit.c | 7
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 9
-rw-r--r--  arch/x86/kernel/kgdb.c | 67
-rw-r--r--  arch/x86/kernel/kprobes-common.h | 102
-rw-r--r--  arch/x86/kernel/kprobes-opt.c | 512
-rw-r--r--  arch/x86/kernel/kprobes.c | 664
-rw-r--r--  arch/x86/kernel/kvm.c | 8
-rw-r--r--  arch/x86/kernel/kvmclock.c | 15
-rw-r--r--  arch/x86/kernel/ldt.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 1
-rw-r--r--  arch/x86/kernel/mca_32.c | 1
-rw-r--r--  arch/x86/kernel/microcode_core.c | 15
-rw-r--r--  arch/x86/kernel/module.c | 1
-rw-r--r--  arch/x86/kernel/msr.c | 1
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 37
-rw-r--r--  arch/x86/kernel/paravirt.c | 6
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 10
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 6
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 17
-rw-r--r--  arch/x86/kernel/probe_roms.c | 1
-rw-r--r--  arch/x86/kernel/process.c | 150
-rw-r--r--  arch/x86/kernel/process_32.c | 63
-rw-r--r--  arch/x86/kernel/process_64.c | 113
-rw-r--r--  arch/x86/kernel/ptrace.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 22
-rw-r--r--  arch/x86/kernel/signal.c | 1
-rw-r--r--  arch/x86/kernel/smpboot.c | 32
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 34
-rw-r--r--  arch/x86/kernel/tboot.c | 9
-rw-r--r--  arch/x86/kernel/tce_64.c | 1
-rw-r--r--  arch/x86/kernel/time.c | 3
-rw-r--r--  arch/x86/kernel/tls.c | 5
-rw-r--r--  arch/x86/kernel/traps.c | 2
-rw-r--r--  arch/x86/kernel/tsc.c | 17
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 29
-rw-r--r--  arch/x86/kernel/vm86_32.c | 2
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 25
-rw-r--r--  arch/x86/kernel/x86_init.c | 5
-rw-r--r--  arch/x86/kernel/xsave.c | 1
88 files changed, 2563 insertions, 1308 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8c8c365a3bc3..d23d83577d6b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_OPTPROBES)		+= kprobes-opt.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_DOUBLEFAULT)	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8e..a415b1f44365 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 	 * to not preallocating memory for all NR_CPUS
 	 * when we use CPU hotplug.
 	 */
-	if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+	if (!apic->apic_id_valid(apic_id) && enabled)
 		printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 	else
 		acpi_register_lapic(apic_id, enabled);
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
@@ -642,6 +642,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	kfree(buffer.pointer);
 	buffer.length = ACPI_ALLOCATE_BUFFER;
 	buffer.pointer = NULL;
+	lapic = NULL;
 
 	if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
 		goto out;
@@ -650,7 +651,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 		goto free_tmp_map;
 
 	cpumask_copy(tmp_map, cpu_present_mask);
-	acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+	acpi_register_lapic(physid, ACPI_MADT_ENABLED);
 
 	/*
 	 * If mp_register_lapic successfully generates a new logical cpu
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb2a201..d2b7f27781bc 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,6 +14,7 @@
 #include <acpi/processor.h>
 #include <asm/acpi.h>
 #include <asm/mwait.h>
+#include <asm/special_insns.h>
 
 /*
  * Initialize bm_flags based on the CPU cache properties
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b1e7c7f7a0af..e66311200cbd 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -477,7 +477,7 @@ error:
 /* allocate and map a coherent mapping */
 static void *
 gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
-		    gfp_t flag)
+		    gfp_t flag, struct dma_attrs *attrs)
 {
 	dma_addr_t paddr;
 	unsigned long align_mask;
@@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
 		}
 		__free_pages(page, get_order(size));
 	} else
-		return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+		return dma_generic_alloc_coherent(dev, size, dma_addr, flag,
+						  attrs);
 
 	return NULL;
 }
@@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
 /* free a coherent mapping */
 static void
 gart_free_coherent(struct device *dev, size_t size, void *vaddr,
-		   dma_addr_t dma_addr)
+		   dma_addr_t dma_addr, struct dma_attrs *attrs)
 {
 	gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
 	free_pages((unsigned long)vaddr, get_order(size));
@@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = {
 	.unmap_sg			= gart_unmap_sg,
 	.map_page			= gart_map_page,
 	.unmap_page			= gart_unmap_page,
-	.alloc_coherent			= gart_alloc_coherent,
-	.free_coherent			= gart_free_coherent,
+	.alloc				= gart_alloc_coherent,
+	.free				= gart_free_coherent,
 	.mapping_error			= gart_mapping_error,
 };
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2eec05b6d1b8..11544d8f1e97 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -383,20 +383,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
 
 static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 {
-	unsigned int rsvd;			/* 0: uninitialized */
+	unsigned int rsvd, vector;
 
 	if (offset >= APIC_EILVT_NR_MAX)
 		return ~0;
 
-	rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+	rsvd = atomic_read(&eilvt_offsets[offset]);
 	do {
-		if (rsvd &&
-		    !eilvt_entry_is_changeable(rsvd, new))
+		vector = rsvd & ~APIC_EILVT_MASKED;	/* 0: unassigned */
+		if (vector && !eilvt_entry_is_changeable(vector, new))
 			/* may not change if vectors are different */
 			return rsvd;
 		rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
 	} while (rsvd != new);
 
+	rsvd &= ~APIC_EILVT_MASKED;
+	if (rsvd && rsvd != vector)
+		pr_info("LVT offset %d assigned for vector 0x%02x\n",
+			offset, rsvd);
+
 	return new;
 }
 
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdded6f2b..359b6899a36c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -180,6 +180,7 @@ static struct apic apic_flat = {
 	.name				= "flat",
 	.probe				= flat_probe,
 	.acpi_madt_oem_check		= flat_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
@@ -337,6 +338,7 @@ static struct apic apic_physflat = {
 	.name				= "physical flat",
 	.probe				= physflat_probe,
 	.acpi_madt_oem_check		= physflat_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82bc655c..634ae6cdd5c9 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -124,6 +124,7 @@ struct apic apic_noop = {
 	.probe				= noop_probe,
 	.acpi_madt_oem_check		= NULL,
 
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= noop_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c1cd99..899803e03214 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void)
 	return get_apic_id(apic_read(APIC_ID));
 }
 
+static int numachip_apic_id_valid(int apicid)
+{
+	/* Trust what bootloader passes in MADT */
+	return 1;
+}
+
 static int numachip_apic_id_registered(void)
 {
 	return physid_isset(read_xapic_id(), phys_cpu_present_map);
@@ -238,6 +244,7 @@ static struct apic apic_numachip __refconst = {
 	.name				= "NumaConnect system",
 	.probe				= numachip_probe,
 	.acpi_madt_oem_check		= numachip_acpi_madt_oem_check,
+	.apic_id_valid			= numachip_apic_id_valid,
 	.apic_id_registered		= numachip_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead01137..0cdec7065aff 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -198,6 +198,7 @@ static struct apic apic_bigsmp = {
 	.name				= "bigsmp",
 	.probe				= probe_bigsmp,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= bigsmp_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5d513bc47b6b..e42d1d3b9134 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = {
 	.name				= "es7000",
 	.probe				= probe_es7000,
 	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= es7000_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
@@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = {
 	.name				= "es7000",
 	.probe				= probe_es7000,
 	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= es7000_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1d..e88300d8e80a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -64,9 +64,28 @@
 #include <asm/apic.h>
 
 #define __apicdebuginit(type) static type __init
+
 #define for_each_irq_pin(entry, head) \
 	for (entry = head; entry; entry = entry->next)
 
+static void __init __ioapic_init_mappings(void);
+
+static unsigned int __io_apic_read (unsigned int apic, unsigned int reg);
+static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
+static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
+
+static struct io_apic_ops io_apic_ops = {
+	.init	= __ioapic_init_mappings,
+	.read	= __io_apic_read,
+	.write	= __io_apic_write,
+	.modify	= __io_apic_modify,
+};
+
+void __init set_io_apic_ops(const struct io_apic_ops *ops)
+{
+	io_apic_ops = *ops;
+}
+
 /*
  * Is the SiS APIC rmw bug present ?
  * -1 = don't know, 0 = no, 1 = yes
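
The new io_apic_ops indirection above lets a platform swap in its own register accessors before ioapic_and_gsi_init() invokes io_apic_ops.init(). A rough, illustrative sketch of an override (not part of this commit; every my_* name below is a hypothetical placeholder expected to use the same prototypes as the __io_apic_* defaults):

	static struct io_apic_ops my_io_apic_ops = {
		.init	= my_ioapic_init,	/* hypothetical replacements with the */
		.read	= my_io_apic_read,	/* same signatures as the defaults    */
		.write	= my_io_apic_write,
		.modify	= my_io_apic_modify,
	};

	/* must run early, before io_apic_ops.init() is called */
	set_io_apic_ops(&my_io_apic_ops);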
@@ -294,6 +313,22 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 	irq_free_desc(at);
 }
 
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+	return io_apic_ops.read(apic, reg);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	io_apic_ops.write(apic, reg, value);
+}
+
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	io_apic_ops.modify(apic, reg, value);
+}
+
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -314,16 +349,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 	writel(vector, &io_apic->eoi);
 }
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	return readl(&io_apic->data);
 }
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
+
 	writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
@@ -334,7 +370,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
  *
  * Older SiS APIC requires we rewrite the index register
  */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -377,6 +413,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
 
 	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
 	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
 	return eu.entry;
 }
 
@@ -384,9 +421,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
 {
 	union entry_union eu;
 	unsigned long flags;
+
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	eu.entry = __ioapic_read_entry(apic, pin);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
 	return eu.entry;
 }
 
@@ -396,8 +435,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
  * the interrupt, and we need to make sure the entry is fully populated
  * before that happens.
  */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
 	union entry_union eu = {{0, 0}};
 
@@ -409,6 +447,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
 	unsigned long flags;
+
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	__ioapic_write_entry(apic, pin, e);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -435,8 +474,7 @@ static void ioapic_mask_entry(int apic, int pin)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int
-__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
 	struct irq_pin_list **last, *entry;
 
@@ -521,6 +559,7 @@ static void io_apic_sync(struct irq_pin_list *entry)
 	 * a dummy read from the IO-APIC
 	 */
 	struct io_apic __iomem *io_apic;
+
 	io_apic = io_apic_base(entry->apic);
 	readl(&io_apic->data);
 }
@@ -2512,21 +2551,73 @@ static void ack_apic_edge(struct irq_data *data)
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(struct irq_data *data)
-{
-	struct irq_cfg *cfg = data->chip_data;
-	int i, do_unmask_irq = 0, irq = data->irq;
-	unsigned long v;
-
-	irq_complete_move(cfg);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		do_unmask_irq = 1;
 		mask_ioapic(cfg);
+		return true;
 	}
+	return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+				      struct irq_cfg *cfg, bool masked)
+{
+	if (unlikely(masked)) {
+		/* Only migrate the irq if the ack has been received.
+		 *
+		 * On rare occasions the broadcast level triggered ack gets
+		 * delayed going to ioapics, and if we reprogram the
+		 * vector while Remote IRR is still set the irq will never
+		 * fire again.
+		 *
+		 * To prevent this scenario we read the Remote IRR bit
+		 * of the ioapic. This has two effects.
+		 * - On any sane system the read of the ioapic will
+		 *   flush writes (and acks) going to the ioapic from
+		 *   this cpu.
+		 * - We get to see if the ACK has actually been delivered.
+		 *
+		 * Based on failed experiments of reprogramming the
+		 * ioapic entry from outside of irq context starting
+		 * with masking the ioapic entry and then polling until
+		 * Remote IRR was clear before reprogramming the
+		 * ioapic I don't trust the Remote IRR bit to be
+		 * completey accurate.
+		 *
+		 * However there appears to be no other way to plug
+		 * this race, so if the Remote IRR bit is not
+		 * accurate and is causing problems then it is a hardware bug
+		 * and you can go talk to the chipset vendor about it.
+		 */
+		if (!io_apic_level_ack_pending(cfg))
+			irq_move_masked_irq(data);
+		unmask_ioapic(cfg);
+	}
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+	return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+				      struct irq_cfg *cfg, bool masked)
+{
+}
 #endif
 
+static void ack_apic_level(struct irq_data *data)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	int i, irq = data->irq;
+	unsigned long v;
+	bool masked;
+
+	irq_complete_move(cfg);
+	masked = ioapic_irqd_mask(data, cfg);
+
 	/*
 	 * It appears there is an erratum which affects at least version 0x11
 	 * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -2581,38 +2672,7 @@ static void ack_apic_level(struct irq_data *data)
 		eoi_ioapic_irq(irq, cfg);
 	}
 
-	/* Now we can move and renable the irq */
-	if (unlikely(do_unmask_irq)) {
-		/* Only migrate the irq if the ack has been received.
-		 *
-		 * On rare occasions the broadcast level triggered ack gets
-		 * delayed going to ioapics, and if we reprogram the
-		 * vector while Remote IRR is still set the irq will never
-		 * fire again.
-		 *
-		 * To prevent this scenario we read the Remote IRR bit
-		 * of the ioapic. This has two effects.
-		 * - On any sane system the read of the ioapic will
-		 *   flush writes (and acks) going to the ioapic from
-		 *   this cpu.
-		 * - We get to see if the ACK has actually been delivered.
-		 *
-		 * Based on failed experiments of reprogramming the
-		 * ioapic entry from outside of irq context starting
-		 * with masking the ioapic entry and then polling until
-		 * Remote IRR was clear before reprogramming the
-		 * ioapic I don't trust the Remote IRR bit to be
-		 * completey accurate.
-		 *
-		 * However there appears to be no other way to plug
-		 * this race, so if the Remote IRR bit is not
-		 * accurate and is causing problems then it is a hardware bug
-		 * and you can go talk to the chipset vendor about it.
-		 */
-		if (!io_apic_level_ack_pending(cfg))
-			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
-	}
+	ioapic_irqd_unmask(data, cfg, masked);
 }
 
 #ifdef CONFIG_IRQ_REMAP
@@ -3873,6 +3933,11 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 
 void __init ioapic_and_gsi_init(void)
 {
+	io_apic_ops.init();
+}
+
+static void __init __ioapic_init_mappings(void)
+{
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
 	int i;
@@ -3967,18 +4032,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 static __init int bad_ioapic(unsigned long address)
 {
 	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
-		       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+		pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+			MAX_IO_APICS, nr_ioapics);
 		return 1;
 	}
 	if (!address) {
-		printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
+		pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
 		return 1;
 	}
 	return 0;
 }
 
+static __init int bad_ioapic_register(int idx)
+{
+	union IO_APIC_reg_00 reg_00;
+	union IO_APIC_reg_01 reg_01;
+	union IO_APIC_reg_02 reg_02;
+
+	reg_00.raw = io_apic_read(idx, 0);
+	reg_01.raw = io_apic_read(idx, 1);
+	reg_02.raw = io_apic_read(idx, 2);
+
+	if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+		pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+			mpc_ioapic_addr(idx));
+		return 1;
+	}
+
+	return 0;
+}
+
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
@@ -3995,6 +4078,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	ioapics[idx].mp_config.apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+
+	if (bad_ioapic_register(idx)) {
+		clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+		return;
+	}
+
 	ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
 	ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
 
@@ -4015,10 +4104,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	if (gsi_cfg->gsi_end >= gsi_top)
 		gsi_top = gsi_cfg->gsi_end + 1;
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
-	       mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
-	       gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+	pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+		idx, mpc_ioapic_id(idx),
+		mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+		gsi_cfg->gsi_base, gsi_cfg->gsi_end);
 
 	nr_ioapics++;
 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index c4a61ca1349a..00d2422ca7c9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = {
 	.name				= "NUMAQ",
 	.probe				= probe_numaq,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= numaq_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3412f4..ff2c1b9aac4d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -92,6 +92,7 @@ static struct apic apic_default = {
 	.name				= "default",
 	.probe				= probe_default,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= default_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 19114423c58c..fea000b27f07 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -496,6 +496,7 @@ static struct apic apic_summit = {
 	.name				= "summit",
 	.probe				= probe_summit,
 	.acpi_madt_oem_check		= summit_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= summit_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 500795875827..48f3103b3c93 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = {
 	.name				= "cluster x2apic",
 	.probe				= x2apic_cluster_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
+	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21e..8a778db45e3a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = {
 	.name				= "physical x2apic",
 	.probe				= x2apic_phys_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
+	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b88aa19..87bfa69e216e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector)
 	uv_send_IPI_mask(cpu_online_mask, vector);
 }
 
+static int uv_apic_id_valid(int apicid)
+{
+	return 1;
+}
+
 static int uv_apic_id_registered(void)
 {
 	return 1;
@@ -351,6 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.name				= "UV large system",
 	.probe				= uv_probe,
 	.acpi_madt_oem_check		= uv_acpi_madt_oem_check,
+	.apic_id_valid			= uv_apic_id_valid,
 	.apic_id_registered		= uv_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623cbe263..459e78cbf61e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -231,7 +231,6 @@
 #include <linux/syscore_ops.h>
 #include <linux/i8253.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/olpc.h>
@@ -1234,8 +1233,7 @@ static int suspend(int vetoable)
 	struct apm_user *as;
 
 	dpm_suspend_start(PMSG_SUSPEND);
-
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1259,9 +1257,9 @@ static int suspend(int vetoable)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
-
+	dpm_resume_start(PMSG_RESUME);
 	dpm_resume_end(PMSG_RESUME);
+
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
 	for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1275,7 @@ static void standby(void)
 {
 	int err;
 
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1291,7 +1289,7 @@ static void standby(void)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
+	dpm_resume_start(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dccdcfa..6ab6aa2fdfdd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 obj-y			+= rdrand.o
+obj-y			+= match.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c0f7d68d318f..67e258362a3d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <linux/topology.h>
 #include <linux/cpumask.h>
@@ -28,6 +29,7 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
@@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {
 	{ 0xc0011000, 0xc001103b},
 };
 
-static void __cpuinit print_cpu_msr(void)
+static void __cpuinit __print_cpu_msr(void)
 {
 	unsigned index_min, index_max;
 	unsigned index;
@@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 	else
 		printk(KERN_CONT "\n");
 
-#ifdef CONFIG_SMP
+	print_cpu_msr(c);
+}
+
+void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+{
 	if (c->cpu_index < show_msr)
-		print_cpu_msr();
-#else
-	if (show_msr)
-		print_cpu_msr();
-#endif
+		__print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
@@ -1045,7 +1047,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 /*
  * Special IST stacks which the CPU switches to when it calls
@@ -1115,7 +1116,6 @@ void debug_stack_reset(void)
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..5502b289341b
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,91 @@
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ *         {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+	const struct x86_cpu_id *m;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+		if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+			continue;
+		if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+			continue;
+		if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+			continue;
+		if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+			continue;
+		return m;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
+
+ssize_t arch_print_cpu_modalias(struct device *dev,
+				struct device_attribute *attr,
+				char *bufptr)
+{
+	int size = PAGE_SIZE;
+	int i, n;
+	char *buf = bufptr;
+
+	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
+		     "model:%04X:feature:",
+		boot_cpu_data.x86_vendor,
+		boot_cpu_data.x86,
+		boot_cpu_data.x86_model);
+	size -= n;
+	buf += n;
+	size -= 1;
+	for (i = 0; i < NCAPINTS*32; i++) {
+		if (boot_cpu_has(i)) {
+			n = snprintf(buf, size, ",%04X", i);
+			if (n >= size) {
+				WARN(1, "x86 features overflow page\n");
+				break;
+			}
+			size -= n;
+			buf += n;
+		}
+	}
+	*buf++ = '\n';
+	return buf - bufptr;
+}
+
+int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf) {
+		arch_print_cpu_modalias(NULL, NULL, buf);
+		add_uevent_var(env, "MODALIAS=%s", buf);
+		kfree(buf);
+	}
+	return 0;
+}
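
For readers picking this interface up for the first time, a minimal usage sketch of the match-table pattern the kernel-doc above describes (illustrative only, not part of this commit; the foo_* names are hypothetical):

	static const struct x86_cpu_id foo_cpu_ids[] = {
		{ X86_VENDOR_INTEL, 6, 0x12 },		/* a specific vendor/family/model */
		{ X86_FEATURE_MATCH(X86_FEATURE_AES) },	/* or a specific CPU feature */
		{}					/* terminating empty entry */
	};

	static int __init foo_init(void)
	{
		if (!x86_match_cpu(foo_cpu_ids))
			return -ENODEV;	/* boot CPU matched no entry */
		return 0;
	}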
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
 #define MASK(x, y)	.mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK	0xfff0
+#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
+#define MCACOD_DATA	0x0134	/* Data Load */
+#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 	MCESEV(
 		NO, "Invalid",
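
To make the new scrub-code pair concrete with one worked example (values illustrative): a status word whose MCACOD field is 0x00C5 falls in the architectural 0xC0-0xCF memory-scrubbing range, and 0x00C5 & MCACOD_SCRUBMSK (0xfff0) yields 0x00C0, i.e. MCACOD_SCRUB, which is exactly what the "memory scrubbing error" entry further down now tests for.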
@@ -102,11 +109,24 @@ static struct severity {
 		SER, BITCLR(MCI_STATUS_S)
 		),
 
-	/* AR add known MCACODs here */
 	MCESEV(
 		PANIC, "Action required with lost events",
 		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
 		),
+
+	/* known AR MCACODs: */
+#ifdef	CONFIG_MEMORY_FAILURE
+	MCESEV(
+		KEEP, "HT thread notices Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		MCGMASK(MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		AR, "Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		USER
+		),
+#endif
 	MCESEV(
 		PANIC, "Action required: unknown MCACOD",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
 	/* known AO MCACODs: */
 	MCESEV(
 		AO, "Action optional: memory scrubbing error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
 		),
 	MCESEV(
 		AO, "Action optional: last level cache writeback error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
 		),
 	MCESEV(
 		SOME, "Action optional: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..d086a09c087d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void)
 {
 	unsigned int next, i, prev = 0;
 
-	next = rcu_dereference_check_mce(mcelog.next);
+	next = ACCESS_ONCE(mcelog.next);
 
 	do {
 		struct mce *m;
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
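
A quick worked example of the granularity masking introduced above (numbers illustrative): if MCI_MISC_ADDR_LSB(m->misc) reports 6, the shift pair clears the low 6 bits of the recovered address, so a raw m->addr of 0x12345abc becomes 0x12345a80; with a page-granularity report (an LSB of 12) the same address rounds down to 0x12345000.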
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define	MCE_INFO_MAX	16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
 		mce_clear_state(toclear);
 
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
-	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done. Try to kill as little as possible. If we can kill just
-	 * one task, do that. If the user has set the tolerance very
-	 * high, don't try to do anything at all.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
+	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+	return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
  */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+	struct mce_info *mi = mce_find_info();
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
1211 /* Not more than two messages every minute */ 1286 /* Not more than two messages every minute */
1212 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1287 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1213 1288
1214 clear_thread_flag(TIF_MCE_NOTIFY);
1215
1216 if (test_and_clear_bit(0, &mce_need_notify)) { 1289 if (test_and_clear_bit(0, &mce_need_notify)) {
1217 /* wake processes polling /dev/mcelog */ 1290 /* wake processes polling /dev/mcelog */
1218 wake_up_interruptible(&mce_chrdev_wait); 1291 wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1541 /* Error or no more MCE record */ 1614 /* Error or no more MCE record */
1542 if (rc <= 0) { 1615 if (rc <= 0) {
1543 mce_apei_read_done = 1; 1616 mce_apei_read_done = 1;
1617 /*
1618 * When ERST is disabled, mce_chrdev_read() should return
1619 * "no record" instead of "no device."
1620 */
1621 if (rc == -ENODEV)
1622 return 0;
1544 return rc; 1623 return rc;
1545 } 1624 }
1546 rc = -EFAULT; 1625 rc = -EFAULT;
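The hunk above makes a disabled ERST backend look like "no record" to readers of /dev/mcelog instead of failing the read with -ENODEV. A minimal userspace sketch of what that distinction means for a consumer; it only assumes the standard character-device read interface and leaves record parsing to the mcelog tool (the device may require a buffer big enough for the whole log, hence the generous size):

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[64 * 1024];	/* large enough for the full MCE log */
	ssize_t n;
	int fd;

	fd = open("/dev/mcelog", O_RDONLY);	/* usually root-only */
	if (fd < 0) {
		perror("open /dev/mcelog");
		return 1;
	}

	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("%zd bytes of MCE records pending\n", n);
	else if (n == 0)
		printf("no MCE records\n");	/* what the ERST fix above now returns */
	else
		printf("read error: %s\n", strerror(errno));	/* pre-fix: ENODEV */

	close(fd);
	return 0;
}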
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {
1859 .dev_name = "machinecheck", 1938 .dev_name = "machinecheck",
1860}; 1939};
1861 1940
1862struct device *mce_device[CONFIG_NR_CPUS]; 1941DEFINE_PER_CPU(struct device *, mce_device);
1863 1942
1864__cpuinitdata 1943__cpuinitdata
1865void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1944void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
2038 goto error2; 2117 goto error2;
2039 } 2118 }
2040 cpumask_set_cpu(cpu, mce_device_initialized); 2119 cpumask_set_cpu(cpu, mce_device_initialized);
2041 mce_device[cpu] = dev; 2120 per_cpu(mce_device, cpu) = dev;
2042 2121
2043 return 0; 2122 return 0;
2044error2: 2123error2:
@@ -2055,7 +2134,7 @@ error:
2055 2134
2056static __cpuinit void mce_device_remove(unsigned int cpu) 2135static __cpuinit void mce_device_remove(unsigned int cpu)
2057{ 2136{
2058 struct device *dev = mce_device[cpu]; 2137 struct device *dev = per_cpu(mce_device, cpu);
2059 int i; 2138 int i;
2060 2139
2061 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 2140 if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
2069 2148
2070 device_unregister(dev); 2149 device_unregister(dev);
2071 cpumask_clear_cpu(cpu, mce_device_initialized); 2150 cpumask_clear_cpu(cpu, mce_device_initialized);
2072 mce_device[cpu] = NULL; 2151 per_cpu(mce_device, cpu) = NULL;
2073} 2152}
2074 2153
2075/* Make sure there are no machine checks on offlined CPUs. */ 2154/* Make sure there are no machine checks on offlined CPUs. */
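The mce_device change above replaces a CONFIG_NR_CPUS-sized pointer array with a per-CPU variable accessed through per_cpu(). A minimal, hypothetical module-style sketch of that pattern; the names and payload type are illustrative only, and real code (as mce_device_create()/mce_device_remove() do) also handles CPU hotplug through notifiers:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/slab.h>

struct demo_state {
	int cpu;
	unsigned long count;
};

/* one pointer per possible CPU, NULL until that CPU is set up */
static DEFINE_PER_CPU(struct demo_state *, demo_state_ptr);

static int __init demo_init(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct demo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

		if (!st)
			return -ENOMEM;
		st->cpu = cpu;
		per_cpu(demo_state_ptr, cpu) = st;	/* same accessor the patch switches to */
	}
	return 0;
}

static void __exit demo_exit(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		kfree(per_cpu(demo_state_ptr, cpu));
		per_cpu(demo_state_ptr, cpu) = NULL;
	}
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");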
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index e4eeaaf58a47..99b57179f912 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
523{ 523{
524 int i, err = 0; 524 int i, err = 0;
525 struct threshold_bank *b = NULL; 525 struct threshold_bank *b = NULL;
526 struct device *dev = mce_device[cpu]; 526 struct device *dev = per_cpu(mce_device, cpu);
527 char name[32]; 527 char name[32];
528 528
529 sprintf(name, "threshold_bank%i", bank); 529 sprintf(name, "threshold_bank%i", bank);
@@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
587 if (i == cpu) 587 if (i == cpu)
588 continue; 588 continue;
589 589
590 dev = mce_device[i]; 590 dev = per_cpu(mce_device, i);
591 if (dev) 591 if (dev)
592 err = sysfs_create_link(&dev->kobj,b->kobj, name); 592 err = sysfs_create_link(&dev->kobj,b->kobj, name);
593 if (err) 593 if (err)
@@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
667#ifdef CONFIG_SMP 667#ifdef CONFIG_SMP
668 /* sibling symlink */ 668 /* sibling symlink */
669 if (shared_bank[bank] && b->blocks->cpu != cpu) { 669 if (shared_bank[bank] && b->blocks->cpu != cpu) {
670 sysfs_remove_link(&mce_device[cpu]->kobj, name); 670 dev = per_cpu(mce_device, cpu);
671 sysfs_remove_link(&dev->kobj, name);
671 per_cpu(threshold_banks, cpu)[bank] = NULL; 672 per_cpu(threshold_banks, cpu)[bank] = NULL;
672 673
673 return; 674 return;
@@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
679 if (i == cpu) 680 if (i == cpu)
680 continue; 681 continue;
681 682
682 dev = mce_device[i]; 683 dev = per_cpu(mce_device, i);
683 if (dev) 684 if (dev)
684 sysfs_remove_link(&dev->kobj, name); 685 sysfs_remove_link(&dev->kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 686 per_cpu(threshold_banks, i)[bank] = NULL;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 5c0e6533d9bc..2d5454cd2c4f 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,7 +9,6 @@
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h> 12#include <asm/mce.h>
14#include <asm/msr.h> 13#include <asm/msr.h>
15 14
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 67bb17a37a0a..47a1870279aa 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -25,7 +25,6 @@
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26 26
27#include <asm/processor.h> 27#include <asm/processor.h>
28#include <asm/system.h>
29#include <asm/apic.h> 28#include <asm/apic.h>
30#include <asm/idle.h> 29#include <asm/idle.h>
31#include <asm/mce.h> 30#include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 54060f565974..2d7998fb628c 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,7 +8,6 @@
8#include <linux/init.h> 8#include <linux/init.h>
9 9
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/system.h>
12#include <asm/mce.h> 11#include <asm/mce.h>
13#include <asm/msr.h> 12#include <asm/msr.h>
14 13
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 97b26356e9ee..75772ae6c65f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,7 +12,6 @@
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13#include <asm/cpufeature.h> 13#include <asm/cpufeature.h>
14#include <asm/tlbflush.h> 14#include <asm/tlbflush.h>
15#include <asm/system.h>
16#include <asm/mtrr.h> 15#include <asm/mtrr.h>
17#include <asm/msr.h> 16#include <asm/msr.h>
18#include <asm/pat.h> 17#include <asm/pat.h>
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1c52bdbb9b8b..bb8e03407e18 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -352,6 +352,36 @@ int x86_setup_perfctr(struct perf_event *event)
352 return 0; 352 return 0;
353} 353}
354 354
355/*
356 * check that branch_sample_type is compatible with
357 * settings needed for precise_ip > 1 which implies
358 * using the LBR to capture ALL taken branches at the
359 * priv levels of the measurement
360 */
361static inline int precise_br_compat(struct perf_event *event)
362{
363 u64 m = event->attr.branch_sample_type;
364 u64 b = 0;
365
366 /* must capture all branches */
367 if (!(m & PERF_SAMPLE_BRANCH_ANY))
368 return 0;
369
370 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
371
372 if (!event->attr.exclude_user)
373 b |= PERF_SAMPLE_BRANCH_USER;
374
375 if (!event->attr.exclude_kernel)
376 b |= PERF_SAMPLE_BRANCH_KERNEL;
377
378 /*
379 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
380 */
381
382 return m == b;
383}
384
355int x86_pmu_hw_config(struct perf_event *event) 385int x86_pmu_hw_config(struct perf_event *event)
356{ 386{
357 if (event->attr.precise_ip) { 387 if (event->attr.precise_ip) {
@@ -368,6 +398,36 @@ int x86_pmu_hw_config(struct perf_event *event)
368 398
369 if (event->attr.precise_ip > precise) 399 if (event->attr.precise_ip > precise)
370 return -EOPNOTSUPP; 400 return -EOPNOTSUPP;
401 /*
402 * check that PEBS LBR correction does not conflict with
403 * whatever the user is asking with attr->branch_sample_type
404 */
405 if (event->attr.precise_ip > 1) {
406 u64 *br_type = &event->attr.branch_sample_type;
407
408 if (has_branch_stack(event)) {
409 if (!precise_br_compat(event))
410 return -EOPNOTSUPP;
411
412 /* branch_sample_type is compatible */
413
414 } else {
415 /*
416 * user did not specify branch_sample_type
417 *
418 * For PEBS fixups, we capture all
419 * the branches at the priv level of the
420 * event.
421 */
422 *br_type = PERF_SAMPLE_BRANCH_ANY;
423
424 if (!event->attr.exclude_user)
425 *br_type |= PERF_SAMPLE_BRANCH_USER;
426
427 if (!event->attr.exclude_kernel)
428 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
429 }
430 }
371 } 431 }
372 432
373 /* 433 /*
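With the precise_ip > 1 path above, an explicit branch_sample_type must pass precise_br_compat(), otherwise the kernel synthesizes PERF_SAMPLE_BRANCH_ANY at the event's priv levels. A hedged userspace sketch of a request this code accepts; it needs a kernel and UAPI headers with this series (v3.4+ perf ABI) and PEBS-capable hardware, and error handling is trimmed:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.precise_ip = 2;		/* PEBS with LBR-based skid correction */
	attr.exclude_kernel = 1;	/* user-only event... */
	/* ...so the filter must cover all branches at that same priv level,
	 * which is exactly what precise_br_compat() verifies: */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER;

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");	/* EOPNOTSUPP when the combination is rejected */
	else
		printf("event opened, fd=%ld\n", fd);
	return 0;
}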
@@ -425,6 +485,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
425 /* mark unused */ 485 /* mark unused */
426 event->hw.extra_reg.idx = EXTRA_REG_NONE; 486 event->hw.extra_reg.idx = EXTRA_REG_NONE;
427 487
488 /* mark not used */
489 event->hw.extra_reg.idx = EXTRA_REG_NONE;
490 event->hw.branch_reg.idx = EXTRA_REG_NONE;
491
428 return x86_pmu.hw_config(event); 492 return x86_pmu.hw_config(event);
429} 493}
430 494
@@ -578,14 +642,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
578 /* Prefer fixed purpose counters */ 642 /* Prefer fixed purpose counters */
579 if (x86_pmu.num_counters_fixed) { 643 if (x86_pmu.num_counters_fixed) {
580 idx = X86_PMC_IDX_FIXED; 644 idx = X86_PMC_IDX_FIXED;
581 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { 645 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
582 if (!__test_and_set_bit(idx, sched->state.used)) 646 if (!__test_and_set_bit(idx, sched->state.used))
583 goto done; 647 goto done;
584 } 648 }
585 } 649 }
586 /* Grab the first unused counter starting with idx */ 650 /* Grab the first unused counter starting with idx */
587 idx = sched->state.counter; 651 idx = sched->state.counter;
588 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { 652 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
589 if (!__test_and_set_bit(idx, sched->state.used)) 653 if (!__test_and_set_bit(idx, sched->state.used))
590 goto done; 654 goto done;
591 } 655 }
@@ -1249,6 +1313,11 @@ static void __init pmu_check_apic(void)
1249 pr_info("no hardware sampling interrupt available.\n"); 1313 pr_info("no hardware sampling interrupt available.\n");
1250} 1314}
1251 1315
1316static struct attribute_group x86_pmu_format_group = {
1317 .name = "format",
1318 .attrs = NULL,
1319};
1320
1252static int __init init_hw_perf_events(void) 1321static int __init init_hw_perf_events(void)
1253{ 1322{
1254 struct x86_pmu_quirk *quirk; 1323 struct x86_pmu_quirk *quirk;
@@ -1323,6 +1392,7 @@ static int __init init_hw_perf_events(void)
1323 } 1392 }
1324 1393
1325 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1394 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1395 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1326 1396
1327 pr_info("... version: %d\n", x86_pmu.version); 1397 pr_info("... version: %d\n", x86_pmu.version);
1328 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 1398 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
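The new x86_pmu_format_group exports each PMU_FORMAT_ATTR() string under the PMU's sysfs "format" directory so tools can discover where event, umask, cmask and friends live inside attr.config. A small reader sketch; the /sys path used is the conventional location of the x86 core PMU and is an assumption here, not something spelled out in the patch:

#include <stdio.h>

int main(void)
{
	static const char *dir = "/sys/bus/event_source/devices/cpu/format";
	static const char *names[] = { "event", "umask", "edge", "inv", "cmask" };
	char path[256], line[64];
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", dir, names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* attribute not exported on this PMU/kernel */
		if (fgets(line, sizeof(line), f))
			/* e.g. "event" -> "config:0-7", matching PMU_FORMAT_ATTR(event, "config:0-7") */
			printf("%-6s %s", names[i], line);
		fclose(f);
	}
	return 0;
}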
@@ -1551,6 +1621,9 @@ static int x86_pmu_event_idx(struct perf_event *event)
1551{ 1621{
1552 int idx = event->hw.idx; 1622 int idx = event->hw.idx;
1553 1623
1624 if (!x86_pmu.attr_rdpmc)
1625 return 0;
1626
1554 if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { 1627 if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
1555 idx -= X86_PMC_IDX_FIXED; 1628 idx -= X86_PMC_IDX_FIXED;
1556 idx |= 1 << 30; 1629 idx |= 1 << 30;
@@ -1603,38 +1676,51 @@ static struct attribute_group x86_pmu_attr_group = {
1603 1676
1604static const struct attribute_group *x86_pmu_attr_groups[] = { 1677static const struct attribute_group *x86_pmu_attr_groups[] = {
1605 &x86_pmu_attr_group, 1678 &x86_pmu_attr_group,
1679 &x86_pmu_format_group,
1606 NULL, 1680 NULL,
1607}; 1681};
1608 1682
1683static void x86_pmu_flush_branch_stack(void)
1684{
1685 if (x86_pmu.flush_branch_stack)
1686 x86_pmu.flush_branch_stack();
1687}
1688
1609static struct pmu pmu = { 1689static struct pmu pmu = {
1610 .pmu_enable = x86_pmu_enable, 1690 .pmu_enable = x86_pmu_enable,
1611 .pmu_disable = x86_pmu_disable, 1691 .pmu_disable = x86_pmu_disable,
1612 1692
1613 .attr_groups = x86_pmu_attr_groups, 1693 .attr_groups = x86_pmu_attr_groups,
1614 1694
1615 .event_init = x86_pmu_event_init, 1695 .event_init = x86_pmu_event_init,
1616 1696
1617 .add = x86_pmu_add, 1697 .add = x86_pmu_add,
1618 .del = x86_pmu_del, 1698 .del = x86_pmu_del,
1619 .start = x86_pmu_start, 1699 .start = x86_pmu_start,
1620 .stop = x86_pmu_stop, 1700 .stop = x86_pmu_stop,
1621 .read = x86_pmu_read, 1701 .read = x86_pmu_read,
1622 1702
1623 .start_txn = x86_pmu_start_txn, 1703 .start_txn = x86_pmu_start_txn,
1624 .cancel_txn = x86_pmu_cancel_txn, 1704 .cancel_txn = x86_pmu_cancel_txn,
1625 .commit_txn = x86_pmu_commit_txn, 1705 .commit_txn = x86_pmu_commit_txn,
1626 1706
1627 .event_idx = x86_pmu_event_idx, 1707 .event_idx = x86_pmu_event_idx,
1708 .flush_branch_stack = x86_pmu_flush_branch_stack,
1628}; 1709};
1629 1710
1630void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) 1711void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1631{ 1712{
1713 userpg->cap_usr_time = 0;
1714 userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
1715 userpg->pmc_width = x86_pmu.cntval_bits;
1716
1632 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 1717 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
1633 return; 1718 return;
1634 1719
1635 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 1720 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
1636 return; 1721 return;
1637 1722
1723 userpg->cap_usr_time = 1;
1638 userpg->time_mult = this_cpu_read(cyc2ns); 1724 userpg->time_mult = this_cpu_read(cyc2ns);
1639 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1725 userpg->time_shift = CYC2NS_SCALE_FACTOR;
1640 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1726 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
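arch_perf_update_userpage() now advertises cap_usr_rdpmc and pmc_width, and x86_pmu_event_idx() hands out the index that self-monitoring code feeds to RDPMC (0 when rdpmc is disabled, fixed counters OR'd with 1 << 30). A hedged self-monitoring sketch built on the mmap-page fields; the field names match this series (later kernels renamed them to cap_user_*), and executing RDPMC in userspace needs the rdpmc sysfs attribute enabled, which this series turns on by default:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return (uint64_t)hi << 32 | lo;
}

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	uint64_t count;
	uint32_t seq, idx;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* lock-free read of the counter, as described in perf_event.h */
	do {
		seq = pc->lock;
		__sync_synchronize();
		idx = pc->index;	/* x86_pmu_event_idx() result, 0 = rdpmc not usable */
		count = pc->offset;
		if (pc->cap_usr_rdpmc && idx) {
			int64_t pmc = rdpmc(idx - 1);

			/* sign-extend from pmc_width bits */
			pmc <<= 64 - pc->pmc_width;
			pmc >>= 64 - pc->pmc_width;
			count += pmc;
		}
		__sync_synchronize();
	} while (pc->lock != seq);

	printf("instructions so far: %llu (pmc_width=%u)\n",
	       (unsigned long long)count, pc->pmc_width);
	return 0;
}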
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 82db83b5c3bc..6638aaf54493 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
33 33
34 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ 34 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
35 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ 35 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
36 EXTRA_REG_LBR = 2, /* lbr_select */
36 37
37 EXTRA_REG_MAX /* number of entries needed */ 38 EXTRA_REG_MAX /* number of entries needed */
38}; 39};
@@ -130,6 +131,8 @@ struct cpu_hw_events {
130 void *lbr_context; 131 void *lbr_context;
131 struct perf_branch_stack lbr_stack; 132 struct perf_branch_stack lbr_stack;
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 133 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
134 struct er_account *lbr_sel;
135 u64 br_sel;
133 136
134 /* 137 /*
135 * Intel host/guest exclude bits 138 * Intel host/guest exclude bits
@@ -268,6 +271,29 @@ struct x86_pmu_quirk {
268 void (*func)(void); 271 void (*func)(void);
269}; 272};
270 273
274union x86_pmu_config {
275 struct {
276 u64 event:8,
277 umask:8,
278 usr:1,
279 os:1,
280 edge:1,
281 pc:1,
282 interrupt:1,
283 __reserved1:1,
284 en:1,
285 inv:1,
286 cmask:8,
287 event2:4,
288 __reserved2:4,
289 go:1,
290 ho:1;
291 } bits;
292 u64 value;
293};
294
295#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
296
271/* 297/*
272 * struct x86_pmu - generic x86 pmu 298 * struct x86_pmu - generic x86 pmu
273 */ 299 */
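The X86_CONFIG() helper above replaces hand-built raw PERFEVTSEL constants elsewhere in this series (0x108000c0 and 0x180010e become named fields). A standalone check of the bit packing, assuming GCC's LSB-first bit-field layout on x86, which is what the kernel itself relies on:

#include <stdio.h>
#include <stdint.h>

union x86_pmu_config {
	struct {
		uint64_t event:8, umask:8, usr:1, os:1, edge:1, pc:1,
			 interrupt:1, __reserved1:1, en:1, inv:1, cmask:8,
			 event2:4, __reserved2:4, go:1, ho:1;
	} bits;
	uint64_t value;
};

#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value

int main(void)
{
	/* INST_RETIRED.TOTAL_CYCLES: event=0xc0, inv=1, cmask=16 */
	printf("%#llx\n", (unsigned long long)
	       X86_CONFIG(.event = 0xc0, .inv = 1, .cmask = 16));
	/* UOPS_ISSUED.STALLED_CYCLES: event=0x0e, umask=0x01, inv=1, cmask=1 */
	printf("%#llx\n", (unsigned long long)
	       X86_CONFIG(.event = 0x0e, .umask = 0x01, .inv = 1, .cmask = 1));
	return 0;	/* prints 0x108000c0 and 0x180010e, the old raw literals */
}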
@@ -313,6 +339,7 @@ struct x86_pmu {
313 * sysfs attrs 339 * sysfs attrs
314 */ 340 */
315 int attr_rdpmc; 341 int attr_rdpmc;
342 struct attribute **format_attrs;
316 343
317 /* 344 /*
318 * CPU Hotplug hooks 345 * CPU Hotplug hooks
@@ -321,6 +348,7 @@ struct x86_pmu {
321 void (*cpu_starting)(int cpu); 348 void (*cpu_starting)(int cpu);
322 void (*cpu_dying)(int cpu); 349 void (*cpu_dying)(int cpu);
323 void (*cpu_dead)(int cpu); 350 void (*cpu_dead)(int cpu);
351 void (*flush_branch_stack)(void);
324 352
325 /* 353 /*
326 * Intel Arch Perfmon v2+ 354 * Intel Arch Perfmon v2+
@@ -342,6 +370,8 @@ struct x86_pmu {
342 */ 370 */
343 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 371 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
344 int lbr_nr; /* hardware stack size */ 372 int lbr_nr; /* hardware stack size */
373 u64 lbr_sel_mask; /* LBR_SELECT valid bits */
374 const int *lbr_sel_map; /* lbr_select mappings */
345 375
346 /* 376 /*
347 * Extra registers for events 377 * Extra registers for events
@@ -455,6 +485,15 @@ extern struct event_constraint emptyconstraint;
455 485
456extern struct event_constraint unconstrained; 486extern struct event_constraint unconstrained;
457 487
488static inline bool kernel_ip(unsigned long ip)
489{
490#ifdef CONFIG_X86_32
491 return ip > PAGE_OFFSET;
492#else
493 return (long)ip < 0;
494#endif
495}
496
458#ifdef CONFIG_CPU_SUP_AMD 497#ifdef CONFIG_CPU_SUP_AMD
459 498
460int amd_pmu_init(void); 499int amd_pmu_init(void);
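kernel_ip() moves into the shared header above; on 64-bit it relies on kernel text and data living in the upper canonical half of the address space, so the sign bit of the address decides (the 32-bit branch compares against PAGE_OFFSET instead). A tiny standalone check of the 64-bit case only, with illustrative sample addresses:

#include <stdio.h>

/* same test as the header: negative when interpreted as signed = kernel address */
static int kernel_ip(unsigned long ip)
{
	return (long)ip < 0;
}

int main(void)
{
	unsigned long kernel_text = 0xffffffff81000000UL;	/* typical x86_64 kernel text */
	unsigned long user_text   = 0x0000555555554000UL;	/* typical PIE user mapping */

	printf("%#lx -> %s\n", kernel_text, kernel_ip(kernel_text) ? "kernel" : "user");
	printf("%#lx -> %s\n", user_text,   kernel_ip(user_text)   ? "kernel" : "user");
	return 0;
}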
@@ -535,6 +574,10 @@ void intel_pmu_lbr_init_nhm(void);
535 574
536void intel_pmu_lbr_init_atom(void); 575void intel_pmu_lbr_init_atom(void);
537 576
577void intel_pmu_lbr_init_snb(void);
578
579int intel_pmu_setup_lbr_filter(struct perf_event *event);
580
538int p4_pmu_init(void); 581int p4_pmu_init(void);
539 582
540int p6_pmu_init(void); 583int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67250a52430b..95e7fe1c5f0b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
139 if (ret) 139 if (ret)
140 return ret; 140 return ret;
141 141
142 if (has_branch_stack(event))
143 return -EOPNOTSUPP;
144
142 if (event->attr.exclude_host && event->attr.exclude_guest) 145 if (event->attr.exclude_host && event->attr.exclude_guest)
143 /* 146 /*
144 * When HO == GO == 1 the hardware treats that as GO == HO == 0 147 * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -401,6 +404,21 @@ static void amd_pmu_cpu_dead(int cpu)
401 } 404 }
402} 405}
403 406
407PMU_FORMAT_ATTR(event, "config:0-7,32-35");
408PMU_FORMAT_ATTR(umask, "config:8-15" );
409PMU_FORMAT_ATTR(edge, "config:18" );
410PMU_FORMAT_ATTR(inv, "config:23" );
411PMU_FORMAT_ATTR(cmask, "config:24-31" );
412
413static struct attribute *amd_format_attr[] = {
414 &format_attr_event.attr,
415 &format_attr_umask.attr,
416 &format_attr_edge.attr,
417 &format_attr_inv.attr,
418 &format_attr_cmask.attr,
419 NULL,
420};
421
404static __initconst const struct x86_pmu amd_pmu = { 422static __initconst const struct x86_pmu amd_pmu = {
405 .name = "AMD", 423 .name = "AMD",
406 .handle_irq = x86_pmu_handle_irq, 424 .handle_irq = x86_pmu_handle_irq,
@@ -423,6 +441,8 @@ static __initconst const struct x86_pmu amd_pmu = {
423 .get_event_constraints = amd_get_event_constraints, 441 .get_event_constraints = amd_get_event_constraints,
424 .put_event_constraints = amd_put_event_constraints, 442 .put_event_constraints = amd_put_event_constraints,
425 443
444 .format_attrs = amd_format_attr,
445
426 .cpu_prepare = amd_pmu_cpu_prepare, 446 .cpu_prepare = amd_pmu_cpu_prepare,
427 .cpu_starting = amd_pmu_cpu_starting, 447 .cpu_starting = amd_pmu_cpu_starting,
428 .cpu_dead = amd_pmu_cpu_dead, 448 .cpu_dead = amd_pmu_cpu_dead,
@@ -593,6 +613,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
593 .cpu_dead = amd_pmu_cpu_dead, 613 .cpu_dead = amd_pmu_cpu_dead,
594#endif 614#endif
595 .cpu_starting = amd_pmu_cpu_starting, 615 .cpu_starting = amd_pmu_cpu_starting,
616 .format_attrs = amd_format_attr,
596}; 617};
597 618
598__init int amd_pmu_init(void) 619__init int amd_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..26b3e2fef104 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
385#define NHM_LOCAL_DRAM (1 << 14) 385#define NHM_LOCAL_DRAM (1 << 14)
386#define NHM_NON_DRAM (1 << 15) 386#define NHM_NON_DRAM (1 << 15)
387 387
388#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) 388#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
389#define NHM_REMOTE (NHM_REMOTE_DRAM)
389 390
390#define NHM_DMND_READ (NHM_DMND_DATA_RD) 391#define NHM_DMND_READ (NHM_DMND_DATA_RD)
391#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) 392#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
392#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) 393#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
393 394
394#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) 395#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
395#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) 396#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
396#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) 397#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
397 398
398static __initconst const u64 nehalem_hw_cache_extra_regs 399static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
416 }, 417 },
417 [ C(NODE) ] = { 418 [ C(NODE) ] = {
418 [ C(OP_READ) ] = { 419 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, 420 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, 421 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
421 }, 422 },
422 [ C(OP_WRITE) ] = { 423 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, 424 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, 425 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
425 }, 426 },
426 [ C(OP_PREFETCH) ] = { 427 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, 428 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, 429 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
429 }, 430 },
430 }, 431 },
431}; 432};
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
727 }, 728 },
728}; 729};
729 730
731static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
732{
733 /* user explicitly requested branch sampling */
734 if (has_branch_stack(event))
735 return true;
736
737 /* implicit branch sampling to correct PEBS skid */
738 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
739 return true;
740
741 return false;
742}
743
730static void intel_pmu_disable_all(void) 744static void intel_pmu_disable_all(void)
731{ 745{
732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 746 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
881 cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); 895 cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
882 cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); 896 cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
883 897
898 /*
899 * must be disabled before any actual event
900 * because any event may be combined with LBR
901 */
902 if (intel_pmu_needs_lbr_smpl(event))
903 intel_pmu_lbr_disable(event);
904
884 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 905 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
885 intel_pmu_disable_fixed(hwc); 906 intel_pmu_disable_fixed(hwc);
886 return; 907 return;
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
935 intel_pmu_enable_bts(hwc->config); 956 intel_pmu_enable_bts(hwc->config);
936 return; 957 return;
937 } 958 }
959 /*
960 * must be enabled before any actual event
961 * because any event may be combined with LBR
962 */
963 if (intel_pmu_needs_lbr_smpl(event))
964 intel_pmu_lbr_enable(event);
938 965
939 if (event->attr.exclude_host) 966 if (event->attr.exclude_host)
940 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); 967 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1084,9 @@ again:
1057 1084
1058 data.period = event->hw.last_period; 1085 data.period = event->hw.last_period;
1059 1086
1087 if (has_branch_stack(event))
1088 data.br_stack = &cpuc->lbr_stack;
1089
1060 if (perf_event_overflow(event, &data, regs)) 1090 if (perf_event_overflow(event, &data, regs))
1061 x86_pmu_stop(event, 0); 1091 x86_pmu_stop(event, 0);
1062 } 1092 }
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1123 */ 1153 */
1124static struct event_constraint * 1154static struct event_constraint *
1125__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, 1155__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1126 struct perf_event *event) 1156 struct perf_event *event,
1157 struct hw_perf_event_extra *reg)
1127{ 1158{
1128 struct event_constraint *c = &emptyconstraint; 1159 struct event_constraint *c = &emptyconstraint;
1129 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1130 struct er_account *era; 1160 struct er_account *era;
1131 unsigned long flags; 1161 unsigned long flags;
1132 int orig_idx = reg->idx; 1162 int orig_idx = reg->idx;
1133 1163
1134 /* already allocated shared msr */ 1164 /* already allocated shared msr */
1135 if (reg->alloc) 1165 if (reg->alloc)
1136 return &unconstrained; 1166 return NULL; /* call x86_get_event_constraint() */
1137 1167
1138again: 1168again:
1139 era = &cpuc->shared_regs->regs[reg->idx]; 1169 era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1186,10 @@ again:
1156 reg->alloc = 1; 1186 reg->alloc = 1;
1157 1187
1158 /* 1188 /*
1159 * All events using extra_reg are unconstrained. 1189 * need to call x86_get_event_constraint()
1160 * Avoids calling x86_get_event_constraints() 1190 * to check if associated event has constraints
1161 *
1162 * Must revisit if extra_reg controlling events
1163 * ever have constraints. Worst case we go through
1164 * the regular event constraint table.
1165 */ 1191 */
1166 c = &unconstrained; 1192 c = NULL;
1167 } else if (intel_try_alt_er(event, orig_idx)) { 1193 } else if (intel_try_alt_er(event, orig_idx)) {
1168 raw_spin_unlock_irqrestore(&era->lock, flags); 1194 raw_spin_unlock_irqrestore(&era->lock, flags);
1169 goto again; 1195 goto again;
@@ -1200,11 +1226,23 @@ static struct event_constraint *
1200intel_shared_regs_constraints(struct cpu_hw_events *cpuc, 1226intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1201 struct perf_event *event) 1227 struct perf_event *event)
1202{ 1228{
1203 struct event_constraint *c = NULL; 1229 struct event_constraint *c = NULL, *d;
1204 1230 struct hw_perf_event_extra *xreg, *breg;
1205 if (event->hw.extra_reg.idx != EXTRA_REG_NONE) 1231
1206 c = __intel_shared_reg_get_constraints(cpuc, event); 1232 xreg = &event->hw.extra_reg;
1207 1233 if (xreg->idx != EXTRA_REG_NONE) {
1234 c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
1235 if (c == &emptyconstraint)
1236 return c;
1237 }
1238 breg = &event->hw.branch_reg;
1239 if (breg->idx != EXTRA_REG_NONE) {
1240 d = __intel_shared_reg_get_constraints(cpuc, event, breg);
1241 if (d == &emptyconstraint) {
1242 __intel_shared_reg_put_constraints(cpuc, xreg);
1243 c = d;
1244 }
1245 }
1208 return c; 1246 return c;
1209} 1247}
1210 1248
@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1252 reg = &event->hw.extra_reg; 1290 reg = &event->hw.extra_reg;
1253 if (reg->idx != EXTRA_REG_NONE) 1291 if (reg->idx != EXTRA_REG_NONE)
1254 __intel_shared_reg_put_constraints(cpuc, reg); 1292 __intel_shared_reg_put_constraints(cpuc, reg);
1293
1294 reg = &event->hw.branch_reg;
1295 if (reg->idx != EXTRA_REG_NONE)
1296 __intel_shared_reg_put_constraints(cpuc, reg);
1255} 1297}
1256 1298
1257static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1299static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
1287 * 1329 *
1288 * Thereby we gain a PEBS capable cycle counter. 1330 * Thereby we gain a PEBS capable cycle counter.
1289 */ 1331 */
1290 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ 1332 u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
1333
1291 1334
1292 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); 1335 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1293 event->hw.config = alt_config; 1336 event->hw.config = alt_config;
1294 } 1337 }
1295 1338
1339 if (intel_pmu_needs_lbr_smpl(event)) {
1340 ret = intel_pmu_setup_lbr_filter(event);
1341 if (ret)
1342 return ret;
1343 }
1344
1296 if (event->attr.type != PERF_TYPE_RAW) 1345 if (event->attr.type != PERF_TYPE_RAW)
1297 return 0; 1346 return 0;
1298 1347
@@ -1382,6 +1431,24 @@ static void core_pmu_enable_all(int added)
1382 } 1431 }
1383} 1432}
1384 1433
1434PMU_FORMAT_ATTR(event, "config:0-7" );
1435PMU_FORMAT_ATTR(umask, "config:8-15" );
1436PMU_FORMAT_ATTR(edge, "config:18" );
1437PMU_FORMAT_ATTR(pc, "config:19" );
1438PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
1439PMU_FORMAT_ATTR(inv, "config:23" );
1440PMU_FORMAT_ATTR(cmask, "config:24-31" );
1441
1442static struct attribute *intel_arch_formats_attr[] = {
1443 &format_attr_event.attr,
1444 &format_attr_umask.attr,
1445 &format_attr_edge.attr,
1446 &format_attr_pc.attr,
1447 &format_attr_inv.attr,
1448 &format_attr_cmask.attr,
1449 NULL,
1450};
1451
1385static __initconst const struct x86_pmu core_pmu = { 1452static __initconst const struct x86_pmu core_pmu = {
1386 .name = "core", 1453 .name = "core",
1387 .handle_irq = x86_pmu_handle_irq, 1454 .handle_irq = x86_pmu_handle_irq,
@@ -1406,6 +1473,7 @@ static __initconst const struct x86_pmu core_pmu = {
1406 .put_event_constraints = intel_put_event_constraints, 1473 .put_event_constraints = intel_put_event_constraints,
1407 .event_constraints = intel_core_event_constraints, 1474 .event_constraints = intel_core_event_constraints,
1408 .guest_get_msrs = core_guest_get_msrs, 1475 .guest_get_msrs = core_guest_get_msrs,
1476 .format_attrs = intel_arch_formats_attr,
1409}; 1477};
1410 1478
1411struct intel_shared_regs *allocate_shared_regs(int cpu) 1479struct intel_shared_regs *allocate_shared_regs(int cpu)
@@ -1431,7 +1499,7 @@ static int intel_pmu_cpu_prepare(int cpu)
1431{ 1499{
1432 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1500 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1433 1501
1434 if (!x86_pmu.extra_regs) 1502 if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
1435 return NOTIFY_OK; 1503 return NOTIFY_OK;
1436 1504
1437 cpuc->shared_regs = allocate_shared_regs(cpu); 1505 cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1521,28 @@ static void intel_pmu_cpu_starting(int cpu)
1453 */ 1521 */
1454 intel_pmu_lbr_reset(); 1522 intel_pmu_lbr_reset();
1455 1523
1456 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) 1524 cpuc->lbr_sel = NULL;
1525
1526 if (!cpuc->shared_regs)
1457 return; 1527 return;
1458 1528
1459 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1529 if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
1460 struct intel_shared_regs *pc; 1530 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1531 struct intel_shared_regs *pc;
1461 1532
1462 pc = per_cpu(cpu_hw_events, i).shared_regs; 1533 pc = per_cpu(cpu_hw_events, i).shared_regs;
1463 if (pc && pc->core_id == core_id) { 1534 if (pc && pc->core_id == core_id) {
1464 cpuc->kfree_on_online = cpuc->shared_regs; 1535 cpuc->kfree_on_online = cpuc->shared_regs;
1465 cpuc->shared_regs = pc; 1536 cpuc->shared_regs = pc;
1466 break; 1537 break;
1538 }
1467 } 1539 }
1540 cpuc->shared_regs->core_id = core_id;
1541 cpuc->shared_regs->refcnt++;
1468 } 1542 }
1469 1543
1470 cpuc->shared_regs->core_id = core_id; 1544 if (x86_pmu.lbr_sel_map)
1471 cpuc->shared_regs->refcnt++; 1545 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
1472} 1546}
1473 1547
1474static void intel_pmu_cpu_dying(int cpu) 1548static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +1560,33 @@ static void intel_pmu_cpu_dying(int cpu)
1486 fini_debug_store_on_cpu(cpu); 1560 fini_debug_store_on_cpu(cpu);
1487} 1561}
1488 1562
1563static void intel_pmu_flush_branch_stack(void)
1564{
1565 /*
1566 * Intel LBR does not tag entries with the
1567 * PID of the current task, then we need to
1568 * flush it on ctxsw
1569 * For now, we simply reset it
1570 */
1571 if (x86_pmu.lbr_nr)
1572 intel_pmu_lbr_reset();
1573}
1574
1575PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
1576
1577static struct attribute *intel_arch3_formats_attr[] = {
1578 &format_attr_event.attr,
1579 &format_attr_umask.attr,
1580 &format_attr_edge.attr,
1581 &format_attr_pc.attr,
1582 &format_attr_any.attr,
1583 &format_attr_inv.attr,
1584 &format_attr_cmask.attr,
1585
1586 &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
1587 NULL,
1588};
1589
1489static __initconst const struct x86_pmu intel_pmu = { 1590static __initconst const struct x86_pmu intel_pmu = {
1490 .name = "Intel", 1591 .name = "Intel",
1491 .handle_irq = intel_pmu_handle_irq, 1592 .handle_irq = intel_pmu_handle_irq,
@@ -1509,10 +1610,13 @@ static __initconst const struct x86_pmu intel_pmu = {
1509 .get_event_constraints = intel_get_event_constraints, 1610 .get_event_constraints = intel_get_event_constraints,
1510 .put_event_constraints = intel_put_event_constraints, 1611 .put_event_constraints = intel_put_event_constraints,
1511 1612
1613 .format_attrs = intel_arch3_formats_attr,
1614
1512 .cpu_prepare = intel_pmu_cpu_prepare, 1615 .cpu_prepare = intel_pmu_cpu_prepare,
1513 .cpu_starting = intel_pmu_cpu_starting, 1616 .cpu_starting = intel_pmu_cpu_starting,
1514 .cpu_dying = intel_pmu_cpu_dying, 1617 .cpu_dying = intel_pmu_cpu_dying,
1515 .guest_get_msrs = intel_guest_get_msrs, 1618 .guest_get_msrs = intel_guest_get_msrs,
1619 .flush_branch_stack = intel_pmu_flush_branch_stack,
1516}; 1620};
1517 1621
1518static __init void intel_clovertown_quirk(void) 1622static __init void intel_clovertown_quirk(void)
@@ -1689,9 +1793,11 @@ __init int intel_pmu_init(void)
1689 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1793 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1690 1794
1691 /* UOPS_ISSUED.STALLED_CYCLES */ 1795 /* UOPS_ISSUED.STALLED_CYCLES */
1692 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1796 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1797 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1693 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1798 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1694 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1799 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1800 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
1695 1801
1696 x86_add_quirk(intel_nehalem_quirk); 1802 x86_add_quirk(intel_nehalem_quirk);
1697 1803
@@ -1726,9 +1832,11 @@ __init int intel_pmu_init(void)
1726 x86_pmu.er_flags |= ERF_HAS_RSP_1; 1832 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1727 1833
1728 /* UOPS_ISSUED.STALLED_CYCLES */ 1834 /* UOPS_ISSUED.STALLED_CYCLES */
1729 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1835 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1836 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1730 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1837 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1731 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1838 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1839 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
1732 1840
1733 pr_cont("Westmere events, "); 1841 pr_cont("Westmere events, ");
1734 break; 1842 break;
@@ -1739,7 +1847,7 @@ __init int intel_pmu_init(void)
1739 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1847 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1740 sizeof(hw_cache_event_ids)); 1848 sizeof(hw_cache_event_ids));
1741 1849
1742 intel_pmu_lbr_init_nhm(); 1850 intel_pmu_lbr_init_snb();
1743 1851
1744 x86_pmu.event_constraints = intel_snb_event_constraints; 1852 x86_pmu.event_constraints = intel_snb_event_constraints;
1745 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; 1853 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1749,9 +1857,11 @@ __init int intel_pmu_init(void)
1749 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 1857 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1750 1858
1751 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1859 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1752 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1860 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1861 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1753 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ 1862 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1754 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; 1863 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1864 X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
1755 1865
1756 pr_cont("SandyBridge events, "); 1866 pr_cont("SandyBridge events, ");
1757 break; 1867 break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40c..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5#include <asm/perf_event.h> 5#include <asm/perf_event.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -439,9 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
439 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 440 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
440 441
441 cpuc->pebs_enabled |= 1ULL << hwc->idx; 442 cpuc->pebs_enabled |= 1ULL << hwc->idx;
442
443 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
444 intel_pmu_lbr_enable(event);
445} 443}
446 444
447void intel_pmu_pebs_disable(struct perf_event *event) 445void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
454 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); 452 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
455 453
456 hwc->config |= ARCH_PERFMON_EVENTSEL_INT; 454 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
457
458 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
459 intel_pmu_lbr_disable(event);
460} 455}
461 456
462void intel_pmu_pebs_enable_all(void) 457void intel_pmu_pebs_enable_all(void)
@@ -475,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
475 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 470 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
476} 471}
477 472
478#include <asm/insn.h>
479
480static inline bool kernel_ip(unsigned long ip)
481{
482#ifdef CONFIG_X86_32
483 return ip > PAGE_OFFSET;
484#else
485 return (long)ip < 0;
486#endif
487}
488
489static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) 473static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
490{ 474{
491 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 475 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -572,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
572 * both formats and we don't use the other fields in this 556 * both formats and we don't use the other fields in this
573 * routine. 557 * routine.
574 */ 558 */
559 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
575 struct pebs_record_core *pebs = __pebs; 560 struct pebs_record_core *pebs = __pebs;
576 struct perf_sample_data data; 561 struct perf_sample_data data;
577 struct pt_regs regs; 562 struct pt_regs regs;
@@ -602,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
602 else 587 else
603 regs.flags &= ~PERF_EFLAGS_EXACT; 588 regs.flags &= ~PERF_EFLAGS_EXACT;
604 589
590 if (has_branch_stack(event))
591 data.br_stack = &cpuc->lbr_stack;
592
605 if (perf_event_overflow(event, &data, &regs)) 593 if (perf_event_overflow(event, &data, &regs))
606 x86_pmu_stop(event, 0); 594 x86_pmu_stop(event, 0);
607} 595}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe54..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
3 3
4#include <asm/perf_event.h> 4#include <asm/perf_event.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -14,6 +15,100 @@ enum {
14}; 15};
15 16
16/* 17/*
18 * Intel LBR_SELECT bits
19 * Intel Vol3a, April 2011, Section 16.7 Table 16-10
20 *
21 * Hardware branch filter (not available on all CPUs)
22 */
23#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
24#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
25#define LBR_JCC_BIT 2 /* do not capture conditional branches */
26#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
27#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
28#define LBR_RETURN_BIT 5 /* do not capture near returns */
29#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
30#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
31#define LBR_FAR_BIT 8 /* do not capture far branches */
32
33#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
34#define LBR_USER (1 << LBR_USER_BIT)
35#define LBR_JCC (1 << LBR_JCC_BIT)
36#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
37#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
38#define LBR_RETURN (1 << LBR_RETURN_BIT)
39#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
40#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
41#define LBR_FAR (1 << LBR_FAR_BIT)
42
43#define LBR_PLM (LBR_KERNEL | LBR_USER)
44
45#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
46#define LBR_NOT_SUPP -1 /* LBR filter not supported */
47#define LBR_IGN 0 /* ignored */
48
49#define LBR_ANY \
50 (LBR_JCC |\
51 LBR_REL_CALL |\
52 LBR_IND_CALL |\
53 LBR_RETURN |\
54 LBR_REL_JMP |\
55 LBR_IND_JMP |\
56 LBR_FAR)
57
58#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
59
60#define for_each_branch_sample_type(x) \
61 for ((x) = PERF_SAMPLE_BRANCH_USER; \
62 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
63
64/*
65 * x86 control flow change classification
66 * x86 control flow changes include branches, interrupts, traps, faults
67 */
68enum {
69 X86_BR_NONE = 0, /* unknown */
70
71 X86_BR_USER = 1 << 0, /* branch target is user */
72 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
73
74 X86_BR_CALL = 1 << 2, /* call */
75 X86_BR_RET = 1 << 3, /* return */
76 X86_BR_SYSCALL = 1 << 4, /* syscall */
77 X86_BR_SYSRET = 1 << 5, /* syscall return */
78 X86_BR_INT = 1 << 6, /* sw interrupt */
79 X86_BR_IRET = 1 << 7, /* return from interrupt */
80 X86_BR_JCC = 1 << 8, /* conditional */
81 X86_BR_JMP = 1 << 9, /* jump */
82 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
83 X86_BR_IND_CALL = 1 << 11,/* indirect calls */
84};
85
86#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
87
88#define X86_BR_ANY \
89 (X86_BR_CALL |\
90 X86_BR_RET |\
91 X86_BR_SYSCALL |\
92 X86_BR_SYSRET |\
93 X86_BR_INT |\
94 X86_BR_IRET |\
95 X86_BR_JCC |\
96 X86_BR_JMP |\
97 X86_BR_IRQ |\
98 X86_BR_IND_CALL)
99
100#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
101
102#define X86_BR_ANY_CALL \
103 (X86_BR_CALL |\
104 X86_BR_IND_CALL |\
105 X86_BR_SYSCALL |\
106 X86_BR_IRQ |\
107 X86_BR_INT)
108
109static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
110
111/*
17 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI 112 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
18 * otherwise it becomes near impossible to get a reliable stack. 113 * otherwise it becomes near impossible to get a reliable stack.
19 */ 114 */
@@ -21,6 +116,10 @@ enum {
21static void __intel_pmu_lbr_enable(void) 116static void __intel_pmu_lbr_enable(void)
22{ 117{
23 u64 debugctl; 118 u64 debugctl;
119 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
120
121 if (cpuc->lbr_sel)
122 wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
24 123
25 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 124 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
26 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 125 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +175,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
76 * Reset the LBR stack if we changed task context to 175 * Reset the LBR stack if we changed task context to
77 * avoid data leaks. 176 * avoid data leaks.
78 */ 177 */
79
80 if (event->ctx->task && cpuc->lbr_context != event->ctx) { 178 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
81 intel_pmu_lbr_reset(); 179 intel_pmu_lbr_reset();
82 cpuc->lbr_context = event->ctx; 180 cpuc->lbr_context = event->ctx;
83 } 181 }
182 cpuc->br_sel = event->hw.branch_reg.reg;
84 183
85 cpuc->lbr_users++; 184 cpuc->lbr_users++;
86} 185}
@@ -95,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
95 cpuc->lbr_users--; 194 cpuc->lbr_users--;
96 WARN_ON_ONCE(cpuc->lbr_users < 0); 195 WARN_ON_ONCE(cpuc->lbr_users < 0);
97 196
98 if (cpuc->enabled && !cpuc->lbr_users) 197 if (cpuc->enabled && !cpuc->lbr_users) {
99 __intel_pmu_lbr_disable(); 198 __intel_pmu_lbr_disable();
199 /* avoid stale pointer */
200 cpuc->lbr_context = NULL;
201 }
100} 202}
101 203
102void intel_pmu_lbr_enable_all(void) 204void intel_pmu_lbr_enable_all(void)
@@ -115,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
115 __intel_pmu_lbr_disable(); 217 __intel_pmu_lbr_disable();
116} 218}
117 219
220/*
221 * TOS = most recently recorded branch
222 */
118static inline u64 intel_pmu_lbr_tos(void) 223static inline u64 intel_pmu_lbr_tos(void)
119{ 224{
120 u64 tos; 225 u64 tos;
@@ -142,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
142 247
143 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); 248 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
144 249
145 cpuc->lbr_entries[i].from = msr_lastbranch.from; 250 cpuc->lbr_entries[i].from = msr_lastbranch.from;
146 cpuc->lbr_entries[i].to = msr_lastbranch.to; 251 cpuc->lbr_entries[i].to = msr_lastbranch.to;
147 cpuc->lbr_entries[i].flags = 0; 252 cpuc->lbr_entries[i].mispred = 0;
253 cpuc->lbr_entries[i].predicted = 0;
254 cpuc->lbr_entries[i].reserved = 0;
148 } 255 }
149 cpuc->lbr_stack.nr = i; 256 cpuc->lbr_stack.nr = i;
150} 257}
151 258
152#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
153
154/* 259/*
155 * Due to lack of segmentation in Linux the effective address (offset) 260 * Due to lack of segmentation in Linux the effective address (offset)
156 * is the same as the linear address, allowing us to merge the LIP and EIP 261 * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -165,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
165 270
166 for (i = 0; i < x86_pmu.lbr_nr; i++) { 271 for (i = 0; i < x86_pmu.lbr_nr; i++) {
167 unsigned long lbr_idx = (tos - i) & mask; 272 unsigned long lbr_idx = (tos - i) & mask;
168 u64 from, to, flags = 0; 273 u64 from, to, mis = 0, pred = 0;
169 274
170 rdmsrl(x86_pmu.lbr_from + lbr_idx, from); 275 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
171 rdmsrl(x86_pmu.lbr_to + lbr_idx, to); 276 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
172 277
173 if (lbr_format == LBR_FORMAT_EIP_FLAGS) { 278 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
174 flags = !!(from & LBR_FROM_FLAG_MISPRED); 279 mis = !!(from & LBR_FROM_FLAG_MISPRED);
280 pred = !mis;
175 from = (u64)((((s64)from) << 1) >> 1); 281 from = (u64)((((s64)from) << 1) >> 1);
176 } 282 }
177 283
178 cpuc->lbr_entries[i].from = from; 284 cpuc->lbr_entries[i].from = from;
179 cpuc->lbr_entries[i].to = to; 285 cpuc->lbr_entries[i].to = to;
180 cpuc->lbr_entries[i].flags = flags; 286 cpuc->lbr_entries[i].mispred = mis;
287 cpuc->lbr_entries[i].predicted = pred;
288 cpuc->lbr_entries[i].reserved = 0;
181 } 289 }
182 cpuc->lbr_stack.nr = i; 290 cpuc->lbr_stack.nr = i;
183} 291}
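With LBR_FORMAT_EIP_FLAGS the hardware stores the misprediction flag in bit 63 of the FROM value, so intel_pmu_lbr_read_64() extracts it and then sign-extends the remaining 63 bits back into a canonical address. A standalone illustration of that "<< 1 >> 1" trick; the sample FROM value is made up:

#include <stdio.h>
#include <stdint.h>

#define LBR_FROM_FLAG_MISPRED (1ULL << 63)

int main(void)
{
	/* pretend the FROM MSR reported a mispredicted branch out of kernel text */
	uint64_t from = LBR_FROM_FLAG_MISPRED | 0x7fffffff81234567ULL;
	int mis, pred;

	mis  = !!(from & LBR_FROM_FLAG_MISPRED);
	pred = !mis;

	/* drop bit 63, then sign-extend bit 62 back up: same as the kernel's
	 * (u64)((((s64)from) << 1) >> 1) */
	from = (uint64_t)((((int64_t)from) << 1) >> 1);

	printf("from=%#llx mispred=%d predicted=%d\n",
	       (unsigned long long)from, mis, pred);
	return 0;	/* prints from=0xffffffff81234567 mispred=1 predicted=0 */
}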
@@ -193,28 +301,404 @@ void intel_pmu_lbr_read(void)
193 intel_pmu_lbr_read_32(cpuc); 301 intel_pmu_lbr_read_32(cpuc);
194 else 302 else
195 intel_pmu_lbr_read_64(cpuc); 303 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
338 * stash actual user request into reg, it may
339 * be used by fixup code for some CPU
340 */
341 event->hw.branch_reg.reg = mask;
342}
343
344/*
345 * setup the HW LBR filter
346 * Used only when available, may not be enough to disambiguate
347 * all branches, may need the help of the SW filter
348 */
349static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
350{
351 struct hw_perf_event_extra *reg;
352 u64 br_type = event->attr.branch_sample_type;
353 u64 mask = 0, m;
354 u64 v;
355
356 for_each_branch_sample_type(m) {
357 if (!(br_type & m))
358 continue;
359
360 v = x86_pmu.lbr_sel_map[m];
361 if (v == LBR_NOT_SUPP)
362 return -EOPNOTSUPP;
363
364 if (v != LBR_IGN)
365 mask |= v;
366 }
367 reg = &event->hw.branch_reg;
368 reg->idx = EXTRA_REG_LBR;
369
370 /* LBR_SELECT operates in suppress mode so invert mask */
371 reg->config = ~mask & x86_pmu.lbr_sel_mask;
372
373 return 0;
374}
375
376int intel_pmu_setup_lbr_filter(struct perf_event *event)
377{
378 int ret = 0;
379
380 /*
381 * no LBR on this PMU
382 */
383 if (!x86_pmu.lbr_nr)
384 return -EOPNOTSUPP;
385
386 /*
387 * setup SW LBR filter
388 */
389 intel_pmu_setup_sw_lbr_filter(event);
390
391 /*
392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
196} 398}
197 399
400/*
401 * return the type of control flow change at address "from"
402 * The instruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
425 * may be zero if the LBR did not fill up after a reset by the time
426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
449 * decoder needs to know the ABI especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
517 }
518 /*
519 * interrupts, traps, faults (and thus ring transition) may
520 * occur on any instruction. Thus, to classify them correctly,
521 * we need to first look at the from and to priv levels. If they
522 * are different and to is in the kernel, then it indicates
523 * a ring transition. If the from instruction is not a ring
524 * transition instr (syscall, sysenter, int), then it means
525 * it was an irq, trap or fault.
526 *
527 * we have no way of detecting kernel to kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
534 * branch priv level determined by target as
535 * is done by HW when LBR_SELECT is implemented
536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
539
540 return ret;
541}
542
543/*
544 * implement actual branch filter based on user demand.
545 * Hardware may not exactly satisfy that request, thus
546 * we need to inspect opcodes. Mismatched branches are
547 * discarded. Therefore, the number of branches returned
548 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
591}
592
593/*
594 * Map interface branch filters onto LBR filters
595 */
596static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
597 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
598 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
599 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
600 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
601 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
602 | LBR_IND_JMP | LBR_FAR,
603 /*
604 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
605 */
606 [PERF_SAMPLE_BRANCH_ANY_CALL] =
607 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
608 /*
609 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
610 */
611 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
612};
613
614static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
615 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
616 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
617 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
618 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
619 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
620 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
621 | LBR_FAR,
622 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
623};
624
625/* core */
198void intel_pmu_lbr_init_core(void) 626void intel_pmu_lbr_init_core(void)
199{ 627{
200 x86_pmu.lbr_nr = 4; 628 x86_pmu.lbr_nr = 4;
201 x86_pmu.lbr_tos = 0x01c9; 629 x86_pmu.lbr_tos = MSR_LBR_TOS;
202 x86_pmu.lbr_from = 0x40; 630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
203 x86_pmu.lbr_to = 0x60; 631 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
637 pr_cont("4-deep LBR, ");
204} 638}
205 639
640/* nehalem/westmere */
206void intel_pmu_lbr_init_nhm(void) 641void intel_pmu_lbr_init_nhm(void)
207{ 642{
208 x86_pmu.lbr_nr = 16; 643 x86_pmu.lbr_nr = 16;
209 x86_pmu.lbr_tos = 0x01c9; 644 x86_pmu.lbr_tos = MSR_LBR_TOS;
210 x86_pmu.lbr_from = 0x680; 645 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
211 x86_pmu.lbr_to = 0x6c0; 646 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
647
648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
650
651 /*
652 * SW branch filter usage:
653 * - workaround LBR_SEL errata (see above)
654 * - support syscall, sysret capture.
655 * That requires LBR_FAR but that means far
656 * jmp need to be filtered out
657 */
658 pr_cont("16-deep LBR, ");
659}
660
661/* sandy bridge */
662void intel_pmu_lbr_init_snb(void)
663{
664 x86_pmu.lbr_nr = 16;
665 x86_pmu.lbr_tos = MSR_LBR_TOS;
666 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
667 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
668
669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
675 * That requires LBR_FAR, but that means far
676 * jmps need to be filtered out
677 */
678 pr_cont("16-deep LBR, ");
212} 679}
213 680
681/* atom */
214void intel_pmu_lbr_init_atom(void) 682void intel_pmu_lbr_init_atom(void)
215{ 683{
684 /*
685 * only models starting at stepping 10 seem
686 * to have an operational LBR which can freeze
687 * on PMU interrupt
688 */
689 if (boot_cpu_data.x86_mask < 10) {
690 pr_cont("LBR disabled due to erratum");
691 return;
692 }
693
216 x86_pmu.lbr_nr = 8; 694 x86_pmu.lbr_nr = 8;
217 x86_pmu.lbr_tos = 0x01c9; 695 x86_pmu.lbr_tos = MSR_LBR_TOS;
218 x86_pmu.lbr_from = 0x40; 696 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
219 x86_pmu.lbr_to = 0x60; 697 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
698
699 /*
700 * SW branch filter usage:
701 * - compensate for lack of HW filter
702 */
703 pr_cont("8-deep LBR, ");
220} 704}
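
The nhm_lbr_sel_map and snb_lbr_sel_map tables translate the generic PERF_SAMPLE_BRANCH_* request bits into LBR_SELECT filter bits, with the Nehalem/Westmere entries widened to cover the erratum noted in the comments. A hedged user-space sketch of that mapping step, using locally defined constants in place of the kernel's PERF_SAMPLE_BRANCH_* and LBR_* definitions:

#include <stdio.h>

/* local stand-ins; the real bit values live in the kernel headers */
enum { BR_ANY = 1 << 0, BR_ANY_CALL = 1 << 1, BR_IND_CALL = 1 << 2 };
enum { LBR_REL_CALL = 1 << 0, LBR_IND_CALL = 1 << 1, LBR_REL_JMP = 1 << 2,
       LBR_IND_JMP = 1 << 3, LBR_FAR = 1 << 4, LBR_ANY = 0x1f };

/* per-request-bit map, analogous to snb_lbr_sel_map[] above */
static const int sel_map[] = {
        [0] = LBR_ANY,                               /* BR_ANY      */
        [1] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, /* BR_ANY_CALL */
        [2] = LBR_IND_CALL,                          /* BR_IND_CALL */
};

/* OR together the LBR filter bits for every requested branch class */
static int build_lbr_select(unsigned request)
{
        int sel = 0;

        for (int bit = 0; bit < 3; bit++)
                if (request & (1u << bit))
                        sel |= sel_map[bit];
        return sel;
}

int main(void)
{
        printf("LBR_SELECT bits for ANY_CALL|IND_CALL = 0x%x\n",
               build_lbr_select(BR_ANY_CALL | BR_IND_CALL));
        return 0;
}
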
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef484d9d0a25..a2dfacfd7103 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1271,6 +1271,17 @@ done:
1271 return num ? -EINVAL : 0; 1271 return num ? -EINVAL : 0;
1272} 1272}
1273 1273
1274PMU_FORMAT_ATTR(cccr, "config:0-31" );
1275PMU_FORMAT_ATTR(escr, "config:32-62");
1276PMU_FORMAT_ATTR(ht, "config:63" );
1277
1278static struct attribute *intel_p4_formats_attr[] = {
1279 &format_attr_cccr.attr,
1280 &format_attr_escr.attr,
1281 &format_attr_ht.attr,
1282 NULL,
1283};
1284
1274static __initconst const struct x86_pmu p4_pmu = { 1285static __initconst const struct x86_pmu p4_pmu = {
1275 .name = "Netburst P4/Xeon", 1286 .name = "Netburst P4/Xeon",
1276 .handle_irq = p4_pmu_handle_irq, 1287 .handle_irq = p4_pmu_handle_irq,
@@ -1305,6 +1316,8 @@ static __initconst const struct x86_pmu p4_pmu = {
1305 * the former idea is taken from OProfile code 1316 * the former idea is taken from OProfile code
1306 */ 1317 */
1307 .perfctr_second_write = 1, 1318 .perfctr_second_write = 1,
1319
1320 .format_attrs = intel_p4_formats_attr,
1308}; 1321};
1309 1322
1310__init int p4_pmu_init(void) 1323__init int p4_pmu_init(void)
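
The PMU_FORMAT_ATTR() additions export the P4 config layout to user space as sysfs format strings ("config:0-31" and so on), which perf-style tooling uses to place field values at the documented bit offsets. A small user-space sketch of packing such fields into a 64-bit config word, assuming exactly the bit ranges shown above:

#include <stdio.h>
#include <stdint.h>

/* place 'val' into config bits [lo, hi], the way "config:lo-hi" describes */
static uint64_t set_field(uint64_t config, int lo, int hi, uint64_t val)
{
        uint64_t mask = (hi - lo == 63) ? ~0ULL
                                        : (((1ULL << (hi - lo + 1)) - 1) << lo);

        return (config & ~mask) | ((val << lo) & mask);
}

int main(void)
{
        uint64_t config = 0;

        config = set_field(config,  0, 31, 0x12345678); /* cccr: config:0-31  */
        config = set_field(config, 32, 62, 0x1abc);     /* escr: config:32-62 */
        config = set_field(config, 63, 63, 1);          /* ht:   config:63    */

        printf("config = 0x%016llx\n", (unsigned long long)config);
        return 0;
}
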
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index c7181befecde..32bcfc7dd230 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
87 (void)checking_wrmsrl(hwc->config_base, val); 87 (void)checking_wrmsrl(hwc->config_base, val);
88} 88}
89 89
90PMU_FORMAT_ATTR(event, "config:0-7" );
91PMU_FORMAT_ATTR(umask, "config:8-15" );
92PMU_FORMAT_ATTR(edge, "config:18" );
93PMU_FORMAT_ATTR(pc, "config:19" );
94PMU_FORMAT_ATTR(inv, "config:23" );
95PMU_FORMAT_ATTR(cmask, "config:24-31" );
96
97static struct attribute *intel_p6_formats_attr[] = {
98 &format_attr_event.attr,
99 &format_attr_umask.attr,
100 &format_attr_edge.attr,
101 &format_attr_pc.attr,
102 &format_attr_inv.attr,
103 &format_attr_cmask.attr,
104 NULL,
105};
106
90static __initconst const struct x86_pmu p6_pmu = { 107static __initconst const struct x86_pmu p6_pmu = {
91 .name = "p6", 108 .name = "p6",
92 .handle_irq = x86_pmu_handle_irq, 109 .handle_irq = x86_pmu_handle_irq,
@@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = {
115 .cntval_mask = (1ULL << 32) - 1, 132 .cntval_mask = (1ULL << 32) - 1,
116 .get_event_constraints = x86_get_event_constraints, 133 .get_event_constraints = x86_get_event_constraints,
117 .event_constraints = p6_event_constraints, 134 .event_constraints = p6_event_constraints,
135
136 .format_attrs = intel_p6_formats_attr,
118}; 137};
119 138
120__init int p6_pmu_init(void) 139__init int p6_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6f537a..addf9e82a7f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, 40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, 41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, 42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, 44 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 45 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 46 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
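
The scattered.c hunk adds X86_FEATURE_HW_PSTATE, detected from CPUID leaf 0x80000007, register EDX, bit 7 (CPB sits at bit 9 of the same register, per the neighbouring entry). A user-space check of those bits using the <cpuid.h> helper shipped with GCC/Clang; this only illustrates the leaf/register/bit encoding, it is not the kernel's scattered-feature table:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* __get_cpuid() returns 0 if the requested (extended) leaf is absent */
        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0x80000007 not supported\n");
                return 0;
        }

        printf("hw_pstate (EDX bit 7): %s\n", (edx & (1u << 7)) ? "yes" : "no");
        printf("cpb       (EDX bit 9): %s\n", (edx & (1u << 9)) ? "yes" : "no");
        return 0;
}
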
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353d93f2..39472dd2323f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,7 +43,6 @@
43 43
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
46#include <asm/system.h>
47 46
48static struct class *cpuid_class; 47static struct class *cpuid_class;
49 48
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 642f75a68cd5..11891ca7b716 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
62 62
63 if (!userbuf) { 63 if (!userbuf) {
64 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
65 kunmap_atomic(vaddr, KM_PTE0); 65 kunmap_atomic(vaddr);
66 } else { 66 } else {
67 if (!kdump_buf_page) { 67 if (!kdump_buf_page) {
68 printk(KERN_WARNING "Kdump: Kdump buffer page not" 68 printk(KERN_WARNING "Kdump: Kdump buffer page not"
69 " allocated\n"); 69 " allocated\n");
70 kunmap_atomic(vaddr, KM_PTE0); 70 kunmap_atomic(vaddr);
71 return -EFAULT; 71 return -EFAULT;
72 } 72 }
73 copy_page(kdump_buf_page, vaddr); 73 copy_page(kdump_buf_page, vaddr);
74 kunmap_atomic(vaddr, KM_PTE0); 74 kunmap_atomic(vaddr);
75 if (copy_to_user(buf, (kdump_buf_page + offset), csize)) 75 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
76 return -EFAULT; 76 return -EFAULT;
77 } 77 }
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 52821799a702..3ae2ced4a874 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,6 +4,7 @@
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/export.h> 5#include <linux/export.h>
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irqdomain.h>
7#include <linux/interrupt.h> 8#include <linux/interrupt.h>
8#include <linux/list.h> 9#include <linux/list.h>
9#include <linux/of.h> 10#include <linux/of.h>
@@ -17,64 +18,14 @@
17#include <linux/initrd.h> 18#include <linux/initrd.h>
18 19
19#include <asm/hpet.h> 20#include <asm/hpet.h>
20#include <asm/irq_controller.h>
21#include <asm/apic.h> 21#include <asm/apic.h>
22#include <asm/pci_x86.h> 22#include <asm/pci_x86.h>
23 23
24__initdata u64 initial_dtb; 24__initdata u64 initial_dtb;
25char __initdata cmd_line[COMMAND_LINE_SIZE]; 25char __initdata cmd_line[COMMAND_LINE_SIZE];
26static LIST_HEAD(irq_domains);
27static DEFINE_RAW_SPINLOCK(big_irq_lock);
28 26
29int __initdata of_ioapic; 27int __initdata of_ioapic;
30 28
31#ifdef CONFIG_X86_IO_APIC
32static void add_interrupt_host(struct irq_domain *ih)
33{
34 unsigned long flags;
35
36 raw_spin_lock_irqsave(&big_irq_lock, flags);
37 list_add(&ih->l, &irq_domains);
38 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
39}
40#endif
41
42static struct irq_domain *get_ih_from_node(struct device_node *controller)
43{
44 struct irq_domain *ih, *found = NULL;
45 unsigned long flags;
46
47 raw_spin_lock_irqsave(&big_irq_lock, flags);
48 list_for_each_entry(ih, &irq_domains, l) {
49 if (ih->controller == controller) {
50 found = ih;
51 break;
52 }
53 }
54 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
55 return found;
56}
57
58unsigned int irq_create_of_mapping(struct device_node *controller,
59 const u32 *intspec, unsigned int intsize)
60{
61 struct irq_domain *ih;
62 u32 virq, type;
63 int ret;
64
65 ih = get_ih_from_node(controller);
66 if (!ih)
67 return 0;
68 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
69 if (ret)
70 return 0;
71 if (type == IRQ_TYPE_NONE)
72 return virq;
73 irq_set_irq_type(virq, type);
74 return virq;
75}
76EXPORT_SYMBOL_GPL(irq_create_of_mapping);
77
78unsigned long pci_address_to_pio(phys_addr_t address) 29unsigned long pci_address_to_pio(phys_addr_t address)
79{ 30{
80 /* 31 /*
@@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] =
354 }, 305 },
355}; 306};
356 307
357static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, 308static int ioapic_xlate(struct irq_domain *domain,
358 u32 *out_hwirq, u32 *out_type) 309 struct device_node *controller,
310 const u32 *intspec, u32 intsize,
311 irq_hw_number_t *out_hwirq, u32 *out_type)
359{ 312{
360 struct mp_ioapic_gsi *gsi_cfg;
361 struct io_apic_irq_attr attr; 313 struct io_apic_irq_attr attr;
362 struct of_ioapic_type *it; 314 struct of_ioapic_type *it;
363 u32 line, idx, type; 315 u32 line, idx;
316 int rc;
364 317
365 if (intsize < 2) 318 if (WARN_ON(intsize < 2))
366 return -EINVAL; 319 return -EINVAL;
367 320
368 line = *intspec; 321 line = intspec[0];
369 idx = (u32) id->priv;
370 gsi_cfg = mp_ioapic_gsi_routing(idx);
371 *out_hwirq = line + gsi_cfg->gsi_base;
372
373 intspec++;
374 type = *intspec;
375 322
376 if (type >= ARRAY_SIZE(of_ioapic_type)) 323 if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
377 return -EINVAL; 324 return -EINVAL;
378 325
379 it = of_ioapic_type + type; 326 it = &of_ioapic_type[intspec[1]];
380 *out_type = it->out_type;
381 327
328 idx = (u32) domain->host_data;
382 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); 329 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
383 330
384 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); 331 rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
332 cpu_to_node(0), &attr);
333 if (rc)
334 return rc;
335
336 *out_hwirq = line;
337 *out_type = it->out_type;
338 return 0;
385} 339}
386 340
341const struct irq_domain_ops ioapic_irq_domain_ops = {
342 .xlate = ioapic_xlate,
343};
344
387static void __init ioapic_add_ofnode(struct device_node *np) 345static void __init ioapic_add_ofnode(struct device_node *np)
388{ 346{
389 struct resource r; 347 struct resource r;
@@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)
399 for (i = 0; i < nr_ioapics; i++) { 357 for (i = 0; i < nr_ioapics; i++) {
400 if (r.start == mpc_ioapic_addr(i)) { 358 if (r.start == mpc_ioapic_addr(i)) {
401 struct irq_domain *id; 359 struct irq_domain *id;
360 struct mp_ioapic_gsi *gsi_cfg;
361
362 gsi_cfg = mp_ioapic_gsi_routing(i);
402 363
403 id = kzalloc(sizeof(*id), GFP_KERNEL); 364 id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
365 &ioapic_irq_domain_ops,
366 (void*)i);
404 BUG_ON(!id); 367 BUG_ON(!id);
405 id->controller = np;
406 id->xlate = ioapic_xlate;
407 id->priv = (void *)i;
408 add_interrupt_host(id);
409 return; 368 return;
410 } 369 }
411 } 370 }
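
The reworked ioapic_xlate() treats intspec[0] as the IO-APIC pin and intspec[1] as an index into of_ioapic_type[], bounds-checking the index before use. A minimal sketch of that two-cell decode, with a made-up type table standing in for of_ioapic_type[]:

#include <stdio.h>
#include <stdint.h>

struct ioapic_type { int trigger; int polarity; }; /* hypothetical stand-in */

static const struct ioapic_type types[] = {
        { 0, 0 },       /* e.g. edge/high */
        { 1, 1 },       /* e.g. level/low */
};

/* decode a two-cell interrupt specifier: cell 0 = pin, cell 1 = type index */
static int xlate(const uint32_t *intspec, unsigned int intsize,
                 uint32_t *out_pin, const struct ioapic_type **out_type)
{
        if (intsize < 2)
                return -1;
        if (intspec[1] >= sizeof(types) / sizeof(types[0]))
                return -1;

        *out_pin = intspec[0];
        *out_type = &types[intspec[1]];
        return 0;
}

int main(void)
{
        uint32_t spec[2] = { 9, 1 }, pin;
        const struct ioapic_type *t;

        if (!xlate(spec, 2, &pin, &t))
                printf("pin %u, trigger %d, polarity %d\n",
                       pin, t->trigger, t->polarity);
        return 0;
}
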
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 28f98706b08b..1b81839b6c88 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
37 const struct stacktrace_ops *ops, 37 const struct stacktrace_ops *ops,
38 struct thread_info *tinfo, int *graph) 38 struct thread_info *tinfo, int *graph)
39{ 39{
40 struct task_struct *task = tinfo->task; 40 struct task_struct *task;
41 unsigned long ret_addr; 41 unsigned long ret_addr;
42 int index = task->curr_ret_stack; 42 int index;
43 43
44 if (addr != (unsigned long)return_to_handler) 44 if (addr != (unsigned long)return_to_handler)
45 return; 45 return;
46 46
47 task = tinfo->task;
48 index = task->curr_ret_stack;
49
47 if (!task->ret_stack || index < *graph) 50 if (!task->ret_stack || index < *graph)
48 return; 51 return;
49 52
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c99f9ed013d5..88ec9129271d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs)
87 int i; 87 int i;
88 88
89 print_modules(); 89 print_modules();
90 __show_regs(regs, 0); 90 __show_regs(regs, !user_mode_vm(regs));
91 91
92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", 92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
93 TASK_COMM_LEN, current->comm, task_pid_nr(current), 93 TASK_COMM_LEN, current->comm, task_pid_nr(current),
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e68f042..7b784f4ef1e4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -98,12 +98,6 @@
98#endif 98#endif
99.endm 99.endm
100 100
101#ifdef CONFIG_VM86
102#define resume_userspace_sig check_userspace
103#else
104#define resume_userspace_sig resume_userspace
105#endif
106
107/* 101/*
108 * User gs save/restore 102 * User gs save/restore
109 * 103 *
@@ -327,10 +321,19 @@ ret_from_exception:
327 preempt_stop(CLBR_ANY) 321 preempt_stop(CLBR_ANY)
328ret_from_intr: 322ret_from_intr:
329 GET_THREAD_INFO(%ebp) 323 GET_THREAD_INFO(%ebp)
330check_userspace: 324resume_userspace_sig:
325#ifdef CONFIG_VM86
331 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS 326 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
332 movb PT_CS(%esp), %al 327 movb PT_CS(%esp), %al
333 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax 328 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
329#else
330 /*
331 * We can be coming here from a syscall done in the kernel space,
332 * e.g. a failed kernel_execve().
333 */
334 movl PT_CS(%esp), %eax
335 andl $SEGMENT_RPL_MASK, %eax
336#endif
334 cmpl $USER_RPL, %eax 337 cmpl $USER_RPL, %eax
335 jb resume_kernel # not returning to v8086 or userspace 338 jb resume_kernel # not returning to v8086 or userspace
336 339
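
The entry_32.S change decides whether it is returning to user space by looking only at the saved CS when CONFIG_VM86 is off: the low two bits of a segment selector are its requested privilege level (RPL), and user code runs at RPL 3. A trivial user-space sketch of that test; the constants mirror the kernel's SEGMENT_RPL_MASK and USER_RPL values.

#include <stdio.h>

#define SEGMENT_RPL_MASK 0x3    /* low two selector bits = RPL */
#define USER_RPL         0x3    /* ring 3 */

/* returning to user space iff the saved CS selector carries RPL 3 */
static int returning_to_user(unsigned int saved_cs)
{
        return (saved_cs & SEGMENT_RPL_MASK) >= USER_RPL;
}

int main(void)
{
        printf("cs=0x10 (kernel): %d\n", returning_to_user(0x10));
        printf("cs=0x73 (user):   %d\n", returning_to_user(0x73));
        return 0;
}
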
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2925e14fb1d9..cdc79b5cfcd9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64)
320 movq %rsp, %rsi 320 movq %rsp, %rsi
321 321
322 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 322 leaq -RBP(%rsp),%rdi /* arg1 for handler */
323 testl $3, CS(%rdi) 323 testl $3, CS-RBP(%rsi)
324 je 1f 324 je 1f
325 SWAPGS 325 SWAPGS
326 /* 326 /*
@@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64)
330 * moving irq_enter into assembly, which would be too much work) 330 * moving irq_enter into assembly, which would be too much work)
331 */ 331 */
3321: incl PER_CPU_VAR(irq_count) 3321: incl PER_CPU_VAR(irq_count)
333 jne 2f 333 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
334 mov PER_CPU_VAR(irq_stack_ptr),%rsp
335 CFI_DEF_CFA_REGISTER rsi 334 CFI_DEF_CFA_REGISTER rsi
336 335
3372: /* Store previous stack value */ 336 /* Store previous stack value */
338 pushq %rsi 337 pushq %rsi
339 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 338 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
340 0x77 /* DW_OP_breg7 */, 0, \ 339 0x77 /* DW_OP_breg7 */, 0, \
@@ -857,7 +856,7 @@ ret_from_intr:
857 856
858 /* Restore saved previous stack */ 857 /* Restore saved previous stack */
859 popq %rsi 858 popq %rsi
860 CFI_DEF_CFA_REGISTER rsi 859 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
861 leaq ARGOFFSET-RBP(%rsi), %rsp 860 leaq ARGOFFSET-RBP(%rsi), %rsp
862 CFI_DEF_CFA_REGISTER rsp 861 CFI_DEF_CFA_REGISTER rsp
863 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 862 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
@@ -1574,6 +1573,7 @@ ENTRY(nmi)
1574 1573
1575 /* Use %rdx as our temp variable throughout */ 1574
1576 pushq_cfi %rdx 1575 pushq_cfi %rdx
1576 CFI_REL_OFFSET rdx, 0
1577 1577
1578 /* 1578 /*
1579 * If %cs was not the kernel segment, then the NMI triggered in user 1579 * If %cs was not the kernel segment, then the NMI triggered in user
@@ -1598,6 +1598,7 @@ ENTRY(nmi)
1598 */ 1598 */
1599 lea 6*8(%rsp), %rdx 1599 lea 6*8(%rsp), %rdx
1600 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1600 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1601 CFI_REMEMBER_STATE
1601 1602
1602nested_nmi: 1603nested_nmi:
1603 /* 1604 /*
@@ -1629,10 +1630,12 @@ nested_nmi:
1629 1630
1630nested_nmi_out: 1631nested_nmi_out:
1631 popq_cfi %rdx 1632 popq_cfi %rdx
1633 CFI_RESTORE rdx
1632 1634
1633 /* No need to check faults here */ 1635 /* No need to check faults here */
1634 INTERRUPT_RETURN 1636 INTERRUPT_RETURN
1635 1637
1638 CFI_RESTORE_STATE
1636first_nmi: 1639first_nmi:
1637 /* 1640 /*
1638 * Because nested NMIs will use the pushed location that we 1641 * Because nested NMIs will use the pushed location that we
@@ -1664,10 +1667,15 @@ first_nmi:
1664 * | pt_regs | 1667 * | pt_regs |
1665 * +-------------------------+ 1668 * +-------------------------+
1666 * 1669 *
1667 * The saved RIP is used to fix up the copied RIP that a nested 1670 * The saved stack frame is used to fix up the copied stack frame
1668 * NMI may zero out. The original stack frame and the temp storage 1671 * that a nested NMI may change to make the interrupted NMI iret jump
1672 * to the repeat_nmi. The original stack frame and the temp storage
1669 * is also used by nested NMIs and can not be trusted on exit. 1673 * is also used by nested NMIs and can not be trusted on exit.
1670 */ 1674 */
1675 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1676 movq (%rsp), %rdx
1677 CFI_RESTORE rdx
1678
1671 /* Set the NMI executing variable on the stack. */ 1679 /* Set the NMI executing variable on the stack. */
1672 pushq_cfi $1 1680 pushq_cfi $1
1673 1681
@@ -1675,22 +1683,39 @@ first_nmi:
1675 .rept 5 1683 .rept 5
1676 pushq_cfi 6*8(%rsp) 1684 pushq_cfi 6*8(%rsp)
1677 .endr 1685 .endr
1686 CFI_DEF_CFA_OFFSET SS+8-RIP
1687
1688 /* Everything up to here is safe from nested NMIs */
1689
1690 /*
1691 * If there was a nested NMI, the first NMI's iret will return
1692 * here. But NMIs are still enabled and we can take another
1693 * nested NMI. The nested NMI checks the interrupted RIP to see
1694 * if it is between repeat_nmi and end_repeat_nmi, and if so
1695 * it will just return, as we are about to repeat an NMI anyway.
1696 * This makes it safe to copy to the stack frame that a nested
1697 * NMI will update.
1698 */
1699repeat_nmi:
1700 /*
1701 * Update the stack variable to say we are still in NMI (the update
1702 * is benign for the non-repeat case, where 1 was pushed just above
1703 * to this very stack slot).
1704 */
1705 movq $1, 5*8(%rsp)
1678 1706
1679 /* Make another copy, this one may be modified by nested NMIs */ 1707 /* Make another copy, this one may be modified by nested NMIs */
1680 .rept 5 1708 .rept 5
1681 pushq_cfi 4*8(%rsp) 1709 pushq_cfi 4*8(%rsp)
1682 .endr 1710 .endr
1683 1711 CFI_DEF_CFA_OFFSET SS+8-RIP
1684 /* Do not pop rdx, nested NMIs will corrupt it */ 1712end_repeat_nmi:
1685 movq 11*8(%rsp), %rdx
1686 1713
1687 /* 1714 /*
1688 * Everything below this point can be preempted by a nested 1715 * Everything below this point can be preempted by a nested
1689 * NMI if the first NMI took an exception. Repeated NMIs 1716 * NMI if the first NMI took an exception and reset our iret stack
1690 * caused by an exception and nested NMI will start here, and 1717 * so that we repeat another NMI.
1691 * can still be preempted by another NMI.
1692 */ 1718 */
1693restart_nmi:
1694 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1719 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1695 subq $ORIG_RAX-R15, %rsp 1720 subq $ORIG_RAX-R15, %rsp
1696 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1721 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1719,26 +1744,6 @@ nmi_restore:
1719 CFI_ENDPROC 1744 CFI_ENDPROC
1720END(nmi) 1745END(nmi)
1721 1746
1722 /*
1723 * If an NMI hit an iret because of an exception or breakpoint,
1724 * it can lose its NMI context, and a nested NMI may come in.
1725 * In that case, the nested NMI will change the preempted NMI's
1726 * stack to jump to here when it does the final iret.
1727 */
1728repeat_nmi:
1729 INTR_FRAME
1730 /* Update the stack variable to say we are still in NMI */
1731 movq $1, 5*8(%rsp)
1732
1733 /* copy the saved stack back to copy stack */
1734 .rept 5
1735 pushq_cfi 4*8(%rsp)
1736 .endr
1737
1738 jmp restart_nmi
1739 CFI_ENDPROC
1740end_repeat_nmi:
1741
1742ENTRY(ignore_sysret) 1747ENTRY(ignore_sysret)
1743 CFI_STARTPROC 1748 CFI_STARTPROC
1744 mov $-ENOSYS,%eax 1749 mov $-ENOSYS,%eax
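
The entry_64.S NMI rework keeps an on-stack "NMI executing" variable and an inline repeat_nmi path, so a nested NMI only requests a repeat instead of corrupting the interrupted handler. The following user-space simulation is purely a conceptual sketch of that flag-and-repeat protocol (single-threaded, no real NMIs, all names invented here):

#include <stdio.h>

static int nmi_executing;       /* stands in for the on-stack "NMI executing" slot */
static int repeat_needed;       /* a nested NMI only sets this and returns */

static void handle_one_nmi(void);

static void nmi(void)
{
        if (nmi_executing) {
                /* nested NMI: just ask the first handler to run once more */
                repeat_needed = 1;
                return;
        }

        do {
                nmi_executing = 1;      /* like "movq $1, 5*8(%rsp)" above */
                repeat_needed = 0;
                handle_one_nmi();
        } while (repeat_needed);        /* like falling back into repeat_nmi */

        nmi_executing = 0;
}

static void handle_one_nmi(void)
{
        static int injected;

        printf("handling NMI\n");
        if (!injected) {
                injected = 1;
                nmi();          /* simulate a nested NMI arriving mid-handler */
        }
}

int main(void)
{
        nmi();
        return 0;
}
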
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 739d8598f789..7734bcbb5a3a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,6 +16,7 @@
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/ptrace.h> 17#include <asm/ptrace.h>
18#include <asm/i387.h> 18#include <asm/i387.h>
19#include <asm/fpu-internal.h>
19#include <asm/user.h> 20#include <asm/user.h>
20 21
21#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -32,6 +33,86 @@
32# define user32_fxsr_struct user_fxsr_struct 33# define user32_fxsr_struct user_fxsr_struct
33#endif 34#endif
34 35
36/*
37 * Were we in an interrupt that interrupted kernel mode?
38 *
39 * We can do a kernel_fpu_begin/end() pair *ONLY* if that
40 * pair does nothing at all: the thread must not have fpu (so
41 * that we don't try to save the FPU state), and TS must
42 * be set (so that the clts/stts pair does nothing that is
43 * visible in the interrupted kernel thread).
44 */
45static inline bool interrupted_kernel_fpu_idle(void)
46{
47 return !__thread_has_fpu(current) &&
48 (read_cr0() & X86_CR0_TS);
49}
50
51/*
52 * Were we in user mode (or vm86 mode) when we were
53 * interrupted?
54 *
55 * Doing kernel_fpu_begin/end() is ok if we are running
56 * in an interrupt context from user mode - we'll just
57 * save the FPU state as required.
58 */
59static inline bool interrupted_user_mode(void)
60{
61 struct pt_regs *regs = get_irq_regs();
62 return regs && user_mode_vm(regs);
63}
64
65/*
66 * Can we use the FPU in kernel mode with the
67 * whole "kernel_fpu_begin/end()" sequence?
68 *
69 * It's always ok in process context (ie "not interrupt")
70 * but it is sometimes ok even from an irq.
71 */
72bool irq_fpu_usable(void)
73{
74 return !in_interrupt() ||
75 interrupted_user_mode() ||
76 interrupted_kernel_fpu_idle();
77}
78EXPORT_SYMBOL(irq_fpu_usable);
79
80void kernel_fpu_begin(void)
81{
82 struct task_struct *me = current;
83
84 WARN_ON_ONCE(!irq_fpu_usable());
85 preempt_disable();
86 if (__thread_has_fpu(me)) {
87 __save_init_fpu(me);
88 __thread_clear_has_fpu(me);
89 /* We do 'stts()' in kernel_fpu_end() */
90 } else {
91 percpu_write(fpu_owner_task, NULL);
92 clts();
93 }
94}
95EXPORT_SYMBOL(kernel_fpu_begin);
96
97void kernel_fpu_end(void)
98{
99 stts();
100 preempt_enable();
101}
102EXPORT_SYMBOL(kernel_fpu_end);
103
104void unlazy_fpu(struct task_struct *tsk)
105{
106 preempt_disable();
107 if (__thread_has_fpu(tsk)) {
108 __save_init_fpu(tsk);
109 __thread_fpu_end(tsk);
110 } else
111 tsk->fpu_counter = 0;
112 preempt_enable();
113}
114EXPORT_SYMBOL(unlazy_fpu);
115
35#ifdef CONFIG_MATH_EMULATION 116#ifdef CONFIG_MATH_EMULATION
36# define HAVE_HWFP (boot_cpu_data.hard_math) 117# define HAVE_HWFP (boot_cpu_data.hard_math)
37#else 118#else
@@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size);
44unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); 125unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
45static struct i387_fxsave_struct fx_scratch __cpuinitdata; 126static struct i387_fxsave_struct fx_scratch __cpuinitdata;
46 127
47void __cpuinit mxcsr_feature_mask_init(void) 128static void __cpuinit mxcsr_feature_mask_init(void)
48{ 129{
49 unsigned long mask = 0; 130 unsigned long mask = 0;
50 131
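
The kernel_fpu_begin()/end() helpers moved into i387.c gate FPU use in irq context on a three-way test: not in an interrupt at all, the interrupt hit user mode, or the kernel FPU is idle (no owning thread and CR0.TS set). A user-space sketch of that predicate with the three conditions stubbed out as plain booleans, since the real checks read task and CR0 state that has no user-space equivalent:

#include <stdio.h>
#include <stdbool.h>

/* stubs standing in for in_interrupt(), user_mode_vm(regs), and the
 * __thread_has_fpu()/CR0.TS check done by interrupted_kernel_fpu_idle() */
struct cpu_state {
        bool in_interrupt;
        bool interrupted_user_mode;
        bool kernel_fpu_idle;
};

static bool irq_fpu_usable(const struct cpu_state *s)
{
        return !s->in_interrupt ||
               s->interrupted_user_mode ||
               s->kernel_fpu_idle;
}

int main(void)
{
        struct cpu_state process_ctx  = { false, false, false };
        struct cpu_state irq_from_usr = { true,  true,  false };
        struct cpu_state irq_busy_fpu = { true,  false, false };

        printf("process context : %d\n", irq_fpu_usable(&process_ctx));
        printf("irq from user   : %d\n", irq_fpu_usable(&irq_from_usr));
        printf("irq, busy FPU   : %d\n", irq_fpu_usable(&irq_busy_fpu));
        return 0;
}
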
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 610485223bdb..36d1853e91af 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,7 +15,6 @@
15#include <linux/delay.h> 15#include <linux/delay.h>
16 16
17#include <linux/atomic.h> 17#include <linux/atomic.h>
18#include <asm/system.h>
19#include <asm/timer.h> 18#include <asm/timer.h>
20#include <asm/hw_irq.h> 19#include <asm/hw_irq.h>
21#include <asm/pgtable.h> 20#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7943e0c21bde..3dafc6003b7c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -282,8 +282,13 @@ void fixup_irqs(void)
282 else if (!(warned++)) 282 else if (!(warned++))
283 set_affinity = 0; 283 set_affinity = 0;
284 284
285 /*
286 * We unmask if the irq was not marked masked by the
287 * core code. That respects the lazy irq disable
288 * behaviour.
289 */
285 if (!irqd_can_move_in_process_context(data) && 290 if (!irqd_can_move_in_process_context(data) &&
286 !irqd_irq_disabled(data) && chip->irq_unmask) 291 !irqd_irq_masked(data) && chip->irq_unmask)
287 chip->irq_unmask(data); 292 chip->irq_unmask(data);
288 293
289 raw_spin_unlock(&desc->lock); 294 raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d92..58b7f27cb3e9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* 103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 * Copy the softirq bits in preempt_count so that the 104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105 * softirq checks work in the hardirq context.
106 */
107 irqctx->tinfo.preempt_count =
108 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
109 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
110 105
111 if (unlikely(overflow)) 106 if (unlikely(overflow))
112 call_on_stack(print_stack_overflow, isp); 107 call_on_stack(print_stack_overflow, isp);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
196 if (unlikely(!desc)) 191 if (unlikely(!desc))
197 return false; 192 return false;
198 193
199 if (!execute_on_irq_stack(overflow, desc, irq)) { 194 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
200 if (unlikely(overflow)) 195 if (unlikely(overflow))
201 print_stack_overflow(); 196 print_stack_overflow();
202 desc->handle_irq(irq, desc); 197 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7b77062dea11..252981afd6c4 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,7 +16,6 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17 17
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include <asm/system.h>
20#include <asm/timer.h> 19#include <asm/timer.h>
21#include <asm/hw_irq.h> 20#include <asm/hw_irq.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
@@ -306,10 +305,10 @@ void __init native_init_IRQ(void)
306 * us. (some of these will be overridden and become 305 * us. (some of these will be overridden and become
307 * 'special' SMP interrupts) 306 * 'special' SMP interrupts)
308 */ 307 */
309 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 308 i = FIRST_EXTERNAL_VECTOR;
309 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
310 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 310 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
311 if (!test_bit(i, used_vectors)) 311 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
312 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
313 } 312 }
314 313
315 if (!acpi_ioapic && !of_ioapic) 314 if (!acpi_ioapic && !of_ioapic)
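
The irqinit.c change replaces the open-coded test_bit() loop with for_each_clear_bit_from(), walking only the vectors whose bits are still clear in used_vectors. A user-space sketch of the same iteration pattern over a small bitmap (a plain 64-bit integer here, not the kernel bitmap type, and the vector counts are truncated for the example):

#include <stdio.h>

#define FIRST_VECTOR 32
#define NR_VECTORS   48         /* truncated for the example */

int main(void)
{
        unsigned long long used = 0;

        used |= 1ULL << 32;     /* pretend vector 32 was claimed already */
        used |= 1ULL << 40;

        /* equivalent of: i = FIRST_VECTOR; for_each_clear_bit_from(i, used, NR_VECTORS) */
        for (int i = FIRST_VECTOR; i < NR_VECTORS; i++) {
                if (used & (1ULL << i))
                        continue;
                printf("would set_intr_gate(%d)\n", i);
        }
        return 0;
}
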
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 90fcf62854bb..1d5d31ea686b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
68 return count; 68 return count;
69} 69}
70 70
71static int setup_data_open(struct inode *inode, struct file *file)
72{
73 file->private_data = inode->i_private;
74
75 return 0;
76}
77
78static const struct file_operations fops_setup_data = { 71static const struct file_operations fops_setup_data = {
79 .read = setup_data_read, 72 .read = setup_data_read,
80 .open = setup_data_open, 73 .open = simple_open,
81 .llseek = default_llseek, 74 .llseek = default_llseek,
82}; 75};
83 76
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba5771acad..8bfb6146f753 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,10 +43,11 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 45#include <linux/hw_breakpoint.h>
46#include <linux/uaccess.h>
47#include <linux/memory.h>
46 48
47#include <asm/debugreg.h> 49#include <asm/debugreg.h>
48#include <asm/apicdef.h> 50#include <asm/apicdef.h>
49#include <asm/system.h>
50#include <asm/apic.h> 51#include <asm/apic.h>
51#include <asm/nmi.h> 52#include <asm/nmi.h>
52 53
@@ -67,8 +68,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
67 { "ss", 4, offsetof(struct pt_regs, ss) }, 68 { "ss", 4, offsetof(struct pt_regs, ss) },
68 { "ds", 4, offsetof(struct pt_regs, ds) }, 69 { "ds", 4, offsetof(struct pt_regs, ds) },
69 { "es", 4, offsetof(struct pt_regs, es) }, 70 { "es", 4, offsetof(struct pt_regs, es) },
70 { "fs", 4, -1 },
71 { "gs", 4, -1 },
72#else 71#else
73 { "ax", 8, offsetof(struct pt_regs, ax) }, 72 { "ax", 8, offsetof(struct pt_regs, ax) },
74 { "bx", 8, offsetof(struct pt_regs, bx) }, 73 { "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +89,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
90 { "flags", 4, offsetof(struct pt_regs, flags) }, 89 { "flags", 4, offsetof(struct pt_regs, flags) },
91 { "cs", 4, offsetof(struct pt_regs, cs) }, 90 { "cs", 4, offsetof(struct pt_regs, cs) },
92 { "ss", 4, offsetof(struct pt_regs, ss) }, 91 { "ss", 4, offsetof(struct pt_regs, ss) },
92 { "ds", 4, -1 },
93 { "es", 4, -1 },
93#endif 94#endif
95 { "fs", 4, -1 },
96 { "gs", 4, -1 },
94}; 97};
95 98
96int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) 99int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
@@ -740,6 +743,64 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
740 regs->ip = ip; 743 regs->ip = ip;
741} 744}
742 745
746int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
747{
748 int err;
749 char opc[BREAK_INSTR_SIZE];
750
751 bpt->type = BP_BREAKPOINT;
752 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
753 BREAK_INSTR_SIZE);
754 if (err)
755 return err;
756 err = probe_kernel_write((char *)bpt->bpt_addr,
757 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
758#ifdef CONFIG_DEBUG_RODATA
759 if (!err)
760 return err;
761 /*
762 * It is safe to call text_poke() because normal kernel execution
763 * is stopped on all cores, so long as the text_mutex is not locked.
764 */
765 if (mutex_is_locked(&text_mutex))
766 return -EBUSY;
767 text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
768 BREAK_INSTR_SIZE);
769 err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
770 if (err)
771 return err;
772 if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
773 return -EINVAL;
774 bpt->type = BP_POKE_BREAKPOINT;
775#endif /* CONFIG_DEBUG_RODATA */
776 return err;
777}
778
779int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
780{
781#ifdef CONFIG_DEBUG_RODATA
782 int err;
783 char opc[BREAK_INSTR_SIZE];
784
785 if (bpt->type != BP_POKE_BREAKPOINT)
786 goto knl_write;
787 /*
788 * It is safe to call text_poke() because normal kernel execution
789 * is stopped on all cores, so long as the text_mutex is not locked.
790 */
791 if (mutex_is_locked(&text_mutex))
792 goto knl_write;
793 text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
794 err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
795 if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
796 goto knl_write;
797 return err;
798knl_write:
799#endif /* CONFIG_DEBUG_RODATA */
800 return probe_kernel_write((char *)bpt->bpt_addr,
801 (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
802}
803
743struct kgdb_arch arch_kgdb_ops = { 804struct kgdb_arch arch_kgdb_ops = {
744 /* Breakpoint instruction: */ 805 /* Breakpoint instruction: */
745 .gdb_bpt_instr = { 0xcc }, 806 .gdb_bpt_instr = { 0xcc },
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 000000000000..3230b68ef29a
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
1#ifndef __X86_KERNEL_KPROBES_COMMON_H
2#define __X86_KERNEL_KPROBES_COMMON_H
3
4/* Kprobes and Optprobes common header */
5
6#ifdef CONFIG_X86_64
7#define SAVE_REGS_STRING \
8 /* Skip cs, ip, orig_ax. */ \
9 " subq $24, %rsp\n" \
10 " pushq %rdi\n" \
11 " pushq %rsi\n" \
12 " pushq %rdx\n" \
13 " pushq %rcx\n" \
14 " pushq %rax\n" \
15 " pushq %r8\n" \
16 " pushq %r9\n" \
17 " pushq %r10\n" \
18 " pushq %r11\n" \
19 " pushq %rbx\n" \
20 " pushq %rbp\n" \
21 " pushq %r12\n" \
22 " pushq %r13\n" \
23 " pushq %r14\n" \
24 " pushq %r15\n"
25#define RESTORE_REGS_STRING \
26 " popq %r15\n" \
27 " popq %r14\n" \
28 " popq %r13\n" \
29 " popq %r12\n" \
30 " popq %rbp\n" \
31 " popq %rbx\n" \
32 " popq %r11\n" \
33 " popq %r10\n" \
34 " popq %r9\n" \
35 " popq %r8\n" \
36 " popq %rax\n" \
37 " popq %rcx\n" \
38 " popq %rdx\n" \
39 " popq %rsi\n" \
40 " popq %rdi\n" \
41 /* Skip orig_ax, ip, cs */ \
42 " addq $24, %rsp\n"
43#else
44#define SAVE_REGS_STRING \
45 /* Skip cs, ip, orig_ax and gs. */ \
46 " subl $16, %esp\n" \
47 " pushl %fs\n" \
48 " pushl %es\n" \
49 " pushl %ds\n" \
50 " pushl %eax\n" \
51 " pushl %ebp\n" \
52 " pushl %edi\n" \
53 " pushl %esi\n" \
54 " pushl %edx\n" \
55 " pushl %ecx\n" \
56 " pushl %ebx\n"
57#define RESTORE_REGS_STRING \
58 " popl %ebx\n" \
59 " popl %ecx\n" \
60 " popl %edx\n" \
61 " popl %esi\n" \
62 " popl %edi\n" \
63 " popl %ebp\n" \
64 " popl %eax\n" \
65 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here */\
66 " addl $24, %esp\n"
67#endif
68
69/* Check whether the instruction can be boosted */
70extern int can_boost(kprobe_opcode_t *instruction);
71/* Recover the instruction if the given address is probed */
72extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
73 unsigned long addr);
74/*
75 * Copy an instruction and adjust the displacement if the instruction
76 * uses the %rip-relative addressing mode.
77 */
78extern int __copy_instruction(u8 *dest, u8 *src);
79
80/* Generate a relative-jump/call instruction */
81extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to);
83
84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{
95 return 0;
96}
97static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
98{
99 return addr;
100}
101#endif
102#endif
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 000000000000..c5e410eed403
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
1/*
2 * Kernel Probes Jump Optimization (Optprobes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 * Copyright (C) Hitachi Ltd., 2012
20 */
21#include <linux/kprobes.h>
22#include <linux/ptrace.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25#include <linux/hardirq.h>
26#include <linux/preempt.h>
27#include <linux/module.h>
28#include <linux/kdebug.h>
29#include <linux/kallsyms.h>
30#include <linux/ftrace.h>
31
32#include <asm/cacheflush.h>
33#include <asm/desc.h>
34#include <asm/pgtable.h>
35#include <asm/uaccess.h>
36#include <asm/alternative.h>
37#include <asm/insn.h>
38#include <asm/debugreg.h>
39
40#include "kprobes-common.h"
41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{
44 struct optimized_kprobe *op;
45 struct kprobe *kp;
46 long offs;
47 int i;
48
49 for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
50 kp = get_kprobe((void *)addr - i);
51 /* This function only handles jump-optimized kprobe */
52 if (kp && kprobe_optimized(kp)) {
53 op = container_of(kp, struct optimized_kprobe, kp);
54 /* If op->list is not empty, op is under optimizing */
55 if (list_empty(&op->list))
56 goto found;
57 }
58 }
59
60 return addr;
61found:
62 /*
63 * If the kprobe can be optimized, the original bytes may have been
64 * overwritten by the jump destination address. In this case, the
65 * original bytes must be recovered from the op->optinsn.copied_insn buffer.
66 */
67 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
68 if (addr == (unsigned long)kp->addr) {
69 buf[0] = kp->opcode;
70 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
71 } else {
72 offs = addr - (unsigned long)kp->addr - 1;
73 memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
74 }
75
76 return (unsigned long)buf;
77}
78
79/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
80static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
81{
82#ifdef CONFIG_X86_64
83 *addr++ = 0x48;
84 *addr++ = 0xbf;
85#else
86 *addr++ = 0xb8;
87#endif
88 *(unsigned long *)addr = val;
89}
90
91static void __used __kprobes kprobes_optinsn_template_holder(void)
92{
93 asm volatile (
94 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64
97 /* We don't bother saving the ss register */
98 " pushq %rsp\n"
99 " pushfq\n"
100 SAVE_REGS_STRING
101 " movq %rsp, %rsi\n"
102 ".global optprobe_template_val\n"
103 "optprobe_template_val:\n"
104 ASM_NOP5
105 ASM_NOP5
106 ".global optprobe_template_call\n"
107 "optprobe_template_call:\n"
108 ASM_NOP5
109 /* Move flags to rsp */
110 " movq 144(%rsp), %rdx\n"
111 " movq %rdx, 152(%rsp)\n"
112 RESTORE_REGS_STRING
113 /* Skip flags entry */
114 " addq $8, %rsp\n"
115 " popfq\n"
116#else /* CONFIG_X86_32 */
117 " pushf\n"
118 SAVE_REGS_STRING
119 " movl %esp, %edx\n"
120 ".global optprobe_template_val\n"
121 "optprobe_template_val:\n"
122 ASM_NOP5
123 ".global optprobe_template_call\n"
124 "optprobe_template_call:\n"
125 ASM_NOP5
126 RESTORE_REGS_STRING
127 " addl $4, %esp\n" /* skip cs */
128 " popf\n"
129#endif
130 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n");
132}
133
134#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
136#define TMPL_CALL_IDX \
137 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
138#define TMPL_END_IDX \
139 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
140
141#define INT3_SIZE sizeof(kprobe_opcode_t)
142
143/* Optimized kprobe callback function: called from optinsn */
144static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
145{
146 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
147 unsigned long flags;
148
149 /* This is possible if op is under delayed unoptimizing */
150 if (kprobe_disabled(&op->kp))
151 return;
152
153 local_irq_save(flags);
154 if (kprobe_running()) {
155 kprobes_inc_nmissed_count(&op->kp);
156 } else {
157 /* Save skipped registers */
158#ifdef CONFIG_X86_64
159 regs->cs = __KERNEL_CS;
160#else
161 regs->cs = __KERNEL_CS | get_kernel_rpl();
162 regs->gs = 0;
163#endif
164 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
165 regs->orig_ax = ~0UL;
166
167 __this_cpu_write(current_kprobe, &op->kp);
168 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
169 opt_pre_handler(&op->kp, regs);
170 __this_cpu_write(current_kprobe, NULL);
171 }
172 local_irq_restore(flags);
173}
174
175static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
176{
177 int len = 0, ret;
178
179 while (len < RELATIVEJUMP_SIZE) {
180 ret = __copy_instruction(dest + len, src + len);
181 if (!ret || !can_boost(dest + len))
182 return -EINVAL;
183 len += ret;
184 }
185 /* Check whether the address range is reserved */
186 if (ftrace_text_reserved(src, src + len - 1) ||
187 alternatives_text_reserved(src, src + len - 1) ||
188 jump_label_text_reserved(src, src + len - 1))
189 return -EBUSY;
190
191 return len;
192}
193
194/* Check whether insn is indirect jump */
195static int __kprobes insn_is_indirect_jump(struct insn *insn)
196{
197 return ((insn->opcode.bytes[0] == 0xff &&
198 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
199 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
200}
201
202/* Check whether insn jumps into specified address range */
203static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
204{
205 unsigned long target = 0;
206
207 switch (insn->opcode.bytes[0]) {
208 case 0xe0: /* loopne */
209 case 0xe1: /* loope */
210 case 0xe2: /* loop */
211 case 0xe3: /* jcxz */
212 case 0xe9: /* near relative jump */
213 case 0xeb: /* short relative jump */
214 break;
215 case 0x0f:
216 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
217 break;
218 return 0;
219 default:
220 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
221 break;
222 return 0;
223 }
224 target = (unsigned long)insn->next_byte + insn->immediate.value;
225
226 return (start <= target && target <= start + len);
227}
228
229/* Decode the whole function to ensure no instruction jumps into the target */
230static int __kprobes can_optimize(unsigned long paddr)
231{
232 unsigned long addr, size = 0, offset = 0;
233 struct insn insn;
234 kprobe_opcode_t buf[MAX_INSN_SIZE];
235
236 /* Lookup symbol including addr */
237 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
238 return 0;
239
240 /*
241 * Do not optimize in the entry code due to the unstable
242 * stack handling.
243 */
244 if ((paddr >= (unsigned long)__entry_text_start) &&
245 (paddr < (unsigned long)__entry_text_end))
246 return 0;
247
248 /* Check there is enough space for a relative jump. */
249 if (size - offset < RELATIVEJUMP_SIZE)
250 return 0;
251
252 /* Decode instructions */
253 addr = paddr - offset;
254 while (addr < paddr - offset + size) { /* Decode until function end */
255 if (search_exception_tables(addr))
256 /*
257 * Since some fixup code jumps into this function,
258 * we can't optimize a kprobe in this function.
259 */
260 return 0;
261 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
262 insn_get_length(&insn);
263 /* Another subsystem puts a breakpoint */
264 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
265 return 0;
266 /* Recover address */
267 insn.kaddr = (void *)addr;
268 insn.next_byte = (void *)(addr + insn.length);
269 /* Check that no instruction jumps into the target */
270 if (insn_is_indirect_jump(&insn) ||
271 insn_jump_into_range(&insn, paddr + INT3_SIZE,
272 RELATIVE_ADDR_SIZE))
273 return 0;
274 addr += insn.length;
275 }
276
277 return 1;
278}
279
280/* Check whether the optimized_kprobe can actually be optimized. */
281int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
282{
283 int i;
284 struct kprobe *p;
285
286 for (i = 1; i < op->optinsn.size; i++) {
287 p = get_kprobe(op->kp.addr + i);
288 if (p && !kprobe_disabled(p))
289 return -EEXIST;
290 }
291
292 return 0;
293}
294
295/* Check the addr is within the optimized instructions. */
296int __kprobes
297arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
298{
299 return ((unsigned long)op->kp.addr <= addr &&
300 (unsigned long)op->kp.addr + op->optinsn.size > addr);
301}
302
303/* Free optimized instruction slot */
304static __kprobes
305void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
306{
307 if (op->optinsn.insn) {
308 free_optinsn_slot(op->optinsn.insn, dirty);
309 op->optinsn.insn = NULL;
310 op->optinsn.size = 0;
311 }
312}
313
314void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
315{
316 __arch_remove_optimized_kprobe(op, 1);
317}
318
319/*
320 * Copy the target instructions that the relative jump will replace.
321 * Target instructions MUST be relocatable (checked inside).
322 * This is called when a new aggr(opt)probe is allocated or reused.
323 */
324int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
325{
326 u8 *buf;
327 int ret;
328 long rel;
329
330 if (!can_optimize((unsigned long)op->kp.addr))
331 return -EILSEQ;
332
333 op->optinsn.insn = get_optinsn_slot();
334 if (!op->optinsn.insn)
335 return -ENOMEM;
336
337 /*
338 * Verify that the address gap is within the +/-2GB range, because
339 * this uses a relative jump.
340 */
341 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
342 if (abs(rel) > 0x7fffffff)
343 return -ERANGE;
344
345 buf = (u8 *)op->optinsn.insn;
346
347 /* Copy instructions into the out-of-line buffer */
348 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
349 if (ret < 0) {
350 __arch_remove_optimized_kprobe(op, 0);
351 return ret;
352 }
353 op->optinsn.size = ret;
354
355 /* Copy arch-dep-instance from template */
356 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
357
358 /* Set probe information */
359 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
360
361 /* Set probe function call */
362 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
363
364 /* Set returning jmp instruction at the tail of out-of-line buffer */
365 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
366 (u8 *)op->kp.addr + op->optinsn.size);
367
368 flush_icache_range((unsigned long) buf,
369 (unsigned long) buf + TMPL_END_IDX +
370 op->optinsn.size + RELATIVEJUMP_SIZE);
371 return 0;
372}
373
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/*
400 * Replace breakpoints (int3) with relative jumps.
401 * Caller must hold kprobe_mutex and text_mutex.
402 */
403void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{
405 struct optimized_kprobe *op, *tmp;
406 int c = 0;
407
408 list_for_each_entry_safe(op, tmp, oplist, list) {
409 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */
411 setup_optimize_kprobe(&jump_poke_params[c],
412 jump_poke_bufs[c].buf, op);
413 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 }
417
418 /*
419 * text_poke_smp doesn't support modifying NMI/MCE code.
420 * However, since kprobes itself also doesn't support probing
421 * NMI/MCE code, it's not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424}
425
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
427 u8 *insn_buf,
428 struct optimized_kprobe *op)
429{
430 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433
434 tprm->addr = op->kp.addr;
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437}
438
439/*
440 * Recover original instructions and breakpoints from relative jumps.
441 * Caller must hold kprobe_mutex.
442 */
443extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list)
445{
446 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448
449 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 }
457
458 /*
459 * text_poke_smp doesn't support modifying NMI/MCE code.
460 * However, since kprobes itself also doesn't support probing
461 * NMI/MCE code, it's not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475}
476
477int __kprobes
478setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
479{
480 struct optimized_kprobe *op;
481
482 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
483 /* This kprobe is really able to run optimized path. */
484 op = container_of(p, struct optimized_kprobe, kp);
485 /* Detour through copied instructions */
486 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
487 if (!reenter)
488 reset_current_kprobe();
489 preempt_enable_no_resched();
490 return 1;
491 }
492 return 0;
493}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
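
arch_prepare_optimized_kprobe() above refuses to optimize when the detour buffer is not reachable by a 5-byte relative jump, because the jump encodes a signed 32-bit displacement measured from the end of the instruction. A user-space sketch of building and range-checking that displacement; the addresses are made up, and 0xe9 is the standard jmp rel32 opcode that RELATIVEJUMP_OPCODE names in the kernel:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define RELATIVEJUMP_SIZE   5
#define RELATIVEJUMP_OPCODE 0xe9

/* emit "jmp rel32" at 'from' targeting 'to'; fail if out of +/-2GB range */
static int emit_reljump(uint8_t *buf, uint64_t from, uint64_t to)
{
        int64_t rel = (int64_t)to - (int64_t)(from + RELATIVEJUMP_SIZE);

        if (rel > INT32_MAX || rel < INT32_MIN)
                return -1;                      /* mirrors the -ERANGE check */

        buf[0] = RELATIVEJUMP_OPCODE;
        int32_t rel32 = (int32_t)rel;
        memcpy(buf + 1, &rel32, sizeof(rel32)); /* little-endian x86 host assumed */
        return 0;
}

int main(void)
{
        uint8_t buf[RELATIVEJUMP_SIZE];

        if (!emit_reljump(buf, 0x400000, 0x401000))
                printf("jmp rel32 = %02x %02x %02x %02x %02x\n",
                       buf[0], buf[1], buf[2], buf[3], buf[4]);
        return 0;
}
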
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d8b64c..e213fc8408d2 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes. 31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com> 32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality 33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added 34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386. 35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster 36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64 37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven 38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> 39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code. 40 * unified x86 kprobes code.
41 */ 41 */
42
43#include <linux/kprobes.h> 42#include <linux/kprobes.h>
44#include <linux/ptrace.h> 43#include <linux/ptrace.h>
45#include <linux/string.h> 44#include <linux/string.h>
@@ -59,6 +58,8 @@
59#include <asm/insn.h> 58#include <asm/insn.h>
60#include <asm/debugreg.h> 59#include <asm/debugreg.h>
61 60
61#include "kprobes-common.h"
62
62void jprobe_return_end(void); 63void jprobe_return_end(void);
63 64
64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 65DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
108 doesn't switch kernel stack.*/ 109 doesn't switch kernel stack.*/
109 {NULL, NULL} /* Terminator */ 110 {NULL, NULL} /* Terminator */
110}; 111};
112
111const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 113const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
112 114
113static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) 115static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
123} 125}
124 126
125/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 127/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
126static void __kprobes synthesize_reljump(void *from, void *to) 128void __kprobes synthesize_reljump(void *from, void *to)
127{ 129{
128 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); 130 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
129} 131}
130 132
133/* Insert a call instruction at address 'from', which calls address 'to'.*/
134void __kprobes synthesize_relcall(void *from, void *to)
135{
136 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137}
138
131/* 139/*
132 * Skip the prefixes of the instruction. 140 * Skip the prefixes of the instruction.
133 */ 141 */
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
151 * Returns non-zero if opcode is boostable. 159 * Returns non-zero if opcode is boostable.
152 * RIP relative instructions are adjusted at copying time in 64 bits mode 160 * RIP relative instructions are adjusted at copying time in 64 bits mode
153 */ 161 */
154static int __kprobes can_boost(kprobe_opcode_t *opcodes) 162int __kprobes can_boost(kprobe_opcode_t *opcodes)
155{ 163{
156 kprobe_opcode_t opcode; 164 kprobe_opcode_t opcode;
157 kprobe_opcode_t *orig_opcodes = opcodes; 165 kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +215,15 @@ retry:
207 } 215 }
208} 216}
209 217
210/* Recover the probed instruction at addr for further analysis. */ 218static unsigned long
211static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) 219__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
212{ 220{
213 struct kprobe *kp; 221 struct kprobe *kp;
222
214 kp = get_kprobe((void *)addr); 223 kp = get_kprobe((void *)addr);
224 /* There is no probe, return original address */
215 if (!kp) 225 if (!kp)
216 return -EINVAL; 226 return addr;
217 227
218 /* 228 /*
219 * Basically, kp->ainsn.insn has an original instruction. 229 * Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 */ 240 */
231 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 241 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
232 buf[0] = kp->opcode; 242 buf[0] = kp->opcode;
233 return 0; 243 return (unsigned long)buf;
244}
245
246/*
247 * Recover the probed instruction at addr for further analysis.
 248 * The caller must hold kprobe_mutex, or disable preemption,
 249 * to prevent the referenced kprobes from being released.
250 */
251unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252{
253 unsigned long __addr;
254
255 __addr = __recover_optprobed_insn(buf, addr);
256 if (__addr != addr)
257 return __addr;
258
259 return __recover_probed_insn(buf, addr);
234} 260}
235 261
236/* Check if paddr is at an instruction boundary */ 262/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 263static int __kprobes can_probe(unsigned long paddr)
238{ 264{
239 int ret; 265 unsigned long addr, __addr, offset = 0;
240 unsigned long addr, offset = 0;
241 struct insn insn; 266 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 267 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 268
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)
247 /* Decode instructions */ 272 /* Decode instructions */
248 addr = paddr - offset; 273 addr = paddr - offset;
249 while (addr < paddr) { 274 while (addr < paddr) {
250 kernel_insn_init(&insn, (void *)addr);
251 insn_get_opcode(&insn);
252
253 /* 275 /*
254 * Check if the instruction has been modified by another 276 * Check if the instruction has been modified by another
255 * kprobe, in which case we replace the breakpoint by the 277 * kprobe, in which case we replace the breakpoint by the
256 * original instruction in our buffer. 278 * original instruction in our buffer.
 279 * Also, jump optimization will change the breakpoint to a
 280 * relative jump. Since a relative jump is itself an ordinary
 281 * instruction, we simply continue decoding if there is no kprobe.
257 */ 282 */
258 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { 283 __addr = recover_probed_instruction(buf, addr);
259 ret = recover_probed_instruction(buf, addr); 284 kernel_insn_init(&insn, (void *)__addr);
260 if (ret)
261 /*
262 * Another debugging subsystem might insert
263 * this breakpoint. In that case, we can't
264 * recover it.
265 */
266 return 0;
267 kernel_insn_init(&insn, buf);
268 }
269 insn_get_length(&insn); 285 insn_get_length(&insn);
286
287 /*
288 * Another debugging subsystem might insert this breakpoint.
289 * In that case, we can't recover it.
290 */
291 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292 return 0;
270 addr += insn.length; 293 addr += insn.length;
271 } 294 }
272 295
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
299 * If not, return null. 322 * If not, return null.
300 * Only applicable to 64-bit x86. 323 * Only applicable to 64-bit x86.
301 */ 324 */
302static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) 325int __kprobes __copy_instruction(u8 *dest, u8 *src)
303{ 326{
304 struct insn insn; 327 struct insn insn;
305 int ret;
306 kprobe_opcode_t buf[MAX_INSN_SIZE]; 328 kprobe_opcode_t buf[MAX_INSN_SIZE];
307 329
308 kernel_insn_init(&insn, src); 330 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
309 if (recover) {
310 insn_get_opcode(&insn);
311 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
312 ret = recover_probed_instruction(buf,
313 (unsigned long)src);
314 if (ret)
315 return 0;
316 kernel_insn_init(&insn, buf);
317 }
318 }
319 insn_get_length(&insn); 331 insn_get_length(&insn);
 332 /* Another subsystem has put a breakpoint here; we cannot recover it */
333 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334 return 0;
320 memcpy(dest, insn.kaddr, insn.length); 335 memcpy(dest, insn.kaddr, insn.length);
321 336
322#ifdef CONFIG_X86_64 337#ifdef CONFIG_X86_64
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
337 * extension of the original signed 32-bit displacement would 352 * extension of the original signed 32-bit displacement would
338 * have given. 353 * have given.
339 */ 354 */
340 newdisp = (u8 *) src + (s64) insn.displacement.value - 355 newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
341 (u8 *) dest;
342 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 356 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
343 disp = (u8 *) dest + insn_offset_displacement(&insn); 357 disp = (u8 *) dest + insn_offset_displacement(&insn);
344 *(s32 *) disp = (s32) newdisp; 358 *(s32 *) disp = (s32) newdisp;
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
349 363
350static void __kprobes arch_copy_kprobe(struct kprobe *p) 364static void __kprobes arch_copy_kprobe(struct kprobe *p)
351{ 365{
 366 /* Copy the instruction, recovering it if another optprobe has modified it. */
367 __copy_instruction(p->ainsn.insn, p->addr);
368
352 /* 369 /*
353 * Copy an instruction without recovering int3, because it will be 370 * __copy_instruction can modify the displacement of the instruction,
354 * put by another subsystem. 371 * but it doesn't affect boostable check.
355 */ 372 */
356 __copy_instruction(p->ainsn.insn, p->addr, 0); 373 if (can_boost(p->ainsn.insn))
357
358 if (can_boost(p->addr))
359 p->ainsn.boostable = 0; 374 p->ainsn.boostable = 0;
360 else 375 else
361 p->ainsn.boostable = -1; 376 p->ainsn.boostable = -1;
362 377
363 p->opcode = *p->addr; 378 /* Also, displacement change doesn't affect the first byte */
379 p->opcode = p->ainsn.insn[0];
364} 380}
365 381
366int __kprobes arch_prepare_kprobe(struct kprobe *p) 382int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)
442 } 458 }
443} 459}
444 460
445void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 461void __kprobes
446 struct pt_regs *regs) 462arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
447{ 463{
448 unsigned long *sara = stack_addr(regs); 464 unsigned long *sara = stack_addr(regs);
449 465
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
453 *sara = (unsigned long) &kretprobe_trampoline; 469 *sara = (unsigned long) &kretprobe_trampoline;
454} 470}
455 471
456#ifdef CONFIG_OPTPROBES 472static void __kprobes
457static int __kprobes setup_detour_execution(struct kprobe *p, 473setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
458 struct pt_regs *regs,
459 int reenter);
460#else
461#define setup_detour_execution(p, regs, reenter) (0)
462#endif
463
464static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
465 struct kprobe_ctlblk *kcb, int reenter)
466{ 474{
467 if (setup_detour_execution(p, regs, reenter)) 475 if (setup_detour_execution(p, regs, reenter))
468 return; 476 return;
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
504 * within the handler. We save the original kprobes variables and just single 512 * within the handler. We save the original kprobes variables and just single
505 * step on the instruction of the new probe without calling any user handlers. 513 * step on the instruction of the new probe without calling any user handlers.
506 */ 514 */
507static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, 515static int __kprobes
508 struct kprobe_ctlblk *kcb) 516reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
509{ 517{
510 switch (kcb->kprobe_status) { 518 switch (kcb->kprobe_status) {
511 case KPROBE_HIT_SSDONE: 519 case KPROBE_HIT_SSDONE:
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
600 return 0; 608 return 0;
601} 609}
602 610
603#ifdef CONFIG_X86_64
604#define SAVE_REGS_STRING \
605 /* Skip cs, ip, orig_ax. */ \
606 " subq $24, %rsp\n" \
607 " pushq %rdi\n" \
608 " pushq %rsi\n" \
609 " pushq %rdx\n" \
610 " pushq %rcx\n" \
611 " pushq %rax\n" \
612 " pushq %r8\n" \
613 " pushq %r9\n" \
614 " pushq %r10\n" \
615 " pushq %r11\n" \
616 " pushq %rbx\n" \
617 " pushq %rbp\n" \
618 " pushq %r12\n" \
619 " pushq %r13\n" \
620 " pushq %r14\n" \
621 " pushq %r15\n"
622#define RESTORE_REGS_STRING \
623 " popq %r15\n" \
624 " popq %r14\n" \
625 " popq %r13\n" \
626 " popq %r12\n" \
627 " popq %rbp\n" \
628 " popq %rbx\n" \
629 " popq %r11\n" \
630 " popq %r10\n" \
631 " popq %r9\n" \
632 " popq %r8\n" \
633 " popq %rax\n" \
634 " popq %rcx\n" \
635 " popq %rdx\n" \
636 " popq %rsi\n" \
637 " popq %rdi\n" \
638 /* Skip orig_ax, ip, cs */ \
639 " addq $24, %rsp\n"
640#else
641#define SAVE_REGS_STRING \
642 /* Skip cs, ip, orig_ax and gs. */ \
643 " subl $16, %esp\n" \
644 " pushl %fs\n" \
645 " pushl %es\n" \
646 " pushl %ds\n" \
647 " pushl %eax\n" \
648 " pushl %ebp\n" \
649 " pushl %edi\n" \
650 " pushl %esi\n" \
651 " pushl %edx\n" \
652 " pushl %ecx\n" \
653 " pushl %ebx\n"
654#define RESTORE_REGS_STRING \
655 " popl %ebx\n" \
656 " popl %ecx\n" \
657 " popl %edx\n" \
658 " popl %esi\n" \
659 " popl %edi\n" \
660 " popl %ebp\n" \
661 " popl %eax\n" \
662 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
663 " addl $24, %esp\n"
664#endif
665
666/* 611/*
667 * When a retprobed function returns, this code saves registers and 612 * When a retprobed function returns, this code saves registers and
668 * calls trampoline_handler() runs, which calls the kretprobe's handler. 613 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
816 * jump instruction after the copied instruction, that jumps to the next 761 * jump instruction after the copied instruction, that jumps to the next
817 * instruction after the probepoint. 762 * instruction after the probepoint.
818 */ 763 */
819static void __kprobes resume_execution(struct kprobe *p, 764static void __kprobes
820 struct pt_regs *regs, struct kprobe_ctlblk *kcb) 765resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
821{ 766{
822 unsigned long *tos = stack_addr(regs); 767 unsigned long *tos = stack_addr(regs);
823 unsigned long copy_ip = (unsigned long)p->ainsn.insn; 768 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
996/* 941/*
997 * Wrapper routine for handling exceptions. 942 * Wrapper routine for handling exceptions.
998 */ 943 */
999int __kprobes kprobe_exceptions_notify(struct notifier_block *self, 944int __kprobes
1000 unsigned long val, void *data) 945kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
1001{ 946{
1002 struct die_args *args = data; 947 struct die_args *args = data;
1003 int ret = NOTIFY_DONE; 948 int ret = NOTIFY_DONE;
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1107 return 0; 1052 return 0;
1108} 1053}
1109 1054
1110
1111#ifdef CONFIG_OPTPROBES
1112
1113/* Insert a call instruction at address 'from', which calls address 'to'.*/
1114static void __kprobes synthesize_relcall(void *from, void *to)
1115{
1116 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1117}
1118
1119/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1120static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1121 unsigned long val)
1122{
1123#ifdef CONFIG_X86_64
1124 *addr++ = 0x48;
1125 *addr++ = 0xbf;
1126#else
1127 *addr++ = 0xb8;
1128#endif
1129 *(unsigned long *)addr = val;
1130}
1131
1132static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{
1134 asm volatile (
1135 ".global optprobe_template_entry\n"
1136 "optprobe_template_entry: \n"
1137#ifdef CONFIG_X86_64
1138 /* We don't bother saving the ss register */
1139 " pushq %rsp\n"
1140 " pushfq\n"
1141 SAVE_REGS_STRING
1142 " movq %rsp, %rsi\n"
1143 ".global optprobe_template_val\n"
1144 "optprobe_template_val: \n"
1145 ASM_NOP5
1146 ASM_NOP5
1147 ".global optprobe_template_call\n"
1148 "optprobe_template_call: \n"
1149 ASM_NOP5
1150 /* Move flags to rsp */
1151 " movq 144(%rsp), %rdx\n"
1152 " movq %rdx, 152(%rsp)\n"
1153 RESTORE_REGS_STRING
1154 /* Skip flags entry */
1155 " addq $8, %rsp\n"
1156 " popfq\n"
1157#else /* CONFIG_X86_32 */
1158 " pushf\n"
1159 SAVE_REGS_STRING
1160 " movl %esp, %edx\n"
1161 ".global optprobe_template_val\n"
1162 "optprobe_template_val: \n"
1163 ASM_NOP5
1164 ".global optprobe_template_call\n"
1165 "optprobe_template_call: \n"
1166 ASM_NOP5
1167 RESTORE_REGS_STRING
1168 " addl $4, %esp\n" /* skip cs */
1169 " popf\n"
1170#endif
1171 ".global optprobe_template_end\n"
1172 "optprobe_template_end: \n");
1173}
1174
1175#define TMPL_MOVE_IDX \
1176 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1177#define TMPL_CALL_IDX \
1178 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1179#define TMPL_END_IDX \
1180 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1181
1182#define INT3_SIZE sizeof(kprobe_opcode_t)
1183
1184/* Optimized kprobe call back function: called from optinsn */
1185static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs)
1187{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1189 unsigned long flags;
1190
1191 /* This is possible if op is under delayed unoptimizing */
1192 if (kprobe_disabled(&op->kp))
1193 return;
1194
1195 local_irq_save(flags);
1196 if (kprobe_running()) {
1197 kprobes_inc_nmissed_count(&op->kp);
1198 } else {
1199 /* Save skipped registers */
1200#ifdef CONFIG_X86_64
1201 regs->cs = __KERNEL_CS;
1202#else
1203 regs->cs = __KERNEL_CS | get_kernel_rpl();
1204 regs->gs = 0;
1205#endif
1206 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1207 regs->orig_ax = ~0UL;
1208
1209 __this_cpu_write(current_kprobe, &op->kp);
1210 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1211 opt_pre_handler(&op->kp, regs);
1212 __this_cpu_write(current_kprobe, NULL);
1213 }
1214 local_irq_restore(flags);
1215}
1216
1217static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1218{
1219 int len = 0, ret;
1220
1221 while (len < RELATIVEJUMP_SIZE) {
1222 ret = __copy_instruction(dest + len, src + len, 1);
1223 if (!ret || !can_boost(dest + len))
1224 return -EINVAL;
1225 len += ret;
1226 }
1227 /* Check whether the address range is reserved */
1228 if (ftrace_text_reserved(src, src + len - 1) ||
1229 alternatives_text_reserved(src, src + len - 1) ||
1230 jump_label_text_reserved(src, src + len - 1))
1231 return -EBUSY;
1232
1233 return len;
1234}
1235
1236/* Check whether insn is indirect jump */
1237static int __kprobes insn_is_indirect_jump(struct insn *insn)
1238{
1239 return ((insn->opcode.bytes[0] == 0xff &&
1240 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1241 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1242}
1243
1244/* Check whether insn jumps into specified address range */
1245static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1246{
1247 unsigned long target = 0;
1248
1249 switch (insn->opcode.bytes[0]) {
1250 case 0xe0: /* loopne */
1251 case 0xe1: /* loope */
1252 case 0xe2: /* loop */
1253 case 0xe3: /* jcxz */
1254 case 0xe9: /* near relative jump */
1255 case 0xeb: /* short relative jump */
1256 break;
1257 case 0x0f:
1258 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1259 break;
1260 return 0;
1261 default:
1262 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1263 break;
1264 return 0;
1265 }
1266 target = (unsigned long)insn->next_byte + insn->immediate.value;
1267
1268 return (start <= target && target <= start + len);
1269}
1270
1271/* Decode whole function to ensure any instructions don't jump into target */
1272static int __kprobes can_optimize(unsigned long paddr)
1273{
1274 int ret;
1275 unsigned long addr, size = 0, offset = 0;
1276 struct insn insn;
1277 kprobe_opcode_t buf[MAX_INSN_SIZE];
1278
1279 /* Lookup symbol including addr */
1280 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1281 return 0;
1282
1283 /*
1284 * Do not optimize in the entry code due to the unstable
1285 * stack handling.
1286 */
1287 if ((paddr >= (unsigned long )__entry_text_start) &&
1288 (paddr < (unsigned long )__entry_text_end))
1289 return 0;
1290
1291 /* Check there is enough space for a relative jump. */
1292 if (size - offset < RELATIVEJUMP_SIZE)
1293 return 0;
1294
1295 /* Decode instructions */
1296 addr = paddr - offset;
1297 while (addr < paddr - offset + size) { /* Decode until function end */
1298 if (search_exception_tables(addr))
1299 /*
1300 * Since some fixup code will jumps into this function,
1301 * we can't optimize kprobe in this function.
1302 */
1303 return 0;
1304 kernel_insn_init(&insn, (void *)addr);
1305 insn_get_opcode(&insn);
1306 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1307 ret = recover_probed_instruction(buf, addr);
1308 if (ret)
1309 return 0;
1310 kernel_insn_init(&insn, buf);
1311 }
1312 insn_get_length(&insn);
1313 /* Recover address */
1314 insn.kaddr = (void *)addr;
1315 insn.next_byte = (void *)(addr + insn.length);
1316 /* Check any instructions don't jump into target */
1317 if (insn_is_indirect_jump(&insn) ||
1318 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1319 RELATIVE_ADDR_SIZE))
1320 return 0;
1321 addr += insn.length;
1322 }
1323
1324 return 1;
1325}
1326
1327/* Check optimized_kprobe can actually be optimized. */
1328int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1329{
1330 int i;
1331 struct kprobe *p;
1332
1333 for (i = 1; i < op->optinsn.size; i++) {
1334 p = get_kprobe(op->kp.addr + i);
1335 if (p && !kprobe_disabled(p))
1336 return -EEXIST;
1337 }
1338
1339 return 0;
1340}
1341
1342/* Check the addr is within the optimized instructions. */
1343int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1344 unsigned long addr)
1345{
1346 return ((unsigned long)op->kp.addr <= addr &&
1347 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1348}
1349
1350/* Free optimized instruction slot */
1351static __kprobes
1352void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1353{
1354 if (op->optinsn.insn) {
1355 free_optinsn_slot(op->optinsn.insn, dirty);
1356 op->optinsn.insn = NULL;
1357 op->optinsn.size = 0;
1358 }
1359}
1360
1361void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1362{
1363 __arch_remove_optimized_kprobe(op, 1);
1364}
1365
1366/*
1367 * Copy replacing target instructions
1368 * Target instructions MUST be relocatable (checked inside)
1369 */
1370int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1371{
1372 u8 *buf;
1373 int ret;
1374 long rel;
1375
1376 if (!can_optimize((unsigned long)op->kp.addr))
1377 return -EILSEQ;
1378
1379 op->optinsn.insn = get_optinsn_slot();
1380 if (!op->optinsn.insn)
1381 return -ENOMEM;
1382
1383 /*
1384 * Verify if the address gap is in 2GB range, because this uses
1385 * a relative jump.
1386 */
1387 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1388 if (abs(rel) > 0x7fffffff)
1389 return -ERANGE;
1390
1391 buf = (u8 *)op->optinsn.insn;
1392
1393 /* Copy instructions into the out-of-line buffer */
1394 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1395 if (ret < 0) {
1396 __arch_remove_optimized_kprobe(op, 0);
1397 return ret;
1398 }
1399 op->optinsn.size = ret;
1400
1401 /* Copy arch-dep-instance from template */
1402 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1403
1404 /* Set probe information */
1405 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1406
1407 /* Set probe function call */
1408 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1409
1410 /* Set returning jmp instruction at the tail of out-of-line buffer */
1411 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1412 (u8 *)op->kp.addr + op->optinsn.size);
1413
1414 flush_icache_range((unsigned long) buf,
1415 (unsigned long) buf + TMPL_END_IDX +
1416 op->optinsn.size + RELATIVEJUMP_SIZE);
1417 return 0;
1418}
1419
1420#define MAX_OPTIMIZE_PROBES 256
1421static struct text_poke_param *jump_poke_params;
1422static struct jump_poke_buffer {
1423 u8 buf[RELATIVEJUMP_SIZE];
1424} *jump_poke_bufs;
1425
1426static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1427 u8 *insn_buf,
1428 struct optimized_kprobe *op)
1429{
1430 s32 rel = (s32)((long)op->optinsn.insn -
1431 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1432
1433 /* Backup instructions which will be replaced by jump address */
1434 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1435 RELATIVE_ADDR_SIZE);
1436
1437 insn_buf[0] = RELATIVEJUMP_OPCODE;
1438 *(s32 *)(&insn_buf[1]) = rel;
1439
1440 tprm->addr = op->kp.addr;
1441 tprm->opcode = insn_buf;
1442 tprm->len = RELATIVEJUMP_SIZE;
1443}
1444
1445/*
1446 * Replace breakpoints (int3) with relative jumps.
1447 * Caller must call with locking kprobe_mutex and text_mutex.
1448 */
1449void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1450{
1451 struct optimized_kprobe *op, *tmp;
1452 int c = 0;
1453
1454 list_for_each_entry_safe(op, tmp, oplist, list) {
1455 WARN_ON(kprobe_disabled(&op->kp));
1456 /* Setup param */
1457 setup_optimize_kprobe(&jump_poke_params[c],
1458 jump_poke_bufs[c].buf, op);
1459 list_del_init(&op->list);
1460 if (++c >= MAX_OPTIMIZE_PROBES)
1461 break;
1462 }
1463
1464 /*
1465 * text_poke_smp doesn't support NMI/MCE code modifying.
1466 * However, since kprobes itself also doesn't support NMI/MCE
1467 * code probing, it's not a problem.
1468 */
1469 text_poke_smp_batch(jump_poke_params, c);
1470}
1471
1472static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1473 u8 *insn_buf,
1474 struct optimized_kprobe *op)
1475{
1476 /* Set int3 to first byte for kprobes */
1477 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1478 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1479
1480 tprm->addr = op->kp.addr;
1481 tprm->opcode = insn_buf;
1482 tprm->len = RELATIVEJUMP_SIZE;
1483}
1484
1485/*
1486 * Recover original instructions and breakpoints from relative jumps.
1487 * Caller must call with locking kprobe_mutex.
1488 */
1489extern void arch_unoptimize_kprobes(struct list_head *oplist,
1490 struct list_head *done_list)
1491{
1492 struct optimized_kprobe *op, *tmp;
1493 int c = 0;
1494
1495 list_for_each_entry_safe(op, tmp, oplist, list) {
1496 /* Setup param */
1497 setup_unoptimize_kprobe(&jump_poke_params[c],
1498 jump_poke_bufs[c].buf, op);
1499 list_move(&op->list, done_list);
1500 if (++c >= MAX_OPTIMIZE_PROBES)
1501 break;
1502 }
1503
1504 /*
1505 * text_poke_smp doesn't support NMI/MCE code modifying.
1506 * However, since kprobes itself also doesn't support NMI/MCE
1507 * code probing, it's not a problem.
1508 */
1509 text_poke_smp_batch(jump_poke_params, c);
1510}
1511
1512/* Replace a relative jump with a breakpoint (int3). */
1513void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1514{
1515 u8 buf[RELATIVEJUMP_SIZE];
1516
1517 /* Set int3 to first byte for kprobes */
1518 buf[0] = BREAKPOINT_INSTRUCTION;
1519 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1520 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1521}
1522
1523static int __kprobes setup_detour_execution(struct kprobe *p,
1524 struct pt_regs *regs,
1525 int reenter)
1526{
1527 struct optimized_kprobe *op;
1528
1529 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1530 /* This kprobe is really able to run optimized path. */
1531 op = container_of(p, struct optimized_kprobe, kp);
1532 /* Detour through copied instructions */
1533 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1534 if (!reenter)
1535 reset_current_kprobe();
1536 preempt_enable_no_resched();
1537 return 1;
1538 }
1539 return 0;
1540}
1541
1542static int __kprobes init_poke_params(void)
1543{
1544 /* Allocate code buffer and parameter array */
1545 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1546 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1547 if (!jump_poke_bufs)
1548 return -ENOMEM;
1549
1550 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1551 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1552 if (!jump_poke_params) {
1553 kfree(jump_poke_bufs);
1554 jump_poke_bufs = NULL;
1555 return -ENOMEM;
1556 }
1557
1558 return 0;
1559}
1560#else /* !CONFIG_OPTPROBES */
1561static int __kprobes init_poke_params(void)
1562{
1563 return 0;
1564}
1565#endif
1566
1567int __init arch_init_kprobes(void) 1055int __init arch_init_kprobes(void)
1568{ 1056{
1569 return init_poke_params(); 1057 return arch_init_optprobes();
1570} 1058}
1571 1059
1572int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1060int __kprobes arch_trampoline_kprobe(struct kprobe *p)
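
The kprobes.c hunks above change recover_probed_instruction() from returning an error code into returning the address that should actually be decoded: the original address when nothing is planted there, or a scratch buffer holding the saved opcode when a kprobe is. That lets can_probe() and __copy_instruction() decode unconditionally instead of special-casing BREAKPOINT_INSTRUCTION. The following is a minimal user-space sketch of that calling convention only, not kernel code; the probe table, byte values and sizes are illustrative assumptions.

/*
 * Minimal user-space model of the new recover_probed_instruction()
 * convention: the helper returns the address to decode from, either the
 * original address (no probe registered there) or a private buffer that
 * holds the saved first byte. Everything here is illustrative.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_INSN_SIZE		16
#define BREAKPOINT_INSTRUCTION	0xCC	/* int3 */

struct fake_kprobe {
	uint8_t *addr;		/* probed location */
	uint8_t opcode;		/* original first byte saved by the probe */
};

static struct fake_kprobe probe_table[4];

static struct fake_kprobe *get_fake_kprobe(uint8_t *addr)
{
	unsigned int i;

	for (i = 0; i < 4; i++)
		if (probe_table[i].addr == addr)
			return &probe_table[i];
	return NULL;
}

/* Mirrors __recover_probed_insn(): no error code, just "where to decode". */
static uint8_t *recover_probed_insn(uint8_t *buf, uint8_t *addr)
{
	struct fake_kprobe *kp = get_fake_kprobe(addr);

	if (!kp)
		return addr;		/* nothing planted, decode in place */

	memcpy(buf, addr, MAX_INSN_SIZE);
	buf[0] = kp->opcode;		/* undo the int3 in our private copy */
	return buf;
}

int main(void)
{
	uint8_t text[MAX_INSN_SIZE] = { 0x55, 0x48, 0x89, 0xe5 };	/* push %rbp; mov %rsp,%rbp */
	uint8_t buf[MAX_INSN_SIZE];
	uint8_t *decode_from;

	/* Plant a probe: remember the original byte, write int3 over it. */
	probe_table[0].addr = text;
	probe_table[0].opcode = text[0];
	text[0] = BREAKPOINT_INSTRUCTION;

	/* Caller pattern from can_probe()/__copy_instruction(): decode from
	 * whatever address comes back, no branching on "was it probed?". */
	decode_from = recover_probed_insn(buf, text);
	printf("first byte seen by the decoder: 0x%02x\n", decode_from[0]);
	return 0;
}
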
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176b..b8ba6e4a27e4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -38,6 +38,7 @@
38#include <asm/traps.h> 38#include <asm/traps.h>
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h>
41 42
42static int kvmapf = 1; 43static int kvmapf = 1;
43 44
@@ -253,7 +254,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
253 kvm_async_pf_task_wait((u32)read_cr2()); 254 kvm_async_pf_task_wait((u32)read_cr2());
254 break; 255 break;
255 case KVM_PV_REASON_PAGE_READY: 256 case KVM_PV_REASON_PAGE_READY:
257 rcu_irq_enter();
258 exit_idle();
256 kvm_async_pf_task_wake((u32)read_cr2()); 259 kvm_async_pf_task_wake((u32)read_cr2());
260 rcu_irq_exit();
257 break; 261 break;
258 } 262 }
259} 263}
@@ -438,9 +442,9 @@ void __init kvm_guest_init(void)
438static __init int activate_jump_labels(void) 442static __init int activate_jump_labels(void)
439{ 443{
440 if (has_steal_clock) { 444 if (has_steal_clock) {
441 jump_label_inc(&paravirt_steal_enabled); 445 static_key_slow_inc(&paravirt_steal_enabled);
442 if (steal_acc) 446 if (steal_acc)
443 jump_label_inc(&paravirt_steal_rq_enabled); 447 static_key_slow_inc(&paravirt_steal_rq_enabled);
444 } 448 }
445 449
446 return 0; 450 return 0;
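
This kvm.c hunk (and the paravirt.c hunk further down) tracks the upstream rename of the jump-label API: struct jump_label_key and jump_label_inc() become struct static_key and static_key_slow_inc(). Semantically a static key is a reference-counted boolean whose readers are nearly free while it is off; the real kernel patches the branch in the instruction stream, whereas the user-space model below only captures the counting behaviour, and every name in it is an illustrative stand-in.

/*
 * User-space model of the static_key counting semantics behind
 * paravirt_steal_enabled: readers see "enabled" only while at least one
 * writer holds an increment. The real implementation patches a jump;
 * this model just counts.
 */
#include <stdio.h>

struct fake_static_key {
	int enabled;			/* reference count of enablers */
};

static struct fake_static_key steal_enabled;	/* stand-in for paravirt_steal_enabled */

static void fake_static_key_slow_inc(struct fake_static_key *key)
{
	key->enabled++;			/* first increment "patches the branch in" */
}

static void fake_static_key_slow_dec(struct fake_static_key *key)
{
	key->enabled--;			/* last decrement "patches it back out" */
}

static int fake_static_key_enabled(struct fake_static_key *key)
{
	return key->enabled > 0;	/* stands in for the patched jump */
}

int main(void)
{
	printf("steal accounting on? %d\n", fake_static_key_enabled(&steal_enabled));
	fake_static_key_slow_inc(&steal_enabled);	/* the activate_jump_labels() path */
	printf("steal accounting on? %d\n", fake_static_key_enabled(&steal_enabled));
	fake_static_key_slow_dec(&steal_enabled);
	printf("steal accounting on? %d\n", fake_static_key_enabled(&steal_enabled));
	return 0;
}
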
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b29..f8492da65bfc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
136 return ret; 136 return ret;
137} 137}
138 138
139static void kvm_save_sched_clock_state(void)
140{
141}
142
143static void kvm_restore_sched_clock_state(void)
144{
145 kvm_register_clock("primary cpu clock, resume");
146}
147
139#ifdef CONFIG_X86_LOCAL_APIC 148#ifdef CONFIG_X86_LOCAL_APIC
140static void __cpuinit kvm_setup_secondary_clock(void) 149static void __cpuinit kvm_setup_secondary_clock(void)
141{ 150{
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
144 * we shouldn't fail. 153 * we shouldn't fail.
145 */ 154 */
146 WARN_ON(kvm_register_clock("secondary cpu clock")); 155 WARN_ON(kvm_register_clock("secondary cpu clock"));
147 /* ok, done with our trickery, call native */
148 setup_secondary_APIC_clock();
149} 156}
150#endif 157#endif
151 158
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
194 x86_platform.get_wallclock = kvm_get_wallclock; 201 x86_platform.get_wallclock = kvm_get_wallclock;
195 x86_platform.set_wallclock = kvm_set_wallclock; 202 x86_platform.set_wallclock = kvm_set_wallclock;
196#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
197 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.early_percpu_clock_init =
198 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
199#endif 206#endif
207 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
208 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
200 machine_ops.shutdown = kvm_shutdown; 209 machine_ops.shutdown = kvm_shutdown;
201#ifdef CONFIG_KEXEC 210#ifdef CONFIG_KEXEC
202 machine_ops.crash_shutdown = kvm_crash_shutdown; 211 machine_ops.crash_shutdown = kvm_crash_shutdown;
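
The kvmclock.c hunk wires up the platform's save_sched_clock_state/restore_sched_clock_state callbacks so a resumed guest re-registers its pvclock page with the host, and drops the explicit setup_secondary_APIC_clock() call now that the hook has moved to early_percpu_clock_init. The snippet below is only a stand-alone model of that callback plumbing; the structure mirrors the shape of the diff, but the names and the printf stand in for real work.

/*
 * Stand-alone model of the save/restore hook plumbing: the suspend and
 * resume paths call whatever callbacks the clock driver registered.
 */
#include <stdio.h>

struct fake_platform_ops {
	void (*save_sched_clock_state)(void);
	void (*restore_sched_clock_state)(void);
};

static struct fake_platform_ops platform;	/* stand-in for x86_platform */

static void native_save(void)    { }		/* boot-time defaults: nothing to do */
static void native_restore(void) { }

static void kvm_save_sched_clock_state(void) { }
static void kvm_restore_sched_clock_state(void)
{
	/* the real code calls kvm_register_clock("primary cpu clock, resume") */
	printf("re-registering the pvclock page with the host\n");
}

static void suspend_path(void) { platform.save_sched_clock_state(); }
static void resume_path(void)  { platform.restore_sched_clock_state(); }

int main(void)
{
	/* Defaults ... */
	platform.save_sched_clock_state = native_save;
	platform.restore_sched_clock_state = native_restore;

	/* ... overridden when running as a KVM guest (kvmclock_init()). */
	platform.save_sched_clock_state = kvm_save_sched_clock_state;
	platform.restore_sched_clock_state = kvm_restore_sched_clock_state;

	suspend_path();
	resume_path();		/* prints the re-registration message */
	return 0;
}
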
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea697263b373..ebc987398923 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,7 +15,6 @@
15#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17 17
18#include <asm/system.h>
19#include <asm/ldt.h> 18#include <asm/ldt.h>
20#include <asm/desc.h> 19#include <asm/desc.h>
21#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43ba5d3b..5b19e4d78b00 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,7 +23,6 @@
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h>
27#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
28#include <asm/debugreg.h> 27#include <asm/debugreg.h>
29 28
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 177183cbb6ae..7eb1e2b97827 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -43,7 +43,6 @@
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h> 45#include <linux/slab.h>
46#include <asm/system.h>
47#include <asm/io.h> 46#include <asm/io.h>
48#include <linux/proc_fs.h> 47#include <linux/proc_fs.h>
49#include <linux/mman.h> 48#include <linux/mman.h>
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fda91c307104..87a0f8688301 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -86,6 +86,7 @@
86 86
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h>
89 90
90MODULE_DESCRIPTION("Microcode Update Driver"); 91MODULE_DESCRIPTION("Microcode Update Driver");
91MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 92MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = {
504 .notifier_call = mc_cpu_callback, 505 .notifier_call = mc_cpu_callback,
505}; 506};
506 507
508#ifdef MODULE
509/* Autoload on Intel and AMD systems */
510static const struct x86_cpu_id microcode_id[] = {
511#ifdef CONFIG_MICROCODE_INTEL
512 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
513#endif
514#ifdef CONFIG_MICROCODE_AMD
515 { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
516#endif
517 {}
518};
519MODULE_DEVICE_TABLE(x86cpu, microcode_id);
520#endif
521
507static int __init microcode_init(void) 522static int __init microcode_init(void)
508{ 523{
509 struct cpuinfo_x86 *c = &cpu_data(0); 524 struct cpuinfo_x86 *c = &cpu_data(0);
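
The microcode_core.c hunk adds an x86_cpu_id table and a MODULE_DEVICE_TABLE(x86cpu, ...) entry so udev can autoload the module on any Intel or AMD CPU; in the kernel the actual matching is done by the x86_match_cpu() helper. The wildcard semantics that make { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY } work can be modelled in a few lines; the vendor codes, sample CPU and helper below are illustrative, not the kernel's definitions.

/*
 * Minimal model of x86_cpu_id wildcard matching: an entry matches when
 * every field is either an *_ANY wildcard or equal to the running CPU's
 * value. Vendor codes and the sample CPU are made-up data.
 */
#include <stdio.h>
#include <stddef.h>

#define X86_FAMILY_ANY	0
#define X86_MODEL_ANY	0

enum { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

struct fake_x86_cpu_id { int vendor, family, model; };
struct fake_cpuinfo    { int vendor, family, model; };

static const struct fake_x86_cpu_id microcode_id[] = {
	{ VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY },	/* any Intel CPU */
	{ VENDOR_AMD,   X86_FAMILY_ANY, X86_MODEL_ANY },	/* any AMD CPU */
};

static const struct fake_x86_cpu_id *
fake_x86_match_cpu(const struct fake_x86_cpu_id *table, size_t n,
		   const struct fake_cpuinfo *c)
{
	size_t i;

	for (i = 0; i < n; i++) {
		const struct fake_x86_cpu_id *m = &table[i];

		if (m->vendor != c->vendor)
			continue;
		if (m->family != X86_FAMILY_ANY && m->family != c->family)
			continue;
		if (m->model != X86_MODEL_ANY && m->model != c->model)
			continue;
		return m;	/* first matching entry wins */
	}
	return NULL;
}

int main(void)
{
	struct fake_cpuinfo cpu = { VENDOR_AMD, 0x15, 0x02 };	/* sample CPU */
	size_t n = sizeof(microcode_id) / sizeof(microcode_id[0]);

	if (fake_x86_match_cpu(microcode_id, n, &cpu))
		printf("table matches: the module would be autoloaded\n");
	else
		printf("no match: the module stays unloaded\n");
	return 0;
}
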
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 925179f871de..f21fd94ac897 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -26,7 +26,6 @@
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h> 27#include <linux/jump_label.h>
28 28
29#include <asm/system.h>
30#include <asm/page.h> 29#include <asm/page.h>
31#include <asm/pgtable.h> 30#include <asm/pgtable.h>
32 31
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 96356762a51d..eb113693f043 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,6 @@
40 40
41#include <asm/processor.h> 41#include <asm/processor.h>
42#include <asm/msr.h> 42#include <asm/msr.h>
43#include <asm/system.h>
44 43
45static struct class *msr_class; 44static struct class *msr_class;
46 45
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 0d01a8ea4e11..2c39dcd510fa 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -12,6 +12,7 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/delay.h> 14#include <linux/delay.h>
15#include <linux/init.h>
15 16
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18#include <asm/nmi.h>
@@ -20,35 +21,35 @@
20#define FAILURE 1 21#define FAILURE 1
21#define TIMEOUT 2 22#define TIMEOUT 2
22 23
23static int nmi_fail; 24static int __initdata nmi_fail;
24 25
25/* check to see if NMI IPIs work on this machine */ 26/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; 27static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
27 28
28static int testcase_total; 29static int __initdata testcase_total;
29static int testcase_successes; 30static int __initdata testcase_successes;
30static int expected_testcase_failures; 31static int __initdata expected_testcase_failures;
31static int unexpected_testcase_failures; 32static int __initdata unexpected_testcase_failures;
32static int unexpected_testcase_unknowns; 33static int __initdata unexpected_testcase_unknowns;
33 34
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) 35static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{ 36{
36 unexpected_testcase_unknowns++; 37 unexpected_testcase_unknowns++;
37 return NMI_HANDLED; 38 return NMI_HANDLED;
38} 39}
39 40
40static void init_nmi_testsuite(void) 41static void __init init_nmi_testsuite(void)
41{ 42{
42 /* trap all the unknown NMIs we may generate */ 43 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); 44 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44} 45}
45 46
46static void cleanup_nmi_testsuite(void) 47static void __init cleanup_nmi_testsuite(void)
47{ 48{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); 49 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49} 50}
50 51
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) 52static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{ 53{
53 int cpu = raw_smp_processor_id(); 54 int cpu = raw_smp_processor_id();
54 55
@@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
58 return NMI_DONE; 59 return NMI_DONE;
59} 60}
60 61
61static void test_nmi_ipi(struct cpumask *mask) 62static void __init test_nmi_ipi(struct cpumask *mask)
62{ 63{
63 unsigned long timeout; 64 unsigned long timeout;
64 65
@@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask)
86 return; 87 return;
87} 88}
88 89
89static void remote_ipi(void) 90static void __init remote_ipi(void)
90{ 91{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); 92 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); 93 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
@@ -94,19 +95,19 @@ static void remote_ipi(void)
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask)); 95 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95} 96}
96 97
97static void local_ipi(void) 98static void __init local_ipi(void)
98{ 99{
99 cpumask_clear(to_cpumask(nmi_ipi_mask)); 100 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); 101 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask)); 102 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102} 103}
103 104
104static void reset_nmi(void) 105static void __init reset_nmi(void)
105{ 106{
106 nmi_fail = 0; 107 nmi_fail = 0;
107} 108}
108 109
109static void dotest(void (*testcase_fn)(void), int expected) 110static void __init dotest(void (*testcase_fn)(void), int expected)
110{ 111{
111 testcase_fn(); 112 testcase_fn();
112 /* 113 /*
@@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected)
131 reset_nmi(); 132 reset_nmi();
132} 133}
133 134
134static inline void print_testname(const char *testname) 135static inline void __init print_testname(const char *testname)
135{ 136{
136 printk("%12s:", testname); 137 printk("%12s:", testname);
137} 138}
138 139
139void nmi_selftest(void) 140void __init nmi_selftest(void)
140{ 141{
141 init_nmi_testsuite(); 142 init_nmi_testsuite();
142 143
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..ab137605e694 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
26 26
27#include <asm/bug.h> 27#include <asm/bug.h>
28#include <asm/paravirt.h> 28#include <asm/paravirt.h>
29#include <asm/debugreg.h>
29#include <asm/desc.h> 30#include <asm/desc.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -37,6 +38,7 @@
37#include <asm/apic.h> 38#include <asm/apic.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include <asm/timer.h> 40#include <asm/timer.h>
41#include <asm/special_insns.h>
40 42
41/* nop stub */ 43/* nop stub */
42void _paravirt_nop(void) 44void _paravirt_nop(void)
@@ -202,8 +204,8 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 204 __native_flush_tlb_single(addr);
203} 205}
204 206
205struct jump_label_key paravirt_steal_enabled; 207struct static_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled; 208struct static_key paravirt_steal_rq_enabled;
207 209
208static u64 native_steal_clock(int cpu) 210static u64 native_steal_clock(int cpu)
209{ 211{
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b58345..d0b2fb9ccbb1 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -42,7 +42,6 @@
42#include <asm/calgary.h> 42#include <asm/calgary.h>
43#include <asm/tce.h> 43#include <asm/tce.h>
44#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
45#include <asm/system.h>
46#include <asm/dma.h> 45#include <asm/dma.h>
47#include <asm/rio.h> 46#include <asm/rio.h>
48#include <asm/bios_ebda.h> 47#include <asm/bios_ebda.h>
@@ -431,7 +430,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
431} 430}
432 431
433static void* calgary_alloc_coherent(struct device *dev, size_t size, 432static void* calgary_alloc_coherent(struct device *dev, size_t size,
434 dma_addr_t *dma_handle, gfp_t flag) 433 dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs)
435{ 434{
436 void *ret = NULL; 435 void *ret = NULL;
437 dma_addr_t mapping; 436 dma_addr_t mapping;
@@ -464,7 +463,8 @@ error:
464} 463}
465 464
466static void calgary_free_coherent(struct device *dev, size_t size, 465static void calgary_free_coherent(struct device *dev, size_t size,
467 void *vaddr, dma_addr_t dma_handle) 466 void *vaddr, dma_addr_t dma_handle,
467 struct dma_attrs *attrs)
468{ 468{
469 unsigned int npages; 469 unsigned int npages;
470 struct iommu_table *tbl = find_iommu_table(dev); 470 struct iommu_table *tbl = find_iommu_table(dev);
@@ -477,8 +477,8 @@ static void calgary_free_coherent(struct device *dev, size_t size,
477} 477}
478 478
479static struct dma_map_ops calgary_dma_ops = { 479static struct dma_map_ops calgary_dma_ops = {
480 .alloc_coherent = calgary_alloc_coherent, 480 .alloc = calgary_alloc_coherent,
481 .free_coherent = calgary_free_coherent, 481 .free = calgary_free_coherent,
482 .map_sg = calgary_map_sg, 482 .map_sg = calgary_map_sg,
483 .unmap_sg = calgary_unmap_sg, 483 .unmap_sg = calgary_unmap_sg,
484 .map_page = calgary_map_page, 484 .map_page = calgary_map_page,
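
This pci-calgary_64.c hunk, like the pci-nommu.c and pci-swiotlb.c hunks further down and the dma_generic_alloc_coherent() change in pci-dma.c, follows the dma_map_ops rework merged around this time: .alloc_coherent/.free_coherent become .alloc/.free and both gain a struct dma_attrs * argument, which back-ends that don't care about attributes simply ignore. Below is a stripped-down model of that ops-table shape; the structures are simplified stand-ins for the real <linux/dma-mapping.h> definitions.

/*
 * Simplified model of the reworked dma_map_ops entry points: .alloc and
 * .free now carry an attrs pointer, which simple back-ends may ignore.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct fake_dma_attrs { unsigned long flags; };	/* stand-in for struct dma_attrs */
typedef uint64_t fake_dma_addr_t;

struct fake_dma_map_ops {
	void *(*alloc)(size_t size, fake_dma_addr_t *dma_handle,
		       struct fake_dma_attrs *attrs);
	void (*free)(size_t size, void *vaddr, fake_dma_addr_t dma_handle,
		     struct fake_dma_attrs *attrs);
};

/* "nommu"-style back-end: plain malloc, attributes are ignored. */
static void *nommu_alloc(size_t size, fake_dma_addr_t *dma_handle,
			 struct fake_dma_attrs *attrs)
{
	void *vaddr = malloc(size);

	(void)attrs;				/* unused, as in nommu_free_coherent */
	*dma_handle = (uintptr_t)vaddr;		/* identity "bus" address */
	return vaddr;
}

static void nommu_free(size_t size, void *vaddr, fake_dma_addr_t dma_handle,
		       struct fake_dma_attrs *attrs)
{
	(void)size; (void)dma_handle; (void)attrs;
	free(vaddr);
}

static struct fake_dma_map_ops nommu_ops = {
	.alloc = nommu_alloc,			/* was .alloc_coherent */
	.free  = nommu_free,			/* was .free_coherent  */
};

int main(void)
{
	fake_dma_addr_t handle;
	void *buf = nommu_ops.alloc(4096, &handle, NULL);

	printf("allocated 4096 bytes at %p, bus address 0x%llx\n",
	       buf, (unsigned long long)handle);
	nommu_ops.free(4096, buf, handle, NULL);
	return 0;
}
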
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21ea..3003250ac51d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void)
96 } 96 }
97} 97}
98void *dma_generic_alloc_coherent(struct device *dev, size_t size, 98void *dma_generic_alloc_coherent(struct device *dev, size_t size,
99 dma_addr_t *dma_addr, gfp_t flag) 99 dma_addr_t *dma_addr, gfp_t flag,
100 struct dma_attrs *attrs)
100{ 101{
101 unsigned long dma_mask; 102 unsigned long dma_mask;
102 struct page *page; 103 struct page *page;
@@ -262,10 +263,11 @@ rootfs_initcall(pci_iommu_init);
262 263
263static __devinit void via_no_dac(struct pci_dev *dev) 264static __devinit void via_no_dac(struct pci_dev *dev)
264{ 265{
265 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 266 if (forbid_dac == 0) {
266 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); 267 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
267 forbid_dac = 1; 268 forbid_dac = 1;
268 } 269 }
269} 270}
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); 271DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
272 PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
271#endif 273#endif
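
The pci-dma.c quirk change moves the bridge-class test out of via_no_dac() and into the registration itself: DECLARE_PCI_FIXUP_CLASS_FINAL matches (dev->class >> 8) against PCI_CLASS_BRIDGE_PCI before the hook ever runs, so the function body keeps only the forbid_dac check. The toy dispatcher below shows that class-filtered matching; the table layout, device IDs and constants are simplified assumptions, not the real <linux/pci.h> machinery.

/*
 * Toy model of a class-filtered PCI fixup: the dispatcher compares
 * (dev->class >> shift) against the value registered with the quirk,
 * so the quirk body no longer needs its own class check.
 */
#include <stdio.h>

#define PCI_ANY_ID		(~0u)
#define PCI_CLASS_BRIDGE_PCI	0x0604	/* PCI-to-PCI bridge class code */

struct fake_pci_dev {
	unsigned int vendor, device;
	unsigned int class;		/* 24-bit class/subclass/prog-if */
};

struct fake_pci_fixup {
	unsigned int vendor, device;
	unsigned int class, class_shift;
	void (*hook)(struct fake_pci_dev *dev);
};

static int forbid_dac;

static void via_no_dac(struct fake_pci_dev *dev)
{
	(void)dev;
	if (forbid_dac == 0) {
		printf("disabling DAC on VIA PCI bridge\n");
		forbid_dac = 1;
	}
}

static const struct fake_pci_fixup fixups[] = {
	/* models DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
	 *                                      PCI_CLASS_BRIDGE_PCI, 8, via_no_dac) */
	{ 0x1106 /* VIA */, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, 8, via_no_dac },
};

static void run_fixups(struct fake_pci_dev *dev)
{
	unsigned int i;

	for (i = 0; i < sizeof(fixups) / sizeof(fixups[0]); i++) {
		const struct fake_pci_fixup *f = &fixups[i];

		if (f->vendor != dev->vendor)
			continue;
		if (f->device != PCI_ANY_ID && f->device != dev->device)
			continue;
		if ((dev->class >> f->class_shift) != f->class)
			continue;
		f->hook(dev);
	}
}

int main(void)
{
	/* Sample devices with made-up device IDs. */
	struct fake_pci_dev bridge = { 0x1106, 0xb188, PCI_CLASS_BRIDGE_PCI << 8 };
	struct fake_pci_dev sound  = { 0x1106, 0x3059, 0x0401 << 8 };

	run_fixups(&sound);	/* class mismatch: quirk does not run */
	run_fixups(&bridge);	/* prints the DAC message once */
	return 0;
}
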
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3af4af810c07..f96050685b46 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
75} 75}
76 76
77static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, 77static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 dma_addr_t dma_addr) 78 dma_addr_t dma_addr, struct dma_attrs *attrs)
79{ 79{
80 free_pages((unsigned long)vaddr, get_order(size)); 80 free_pages((unsigned long)vaddr, get_order(size));
81} 81}
@@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev,
96} 96}
97 97
98struct dma_map_ops nommu_dma_ops = { 98struct dma_map_ops nommu_dma_ops = {
99 .alloc_coherent = dma_generic_alloc_coherent, 99 .alloc = dma_generic_alloc_coherent,
100 .free_coherent = nommu_free_coherent, 100 .free = nommu_free_coherent,
101 .map_sg = nommu_map_sg, 101 .map_sg = nommu_map_sg,
102 .map_page = nommu_map_page, 102 .map_page = nommu_map_page,
103 .sync_single_for_device = nommu_sync_single_for_device, 103 .sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 8f972cbddef0..6c483ba98b9c 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -15,21 +15,30 @@
15int swiotlb __read_mostly; 15int swiotlb __read_mostly;
16 16
17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
18 dma_addr_t *dma_handle, gfp_t flags) 18 dma_addr_t *dma_handle, gfp_t flags,
19 struct dma_attrs *attrs)
19{ 20{
20 void *vaddr; 21 void *vaddr;
21 22
22 vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); 23 vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
24 attrs);
23 if (vaddr) 25 if (vaddr)
24 return vaddr; 26 return vaddr;
25 27
26 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); 28 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
27} 29}
28 30
31static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
32 void *vaddr, dma_addr_t dma_addr,
33 struct dma_attrs *attrs)
34{
35 swiotlb_free_coherent(dev, size, vaddr, dma_addr);
36}
37
29static struct dma_map_ops swiotlb_dma_ops = { 38static struct dma_map_ops swiotlb_dma_ops = {
30 .mapping_error = swiotlb_dma_mapping_error, 39 .mapping_error = swiotlb_dma_mapping_error,
31 .alloc_coherent = x86_swiotlb_alloc_coherent, 40 .alloc = x86_swiotlb_alloc_coherent,
32 .free_coherent = swiotlb_free_coherent, 41 .free = x86_swiotlb_free_coherent,
33 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 42 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
34 .sync_single_for_device = swiotlb_sync_single_for_device, 43 .sync_single_for_device = swiotlb_sync_single_for_device,
35 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 44 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 34e06e84ce31..0bc72e2069e3 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -12,6 +12,7 @@
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/export.h> 13#include <linux/export.h>
14 14
15#include <asm/probe_roms.h>
15#include <asm/pci-direct.h> 16#include <asm/pci-direct.h>
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/mmzone.h> 18#include <asm/mmzone.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..1d92a5ab6e8b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -12,16 +12,37 @@
12#include <linux/user-return-notifier.h> 12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h> 13#include <linux/dmi.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/stackprotector.h>
16#include <linux/tick.h>
17#include <linux/cpuidle.h>
15#include <trace/events/power.h> 18#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 19#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h> 20#include <asm/cpu.h>
18#include <asm/system.h>
19#include <asm/apic.h> 21#include <asm/apic.h>
20#include <asm/syscalls.h> 22#include <asm/syscalls.h>
21#include <asm/idle.h> 23#include <asm/idle.h>
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/fpu-internal.h>
24#include <asm/debugreg.h> 27#include <asm/debugreg.h>
28#include <asm/nmi.h>
29
30#ifdef CONFIG_X86_64
31static DEFINE_PER_CPU(unsigned char, is_idle);
32static ATOMIC_NOTIFIER_HEAD(idle_notifier);
33
34void idle_notifier_register(struct notifier_block *n)
35{
36 atomic_notifier_chain_register(&idle_notifier, n);
37}
38EXPORT_SYMBOL_GPL(idle_notifier_register);
39
40void idle_notifier_unregister(struct notifier_block *n)
41{
42 atomic_notifier_chain_unregister(&idle_notifier, n);
43}
44EXPORT_SYMBOL_GPL(idle_notifier_unregister);
45#endif
25 46
26struct kmem_cache *task_xstate_cachep; 47struct kmem_cache *task_xstate_cachep;
27EXPORT_SYMBOL_GPL(task_xstate_cachep); 48EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -341,44 +362,113 @@ void (*pm_idle)(void);
341EXPORT_SYMBOL(pm_idle); 362EXPORT_SYMBOL(pm_idle);
342#endif 363#endif
343 364
344#ifdef CONFIG_X86_32 365static inline int hlt_use_halt(void)
345/*
346 * This halt magic was a workaround for ancient floppy DMA
347 * wreckage. It should be safe to remove.
348 */
349static int hlt_counter;
350void disable_hlt(void)
351{ 366{
352 hlt_counter++; 367 return 1;
353} 368}
354EXPORT_SYMBOL(disable_hlt);
355 369
356void enable_hlt(void) 370#ifndef CONFIG_SMP
371static inline void play_dead(void)
357{ 372{
358 hlt_counter--; 373 BUG();
359} 374}
360EXPORT_SYMBOL(enable_hlt); 375#endif
361 376
362static inline int hlt_use_halt(void) 377#ifdef CONFIG_X86_64
378void enter_idle(void)
363{ 379{
364 return (!hlt_counter && boot_cpu_data.hlt_works_ok); 380 percpu_write(is_idle, 1);
381 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
365} 382}
366#else 383
367static inline int hlt_use_halt(void) 384static void __exit_idle(void)
368{ 385{
369 return 1; 386 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
387 return;
388 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
389}
390
391/* Called from interrupts to signify idle end */
392void exit_idle(void)
393{
394 /* idle loop has pid 0 */
395 if (current->pid)
396 return;
397 __exit_idle();
370} 398}
371#endif 399#endif
372 400
373/* 401/*
402 * The idle thread. There's no useful work to be
403 * done, so just try to conserve power and have a
404 * low exit latency (ie sit in a loop waiting for
405 * somebody to say that they'd like to reschedule)
406 */
407void cpu_idle(void)
408{
409 /*
410 * If we're the non-boot CPU, nothing set the stack canary up
411 * for us. CPU0 already has it initialized but no harm in
412 * doing it again. This is a good place for updating it, as
413 * we won't ever return from this function (so the invalid
414 * canaries already on the stack won't ever trigger).
415 */
416 boot_init_stack_canary();
417 current_thread_info()->status |= TS_POLLING;
418
419 while (1) {
420 tick_nohz_idle_enter();
421
422 while (!need_resched()) {
423 rmb();
424
425 if (cpu_is_offline(smp_processor_id()))
426 play_dead();
427
428 /*
429 * Idle routines should keep interrupts disabled
430 * from here on, until they go to idle.
431 * Otherwise, idle callbacks can misfire.
432 */
433 local_touch_nmi();
434 local_irq_disable();
435
436 enter_idle();
437
438 /* Don't trace irqs off for idle */
439 stop_critical_timings();
440
441 /* enter_idle() needs rcu for notifiers */
442 rcu_idle_enter();
443
444 if (cpuidle_idle_call())
445 pm_idle();
446
447 rcu_idle_exit();
448 start_critical_timings();
449
450 /* In many cases the interrupt that ended idle
451 has already called exit_idle. But some idle
452 loops can be woken up without an interrupt. */
453 __exit_idle();
454 }
455
456 tick_nohz_idle_exit();
457 preempt_enable_no_resched();
458 schedule();
459 preempt_disable();
460 }
461}
462
463/*
374 * We use this if we don't have any better 464 * We use this if we don't have any better
375 * idle routine.. 465 * idle routine..
376 */ 466 */
377void default_idle(void) 467void default_idle(void)
378{ 468{
379 if (hlt_use_halt()) { 469 if (hlt_use_halt()) {
380 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 470 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id()); 471 trace_cpu_idle_rcuidle(1, smp_processor_id());
382 current_thread_info()->status &= ~TS_POLLING; 472 current_thread_info()->status &= ~TS_POLLING;
383 /* 473 /*
384 * TS_POLLING-cleared state must be visible before we 474 * TS_POLLING-cleared state must be visible before we
@@ -391,8 +481,8 @@ void default_idle(void)
391 else 481 else
392 local_irq_enable(); 482 local_irq_enable();
393 current_thread_info()->status |= TS_POLLING; 483 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id()); 484 trace_power_end_rcuidle(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 485 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
396 } else { 486 } else {
397 local_irq_enable(); 487 local_irq_enable();
398 /* loop is done by the caller */ 488 /* loop is done by the caller */
@@ -450,8 +540,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
450static void mwait_idle(void) 540static void mwait_idle(void)
451{ 541{
452 if (!need_resched()) { 542 if (!need_resched()) {
453 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 543 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
454 trace_cpu_idle(1, smp_processor_id()); 544 trace_cpu_idle_rcuidle(1, smp_processor_id());
455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 545 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
456 clflush((void *)&current_thread_info()->flags); 546 clflush((void *)&current_thread_info()->flags);
457 547
@@ -461,8 +551,8 @@ static void mwait_idle(void)
461 __sti_mwait(0, 0); 551 __sti_mwait(0, 0);
462 else 552 else
463 local_irq_enable(); 553 local_irq_enable();
464 trace_power_end(smp_processor_id()); 554 trace_power_end_rcuidle(smp_processor_id());
465 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 555 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
466 } else 556 } else
467 local_irq_enable(); 557 local_irq_enable();
468} 558}
@@ -474,13 +564,13 @@ static void mwait_idle(void)
474 */ 564 */
475static void poll_idle(void) 565static void poll_idle(void)
476{ 566{
477 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 567 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
478 trace_cpu_idle(0, smp_processor_id()); 568 trace_cpu_idle_rcuidle(0, smp_processor_id());
479 local_irq_enable(); 569 local_irq_enable();
480 while (!need_resched()) 570 while (!need_resched())
481 cpu_relax(); 571 cpu_relax();
482 trace_power_end(smp_processor_id()); 572 trace_power_end_rcuidle(smp_processor_id());
483 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 573 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
484} 574}
485 575
486/* 576/*
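
The process.c hunk pulls the x86-64 idle-notifier machinery and the previously duplicated 32-bit/64-bit cpu_idle() loops into one shared copy (the old copies are deleted from process_32.c and process_64.c below). At its core the notifier part is just a callback chain fired with IDLE_START when the CPU enters idle and IDLE_END when something ends it, guarded so the end event fires at most once per idle period. The user-space sketch below models only that call pattern, with invented event codes and a plain array instead of an atomic notifier chain.

/*
 * User-space model of the idle notifier pattern: enter_idle() fires
 * IDLE_START to every registered callback, __exit_idle() fires IDLE_END
 * exactly once per idle period (guarded by the is_idle flag).
 */
#include <stdio.h>

enum { IDLE_START, IDLE_END };		/* illustrative event codes */

typedef void (*idle_notifier_fn)(int event);

#define MAX_NOTIFIERS 8
static idle_notifier_fn notifiers[MAX_NOTIFIERS];
static int nr_notifiers;
static int is_idle;			/* models the per-cpu is_idle byte */

static void idle_notifier_register(idle_notifier_fn fn)
{
	if (nr_notifiers < MAX_NOTIFIERS)
		notifiers[nr_notifiers++] = fn;
}

static void call_chain(int event)
{
	int i;

	for (i = 0; i < nr_notifiers; i++)
		notifiers[i](event);
}

static void enter_idle(void)
{
	is_idle = 1;
	call_chain(IDLE_START);
}

static void __exit_idle(void)
{
	if (!is_idle)			/* already exited, e.g. by an interrupt */
		return;
	is_idle = 0;
	call_chain(IDLE_END);
}

static void trace_idle(int event)
{
	printf("idle %s\n", event == IDLE_START ? "start" : "end");
}

int main(void)
{
	idle_notifier_register(trace_idle);

	enter_idle();		/* CPU about to halt */
	__exit_idle();		/* an interrupt (or the loop itself) ends idle */
	__exit_idle();		/* harmless double call, nothing fires twice */
	return 0;
}
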
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c08d1ff12b7c..ae6847303e26 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,7 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -31,20 +30,18 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/personality.h>
-#include <linux/tick.h>
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
-#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
@@ -57,7 +54,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -69,62 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
         return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
-        BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-        int cpu = smp_processor_id();
-
-        /*
-         * If we're the non-boot CPU, nothing set the stack canary up
-         * for us. CPU0 already has it initialized but no harm in
-         * doing it again. This is a good place for updating it, as
-         * we wont ever return from this function (so the invalid
-         * canaries already on the stack wont ever trigger).
-         */
-        boot_init_stack_canary();
-
-        current_thread_info()->status |= TS_POLLING;
-
-        /* endless idle loop with no priority at all */
-        while (1) {
-                tick_nohz_idle_enter();
-                rcu_idle_enter();
-                while (!need_resched()) {
-
-                        check_pgt_cache();
-                        rmb();
-
-                        if (cpu_is_offline(cpu))
-                                play_dead();
-
-                        local_touch_nmi();
-                        local_irq_disable();
-                        /* Don't trace irqs off for idle */
-                        stop_critical_timings();
-                        if (cpuidle_idle_call())
-                                pm_idle();
-                        start_critical_timings();
-                }
-                rcu_idle_exit();
-                tick_nohz_idle_exit();
-                preempt_enable_no_resched();
-                schedule();
-                preempt_disable();
-        }
-}
-
 void __show_regs(struct pt_regs *regs, int all)
 {
         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 550e77b1b948..733ca39f367e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,7 +14,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -32,17 +31,15 @@
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
-#include <linux/tick.h>
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
-#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
@@ -51,116 +48,11 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 asmlinkage extern void ret_from_fork(void);
 
 DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
-
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
-        atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-        atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-
-void enter_idle(void)
-{
-        percpu_write(is_idle, 1);
-        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
-        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
-                return;
-        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
-        /* idle loop has pid 0 */
-        if (current->pid)
-                return;
-        __exit_idle();
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
-        BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-        current_thread_info()->status |= TS_POLLING;
-
-        /*
-         * If we're the non-boot CPU, nothing set the stack canary up
-         * for us. CPU0 already has it initialized but no harm in
-         * doing it again. This is a good place for updating it, as
-         * we wont ever return from this function (so the invalid
-         * canaries already on the stack wont ever trigger).
-         */
-        boot_init_stack_canary();
-
-        /* endless idle loop with no priority at all */
-        while (1) {
-                tick_nohz_idle_enter();
-                while (!need_resched()) {
-
-                        rmb();
-
-                        if (cpu_is_offline(smp_processor_id()))
-                                play_dead();
-                        /*
-                         * Idle routines should keep interrupts disabled
-                         * from here on, until they go to idle.
-                         * Otherwise, idle callbacks can misfire.
-                         */
-                        local_touch_nmi();
-                        local_irq_disable();
-                        enter_idle();
-                        /* Don't trace irqs off for idle */
-                        stop_critical_timings();
-
-                        /* enter_idle() needs rcu for notifiers */
-                        rcu_idle_enter();
-
-                        if (cpuidle_idle_call())
-                                pm_idle();
-
-                        rcu_idle_exit();
-                        start_critical_timings();
-
-                        /* In many cases the interrupt that ended idle
-                           has already called exit_idle. But some idle
-                           loops can be woken up without interrupt. */
-                        __exit_idle();
-                }
-
-                tick_nohz_idle_exit();
-                preempt_enable_no_resched();
-                schedule();
-                preempt_disable();
-        }
-}
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
@@ -342,6 +234,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
         loadsegment(es, _ds);
         loadsegment(ds, _ds);
         load_gs_index(0);
+        current->thread.usersp = new_sp;
         regs->ip = new_ip;
         regs->sp = new_sp;
         percpu_write(old_rsp, new_sp);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 6fb330adc7c7..685845cf16e0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -24,9 +24,9 @@
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7d5099fe874..1a2901562059 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -90,7 +90,6 @@
 #include <asm/processor.h>
 #include <asm/bugs.h>
 
-#include <asm/system.h>
 #include <asm/vsyscall.h>
 #include <asm/cpu.h>
 #include <asm/desc.h>
@@ -509,15 +508,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 
 #ifdef CONFIG_KEXEC
 
-static inline unsigned long long get_total_mem(void)
-{
-        unsigned long long total;
-
-        total = max_pfn - min_low_pfn;
-
-        return total << PAGE_SHIFT;
-}
-
 /*
  * Keep the crash kernel below this limit. On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -536,7 +526,7 @@ static void __init reserve_crashkernel(void)
         unsigned long long crash_size, crash_base;
         int ret;
 
-        total_mem = get_total_mem();
+        total_mem = memblock_phys_mem_size();
 
         ret = parse_crashkernel(boot_command_line, total_mem,
                         &crash_size, &crash_base);
@@ -749,10 +739,16 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
         if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-                     EFI_LOADER_SIGNATURE, 4)) {
+                     "EL32", 4)) {
+                efi_enabled = 1;
+                efi_64bit = false;
+        } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+                     "EL64", 4)) {
                 efi_enabled = 1;
-                efi_memblock_x86_reserve_range();
+                efi_64bit = true;
         }
+        if (efi_enabled && efi_memblock_x86_reserve_range())
+                efi_enabled = 0;
 #endif
 
         x86_init.oem.arch_setup();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b3cd6913ceea..041af2fd088d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <asm/processor.h>
 #include <asm/ucontext.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..6e1e406038c2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -50,6 +50,7 @@
 #include <linux/tboot.h>
 #include <linux/stackprotector.h>
 #include <linux/gfp.h>
+#include <linux/cpuidle.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -219,14 +220,9 @@ static void __cpuinit smp_callin(void)
         * Update loops_per_jiffy in cpu_data. Previous call to
         * smp_store_cpu_info() stored a value that is close but not as
         * accurate as the value just calculated.
-        *
-        * Need to enable IRQs because it can take longer and then
-        * the NMI watchdog might kill us.
         */
-        local_irq_enable();
         calibrate_delay();
         cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
-        local_irq_disable();
         pr_debug("Stack at about %p\n", &cpuid);
 
         /*
@@ -255,6 +251,7 @@ notrace static void __cpuinit start_secondary(void *unused)
         * most necessary things.
         */
         cpu_init();
+        x86_cpuinit.early_percpu_clock_init();
         preempt_disable();
         smp_callin();
 
@@ -291,19 +288,6 @@ notrace static void __cpuinit start_secondary(void *unused)
         per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
         x86_platform.nmi_init();
 
-        /*
-         * Wait until the cpu which brought this one up marked it
-         * online before enabling interrupts. If we don't do that then
-         * we can end up waking up the softirq thread before this cpu
-         * reached the active state, which makes the scheduler unhappy
-         * and schedule the softirq thread on the wrong cpu. This is
-         * only observable with forced threaded interrupts, but in
-         * theory it could also happen w/o them. It's just way harder
-         * to achieve.
-         */
-        while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
-                cpu_relax();
-
         /* enable local interrupts */
         local_irq_enable();
 
@@ -740,8 +724,6 @@ do_rest:
         * the targeted processor.
         */
 
-        printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
-
         atomic_set(&init_deasserted, 0);
 
         if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -791,9 +773,10 @@ do_rest:
                 schedule();
         }
 
-        if (cpumask_test_cpu(cpu, cpu_callin_mask))
+        if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+                print_cpu_msr(&cpu_data(cpu));
                 pr_debug("CPU%d: has booted.\n", cpu);
-        else {
+        } else {
                 boot_error = 1;
                 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
                                 == 0xA5A5A5A5)
@@ -847,7 +830,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
         if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
             !physid_isset(apicid, phys_cpu_present_map) ||
-            (!x2apic_mode && apicid >= 255)) {
+            !apic->apic_id_valid(apicid)) {
                 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
                 return -EINVAL;
         }
@@ -1422,7 +1405,8 @@ void native_play_dead(void)
         tboot_shutdown(TB_SHUTDOWN_WFS);
 
         mwait_play_dead(); /* Only returns on failure */
-        hlt_play_dead();
+        if (cpuidle_play_dead())
+                hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index f921df8c2099..b4d3c3927dd8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
         struct vm_area_struct *vma;
         struct mm_struct *mm = current->mm;
-        unsigned long addr = addr0;
+        unsigned long addr = addr0, start_addr;
 
         /* requested length too big for entire address space */
         if (len > TASK_SIZE)
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                 mm->free_area_cache = mm->mmap_base;
         }
 
+try_again:
         /* either no address requested or can't fit in requested address hole */
-        addr = mm->free_area_cache;
-
-        /* make sure it can fit in the remaining address space */
-        if (addr > len) {
-                unsigned long tmp_addr = align_addr(addr - len, filp,
-                                                ALIGN_TOPDOWN);
-
-                vma = find_vma(mm, tmp_addr);
-                if (!vma || tmp_addr + len <= vma->vm_start)
-                        /* remember the address as a hint for next time */
-                        return mm->free_area_cache = tmp_addr;
-        }
-
-        if (mm->mmap_base < len)
-                goto bottomup;
+        start_addr = addr = mm->free_area_cache;
 
-        addr = mm->mmap_base-len;
+        if (addr < len)
+                goto fail;
 
+        addr -= len;
         do {
                 addr = align_addr(addr, filp, ALIGN_TOPDOWN);
 
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                 addr = vma->vm_start-len;
         } while (len < vma->vm_start);
 
+fail:
+        /*
+         * if hint left us with no space for the requested
+         * mapping then try again:
+         */
+        if (start_addr != mm->mmap_base) {
+                mm->free_area_cache = mm->mmap_base;
+                mm->cached_hole_size = 0;
+                goto try_again;
+        }
+
 bottomup:
         /*
         * A failed mmap() very likely causes application failure,
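
The hunk above replaces the old hint-based shortcut with a labelled try_again/fail retry: the search starts from the cached free_area_cache hint and, if that pass finds no hole, restarts once from mmap_base before falling back to the bottom-up allocator. Below is a minimal standalone C sketch of that pattern only; the helper name find_gap_topdown, the struct range array and the restarted flag are illustrative assumptions of mine, not kernel code.

/*
 * Illustrative sketch (not kernel code): top-down search for a free gap of
 * 'len' bytes below 'top', starting from a cached hint and restarting once
 * from the top when the hint turns out to be stale. 'used' is a sorted,
 * non-overlapping array of allocated [start, end) ranges, and *cache <= top.
 */
struct range { unsigned long start, end; };

static unsigned long find_gap_topdown(const struct range *used, int n,
                                      unsigned long top, unsigned long len,
                                      unsigned long *cache)
{
        unsigned long addr = *cache;
        int restarted = 0;
        int i;

retry:
        if (addr < len)
                goto fail;
        addr -= len;                       /* candidate gap is [addr, addr + len) */
        for (i = n - 1; i >= 0; i--) {
                if (used[i].end <= addr)
                        break;             /* every remaining range ends below us */
                if (used[i].start < len)
                        goto fail;         /* no room left below this range */
                addr = used[i].start - len;
        }
        *cache = addr;                     /* remember the hint for next time */
        return addr;

fail:
        if (!restarted) {                  /* stale hint: retry once from the top */
                restarted = 1;
                addr = top;
                goto retry;
        }
        return 0;                          /* caller falls back to a bottom-up walk */
}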
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e2410e27f97e..6410744ac5cb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
                 offsetof(struct acpi_table_facs, firmware_waking_vector);
 }
 
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 {
         static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
                 /* S0,1,2: */ -1, -1, -1,
@@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
                 /* S5: */ TB_SHUTDOWN_S5 };
 
         if (!tboot_enabled())
-                return;
+                return 0;
 
         tboot_copy_fadt(&acpi_gbl_FADT);
         tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -292,10 +292,11 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
         if (sleep_state >= ACPI_S_STATE_COUNT ||
             acpi_shutdown_map[sleep_state] == -1) {
                 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
-                return;
+                return -1;
         }
 
         tboot_shutdown(acpi_shutdown_map[sleep_state]);
+        return 0;
 }
 
 static atomic_t ap_wfs_count;
@@ -345,6 +346,8 @@ static __init int tboot_late_init(void)
 
         atomic_set(&ap_wfs_count, 0);
         register_hotcpu_notifier(&tboot_cpu_notifier);
+
+        acpi_os_set_prepare_sleep(&tboot_sleep);
         return 0;
 }
 
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index 9e540fee7009..ab40954e113e 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,6 +34,7 @@
 #include <asm/tce.h>
 #include <asm/calgary.h>
 #include <asm/proto.h>
+#include <asm/cacheflush.h>
 
 /* flush a tce at 'tceaddr' to main memory */
 static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dd5fbf4101fc..c6eba2b42673 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc);
  */
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
-        /* Keep nmi watchdog up to date */
-        inc_irq_stat(irq0_irqs);
-
         global_clock_event->event_handler(global_clock_event);
 
         /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e70..9d9d2f9e77a5 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -6,7 +6,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
-#include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -163,7 +162,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
 {
         const struct desc_struct *tls;
 
-        if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+        if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
             (pos % sizeof(struct user_desc)) != 0 ||
             (count % sizeof(struct user_desc)) != 0)
                 return -EINVAL;
@@ -198,7 +197,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
         struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
         const struct user_desc *info;
 
-        if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+        if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
             (pos % sizeof(struct user_desc)) != 0 ||
             (count % sizeof(struct user_desc)) != 0)
                 return -EINVAL;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c6d17ad59b8a..ff9281f16029 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,10 +50,10 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <linux/atomic.h>
-#include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mce.h>
 
 #include <asm/mach_traps.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..fc0a147e3727 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
         if (cpu_khz) {
                 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-                *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+                *offset = ns_now - mult_frac(tsc_now, *scale,
+                                        (1UL << CYC2NS_SCALE_FACTOR));
         }
 
         sched_clock_idle_wakeup_event(0);
@@ -629,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 static unsigned long long cyc2ns_suspend;
 
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
 {
         if (!sched_clock_stable)
                 return;
@@ -645,7 +646,7 @@ void save_sched_clock_state(void)
  * that sched_clock() continues from the point where it was left off during
  * suspend.
  */
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
 {
         unsigned long long offset;
         unsigned long flags;
@@ -932,6 +933,16 @@ static int __init init_tsc_clocksource(void)
                 clocksource_tsc.rating = 0;
                 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
         }
+
+        /*
+         * Trust the results of the earlier calibration on systems
+         * exporting a reliable TSC.
+         */
+        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+                clocksource_register_khz(&clocksource_tsc, tsc_khz);
+                return 0;
+        }
+
         schedule_delayed_work(&tsc_irqwork, 0);
         return 0;
 }
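
In the first tsc.c hunk above, the cycles-to-nanoseconds offset moves from a plain (tsc_now * *scale) >> CYC2NS_SCALE_FACTOR to mult_frac(), which keeps the intermediate product from overflowing 64 bits once the TSC has counted far enough. The following is a hedged sketch of the underlying identity, using my own helper name and fixed-width types rather than the kernel macro itself.

#include <stdint.h>

/*
 * Sketch of the idea behind mult_frac(x, numer, denom): compute
 * x * numer / denom without the full product overflowing, by splitting x
 * into quotient and remainder with respect to denom.
 * (Helper name and types are illustrative, not the kernel macro.)
 */
static uint64_t mul_frac64(uint64_t x, uint64_t numer, uint64_t denom)
{
        uint64_t quot = x / denom;
        uint64_t rem  = x % denom;

        /* (quot * denom + rem) * numer / denom, regrouped */
        return quot * numer + (rem * numer) / denom;
}

/*
 * With x = tsc_now, numer = *scale and denom = 1 << CYC2NS_SCALE_FACTOR,
 * this equals (tsc_now * *scale) >> CYC2NS_SCALE_FACTOR whenever that
 * product fits in 64 bits, and stays accurate much longer when it does not
 * (only quot * numer can still overflow, for far larger inputs).
 */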
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b46cb7..fc25e60a5884 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
 /*
  * TSC-warp measurement loop running on both CPUs:
  */
-static __cpuinit void check_tsc_warp(void)
+static __cpuinit void check_tsc_warp(unsigned int timeout)
 {
         cycles_t start, now, prev, end;
         int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void)
         start = get_cycles();
         rdtsc_barrier();
         /*
-         * The measurement runs for 20 msecs:
+         * The measurement runs for 'timeout' msecs:
         */
-        end = start + tsc_khz * 20ULL;
+        end = start + (cycles_t) tsc_khz * timeout;
         now = start;
 
         for (i = 0; ; i++) {
@@ -99,6 +99,25 @@ static __cpuinit void check_tsc_warp(void)
 }
 
 /*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket doesn't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+        return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+}
+
+/*
  * Source CPU calls into this - it waits for the freshly booted
  * target CPU to arrive and then starts the measurement:
  */
@@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
         */
         atomic_inc(&start_count);
 
-        check_tsc_warp();
+        check_tsc_warp(loop_timeout(cpu));
 
         while (atomic_read(&stop_count) != cpus-1)
                 cpu_relax();
@@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void)
         while (atomic_read(&start_count) != cpus)
                 cpu_relax();
 
-        check_tsc_warp();
+        check_tsc_warp(loop_timeout(smp_processor_id()));
 
         /*
         * Ok, we are done:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index a1315ab2d6b9..255f58ae71e8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
         spinlock_t *ptl;
         int i;
 
+        down_write(&mm->mmap_sem);
         pgd = pgd_offset(mm, 0xA0000);
         if (pgd_none_or_clear_bad(pgd))
                 goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
         }
         pte_unmap_unlock(pte, ptl);
 out:
+        up_write(&mm->mmap_sem);
         flush_tlb();
 }
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 327509b95e0e..f386dc49f988 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -52,10 +52,7 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
-{
-        .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -80,20 +77,15 @@ early_param("vsyscall", vsyscall_setup);
 
 void update_vsyscall_tz(void)
 {
-        unsigned long flags;
-
-        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-        /* sys_tz has changed */
         vsyscall_gtod_data.sys_tz = sys_tz;
-        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
                         struct clocksource *clock, u32 mult)
 {
-        unsigned long flags;
+        struct timespec monotonic;
 
-        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+        write_seqcount_begin(&vsyscall_gtod_data.seq);
 
         /* copy vsyscall data */
         vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
@@ -101,12 +93,19 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
         vsyscall_gtod_data.clock.mask = clock->mask;
         vsyscall_gtod_data.clock.mult = mult;
         vsyscall_gtod_data.clock.shift = clock->shift;
+
         vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
         vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-        vsyscall_gtod_data.wall_to_monotonic = *wtm;
+
+        monotonic = timespec_add(*wall_time, *wtm);
+        vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec;
+        vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec;
+
         vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+        vsyscall_gtod_data.monotonic_time_coarse =
+                timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
 
-        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+        write_seqcount_end(&vsyscall_gtod_data.seq);
 }
 
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
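
The vsyscall hunks above drop the old seqlock in favour of a bare seqcount: update_vsyscall() now brackets its stores with write_seqcount_begin()/write_seqcount_end(), and lockless readers (the vDSO side, not shown in this diff) retry whenever they raced with a writer. A minimal sketch of that reader pattern follows; the field names come from the hunk, but the function itself is illustrative and is not the actual vDSO code.

/*
 * Sketch of a seqcount reader paired with the writer above: re-copy the
 * data whenever the sequence changed underneath us (a writer was active).
 */
static void read_wall_time(struct timespec *ts)
{
        unsigned seq;

        do {
                seq = read_seqcount_begin(&vsyscall_gtod_data.seq);
                ts->tv_sec  = vsyscall_gtod_data.wall_time_sec;
                ts->tv_nsec = vsyscall_gtod_data.wall_time_nsec;
        } while (read_seqcount_retry(&vsyscall_gtod_data.seq, seq));
}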
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc673..e9f265fd79ae 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
 };
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+        .early_percpu_clock_init = x86_init_noop,
         .setup_percpu_clockev = setup_secondary_APIC_clock,
         .fixup_cpu_id = x86_default_fixup_cpu_id,
 };
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
         .is_untracked_pat_range = is_ISA_range,
         .nmi_init = default_nmi_init,
         .get_nmi_reason = default_get_nmi_reason,
-        .i8042_detect = default_i8042_detect
+        .i8042_detect = default_i8042_detect,
+        .save_sched_clock_state = tsc_save_sched_clock_state,
+        .restore_sched_clock_state = tsc_restore_sched_clock_state,
 };
 
 EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 711091114119..e62728e30b01 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -6,6 +6,7 @@
 #include <linux/bootmem.h>
 #include <linux/compat.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #ifdef CONFIG_IA32_EMULATION
 #include <asm/sigcontext32.h>
 #endif